Clean-up MSVC ARM64 NEON support

Also fixing clang-cl support. Pick-to: 6.8 6.8.0 Change-Id: If2130091edfadc0cc4d4cecd95c2256522efc69a Reviewed-by: Maurice Kalinowski <[email protected]> Reviewed-by: Eirik Aavitsland <[email protected]>
author: Allan Sandfeld Jensen <[email protected]> 2024-09-13 13:19:19 +0200
committer: Allan Sandfeld Jensen <[email protected]> 2024-09-17 08:27:53 +0200
commit: 308bca94a72f83624e2e2c92449719e06940e77f (patch)
tree: 35125707bb3440df20e703a5f9f1b324cc490840
parent: 0681e720a9851f1873ce5a5f99b5567d2b418261 (diff)
7 files changed, 69 insertions, 64 deletions
diff --git a/src/corelib/global/qsimd_p.h b/src/corelib/global/qsimd_p.h
index b9cd296c9e8..a5dc6487c3b 100644
--- a/src/corelib/global/qsimd_p.h
+++ b/src/corelib/global/qsimd_p.h
@@ -283,6 +283,59 @@ inline uint8_t vaddv_u8(uint8x8_t v8)
 }
 #endif
 
+// Missing NEON intrinsics, needed due different type definitions:
+inline uint16x8_t qvsetq_n_u16(uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4,
+                               uint16_t v5, uint16_t v6, uint16_t v7, uint16_t v8) {
+#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG)
+    using u64 = uint64_t;
+    const uint16x8_t vmask = {
+        v1 | (v2 << 16) | (u64(v3) << 32) | (u64(v4) << 48),
+        v5 | (v6 << 16) | (u64(v7) << 32) | (u64(v8) << 48)
+    };
+#else
+    const uint16x8_t vmask = { v1, v2, v3, v4, v5, v6, v7, v8 };
+#endif
+    return vmask;
+}
+inline uint8x8_t qvset_n_u8(uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
+                            uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8) {
+#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG)
+    using u64 = uint64_t;
+    const uint8x8_t vmask = {
+        v1 | (v2 << 8) | (v3 << 16) | (v4 << 24) |
+        (u64(v5) << 32) | (u64(v6) << 40) | (u64(v7) << 48) | (u64(v8) << 56)
+    };
+#else
+    const uint8x8_t vmask = { v1, v2, v3, v4, v5, v6, v7, v8 };
+#endif
+    return vmask;
+}
+inline uint8x16_t qvsetq_n_u8(uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,
+                              uint8_t v5,  uint8_t v6,  uint8_t v7,  uint8_t v8,
+                              uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12,
+                              uint8_t v13, uint8_t v14, uint8_t v15, uint8_t v16) {
+#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG)
+    using u64 = uint64_t;
+    const uint8x16_t vmask = {
+        v1 | (v2 << 8) | (v3 << 16) | (v4 << 24) |
+        (u64(v5) << 32) | (u64(v6) << 40) | (u64(v7) << 48) | (u64(v8) << 56),
+        v9 | (v10 << 8) | (v11 << 16) | (v12 << 24) |
+        (u64(v13) << 32) | (u64(v14) << 40) | (u64(v15) << 48) | (u64(v16) << 56)
+    };
+#else
+    const uint8x16_t vmask = { v1, v2,  v3,  v4,  v5,  v6,  v7,  v8,
+                               v9, v10, v11, v12, v13, v14, v15, v16};
+#endif
+    return vmask;
+}
+inline uint32x4_t qvsetq_n_u32(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
+{
+#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG)
+    return uint32x4_t{ (uint64_t(b) << 32) | a, (uint64_t(d) << 32) | c };
+#else
+    return uint32x4_t{ a, b, c, d };
+#endif
+}
 #endif
 
 #if defined(Q_PROCESSOR_ARM) && defined(__ARM_FEATURE_CRC32)
diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp
index 077d9accded..c3d1594ac6b 100644
--- a/src/corelib/text/qstring.cpp
+++ b/src/corelib/text/qstring.cpp
@@ -749,11 +749,7 @@ const char16_t *QtPrivate::qustrchr(QStringView str, char16_t c) noexcept
                                    [=](qsizetype i) { return n + i; });
 #  endif
 #elif defined(__ARM_NEON__)
-#ifdef _MSC_VER
-    const uint16x8_t vmask = { 0x0008000400020001ULL, 0x0080004000200010ULL };
-#else
-    const uint16x8_t vmask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
-#endif
+    const uint16x8_t vmask = qvsetq_n_u16(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7);
     const uint16x8_t ch_vec = vdupq_n_u16(c);
     for (const char16_t *next = n + 8; next <= e; n = next, next += 8) {
         uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(n));
@@ -1294,11 +1290,7 @@ static int ucstrncmp(const char16_t *a, const char16_t *b, size_t l)
 #  elif defined(__ARM_NEON__)
     if (l >= 8) {
         const char16_t *end = a + l;
-#ifdef _MSC_VER
-        const uint16x8_t mask = { 0x0008000400020001ULL, 0x0080004000200010ULL };
-#else
-        const uint16x8_t mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
-#endif
+        const uint16x8_t mask = qvsetq_n_u16( 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 );
         while (end - a > 7) {
             uint16x8_t da = vld1q_u16(reinterpret_cast<const uint16_t *>(a));
             uint16x8_t db = vld1q_u16(reinterpret_cast<const uint16_t *>(b));
diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp
index e411990a456..104c10a7674 100644
--- a/src/corelib/text/qstringconverter.cpp
+++ b/src/corelib/text/qstringconverter.cpp
@@ -351,12 +351,7 @@ static void simdCompareAscii(const qchar8_t *&src8, const qchar8_t *end8, const
 static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
 {
     uint16x8_t maxAscii = vdupq_n_u16(0x7f);
-#ifdef _MSC_VER
-    uint16_t mask1t[8] = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
-    uint16x8_t mask1 = vld1q_u16(mask1t);
-#else
-    uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
-#endif
+    uint16x8_t mask1 = qvsetq_n_u16(1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 );
     uint16x8_t mask2 = vshlq_n_u16(mask1, 1);
 
     // do sixteen characters at a time
@@ -394,12 +389,7 @@ static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, cons
 {
     // do eight characters at a time
     uint8x8_t msb_mask = vdup_n_u8(0x80);
-#ifdef _MSC_VER
-    uint8_t add_maskt[8] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
-    uint8x8_t add_mask = vld1_u8(add_maskt);
-#else
-    uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
-#endif
+    uint8x8_t add_mask = qvset_n_u8(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 );
     for ( ; end - src >= 8; src += 8, dst += 8) {
         uint8x8_t c = vld1_u8(src);
         uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
@@ -435,12 +425,7 @@ static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end,
 
     // do eight characters at a time
     uint8x8_t msb_mask = vdup_n_u8(0x80);
-#ifdef _MSC_VER
-    uint8_t add_maskt[8] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
-    uint8x8_t add_mask = vld1_u8(add_maskt);
-#else
-    uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
-#endif
+    uint8x8_t add_mask = qvset_n_u8(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7);
     for ( ; end - src >= 8; src += 8) {
         uint8x8_t c = vld1_u8(src);
         uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
diff --git a/src/gui/painting/qcolortransform.cpp b/src/gui/painting/qcolortransform.cpp
index 2cbc19b9626..42315986b7a 100644
--- a/src/gui/painting/qcolortransform.cpp
+++ b/src/gui/painting/qcolortransform.cpp
@@ -662,21 +662,12 @@ static inline bool test_all_zero(uint32x4_t p)
 #endif
 }
 
-static inline uint32x4_t vsetq_u32(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
-{
-#ifdef _MSC_VER
-    return uint32x4_t{ (uint64_t(b) << 32) | a, (uint64_t(d) << 32) | c };
-#else
-    return uint32x4_t{ a, b, c, d };
-#endif
-}
-
 template<typename T>
 static void loadPremultiplied(QColorVector *buffer, const T *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
 {
     constexpr bool isARGB = isArgb<T>();
     const float iFF00 = 1.0f / (255 * 256);
-    const uint32x4_t vRangeMax = vsetq_u32(
+    const uint32x4_t vRangeMax = qvsetq_n_u32(
             isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear
                    : d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear,
             d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear,
@@ -749,7 +740,7 @@ void loadUnpremultiplied(QColorVector *buffer, const T *src, const qsizetype len
 {
     constexpr bool isARGB = isArgb<T>();
     const float iFF00 = 1.0f / (255 * 256);
-    const uint32x4_t vRangeMax = vsetq_u32(
+    const uint32x4_t vRangeMax = qvsetq_n_u32(
             isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear
                    : d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear,
             d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear,
diff --git a/src/gui/painting/qdrawhelper_neon.cpp b/src/gui/painting/qdrawhelper_neon.cpp
index f8de0a85116..9233468a878 100644
--- a/src/gui/painting/qdrawhelper_neon.cpp
+++ b/src/gui/painting/qdrawhelper_neon.cpp
@@ -1069,12 +1069,10 @@ const uint * QT_FASTCALL qt_fetchUntransformed_888_neon(uint *buffer, const Oper
 #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
 static inline uint32x4_t vrgba2argb(uint32x4_t srcVector)
 {
-#if defined(Q_PROCESSOR_ARM_64) && defined(_MSC_VER)
-    const uint8x16_t rgbaMask  = { 0x0704050603000102ULL, 0x0F0C0D0E0B08090AULL };
-#elif defined(Q_PROCESSOR_ARM_64)
-    const uint8x16_t rgbaMask  = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
+#if defined(Q_PROCESSOR_ARM_64)
+    const uint8x16_t rgbaMask  = qvsetq_n_u8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
 #else
-    const uint8x8_t rgbaMask  = { 2, 1, 0, 3, 6, 5, 4, 7 };
+    const uint8x8_t rgbaMask  = qvset_n_u8(2, 1, 0, 3, 6, 5, 4, 7);
 #endif
 #if defined(Q_PROCESSOR_ARM_64)
     srcVector = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask));
@@ -1091,11 +1089,7 @@ template<bool RGBA>
 static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int count)
 {
     int i = 0;
-#ifdef _MSC_VER
-    const uint8x8_t shuffleMask = { 0x0707070703030303ULL };
-#else
-    const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7};
-#endif
+    const uint8x8_t shuffleMask = qvset_n_u8(3, 3, 3, 3, 7, 7, 7, 7);
     const uint32x4_t blendMask = vdupq_n_u32(0xff000000);
 
     for (; i < count - 3; i += 4) {
@@ -1147,11 +1141,7 @@ static inline void convertARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uint *src
     if (count <= 0)
         return;
 
-#ifdef _MSC_VER
-    const uint8x8_t shuffleMask = { 0x0707070703030303ULL };
-#else
-    const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7};
-#endif
+    const uint8x8_t shuffleMask = qvset_n_u8(3, 3, 3, 3, 7, 7, 7, 7);
     const uint64x2_t blendMask = vdupq_n_u64(Q_UINT64_C(0xffff000000000000));
 
     int i = 0;
diff --git a/src/gui/painting/qpixellayout.cpp b/src/gui/painting/qpixellayout.cpp
index 31646d2f23d..dab337260d6 100644
--- a/src/gui/painting/qpixellayout.cpp
+++ b/src/gui/painting/qpixellayout.cpp
@@ -1087,12 +1087,10 @@ static inline void qConvertARGB32PMToRGBA64PM_neon(QRgba64 *buffer, const uint *
         return;
 
     const uint32x4_t amask = vdupq_n_u32(0xff000000);
-#if defined(Q_PROCESSOR_ARM_64) && defined(_MSC_VER)
-    const uint8x16_t rgbaMask  = { 0x0704050603000102ULL, 0x0F0C0D0E0B08090AULL };
-#elif defined(Q_PROCESSOR_ARM_64)
-    const uint8x16_t rgbaMask  = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
+#if defined(Q_PROCESSOR_ARM_64)
+    const uint8x16_t rgbaMask  = qvsetq_n_u8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
 #else
-    const uint8x8_t rgbaMask  = { 2, 1, 0, 3, 6, 5, 4, 7 };
+    const uint8x8_t rgbaMask  = qvset_n_u8(2, 1, 0, 3, 6, 5, 4, 7);
 #endif
     int i = 0;
     for (; i < count-3; i += 4) {
diff --git a/src/gui/painting/qrgba64_p.h b/src/gui/painting/qrgba64_p.h
index 6809f1d52cf..058d77f7e90 100644
--- a/src/gui/painting/qrgba64_p.h
+++ b/src/gui/painting/qrgba64_p.h
@@ -232,11 +232,7 @@ static inline uint toArgb32(QRgba64 rgba64)
 #elif defined __ARM_NEON__
     uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64)));
 #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
-#ifdef _MSC_VER
-    const uint8x8_t shuffleMask = { 0x0706010003020504ULL };
-#else
-    const uint8x8_t shuffleMask = { 4, 5, 2, 3, 0, 1, 6, 7 };
-#endif
+    const uint8x8_t shuffleMask = qvset_n_u8(4, 5, 2, 3, 0, 1, 6, 7);
     v = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(v), shuffleMask));
 #else
     v = vext_u16(v, v, 3);
author	Allan Sandfeld Jensen <[email protected]>	2024-09-13 13:19:19 +0200
committer	Allan Sandfeld Jensen <[email protected]>	2024-09-17 08:27:53 +0200
commit	308bca94a72f83624e2e2c92449719e06940e77f (patch)
tree	35125707bb3440df20e703a5f9f1b324cc490840
parent	0681e720a9851f1873ce5a5f99b5567d2b418261 (diff)