diff options
author | Allan Sandfeld Jensen <[email protected]> | 2024-09-13 13:19:19 +0200 |
---|---|---|
committer | Allan Sandfeld Jensen <[email protected]> | 2024-09-17 08:27:53 +0200 |
commit | 308bca94a72f83624e2e2c92449719e06940e77f (patch) | |
tree | 35125707bb3440df20e703a5f9f1b324cc490840 | |
parent | 0681e720a9851f1873ce5a5f99b5567d2b418261 (diff) |
Clean up MSVC ARM64 NEON support
Also fixes clang-cl support.
Pick-to: 6.8 6.8.0
Change-Id: If2130091edfadc0cc4d4cecd95c2256522efc69a
Reviewed-by: Maurice Kalinowski <[email protected]>
Reviewed-by: Eirik Aavitsland <[email protected]>
-rw-r--r-- | src/corelib/global/qsimd_p.h | 53 | ||||
-rw-r--r-- | src/corelib/text/qstring.cpp | 12 | ||||
-rw-r--r-- | src/corelib/text/qstringconverter.cpp | 21 | ||||
-rw-r--r-- | src/gui/painting/qcolortransform.cpp | 13 | ||||
-rw-r--r-- | src/gui/painting/qdrawhelper_neon.cpp | 20 | ||||
-rw-r--r-- | src/gui/painting/qpixellayout.cpp | 8 | ||||
-rw-r--r-- | src/gui/painting/qrgba64_p.h | 6 |
7 files changed, 69 insertions, 64 deletions
diff --git a/src/corelib/global/qsimd_p.h b/src/corelib/global/qsimd_p.h index b9cd296c9e8..a5dc6487c3b 100644 --- a/src/corelib/global/qsimd_p.h +++ b/src/corelib/global/qsimd_p.h @@ -283,6 +283,59 @@ inline uint8_t vaddv_u8(uint8x8_t v8) } #endif +// Missing NEON intrinsics, needed due different type definitions: +inline uint16x8_t qvsetq_n_u16(uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, + uint16_t v5, uint16_t v6, uint16_t v7, uint16_t v8) { +#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG) + using u64 = uint64_t; + const uint16x8_t vmask = { + v1 | (v2 << 16) | (u64(v3) << 32) | (u64(v4) << 48), + v5 | (v6 << 16) | (u64(v7) << 32) | (u64(v8) << 48) + }; +#else + const uint16x8_t vmask = { v1, v2, v3, v4, v5, v6, v7, v8 }; +#endif + return vmask; +} +inline uint8x8_t qvset_n_u8(uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, + uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8) { +#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG) + using u64 = uint64_t; + const uint8x8_t vmask = { + v1 | (v2 << 8) | (v3 << 16) | (v4 << 24) | + (u64(v5) << 32) | (u64(v6) << 40) | (u64(v7) << 48) | (u64(v8) << 56) + }; +#else + const uint8x8_t vmask = { v1, v2, v3, v4, v5, v6, v7, v8 }; +#endif + return vmask; +} +inline uint8x16_t qvsetq_n_u8(uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, + uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, + uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, + uint8_t v13, uint8_t v14, uint8_t v15, uint8_t v16) { +#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG) + using u64 = uint64_t; + const uint8x16_t vmask = { + v1 | (v2 << 8) | (v3 << 16) | (v4 << 24) | + (u64(v5) << 32) | (u64(v6) << 40) | (u64(v7) << 48) | (u64(v8) << 56), + v9 | (v10 << 8) | (v11 << 16) | (v12 << 24) | + (u64(v13) << 32) | (u64(v14) << 40) | (u64(v15) << 48) | (u64(v16) << 56) + }; +#else + const uint8x16_t vmask = { v1, v2, v3, v4, v5, v6, v7, v8, + v9, v10, v11, v12, v13, v14, v15, v16}; +#endif + return vmask; +} +inline uint32x4_t qvsetq_n_u32(uint32_t a, uint32_t b, 
uint32_t c, uint32_t d) +{ +#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG) + return uint32x4_t{ (uint64_t(b) << 32) | a, (uint64_t(d) << 32) | c }; +#else + return uint32x4_t{ a, b, c, d }; +#endif +} #endif #if defined(Q_PROCESSOR_ARM) && defined(__ARM_FEATURE_CRC32) diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp index 077d9accded..c3d1594ac6b 100644 --- a/src/corelib/text/qstring.cpp +++ b/src/corelib/text/qstring.cpp @@ -749,11 +749,7 @@ const char16_t *QtPrivate::qustrchr(QStringView str, char16_t c) noexcept [=](qsizetype i) { return n + i; }); # endif #elif defined(__ARM_NEON__) -#ifdef _MSC_VER - const uint16x8_t vmask = { 0x0008000400020001ULL, 0x0080004000200010ULL }; -#else - const uint16x8_t vmask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; -#endif + const uint16x8_t vmask = qvsetq_n_u16(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7); const uint16x8_t ch_vec = vdupq_n_u16(c); for (const char16_t *next = n + 8; next <= e; n = next, next += 8) { uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(n)); @@ -1294,11 +1290,7 @@ static int ucstrncmp(const char16_t *a, const char16_t *b, size_t l) # elif defined(__ARM_NEON__) if (l >= 8) { const char16_t *end = a + l; -#ifdef _MSC_VER - const uint16x8_t mask = { 0x0008000400020001ULL, 0x0080004000200010ULL }; -#else - const uint16x8_t mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; -#endif + const uint16x8_t mask = qvsetq_n_u16( 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 ); while (end - a > 7) { uint16x8_t da = vld1q_u16(reinterpret_cast<const uint16_t *>(a)); uint16x8_t db = vld1q_u16(reinterpret_cast<const uint16_t *>(b)); diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp index e411990a456..104c10a7674 100644 --- a/src/corelib/text/qstringconverter.cpp +++ b/src/corelib/text/qstringconverter.cpp @@ -351,12 +351,7 @@ static void simdCompareAscii(const qchar8_t 
*&src8, const qchar8_t *end8, const static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end) { uint16x8_t maxAscii = vdupq_n_u16(0x7f); -#ifdef _MSC_VER - uint16_t mask1t[8] = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 }; - uint16x8_t mask1 = vld1q_u16(mask1t); -#else - uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 }; -#endif + uint16x8_t mask1 = qvsetq_n_u16(1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 ); uint16x8_t mask2 = vshlq_n_u16(mask1, 1); // do sixteen characters at a time @@ -394,12 +389,7 @@ static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, cons { // do eight characters at a time uint8x8_t msb_mask = vdup_n_u8(0x80); -#ifdef _MSC_VER - uint8_t add_maskt[8] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; - uint8x8_t add_mask = vld1_u8(add_maskt); -#else - uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; -#endif + uint8x8_t add_mask = qvset_n_u8(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 ); for ( ; end - src >= 8; src += 8, dst += 8) { uint8x8_t c = vld1_u8(src); uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask)); @@ -435,12 +425,7 @@ static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, // do eight characters at a time uint8x8_t msb_mask = vdup_n_u8(0x80); -#ifdef _MSC_VER - uint8_t add_maskt[8] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; - uint8x8_t add_mask = vld1_u8(add_maskt); -#else - uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; -#endif + uint8x8_t add_mask = qvset_n_u8(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7); for ( ; end - src >= 8; src += 8) { uint8x8_t c = vld1_u8(src); uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask)); diff --git a/src/gui/painting/qcolortransform.cpp 
b/src/gui/painting/qcolortransform.cpp index 2cbc19b9626..42315986b7a 100644 --- a/src/gui/painting/qcolortransform.cpp +++ b/src/gui/painting/qcolortransform.cpp @@ -662,21 +662,12 @@ static inline bool test_all_zero(uint32x4_t p) #endif } -static inline uint32x4_t vsetq_u32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) -{ -#ifdef _MSC_VER - return uint32x4_t{ (uint64_t(b) << 32) | a, (uint64_t(d) << 32) | c }; -#else - return uint32x4_t{ a, b, c, d }; -#endif -} - template<typename T> static void loadPremultiplied(QColorVector *buffer, const T *src, const qsizetype len, const QColorTransformPrivate *d_ptr) { constexpr bool isARGB = isArgb<T>(); const float iFF00 = 1.0f / (255 * 256); - const uint32x4_t vRangeMax = vsetq_u32( + const uint32x4_t vRangeMax = qvsetq_n_u32( isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear : d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear, d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear, @@ -749,7 +740,7 @@ void loadUnpremultiplied(QColorVector *buffer, const T *src, const qsizetype len { constexpr bool isARGB = isArgb<T>(); const float iFF00 = 1.0f / (255 * 256); - const uint32x4_t vRangeMax = vsetq_u32( + const uint32x4_t vRangeMax = qvsetq_n_u32( isARGB ? 
d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear : d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear, d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear, diff --git a/src/gui/painting/qdrawhelper_neon.cpp b/src/gui/painting/qdrawhelper_neon.cpp index f8de0a85116..9233468a878 100644 --- a/src/gui/painting/qdrawhelper_neon.cpp +++ b/src/gui/painting/qdrawhelper_neon.cpp @@ -1069,12 +1069,10 @@ const uint * QT_FASTCALL qt_fetchUntransformed_888_neon(uint *buffer, const Oper #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN static inline uint32x4_t vrgba2argb(uint32x4_t srcVector) { -#if defined(Q_PROCESSOR_ARM_64) && defined(_MSC_VER) - const uint8x16_t rgbaMask = { 0x0704050603000102ULL, 0x0F0C0D0E0B08090AULL }; -#elif defined(Q_PROCESSOR_ARM_64) - const uint8x16_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15}; +#if defined(Q_PROCESSOR_ARM_64) + const uint8x16_t rgbaMask = qvsetq_n_u8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15); #else - const uint8x8_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7 }; + const uint8x8_t rgbaMask = qvset_n_u8(2, 1, 0, 3, 6, 5, 4, 7); #endif #if defined(Q_PROCESSOR_ARM_64) srcVector = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask)); @@ -1091,11 +1089,7 @@ template<bool RGBA> static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int count) { int i = 0; -#ifdef _MSC_VER - const uint8x8_t shuffleMask = { 0x0707070703030303ULL }; -#else - const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7}; -#endif + const uint8x8_t shuffleMask = qvset_n_u8(3, 3, 3, 3, 7, 7, 7, 7); const uint32x4_t blendMask = vdupq_n_u32(0xff000000); for (; i < count - 3; i += 4) { @@ -1147,11 +1141,7 @@ static inline void convertARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uint *src if (count <= 0) return; -#ifdef _MSC_VER - const uint8x8_t shuffleMask = { 0x0707070703030303ULL }; -#else - const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7}; -#endif + const uint8x8_t shuffleMask = qvset_n_u8(3, 3, 3, 3, 7, 7, 
7, 7); const uint64x2_t blendMask = vdupq_n_u64(Q_UINT64_C(0xffff000000000000)); int i = 0; diff --git a/src/gui/painting/qpixellayout.cpp b/src/gui/painting/qpixellayout.cpp index 31646d2f23d..dab337260d6 100644 --- a/src/gui/painting/qpixellayout.cpp +++ b/src/gui/painting/qpixellayout.cpp @@ -1087,12 +1087,10 @@ static inline void qConvertARGB32PMToRGBA64PM_neon(QRgba64 *buffer, const uint * return; const uint32x4_t amask = vdupq_n_u32(0xff000000); -#if defined(Q_PROCESSOR_ARM_64) && defined(_MSC_VER) - const uint8x16_t rgbaMask = { 0x0704050603000102ULL, 0x0F0C0D0E0B08090AULL }; -#elif defined(Q_PROCESSOR_ARM_64) - const uint8x16_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15}; +#if defined(Q_PROCESSOR_ARM_64) + const uint8x16_t rgbaMask = qvsetq_n_u8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15); #else - const uint8x8_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7 }; + const uint8x8_t rgbaMask = qvset_n_u8(2, 1, 0, 3, 6, 5, 4, 7); #endif int i = 0; for (; i < count-3; i += 4) { diff --git a/src/gui/painting/qrgba64_p.h b/src/gui/painting/qrgba64_p.h index 6809f1d52cf..058d77f7e90 100644 --- a/src/gui/painting/qrgba64_p.h +++ b/src/gui/painting/qrgba64_p.h @@ -232,11 +232,7 @@ static inline uint toArgb32(QRgba64 rgba64) #elif defined __ARM_NEON__ uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64))); #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN -#ifdef _MSC_VER - const uint8x8_t shuffleMask = { 0x0706010003020504ULL }; -#else - const uint8x8_t shuffleMask = { 4, 5, 2, 3, 0, 1, 6, 7 }; -#endif + const uint8x8_t shuffleMask = qvset_n_u8(4, 5, 2, 3, 0, 1, 6, 7); v = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(v), shuffleMask)); #else v = vext_u16(v, v, 3); |