diff options
author | Chen Zhanwang <[email protected]> | 2024-06-24 15:12:51 +0800 |
---|---|---|
committer | Chen Zhanwang <[email protected]> | 2024-10-23 00:12:15 +0800 |
commit | dfc84993b7125a806de8b1a0ea79cad9866e3aaf (patch) | |
tree | 6cff67ce7133a31ce6f5c45007501b88832f6763 | |
parent | d511a68684c2f76b48c696c5f8a04c22ef2d00fe (diff) |
Add loongarch64(LSX) optimization
List of optimized implementations using LSX:
- fetchPixelsBPP24
- qt_fetchUntransformed_888
- qt_memfill24
- rbSwap_888
- convertARGBToARGB32PM
- convertARGBToRGBA64PM
- convertARGBFromARGB32PM
- convertARGBFromRGBA64PM
- convertRGBA64FromRGBA64PM
- fetchRGBA32FToRGBA32F
- storeRGBX32FFromRGBA32F
- storeRGBA32FFromRGBA32F
Change-Id: I27e13358237200938ea421b6835724b3d923072d
Reviewed-by: Volker Hilsheimer <[email protected]>
-rw-r--r-- | src/gui/image/qimage_conversions.cpp | 14 | ||||
-rw-r--r-- | src/gui/painting/qdrawhelper.cpp | 73 | ||||
-rw-r--r-- | src/gui/painting/qdrawhelper_lsx.cpp | 682 | ||||
-rw-r--r-- | src/gui/painting/qdrawingprimitive_lsx_p.h | 86 | ||||
-rw-r--r-- | src/gui/painting/qpixellayout.cpp | 15 |
5 files changed, 870 insertions, 0 deletions
diff --git a/src/gui/image/qimage_conversions.cpp b/src/gui/image/qimage_conversions.cpp index ec75b8c386e..e91e0d3a0bc 100644 --- a/src/gui/image/qimage_conversions.cpp +++ b/src/gui/image/qimage_conversions.cpp @@ -133,6 +133,10 @@ extern void QT_FASTCALL storeRGB32FromARGB32PM_sse4(uchar *dest, const uint *src #elif defined(__ARM_NEON__) && (Q_BYTE_ORDER == Q_LITTLE_ENDIAN) extern void QT_FASTCALL storeRGB32FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count, const QList<QRgb> *, QDitherInfo *); +#elif defined QT_COMPILER_SUPPORTS_LSX +// from painting/qdrawhelper_lsx.cpp +extern void QT_FASTCALL storeRGB32FromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count, + const QList<QRgb> *, QDitherInfo *); #endif void convert_generic(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags flags) @@ -158,6 +162,11 @@ void convert_generic(QImageData *dest, const QImageData *src, Qt::ImageConversio store = storeRGB32FromARGB32PM_sse4; else store = storeRGB32FromARGB32PM; +#elif defined QT_COMPILER_SUPPORTS_LSX + if (qCpuHasFeature(LSX)) + store = storeRGB32FromARGB32PM_lsx; + else + store = storeRGB32FromARGB32PM; #elif defined(__ARM_NEON__) && (Q_BYTE_ORDER == Q_LITTLE_ENDIAN) store = storeRGB32FromARGB32PM_neon; #else @@ -376,6 +385,11 @@ bool convert_generic_inplace(QImageData *data, QImage::Format dst_format, Qt::Im store = storeRGB32FromARGB32PM_sse4; else store = storeRGB32FromARGB32PM; +#elif defined QT_COMPILER_SUPPORTS_LSX + if (qCpuHasFeature(LSX)) + store = storeRGB32FromARGB32PM_lsx; + else + store = storeRGB32FromARGB32PM; #elif defined(__ARM_NEON__) && (Q_BYTE_ORDER == Q_LITTLE_ENDIAN) store = storeRGB32FromARGB32PM_neon; #else diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp index 2bfca562249..b36044396f9 100644 --- a/src/gui/painting/qdrawhelper.cpp +++ b/src/gui/painting/qdrawhelper.cpp @@ -6386,6 +6386,10 @@ void qt_memfill24(quint24 *dest, quint24 color, qsizetype count) extern void 
qt_memfill24_ssse3(quint24 *, quint24, qsizetype); if (qCpuHasFeature(SSSE3)) return qt_memfill24_ssse3(dest, color, count); +# elif defined QT_COMPILER_SUPPORTS_LSX + extern void qt_memfill24_lsx(quint24 *, quint24, qsizetype); + if (qCpuHasFeature(LSX)) + return qt_memfill24_lsx(dest, color, count); # endif const quint32 v = color; @@ -6449,6 +6453,8 @@ void qt_memfill32(quint32 *dest, quint32 color, qsizetype count) #ifdef QT_COMPILER_SUPPORTS_SSE4_1 template<QtPixelOrder> void QT_FASTCALL storeA2RGB30PMFromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count, const QList<QRgb> *, QDitherInfo *); +#elif defined(QT_COMPILER_SUPPORTS_LSX) +template<QtPixelOrder> void QT_FASTCALL storeA2RGB30PMFromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count, const QList<QRgb> *, QDitherInfo *); #endif extern void qInitBlendFunctions(); @@ -6773,6 +6779,73 @@ static void qInitDrawhelperFunctions() qt_functionForMode_C[QPainter::CompositionMode_Source] = comp_func_Source_lsx; qt_functionForModeSolid_C[QPainter::CompositionMode_Source] = comp_func_solid_Source_lsx; qt_functionForMode_C[QPainter::CompositionMode_Plus] = comp_func_Plus_lsx; + + extern const uint * QT_FASTCALL qt_fetchUntransformed_888_lsx(uint *buffer, const Operator *, const QSpanData *data, + int y, int x, int length); + sourceFetchUntransformed[QImage::Format_RGB888] = qt_fetchUntransformed_888_lsx; + extern void QT_FASTCALL rbSwap_888_lsx(uchar *dst, const uchar *src, int count); + qPixelLayouts[QImage::Format_RGB888].rbSwap = rbSwap_888_lsx; + qPixelLayouts[QImage::Format_BGR888].rbSwap = rbSwap_888_lsx; + + extern void QT_FASTCALL convertARGB32ToARGB32PM_lsx(uint *buffer, int count, const QList<QRgb> *); + extern void QT_FASTCALL convertRGBA8888ToARGB32PM_lsx(uint *buffer, int count, const QList<QRgb> *); + extern const uint *QT_FASTCALL fetchARGB32ToARGB32PM_lsx(uint *buffer, const uchar *src, int index, int count, + const QList<QRgb> *, QDitherInfo *); + extern const uint 
*QT_FASTCALL fetchRGBA8888ToARGB32PM_lsx(uint *buffer, const uchar *src, int index, int count, + const QList<QRgb> *, QDitherInfo *); + extern const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count, + const QList<QRgb> *, QDitherInfo *); + extern const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count, + const QList<QRgb> *, QDitherInfo *); + extern const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_lsx(QRgba64 *buffer, const uchar *src, int index, int count, + const QList<QRgb> *, QDitherInfo *); + extern const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_lsx(QRgba64 *buffer, const uchar *src, int index, int count, + const QList<QRgb> *, QDitherInfo *); + extern void QT_FASTCALL storeARGB32FromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count, + const QList<QRgb> *, QDitherInfo *); + extern void QT_FASTCALL storeRGBA8888FromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count, + const QList<QRgb> *, QDitherInfo *); + extern void QT_FASTCALL storeRGBXFromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count, + const QList<QRgb> *, QDitherInfo *); + extern void QT_FASTCALL storeARGB32FromRGBA64PM_lsx(uchar *dest, const QRgba64 *src, int index, int count, + const QList<QRgb> *, QDitherInfo *); + extern void QT_FASTCALL storeRGBA8888FromRGBA64PM_lsx(uchar *dest, const QRgba64 *src, int index, int count, + const QList<QRgb> *, QDitherInfo *); + extern void QT_FASTCALL destStore64ARGB32_lsx(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 *buffer, int length); + extern void QT_FASTCALL destStore64RGBA8888_lsx(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 *buffer, int length); + extern void QT_FASTCALL storeRGBA64FromRGBA64PM_lsx(uchar *, const QRgba64 *, int, int, const QList<QRgb> *, QDitherInfo *); + extern void QT_FASTCALL storeRGBx64FromRGBA64PM_lsx(uchar *, const QRgba64 *, int, int, const QList<QRgb> *, QDitherInfo *); + 
qPixelLayouts[QImage::Format_ARGB32].fetchToARGB32PM = fetchARGB32ToARGB32PM_lsx; + qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_lsx; + qPixelLayouts[QImage::Format_RGBA8888].fetchToARGB32PM = fetchRGBA8888ToARGB32PM_lsx; + qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_lsx; + qPixelLayouts[QImage::Format_ARGB32].fetchToRGBA64PM = fetchARGB32ToRGBA64PM_lsx; + qPixelLayouts[QImage::Format_ARGB32].convertToRGBA64PM = convertARGB32ToRGBA64PM_lsx; + qPixelLayouts[QImage::Format_RGBA8888].fetchToRGBA64PM = fetchRGBA8888ToRGBA64PM_lsx; + qPixelLayouts[QImage::Format_RGBA8888].convertToRGBA64PM = convertRGBA8888ToRGBA64PM_lsx; + qPixelLayouts[QImage::Format_RGBX8888].fetchToRGBA64PM = fetchRGBA8888ToRGBA64PM_lsx; + qPixelLayouts[QImage::Format_RGBX8888].convertToRGBA64PM = convertRGBA8888ToRGBA64PM_lsx; + qPixelLayouts[QImage::Format_ARGB32].storeFromARGB32PM = storeARGB32FromARGB32PM_lsx; + qPixelLayouts[QImage::Format_RGBA8888].storeFromARGB32PM = storeRGBA8888FromARGB32PM_lsx; + qPixelLayouts[QImage::Format_RGBX8888].storeFromARGB32PM = storeRGBXFromARGB32PM_lsx; + qPixelLayouts[QImage::Format_A2BGR30_Premultiplied].storeFromARGB32PM = storeA2RGB30PMFromARGB32PM_lsx<PixelOrderBGR>; + qPixelLayouts[QImage::Format_A2RGB30_Premultiplied].storeFromARGB32PM = storeA2RGB30PMFromARGB32PM_lsx<PixelOrderRGB>; + qStoreFromRGBA64PM[QImage::Format_ARGB32] = storeARGB32FromRGBA64PM_lsx; + qStoreFromRGBA64PM[QImage::Format_RGBA8888] = storeRGBA8888FromRGBA64PM_lsx; + qStoreFromRGBA64PM[QImage::Format_RGBX64] = storeRGBx64FromRGBA64PM_lsx; + qStoreFromRGBA64PM[QImage::Format_RGBA64] = storeRGBA64FromRGBA64PM_lsx; +#if QT_CONFIG(raster_64bit) + destStoreProc64[QImage::Format_ARGB32] = destStore64ARGB32_lsx; + destStoreProc64[QImage::Format_RGBA8888] = destStore64RGBA8888_lsx; +#endif +#if QT_CONFIG(raster_fp) + extern const QRgbaFloat32 *QT_FASTCALL fetchRGBA32FToRGBA32F_lsx(QRgbaFloat32 *buffer, const uchar 
*src, int index, int count, const QList<QRgb> *, QDitherInfo *); + extern void QT_FASTCALL storeRGBX32FFromRGBA32F_lsx(uchar *dest, const QRgbaFloat32 *src, int index, int count, const QList<QRgb> *, QDitherInfo *); + extern void QT_FASTCALL storeRGBA32FFromRGBA32F_lsx(uchar *dest, const QRgbaFloat32 *src, int index, int count, const QList<QRgb> *, QDitherInfo *); + qFetchToRGBA32F[QImage::Format_RGBA32FPx4] = fetchRGBA32FToRGBA32F_lsx; + qStoreFromRGBA32F[QImage::Format_RGBX32FPx4] = storeRGBX32FFromRGBA32F_lsx; + qStoreFromRGBA32F[QImage::Format_RGBA32FPx4] = storeRGBA32FFromRGBA32F_lsx; +#endif // QT_CONFIG(raster_fp) } #endif //QT_COMPILER_SUPPORTS_LSX diff --git a/src/gui/painting/qdrawhelper_lsx.cpp b/src/gui/painting/qdrawhelper_lsx.cpp index f28374bc0d3..5859efd1b61 100644 --- a/src/gui/painting/qdrawhelper_lsx.cpp +++ b/src/gui/painting/qdrawhelper_lsx.cpp @@ -588,6 +588,688 @@ void qt_scale_image_argb32_on_argb32_lsx(uchar *destPixels, int dbpl, } } +const uint *QT_FASTCALL fetchPixelsBPP24_lsx(uint *buffer, const uchar *src, int index, int count) +{ + const quint24 *s = reinterpret_cast<const quint24 *>(src); + for (int i = 0; i < count; ++i) + buffer[i] = s[index + i]; + return buffer; +} + +const uint * QT_FASTCALL qt_fetchUntransformed_888_lsx(uint *buffer, const Operator *, + const QSpanData *data, + int y, int x, int length) +{ + const uchar *line = data->texture.scanLine(y) + x * 3; + // from image/qimage_lsx.cpp + extern void QT_FASTCALL qt_convert_rgb888_to_rgb32_lsx(quint32 *dst, const uchar *src, int len); + qt_convert_rgb888_to_rgb32_lsx(buffer, line, length); + return buffer; +} + +void qt_memfill24_lsx(quint24 *dest, quint24 color, qsizetype count) +{ + // LCM of 12 and 16 bytes is 48 bytes (16 px) + quint32 v = color; + __m128i m = __lsx_vinsgr2vr_w(__lsx_vldi(0), v, 0); + quint24 *end = dest + count; + + constexpr uchar x = 2, y = 1, z = 0; + alignas(__m128i) static const uchar + shuffleMask[16 + 1] = { x, y, z, x, y, z, x, y, z, x, y, z, 
x, y, z, x, y }; + __m128i indexMask = (__m128i)(v16i8){2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}; + + __m128i mval1 = __lsx_vshuf_b(m, m, __lsx_vld(reinterpret_cast<const __m128i *>(shuffleMask), 0)); + __m128i mval2 = __lsx_vshuf_b(m, m, __lsx_vld(reinterpret_cast<const __m128i *>(shuffleMask + 1), 0)); + __m128i mval3 = __lsx_vshuf_b(mval2, mval1, indexMask); + + for ( ; dest + 16 <= end; dest += 16) { + __lsx_vst(mval1, reinterpret_cast<__m128i *>(dest) + 0, 0); + __lsx_vst(mval2, reinterpret_cast<__m128i *>(dest) + 1, 0); + __lsx_vst(mval3, reinterpret_cast<__m128i *>(dest) + 2, 0); + } + + if (count < 3) { + if (count > 1) + end[-2] = v; + if (count) + end[-1] = v; + return; + } + + // less than 16px/48B left + uchar *ptr = reinterpret_cast<uchar *>(dest); + uchar *ptr_end = reinterpret_cast<uchar *>(end); + qptrdiff left = ptr_end - ptr; + if (left >= 24) { + // 8px/24B or more left + __lsx_vst(mval1, reinterpret_cast<__m128i *>(ptr) + 0, 0); + __lsx_vstelm_d(mval2, reinterpret_cast<__m128i *>(ptr) + 1, 0, 0); + ptr += 24; + left -= 24; + } + + // less than 8px/24B left + + if (left >= 16) { + // but more than 5px/15B left + __lsx_vst(mval1, reinterpret_cast<__m128i *>(ptr) , 0); + } else if (left >= 8) { + // but more than 2px/6B left + __lsx_vstelm_d(mval1, reinterpret_cast<__m128i *>(ptr), 0, 0); + } + + if (left) { + // 1 or 2px left + // store 8 bytes ending with the right values (will overwrite a bit) + __lsx_vstelm_d(mval2, reinterpret_cast<__m128i *>(ptr_end - 8), 0, 0); + } +} + +void QT_FASTCALL rbSwap_888_lsx(uchar *dst, const uchar *src, int count) +{ + int i = 0; + const static __m128i shuffleMask1 = (__m128i)(v16i8){2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 15}; + const static __m128i shuffleMask2 = (__m128i)(v16i8){0, 1, 4, 3, 2, 7, 6, 5, 10, 9, 8, 13, 12, 11, 14, 15}; + const static __m128i shuffleMask3 = (__m128i)(v16i8){0, 3, 2, 1, 6, 5, 4, 9, 8, 7, 12, 11, 10, 15, 14, 13}; + + for (; i + 15 < count; i += 16) { + 
__m128i s1 = __lsx_vld(src, 0); + __m128i s2 = __lsx_vld((src + 16), 0); + __m128i s3 = __lsx_vld((src + 32), 0); + s1 = __lsx_vshuf_b(s1, s1, shuffleMask1); + s2 = __lsx_vshuf_b(s2, s2, shuffleMask2); + s3 = __lsx_vshuf_b(s3, s3, shuffleMask3); + __lsx_vst(s1, dst, 0); + __lsx_vst(s2, (dst + 16), 0); + __lsx_vst(s3, (dst + 32), 0); + + // Now fix the last four misplaced values + std::swap(dst[15], dst[17]); + std::swap(dst[30], dst[32]); + + src += 48; + dst += 48; + } + + if (src != dst) { + SIMD_EPILOGUE(i, count, 15) { + dst[0] = src[2]; + dst[1] = src[1]; + dst[2] = src[0]; + dst += 3; + src += 3; + } + } else { + SIMD_EPILOGUE(i, count, 15) { + std::swap(dst[0], dst[2]); + dst += 3; + } + } +} + +template<bool RGBA> +static void convertARGBToARGB32PM_lsx(uint *buffer, const uint *src, int count) +{ + int i = 0; + const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000); + const __m128i rgbaMask = (__m128i)(v16i8){2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15}; + const __m128i shuffleMask = (__m128i)(v16i8){6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15}; + const __m128i half = __lsx_vreplgr2vr_h(0x0080); + const __m128i zero = __lsx_vldi(0); + + for (; i < count - 3; i += 4) { + __m128i srcVector = __lsx_vld(&src[i], 0); + const v4i32 testz = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector, alphaMask)); + if (testz[0]!=0) { + const v4i32 testc = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector, alphaMask)); + if (testc[0]!=0) { + if (RGBA) + srcVector = __lsx_vshuf_b(zero, srcVector, rgbaMask); + __m128i src1 = __lsx_vilvl_b(zero, srcVector); + __m128i src2 = __lsx_vilvh_b(zero, srcVector); + __m128i alpha1 = __lsx_vshuf_b(zero, src1, shuffleMask); + __m128i alpha2 = __lsx_vshuf_b(zero, src2, shuffleMask); + src1 = __lsx_vmul_h(src1, alpha1); + src2 = __lsx_vmul_h(src2, alpha2); + src1 = __lsx_vadd_h(src1, __lsx_vsrli_h(src1, 8)); + src2 = __lsx_vadd_h(src2, __lsx_vsrli_h(src2, 8)); + src1 = __lsx_vadd_h(src1, half); + src2 = __lsx_vadd_h(src2, 
half); + src1 = __lsx_vsrli_h(src1, 8); + src2 = __lsx_vsrli_h(src2, 8); + __m128i blendMask = (__m128i)(v8i16){0, 1, 2, 11, 4, 5, 6, 15}; + src1 = __lsx_vshuf_h(blendMask, alpha1, src1); + src2 = __lsx_vshuf_h(blendMask, alpha2, src2); + src1 = __lsx_vmaxi_h(src1, 0); + src2 = __lsx_vmaxi_h(src2, 0); + srcVector = __lsx_vpickev_b(__lsx_vsat_hu(src2, 7), __lsx_vsat_hu(src1, 7)); + __lsx_vst(srcVector, &buffer[i], 0); + } else { + if (RGBA) + __lsx_vst(__lsx_vshuf_b(zero, srcVector, rgbaMask), &buffer[i], 0); + else if (buffer != src) + __lsx_vst(srcVector, &buffer[i], 0); + } + } else { + __lsx_vst(zero, &buffer[i], 0); + } + } + + SIMD_EPILOGUE(i, count, 3) { + uint v = qPremultiply(src[i]); + buffer[i] = RGBA ? RGBA2ARGB(v) : v; + } +} + +template<bool RGBA> +static void convertARGBToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count) +{ + int i = 0; + const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000); + const __m128i rgbaMask = (__m128i)(v16i8){2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15}; + const __m128i shuffleMask = (__m128i)(v16i8){6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15}; + const __m128i zero = __lsx_vldi(0); + + for (; i < count - 3; i += 4) { + __m128i srcVector = __lsx_vld(&src[i], 0); + const v4i32 testz = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector, alphaMask)); + if (testz[0]!=0) { + const v4i32 testc = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector, alphaMask)); + if (!RGBA) + srcVector = __lsx_vshuf_b(zero, srcVector, rgbaMask); + const __m128i src1 = __lsx_vilvl_b(srcVector, srcVector); + const __m128i src2 = __lsx_vilvh_b(srcVector, srcVector); + if (testc[0]!=0) { + __m128i alpha1 = __lsx_vshuf_b(zero, src1, shuffleMask); + __m128i alpha2 = __lsx_vshuf_b(zero, src2, shuffleMask); + __m128i dst1 = __lsx_vmuh_hu(src1, alpha1); + __m128i dst2 = __lsx_vmuh_hu(src2, alpha2); + // Map 0->0xfffe to 0->0xffff + dst1 = __lsx_vadd_h(dst1, __lsx_vsrli_h(dst1, 15)); + dst2 = __lsx_vadd_h(dst2, __lsx_vsrli_h(dst2, 15)); + 
// correct alpha value: + const __m128i blendMask = (__m128i)(v8i16){0, 1, 2, 11, 4, 5, 6, 15}; + dst1 = __lsx_vshuf_h(blendMask, src1, dst1); + dst2 = __lsx_vshuf_h(blendMask, src2, dst2); + __lsx_vst(dst1, &buffer[i], 0); + __lsx_vst(dst2, &buffer[i + 2], 0); + } else { + __lsx_vst(src1, &buffer[i], 0); + __lsx_vst(src2, &buffer[i + 2], 0); + } + } else { + __lsx_vst(zero, &buffer[i], 0); + __lsx_vst(zero, &buffer[i + 2], 0); + } + } + + SIMD_EPILOGUE(i, count, 3) { + const uint s = RGBA ? RGBA2ARGB(src[i]) : src[i]; + buffer[i] = QRgba64::fromArgb32(s).premultiplied(); + } +} + +template<bool RGBA, bool RGBx> +static inline void convertARGBFromARGB32PM_lsx(uint *buffer, const uint *src, int count) +{ + int i = 0; + const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000); + const __m128i rgbaMask = (__m128i)(v16i8){2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15}; + const __m128i zero = __lsx_vldi(0); + + for (; i < count - 3; i += 4) { + __m128i srcVector = __lsx_vld(&src[i], 0); + const v4i32 testz = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector, alphaMask)); + if (testz[0]!=0) { + const v4i32 testc = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector, alphaMask)); + if (testc[0]!=0) { + __m128i srcVectorAlpha = __lsx_vsrli_w(srcVector, 24); + if (RGBA) + srcVector = __lsx_vshuf_b(zero, srcVector, rgbaMask); + const __m128 a = __lsx_vffint_s_w(srcVectorAlpha); + const __m128 ia = reciprocal_mul_ps(a, 255.0f); + __m128i src1 = __lsx_vilvl_b(zero, srcVector); + __m128i src3 = __lsx_vilvh_b(zero, srcVector); + __m128i src2 = __lsx_vilvh_h(zero, src1); + __m128i src4 = __lsx_vilvh_h(zero, src3); + src1 = __lsx_vilvl_h(zero, src1); + src3 = __lsx_vilvl_h(zero, src3); + __m128 ia1 = (__m128)__lsx_vreplvei_w(ia, 0); + __m128 ia2 = (__m128)__lsx_vreplvei_w(ia, 1); + __m128 ia3 = (__m128)__lsx_vreplvei_w(ia, 2); + __m128 ia4 = (__m128)__lsx_vreplvei_w(ia, 3); + src1 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src1), ia1)); + src2 = 
__lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src2), ia2)); + src3 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src3), ia3)); + src4 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src4), ia4)); + src1 = __lsx_vpickev_h(__lsx_vsat_wu(src2, 15), __lsx_vsat_wu(src1, 15)); + src3 = __lsx_vpickev_h(__lsx_vsat_wu(src4, 15), __lsx_vsat_wu(src3, 15)); + src1 = __lsx_vmaxi_h(src1, 0); + src3 = __lsx_vmaxi_h(src3, 0); + src1 = __lsx_vpickev_b(__lsx_vsat_hu(src3, 7), __lsx_vsat_hu(src1, 7)); + // Handle potential alpha == 0 values: + __m128i srcVectorAlphaMask = __lsx_vseq_w(srcVectorAlpha, zero); + src1 = __lsx_vandn_v(srcVectorAlphaMask, src1); + // Fixup alpha values: + if (RGBx) + srcVector = __lsx_vor_v(src1, alphaMask); + else + srcVector = __lsx_vbitsel_v(src1, srcVector, __lsx_vslti_b(alphaMask, 0)); + __lsx_vst(srcVector, &buffer[i], 0); + } else { + if (RGBA) + __lsx_vst(__lsx_vshuf_b(zero, srcVector, rgbaMask), &buffer[i], 0); + else if (buffer != src) + __lsx_vst(srcVector, &buffer[i], 0); + } + } else { + if (RGBx) + __lsx_vst(alphaMask, &buffer[i], 0); + else + __lsx_vst(zero, &buffer[i], 0); + } + } + + SIMD_EPILOGUE(i, count, 3) { + uint v = qUnpremultiply_lsx(src[i]); + if (RGBx) + v = 0xff000000 | v; + if (RGBA) + v = ARGB2RGBA(v); + buffer[i] = v; + } +} + +template<bool RGBA> +static inline void convertARGBFromRGBA64PM_lsx(uint *buffer, const QRgba64 *src, int count) +{ + int i = 0; + const __m128i alphaMask = __lsx_vreplgr2vr_d(qint64(Q_UINT64_C(0xffff) << 48)); + const __m128i alphaMask32 = __lsx_vreplgr2vr_w(0xff000000); + const __m128i rgbaMask = (__m128i)(v16i8){2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15}; + const __m128i zero = __lsx_vldi(0); + + for (; i < count - 3; i += 4) { + __m128i srcVector1 = __lsx_vld(&src[i], 0); + __m128i srcVector2 = __lsx_vld(&src[i + 2], 0); + const v4i32 testz1 = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector1, alphaMask)); + bool transparent1 = testz1[0]==0; + const v4i32 testc1 = 
(v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector1, alphaMask)); + bool opaque1 = testc1[0]==0; + const v4i32 testz2 = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector2, alphaMask)); + bool transparent2 = testz2[0]==0; + const v4i32 testc2 = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector2, alphaMask)); + bool opaque2 = testc2[0]==0; + + if (!(transparent1 && transparent2)) { + if (!(opaque1 && opaque2)) { + __m128i srcVector1Alpha = __lsx_vsrli_d(srcVector1, 48); + __m128i srcVector2Alpha = __lsx_vsrli_d(srcVector2, 48); + __m128i srcVectorAlpha = __lsx_vpickev_h(__lsx_vsat_wu(srcVector2Alpha, 15), + __lsx_vsat_wu(srcVector1Alpha, 15)); + const __m128 a = __lsx_vffint_s_w(srcVectorAlpha); + // Convert srcVectorAlpha to final 8-bit alpha channel + srcVectorAlpha = __lsx_vadd_w(srcVectorAlpha, __lsx_vreplgr2vr_w(128)); + srcVectorAlpha = __lsx_vsub_w(srcVectorAlpha, __lsx_vsrli_w(srcVectorAlpha, 8)); + srcVectorAlpha = __lsx_vsrli_w(srcVectorAlpha, 8); + srcVectorAlpha = __lsx_vslli_w(srcVectorAlpha, 24); + const __m128 ia = reciprocal_mul_ps(a, 255.0f); + __m128i src1 = __lsx_vilvl_h(zero, srcVector1); + __m128i src2 = __lsx_vilvh_h(zero, srcVector1); + __m128i src3 = __lsx_vilvl_h(zero, srcVector2); + __m128i src4 = __lsx_vilvh_h(zero, srcVector2); + __m128 ia1 = (__m128)__lsx_vreplvei_w(ia, 0); + __m128 ia2 = (__m128)__lsx_vreplvei_w(ia, 1); + __m128 ia3 = (__m128)__lsx_vreplvei_w(ia, 2); + __m128 ia4 = (__m128)__lsx_vreplvei_w(ia, 3); + src1 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src1), ia1)); + src2 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src2), ia2)); + src3 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src3), ia3)); + src4 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src4), ia4)); + src1 = __lsx_vpickev_h(__lsx_vsat_wu(src2, 15), __lsx_vsat_wu(src1, 15)); + src3 = __lsx_vpickev_h(__lsx_vsat_wu(src4, 15), __lsx_vsat_wu(src3, 15)); + // Handle potential alpha == 0 values: + __m128i srcVector1AlphaMask = 
__lsx_vseq_d(srcVector1Alpha, zero); + __m128i srcVector2AlphaMask = __lsx_vseq_d(srcVector2Alpha, zero); + src1 = __lsx_vandn_v(srcVector1AlphaMask, src1); + src3 = __lsx_vandn_v(srcVector2AlphaMask, src3); + src1 = __lsx_vmaxi_h(src1, 0); + src3 = __lsx_vmaxi_h(src3, 0); + src1 = __lsx_vpickev_b(__lsx_vsat_hu(src3, 7), __lsx_vsat_hu(src1, 7)); + // Fixup alpha values: + src1 = __lsx_vbitsel_v(src1, srcVectorAlpha, __lsx_vslti_b(alphaMask32, 0)); + // Fix RGB order + if (!RGBA){ + src1 = __lsx_vshuf_b(zero, src1, rgbaMask);} + __lsx_vst(src1, (__m128i *)&buffer[i], 0); + } else { + __m128i src1 = __lsx_vilvl_h(zero, srcVector1); + __m128i src2 = __lsx_vilvh_h(zero, srcVector1); + __m128i src3 = __lsx_vilvl_h(zero, srcVector2); + __m128i src4 = __lsx_vilvh_h(zero, srcVector2); + src1 = __lsx_vadd_w(src1, __lsx_vreplgr2vr_w(128)); + src2 = __lsx_vadd_w(src2, __lsx_vreplgr2vr_w(128)); + src3 = __lsx_vadd_w(src3, __lsx_vreplgr2vr_w(128)); + src4 = __lsx_vadd_w(src4, __lsx_vreplgr2vr_w(128)); + src1 = __lsx_vsub_w(src1, __lsx_vsrli_w(src1, 8)); + src2 = __lsx_vsub_w(src2, __lsx_vsrli_w(src2, 8)); + src3 = __lsx_vsub_w(src3, __lsx_vsrli_w(src3, 8)); + src4 = __lsx_vsub_w(src4, __lsx_vsrli_w(src4, 8)); + src1 = __lsx_vsrli_w(src1, 8); + src2 = __lsx_vsrli_w(src2, 8); + src3 = __lsx_vsrli_w(src3, 8); + src4 = __lsx_vsrli_w(src4, 8); + src1 = __lsx_vpickev_h(__lsx_vsat_wu(src2, 15), __lsx_vsat_wu(src1, 15)); + src3 = __lsx_vpickev_h(__lsx_vsat_wu(src4, 15), __lsx_vsat_wu(src3, 15)); + src1 = __lsx_vmaxi_h(src1, 0); + src3 = __lsx_vmaxi_h(src3, 0); + src1 = __lsx_vpickev_b(__lsx_vsat_hu(src3, 7), __lsx_vsat_hu(src1, 15)); + if (!RGBA){ + src1 = __lsx_vshuf_b(zero, src1, rgbaMask);} + __lsx_vst(src1, &buffer[i], 0); + } + } else { + __lsx_vst(zero, &buffer[i], 0); + } + } + + SIMD_EPILOGUE(i, count, 3) { + buffer[i] = qConvertRgba64ToRgb32_lsx<RGBA ? 
PixelOrderRGB : PixelOrderBGR>(src[i]); + } +} + +template<bool mask> +static inline void convertRGBA64FromRGBA64PM_lsx(QRgba64 *buffer, const QRgba64 *src, int count) +{ + int i = 0; + const __m128i alphaMask = __lsx_vreplgr2vr_d(qint64(Q_UINT64_C(0xffff) << 48)); + const __m128i zero = __lsx_vldi(0); + + for (; i < count - 3; i += 4) { + __m128i srcVector1 = __lsx_vld(&src[i + 0], 0); + __m128i srcVector2 = __lsx_vld(&src[i + 2], 0); + const v4i32 testz1 = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector1, alphaMask)); + bool transparent1 = testz1[0]==0; + const v4i32 testc1 = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector1, alphaMask)); + bool opaque1 = testc1[0]==0; + const v4i32 testz2 = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector2, alphaMask)); + bool transparent2 = testz2[0]==0; + const v4i32 testc2 = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector2, alphaMask)); + bool opaque2 = testc2[0]==0; + + if (!(transparent1 && transparent2)) { + if (!(opaque1 && opaque2)) { + __m128i srcVector1Alpha = __lsx_vsrli_d(srcVector1, 48); + __m128i srcVector2Alpha = __lsx_vsrli_d(srcVector2, 48); + __m128i srcVectorAlpha = __lsx_vpickev_h(__lsx_vsat_wu(srcVector2Alpha, 15), + __lsx_vsat_wu(srcVector1Alpha, 15)); + const __m128 a = __lsx_vffint_s_w(srcVectorAlpha); + const __m128 ia = reciprocal_mul_ps(a, 65535.0f); + __m128i src1 = __lsx_vilvl_h(zero, srcVector1); + __m128i src2 = __lsx_vilvh_h(zero, srcVector1); + __m128i src3 = __lsx_vilvl_h(zero, srcVector2); + __m128i src4 = __lsx_vilvh_h(zero, srcVector2); + __m128 ia1 = (__m128)__lsx_vreplvei_w(ia, 0); + __m128 ia2 = (__m128)__lsx_vreplvei_w(ia, 1); + __m128 ia3 = (__m128)__lsx_vreplvei_w(ia, 2); + __m128 ia4 = (__m128)__lsx_vreplvei_w(ia, 3); + src1 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src1), ia1)); + src2 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src2), ia2)); + src3 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src3), ia3)); + src4 = 
__lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src4), ia4)); + src1 = __lsx_vpickev_h(__lsx_vsat_wu(src2, 15), __lsx_vsat_wu(src1, 15)); + src3 = __lsx_vpickev_h(__lsx_vsat_wu(src4, 15), __lsx_vsat_wu(src3, 15)); + // Handle potential alpha == 0 values: + __m128i srcVector1AlphaMask = __lsx_vseq_d(srcVector1Alpha, zero); + __m128i srcVector2AlphaMask = __lsx_vseq_d(srcVector2Alpha, zero); + src1 = __lsx_vandn_v(srcVector1AlphaMask, src1); + src3 = __lsx_vandn_v(srcVector2AlphaMask, src3); + // Fixup alpha values: + if (mask) { + src1 = __lsx_vor_v(src1, alphaMask); + src3 = __lsx_vor_v(src3, alphaMask); + } else { + src1 = __lsx_vbitsel_v(src1, srcVector1, __lsx_vslti_b(alphaMask, 0)); + src3 = __lsx_vbitsel_v(src3, srcVector2, __lsx_vslti_b(alphaMask, 0)); + } + __lsx_vst(src1, &buffer[i + 0], 0); + __lsx_vst(src3, &buffer[i + 2], 0); + } else { + if (mask) { + srcVector1 = __lsx_vor_v(srcVector1, alphaMask); + srcVector2 = __lsx_vor_v(srcVector2, alphaMask); + } + if (mask || src != buffer) { + __lsx_vst(srcVector1, &buffer[i + 0], 0); + __lsx_vst(srcVector2, &buffer[i + 2], 0); + } + } + } else { + __lsx_vst(zero, &buffer[i + 0], 0); + __lsx_vst(zero, &buffer[i + 2], 0); + } + } + + SIMD_EPILOGUE(i, count, 3) { + QRgba64 v = src[i].unpremultiplied(); + if (mask) + v.setAlpha(65535); + buffer[i] = v; + } +} + +void QT_FASTCALL convertARGB32ToARGB32PM_lsx(uint *buffer, int count, const QList<QRgb> *) +{ + convertARGBToARGB32PM_lsx<false>(buffer, buffer, count); +} + +void QT_FASTCALL convertRGBA8888ToARGB32PM_lsx(uint *buffer, int count, const QList<QRgb> *) +{ + convertARGBToARGB32PM_lsx<true>(buffer, buffer, count); +} + +const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count, + const QList<QRgb> *, QDitherInfo *) +{ + convertARGBToRGBA64PM_lsx<false>(buffer, src, count); + return buffer; +} + +const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count, + const 
QList<QRgb> *, QDitherInfo *) +{ + convertARGBToRGBA64PM_lsx<true>(buffer, src, count); + return buffer; +} + +const uint *QT_FASTCALL fetchARGB32ToARGB32PM_lsx(uint *buffer, const uchar *src, int index, int count, + const QList<QRgb> *, QDitherInfo *) +{ + convertARGBToARGB32PM_lsx<false>(buffer, reinterpret_cast<const uint *>(src) + index, count); + return buffer; +} + +const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_lsx(uint *buffer, const uchar *src, int index, int count, + const QList<QRgb> *, QDitherInfo *) +{ + convertARGBToARGB32PM_lsx<true>(buffer, reinterpret_cast<const uint *>(src) + index, count); + return buffer; +} + +const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_lsx(QRgba64 *buffer, const uchar *src, int index, int count, + const QList<QRgb> *, QDitherInfo *) +{ + convertARGBToRGBA64PM_lsx<false>(buffer, reinterpret_cast<const uint *>(src) + index, count); + return buffer; +} + +const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_lsx(QRgba64 *buffer, const uchar *src, int index, int count, + const QList<QRgb> *, QDitherInfo *) +{ + convertARGBToRGBA64PM_lsx<true>(buffer, reinterpret_cast<const uint *>(src) + index, count); + return buffer; +} + +void QT_FASTCALL storeRGB32FromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count, + const QList<QRgb> *, QDitherInfo *) +{ + uint *d = reinterpret_cast<uint *>(dest) + index; + convertARGBFromARGB32PM_lsx<false,true>(d, src, count); +} + +void QT_FASTCALL storeARGB32FromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count, + const QList<QRgb> *, QDitherInfo *) +{ + uint *d = reinterpret_cast<uint *>(dest) + index; + convertARGBFromARGB32PM_lsx<false,false>(d, src, count); +} + +void QT_FASTCALL storeRGBA8888FromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count, + const QList<QRgb> *, QDitherInfo *) +{ + uint *d = reinterpret_cast<uint *>(dest) + index; + convertARGBFromARGB32PM_lsx<true,false>(d, src, count); +} + +void QT_FASTCALL storeRGBXFromARGB32PM_lsx(uchar 
*dest, const uint *src, int index, int count, + const QList<QRgb> *, QDitherInfo *) +{ + uint *d = reinterpret_cast<uint *>(dest) + index; + convertARGBFromARGB32PM_lsx<true,true>(d, src, count); +} + +template<QtPixelOrder PixelOrder> +void QT_FASTCALL storeA2RGB30PMFromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count, + const QList<QRgb> *, QDitherInfo *) +{ + uint *d = reinterpret_cast<uint *>(dest) + index; + for (int i = 0; i < count; ++i) + d[i] = qConvertArgb32ToA2rgb30_lsx<PixelOrder>(src[i]); +} + +#if QT_CONFIG(raster_64bit) +void QT_FASTCALL destStore64ARGB32_lsx(QRasterBuffer *rasterBuffer, int x, + int y, const QRgba64 *buffer, int length) +{ + uint *dest = (uint*)rasterBuffer->scanLine(y) + x; + convertARGBFromRGBA64PM_lsx<false>(dest, buffer, length); +} + +void QT_FASTCALL destStore64RGBA8888_lsx(QRasterBuffer *rasterBuffer, int x, + int y, const QRgba64 *buffer, int length) +{ + uint *dest = (uint*)rasterBuffer->scanLine(y) + x; + convertARGBFromRGBA64PM_lsx<true>(dest, buffer, length); +} +#endif + +void QT_FASTCALL storeARGB32FromRGBA64PM_lsx(uchar *dest, const QRgba64 *src, int index, int count, + const QList<QRgb> *, QDitherInfo *) +{ + uint *d = (uint*)dest + index; + convertARGBFromRGBA64PM_lsx<false>(d, src, count); +} + +void QT_FASTCALL storeRGBA8888FromRGBA64PM_lsx(uchar *dest, const QRgba64 *src, int index, int count, + const QList<QRgb> *, QDitherInfo *) +{ + uint *d = (uint*)dest + index; + convertARGBFromRGBA64PM_lsx<true>(d, src, count); +} + +template +void QT_FASTCALL storeA2RGB30PMFromARGB32PM_lsx<PixelOrderBGR>(uchar *dest, const uint *src, int index, int count, + const QList<QRgb> *, QDitherInfo *); +template +void QT_FASTCALL storeA2RGB30PMFromARGB32PM_lsx<PixelOrderRGB>(uchar *dest, const uint *src, int index, int count, + const QList<QRgb> *, QDitherInfo *); + +void QT_FASTCALL storeRGBA64FromRGBA64PM_lsx(uchar *dest, const QRgba64 *src, int index, int count, + const QList<QRgb> *, QDitherInfo *) +{ + QRgba64 
*d = (QRgba64 *)dest + index; + convertRGBA64FromRGBA64PM_lsx<false>(d, src, count); +} + +void QT_FASTCALL storeRGBx64FromRGBA64PM_lsx(uchar *dest, const QRgba64 *src, int index, int count, + const QList<QRgb> *, QDitherInfo *) +{ + QRgba64 *d = (QRgba64 *)dest + index; + convertRGBA64FromRGBA64PM_lsx<true>(d, src, count); +} + +#if QT_CONFIG(raster_fp) +const QRgbaFloat32 *QT_FASTCALL fetchRGBA32FToRGBA32F_lsx(QRgbaFloat32 *buffer, const uchar *src, + int index, int count, + const QList<QRgb> *, QDitherInfo *) +{ + const QRgbaFloat32 *s = reinterpret_cast<const QRgbaFloat32 *>(src) + index; + for (int i = 0; i < count; ++i) { + __m128 vsf = (__m128)__lsx_vld(reinterpret_cast<const float *>(s + i), 0); + __m128 vsa = (__m128)__lsx_vreplvei_w(vsf, 3); + vsf = __lsx_vfmul_s(vsf, vsa); + vsf = (__m128)__lsx_vextrins_w(vsf, vsa, 0x30); + __lsx_vst(vsf, reinterpret_cast<float *>(buffer + i), 0); + } + return buffer; +} + +void QT_FASTCALL storeRGBX32FFromRGBA32F_lsx(uchar *dest, const QRgbaFloat32 *src, + int index, int count, + const QList<QRgb> *, QDitherInfo *) +{ + QRgbaFloat32 *d = reinterpret_cast<QRgbaFloat32 *>(dest) + index; + const __m128 zero = (__m128)(v4f32){0.0f, 0.0f, 0.0f, 1.0f}; + for (int i = 0; i < count; ++i) { + __m128 vsf = (__m128)__lsx_vld(reinterpret_cast<const float *>(src + i), 0); + const __m128 vsa = (__m128)__lsx_vreplvei_w(vsf, 3); + FloatInt a; + a.i = __lsx_vpickve2gr_w(vsa, 0); + if (a.f == 1.0f) + { } + else if (a.f == 0.0f) + vsf = zero; + else { + __m128 vsr = __lsx_vfrecip_s(vsa); + vsr = __lsx_vfsub_s(__lsx_vfadd_s(vsr, vsr), + __lsx_vfmul_s(vsr, __lsx_vfmul_s(vsr, vsa))); + vsf = __lsx_vfmul_s(vsf, vsr); + FloatInt b = {.f = 1.0f}; + vsf = (__m128)__lsx_vinsgr2vr_w(vsf, b.i, 3); + } + __lsx_vst(vsf, reinterpret_cast<float *>(d + i), 0); + } +} + +void QT_FASTCALL storeRGBA32FFromRGBA32F_lsx(uchar *dest, const QRgbaFloat32 *src, + int index, int count, + const QList<QRgb> *, QDitherInfo *) +{ + QRgbaFloat32 *d = 
reinterpret_cast<QRgbaFloat32 *>(dest) + index; + const __m128 zero = (__m128)__lsx_vldi(0); + for (int i = 0; i < count; ++i) { + __m128 vsf = (__m128)__lsx_vld(reinterpret_cast<const float *>(src + i), 0); + const __m128 vsa = (__m128)__lsx_vreplvei_w(vsf, 3); + FloatInt a; + a.i = __lsx_vpickve2gr_w(vsa, 0); + if (a.f == 1.0f) + { } + else if (a.f == 0.0f) + vsf = zero; + else { + __m128 vsr = __lsx_vfrecip_s(vsa); + vsr = __lsx_vfsub_s(__lsx_vfadd_s(vsr, vsr), + __lsx_vfmul_s(vsr, __lsx_vfmul_s(vsr, vsa))); + FloatInt b = {.f = 1.0f}; + vsr = (__m128)__lsx_vinsgr2vr_w(vsr, b.i, 3); + vsf = __lsx_vfmul_s(vsf, vsr); + } + __lsx_vst(vsf, reinterpret_cast<float *>(d + i), 0); + } +} +#endif + QT_END_NAMESPACE #endif // QT_COMPILER_SUPPORTS_LSX diff --git a/src/gui/painting/qdrawingprimitive_lsx_p.h b/src/gui/painting/qdrawingprimitive_lsx_p.h index 06e97139df6..f4d34e2ec2f 100644 --- a/src/gui/painting/qdrawingprimitive_lsx_p.h +++ b/src/gui/painting/qdrawingprimitive_lsx_p.h @@ -224,6 +224,92 @@ static __m128 __lsx_vreplfr2vr_s(float val) return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i); } +Q_ALWAYS_INLINE __m128 Q_DECL_VECTORCALL reciprocal_mul_ps(const __m128 a, float mul) +{ + __m128 ia = __lsx_vfrecip_s(a); // Approximate 1/a + // Improve precision of ia using Newton-Raphson + ia = __lsx_vfsub_s(__lsx_vfadd_s(ia, ia), __lsx_vfmul_s(ia, __lsx_vfmul_s(ia, a))); + ia = __lsx_vfmul_s(ia, __lsx_vreplfr2vr_s(mul)); + return ia; +} + +inline QRgb qUnpremultiply_lsx(QRgb p) +{ + const uint alpha = qAlpha(p); + if (alpha == 255) + return p; + if (alpha == 0) + return 0; + const __m128 va = __lsx_vffint_s_w(__lsx_vreplgr2vr_w(alpha)); + __m128 via = reciprocal_mul_ps(va, 255.0f); // Approximate 1/a + const __m128i shuffleMask = (__m128i)(v16i8){0,16,16,16,1,16,16,16,2,16,16,16,3,16,16,16}; + __m128i vl = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(p), shuffleMask); + vl = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(vl), via)); + vl = __lsx_vmaxi_w(vl, 0); + vl 
= __lsx_vpickev_h(__lsx_vsat_wu(vl, 15), __lsx_vsat_wu(vl, 15)); + vl = __lsx_vinsgr2vr_h(vl, alpha, 3); + vl = __lsx_vpickev_b(__lsx_vsat_hu(vl, 7), __lsx_vsat_hu(vl, 7)); + return __lsx_vpickve2gr_w(vl, 0); +} + +template<enum QtPixelOrder PixelOrder> +inline uint qConvertArgb32ToA2rgb30_lsx(QRgb p) +{ + const uint alpha = qAlpha(p); + if (alpha == 255) + return qConvertRgb32ToRgb30<PixelOrder>(p); + if (alpha == 0) + return 0; + Q_CONSTEXPR float mult = 1023.0f / (255 >> 6); + const uint newalpha = (alpha >> 6); + const __m128 va = __lsx_vffint_s_w(__lsx_vreplgr2vr_w(alpha)); + __m128 via = reciprocal_mul_ps(va, mult * newalpha); + const __m128i shuffleMask = (__m128i)(v16i8){0,16,16,16,1,16,16,16,2,16,16,16,3,16,16,16}; + __m128i vl = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(p), shuffleMask); + vl = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(vl), via)); + vl = __lsx_vmaxi_w(vl, 0); + vl = __lsx_vpickev_h(__lsx_vsat_wu(vl, 15), __lsx_vsat_wu(vl, 15)); + uint rgb30 = (newalpha << 30); + rgb30 |= ((uint)__lsx_vpickve2gr_h(vl, 1)) << 10; + if (PixelOrder == PixelOrderRGB) { + rgb30 |= ((uint)__lsx_vpickve2gr_h(vl, 2)) << 20; + rgb30 |= ((uint)__lsx_vpickve2gr_h(vl, 0)); + } else { + rgb30 |= ((uint)__lsx_vpickve2gr_h(vl, 0)) << 20; + rgb30 |= ((uint)__lsx_vpickve2gr_h(vl, 2)); + } + return rgb30; +} + +template<enum QtPixelOrder PixelOrder> +inline uint qConvertRgba64ToRgb32_lsx(QRgba64 p) +{ + if (p.isTransparent()) + return 0; + __m128i vl = __lsx_vilvl_d(__lsx_vldi(0), __lsx_vldrepl_d(&p, 0)); + if (!p.isOpaque()) { + const __m128 va = __lsx_vffint_s_w(__lsx_vreplgr2vr_w(p.alpha())); + __m128 via = reciprocal_mul_ps(va, 65535.0f); + vl = __lsx_vilvl_h(__lsx_vldi(0), vl); + vl = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(vl) , via)); + vl = __lsx_vmaxi_w(vl, 0); + vl = __lsx_vpickev_h(__lsx_vsat_wu(vl, 15), __lsx_vsat_wu(vl, 15)); + vl = __lsx_vinsgr2vr_h(vl, p.alpha(), 3); + } + if (PixelOrder == PixelOrderBGR){ + const __m128i shuffleMask 
= (__m128i)(v8i16){2, 1, 0, 3, 4, 5, 6, 7}; + vl = __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), vl); + } + vl = __lsx_vilvl_h(__lsx_vldi(0), vl); + vl = __lsx_vadd_w(vl, __lsx_vreplgr2vr_w(128)); + vl = __lsx_vsub_w(vl, __lsx_vsrli_w(vl, 8)); + vl = __lsx_vsrli_w(vl, 8); + vl = __lsx_vpickev_h(__lsx_vsat_w(vl, 15), __lsx_vsat_w(vl, 15)); + __m128i tmp = __lsx_vmaxi_h(vl, 0); + vl = __lsx_vpickev_b(__lsx_vsat_hu(tmp, 7), __lsx_vsat_hu(tmp, 7)); + return __lsx_vpickve2gr_w(vl, 0); +} + QT_END_NAMESPACE #endif // __loongarch_sx diff --git a/src/gui/painting/qpixellayout.cpp b/src/gui/painting/qpixellayout.cpp index dab337260d6..cdd18aa711f 100644 --- a/src/gui/painting/qpixellayout.cpp +++ b/src/gui/painting/qpixellayout.cpp @@ -313,6 +313,9 @@ static void QT_FASTCALL convertToRGB32(uint *buffer, int count, const QList<QRgb #if defined(__SSE2__) && !defined(__SSSE3__) && QT_COMPILER_SUPPORTS_SSSE3 extern const uint * QT_FASTCALL fetchPixelsBPP24_ssse3(uint *dest, const uchar*src, int index, int count); +#elif defined QT_COMPILER_SUPPORTS_LSX +// from qdrawhelper_lsx.cpp +extern const uint * QT_FASTCALL fetchPixelsBPP24_lsx(uint *dest, const uchar *src, int index, int count); #endif template<QImage::Format Format> @@ -328,6 +331,12 @@ static const uint *QT_FASTCALL fetchRGBToRGB32(uint *buffer, const uchar *src, i convertToRGB32<Format>(buffer, count, nullptr); return buffer; } +#elif defined QT_COMPILER_SUPPORTS_LSX + if (BPP == QPixelLayout::BPP24 && qCpuHasFeature(LSX)) { + fetchPixelsBPP24_lsx(buffer, src, index, count); + convertToRGB32<Format>(buffer, count, nullptr); + return buffer; + } #endif for (int i = 0; i < count; ++i) buffer[i] = convertPixelToRGB32<Format>(fetchPixel<BPP>(src, index + i)); @@ -434,6 +443,12 @@ static const uint *QT_FASTCALL fetchARGBPMToARGB32PM(uint *buffer, const uchar * convertARGBPMToARGB32PM<Format>(buffer, count, nullptr); return buffer; } +#elif defined QT_COMPILER_SUPPORTS_LSX + if (BPP == QPixelLayout::BPP24 && 
qCpuHasFeature(LSX)) { + fetchPixelsBPP24_lsx(buffer, src, index, count); + convertARGBPMToARGB32PM<Format>(buffer, count, nullptr); + return buffer; + } #endif for (int i = 0; i < count; ++i) buffer[i] = convertPixelToARGB32PM<Format>(fetchPixel<BPP>(src, index + i)); |