summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChen Zhanwang <[email protected]>2024-06-21 17:05:49 +0800
committerVolker Hilsheimer <[email protected]>2024-10-22 18:12:15 +0200
commitd511a68684c2f76b48c696c5f8a04c22ef2d00fe (patch)
tree22959c8691e6484274b0b71cbe1cc0af0e182b1b
parent73ce5a940ab4110e1140bf1ed0a71d34448a4be0 (diff)
Complete drawhelper Func with LSX
List of optimized implementations using LSX: - qt_blend_argb32_on_argb32 - qt_blend_rgb32_on_rgb32 - comp_func_SourceOver - comp_func_Plus - comp_func_Source - comp_func_solid_Source - comp_func_solid_SourceOver - qt_memfill64 - qt_memfill32 - qt_bitmapblit32 - qt_bitmapblit16 - qt_scale_image_argb32_on_argb32 - convert_RGB888_to_RGB32 - qt_qimageScaleAARGBA_up_x_down_y - qt_qimageScaleAARGBA_down_x_up_y - qt_qimageScaleAARGBA_down_xy All of the above functions have passed the tests under tests/auto/gui. Change-Id: I7ae6169305b81bdf7fb704619453c505f8bb960f Reviewed-by: Volker Hilsheimer <[email protected]>
-rw-r--r--src/gui/CMakeLists.txt9
-rw-r--r--src/gui/image/qimage_conversions.cpp12
-rw-r--r--src/gui/image/qimage_lsx.cpp115
-rw-r--r--src/gui/image/qimage_p.h2
-rw-r--r--src/gui/painting/qdrawhelper.cpp72
-rw-r--r--src/gui/painting/qdrawhelper_loongarch64_p.h48
-rw-r--r--src/gui/painting/qdrawhelper_lsx.cpp593
-rw-r--r--src/gui/painting/qdrawhelper_p.h2
-rw-r--r--src/gui/painting/qdrawingprimitive_lsx_p.h231
-rw-r--r--src/gui/painting/qimagescale.cpp36
-rw-r--r--src/gui/painting/qimagescale_lsx.cpp233
11 files changed, 1347 insertions, 6 deletions
diff --git a/src/gui/CMakeLists.txt b/src/gui/CMakeLists.txt
index d7a247fa45c..d6f49cd2bd7 100644
--- a/src/gui/CMakeLists.txt
+++ b/src/gui/CMakeLists.txt
@@ -183,6 +183,8 @@ qt_internal_add_module(Gui
painting/qdrawhelper_p.h
painting/qdrawhelper_x86_p.h
painting/qdrawingprimitive_sse2_p.h
+ painting/qdrawhelper_loongarch64_p.h
+ painting/qdrawingprimitive_lsx_p.h
painting/qemulationpaintengine.cpp painting/qemulationpaintengine_p.h
painting/qfixed_p.h
painting/qgrayraster.c painting/qgrayraster_p.h
@@ -655,6 +657,13 @@ qt_internal_add_simd_part(Gui SIMD neon
painting/qimagescale_neon.cpp
)
+qt_internal_add_simd_part(Gui SIMD lsx
+ SOURCES
+ image/qimage_lsx.cpp
+ painting/qdrawhelper_lsx.cpp
+ painting/qimagescale_lsx.cpp
+)
+
if(NOT ANDROID)
qt_internal_add_simd_part(Gui SIMD mips_dsp
SOURCES
diff --git a/src/gui/image/qimage_conversions.cpp b/src/gui/image/qimage_conversions.cpp
index 09caf558e06..ec75b8c386e 100644
--- a/src/gui/image/qimage_conversions.cpp
+++ b/src/gui/image/qimage_conversions.cpp
@@ -2754,6 +2754,18 @@ static void qInitImageConversions()
}
#endif
+#if defined(QT_COMPILER_SUPPORTS_LSX)
+ if (qCpuHasFeature(LSX)) {
+ extern void convert_RGB888_to_RGB32_lsx(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags);
+ qimage_converter_map[QImage::Format_RGB888][QImage::Format_RGB32] = convert_RGB888_to_RGB32_lsx;
+ qimage_converter_map[QImage::Format_RGB888][QImage::Format_ARGB32] = convert_RGB888_to_RGB32_lsx;
+ qimage_converter_map[QImage::Format_RGB888][QImage::Format_ARGB32_Premultiplied] = convert_RGB888_to_RGB32_lsx;
+ qimage_converter_map[QImage::Format_BGR888][QImage::Format_RGBX8888] = convert_RGB888_to_RGB32_lsx;
+ qimage_converter_map[QImage::Format_BGR888][QImage::Format_RGBA8888] = convert_RGB888_to_RGB32_lsx;
+ qimage_converter_map[QImage::Format_BGR888][QImage::Format_RGBA8888_Premultiplied] = convert_RGB888_to_RGB32_lsx;
+ }
+#endif
+
#if defined(__ARM_NEON__)
extern void convert_RGB888_to_RGB32_neon(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags);
qimage_converter_map[QImage::Format_RGB888][QImage::Format_RGB32] = convert_RGB888_to_RGB32_neon;
diff --git a/src/gui/image/qimage_lsx.cpp b/src/gui/image/qimage_lsx.cpp
new file mode 100644
index 00000000000..e99a6087d02
--- /dev/null
+++ b/src/gui/image/qimage_lsx.cpp
@@ -0,0 +1,115 @@
+// Copyright (C) 2016 The Qt Company Ltd.
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
+
+#include <qimage.h>
+#include <private/qimage_p.h>
+#include <private/qsimd_p.h>
+
+#ifdef QT_COMPILER_SUPPORTS_LSX
+
+QT_BEGIN_NAMESPACE
+
// Convert a scanline of RGB888 (src) to RGB32 (dst)
// src must be at least len * 3 bytes
// dst must be at least len * 4 bytes
Q_GUI_EXPORT void QT_FASTCALL qt_convert_rgb888_to_rgb32_lsx(quint32 *dst, const uchar *src, int len)
{
    int i = 0;

    // Prologue, align dst to 16 bytes.
    ALIGNMENT_PROLOGUE_16BYTES(dst, i, len) {
        dst[i] = qRgb(src[0], src[1], src[2]);
        src += 3;
    }

    // __lsx_vshuf_b(a, b, mask) picks bytes by index: 0-15 select from b,
    // 16-31 select from a.  Index 16 in the masks below therefore reads
    // byte 0 of the alpha operand (0xff), producing an opaque alpha channel.

    // Mask the 4 first colors of the RGB888 vector
    const __m128i shuffleMask = (__m128i)(v16i8){2, 1, 0, 16, 5, 4, 3, 16,
                                                 8, 7, 6, 16, 11, 10, 9, 16};
    // Mask the 4 last colors of a RGB888 vector with an offset of 1 (so the last 3 bytes are RGB)
    const __m128i shuffleMaskEnd = (__m128i)(v16i8){6, 5, 4, 16, 9, 8, 7, 16,
                                                    12, 11, 10, 16, 15, 14, 13, 16};
    // Mask to have alpha = 0xff
    const __m128i alphaMask = __lsx_vreplgr2vr_b(0xff);

    // Mask to concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by 12 bytes
    const __m128i indexMask1 = (__m128i)(v16i8){12, 13, 14, 15, 16, 17, 18, 19,
                                                20, 21, 22, 23, 24, 25, 26, 27};

    // Mask to concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by 8 bytes
    const __m128i indexMask2 = (__m128i)(v16i8){8, 9, 10, 11, 12, 13, 14, 15,
                                                16, 17, 18, 19, 20, 21, 22, 23};

    const __m128i *inVectorPtr = (const __m128i *)src;
    __m128i *dstVectorPtr = (__m128i *)(dst + i);

    for (; i < (len - 15); i += 16) { // one iteration in the loop converts 16 pixels
        /*
         RGB888 has 5 pixels per vector, + 1 byte from the next pixel. The idea here is
         to load vectors of RGB888 and use palignr to select a vector out of two vectors.

         After 3 loads of RGB888 and 3 stores of RGB32, we have 4 pixels left in the last
         vector of RGB888, we can mask it directly to get a last store of RGB32. After that,
         the first next byte is a R, and we can loop for the next 16 pixels.

         The conversion itself is done with a byte permutation (vshuf_b).
        */
        __m128i firstSrcVector = __lsx_vld(inVectorPtr, 0);
        __m128i outputVector = __lsx_vshuf_b(alphaMask, firstSrcVector, shuffleMask);
        __lsx_vst(outputVector, dstVectorPtr, 0);
        ++inVectorPtr;
        ++dstVectorPtr;

        // There are 4 unused bytes left in srcVector, we need to load the next 16 bytes
        __m128i secondSrcVector = __lsx_vld(inVectorPtr, 0);
        __m128i srcVector = __lsx_vshuf_b(secondSrcVector, firstSrcVector, indexMask1);
        outputVector = __lsx_vshuf_b(alphaMask, srcVector, shuffleMask);
        __lsx_vst(outputVector, dstVectorPtr, 0);
        ++inVectorPtr;
        ++dstVectorPtr;
        firstSrcVector = secondSrcVector;

        // We now have 8 unused bytes left in firstSrcVector
        secondSrcVector = __lsx_vld(inVectorPtr, 0);
        srcVector = __lsx_vshuf_b(secondSrcVector, firstSrcVector, indexMask2);
        outputVector = __lsx_vshuf_b(alphaMask, srcVector, shuffleMask);
        __lsx_vst(outputVector, dstVectorPtr, 0);
        ++inVectorPtr;
        ++dstVectorPtr;

        // There are now 12 unused bytes in firstSrcVector.
        // We can mask them directly, almost there.
        outputVector = __lsx_vshuf_b(alphaMask, secondSrcVector, shuffleMaskEnd);
        __lsx_vst(outputVector, dstVectorPtr, 0);
        ++dstVectorPtr;
    }
    // Re-derive the scalar source pointer: the vector loop consumed exactly
    // 48 bytes (16 pixels * 3 bytes) per iteration, so inVectorPtr is back
    // on a pixel boundary.
    src = (const uchar *)inVectorPtr;

    // Epilogue: convert the remaining 0-15 pixels one at a time.
    SIMD_EPILOGUE(i, len, 15) {
        dst[i] = qRgb(src[0], src[1], src[2]);
        src += 3;
    }
}
+
+void convert_RGB888_to_RGB32_lsx(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags)
+{
+ Q_ASSERT(src->format == QImage::Format_RGB888 || src->format == QImage::Format_BGR888);
+ if (src->format == QImage::Format_BGR888)
+ Q_ASSERT(dest->format == QImage::Format_RGBX8888 || dest->format == QImage::Format_RGBA8888 || dest->format == QImage::Format_RGBA8888_Premultiplied);
+ else
+ Q_ASSERT(dest->format == QImage::Format_RGB32 || dest->format == QImage::Format_ARGB32 || dest->format == QImage::Format_ARGB32_Premultiplied);
+ Q_ASSERT(src->width == dest->width);
+ Q_ASSERT(src->height == dest->height);
+
+ const uchar *src_data = (uchar *) src->data;
+ quint32 *dest_data = (quint32 *) dest->data;
+
+ for (int i = 0; i < src->height; ++i) {
+ qt_convert_rgb888_to_rgb32_lsx(dest_data, src_data, src->width);
+ src_data += src->bytes_per_line;
+ dest_data = (quint32 *)((uchar*)dest_data + dest->bytes_per_line);
+ }
+}
+
+QT_END_NAMESPACE
+
+#endif // QT_COMPILER_SUPPORTS_LSX
diff --git a/src/gui/image/qimage_p.h b/src/gui/image/qimage_p.h
index cdae61698b3..65eafc20b8d 100644
--- a/src/gui/image/qimage_p.h
+++ b/src/gui/image/qimage_p.h
@@ -560,7 +560,7 @@ inline QImage::Format qt_opaqueVersionForPainting(QImage::Format format)
inline QImage::Format qt_alphaVersionForPainting(QImage::Format format)
{
QImage::Format toFormat = qt_alphaVersion(format);
-#if defined(__ARM_NEON__) || defined(__SSE2__)
+#if defined(__ARM_NEON__) || defined(__SSE2__) || defined(QT_COMPILER_SUPPORTS_LSX)
// If we are switching depth anyway and we have optimized ARGB32PM routines, upgrade to that.
if (qt_depthForFormat(format) != qt_depthForFormat(toFormat) && qt_depthForFormat(toFormat) <= 32)
toFormat = QImage::Format_ARGB32_Premultiplied;
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index 376753879b5..2bfca562249 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -12,6 +12,8 @@
#include <private/qdrawhelper_p.h>
#include <private/qdrawhelper_x86_p.h>
#include <private/qdrawingprimitive_sse2_p.h>
+#include <private/qdrawhelper_loongarch64_p.h>
+#include <private/qdrawingprimitive_lsx_p.h>
#include <private/qdrawhelper_neon_p.h>
#if defined(QT_COMPILER_SUPPORTS_MIPS_DSP) || defined(QT_COMPILER_SUPPORTS_MIPS_DSPR2)
#include <private/qdrawhelper_mips_dsp_p.h>
@@ -4971,7 +4973,7 @@ void qBlendTexture(int count, const QT_FT_Span *spans, void *userData)
case QImage::Format_RGB16:
proc = processTextureSpansRGB16[blendType];
break;
-#if defined(__SSE2__) || defined(__ARM_NEON__) || (Q_PROCESSOR_WORDSIZE == 8)
+#if defined(__SSE2__) || defined(__ARM_NEON__) || defined(QT_COMPILER_SUPPORTS_LSX) || (Q_PROCESSOR_WORDSIZE == 8)
case QImage::Format_ARGB32:
case QImage::Format_RGBA8888:
#endif
@@ -5113,7 +5115,7 @@ void qBlendGradient(int count, const QT_FT_Span *spans, void *userData)
if (isVerticalGradient && blend_vertical_gradient_argb(count, spans, userData))
return;
return blend_src_generic(count, spans, userData);
-#if defined(__SSE2__) || defined(__ARM_NEON__) || (Q_PROCESSOR_WORDSIZE == 8)
+#if defined(__SSE2__) || defined(__ARM_NEON__) || defined(QT_COMPILER_SUPPORTS_LSX) || (Q_PROCESSOR_WORDSIZE == 8)
case QImage::Format_ARGB32:
case QImage::Format_RGBA8888:
#endif
@@ -6368,7 +6370,7 @@ DrawHelper qDrawHelper[] =
static_assert(std::size(qDrawHelper) == QImage::NImageFormats);
-#if !defined(Q_PROCESSOR_X86)
+#if !defined(Q_PROCESSOR_X86) && !defined(QT_COMPILER_SUPPORTS_LSX)
void qt_memfill64(quint64 *dest, quint64 color, qsizetype count)
{
qt_memfill_template<quint64>(dest, color, count);
@@ -6435,7 +6437,7 @@ void qt_memfill16(quint16 *dest, quint16 value, qsizetype count)
qt_memfill32(reinterpret_cast<quint32*>(dest), value32, count / 2);
}
-#if defined(Q_PROCESSOR_X86)
+#if defined(Q_PROCESSOR_X86) || defined(QT_COMPILER_SUPPORTS_LSX)
void (*qt_memfill32)(quint32 *dest, quint32 value, qsizetype count) = nullptr;
void (*qt_memfill64)(quint64 *dest, quint64 value, qsizetype count) = nullptr;
#elif !defined(__ARM_NEON__) && !defined(__MIPS_DSP__)
@@ -6712,6 +6714,68 @@ static void qInitDrawhelperFunctions()
#endif // SSE2
+#if defined(QT_COMPILER_SUPPORTS_LSX)
+ if (qCpuHasFeature(LSX)) {
+ qt_memfill32 = qt_memfill32_lsx;
+ qt_memfill64 = qt_memfill64_lsx;
+
+ qDrawHelper[QImage::Format_RGB32].bitmapBlit = qt_bitmapblit32_lsx;
+ qDrawHelper[QImage::Format_ARGB32].bitmapBlit = qt_bitmapblit32_lsx;
+ qDrawHelper[QImage::Format_ARGB32_Premultiplied].bitmapBlit = qt_bitmapblit32_lsx;
+ qDrawHelper[QImage::Format_RGB16].bitmapBlit = qt_bitmapblit16_lsx;
+ qDrawHelper[QImage::Format_RGBX8888].bitmapBlit = qt_bitmapblit8888_lsx;
+ qDrawHelper[QImage::Format_RGBA8888].bitmapBlit = qt_bitmapblit8888_lsx;
+ qDrawHelper[QImage::Format_RGBA8888_Premultiplied].bitmapBlit = qt_bitmapblit8888_lsx;
+
+ extern void qt_scale_image_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
+ const uchar *srcPixels, int sbpl, int srch,
+ const QRectF &targetRect,
+ const QRectF &sourceRect,
+ const QRect &clip,
+ int const_alpha);
+
+ qScaleFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_scale_image_argb32_on_argb32_lsx;
+ qScaleFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_scale_image_argb32_on_argb32_lsx;
+ qScaleFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBA8888_Premultiplied] = qt_scale_image_argb32_on_argb32_lsx;
+ qScaleFunctions[QImage::Format_RGBX8888][QImage::Format_RGBA8888_Premultiplied] = qt_scale_image_argb32_on_argb32_lsx;
+
+ extern void qt_blend_rgb32_on_rgb32_lsx(uchar *destPixels, int dbpl,
+ const uchar *srcPixels, int sbpl,
+ int w, int h,
+ int const_alpha);
+
+ extern void qt_blend_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
+ const uchar *srcPixels, int sbpl,
+ int w, int h,
+ int const_alpha);
+
+ qBlendFunctions[QImage::Format_RGB32][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_lsx;
+ qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_lsx;
+ qBlendFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_lsx;
+ qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_lsx;
+ qBlendFunctions[QImage::Format_RGBX8888][QImage::Format_RGBX8888] = qt_blend_rgb32_on_rgb32_lsx;
+ qBlendFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBX8888] = qt_blend_rgb32_on_rgb32_lsx;
+ qBlendFunctions[QImage::Format_RGBX8888][QImage::Format_RGBA8888_Premultiplied] = qt_blend_argb32_on_argb32_lsx;
+ qBlendFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBA8888_Premultiplied] = qt_blend_argb32_on_argb32_lsx;
+
+ extern const uint * QT_FASTCALL qt_fetch_radial_gradient_lsx(uint *buffer, const Operator *op, const QSpanData *data,
+ int y, int x, int length);
+
+ qt_fetch_radial_gradient = qt_fetch_radial_gradient_lsx;
+
+ extern void QT_FASTCALL comp_func_SourceOver_lsx(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
+ extern void QT_FASTCALL comp_func_solid_SourceOver_lsx(uint *destPixels, int length, uint color, uint const_alpha);
+ extern void QT_FASTCALL comp_func_Source_lsx(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
+ extern void QT_FASTCALL comp_func_solid_Source_lsx(uint *destPixels, int length, uint color, uint const_alpha);
+ extern void QT_FASTCALL comp_func_Plus_lsx(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
+ qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_lsx;
+ qt_functionForModeSolid_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_lsx;
+ qt_functionForMode_C[QPainter::CompositionMode_Source] = comp_func_Source_lsx;
+ qt_functionForModeSolid_C[QPainter::CompositionMode_Source] = comp_func_solid_Source_lsx;
+ qt_functionForMode_C[QPainter::CompositionMode_Plus] = comp_func_Plus_lsx;
+ }
+#endif //QT_COMPILER_SUPPORTS_LSX
+
#if defined(__ARM_NEON__)
qBlendFunctions[QImage::Format_RGB32][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_neon;
qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_neon;
diff --git a/src/gui/painting/qdrawhelper_loongarch64_p.h b/src/gui/painting/qdrawhelper_loongarch64_p.h
new file mode 100644
index 00000000000..a5513e3e55a
--- /dev/null
+++ b/src/gui/painting/qdrawhelper_loongarch64_p.h
@@ -0,0 +1,48 @@
+// Copyright (C) 2024 Loongson Technology Corporation Limited.
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
+
+#ifndef QDRAWHELPER_LOONGARCH64_P_H
+#define QDRAWHELPER_LOONGARCH64_P_H
+
+//
+// W A R N I N G
+// -------------
+//
+// This file is not part of the Qt API. It exists purely as an
+// implementation detail. This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+//
+
+#include <QtGui/private/qtguiglobal_p.h>
+#include <private/qdrawhelper_p.h>
+
+QT_BEGIN_NAMESPACE
+
+#ifdef QT_COMPILER_SUPPORTS_LSX
+void qt_memfill64_lsx(quint64 *dest, quint64 value, qsizetype count);
+void qt_memfill32_lsx(quint32 *dest, quint32 value, qsizetype count);
+void qt_bitmapblit32_lsx(QRasterBuffer *rasterBuffer, int x, int y,
+ const QRgba64 &color,
+ const uchar *src, int width, int height, int stride);
+void qt_bitmapblit8888_lsx(QRasterBuffer *rasterBuffer, int x, int y,
+ const QRgba64 &color,
+ const uchar *src, int width, int height, int stride);
+void qt_bitmapblit16_lsx(QRasterBuffer *rasterBuffer, int x, int y,
+ const QRgba64 &color,
+ const uchar *src, int width, int height, int stride);
+void qt_blend_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
+ const uchar *srcPixels, int sbpl,
+ int w, int h,
+ int const_alpha);
+void qt_blend_rgb32_on_rgb32_lsx(uchar *destPixels, int dbpl,
+ const uchar *srcPixels, int sbpl,
+ int w, int h,
+ int const_alpha);
+
+#endif // QT_COMPILER_SUPPORTS_LSX
+
+QT_END_NAMESPACE
+
+#endif // QDRAWHELPER_LOONGARCH64_P_H
diff --git a/src/gui/painting/qdrawhelper_lsx.cpp b/src/gui/painting/qdrawhelper_lsx.cpp
new file mode 100644
index 00000000000..f28374bc0d3
--- /dev/null
+++ b/src/gui/painting/qdrawhelper_lsx.cpp
@@ -0,0 +1,593 @@
+// Copyright (C) 2024 Loongson Technology Corporation Limited.
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
+
+#include <private/qdrawhelper_loongarch64_p.h>
+
+#ifdef QT_COMPILER_SUPPORTS_LSX
+
+#include <private/qdrawingprimitive_lsx_p.h>
+#include <private/qpaintengine_raster_p.h>
+
+QT_BEGIN_NAMESPACE
+
// Source-over blend of a premultiplied ARGB32 rectangle onto a premultiplied
// ARGB32 destination, vectorized with LSX.
//   destPixels/dbpl: destination base pointer and bytes-per-line
//   srcPixels/sbpl:  source base pointer and bytes-per-line
//   w, h:            rectangle size in pixels
//   const_alpha:     constant opacity in 0..256 (256 == fully opaque)
void qt_blend_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
                                   const uchar *srcPixels, int sbpl,
                                   int w, int h,
                                   int const_alpha)
{
    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;
    if (const_alpha == 256) {
        // Fully opaque: plain source-over, one scanline at a time.
        for (int y = 0; y < h; ++y) {
            BLEND_SOURCE_OVER_ARGB32_LSX(dst, src, w);
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    } else if (const_alpha != 0) {
        // dest = (s + d * sia) * ca + d * cia
        //      = s * ca + d * (sia * ca + cia)
        //      = s * ca + d * (1 - sa*ca)
        const_alpha = (const_alpha * 255) >> 8; // rescale 0..256 to 0..255 for the macro

        for (int y = 0; y < h; ++y) {
            BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(dst, src, w, const_alpha);
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    }
    // const_alpha == 0: fully transparent, nothing to do.
}
+
// Generic scalar fallback, defined in qblendfunctions.cpp
void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
                             const uchar *srcPixels, int sbpl,
                             int w, int h,
                             int const_alpha);

// Blend an opaque RGB32 rectangle onto an RGB32 destination with LSX.
// With const_alpha == 256 this is a straight copy, which the scalar
// qt_blend_rgb32_on_rgb32 already does efficiently; the vector path only
// handles the interpolation case dst = s*ca + d*(255-ca).
void qt_blend_rgb32_on_rgb32_lsx(uchar *destPixels, int dbpl,
                                 const uchar *srcPixels, int sbpl,
                                 int w, int h,
                                 int const_alpha)
{
    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;
    if (const_alpha != 256) {
        if (const_alpha != 0) {
            // Constants for INTERPOLATE_PIXEL_255_LSX: rounding bias and the
            // 0x00ff00ff mask used to split pixels into 16-bit half-channels.
            const __m128i half = __lsx_vreplgr2vr_h(0x80);
            const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);

            const_alpha = (const_alpha * 255) >> 8; // rescale 0..256 to 0..255
            int one_minus_const_alpha = 255 - const_alpha;
            const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
            const __m128i oneMinusConstAlpha = __lsx_vreplgr2vr_h(one_minus_const_alpha);
            for (int y = 0; y < h; ++y) {
                int x = 0;

                // First, align dest to 16 bytes:
                ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
                    dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha,
                                                   dst[x], one_minus_const_alpha);
                }

                // Main loop: 4 pixels per iteration.
                for (; x < w-3; x += 4) {
                    __m128i srcVector = __lsx_vld(&src[x], 0);
                    __m128i dstVector = __lsx_vld(&dst[x], 0);
                    INTERPOLATE_PIXEL_255_LSX(srcVector, dstVector, constAlphaVector,
                                              oneMinusConstAlpha, colorMask, half);
                    __lsx_vst(dstVector, &dst[x], 0);
                }
                // Scalar tail for the last 0-3 pixels.
                SIMD_EPILOGUE(x, w, 3)
                    dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha,
                                                   dst[x], one_minus_const_alpha);
                dst = (quint32 *)(((uchar *) dst) + dbpl);
                src = (const quint32 *)(((const uchar *) src) + sbpl);
            }
        }
        // const_alpha == 0: nothing to draw.
    } else {
        // Fully opaque: delegate to the scalar copy-based implementation.
        qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
    }
}
+
+void QT_FASTCALL comp_func_SourceOver_lsx(uint *destPixels, const uint *srcPixels,
+ int length, uint const_alpha)
+{
+ Q_ASSERT(const_alpha < 256);
+
+ const quint32 *src = (const quint32 *) srcPixels;
+ quint32 *dst = (quint32 *) destPixels;
+
+ if (const_alpha == 255) {
+ BLEND_SOURCE_OVER_ARGB32_LSX(dst, src, length);
+ } else {
+ BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(dst, src, length, const_alpha);
+ }
+}
+
// CompositionMode_Plus: dst = saturate(dst + src), optionally interpolated
// with const_alpha (0..255) so that dst = interp(saturate(d+s), d, ca).
void QT_FASTCALL comp_func_Plus_lsx(uint *dst, const uint *src, int length, uint const_alpha)
{
    int x = 0;

    if (const_alpha == 255) {
        // 1) Prologue: align destination on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);

        // 2) composition with LSX: per-byte unsigned saturating add, 4 pixels
        //    per iteration
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = __lsx_vld(&src[x], 0);
            const __m128i dstVector = __lsx_vld(&dst[x], 0);

            const __m128i result = __lsx_vsadd_bu(srcVector, dstVector);
            __lsx_vst(result, &dst[x], 0);
        }

        // 3) Epilogue: remaining 0-3 pixels
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);
    } else {
        const int one_minus_const_alpha = 255 - const_alpha;
        const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
        const __m128i oneMinusConstAlpha = __lsx_vreplgr2vr_h(one_minus_const_alpha);

        // 1) Prologue: align destination on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x],
                                                          const_alpha,
                                                          one_minus_const_alpha);

        // Constants for INTERPOLATE_PIXEL_255_LSX (rounding bias and
        // half-channel mask).
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        // 2) composition with LSX: saturating add, then interpolate the sum
        //    with the original destination by const_alpha
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = __lsx_vld(&src[x], 0);
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            __m128i result = __lsx_vsadd_bu(srcVector, dstVector);
            INTERPOLATE_PIXEL_255_LSX(result, dstVector, constAlphaVector,
                                      oneMinusConstAlpha, colorMask, half);
            __lsx_vst(dstVector, &dst[x], 0);
        }

        // 3) Epilogue: remaining 0-3 pixels
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x],
                                                          const_alpha, one_minus_const_alpha);
    }
}
+
// CompositionMode_Source: dst = src interpolated with const_alpha,
// i.e. dst = s*ca + d*(255-ca).  With const_alpha == 255 this degenerates
// to a plain copy.
void QT_FASTCALL comp_func_Source_lsx(uint *dst, const uint *src, int length, uint const_alpha)
{
    if (const_alpha == 255) {
        // Fully opaque: straight copy of the span.
        ::memcpy(dst, src, length * sizeof(uint));
    } else {
        const int ialpha = 255 - const_alpha;

        int x = 0;

        // 1) prologue, align on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);

        // 2) interpolate pixels with LSX, 4 at a time
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);

        const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
        const __m128i oneMinusConstAlpha = __lsx_vreplgr2vr_h(ialpha);
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = __lsx_vld(&src[x], 0);
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            INTERPOLATE_PIXEL_255_LSX(srcVector, dstVector, constAlphaVector,
                                      oneMinusConstAlpha, colorMask, half);
            __lsx_vst(dstVector, &dst[x], 0);
        }

        // 3) Epilogue: remaining 0-3 pixels
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
    }
}
+
// Fill [dest, dest + bytecount) with the 16-byte pattern value128.
// Preconditions (established by the qt_memfill32/64_lsx callers):
//   - dest is 16-byte aligned
//   - bytecount is a multiple of 16
// The main loop is unrolled 4x (64 bytes per iteration); the switch stores
// the remaining 0-3 vectors.
static Q_NEVER_INLINE
void Q_DECL_VECTORCALL qt_memfillXX_aligned(void *dest, __m128i value128, quintptr bytecount)
{
    __m128i *dst128 = reinterpret_cast<__m128i *>(dest);
    __m128i *end128 = reinterpret_cast<__m128i *>(static_cast<uchar *>(dest) + bytecount);

    while (dst128 + 4 <= end128) {
        __lsx_vst(value128, dst128 + 0, 0);
        __lsx_vst(value128, dst128 + 1, 0);
        __lsx_vst(value128, dst128 + 2, 0);
        __lsx_vst(value128, dst128 + 3, 0);
        dst128 += 4;
    }

    bytecount %= 4 * sizeof(__m128i);
    switch (bytecount / sizeof(__m128i)) {
    case 3: __lsx_vst(value128, dst128++, 0); Q_FALLTHROUGH();
    case 2: __lsx_vst(value128, dst128++, 0); Q_FALLTHROUGH();
    case 1: __lsx_vst(value128, dst128++, 0);
    }
}
+
// Fill count quint64 elements with value using aligned 16-byte LSX stores.
void qt_memfill64_lsx(quint64 *dest, quint64 value, qsizetype count)
{
    // A quint64* is naturally 8-byte aligned; if it is not also 16-byte
    // aligned, store one scalar element to reach a 16-byte boundary.
    quintptr misaligned = quintptr(dest) % sizeof(__m128i);
    if (misaligned && count) {
        *dest++ = value;
        --count;
    }

    // Make count even so the remainder is an exact number of 16-byte vectors
    // for qt_memfillXX_aligned.
    if (count % 2) {
        dest[count - 1] = value;
        --count;
    }

    qt_memfillXX_aligned(dest, __lsx_vreplgr2vr_d(value), count * sizeof(quint64));
}
+
// Fill count quint32 elements with value using aligned 16-byte LSX stores.
void qt_memfill32_lsx(quint32 *dest, quint32 value, qsizetype count)
{
    if (count < 4) {
        // this simplifies the code below: the first switch can fall through
        // without checking the value of count
        switch (count) {
        case 3: *dest++ = value; Q_FALLTHROUGH();
        case 2: *dest++ = value; Q_FALLTHROUGH();
        case 1: *dest = value;
        }
        return;
    }

    // Prologue: store 1-3 scalars until dest is 16-byte aligned.  Only the
    // 4-byte-aligned offsets 4/8/12 can occur for a quint32 pointer.
    const int align = (quintptr)(dest) & 0xf;
    switch (align) {
    case 4: *dest++ = value; --count; Q_FALLTHROUGH();
    case 8: *dest++ = value; --count; Q_FALLTHROUGH();
    case 12: *dest++ = value; --count;
    }

    // Epilogue done up-front: write the trailing 0-3 elements now so the
    // vector fill below can work on a multiple of 4 elements.
    const int rest = count & 0x3;
    if (rest) {
        switch (rest) {
        case 3: dest[count - 3] = value; Q_FALLTHROUGH();
        case 2: dest[count - 2] = value; Q_FALLTHROUGH();
        case 1: dest[count - 1] = value;
        }
    }

    qt_memfillXX_aligned(dest, __lsx_vreplgr2vr_w(value), count * sizeof(quint32));
}
+
// Solid-color CompositionMode_Source: dst = color*ca + dst*(255-ca).
// With const_alpha == 255 this is a plain fill.
void QT_FASTCALL comp_func_solid_Source_lsx(uint *destPixels, int length,
                                            uint color, uint const_alpha)
{
    if (const_alpha == 255) {
        qt_memfill32(destPixels, color, length);
    } else {
        const quint32 ialpha = 255 - const_alpha;
        // Pre-scale the color by const_alpha once; the loop then only has to
        // scale the destination and add.
        color = BYTE_MUL(color, const_alpha);
        int x = 0;

        quint32 *dst = (quint32 *) destPixels;
        const __m128i colorVector = __lsx_vreplgr2vr_w(color);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i iAlphaVector = __lsx_vreplgr2vr_h(ialpha);

        // Prologue: scalar pixels until dst is 16-byte aligned.
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            destPixels[x] = color + BYTE_MUL(destPixels[x], ialpha);

        for (; x < length-3; x += 4) {
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            BYTE_MUL_LSX(dstVector, iAlphaVector, colorMask, half);
            // Per-byte add cannot carry between channels: each channel is
            // c*ca/255 + d*(255-ca)/255 <= 255.
            dstVector = __lsx_vadd_b(colorVector, dstVector);
            __lsx_vst(dstVector, &dst[x], 0);
        }
        // Scalar tail for the last 0-3 pixels.
        SIMD_EPILOGUE(x, length, 3)
            destPixels[x] = color + BYTE_MUL(destPixels[x], ialpha);
    }
}
+
// Solid-color CompositionMode_SourceOver:
// dst = color' + dst * (255 - alpha(color')), where color' is the color
// pre-scaled by const_alpha.  If both the color and const_alpha are fully
// opaque this is a plain fill.
void QT_FASTCALL comp_func_solid_SourceOver_lsx(uint *destPixels, int length,
                                                uint color, uint const_alpha)
{
    if ((const_alpha & qAlpha(color)) == 255) {
        qt_memfill32(destPixels, color, length);
    } else {
        if (const_alpha != 255)
            color = BYTE_MUL(color, const_alpha);

        // qAlpha(~color) == 255 - qAlpha(color)
        const quint32 minusAlphaOfColor = qAlpha(~color);
        int x = 0;

        quint32 *dst = (quint32 *) destPixels;
        const __m128i colorVector = __lsx_vreplgr2vr_w(color);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i minusAlphaOfColorVector = __lsx_vreplgr2vr_h(minusAlphaOfColor);

        // Prologue: scalar pixels until dst is 16-byte aligned.
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);

        for (; x < length-3; x += 4) {
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            BYTE_MUL_LSX(dstVector, minusAlphaOfColorVector, colorMask, half);
            // Premultiplied source-over: per-byte add cannot overflow.
            dstVector = __lsx_vadd_b(colorVector, dstVector);
            __lsx_vst(dstVector, &dst[x], 0);
        }
        // Scalar tail for the last 0-3 pixels.
        SIMD_EPILOGUE(x, length, 3)
            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
    }
}
+
// Blit a 1bpp bitmap (src, 'stride' bytes per scanline, most significant bit
// first) onto a 32bpp raster buffer at (x, y): every set bit writes 'color',
// every clear bit leaves the destination pixel untouched.
//
// Bit-selection trick: broadcast the bitmap byte to all 16 lanes, AND each
// lane with a single-bit mask, then add a constant chosen so that the tested
// bit carries into the lane's sign bit (e.g. bit 0x40 + 0x40 == 0x80).
// __lsx_vslti_b(mask, 0) then yields an all-ones byte exactly where the
// bitmap bit was set, ready for __lsx_vbitsel_v.
void qt_bitmapblit32_lsx_base(QRasterBuffer *rasterBuffer, int x, int y,
                              quint32 color,
                              const uchar *src, int width, int height, int stride)
{
    quint32 *dest = reinterpret_cast<quint32*>(rasterBuffer->scanLine(y)) + x;
    const int destStride = rasterBuffer->stride<quint32>();

    const __m128i c128 = __lsx_vreplgr2vr_w(color);
    // Test bits 7..4 of the bitmap byte (pixels 0..3 of each group of 8).
    const __m128i maskmask1 = (__m128i)(v4u32){0x80808080, 0x40404040,
                                               0x20202020, 0x10101010};
    const __m128i maskadd1 = (__m128i)(v4i32){0x00000000, 0x40404040,
                                              0x60606060, 0x70707070};

    if (width > 4) {
        // Test bits 3..0 (pixels 4..7 of each group of 8).
        const __m128i maskmask2 = (__m128i)(v4i32){0x08080808, 0x04040404,
                                                   0x02020202, 0x01010101};
        const __m128i maskadd2 = (__m128i)(v4i32){0x78787878, 0x7c7c7c7c,
                                                  0x7e7e7e7e, 0x7f7f7f7f};
        // NOTE(review): each iteration stores 8 destination pixels, so for
        // widths that are not a multiple of 8 this writes past 'width'
        // pixels on the scanline — presumably relies on raster-buffer row
        // padding as the other SIMD backends do; confirm.
        while (height--) {
            for (int x = 0; x < width; x += 8) {
                const quint8 s = src[x >> 3];
                if (!s)
                    continue; // whole byte clear: skip 8 pixels
                __m128i mask1 = __lsx_vreplgr2vr_b(s);
                __m128i mask2 = mask1;

                mask1 = __lsx_vand_v(mask1, maskmask1);
                mask1 = __lsx_vadd_b(mask1, maskadd1);

                __m128i destSrc1 = __lsx_vld((char*)(dest + x), 0);

                mask1 = __lsx_vslti_b(mask1,0);
                destSrc1 = __lsx_vbitsel_v(destSrc1, c128, mask1);
                __lsx_vst(destSrc1, (char*)(dest + x), 0);

                __m128i destSrc2 = __lsx_vld((char*)(dest + x + 4), 0);

                mask2 = __lsx_vand_v(mask2, maskmask2);
                mask2 = __lsx_vadd_b(mask2, maskadd2);

                mask2 = __lsx_vslti_b(mask2,0);
                destSrc2 = __lsx_vbitsel_v(destSrc2, c128, mask2);
                __lsx_vst(destSrc2, (char*)(dest + x + 4), 0);
            }
            dest += destStride;
            src += stride;
        }
    } else {
        // Narrow bitmaps: at most 4 pixels per row, one vector per scanline.
        while (height--) {
            const quint8 s = *src;
            if (s) {
                __m128i mask1 = __lsx_vreplgr2vr_b(s);

                __m128i destSrc1 = __lsx_vld((char*)(dest), 0);
                mask1 = __lsx_vand_v(mask1, maskmask1);
                mask1 = __lsx_vadd_b(mask1, maskadd1);

                mask1 = __lsx_vslti_b(mask1, 0);
                destSrc1 = __lsx_vbitsel_v(destSrc1, c128, mask1);
                __lsx_vst(destSrc1, (char*)(dest), 0);
            }
            dest += destStride;
            src += stride;
        }
    }
}
+
// Bitmap blit entry point for ARGB32/RGB32 destinations: converts the 64-bit
// color to ARGB32 and forwards to the shared 32bpp implementation.
void qt_bitmapblit32_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                         const QRgba64 &color,
                         const uchar *src, int width, int height, int stride)
{
    qt_bitmapblit32_lsx_base(rasterBuffer, x, y, color.toArgb32(), src, width, height, stride);
}
+
// Bitmap blit entry point for RGBA8888 destinations: same as above but the
// ARGB32 color is byte-swapped to RGBA layout first.
void qt_bitmapblit8888_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                           const QRgba64 &color,
                           const uchar *src, int width, int height, int stride)
{
    qt_bitmapblit32_lsx_base(rasterBuffer, x, y, ARGB2RGBA(color.toArgb32()), src, width, height, stride);
}
+
+void qt_bitmapblit16_lsx(QRasterBuffer *rasterBuffer, int x, int y,
+ const QRgba64 &color,
+ const uchar *src, int width, int height, int stride)
+{
+ const quint16 c = qConvertRgb32To16(color.toArgb32());
+ quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
+ const int destStride = rasterBuffer->stride<quint32>();
+
+ const __m128i c128 = __lsx_vreplgr2vr_h(c);
+ const __m128i maskmask = (__m128i)(v8u16){0x8080, 0x4040, 0x2020, 0x1010,
+ 0x0808, 0x0404, 0x0202, 0x0101};
+
+ const __m128i maskadd = (__m128i)(v8i16){0x0000, 0x4040, 0x6060, 0x7070,
+ 0x7878, 0x7c7c, 0x7e7e, 0x7f7f};
+ while (--height >= 0) {
+ for (int x = 0; x < width; x += 8) {
+ const quint8 s = src[x >> 3];
+ if (!s)
+ continue;
+ __m128i mask = __lsx_vreplgr2vr_b(s);
+ __m128i destSrc = __lsx_vld((char*)(dest + x), 0);
+ mask = __lsx_vand_v(mask, maskmask);
+ mask = __lsx_vadd_b(mask, maskadd);
+ mask = __lsx_vslti_b(mask, 0);
+ destSrc = __lsx_vbitsel_v(destSrc, c128, mask);
+ __lsx_vst(destSrc, (char*)(dest + x), 0);
+ }
+ dest += destStride;
+ src += stride;
+ }
+}
+
// Minimal 4-wide SIMD traits class implemented with LSX, used as the template
// parameter of QRadialFetchSimd<> in qt_fetch_radial_gradient_lsx below.
class QSimdLsx
{
public:
    typedef __m128i Int32x4;
    typedef __m128 Float32x4;

    // Scalar views of a vector, for lane-wise access in the template code.
    union Vect_buffer_i { Int32x4 v; int i[4]; };
    union Vect_buffer_f { Float32x4 v; float f[4]; };

    static inline Float32x4 Q_DECL_VECTORCALL v_dup(float x) { return __lsx_vreplfr2vr_s(x); }
    // Narrows the double to float: __lsx_vreplfr2vr_s broadcasts a float, so
    // the radial fetcher runs in single precision.
    static inline Float32x4 Q_DECL_VECTORCALL v_dup(double x) { return __lsx_vreplfr2vr_s(x); }
    static inline Int32x4 Q_DECL_VECTORCALL v_dup(int x) { return __lsx_vreplgr2vr_w(x); }
    static inline Int32x4 Q_DECL_VECTORCALL v_dup(uint x) { return __lsx_vreplgr2vr_w(x); }

    static inline Float32x4 Q_DECL_VECTORCALL v_add(Float32x4 a, Float32x4 b) { return __lsx_vfadd_s(a, b); }
    static inline Int32x4 Q_DECL_VECTORCALL v_add(Int32x4 a, Int32x4 b) { return __lsx_vadd_w(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_max(Float32x4 a, Float32x4 b) { return __lsx_vfmax_s(a, b); }
    static inline Float32x4 Q_DECL_VECTORCALL v_min(Float32x4 a, Float32x4 b) { return __lsx_vfmin_s(a, b); }
    // 16-bit lane-wise minimum (the fetcher clamps packed half-channels).
    static inline Int32x4 Q_DECL_VECTORCALL v_min_16(Int32x4 a, Int32x4 b) { return __lsx_vmin_h(a, b); }

    static inline Int32x4 Q_DECL_VECTORCALL v_and(Int32x4 a, Int32x4 b) { return __lsx_vand_v(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_sub(Float32x4 a, Float32x4 b) { return __lsx_vfsub_s(a, b); }
    static inline Int32x4 Q_DECL_VECTORCALL v_sub(Int32x4 a, Int32x4 b) { return __lsx_vsub_w(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_mul(Float32x4 a, Float32x4 b) { return __lsx_vfmul_s(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_sqrt(Float32x4 x) { return __lsx_vfsqrt_s(x); }

    // Truncating float -> int conversion (round toward zero).
    static inline Int32x4 Q_DECL_VECTORCALL v_toInt(Float32x4 x) { return __lsx_vftintrz_w_s(x); }

    // NOTE(review): implemented as b < a, i.e. a strict greater-than despite
    // the name — presumably matching the other SIMD backends' semantics for
    // the radial fetcher's extent test; confirm against qdrawhelper_sse2.
    static inline Int32x4 Q_DECL_VECTORCALL v_greaterOrEqual(Float32x4 a, Float32x4 b) { return __lsx_vfcmp_clt_s(b, a); }
};
+
+// Radial-gradient span fetcher: instantiates the generic SIMD fetch template
+// with the LSX backend defined above.
+const uint * QT_FASTCALL qt_fetch_radial_gradient_lsx(uint *buffer, const Operator *op,
+                                                      const QSpanData *data,
+                                                      int y, int x, int length)
+{
+    return qt_fetch_radial_gradient_template<QRadialFetchSimd<QSimdLsx>,uint>(buffer, op, data, y, x, length);
+}
+
+// Scale an ARGB32PM source onto an ARGB32PM destination with source-over
+// composition, stepping through the source in 16.16 fixed point
+// (nearest-neighbor sampling). Only handles fully opaque const_alpha;
+// otherwise it defers to the generic implementation.
+void qt_scale_image_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
+                                         const uchar *srcPixels, int sbpl, int srch,
+                                         const QRectF &targetRect,
+                                         const QRectF &sourceRect,
+                                         const QRect &clip,
+                                         int const_alpha)
+{
+    if (const_alpha != 256) {
+        // from qblendfunctions.cpp
+        extern void qt_scale_image_argb32_on_argb32(uchar *destPixels, int dbpl,
+                                                    const uchar *srcPixels, int sbpl, int srch,
+                                                    const QRectF &targetRect,
+                                                    const QRectF &sourceRect,
+                                                    const QRect &clip,
+                                                    int const_alpha);
+        return qt_scale_image_argb32_on_argb32(destPixels, dbpl, srcPixels, sbpl, srch,
+                                               targetRect, sourceRect, clip, const_alpha);
+    }
+
+    // Source pixels per target pixel, then the same as a 16.16 increment.
+    // Negative values mean the axis is mirrored.
+    qreal sx = sourceRect.width() / (qreal)targetRect.width();
+    qreal sy = sourceRect.height() / (qreal)targetRect.height();
+
+
+    const int ix = 0x00010000 * sx;
+    const int iy = 0x00010000 * sy;
+
+    QRect tr = targetRect.normalized().toRect();
+    tr = tr.intersected(clip);
+    if (tr.isEmpty())
+        return;
+    const int tx1 = tr.left();
+    const int ty1 = tr.top();
+    int h = tr.height();
+    int w = tr.width();
+
+    quint32 basex;
+    quint32 srcy;
+
+    // Starting source coordinates in 16.16 fixed point; a mirrored axis
+    // starts from the opposite edge of the source rectangle.
+    if (sx < 0) {
+        int dstx = qFloor((tx1 + qreal(0.5) - targetRect.right()) * sx * 65536) + 1;
+        basex = quint32(sourceRect.right() * 65536) + dstx;
+    } else {
+        int dstx = qCeil((tx1 + qreal(0.5) - targetRect.left()) * sx * 65536) - 1;
+        basex = quint32(sourceRect.left() * 65536) + dstx;
+    }
+    if (sy < 0) {
+        int dsty = qFloor((ty1 + qreal(0.5) - targetRect.bottom()) * sy * 65536) + 1;
+        srcy = quint32(sourceRect.bottom() * 65536) + dsty;
+    } else {
+        int dsty = qCeil((ty1 + qreal(0.5) - targetRect.top()) * sy * 65536) - 1;
+        srcy = quint32(sourceRect.top() * 65536) + dsty;
+    }
+
+    quint32 *dst = ((quint32 *) (destPixels + ty1 * dbpl)) + tx1;
+
+    // Constants shared by BLEND_SOURCE_OVER_ARGB32_LSX_helper.
+    const __m128i nullVector = __lsx_vreplgr2vr_w(0);
+    const __m128i half = __lsx_vreplgr2vr_h(0x80);
+    const __m128i one = __lsx_vreplgr2vr_h(0xff);
+    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
+    const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
+    const __m128i ixVector = __lsx_vreplgr2vr_w(4*ix);
+
+    // this bounds check here is required as floating point rounding above might in some cases lead to
+    // w/h values that are one pixel too large, falling outside of the valid image area.
+    const int ystart = srcy >> 16;
+    if (ystart >= srch && iy < 0) {
+        srcy += iy;
+        --h;
+    }
+    const int xstart = basex >> 16;
+    if (xstart >= (int)(sbpl/sizeof(quint32)) && ix < 0) {
+        basex += ix;
+        --w;
+    }
+    int yend = (srcy + iy * (h - 1)) >> 16;
+    if (yend < 0 || yend >= srch)
+        --h;
+    int xend = (basex + ix * (w - 1)) >> 16;
+    if (xend < 0 || xend >= (int)(sbpl/sizeof(quint32)))
+        --w;
+
+    while (--h >= 0) {
+        const uint *src = (const quint32 *) (srcPixels + (srcy >> 16) * sbpl);
+        int srcx = basex;
+        int x = 0;
+
+        // Blend leading pixels one at a time until dst is 16-byte aligned.
+        ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
+            uint s = src[srcx >> 16];
+            dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
+            srcx += ix;
+        }
+
+        // Four 16.16 source-x coordinates; all four advance by 4*ix per
+        // vector iteration. The halfword extracts below pick out the
+        // integer (high) half of each 32-bit coordinate.
+        __m128i srcxVector = (__m128i)(v4i32){srcx + ix + ix + ix, srcx + ix + ix, srcx + ix, srcx};
+
+        for (; x < (w - 3); x += 4) {
+            const int idx0 = __lsx_vpickve2gr_h(srcxVector, 1);
+            const int idx1 = __lsx_vpickve2gr_h(srcxVector, 3);
+            const int idx2 = __lsx_vpickve2gr_h(srcxVector, 5);
+            const int idx3 = __lsx_vpickve2gr_h(srcxVector, 7);
+            srcxVector = __lsx_vadd_w(srcxVector, ixVector);
+
+            // Gather the four source pixels for dst[x..x+3].
+            const __m128i srcVector = (__m128i)((v4u32){src[idx3], src[idx2], src[idx1], src[idx0]});
+
+            BLEND_SOURCE_OVER_ARGB32_LSX_helper(dst, x, srcVector, nullVector, half, one, colorMask, alphaMask);
+        }
+
+        // Tail: blend the remaining (w % 4) pixels one at a time.
+        SIMD_EPILOGUE(x, w, 3) {
+            uint s = src[(basex + x*ix) >> 16];
+            dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
+        }
+        dst = (quint32 *)(((uchar *) dst) + dbpl);
+        srcy += iy;
+    }
+}
+
+QT_END_NAMESPACE
+
+#endif // QT_COMPILER_SUPPORTS_LSX
diff --git a/src/gui/painting/qdrawhelper_p.h b/src/gui/painting/qdrawhelper_p.h
index 833ddd7b166..482a2da206e 100644
--- a/src/gui/painting/qdrawhelper_p.h
+++ b/src/gui/painting/qdrawhelper_p.h
@@ -142,7 +142,7 @@ struct quint24 {
void qBlendGradient(int count, const QT_FT_Span *spans, void *userData);
void qBlendTexture(int count, const QT_FT_Span *spans, void *userData);
-#ifdef Q_PROCESSOR_X86
+#if defined(Q_PROCESSOR_X86) || defined(QT_COMPILER_SUPPORTS_LSX)
extern void (*qt_memfill64)(quint64 *dest, quint64 value, qsizetype count);
extern void (*qt_memfill32)(quint32 *dest, quint32 value, qsizetype count);
#else
diff --git a/src/gui/painting/qdrawingprimitive_lsx_p.h b/src/gui/painting/qdrawingprimitive_lsx_p.h
new file mode 100644
index 00000000000..06e97139df6
--- /dev/null
+++ b/src/gui/painting/qdrawingprimitive_lsx_p.h
@@ -0,0 +1,231 @@
+// Copyright (C) 2024 Loongson Technology Corporation Limited.
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
+
+#ifndef QDRAWINGPRIMITIVE_LSX_P_H
+#define QDRAWINGPRIMITIVE_LSX_P_H
+
+#include <QtGui/private/qtguiglobal_p.h>
+#include <private/qsimd_p.h>
+#include "qdrawhelper_loongarch64_p.h"
+#include "qrgba64_p.h"
+
+#ifdef __loongarch_sx
+
+//
+// W A R N I N G
+// -------------
+//
+// This file is not part of the Qt API. It exists purely as an
+// implementation detail. This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+//
+
+QT_BEGIN_NAMESPACE
+
+/*
+ * Multiply the components of pixelVector by alphaChannel:
+ * on return each byte of pixelVector holds component * alpha / 255.
+ * Each 32bits components of alphaChannel must be in the form 0x00AA00AA
+ * colorMask must have 0x00ff00ff on each 32 bits component
+ * half must have the value 128 (0x80) for each 32 bits component
+ */
+inline static void Q_DECL_VECTORCALL
+BYTE_MUL_LSX(__m128i &pixelVector, __m128i alphaChannel, __m128i colorMask, __m128i half)
+{
+    /* 1. separate the colors in 2 vectors so each color is on 16 bits
+       (in order to be multiplied by the alpha
+       each 32 bit of dstVectorAG are in the form 0x00AA00GG
+       each 32 bit of dstVectorRB are in the form 0x00RR00BB */
+    __m128i pixelVectorAG = __lsx_vsrli_h(pixelVector, 8);
+    __m128i pixelVectorRB = __lsx_vand_v(pixelVector, colorMask);
+
+    /* 2. multiply the vectors by the alpha channel */
+    pixelVectorAG = __lsx_vmul_h(pixelVectorAG, alphaChannel);
+    pixelVectorRB = __lsx_vmul_h(pixelVectorRB, alphaChannel);
+
+    /* 3. divide by 255, that's the tricky part.
+       we do it like for BYTE_MUL(), with bit shift: X/255 ~= (X + X/256 + rounding)/256 */
+    /** so first (X + X/256 + rounding) */
+    pixelVectorRB = __lsx_vadd_h(pixelVectorRB, __lsx_vsrli_h(pixelVectorRB, 8));
+    pixelVectorRB = __lsx_vadd_h(pixelVectorRB, half);
+    pixelVectorAG = __lsx_vadd_h(pixelVectorAG, __lsx_vsrli_h(pixelVectorAG, 8));
+    pixelVectorAG = __lsx_vadd_h(pixelVectorAG, half);
+
+    /** second divide by 256 */
+    pixelVectorRB = __lsx_vsrli_h(pixelVectorRB, 8);
+    /** for AG, we could >> 8 to divide followed by << 8 to put the
+        bytes in the correct position. By masking instead, we execute
+        only one instruction */
+    pixelVectorAG = __lsx_vandn_v(colorMask, pixelVectorAG);
+
+    /* 4. combine the 2 pairs of colors */
+    pixelVector = __lsx_vor_v(pixelVectorAG, pixelVectorRB);
+}
+
+/*
+ * Interpolate four ARGB32 pixels at once:
+ *   dstVector = (srcVector * alpha + dstVector * (255 - alpha)) / 255
+ * computed per byte with the same /255 approximation as BYTE_MUL_LSX.
+ * Each 32bits component of alphaChannel must be in the form 0x00AA00AA,
+ * oneMinusAlphaChannel must hold 255 - alpha in the same layout,
+ * colorMask must have 0x00ff00ff on each 32 bits component and
+ * half must have the value 128 (0x80) for each 32 bits component.
+ */
+inline static void Q_DECL_VECTORCALL
+INTERPOLATE_PIXEL_255_LSX(__m128i srcVector, __m128i &dstVector, __m128i alphaChannel,
+                          __m128i oneMinusAlphaChannel, __m128i colorMask, __m128i half)
+{
+    /* red/blue occupy the low byte of every 16-bit lane */
+    const __m128i srcRB = __lsx_vand_v(srcVector, colorMask);
+    const __m128i dstRB = __lsx_vand_v(dstVector, colorMask);
+    __m128i blendRB = __lsx_vadd_h(__lsx_vmul_h(srcRB, alphaChannel),
+                                   __lsx_vmul_h(dstRB, oneMinusAlphaChannel));
+    /* approximate X/255 as (X + X/256 + 0x80) / 256 */
+    blendRB = __lsx_vadd_h(blendRB, __lsx_vsrli_h(blendRB, 8));
+    blendRB = __lsx_vadd_h(blendRB, half);
+    blendRB = __lsx_vsrli_h(blendRB, 8);
+
+    /* alpha/green occupy the high byte of every 16-bit lane */
+    const __m128i srcAG = __lsx_vsrli_h(srcVector, 8);
+    const __m128i dstAG = __lsx_vsrli_h(dstVector, 8);
+    __m128i blendAG = __lsx_vadd_h(__lsx_vmul_h(srcAG, alphaChannel),
+                                   __lsx_vmul_h(dstAG, oneMinusAlphaChannel));
+    blendAG = __lsx_vadd_h(blendAG, __lsx_vsrli_h(blendAG, 8));
+    blendAG = __lsx_vadd_h(blendAG, half);
+    /* mask instead of >>8 then <<8: leaves AG already in the high bytes */
+    blendAG = __lsx_vandn_v(colorMask, blendAG);
+
+    /* merge the two channel pairs back into whole pixels */
+    dstVector = __lsx_vor_v(blendAG, blendRB);
+}
+
+// same as BLEND_SOURCE_OVER_ARGB32_LSX, but for one vector srcVector
+// (four pixels), written to dst[x..x+3].
+inline static void Q_DECL_VECTORCALL
+BLEND_SOURCE_OVER_ARGB32_LSX_helper(quint32 *dst, int x, __m128i srcVector,
+                                    __m128i nullVector, __m128i half, __m128i one,
+                                    __m128i colorMask, __m128i alphaMask)
+{
+    /* vmsknz_b packs a per-byte nonzero mask into the low 16 bits of lane 0,
+       so 0xffff means the compare matched in every byte, i.e. for all four
+       pixels at once. */
+    const __m128i srcVectorAlpha = __lsx_vand_v(srcVector, alphaMask);
+    __m128i vseq = __lsx_vseq_w(srcVectorAlpha, alphaMask);
+    v4i32 vseq_res = (v4i32)__lsx_vmsknz_b(vseq);
+    if (vseq_res[0] == (0x0000ffff)) {
+        /* all opaque */
+        __lsx_vst(srcVector, &dst[x], 0);
+    } else {
+        __m128i vseq_n = __lsx_vseq_w(srcVectorAlpha, nullVector);
+        v4i32 vseq_n_res = (v4i32)__lsx_vmsknz_b(vseq_n);
+        if (vseq_n_res[0] != (0x0000ffff)) {
+            /* not fully transparent */
+            /* extract the alpha channel on 2 x 16 bits */
+            /* so we have room for the multiplication */
+            /* each 32 bits will be in the form 0x00AA00AA */
+            /* with A being the 1 - alpha */
+            __m128i alphaChannel = __lsx_vsrli_w(srcVector, 24);
+            alphaChannel = __lsx_vor_v(alphaChannel, __lsx_vslli_w(alphaChannel, 16));
+            alphaChannel = __lsx_vsub_h(one, alphaChannel);
+
+            __m128i dstVector = __lsx_vld(&dst[x], 0);
+            BYTE_MUL_LSX(dstVector, alphaChannel, colorMask, half);
+
+            /* result = s + d * (1-alpha) */
+            const __m128i result = __lsx_vadd_b(srcVector, dstVector);
+            __lsx_vst(result, &dst[x], 0);
+        }
+        /* else: all four pixels fully transparent, dst is left untouched */
+    }
+}
+
+// Basically blend src over dst with the const alpha defined as constAlphaVector.
+// nullVector, half, one, colorMask are constant across the whole image/texture, and should be defined as:
+//const __m128i nullVector = __lsx_vreplgr2vr_w(0);
+//const __m128i half = __lsx_vreplgr2vr_h(0x80);
+//const __m128i one = __lsx_vreplgr2vr_h(0xff);
+//const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
+//const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
+//
+// The computation being done is:
+// result = s + d * (1-alpha)
+// with shortcuts if fully opaque or fully transparent.
+inline static void Q_DECL_VECTORCALL
+BLEND_SOURCE_OVER_ARGB32_LSX(quint32 *dst, const quint32 *src, int length)
+{
+    int x = 0;
+
+    /* First, get dst aligned. */
+    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
+        blend_pixel(dst[x], src[x]);
+    }
+
+    const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
+    const __m128i nullVector = __lsx_vreplgr2vr_w(0);
+    const __m128i half = __lsx_vreplgr2vr_h(0x80);
+    const __m128i one = __lsx_vreplgr2vr_h(0xff);
+    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
+
+    /* Main loop: four pixels per iteration. */
+    for (; x < length-3; x += 4) {
+        const __m128i srcVector = __lsx_vld((const __m128i *)&src[x], 0);
+        BLEND_SOURCE_OVER_ARGB32_LSX_helper(dst, x, srcVector, nullVector, half, one, colorMask, alphaMask);
+    }
+    /* Tail: remaining 0-3 pixels, scalar. */
+    SIMD_EPILOGUE(x, length, 3) {
+        blend_pixel(dst[x], src[x]);
+    }
+}
+
+// Basically blend src over dst with the const alpha defined as constAlphaVector.
+// The computation being done is:
+// dest = (s + d * sia) * ca + d * cia
+//      = s * ca + d * (sia * ca + cia)
+//      = s * ca + d * (1 - sa*ca)
+inline static void Q_DECL_VECTORCALL
+BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(quint32 *dst, const quint32 *src, int length, uint const_alpha)
+{
+    int x = 0;
+
+    /* Scalar prologue until dst is 16-byte aligned. */
+    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
+        blend_pixel(dst[x], src[x], const_alpha);
+    }
+
+    const __m128i nullVector = __lsx_vreplgr2vr_w(0);
+    const __m128i half = __lsx_vreplgr2vr_h(0x80);
+    const __m128i one = __lsx_vreplgr2vr_h(0xff);
+    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
+    const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
+
+    for (; x < length-3; x += 4) {
+        __m128i srcVector = __lsx_vld((const __m128i *)&src[x], 0);
+        /* Skip the group only when all four source pixels are 0 (fully
+           transparent); vmsknz_b packs the per-byte compare result into the
+           low 16 bits of lane 0. */
+        __m128i vseq = __lsx_vseq_w(srcVector, nullVector);
+        v4i32 vseq_res = (v4i32)__lsx_vmsknz_b(vseq);
+        if (vseq_res[0] != 0x0000ffff) {
+            /* s * ca */
+            BYTE_MUL_LSX(srcVector, constAlphaVector, colorMask, half);
+
+            /* extract 1 - (sa * ca) into each 16-bit lane pair */
+            __m128i alphaChannel = __lsx_vsrli_w(srcVector, 24);
+            alphaChannel = __lsx_vor_v(alphaChannel, __lsx_vslli_w(alphaChannel, 16));
+            alphaChannel = __lsx_vsub_h(one, alphaChannel);
+
+            __m128i dstVector = __lsx_vld((__m128i *)&dst[x], 0);
+            BYTE_MUL_LSX(dstVector, alphaChannel, colorMask, half);
+
+            /* result = s * ca + d * (1 - sa*ca) */
+            const __m128i result = __lsx_vadd_b(srcVector, dstVector);
+            __lsx_vst(result, &dst[x], 0);
+        }
+    }
+    /* Scalar epilogue for the remaining 0-3 pixels. */
+    SIMD_EPILOGUE(x, length, 3) {
+        blend_pixel(dst[x], src[x], const_alpha);
+    }
+}
+
+// Scalar view of a float's bit pattern, for the replicate helper below.
+typedef union
+{
+    int i;
+    float f;
+} FloatInt;
+
+/* float type data load instructions */
+// Splat a float across all four lanes by reinterpreting its bits as an int
+// and using the integer replicate intrinsic.
+// Marked inline: this helper lives in a header, so a plain 'static' function
+// would be duplicated in (and flagged as unused by) every translation unit
+// that includes it without calling it.
+static inline __m128 __lsx_vreplfr2vr_s(float val)
+{
+    FloatInt fi_tmpval = {.f = val};
+    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+}
+
+QT_END_NAMESPACE
+
+#endif // __loongarch_sx
+
+#endif // QDRAWINGPRIMITIVE_LSX_P_H
diff --git a/src/gui/painting/qimagescale.cpp b/src/gui/painting/qimagescale.cpp
index cc95f6773fe..1b1f2e8b8c8 100644
--- a/src/gui/painting/qimagescale.cpp
+++ b/src/gui/painting/qimagescale.cpp
@@ -257,6 +257,18 @@ void qt_qimageScaleAARGBA_down_xy_sse4(QImageScaleInfo *isi, unsigned int *dest,
int dw, int dh, int dow, int sow);
#endif
+#if defined(QT_COMPILER_SUPPORTS_LSX)
+template<bool RGB>
+void qt_qimageScaleAARGBA_up_x_down_y_lsx(QImageScaleInfo *isi, unsigned int *dest,
+ int dw, int dh, int dow, int sow);
+template<bool RGB>
+void qt_qimageScaleAARGBA_down_x_up_y_lsx(QImageScaleInfo *isi, unsigned int *dest,
+ int dw, int dh, int dow, int sow);
+template<bool RGB>
+void qt_qimageScaleAARGBA_down_xy_lsx(QImageScaleInfo *isi, unsigned int *dest,
+ int dw, int dh, int dow, int sow);
+#endif
+
#if defined(__ARM_NEON__)
template<bool RGB>
void qt_qimageScaleAARGBA_up_x_down_y_neon(QImageScaleInfo *isi, unsigned int *dest,
@@ -351,6 +363,10 @@ static void qt_qimageScaleAARGBA(QImageScaleInfo *isi, unsigned int *dest,
if (qCpuHasFeature(SSE4_1))
qt_qimageScaleAARGBA_up_x_down_y_sse4<false>(isi, dest, dw, dh, dow, sow);
else
+#elif defined(QT_COMPILER_SUPPORTS_LSX)
+ if (qCpuHasFeature(LSX))
+ qt_qimageScaleAARGBA_up_x_down_y_lsx<false>(isi, dest, dw, dh, dow, sow);
+ else
#elif defined(__ARM_NEON__)
if (qCpuHasFeature(NEON))
qt_qimageScaleAARGBA_up_x_down_y_neon<false>(isi, dest, dw, dh, dow, sow);
@@ -364,6 +380,10 @@ static void qt_qimageScaleAARGBA(QImageScaleInfo *isi, unsigned int *dest,
if (qCpuHasFeature(SSE4_1))
qt_qimageScaleAARGBA_down_x_up_y_sse4<false>(isi, dest, dw, dh, dow, sow);
else
+#elif defined(QT_COMPILER_SUPPORTS_LSX)
+ if (qCpuHasFeature(LSX))
+ qt_qimageScaleAARGBA_down_x_up_y_lsx<false>(isi, dest, dw, dh, dow, sow);
+ else
#elif defined(__ARM_NEON__)
if (qCpuHasFeature(NEON))
qt_qimageScaleAARGBA_down_x_up_y_neon<false>(isi, dest, dw, dh, dow, sow);
@@ -377,6 +397,10 @@ static void qt_qimageScaleAARGBA(QImageScaleInfo *isi, unsigned int *dest,
if (qCpuHasFeature(SSE4_1))
qt_qimageScaleAARGBA_down_xy_sse4<false>(isi, dest, dw, dh, dow, sow);
else
+#elif defined(QT_COMPILER_SUPPORTS_LSX)
+ if (qCpuHasFeature(LSX))
+ qt_qimageScaleAARGBA_down_xy_lsx<false>(isi, dest, dw, dh, dow, sow);
+ else
#elif defined(__ARM_NEON__)
if (qCpuHasFeature(NEON))
qt_qimageScaleAARGBA_down_xy_neon<false>(isi, dest, dw, dh, dow, sow);
@@ -995,6 +1019,10 @@ static void qt_qimageScaleAARGB(QImageScaleInfo *isi, unsigned int *dest,
if (qCpuHasFeature(SSE4_1))
qt_qimageScaleAARGBA_up_x_down_y_sse4<true>(isi, dest, dw, dh, dow, sow);
else
+#elif defined QT_COMPILER_SUPPORTS_LSX
+ if (qCpuHasFeature(LSX))
+ qt_qimageScaleAARGBA_up_x_down_y_lsx<true>(isi, dest, dw, dh, dow, sow);
+ else
#elif defined(__ARM_NEON__)
if (qCpuHasFeature(NEON))
qt_qimageScaleAARGBA_up_x_down_y_neon<true>(isi, dest, dw, dh, dow, sow);
@@ -1008,6 +1036,10 @@ static void qt_qimageScaleAARGB(QImageScaleInfo *isi, unsigned int *dest,
if (qCpuHasFeature(SSE4_1))
qt_qimageScaleAARGBA_down_x_up_y_sse4<true>(isi, dest, dw, dh, dow, sow);
else
+#elif defined QT_COMPILER_SUPPORTS_LSX
+ if (qCpuHasFeature(LSX))
+ qt_qimageScaleAARGBA_down_x_up_y_lsx<true>(isi, dest, dw, dh, dow, sow);
+ else
#elif defined(__ARM_NEON__)
if (qCpuHasFeature(NEON))
qt_qimageScaleAARGBA_down_x_up_y_neon<true>(isi, dest, dw, dh, dow, sow);
@@ -1021,6 +1053,10 @@ static void qt_qimageScaleAARGB(QImageScaleInfo *isi, unsigned int *dest,
if (qCpuHasFeature(SSE4_1))
qt_qimageScaleAARGBA_down_xy_sse4<true>(isi, dest, dw, dh, dow, sow);
else
+#elif defined QT_COMPILER_SUPPORTS_LSX
+ if (qCpuHasFeature(LSX))
+ qt_qimageScaleAARGBA_down_xy_lsx<true>(isi, dest, dw, dh, dow, sow);
+ else
#elif defined(__ARM_NEON__)
if (qCpuHasFeature(NEON))
qt_qimageScaleAARGBA_down_xy_neon<true>(isi, dest, dw, dh, dow, sow);
diff --git a/src/gui/painting/qimagescale_lsx.cpp b/src/gui/painting/qimagescale_lsx.cpp
new file mode 100644
index 00000000000..c128b014b8c
--- /dev/null
+++ b/src/gui/painting/qimagescale_lsx.cpp
@@ -0,0 +1,233 @@
+// Copyright (C) 2024 Loongson Technology Corporation Limited.
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
+
+#include "qimagescale_p.h"
+#include "qimage.h"
+#include <private/qdrawhelper_loongarch64_p.h>
+#include <private/qsimd_p.h>
+
+#if QT_CONFIG(thread) && !defined(Q_OS_WASM)
+#include <qsemaphore.h>
+#include <private/qthreadpool_p.h>
+#endif
+
+#if defined(QT_COMPILER_SUPPORTS_LSX)
+
+QT_BEGIN_NAMESPACE
+
+using namespace QImageScale;
+
+// Run scaleSection(yStart, yEnd) over the full output range [0, dh).
+// When the source image is large enough (at least 2^16 source pixels per
+// segment) and we are not already executing inside the QtGui thread pool,
+// the range is split into segments dispatched on the pool; the function
+// blocks on a semaphore until every segment has finished. Otherwise the
+// whole range is processed synchronously on the calling thread.
+template<typename T>
+static inline void multithread_pixels_function(QImageScaleInfo *isi, int dh, const T &scaleSection)
+{
+#if QT_CONFIG(thread) && !defined(Q_OS_WASM)
+    int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16);
+    segments = std::min(segments, dh);
+    QThreadPool *threadPool = QThreadPoolPrivate::qtGuiInstance();
+    if (segments > 1 && threadPool && !threadPool->contains(QThread::currentThread())) {
+        QSemaphore semaphore;
+        int y = 0;
+        for (int i = 0; i < segments; ++i) {
+            // Divide the remaining rows evenly over the remaining segments.
+            int yn = (dh - y) / (segments - i);
+            threadPool->start([&, y, yn]() {
+                scaleSection(y, y + yn);
+                semaphore.release(1);
+            });
+            y += yn;
+        }
+        semaphore.acquire(segments);
+        return;
+    }
+#else
+    Q_UNUSED(isi);
+#endif
+    scaleSection(0, dh);
+}
+
+// Accumulate one axis-weighted pixel sum for the scaling filters below.
+// The pixel at *pix gets weight vxyap (== xyap), every following pixel
+// (step elements apart) gets weight vCxy (== Cxy), and the last pixel gets
+// the remainder so the weights sum to 1 << 14. The result holds one color
+// channel per 32-bit lane: the vshuf_b mask zero-extends each source byte
+// to a word (indices 0-3 select the pixel's bytes, index 16 selects a zero
+// byte from the __lsx_vldi(0) operand).
+inline static __m128i Q_DECL_VECTORCALL
+qt_qimageScaleAARGBA_helper(const unsigned int *pix, int xyap, int Cxy,
+                            int step, const __m128i vxyap, const __m128i vCxy)
+{
+    const __m128i shuffleMask = (__m128i)(v16i8){0, 16, 16, 16, 1, 16, 16, 16,
+                                                 2, 16, 16, 16, 3, 16, 16, 16};
+    __m128i vpix = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(*pix), shuffleMask);
+    __m128i vx = __lsx_vmul_w(vpix, vxyap);
+    int i;
+    for (i = (1 << 14) - xyap; i > Cxy; i -= Cxy) {
+        pix += step;
+        vpix = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(*pix), shuffleMask);
+        vx = __lsx_vadd_w(vx, __lsx_vmul_w(vpix, vCxy));
+    }
+    pix += step;
+    vpix = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(*pix), shuffleMask);
+    // 'i' is the leftover weight that tops the total up to exactly 1 << 14.
+    vx = __lsx_vadd_w(vx, __lsx_vmul_w(vpix, __lsx_vreplgr2vr_w(i)));
+    return vx;
+}
+
+// Smooth-scaling filter for upscaling in x while downscaling in y:
+// box-filter vertically over the contributing rows, then, when xap > 0,
+// linearly interpolate horizontally with the neighboring column.
+// RGB=true forces the alpha byte of the output to opaque.
+template<bool RGB>
+void qt_qimageScaleAARGBA_up_x_down_y_lsx(QImageScaleInfo *isi, unsigned int *dest,
+                                          int dw, int dh, int dow, int sow)
+{
+    const unsigned int **ypoints = isi->ypoints;
+    const int *xpoints = isi->xpoints;
+    const int *xapoints = isi->xapoints;
+    const int *yapoints = isi->yapoints;
+
+    const __m128i v256 = __lsx_vreplgr2vr_w(256);
+
+    /* go through every scanline in the output buffer */
+    auto scaleSection = [&] (int yStart, int yEnd) {
+        for (int y = yStart; y < yEnd; ++y) {
+            // Vertical filter: remainder weight (yap) and per-row weight (Cy).
+            const int Cy = yapoints[y] >> 16;
+            const int yap = yapoints[y] & 0xffff;
+            const __m128i vCy = __lsx_vreplgr2vr_w(Cy);
+            const __m128i vyap = __lsx_vreplgr2vr_w(yap);
+
+            unsigned int *dptr = dest + (y * dow);
+            for (int x = 0; x < dw; x++) {
+                const unsigned int *sptr = ypoints[y] + xpoints[x];
+                __m128i vx = qt_qimageScaleAARGBA_helper(sptr, yap, Cy, sow, vyap, vCy);
+
+                const int xap = xapoints[x];
+                if (xap > 0) {
+                    // Horizontal interpolation with the next column,
+                    // 8-bit fixed-point weights (xap / 256).
+                    const __m128i vxap = __lsx_vreplgr2vr_w(xap);
+                    const __m128i vinvxap = __lsx_vsub_w(v256, vxap);
+                    __m128i vr = qt_qimageScaleAARGBA_helper(sptr + 1, yap, Cy, sow, vyap, vCy);
+
+                    vx = __lsx_vmul_w(vx, vinvxap);
+                    vr = __lsx_vmul_w(vr, vxap);
+                    vx = __lsx_vadd_w(vx, vr);
+                    vx = __lsx_vsrli_w(vx, 8);
+                }
+                // Drop the 14-bit fixed point and pack the four channel words
+                // down to bytes with unsigned saturation.
+                vx = __lsx_vsrli_w(vx, 14);
+                vx = __lsx_vpickev_h(__lsx_vsat_wu(vx, 15), __lsx_vsat_wu(vx, 15));
+                vx = __lsx_vpickev_b(__lsx_vsat_hu(vx, 7), __lsx_vsat_hu(vx, 7));
+                *dptr = __lsx_vpickve2gr_w(vx, 0);
+                if (RGB)
+                    *dptr |= 0xff000000;
+                dptr++;
+            }
+        }
+    };
+    multithread_pixels_function(isi, dh, scaleSection);
+}
+
+// Smooth-scaling filter for downscaling in x while upscaling in y:
+// box-filter horizontally over the contributing columns, then, when yap > 0,
+// linearly interpolate vertically with the next scanline.
+// RGB=true forces the alpha byte of the output to opaque.
+template<bool RGB>
+void qt_qimageScaleAARGBA_down_x_up_y_lsx(QImageScaleInfo *isi, unsigned int *dest,
+                                          int dw, int dh, int dow, int sow)
+{
+    const unsigned int **ypoints = isi->ypoints;
+    int *xpoints = isi->xpoints;
+    int *xapoints = isi->xapoints;
+    int *yapoints = isi->yapoints;
+
+    const __m128i v256 = __lsx_vreplgr2vr_w(256);
+
+    /* go through every scanline in the output buffer */
+    auto scaleSection = [&] (int yStart, int yEnd) {
+        for (int y = yStart; y < yEnd; ++y) {
+            unsigned int *dptr = dest + (y * dow);
+            for (int x = 0; x < dw; x++) {
+                // Horizontal filter: remainder weight (xap), per-column
+                // weight (Cx).
+                int Cx = xapoints[x] >> 16;
+                int xap = xapoints[x] & 0xffff;
+                const __m128i vCx = __lsx_vreplgr2vr_w(Cx);
+                const __m128i vxap = __lsx_vreplgr2vr_w(xap);
+
+                const unsigned int *sptr = ypoints[y] + xpoints[x];
+                __m128i vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
+
+                int yap = yapoints[y];
+                if (yap > 0) {
+                    // Vertical interpolation with the next scanline,
+                    // 8-bit fixed-point weights (yap / 256).
+                    const __m128i vyap = __lsx_vreplgr2vr_w(yap);
+                    const __m128i vinvyap = __lsx_vsub_w(v256, vyap);
+                    __m128i vr = qt_qimageScaleAARGBA_helper(sptr + sow, xap, Cx, 1, vxap, vCx);
+
+                    vx = __lsx_vmul_w(vx, vinvyap);
+                    vr = __lsx_vmul_w(vr, vyap);
+                    vx = __lsx_vadd_w(vx, vr);
+                    vx = __lsx_vsrli_w(vx, 8);
+                }
+                // Drop the 14-bit fixed point and pack the four channel words
+                // down to bytes with unsigned saturation. After vpickev_h the
+                // vector holds halfwords, so the 16->8 step must use halfword
+                // saturation (vsat_hu) in both operands, as in the
+                // up_x_down_y variant; vsat_wu there was a typo.
+                vx = __lsx_vsrli_w(vx, 14);
+                vx = __lsx_vpickev_h(__lsx_vsat_wu(vx, 15), __lsx_vsat_wu(vx, 15));
+                vx = __lsx_vpickev_b(__lsx_vsat_hu(vx, 7), __lsx_vsat_hu(vx, 7));
+                *dptr = __lsx_vpickve2gr_w(vx, 0);
+                if (RGB)
+                    *dptr |= 0xff000000;
+                dptr++;
+            }
+        }
+    };
+    multithread_pixels_function(isi, dh, scaleSection);
+}
+
+// Smooth-scaling filter for downscaling in both axes: box-filter
+// horizontally (via the helper) for every contributing scanline, then
+// box-filter those row sums vertically. Row sums are pre-shifted right by 4
+// so the combined weight product fits in 32 bits; the final shift by 24
+// removes the remaining fixed-point scale.
+// RGB=true forces the alpha byte of the output to opaque.
+template<bool RGB>
+void qt_qimageScaleAARGBA_down_xy_lsx(QImageScaleInfo *isi, unsigned int *dest,
+                                      int dw, int dh, int dow, int sow)
+{
+    const unsigned int **ypoints = isi->ypoints;
+    int *xpoints = isi->xpoints;
+    int *xapoints = isi->xapoints;
+    int *yapoints = isi->yapoints;
+
+    auto scaleSection = [&] (int yStart, int yEnd) {
+        for (int y = yStart; y < yEnd; ++y) {
+            int Cy = yapoints[y] >> 16;
+            int yap = yapoints[y] & 0xffff;
+            const __m128i vCy = __lsx_vreplgr2vr_w(Cy);
+            const __m128i vyap = __lsx_vreplgr2vr_w(yap);
+
+            unsigned int *dptr = dest + (y * dow);
+            for (int x = 0; x < dw; x++) {
+                const int Cx = xapoints[x] >> 16;
+                const int xap = xapoints[x] & 0xffff;
+                const __m128i vCx = __lsx_vreplgr2vr_w(Cx);
+                const __m128i vxap = __lsx_vreplgr2vr_w(xap);
+
+                const unsigned int *sptr = ypoints[y] + xpoints[x];
+                // First scanline gets the remainder weight yap...
+                __m128i vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
+                __m128i vr = __lsx_vmul_w(__lsx_vsrli_w(vx, 4), vyap);
+
+                // ...the following ones weight Cy, and the last gets the
+                // leftover j so the vertical weights also sum to 1 << 14.
+                int j;
+                for (j = (1 << 14) - yap; j > Cy; j -= Cy) {
+                    sptr += sow;
+                    vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
+                    vr = __lsx_vadd_w(vr, __lsx_vmul_w(__lsx_vsrli_w(vx, 4), vCy));
+                }
+                sptr += sow;
+                vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
+                vr = __lsx_vadd_w(vr, __lsx_vmul_w(__lsx_vsrli_w(vx, 4), __lsx_vreplgr2vr_w(j)));
+
+                // Remove the fixed-point scale and pack the channel words down
+                // to bytes with unsigned saturation.
+                vr = __lsx_vsrli_w(vr, 24);
+                vr = __lsx_vpickev_h(__lsx_vldi(0), __lsx_vsat_wu(vr, 15));
+                vr = __lsx_vpickev_b(__lsx_vldi(0), __lsx_vsat_hu(vr, 7));
+                *dptr = __lsx_vpickve2gr_w(vr, 0);
+                if (RGB)
+                    *dptr |= 0xff000000;
+                dptr++;
+            }
+        }
+    };
+    multithread_pixels_function(isi, dh, scaleSection);
+}
+
+// Explicit instantiations for the two variants dispatched from
+// qimagescale.cpp: RGBA (RGB = false) and opaque RGB (RGB = true).
+template void qt_qimageScaleAARGBA_up_x_down_y_lsx<false>(QImageScaleInfo *isi, unsigned int *dest,
+                                                          int dw, int dh, int dow, int sow);
+
+template void qt_qimageScaleAARGBA_up_x_down_y_lsx<true>(QImageScaleInfo *isi, unsigned int *dest,
+                                                         int dw, int dh, int dow, int sow);
+
+template void qt_qimageScaleAARGBA_down_x_up_y_lsx<false>(QImageScaleInfo *isi, unsigned int *dest,
+                                                          int dw, int dh, int dow, int sow);
+
+template void qt_qimageScaleAARGBA_down_x_up_y_lsx<true>(QImageScaleInfo *isi, unsigned int *dest,
+                                                         int dw, int dh, int dow, int sow);
+
+template void qt_qimageScaleAARGBA_down_xy_lsx<false>(QImageScaleInfo *isi, unsigned int *dest,
+                                                      int dw, int dh, int dow, int sow);
+
+template void qt_qimageScaleAARGBA_down_xy_lsx<true>(QImageScaleInfo *isi, unsigned int *dest,
+                                                     int dw, int dh, int dow, int sow);
+
+QT_END_NAMESPACE
+
+#endif