diff options
author | Chen Zhanwang <[email protected]> | 2024-06-21 17:05:49 +0800 |
---|---|---|
committer | Volker Hilsheimer <[email protected]> | 2024-10-22 18:12:15 +0200 |
commit | d511a68684c2f76b48c696c5f8a04c22ef2d00fe (patch) | |
tree | 22959c8691e6484274b0b71cbe1cc0af0e182b1b | |
parent | 73ce5a940ab4110e1140bf1ed0a71d34448a4be0 (diff) |
Complete drawhelper Func with LSX
List of optimized implementations using LSX:
- qt_blend_argb32_on_argb32
- qt_blend_rgb32_on_rgb32
- comp_func_SourceOver
- comp_func_Plus
- comp_func_Source
- comp_func_solid_Source
- comp_func_solid_SourceOver
- qt_memfill64
- qt_memfill32
- qt_bitmapblit32
- qt_bitmapblit16
- qt_scale_image_argb32_on_argb32
- convert_RGB888_to_RGB32
- qt_qimageScaleAARGBA_up_x_down_y
- qt_qimageScaleAARGBA_down_x_up_y
- qt_qimageScaleAARGBA_down_xy
All of the above functions have passed the tests under tests/auto/gui.
Change-Id: I7ae6169305b81bdf7fb704619453c505f8bb960f
Reviewed-by: Volker Hilsheimer <[email protected]>
-rw-r--r-- | src/gui/CMakeLists.txt | 9 | ||||
-rw-r--r-- | src/gui/image/qimage_conversions.cpp | 12 | ||||
-rw-r--r-- | src/gui/image/qimage_lsx.cpp | 115 | ||||
-rw-r--r-- | src/gui/image/qimage_p.h | 2 | ||||
-rw-r--r-- | src/gui/painting/qdrawhelper.cpp | 72 | ||||
-rw-r--r-- | src/gui/painting/qdrawhelper_loongarch64_p.h | 48 | ||||
-rw-r--r-- | src/gui/painting/qdrawhelper_lsx.cpp | 593 | ||||
-rw-r--r-- | src/gui/painting/qdrawhelper_p.h | 2 | ||||
-rw-r--r-- | src/gui/painting/qdrawingprimitive_lsx_p.h | 231 | ||||
-rw-r--r-- | src/gui/painting/qimagescale.cpp | 36 | ||||
-rw-r--r-- | src/gui/painting/qimagescale_lsx.cpp | 233 |
11 files changed, 1347 insertions, 6 deletions
diff --git a/src/gui/CMakeLists.txt b/src/gui/CMakeLists.txt index d7a247fa45c..d6f49cd2bd7 100644 --- a/src/gui/CMakeLists.txt +++ b/src/gui/CMakeLists.txt @@ -183,6 +183,8 @@ qt_internal_add_module(Gui painting/qdrawhelper_p.h painting/qdrawhelper_x86_p.h painting/qdrawingprimitive_sse2_p.h + painting/qdrawhelper_loongarch64_p.h + painting/qdrawingprimitive_lsx_p.h painting/qemulationpaintengine.cpp painting/qemulationpaintengine_p.h painting/qfixed_p.h painting/qgrayraster.c painting/qgrayraster_p.h @@ -655,6 +657,13 @@ qt_internal_add_simd_part(Gui SIMD neon painting/qimagescale_neon.cpp ) +qt_internal_add_simd_part(Gui SIMD lsx + SOURCES + image/qimage_lsx.cpp + painting/qdrawhelper_lsx.cpp + painting/qimagescale_lsx.cpp +) + if(NOT ANDROID) qt_internal_add_simd_part(Gui SIMD mips_dsp SOURCES diff --git a/src/gui/image/qimage_conversions.cpp b/src/gui/image/qimage_conversions.cpp index 09caf558e06..ec75b8c386e 100644 --- a/src/gui/image/qimage_conversions.cpp +++ b/src/gui/image/qimage_conversions.cpp @@ -2754,6 +2754,18 @@ static void qInitImageConversions() } #endif +#if defined(QT_COMPILER_SUPPORTS_LSX) + if (qCpuHasFeature(LSX)) { + extern void convert_RGB888_to_RGB32_lsx(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags); + qimage_converter_map[QImage::Format_RGB888][QImage::Format_RGB32] = convert_RGB888_to_RGB32_lsx; + qimage_converter_map[QImage::Format_RGB888][QImage::Format_ARGB32] = convert_RGB888_to_RGB32_lsx; + qimage_converter_map[QImage::Format_RGB888][QImage::Format_ARGB32_Premultiplied] = convert_RGB888_to_RGB32_lsx; + qimage_converter_map[QImage::Format_BGR888][QImage::Format_RGBX8888] = convert_RGB888_to_RGB32_lsx; + qimage_converter_map[QImage::Format_BGR888][QImage::Format_RGBA8888] = convert_RGB888_to_RGB32_lsx; + qimage_converter_map[QImage::Format_BGR888][QImage::Format_RGBA8888_Premultiplied] = convert_RGB888_to_RGB32_lsx; + } +#endif + #if defined(__ARM_NEON__) extern void convert_RGB888_to_RGB32_neon(QImageData *dest, 
const QImageData *src, Qt::ImageConversionFlags); qimage_converter_map[QImage::Format_RGB888][QImage::Format_RGB32] = convert_RGB888_to_RGB32_neon; diff --git a/src/gui/image/qimage_lsx.cpp b/src/gui/image/qimage_lsx.cpp new file mode 100644 index 00000000000..e99a6087d02 --- /dev/null +++ b/src/gui/image/qimage_lsx.cpp @@ -0,0 +1,115 @@ +// Copyright (C) 2016 The Qt Company Ltd. +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only + +#include <qimage.h> +#include <private/qimage_p.h> +#include <private/qsimd_p.h> + +#ifdef QT_COMPILER_SUPPORTS_LSX + +QT_BEGIN_NAMESPACE + +// Convert a scanline of RGB888 (src) to RGB32 (dst) +// src must be at least len * 3 bytes +// dst must be at least len * 4 bytes +Q_GUI_EXPORT void QT_FASTCALL qt_convert_rgb888_to_rgb32_lsx(quint32 *dst, const uchar *src, int len) +{ + int i = 0; + + // Prologue, align dst to 16 bytes. + ALIGNMENT_PROLOGUE_16BYTES(dst, i, len) { + dst[i] = qRgb(src[0], src[1], src[2]); + src += 3; + } + + // Mask the 4 first colors of the RGB888 vector + const __m128i shuffleMask = (__m128i)(v16i8){2, 1, 0, 16, 5, 4, 3, 16, + 8, 7, 6, 16, 11, 10, 9, 16}; + // Mask the 4 last colors of a RGB888 vector with an offset of 1 (so the last 3 bytes are RGB) + const __m128i shuffleMaskEnd = (__m128i)(v16i8){6, 5, 4, 16, 9, 8, 7, 16, + 12, 11, 10, 16, 15, 14, 13, 16}; + // Mask to have alpha = 0xff + const __m128i alphaMask = __lsx_vreplgr2vr_b(0xff); + + // Mask to concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by 12 bytes + const __m128i indexMask1 = (__m128i)(v16i8){12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27}; + + // Mask to concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by 8 bytes + const __m128i indexMask2 = (__m128i)(v16i8){8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23}; + + const __m128i *inVectorPtr = (const __m128i *)src; + __m128i 
*dstVectorPtr = (__m128i *)(dst + i); + + for (; i < (len - 15); i += 16) { // one iteration in the loop converts 16 pixels + /* + RGB888 has 5 pixels per vector, + 1 byte from the next pixel. The idea here is + to load vectors of RGB888 and use palignr to select a vector out of two vectors. + + After 3 loads of RGB888 and 3 stores of RGB32, we have 4 pixels left in the last + vector of RGB888, we can mask it directly to get a last store or RGB32. After that, + the first next byte is a R, and we can loop for the next 16 pixels. + + The conversion itself is done with a byte permutation (vshuf_b). + */ + __m128i firstSrcVector = __lsx_vld(inVectorPtr, 0); + __m128i outputVector = __lsx_vshuf_b(alphaMask, firstSrcVector, shuffleMask); + __lsx_vst(outputVector, dstVectorPtr, 0); + ++inVectorPtr; + ++dstVectorPtr; + + // There are 4 unused bytes left in srcVector, we need to load the next 16 bytes + __m128i secondSrcVector = __lsx_vld(inVectorPtr, 0); + __m128i srcVector = __lsx_vshuf_b(secondSrcVector, firstSrcVector, indexMask1); + outputVector = __lsx_vshuf_b(alphaMask, srcVector, shuffleMask); + __lsx_vst(outputVector, dstVectorPtr, 0); + ++inVectorPtr; + ++dstVectorPtr; + firstSrcVector = secondSrcVector; + + // We now have 8 unused bytes left in firstSrcVector + secondSrcVector = __lsx_vld(inVectorPtr, 0); + srcVector = __lsx_vshuf_b(secondSrcVector, firstSrcVector, indexMask2); + outputVector = __lsx_vshuf_b(alphaMask, srcVector, shuffleMask); + __lsx_vst(outputVector, dstVectorPtr, 0); + ++inVectorPtr; + ++dstVectorPtr; + + // There are now 12 unused bytes in firstSrcVector. + // We can mask them directly, almost there. 
+ outputVector = __lsx_vshuf_b(alphaMask, secondSrcVector, shuffleMaskEnd); + __lsx_vst(outputVector, dstVectorPtr, 0); + ++dstVectorPtr; + } + src = (const uchar *)inVectorPtr; + + SIMD_EPILOGUE(i, len, 15) { + dst[i] = qRgb(src[0], src[1], src[2]); + src += 3; + } +} + +void convert_RGB888_to_RGB32_lsx(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags) +{ + Q_ASSERT(src->format == QImage::Format_RGB888 || src->format == QImage::Format_BGR888); + if (src->format == QImage::Format_BGR888) + Q_ASSERT(dest->format == QImage::Format_RGBX8888 || dest->format == QImage::Format_RGBA8888 || dest->format == QImage::Format_RGBA8888_Premultiplied); + else + Q_ASSERT(dest->format == QImage::Format_RGB32 || dest->format == QImage::Format_ARGB32 || dest->format == QImage::Format_ARGB32_Premultiplied); + Q_ASSERT(src->width == dest->width); + Q_ASSERT(src->height == dest->height); + + const uchar *src_data = (uchar *) src->data; + quint32 *dest_data = (quint32 *) dest->data; + + for (int i = 0; i < src->height; ++i) { + qt_convert_rgb888_to_rgb32_lsx(dest_data, src_data, src->width); + src_data += src->bytes_per_line; + dest_data = (quint32 *)((uchar*)dest_data + dest->bytes_per_line); + } +} + +QT_END_NAMESPACE + +#endif // QT_COMPILER_SUPPORTS_LSX diff --git a/src/gui/image/qimage_p.h b/src/gui/image/qimage_p.h index cdae61698b3..65eafc20b8d 100644 --- a/src/gui/image/qimage_p.h +++ b/src/gui/image/qimage_p.h @@ -560,7 +560,7 @@ inline QImage::Format qt_opaqueVersionForPainting(QImage::Format format) inline QImage::Format qt_alphaVersionForPainting(QImage::Format format) { QImage::Format toFormat = qt_alphaVersion(format); -#if defined(__ARM_NEON__) || defined(__SSE2__) +#if defined(__ARM_NEON__) || defined(__SSE2__) || defined(QT_COMPILER_SUPPORTS_LSX) // If we are switching depth anyway and we have optimized ARGB32PM routines, upgrade to that. 
if (qt_depthForFormat(format) != qt_depthForFormat(toFormat) && qt_depthForFormat(toFormat) <= 32) toFormat = QImage::Format_ARGB32_Premultiplied; diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp index 376753879b5..2bfca562249 100644 --- a/src/gui/painting/qdrawhelper.cpp +++ b/src/gui/painting/qdrawhelper.cpp @@ -12,6 +12,8 @@ #include <private/qdrawhelper_p.h> #include <private/qdrawhelper_x86_p.h> #include <private/qdrawingprimitive_sse2_p.h> +#include <private/qdrawhelper_loongarch64_p.h> +#include <private/qdrawingprimitive_lsx_p.h> #include <private/qdrawhelper_neon_p.h> #if defined(QT_COMPILER_SUPPORTS_MIPS_DSP) || defined(QT_COMPILER_SUPPORTS_MIPS_DSPR2) #include <private/qdrawhelper_mips_dsp_p.h> @@ -4971,7 +4973,7 @@ void qBlendTexture(int count, const QT_FT_Span *spans, void *userData) case QImage::Format_RGB16: proc = processTextureSpansRGB16[blendType]; break; -#if defined(__SSE2__) || defined(__ARM_NEON__) || (Q_PROCESSOR_WORDSIZE == 8) +#if defined(__SSE2__) || defined(__ARM_NEON__) || defined(QT_COMPILER_SUPPORTS_LSX) || (Q_PROCESSOR_WORDSIZE == 8) case QImage::Format_ARGB32: case QImage::Format_RGBA8888: #endif @@ -5113,7 +5115,7 @@ void qBlendGradient(int count, const QT_FT_Span *spans, void *userData) if (isVerticalGradient && blend_vertical_gradient_argb(count, spans, userData)) return; return blend_src_generic(count, spans, userData); -#if defined(__SSE2__) || defined(__ARM_NEON__) || (Q_PROCESSOR_WORDSIZE == 8) +#if defined(__SSE2__) || defined(__ARM_NEON__) || defined(QT_COMPILER_SUPPORTS_LSX) || (Q_PROCESSOR_WORDSIZE == 8) case QImage::Format_ARGB32: case QImage::Format_RGBA8888: #endif @@ -6368,7 +6370,7 @@ DrawHelper qDrawHelper[] = static_assert(std::size(qDrawHelper) == QImage::NImageFormats); -#if !defined(Q_PROCESSOR_X86) +#if !defined(Q_PROCESSOR_X86) && !defined(QT_COMPILER_SUPPORTS_LSX) void qt_memfill64(quint64 *dest, quint64 color, qsizetype count) { qt_memfill_template<quint64>(dest, color, count); 
@@ -6435,7 +6437,7 @@ void qt_memfill16(quint16 *dest, quint16 value, qsizetype count) qt_memfill32(reinterpret_cast<quint32*>(dest), value32, count / 2); } -#if defined(Q_PROCESSOR_X86) +#if defined(Q_PROCESSOR_X86) || defined(QT_COMPILER_SUPPORTS_LSX) void (*qt_memfill32)(quint32 *dest, quint32 value, qsizetype count) = nullptr; void (*qt_memfill64)(quint64 *dest, quint64 value, qsizetype count) = nullptr; #elif !defined(__ARM_NEON__) && !defined(__MIPS_DSP__) @@ -6712,6 +6714,68 @@ static void qInitDrawhelperFunctions() #endif // SSE2 +#if defined(QT_COMPILER_SUPPORTS_LSX) + if (qCpuHasFeature(LSX)) { + qt_memfill32 = qt_memfill32_lsx; + qt_memfill64 = qt_memfill64_lsx; + + qDrawHelper[QImage::Format_RGB32].bitmapBlit = qt_bitmapblit32_lsx; + qDrawHelper[QImage::Format_ARGB32].bitmapBlit = qt_bitmapblit32_lsx; + qDrawHelper[QImage::Format_ARGB32_Premultiplied].bitmapBlit = qt_bitmapblit32_lsx; + qDrawHelper[QImage::Format_RGB16].bitmapBlit = qt_bitmapblit16_lsx; + qDrawHelper[QImage::Format_RGBX8888].bitmapBlit = qt_bitmapblit8888_lsx; + qDrawHelper[QImage::Format_RGBA8888].bitmapBlit = qt_bitmapblit8888_lsx; + qDrawHelper[QImage::Format_RGBA8888_Premultiplied].bitmapBlit = qt_bitmapblit8888_lsx; + + extern void qt_scale_image_argb32_on_argb32_lsx(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, int srch, + const QRectF &targetRect, + const QRectF &sourceRect, + const QRect &clip, + int const_alpha); + + qScaleFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_scale_image_argb32_on_argb32_lsx; + qScaleFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_scale_image_argb32_on_argb32_lsx; + qScaleFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBA8888_Premultiplied] = qt_scale_image_argb32_on_argb32_lsx; + qScaleFunctions[QImage::Format_RGBX8888][QImage::Format_RGBA8888_Premultiplied] = qt_scale_image_argb32_on_argb32_lsx; + + extern void qt_blend_rgb32_on_rgb32_lsx(uchar 
*destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha); + + extern void qt_blend_argb32_on_argb32_lsx(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha); + + qBlendFunctions[QImage::Format_RGB32][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_lsx; + qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_lsx; + qBlendFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_lsx; + qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_lsx; + qBlendFunctions[QImage::Format_RGBX8888][QImage::Format_RGBX8888] = qt_blend_rgb32_on_rgb32_lsx; + qBlendFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBX8888] = qt_blend_rgb32_on_rgb32_lsx; + qBlendFunctions[QImage::Format_RGBX8888][QImage::Format_RGBA8888_Premultiplied] = qt_blend_argb32_on_argb32_lsx; + qBlendFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBA8888_Premultiplied] = qt_blend_argb32_on_argb32_lsx; + + extern const uint * QT_FASTCALL qt_fetch_radial_gradient_lsx(uint *buffer, const Operator *op, const QSpanData *data, + int y, int x, int length); + + qt_fetch_radial_gradient = qt_fetch_radial_gradient_lsx; + + extern void QT_FASTCALL comp_func_SourceOver_lsx(uint *destPixels, const uint *srcPixels, int length, uint const_alpha); + extern void QT_FASTCALL comp_func_solid_SourceOver_lsx(uint *destPixels, int length, uint color, uint const_alpha); + extern void QT_FASTCALL comp_func_Source_lsx(uint *destPixels, const uint *srcPixels, int length, uint const_alpha); + extern void QT_FASTCALL comp_func_solid_Source_lsx(uint *destPixels, int length, uint color, uint const_alpha); + extern void QT_FASTCALL comp_func_Plus_lsx(uint *destPixels, const uint *srcPixels, int length, uint const_alpha); + 
qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_lsx; + qt_functionForModeSolid_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_lsx; + qt_functionForMode_C[QPainter::CompositionMode_Source] = comp_func_Source_lsx; + qt_functionForModeSolid_C[QPainter::CompositionMode_Source] = comp_func_solid_Source_lsx; + qt_functionForMode_C[QPainter::CompositionMode_Plus] = comp_func_Plus_lsx; + } +#endif //QT_COMPILER_SUPPORTS_LSX + #if defined(__ARM_NEON__) qBlendFunctions[QImage::Format_RGB32][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_neon; qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_neon; diff --git a/src/gui/painting/qdrawhelper_loongarch64_p.h b/src/gui/painting/qdrawhelper_loongarch64_p.h new file mode 100644 index 00000000000..a5513e3e55a --- /dev/null +++ b/src/gui/painting/qdrawhelper_loongarch64_p.h @@ -0,0 +1,48 @@ +// Copyright (C) 2024 Loongson Technology Corporation Limited. +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only + +#ifndef QDRAWHELPER_LOONGARCH64_P_H +#define QDRAWHELPER_LOONGARCH64_P_H + +// +// W A R N I N G +// ------------- +// +// This file is not part of the Qt API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. 
+// + +#include <QtGui/private/qtguiglobal_p.h> +#include <private/qdrawhelper_p.h> + +QT_BEGIN_NAMESPACE + +#ifdef QT_COMPILER_SUPPORTS_LSX +void qt_memfill64_lsx(quint64 *dest, quint64 value, qsizetype count); +void qt_memfill32_lsx(quint32 *dest, quint32 value, qsizetype count); +void qt_bitmapblit32_lsx(QRasterBuffer *rasterBuffer, int x, int y, + const QRgba64 &color, + const uchar *src, int width, int height, int stride); +void qt_bitmapblit8888_lsx(QRasterBuffer *rasterBuffer, int x, int y, + const QRgba64 &color, + const uchar *src, int width, int height, int stride); +void qt_bitmapblit16_lsx(QRasterBuffer *rasterBuffer, int x, int y, + const QRgba64 &color, + const uchar *src, int width, int height, int stride); +void qt_blend_argb32_on_argb32_lsx(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha); +void qt_blend_rgb32_on_rgb32_lsx(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha); + +#endif // QT_COMPILER_SUPPORTS_LSX + +QT_END_NAMESPACE + +#endif // QDRAWHELPER_LOONGARCH64_P_H diff --git a/src/gui/painting/qdrawhelper_lsx.cpp b/src/gui/painting/qdrawhelper_lsx.cpp new file mode 100644 index 00000000000..f28374bc0d3 --- /dev/null +++ b/src/gui/painting/qdrawhelper_lsx.cpp @@ -0,0 +1,593 @@ +// Copyright (C) 2024 Loongson Technology Corporation Limited. 
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only + +#include <private/qdrawhelper_loongarch64_p.h> + +#ifdef QT_COMPILER_SUPPORTS_LSX + +#include <private/qdrawingprimitive_lsx_p.h> +#include <private/qpaintengine_raster_p.h> + +QT_BEGIN_NAMESPACE + +void qt_blend_argb32_on_argb32_lsx(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha) +{ + const quint32 *src = (const quint32 *) srcPixels; + quint32 *dst = (quint32 *) destPixels; + if (const_alpha == 256) { + for (int y = 0; y < h; ++y) { + BLEND_SOURCE_OVER_ARGB32_LSX(dst, src, w); + dst = (quint32 *)(((uchar *) dst) + dbpl); + src = (const quint32 *)(((const uchar *) src) + sbpl); + } + } else if (const_alpha != 0) { + // dest = (s + d * sia) * ca + d * cia + // = s * ca + d * (sia * ca + cia) + // = s * ca + d * (1 - sa*ca) + const_alpha = (const_alpha * 255) >> 8; + + for (int y = 0; y < h; ++y) { + BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(dst, src, w, const_alpha); + dst = (quint32 *)(((uchar *) dst) + dbpl); + src = (const quint32 *)(((const uchar *) src) + sbpl); + } + } +} + +// qblendfunctions.cpp +void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha); + +void qt_blend_rgb32_on_rgb32_lsx(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha) +{ + const quint32 *src = (const quint32 *) srcPixels; + quint32 *dst = (quint32 *) destPixels; + if (const_alpha != 256) { + if (const_alpha != 0) { + const __m128i half = __lsx_vreplgr2vr_h(0x80); + const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff); + + const_alpha = (const_alpha * 255) >> 8; + int one_minus_const_alpha = 255 - const_alpha; + const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha); + const __m128i oneMinusConstAlpha = __lsx_vreplgr2vr_h(one_minus_const_alpha); + for (int y = 0; y < h; ++y) { + int x = 0; + + // 
First, align dest to 16 bytes: + ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) { + dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, + dst[x], one_minus_const_alpha); + } + + for (; x < w-3; x += 4) { + __m128i srcVector = __lsx_vld(&src[x], 0); + __m128i dstVector = __lsx_vld(&dst[x], 0); + INTERPOLATE_PIXEL_255_LSX(srcVector, dstVector, constAlphaVector, + oneMinusConstAlpha, colorMask, half); + __lsx_vst(dstVector, &dst[x], 0); + } + SIMD_EPILOGUE(x, w, 3) + dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, + dst[x], one_minus_const_alpha); + dst = (quint32 *)(((uchar *) dst) + dbpl); + src = (const quint32 *)(((const uchar *) src) + sbpl); + } + } + } else { + qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha); + } +} + +void QT_FASTCALL comp_func_SourceOver_lsx(uint *destPixels, const uint *srcPixels, + int length, uint const_alpha) +{ + Q_ASSERT(const_alpha < 256); + + const quint32 *src = (const quint32 *) srcPixels; + quint32 *dst = (quint32 *) destPixels; + + if (const_alpha == 255) { + BLEND_SOURCE_OVER_ARGB32_LSX(dst, src, length); + } else { + BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(dst, src, length, const_alpha); + } +} + +void QT_FASTCALL comp_func_Plus_lsx(uint *dst, const uint *src, int length, uint const_alpha) +{ + int x = 0; + + if (const_alpha == 255) { + // 1) Prologue: align destination on 16 bytes + ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) + dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]); + + // 2) composition with LSX + for (; x < length - 3; x += 4) { + const __m128i srcVector = __lsx_vld(&src[x], 0); + const __m128i dstVector = __lsx_vld(&dst[x], 0); + + const __m128i result = __lsx_vsadd_bu(srcVector, dstVector); + __lsx_vst(result, &dst[x], 0); + } + + // 3) Epilogue: + SIMD_EPILOGUE(x, length, 3) + dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]); + } else { + const int one_minus_const_alpha = 255 - const_alpha; + const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha); + const __m128i 
oneMinusConstAlpha = __lsx_vreplgr2vr_h(one_minus_const_alpha); + + // 1) Prologue: align destination on 16 bytes + ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) + dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], + const_alpha, + one_minus_const_alpha); + + const __m128i half = __lsx_vreplgr2vr_h(0x80); + const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff); + // 2) composition with LSX + for (; x < length - 3; x += 4) { + const __m128i srcVector = __lsx_vld(&src[x], 0); + __m128i dstVector = __lsx_vld(&dst[x], 0); + __m128i result = __lsx_vsadd_bu(srcVector, dstVector); + INTERPOLATE_PIXEL_255_LSX(result, dstVector, constAlphaVector, + oneMinusConstAlpha, colorMask, half); + __lsx_vst(dstVector, &dst[x], 0); + } + + // 3) Epilogue: + SIMD_EPILOGUE(x, length, 3) + dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], + const_alpha, one_minus_const_alpha); + } +} + +void QT_FASTCALL comp_func_Source_lsx(uint *dst, const uint *src, int length, uint const_alpha) +{ + if (const_alpha == 255) { + ::memcpy(dst, src, length * sizeof(uint)); + } else { + const int ialpha = 255 - const_alpha; + + int x = 0; + + // 1) prologue, align on 16 bytes + ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) + dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha); + + // 2) interpolate pixels with LSX + const __m128i half = __lsx_vreplgr2vr_h(0x80); + const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff); + + const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha); + const __m128i oneMinusConstAlpha = __lsx_vreplgr2vr_h(ialpha); + for (; x < length - 3; x += 4) { + const __m128i srcVector = __lsx_vld(&src[x], 0); + __m128i dstVector = __lsx_vld(&dst[x], 0); + INTERPOLATE_PIXEL_255_LSX(srcVector, dstVector, constAlphaVector, + oneMinusConstAlpha, colorMask, half); + __lsx_vst(dstVector, &dst[x], 0); + } + + // 3) Epilogue + SIMD_EPILOGUE(x, length, 3) + dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha); + } +} + +static 
Q_NEVER_INLINE +void Q_DECL_VECTORCALL qt_memfillXX_aligned(void *dest, __m128i value128, quintptr bytecount) +{ + __m128i *dst128 = reinterpret_cast<__m128i *>(dest); + __m128i *end128 = reinterpret_cast<__m128i *>(static_cast<uchar *>(dest) + bytecount); + + while (dst128 + 4 <= end128) { + __lsx_vst(value128, dst128 + 0, 0); + __lsx_vst(value128, dst128 + 1, 0); + __lsx_vst(value128, dst128 + 2, 0); + __lsx_vst(value128, dst128 + 3, 0); + dst128 += 4; + } + + bytecount %= 4 * sizeof(__m128i); + switch (bytecount / sizeof(__m128i)) { + case 3: __lsx_vst(value128, dst128++, 0); Q_FALLTHROUGH(); + case 2: __lsx_vst(value128, dst128++, 0); Q_FALLTHROUGH(); + case 1: __lsx_vst(value128, dst128++, 0); + } +} + +void qt_memfill64_lsx(quint64 *dest, quint64 value, qsizetype count) +{ + quintptr misaligned = quintptr(dest) % sizeof(__m128i); + if (misaligned && count) { + *dest++ = value; + --count; + } + + if (count % 2) { + dest[count - 1] = value; + --count; + } + + qt_memfillXX_aligned(dest, __lsx_vreplgr2vr_d(value), count * sizeof(quint64)); +} + +void qt_memfill32_lsx(quint32 *dest, quint32 value, qsizetype count) +{ + if (count < 4) { + // this simplifies the code below: the first switch can fall through + // without checking the value of count + switch (count) { + case 3: *dest++ = value; Q_FALLTHROUGH(); + case 2: *dest++ = value; Q_FALLTHROUGH(); + case 1: *dest = value; + } + return; + } + + const int align = (quintptr)(dest) & 0xf; + switch (align) { + case 4: *dest++ = value; --count; Q_FALLTHROUGH(); + case 8: *dest++ = value; --count; Q_FALLTHROUGH(); + case 12: *dest++ = value; --count; + } + + const int rest = count & 0x3; + if (rest) { + switch (rest) { + case 3: dest[count - 3] = value; Q_FALLTHROUGH(); + case 2: dest[count - 2] = value; Q_FALLTHROUGH(); + case 1: dest[count - 1] = value; + } + } + + qt_memfillXX_aligned(dest, __lsx_vreplgr2vr_w(value), count * sizeof(quint32)); +} + +void QT_FASTCALL comp_func_solid_Source_lsx(uint *destPixels, int 
length, + uint color, uint const_alpha) +{ + if (const_alpha == 255) { + qt_memfill32(destPixels, color, length); + } else { + const quint32 ialpha = 255 - const_alpha; + color = BYTE_MUL(color, const_alpha); + int x = 0; + + quint32 *dst = (quint32 *) destPixels; + const __m128i colorVector = __lsx_vreplgr2vr_w(color); + const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff); + const __m128i half = __lsx_vreplgr2vr_h(0x80); + const __m128i iAlphaVector = __lsx_vreplgr2vr_h(ialpha); + + ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) + destPixels[x] = color + BYTE_MUL(destPixels[x], ialpha); + + for (; x < length-3; x += 4) { + __m128i dstVector = __lsx_vld(&dst[x], 0); + BYTE_MUL_LSX(dstVector, iAlphaVector, colorMask, half); + dstVector = __lsx_vadd_b(colorVector, dstVector); + __lsx_vst(dstVector, &dst[x], 0); + } + SIMD_EPILOGUE(x, length, 3) + destPixels[x] = color + BYTE_MUL(destPixels[x], ialpha); + } +} + +void QT_FASTCALL comp_func_solid_SourceOver_lsx(uint *destPixels, int length, + uint color, uint const_alpha) +{ + if ((const_alpha & qAlpha(color)) == 255) { + qt_memfill32(destPixels, color, length); + } else { + if (const_alpha != 255) + color = BYTE_MUL(color, const_alpha); + + const quint32 minusAlphaOfColor = qAlpha(~color); + int x = 0; + + quint32 *dst = (quint32 *) destPixels; + const __m128i colorVector = __lsx_vreplgr2vr_w(color); + const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff); + const __m128i half = __lsx_vreplgr2vr_h(0x80); + const __m128i minusAlphaOfColorVector = __lsx_vreplgr2vr_h(minusAlphaOfColor); + + ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) + destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor); + + for (; x < length-3; x += 4) { + __m128i dstVector = __lsx_vld(&dst[x], 0); + BYTE_MUL_LSX(dstVector, minusAlphaOfColorVector, colorMask, half); + dstVector = __lsx_vadd_b(colorVector, dstVector); + __lsx_vst(dstVector, &dst[x], 0); + } + SIMD_EPILOGUE(x, length, 3) + destPixels[x] = color + BYTE_MUL(destPixels[x], 
minusAlphaOfColor); + } +} + +void qt_bitmapblit32_lsx_base(QRasterBuffer *rasterBuffer, int x, int y, + quint32 color, + const uchar *src, int width, int height, int stride) +{ + quint32 *dest = reinterpret_cast<quint32*>(rasterBuffer->scanLine(y)) + x; + const int destStride = rasterBuffer->stride<quint32>(); + + const __m128i c128 = __lsx_vreplgr2vr_w(color); + const __m128i maskmask1 = (__m128i)(v4u32){0x80808080, 0x40404040, + 0x20202020, 0x10101010}; + const __m128i maskadd1 = (__m128i)(v4i32){0x00000000, 0x40404040, + 0x60606060, 0x70707070}; + + if (width > 4) { + const __m128i maskmask2 = (__m128i)(v4i32){0x08080808, 0x04040404, + 0x02020202, 0x01010101}; + const __m128i maskadd2 = (__m128i)(v4i32){0x78787878, 0x7c7c7c7c, + 0x7e7e7e7e, 0x7f7f7f7f}; + while (height--) { + for (int x = 0; x < width; x += 8) { + const quint8 s = src[x >> 3]; + if (!s) + continue; + __m128i mask1 = __lsx_vreplgr2vr_b(s); + __m128i mask2 = mask1; + + mask1 = __lsx_vand_v(mask1, maskmask1); + mask1 = __lsx_vadd_b(mask1, maskadd1); + + __m128i destSrc1 = __lsx_vld((char*)(dest + x), 0); + + mask1 = __lsx_vslti_b(mask1,0); + destSrc1 = __lsx_vbitsel_v(destSrc1, c128, mask1); + __lsx_vst(destSrc1, (char*)(dest + x), 0); + + __m128i destSrc2 = __lsx_vld((char*)(dest + x + 4), 0); + + mask2 = __lsx_vand_v(mask2, maskmask2); + mask2 = __lsx_vadd_b(mask2, maskadd2); + + mask2 = __lsx_vslti_b(mask2,0); + destSrc2 = __lsx_vbitsel_v(destSrc2, c128, mask2); + __lsx_vst(destSrc2, (char*)(dest + x + 4), 0); + } + dest += destStride; + src += stride; + } + } else { + while (height--) { + const quint8 s = *src; + if (s) { + __m128i mask1 = __lsx_vreplgr2vr_b(s); + + __m128i destSrc1 = __lsx_vld((char*)(dest), 0); + mask1 = __lsx_vand_v(mask1, maskmask1); + mask1 = __lsx_vadd_b(mask1, maskadd1); + + mask1 = __lsx_vslti_b(mask1, 0); + destSrc1 = __lsx_vbitsel_v(destSrc1, c128, mask1); + __lsx_vst(destSrc1, (char*)(dest), 0); + } + dest += destStride; + src += stride; + } + } +} + +void 
qt_bitmapblit32_lsx(QRasterBuffer *rasterBuffer, int x, int y, + const QRgba64 &color, + const uchar *src, int width, int height, int stride) +{ + qt_bitmapblit32_lsx_base(rasterBuffer, x, y, color.toArgb32(), src, width, height, stride); +} + +void qt_bitmapblit8888_lsx(QRasterBuffer *rasterBuffer, int x, int y, + const QRgba64 &color, + const uchar *src, int width, int height, int stride) +{ + qt_bitmapblit32_lsx_base(rasterBuffer, x, y, ARGB2RGBA(color.toArgb32()), src, width, height, stride); +} + +void qt_bitmapblit16_lsx(QRasterBuffer *rasterBuffer, int x, int y, + const QRgba64 &color, + const uchar *src, int width, int height, int stride) +{ + const quint16 c = qConvertRgb32To16(color.toArgb32()); + quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x; + const int destStride = rasterBuffer->stride<quint16>(); + + const __m128i c128 = __lsx_vreplgr2vr_h(c); + const __m128i maskmask = (__m128i)(v8u16){0x8080, 0x4040, 0x2020, 0x1010, + 0x0808, 0x0404, 0x0202, 0x0101}; + + const __m128i maskadd = (__m128i)(v8i16){0x0000, 0x4040, 0x6060, 0x7070, + 0x7878, 0x7c7c, 0x7e7e, 0x7f7f}; + while (--height >= 0) { + for (int x = 0; x < width; x += 8) { + const quint8 s = src[x >> 3]; + if (!s) + continue; + __m128i mask = __lsx_vreplgr2vr_b(s); + __m128i destSrc = __lsx_vld((char*)(dest + x), 0); + mask = __lsx_vand_v(mask, maskmask); + mask = __lsx_vadd_b(mask, maskadd); + mask = __lsx_vslti_b(mask, 0); + destSrc = __lsx_vbitsel_v(destSrc, c128, mask); + __lsx_vst(destSrc, (char*)(dest + x), 0); + } + dest += destStride; + src += stride; + } +} + +class QSimdLsx +{ +public: + typedef __m128i Int32x4; + typedef __m128 Float32x4; + + union Vect_buffer_i { Int32x4 v; int i[4]; }; + union Vect_buffer_f { Float32x4 v; float f[4]; }; + + static inline Float32x4 Q_DECL_VECTORCALL v_dup(float x) { return __lsx_vreplfr2vr_s(x); } + static inline Float32x4 Q_DECL_VECTORCALL v_dup(double x) { return __lsx_vreplfr2vr_s(x); } + static inline Int32x4 
Q_DECL_VECTORCALL v_dup(int x) { return __lsx_vreplgr2vr_w(x); } + static inline Int32x4 Q_DECL_VECTORCALL v_dup(uint x) { return __lsx_vreplgr2vr_w(x); } + + static inline Float32x4 Q_DECL_VECTORCALL v_add(Float32x4 a, Float32x4 b) { return __lsx_vfadd_s(a, b); } + static inline Int32x4 Q_DECL_VECTORCALL v_add(Int32x4 a, Int32x4 b) { return __lsx_vadd_w(a, b); } + + static inline Float32x4 Q_DECL_VECTORCALL v_max(Float32x4 a, Float32x4 b) { return __lsx_vfmax_s(a, b); } + static inline Float32x4 Q_DECL_VECTORCALL v_min(Float32x4 a, Float32x4 b) { return __lsx_vfmin_s(a, b); } + static inline Int32x4 Q_DECL_VECTORCALL v_min_16(Int32x4 a, Int32x4 b) { return __lsx_vmin_h(a, b); } + + static inline Int32x4 Q_DECL_VECTORCALL v_and(Int32x4 a, Int32x4 b) { return __lsx_vand_v(a, b); } + + static inline Float32x4 Q_DECL_VECTORCALL v_sub(Float32x4 a, Float32x4 b) { return __lsx_vfsub_s(a, b); } + static inline Int32x4 Q_DECL_VECTORCALL v_sub(Int32x4 a, Int32x4 b) { return __lsx_vsub_w(a, b); } + + static inline Float32x4 Q_DECL_VECTORCALL v_mul(Float32x4 a, Float32x4 b) { return __lsx_vfmul_s(a, b); } + + static inline Float32x4 Q_DECL_VECTORCALL v_sqrt(Float32x4 x) { return __lsx_vfsqrt_s(x); } + + static inline Int32x4 Q_DECL_VECTORCALL v_toInt(Float32x4 x) { return __lsx_vftintrz_w_s(x); } + + static inline Int32x4 Q_DECL_VECTORCALL v_greaterOrEqual(Float32x4 a, Float32x4 b) { return __lsx_vfcmp_clt_s(b, a); } +}; + +const uint * QT_FASTCALL qt_fetch_radial_gradient_lsx(uint *buffer, const Operator *op, + const QSpanData *data, + int y, int x, int length) +{ + return qt_fetch_radial_gradient_template<QRadialFetchSimd<QSimdLsx>,uint>(buffer, op, data, y, x, length); +} + +void qt_scale_image_argb32_on_argb32_lsx(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, int srch, + const QRectF &targetRect, + const QRectF &sourceRect, + const QRect &clip, + int const_alpha) +{ + if (const_alpha != 256) { + // from qblendfunctions.cpp + extern void 
qt_scale_image_argb32_on_argb32(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, int srch, + const QRectF &targetRect, + const QRectF &sourceRect, + const QRect &clip, + int const_alpha); + return qt_scale_image_argb32_on_argb32(destPixels, dbpl, srcPixels, sbpl, srch, + targetRect, sourceRect, clip, const_alpha); + } + + qreal sx = sourceRect.width() / (qreal)targetRect.width(); + qreal sy = sourceRect.height() / (qreal)targetRect.height(); + + + const int ix = 0x00010000 * sx; + const int iy = 0x00010000 * sy; + + QRect tr = targetRect.normalized().toRect(); + tr = tr.intersected(clip); + if (tr.isEmpty()) + return; + const int tx1 = tr.left(); + const int ty1 = tr.top(); + int h = tr.height(); + int w = tr.width(); + + quint32 basex; + quint32 srcy; + + if (sx < 0) { + int dstx = qFloor((tx1 + qreal(0.5) - targetRect.right()) * sx * 65536) + 1; + basex = quint32(sourceRect.right() * 65536) + dstx; + } else { + int dstx = qCeil((tx1 + qreal(0.5) - targetRect.left()) * sx * 65536) - 1; + basex = quint32(sourceRect.left() * 65536) + dstx; + } + if (sy < 0) { + int dsty = qFloor((ty1 + qreal(0.5) - targetRect.bottom()) * sy * 65536) + 1; + srcy = quint32(sourceRect.bottom() * 65536) + dsty; + } else { + int dsty = qCeil((ty1 + qreal(0.5) - targetRect.top()) * sy * 65536) - 1; + srcy = quint32(sourceRect.top() * 65536) + dsty; + } + + quint32 *dst = ((quint32 *) (destPixels + ty1 * dbpl)) + tx1; + + const __m128i nullVector = __lsx_vreplgr2vr_w(0); + const __m128i half = __lsx_vreplgr2vr_h(0x80); + const __m128i one = __lsx_vreplgr2vr_h(0xff); + const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff); + const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000); + const __m128i ixVector = __lsx_vreplgr2vr_w(4*ix); + + // this bounds check here is required as floating point rounding above might in some cases lead to + // w/h values that are one pixel too large, falling outside of the valid image area. 
+ const int ystart = srcy >> 16; + if (ystart >= srch && iy < 0) { + srcy += iy; + --h; + } + const int xstart = basex >> 16; + if (xstart >= (int)(sbpl/sizeof(quint32)) && ix < 0) { + basex += ix; + --w; + } + int yend = (srcy + iy * (h - 1)) >> 16; + if (yend < 0 || yend >= srch) + --h; + int xend = (basex + ix * (w - 1)) >> 16; + if (xend < 0 || xend >= (int)(sbpl/sizeof(quint32))) + --w; + + while (--h >= 0) { + const uint *src = (const quint32 *) (srcPixels + (srcy >> 16) * sbpl); + int srcx = basex; + int x = 0; + + ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) { + uint s = src[srcx >> 16]; + dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); + srcx += ix; + } + + __m128i srcxVector = (__m128i)(v4i32){srcx + ix + ix + ix, srcx + ix + ix, srcx + ix, srcx}; + + for (; x < (w - 3); x += 4) { + const int idx0 = __lsx_vpickve2gr_h(srcxVector, 1); + const int idx1 = __lsx_vpickve2gr_h(srcxVector, 3); + const int idx2 = __lsx_vpickve2gr_h(srcxVector, 5); + const int idx3 = __lsx_vpickve2gr_h(srcxVector, 7); + srcxVector = __lsx_vadd_w(srcxVector, ixVector); + + const __m128i srcVector = (__m128i)((v4u32){src[idx3], src[idx2], src[idx1], src[idx0]}); + + BLEND_SOURCE_OVER_ARGB32_LSX_helper(dst, x, srcVector, nullVector, half, one, colorMask, alphaMask); + } + + SIMD_EPILOGUE(x, w, 3) { + uint s = src[(basex + x*ix) >> 16]; + dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); + } + dst = (quint32 *)(((uchar *) dst) + dbpl); + srcy += iy; + } +} + +QT_END_NAMESPACE + +#endif // QT_COMPILER_SUPPORTS_LSX diff --git a/src/gui/painting/qdrawhelper_p.h b/src/gui/painting/qdrawhelper_p.h index 833ddd7b166..482a2da206e 100644 --- a/src/gui/painting/qdrawhelper_p.h +++ b/src/gui/painting/qdrawhelper_p.h @@ -142,7 +142,7 @@ struct quint24 { void qBlendGradient(int count, const QT_FT_Span *spans, void *userData); void qBlendTexture(int count, const QT_FT_Span *spans, void *userData); -#ifdef Q_PROCESSOR_X86 +#if defined(Q_PROCESSOR_X86) || defined(QT_COMPILER_SUPPORTS_LSX) extern void 
(*qt_memfill64)(quint64 *dest, quint64 value, qsizetype count); extern void (*qt_memfill32)(quint32 *dest, quint32 value, qsizetype count); #else diff --git a/src/gui/painting/qdrawingprimitive_lsx_p.h b/src/gui/painting/qdrawingprimitive_lsx_p.h new file mode 100644 index 00000000000..06e97139df6 --- /dev/null +++ b/src/gui/painting/qdrawingprimitive_lsx_p.h @@ -0,0 +1,231 @@ +// Copyright (C) 2024 Loongson Technology Corporation Limited. +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only + +#ifndef QDRAWINGPRIMITIVE_LSX_P_H +#define QDRAWINGPRIMITIVE_LSX_P_H + +#include <QtGui/private/qtguiglobal_p.h> +#include <private/qsimd_p.h> +#include "qdrawhelper_loongarch64_p.h" +#include "qrgba64_p.h" + +#ifdef __loongarch_sx + +// +// W A R N I N G +// ------------- +// +// This file is not part of the Qt API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. +// + +QT_BEGIN_NAMESPACE + +/* + * Multiply the components of pixelVector by alphaChannel + * Each 32bits components of alphaChannel must be in the form 0x00AA00AA + * colorMask must have 0x00ff00ff on each 32 bits component + * half must have the value 128 (0x80) for each 32 bits component + */ +inline static void Q_DECL_VECTORCALL +BYTE_MUL_LSX(__m128i &pixelVector, __m128i alphaChannel, __m128i colorMask, __m128i half) +{ + /* 1. separate the colors in 2 vectors so each color is on 16 bits + (in order to be multiplied by the alpha + each 32 bit of dstVectorAG are in the form 0x00AA00GG + each 32 bit of dstVectorRB are in the form 0x00RR00BB */ + __m128i pixelVectorAG = __lsx_vsrli_h(pixelVector, 8); + __m128i pixelVectorRB = __lsx_vand_v(pixelVector, colorMask); + + /* 2. multiply the vectors by the alpha channel */ + pixelVectorAG = __lsx_vmul_h(pixelVectorAG, alphaChannel); + pixelVectorRB = __lsx_vmul_h(pixelVectorRB, alphaChannel); + + /* 3. 
divide by 255, that's the tricky part. + we do it like for BYTE_MUL(), with bit shift: X/255 ~= (X + X/256 + rounding)/256 */ + /** so first (X + X/256 + rounding) */ + pixelVectorRB = __lsx_vadd_h(pixelVectorRB, __lsx_vsrli_h(pixelVectorRB, 8)); + pixelVectorRB = __lsx_vadd_h(pixelVectorRB, half); + pixelVectorAG = __lsx_vadd_h(pixelVectorAG, __lsx_vsrli_h(pixelVectorAG, 8)); + pixelVectorAG = __lsx_vadd_h(pixelVectorAG, half); + + /** second divide by 256 */ + pixelVectorRB = __lsx_vsrli_h(pixelVectorRB, 8); + /** for AG, we could >> 8 to divide followed by << 8 to put the + bytes in the correct position. By masking instead, we execute + only one instruction */ + pixelVectorAG = __lsx_vandn_v(colorMask, pixelVectorAG); + + /* 4. combine the 2 pairs of colors */ + pixelVector = __lsx_vor_v(pixelVectorAG, pixelVectorRB); +} + +/* + * Each 32bits components of alphaChannel must be in the form 0x00AA00AA + * oneMinusAlphaChannel must be 255 - alpha for each 32 bits component + * colorMask must have 0x00ff00ff on each 32 bits component + * half must have the value 128 (0x80) for each 32 bits component + */ +inline static void Q_DECL_VECTORCALL +INTERPOLATE_PIXEL_255_LSX(__m128i srcVector, __m128i &dstVector, __m128i alphaChannel, + __m128i oneMinusAlphaChannel, __m128i colorMask, __m128i half) +{ + /* interpolate AG */ + __m128i srcVectorAG = __lsx_vsrli_h(srcVector, 8); + __m128i dstVectorAG = __lsx_vsrli_h(dstVector, 8); + __m128i srcVectorAGalpha = __lsx_vmul_h(srcVectorAG, alphaChannel); + __m128i dstVectorAGoneMinusAlphalpha = __lsx_vmul_h(dstVectorAG, oneMinusAlphaChannel); + __m128i finalAG = __lsx_vadd_h(srcVectorAGalpha, dstVectorAGoneMinusAlphalpha); + finalAG = __lsx_vadd_h(finalAG, __lsx_vsrli_h(finalAG, 8)); + finalAG = __lsx_vadd_h(finalAG, half); + finalAG = __lsx_vandn_v(colorMask, finalAG); + + /* interpolate RB */ + __m128i srcVectorRB = __lsx_vand_v(srcVector, colorMask); + __m128i dstVectorRB = __lsx_vand_v(dstVector, colorMask); + __m128i 
srcVectorRBalpha = __lsx_vmul_h(srcVectorRB, alphaChannel); + __m128i dstVectorRBoneMinusAlphalpha = __lsx_vmul_h(dstVectorRB, oneMinusAlphaChannel); + __m128i finalRB = __lsx_vadd_h(srcVectorRBalpha, dstVectorRBoneMinusAlphalpha); + finalRB = __lsx_vadd_h(finalRB, __lsx_vsrli_h(finalRB, 8)); + finalRB = __lsx_vadd_h(finalRB, half); + finalRB = __lsx_vsrli_h(finalRB, 8); + + /* combine */ + dstVector = __lsx_vor_v(finalAG, finalRB); +} + +// same as BLEND_SOURCE_OVER_ARGB32_LSX, but for one vector srcVector +inline static void Q_DECL_VECTORCALL +BLEND_SOURCE_OVER_ARGB32_LSX_helper(quint32 *dst, int x, __m128i srcVector, + __m128i nullVector, __m128i half, __m128i one, + __m128i colorMask, __m128i alphaMask) +{ + const __m128i srcVectorAlpha = __lsx_vand_v(srcVector, alphaMask); + __m128i vseq = __lsx_vseq_w(srcVectorAlpha, alphaMask); + v4i32 vseq_res = (v4i32)__lsx_vmsknz_b(vseq); + if (vseq_res[0] == (0x0000ffff)) { + /* all opaque */ + __lsx_vst(srcVector, &dst[x], 0); + } else { + __m128i vseq_n = __lsx_vseq_w(srcVectorAlpha, nullVector); + v4i32 vseq_n_res = (v4i32)__lsx_vmsknz_b(vseq_n); + if (vseq_n_res[0] != (0x0000ffff)) { + /* not fully transparent */ + /* extract the alpha channel on 2 x 16 bits */ + /* so we have room for the multiplication */ + /* each 32 bits will be in the form 0x00AA00AA */ + /* with A being the 1 - alpha */ + __m128i alphaChannel = __lsx_vsrli_w(srcVector, 24); + alphaChannel = __lsx_vor_v(alphaChannel, __lsx_vslli_w(alphaChannel, 16)); + alphaChannel = __lsx_vsub_h(one, alphaChannel); + + __m128i dstVector = __lsx_vld(&dst[x], 0); + BYTE_MUL_LSX(dstVector, alphaChannel, colorMask, half); + + /* result = s + d * (1-alpha) */ + const __m128i result = __lsx_vadd_b(srcVector, dstVector); + __lsx_vst(result, &dst[x], 0); + } + } +} + +// Basically blend src over dst with the const alpha defined as constAlphaVector. 
+// nullVector, half, one, colorMask are constant across the whole image/texture, and should be defined as: +//const __m128i nullVector = __lsx_vreplgr2vr_w(0); +//const __m128i half = __lsx_vreplgr2vr_h(0x80); +//const __m128i one = __lsx_vreplgr2vr_h(0xff); +//const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff); +//const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000); +// +// The computation being done is: +// result = s + d * (1-alpha) +// with shortcuts if fully opaque or fully transparent. +inline static void Q_DECL_VECTORCALL +BLEND_SOURCE_OVER_ARGB32_LSX(quint32 *dst, const quint32 *src, int length) +{ + int x = 0; + + /* First, get dst aligned. */ + ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) { + blend_pixel(dst[x], src[x]); + } + + const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000); + const __m128i nullVector = __lsx_vreplgr2vr_w(0); + const __m128i half = __lsx_vreplgr2vr_h(0x80); + const __m128i one = __lsx_vreplgr2vr_h(0xff); + const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff); + + for (; x < length-3; x += 4) { + const __m128i srcVector = __lsx_vld((const __m128i *)&src[x], 0); + BLEND_SOURCE_OVER_ARGB32_LSX_helper(dst, x, srcVector, nullVector, half, one, colorMask, alphaMask); + } + SIMD_EPILOGUE(x, length, 3) { + blend_pixel(dst[x], src[x]); + } +} + +// Basically blend src over dst with the const alpha defined as constAlphaVector. 
+// The computation being done is: +// dest = (s + d * sia) * ca + d * cia +// = s * ca + d * (sia * ca + cia) +// = s * ca + d * (1 - sa*ca) +inline static void Q_DECL_VECTORCALL +BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(quint32 *dst, const quint32 *src, int length, uint const_alpha) +{ + int x = 0; + + ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) { + blend_pixel(dst[x], src[x], const_alpha); + } + + const __m128i nullVector = __lsx_vreplgr2vr_w(0); + const __m128i half = __lsx_vreplgr2vr_h(0x80); + const __m128i one = __lsx_vreplgr2vr_h(0xff); + const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff); + const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha); + + for (; x < length-3; x += 4) { + __m128i srcVector = __lsx_vld((const __m128i *)&src[x], 0); + __m128i vseq = __lsx_vseq_w(srcVector, nullVector); + v4i32 vseq_res = (v4i32)__lsx_vmsknz_b(vseq); + if (vseq_res[0] != 0x0000ffff) { + BYTE_MUL_LSX(srcVector, constAlphaVector, colorMask, half); + + __m128i alphaChannel = __lsx_vsrli_w(srcVector, 24); + alphaChannel = __lsx_vor_v(alphaChannel, __lsx_vslli_w(alphaChannel, 16)); + alphaChannel = __lsx_vsub_h(one, alphaChannel); + + __m128i dstVector = __lsx_vld((__m128i *)&dst[x], 0); + BYTE_MUL_LSX(dstVector, alphaChannel, colorMask, half); + + const __m128i result = __lsx_vadd_b(srcVector, dstVector); + __lsx_vst(result, &dst[x], 0); + } + } + SIMD_EPILOGUE(x, length, 3) { + blend_pixel(dst[x], src[x], const_alpha); + } +} + +typedef union +{ + int i; + float f; +} FloatInt; + +/* float type data load instructions */ +static __m128 __lsx_vreplfr2vr_s(float val) +{ + FloatInt fi_tmpval = {.f = val}; + return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i); +} + +QT_END_NAMESPACE + +#endif // __loongarch_sx + +#endif // QDRAWINGPRIMITIVE_LSX_P_H diff --git a/src/gui/painting/qimagescale.cpp b/src/gui/painting/qimagescale.cpp index cc95f6773fe..1b1f2e8b8c8 100644 --- a/src/gui/painting/qimagescale.cpp +++ b/src/gui/painting/qimagescale.cpp @@ -257,6 +257,18 
@@ void qt_qimageScaleAARGBA_down_xy_sse4(QImageScaleInfo *isi, unsigned int *dest, int dw, int dh, int dow, int sow); #endif +#if defined(QT_COMPILER_SUPPORTS_LSX) +template<bool RGB> +void qt_qimageScaleAARGBA_up_x_down_y_lsx(QImageScaleInfo *isi, unsigned int *dest, + int dw, int dh, int dow, int sow); +template<bool RGB> +void qt_qimageScaleAARGBA_down_x_up_y_lsx(QImageScaleInfo *isi, unsigned int *dest, + int dw, int dh, int dow, int sow); +template<bool RGB> +void qt_qimageScaleAARGBA_down_xy_lsx(QImageScaleInfo *isi, unsigned int *dest, + int dw, int dh, int dow, int sow); +#endif + #if defined(__ARM_NEON__) template<bool RGB> void qt_qimageScaleAARGBA_up_x_down_y_neon(QImageScaleInfo *isi, unsigned int *dest, @@ -351,6 +363,10 @@ static void qt_qimageScaleAARGBA(QImageScaleInfo *isi, unsigned int *dest, if (qCpuHasFeature(SSE4_1)) qt_qimageScaleAARGBA_up_x_down_y_sse4<false>(isi, dest, dw, dh, dow, sow); else +#elif defined(QT_COMPILER_SUPPORTS_LSX) + if (qCpuHasFeature(LSX)) + qt_qimageScaleAARGBA_up_x_down_y_lsx<false>(isi, dest, dw, dh, dow, sow); + else #elif defined(__ARM_NEON__) if (qCpuHasFeature(NEON)) qt_qimageScaleAARGBA_up_x_down_y_neon<false>(isi, dest, dw, dh, dow, sow); @@ -364,6 +380,10 @@ static void qt_qimageScaleAARGBA(QImageScaleInfo *isi, unsigned int *dest, if (qCpuHasFeature(SSE4_1)) qt_qimageScaleAARGBA_down_x_up_y_sse4<false>(isi, dest, dw, dh, dow, sow); else +#elif defined(QT_COMPILER_SUPPORTS_LSX) + if (qCpuHasFeature(LSX)) + qt_qimageScaleAARGBA_down_x_up_y_lsx<false>(isi, dest, dw, dh, dow, sow); + else #elif defined(__ARM_NEON__) if (qCpuHasFeature(NEON)) qt_qimageScaleAARGBA_down_x_up_y_neon<false>(isi, dest, dw, dh, dow, sow); @@ -377,6 +397,10 @@ static void qt_qimageScaleAARGBA(QImageScaleInfo *isi, unsigned int *dest, if (qCpuHasFeature(SSE4_1)) qt_qimageScaleAARGBA_down_xy_sse4<false>(isi, dest, dw, dh, dow, sow); else +#elif defined(QT_COMPILER_SUPPORTS_LSX) + if (qCpuHasFeature(LSX)) + 
qt_qimageScaleAARGBA_down_xy_lsx<false>(isi, dest, dw, dh, dow, sow); + else #elif defined(__ARM_NEON__) if (qCpuHasFeature(NEON)) qt_qimageScaleAARGBA_down_xy_neon<false>(isi, dest, dw, dh, dow, sow); @@ -995,6 +1019,10 @@ static void qt_qimageScaleAARGB(QImageScaleInfo *isi, unsigned int *dest, if (qCpuHasFeature(SSE4_1)) qt_qimageScaleAARGBA_up_x_down_y_sse4<true>(isi, dest, dw, dh, dow, sow); else +#elif defined QT_COMPILER_SUPPORTS_LSX + if (qCpuHasFeature(LSX)) + qt_qimageScaleAARGBA_up_x_down_y_lsx<true>(isi, dest, dw, dh, dow, sow); + else #elif defined(__ARM_NEON__) if (qCpuHasFeature(NEON)) qt_qimageScaleAARGBA_up_x_down_y_neon<true>(isi, dest, dw, dh, dow, sow); @@ -1008,6 +1036,10 @@ static void qt_qimageScaleAARGB(QImageScaleInfo *isi, unsigned int *dest, if (qCpuHasFeature(SSE4_1)) qt_qimageScaleAARGBA_down_x_up_y_sse4<true>(isi, dest, dw, dh, dow, sow); else +#elif defined QT_COMPILER_SUPPORTS_LSX + if (qCpuHasFeature(LSX)) + qt_qimageScaleAARGBA_down_x_up_y_lsx<true>(isi, dest, dw, dh, dow, sow); + else #elif defined(__ARM_NEON__) if (qCpuHasFeature(NEON)) qt_qimageScaleAARGBA_down_x_up_y_neon<true>(isi, dest, dw, dh, dow, sow); @@ -1021,6 +1053,10 @@ static void qt_qimageScaleAARGB(QImageScaleInfo *isi, unsigned int *dest, if (qCpuHasFeature(SSE4_1)) qt_qimageScaleAARGBA_down_xy_sse4<true>(isi, dest, dw, dh, dow, sow); else +#elif defined QT_COMPILER_SUPPORTS_LSX + if (qCpuHasFeature(LSX)) + qt_qimageScaleAARGBA_down_xy_lsx<true>(isi, dest, dw, dh, dow, sow); + else #elif defined(__ARM_NEON__) if (qCpuHasFeature(NEON)) qt_qimageScaleAARGBA_down_xy_neon<true>(isi, dest, dw, dh, dow, sow); diff --git a/src/gui/painting/qimagescale_lsx.cpp b/src/gui/painting/qimagescale_lsx.cpp new file mode 100644 index 00000000000..c128b014b8c --- /dev/null +++ b/src/gui/painting/qimagescale_lsx.cpp @@ -0,0 +1,233 @@ +// Copyright (C) 2024 Loongson Technology Corporation Limited. 
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only + +#include "qimagescale_p.h" +#include "qimage.h" +#include <private/qdrawhelper_loongarch64_p.h> +#include <private/qsimd_p.h> + +#if QT_CONFIG(thread) && !defined(Q_OS_WASM) +#include <qsemaphore.h> +#include <private/qthreadpool_p.h> +#endif + +#if defined(QT_COMPILER_SUPPORTS_LSX) + +QT_BEGIN_NAMESPACE + +using namespace QImageScale; + +template<typename T> +static inline void multithread_pixels_function(QImageScaleInfo *isi, int dh, const T &scaleSection) +{ +#if QT_CONFIG(thread) && !defined(Q_OS_WASM) + int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16); + segments = std::min(segments, dh); + QThreadPool *threadPool = QThreadPoolPrivate::qtGuiInstance(); + if (segments > 1 && threadPool && !threadPool->contains(QThread::currentThread())) { + QSemaphore semaphore; + int y = 0; + for (int i = 0; i < segments; ++i) { + int yn = (dh - y) / (segments - i); + threadPool->start([&, y, yn]() { + scaleSection(y, y + yn); + semaphore.release(1); + }); + y += yn; + } + semaphore.acquire(segments); + return; + } +#else + Q_UNUSED(isi); +#endif + scaleSection(0, dh); +} + +inline static __m128i Q_DECL_VECTORCALL +qt_qimageScaleAARGBA_helper(const unsigned int *pix, int xyap, int Cxy, + int step, const __m128i vxyap, const __m128i vCxy) +{ + const __m128i shuffleMask = (__m128i)(v16i8){0, 16, 16, 16, 1, 16, 16, 16, + 2, 16, 16, 16, 3, 16, 16, 16}; + __m128i vpix = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(*pix), shuffleMask); + __m128i vx = __lsx_vmul_w(vpix, vxyap); + int i; + for (i = (1 << 14) - xyap; i > Cxy; i -= Cxy) { + pix += step; + vpix = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(*pix), shuffleMask); + vx = __lsx_vadd_w(vx, __lsx_vmul_w(vpix, vCxy)); + } + pix += step; + vpix = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(*pix), shuffleMask); + vx = __lsx_vadd_w(vx, __lsx_vmul_w(vpix, __lsx_vreplgr2vr_w(i))); + return vx; +} + 
+template<bool RGB> +void qt_qimageScaleAARGBA_up_x_down_y_lsx(QImageScaleInfo *isi, unsigned int *dest, + int dw, int dh, int dow, int sow) +{ + const unsigned int **ypoints = isi->ypoints; + const int *xpoints = isi->xpoints; + const int *xapoints = isi->xapoints; + const int *yapoints = isi->yapoints; + + const __m128i v256 = __lsx_vreplgr2vr_w(256); + + /* go through every scanline in the output buffer */ + auto scaleSection = [&] (int yStart, int yEnd) { + for (int y = yStart; y < yEnd; ++y) { + const int Cy = yapoints[y] >> 16; + const int yap = yapoints[y] & 0xffff; + const __m128i vCy = __lsx_vreplgr2vr_w(Cy); + const __m128i vyap = __lsx_vreplgr2vr_w(yap); + + unsigned int *dptr = dest + (y * dow); + for (int x = 0; x < dw; x++) { + const unsigned int *sptr = ypoints[y] + xpoints[x]; + __m128i vx = qt_qimageScaleAARGBA_helper(sptr, yap, Cy, sow, vyap, vCy); + + const int xap = xapoints[x]; + if (xap > 0) { + const __m128i vxap = __lsx_vreplgr2vr_w(xap); + const __m128i vinvxap = __lsx_vsub_w(v256, vxap); + __m128i vr = qt_qimageScaleAARGBA_helper(sptr + 1, yap, Cy, sow, vyap, vCy); + + vx = __lsx_vmul_w(vx, vinvxap); + vr = __lsx_vmul_w(vr, vxap); + vx = __lsx_vadd_w(vx, vr); + vx = __lsx_vsrli_w(vx, 8); + } + vx = __lsx_vsrli_w(vx, 14); + vx = __lsx_vpickev_h(__lsx_vsat_wu(vx, 15), __lsx_vsat_wu(vx, 15)); + vx = __lsx_vpickev_b(__lsx_vsat_hu(vx, 7), __lsx_vsat_hu(vx, 7)); + *dptr = __lsx_vpickve2gr_w(vx, 0); + if (RGB) + *dptr |= 0xff000000; + dptr++; + } + } + }; + multithread_pixels_function(isi, dh, scaleSection); +} + +template<bool RGB> +void qt_qimageScaleAARGBA_down_x_up_y_lsx(QImageScaleInfo *isi, unsigned int *dest, + int dw, int dh, int dow, int sow) +{ + const unsigned int **ypoints = isi->ypoints; + int *xpoints = isi->xpoints; + int *xapoints = isi->xapoints; + int *yapoints = isi->yapoints; + + const __m128i v256 = __lsx_vreplgr2vr_w(256); + + /* go through every scanline in the output buffer */ + auto scaleSection = [&] (int yStart, int 
yEnd) { + for (int y = yStart; y < yEnd; ++y) { + unsigned int *dptr = dest + (y * dow); + for (int x = 0; x < dw; x++) { + int Cx = xapoints[x] >> 16; + int xap = xapoints[x] & 0xffff; + const __m128i vCx = __lsx_vreplgr2vr_w(Cx); + const __m128i vxap = __lsx_vreplgr2vr_w(xap); + + const unsigned int *sptr = ypoints[y] + xpoints[x]; + __m128i vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx); + + int yap = yapoints[y]; + if (yap > 0) { + const __m128i vyap = __lsx_vreplgr2vr_w(yap); + const __m128i vinvyap = __lsx_vsub_w(v256, vyap); + __m128i vr = qt_qimageScaleAARGBA_helper(sptr + sow, xap, Cx, 1, vxap, vCx); + + vx = __lsx_vmul_w(vx, vinvyap); + vr = __lsx_vmul_w(vr, vyap); + vx = __lsx_vadd_w(vx, vr); + vx = __lsx_vsrli_w(vx, 8); + } + vx = __lsx_vsrli_w(vx, 14); + vx = __lsx_vpickev_h(__lsx_vsat_wu(vx, 15), __lsx_vsat_wu(vx, 15)); + vx = __lsx_vpickev_b(__lsx_vsat_wu(vx, 7), __lsx_vsat_hu(vx, 7)); + *dptr = __lsx_vpickve2gr_w(vx, 0); + if (RGB) + *dptr |= 0xff000000; + dptr++; + } + } + }; + multithread_pixels_function(isi, dh, scaleSection); +} + +template<bool RGB> +void qt_qimageScaleAARGBA_down_xy_lsx(QImageScaleInfo *isi, unsigned int *dest, + int dw, int dh, int dow, int sow) +{ + const unsigned int **ypoints = isi->ypoints; + int *xpoints = isi->xpoints; + int *xapoints = isi->xapoints; + int *yapoints = isi->yapoints; + + auto scaleSection = [&] (int yStart, int yEnd) { + for (int y = yStart; y < yEnd; ++y) { + int Cy = yapoints[y] >> 16; + int yap = yapoints[y] & 0xffff; + const __m128i vCy = __lsx_vreplgr2vr_w(Cy); + const __m128i vyap = __lsx_vreplgr2vr_w(yap); + + unsigned int *dptr = dest + (y * dow); + for (int x = 0; x < dw; x++) { + const int Cx = xapoints[x] >> 16; + const int xap = xapoints[x] & 0xffff; + const __m128i vCx = __lsx_vreplgr2vr_w(Cx); + const __m128i vxap = __lsx_vreplgr2vr_w(xap); + + const unsigned int *sptr = ypoints[y] + xpoints[x]; + __m128i vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx); + 
__m128i vr = __lsx_vmul_w(__lsx_vsrli_w(vx, 4), vyap); + + int j; + for (j = (1 << 14) - yap; j > Cy; j -= Cy) { + sptr += sow; + vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx); + vr = __lsx_vadd_w(vr, __lsx_vmul_w(__lsx_vsrli_w(vx, 4), vCy)); + } + sptr += sow; + vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx); + vr = __lsx_vadd_w(vr, __lsx_vmul_w(__lsx_vsrli_w(vx, 4), __lsx_vreplgr2vr_w(j))); + + vr = __lsx_vsrli_w(vr, 24); + vr = __lsx_vpickev_h(__lsx_vldi(0), __lsx_vsat_wu(vr, 15)); + vr = __lsx_vpickev_b(__lsx_vldi(0), __lsx_vsat_hu(vr, 7)); + *dptr = __lsx_vpickve2gr_w(vr, 0); + if (RGB) + *dptr |= 0xff000000; + dptr++; + } + } + }; + multithread_pixels_function(isi, dh, scaleSection); +} + +template void qt_qimageScaleAARGBA_up_x_down_y_lsx<false>(QImageScaleInfo *isi, unsigned int *dest, + int dw, int dh, int dow, int sow); + +template void qt_qimageScaleAARGBA_up_x_down_y_lsx<true>(QImageScaleInfo *isi, unsigned int *dest, + int dw, int dh, int dow, int sow); + +template void qt_qimageScaleAARGBA_down_x_up_y_lsx<false>(QImageScaleInfo *isi, unsigned int *dest, + int dw, int dh, int dow, int sow); + +template void qt_qimageScaleAARGBA_down_x_up_y_lsx<true>(QImageScaleInfo *isi, unsigned int *dest, + int dw, int dh, int dow, int sow); + +template void qt_qimageScaleAARGBA_down_xy_lsx<false>(QImageScaleInfo *isi, unsigned int *dest, + int dw, int dh, int dow, int sow); + +template void qt_qimageScaleAARGBA_down_xy_lsx<true>(QImageScaleInfo *isi, unsigned int *dest, + int dw, int dh, int dow, int sow); + +QT_END_NAMESPACE + +#endif |