diff options
author | Thiago Macieira <[email protected]> | 2024-02-15 15:04:18 -0800 |
---|---|---|
committer | Qt Cherry-pick Bot <[email protected]> | 2024-04-30 18:23:37 +0000 |
commit | 47dd5b7aed6ccc623b3c08f74ed89112be112ec4 (patch) | |
tree | 355f64c2f36dfbed58843ec8baf6c801af3f9f3a | |
parent | e88c2a3b3571ab549425e13b6ad108ec220e2713 (diff) |
QXmlStreamWriter: decode UTF-8 into code points
We were iterating over code *units* and that yielded wrong results. The
one from the bug report was simply caused by the fact that
QUtf8StringView::value_type is char, which is signed on x86, so the
expression:
*it <= u'\x1F'
was true for all non-US-ASCII content (every UTF-8 byte >= 0x80 is negative when char is signed).
But in attempting to fix this, I needed to do the proper UTF-8 decoding,
as otherwise we wouldn't catch invalid UTF-8 sequences or multi-byte encodings of forbidden code points such as U+FFFE and U+FFFF.
[ChangeLog][QtCore][QXmlStreamWriter] Fixed a bug that caused the class
to fail to write UTF-8 strings with non-US-ASCII content when passed as
a QUtf8StringView.
Fixes: QTBUG-122241
Change-Id: I83dda2d36c904517b3c0fffd17b42bbf09a493d0
Reviewed-by: Mate Barany <[email protected]>
(cherry picked from commit 94c62e322264e2e7d61193ae74ba8556a330385c)
Reviewed-by: Qt Cherry-pick Bot <[email protected]>
(cherry picked from commit 6bef40cb821bcaa0df62c17b7e6d19e95c9cea21)
-rw-r--r-- | src/corelib/serialization/qxmlstream.cpp | 46 | ||||
-rw-r--r-- | src/corelib/text/qstringconverter_p.h | 9 | ||||
-rw-r--r-- | tests/auto/corelib/serialization/qxmlstream/tst_qxmlstream.cpp | 125 |
3 files changed, 170 insertions, 10 deletions
diff --git a/src/corelib/serialization/qxmlstream.cpp b/src/corelib/serialization/qxmlstream.cpp index f34929bfa2d..15d9d6ba3b2 100644 --- a/src/corelib/serialization/qxmlstream.cpp +++ b/src/corelib/serialization/qxmlstream.cpp @@ -2981,54 +2981,80 @@ void QXmlStreamWriterPrivate::write(QAnyStringView s) void QXmlStreamWriterPrivate::writeEscaped(QAnyStringView s, bool escapeWhitespace) { + struct NextLatin1 { + char32_t operator()(const char *&it, const char *) const + { return uchar(*it++); } + }; + struct NextUtf8 { + char32_t operator()(const char *&it, const char *end) const + { + uchar uc = *it++; + char32_t utf32 = 0; + char32_t *output = &utf32; + qsizetype n = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(uc, output, it, end); + return n < 0 ? 0 : utf32; + } + }; + struct NextUtf16 { + char32_t operator()(const QChar *&it, const QChar *) const + { + return (it++)->unicode(); + } + }; + QString escaped; escaped.reserve(s.size()); s.visit([&] (auto s) { using View = decltype(s); + using Decoder = std::conditional_t<std::is_same_v<View, QLatin1StringView>, NextLatin1, + std::conditional_t<std::is_same_v<View, QUtf8StringView>, NextUtf8, NextUtf16>>; auto it = s.begin(); const auto end = s.end(); + Decoder decoder; while (it != end) { QLatin1StringView replacement; auto mark = it; while (it != end) { - if (*it == u'<') { + auto next_it = it; + char32_t uc = decoder(next_it, end); + if (uc == u'<') { replacement = "&lt;"_L1; break; - } else if (*it == u'>') { + } else if (uc == u'>') { replacement = "&gt;"_L1; break; - } else if (*it == u'&') { + } else if (uc == u'&') { replacement = "&amp;"_L1; break; - } else if (*it == u'\"') { + } else if (uc == u'\"') { replacement = "&quot;"_L1; break; - } else if (*it == u'\t') { + } else if (uc == u'\t') { if (escapeWhitespace) { replacement = "&#9;"_L1; break; } - } else if (*it == u'\n') { + } else if (uc == u'\n') { if (escapeWhitespace) { replacement = "&#10;"_L1; break; } - } else if (*it == u'\v' || *it == u'\f') { + } else if (uc ==
u'\v' || uc == u'\f') { hasEncodingError = true; break; - } else if (*it == u'\r') { + } else if (uc == u'\r') { if (escapeWhitespace) { replacement = "&#13;"_L1; break; } - } else if (*it <= u'\x1F' || *it >= u'\uFFFE') { + } else if (uc <= u'\x1F' || uc == u'\uFFFE' || uc == u'\uFFFF') { hasEncodingError = true; break; } - ++it; + it = next_it; } escaped.append(View{mark, it}); diff --git a/src/corelib/text/qstringconverter_p.h b/src/corelib/text/qstringconverter_p.h index 9b633a8f7e1..f1fe50da971 100644 --- a/src/corelib/text/qstringconverter_p.h +++ b/src/corelib/text/qstringconverter_p.h @@ -43,18 +43,27 @@ struct QUtf8BaseTraits static void appendByte(qchar8_t *&ptr, qchar8_t b) { *ptr++ = b; } + static uchar peekByte(const char *ptr, qsizetype n = 0) + { return ptr[n]; } + static uchar peekByte(const uchar *ptr, qsizetype n = 0) { return ptr[n]; } static uchar peekByte(const qchar8_t *ptr, qsizetype n = 0) { return ptr[n]; } + static qptrdiff availableBytes(const char *ptr, const char *end) + { return end - ptr; } + static qptrdiff availableBytes(const uchar *ptr, const uchar *end) { return end - ptr; } static qptrdiff availableBytes(const qchar8_t *ptr, const qchar8_t *end) { return end - ptr; } + static void advanceByte(const char *&ptr, qsizetype n = 1) + { ptr += n; } + static void advanceByte(const uchar *&ptr, qsizetype n = 1) { ptr += n; } diff --git a/tests/auto/corelib/serialization/qxmlstream/tst_qxmlstream.cpp b/tests/auto/corelib/serialization/qxmlstream/tst_qxmlstream.cpp index aa35468b936..08de2c36356 100644 --- a/tests/auto/corelib/serialization/qxmlstream/tst_qxmlstream.cpp +++ b/tests/auto/corelib/serialization/qxmlstream/tst_qxmlstream.cpp @@ -569,6 +569,12 @@ private slots: void hasAttribute() const; void writeWithUtf8Codec() const; void writeWithStandalone() const; + void writeCharacters_data() const; + void writeCharacters() const; + void writeAttribute_data() const; + void writeAttribute() const; + void writeBadCharactersUtf8_data() const;
+ void writeBadCharactersUtf8() const; void entitiesAndWhitespace_1() const; void entitiesAndWhitespace_2() const; void testFalsePrematureError() const; @@ -1368,6 +1374,125 @@ void tst_QXmlStream::writeWithStandalone() const } } +static void writeCharacters_data_common() +{ + QTest::addColumn<QString>("input"); + QTest::addColumn<QString>("output"); + + QTest::newRow("empty") << QString() << QString(); + + // invalid content + QTest::newRow("null-character") << u"\0"_s << QString(); + QTest::newRow("vertical-tab") << "\v" << QString(); + QTest::newRow("form-feed") << "\f" << QString(); + QTest::newRow("esc") << "\x1f" << QString(); + QTest::newRow("U+FFFE") << u"\xfffe"_s << QString(); + QTest::newRow("U+FFFF") << u"\xffff"_s << QString(); + + // simple strings + QTest::newRow("us-ascii") << "Hello, world" << "Hello, world"; + QTest::newRow("latin1") << "Bokmål" << "Bokmål"; + QTest::newRow("nonlatin1") << "Ελληνικά" << "Ελληνικά"; + QTest::newRow("nonbmp") << u"\U00010000"_s << u"\U00010000"_s; + + // escaped content + QTest::newRow("less-than") << "<" << "&lt;"; + QTest::newRow("greater-than") << ">" << "&gt;"; + QTest::newRow("ampersand") << "&" << "&amp;"; + QTest::newRow("quote") << "\"" << "&quot;"; +} + +template <typename Execute, typename Transform> +static void writeCharacters_common(Execute &&exec, Transform &&transform) +{ + QFETCH(QString, input); + QFETCH(QString, output); + QStringView utf16 = input; + QByteArray utf8ba = input.toUtf8(); + QUtf8StringView utf8(utf8ba); + + // may be invalid if input is not Latin1 + QByteArray l1ba = input.toLatin1(); + QLatin1StringView l1(l1ba); + if (l1 != input) + l1 = {}; + + auto write = [&](auto input) -> std::optional<QString> { + QString result; + QXmlStreamWriter writer(&result); + writer.writeStartElement("a"); + exec(writer, input); + writer.writeEndElement(); + if (writer.hasError()) + return std::nullopt; + return result; + }; + + if (input.isNull() != output.isNull()) { + // error + QCOMPARE(write(utf16),
std::nullopt); + QCOMPARE(write(utf8), std::nullopt); + if (!l1.isEmpty()) + QCOMPARE(write(l1), std::nullopt); + } else { + output = transform(output); + QCOMPARE(write(utf16), output); + QCOMPARE(write(utf8), output); + if (!l1.isEmpty()) + QCOMPARE(write(l1), output); + } +} + +void tst_QXmlStream::writeCharacters_data() const +{ + writeCharacters_data_common(); + QTest::newRow("tab") << "\t" << "\t"; + QTest::newRow("newline") << "\n" << "\n"; + QTest::newRow("carriage-return") << "\r" << "\r"; +} + +void tst_QXmlStream::writeCharacters() const +{ + auto exec = [](QXmlStreamWriter &writer, auto input) { + writer.writeCharacters(input); + }; + auto transform = [](auto output) { return "<a>" + output + "</a>"; }; + writeCharacters_common(exec, transform); +} + +void tst_QXmlStream::writeAttribute_data() const +{ + writeCharacters_data_common(); + QTest::newRow("tab") << "\t" << "&#9;"; + QTest::newRow("newline") << "\n" << "&#10;"; + QTest::newRow("carriage-return") << "\r" << "&#13;"; +} + +void tst_QXmlStream::writeAttribute() const +{ + auto exec = [](QXmlStreamWriter &writer, auto input) { + writer.writeAttribute("b", input); + }; + auto transform = [](auto output) { return "<a b=\"" + output + "\"/>"; }; + writeCharacters_common(exec, transform); +} + +#include "../../io/qurlinternal/utf8data.cpp" +void tst_QXmlStream::writeBadCharactersUtf8_data() const +{ + QTest::addColumn<QByteArray>("input"); + loadInvalidUtf8Rows(); +} + +void tst_QXmlStream::writeBadCharactersUtf8() const +{ + QFETCH(QByteArray, input); + QString target; + QXmlStreamWriter writer(&target); + writer.writeTextElement("a", QUtf8StringView(input)); + QVERIFY(writer.hasError()); +} + void tst_QXmlStream::entitiesAndWhitespace_1() const { QXmlStreamReader reader(QLatin1String("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\"><test>&extEnt;</test>")); |