summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorThiago Macieira <[email protected]>2024-02-15 15:04:18 -0800
committerQt Cherry-pick Bot <[email protected]>2024-04-30 18:23:37 +0000
commit47dd5b7aed6ccc623b3c08f74ed89112be112ec4 (patch)
tree355f64c2f36dfbed58843ec8baf6c801af3f9f3a
parente88c2a3b3571ab549425e13b6ad108ec220e2713 (diff)
QXmlStreamWriter: decode UTF-8 into code points
We were iterating over code *units* and that yielded wrong results. The one from the bug report was simply caused by the fact that QUtf8StringView::value_type is char, which is signed on x86, so the expression: *it <= u'\x1F' was true for all non-Latin1 content. But in attempting to fix this, I needed to do the proper UTF-8 decoding, as otherwise we wouldn't catch non-Latin1 sequences and such. [ChangeLog][QtCore][QXmlStreamWriter] Fixed a bug that caused the class to fail to write UTF-8 strings with non-US-ASCII content when passed as a QUtf8StringView. Fixes: QTBUG-122241 Change-Id: I83dda2d36c904517b3c0fffd17b42bbf09a493d0 Reviewed-by: Mate Barany <[email protected]> (cherry picked from commit 94c62e322264e2e7d61193ae74ba8556a330385c) Reviewed-by: Qt Cherry-pick Bot <[email protected]> (cherry picked from commit 6bef40cb821bcaa0df62c17b7e6d19e95c9cea21)
-rw-r--r--src/corelib/serialization/qxmlstream.cpp46
-rw-r--r--src/corelib/text/qstringconverter_p.h9
-rw-r--r--tests/auto/corelib/serialization/qxmlstream/tst_qxmlstream.cpp125
3 files changed, 170 insertions, 10 deletions
diff --git a/src/corelib/serialization/qxmlstream.cpp b/src/corelib/serialization/qxmlstream.cpp
index f34929bfa2d..15d9d6ba3b2 100644
--- a/src/corelib/serialization/qxmlstream.cpp
+++ b/src/corelib/serialization/qxmlstream.cpp
@@ -2981,54 +2981,80 @@ void QXmlStreamWriterPrivate::write(QAnyStringView s)
void QXmlStreamWriterPrivate::writeEscaped(QAnyStringView s, bool escapeWhitespace)
{
+ struct NextLatin1 {
+ char32_t operator()(const char *&it, const char *) const
+ { return uchar(*it++); }
+ };
+ struct NextUtf8 {
+ char32_t operator()(const char *&it, const char *end) const
+ {
+ uchar uc = *it++;
+ char32_t utf32 = 0;
+ char32_t *output = &utf32;
+ qsizetype n = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(uc, output, it, end);
+ return n < 0 ? 0 : utf32;
+ }
+ };
+ struct NextUtf16 {
+ char32_t operator()(const QChar *&it, const QChar *) const
+ {
+ return (it++)->unicode();
+ }
+ };
+
QString escaped;
escaped.reserve(s.size());
s.visit([&] (auto s) {
using View = decltype(s);
+ using Decoder = std::conditional_t<std::is_same_v<View, QLatin1StringView>, NextLatin1,
+ std::conditional_t<std::is_same_v<View, QUtf8StringView>, NextUtf8, NextUtf16>>;
auto it = s.begin();
const auto end = s.end();
+ Decoder decoder;
while (it != end) {
QLatin1StringView replacement;
auto mark = it;
while (it != end) {
- if (*it == u'<') {
+ auto next_it = it;
+ char32_t uc = decoder(next_it, end);
+ if (uc == u'<') {
replacement = "&lt;"_L1;
break;
- } else if (*it == u'>') {
+ } else if (uc == u'>') {
replacement = "&gt;"_L1;
break;
- } else if (*it == u'&') {
+ } else if (uc == u'&') {
replacement = "&amp;"_L1;
break;
- } else if (*it == u'\"') {
+ } else if (uc == u'\"') {
replacement = "&quot;"_L1;
break;
- } else if (*it == u'\t') {
+ } else if (uc == u'\t') {
if (escapeWhitespace) {
replacement = "&#9;"_L1;
break;
}
- } else if (*it == u'\n') {
+ } else if (uc == u'\n') {
if (escapeWhitespace) {
replacement = "&#10;"_L1;
break;
}
- } else if (*it == u'\v' || *it == u'\f') {
+ } else if (uc == u'\v' || uc == u'\f') {
hasEncodingError = true;
break;
- } else if (*it == u'\r') {
+ } else if (uc == u'\r') {
if (escapeWhitespace) {
replacement = "&#13;"_L1;
break;
}
- } else if (*it <= u'\x1F' || *it >= u'\uFFFE') {
+ } else if (uc <= u'\x1F' || uc == u'\uFFFE' || uc == u'\uFFFF') {
hasEncodingError = true;
break;
}
- ++it;
+ it = next_it;
}
escaped.append(View{mark, it});
diff --git a/src/corelib/text/qstringconverter_p.h b/src/corelib/text/qstringconverter_p.h
index 9b633a8f7e1..f1fe50da971 100644
--- a/src/corelib/text/qstringconverter_p.h
+++ b/src/corelib/text/qstringconverter_p.h
@@ -43,18 +43,27 @@ struct QUtf8BaseTraits
static void appendByte(qchar8_t *&ptr, qchar8_t b)
{ *ptr++ = b; }
+ static uchar peekByte(const char *ptr, qsizetype n = 0)
+ { return ptr[n]; }
+
static uchar peekByte(const uchar *ptr, qsizetype n = 0)
{ return ptr[n]; }
static uchar peekByte(const qchar8_t *ptr, qsizetype n = 0)
{ return ptr[n]; }
+ static qptrdiff availableBytes(const char *ptr, const char *end)
+ { return end - ptr; }
+
static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
{ return end - ptr; }
static qptrdiff availableBytes(const qchar8_t *ptr, const qchar8_t *end)
{ return end - ptr; }
+ static void advanceByte(const char *&ptr, qsizetype n = 1)
+ { ptr += n; }
+
static void advanceByte(const uchar *&ptr, qsizetype n = 1)
{ ptr += n; }
diff --git a/tests/auto/corelib/serialization/qxmlstream/tst_qxmlstream.cpp b/tests/auto/corelib/serialization/qxmlstream/tst_qxmlstream.cpp
index aa35468b936..08de2c36356 100644
--- a/tests/auto/corelib/serialization/qxmlstream/tst_qxmlstream.cpp
+++ b/tests/auto/corelib/serialization/qxmlstream/tst_qxmlstream.cpp
@@ -569,6 +569,12 @@ private slots:
void hasAttribute() const;
void writeWithUtf8Codec() const;
void writeWithStandalone() const;
+ void writeCharacters_data() const;
+ void writeCharacters() const;
+ void writeAttribute_data() const;
+ void writeAttribute() const;
+ void writeBadCharactersUtf8_data() const;
+ void writeBadCharactersUtf8() const;
void entitiesAndWhitespace_1() const;
void entitiesAndWhitespace_2() const;
void testFalsePrematureError() const;
@@ -1368,6 +1374,125 @@ void tst_QXmlStream::writeWithStandalone() const
}
}
+static void writeCharacters_data_common()
+{
+ QTest::addColumn<QString>("input");
+ QTest::addColumn<QString>("output");
+
+ QTest::newRow("empty") << QString() << QString();
+
+ // invalid content
+ QTest::newRow("null-character") << u"\0"_s << QString();
+ QTest::newRow("vertical-tab") << "\v" << QString();
+ QTest::newRow("form-feed") << "\f" << QString();
+ QTest::newRow("esc") << "\x1f" << QString();
+ QTest::newRow("U+FFFE") << u"\xfffe"_s << QString();
+ QTest::newRow("U+FFFF") << u"\xffff"_s << QString();
+
+ // simple strings
+ QTest::newRow("us-ascii") << "Hello, world" << "Hello, world";
+ QTest::newRow("latin1") << "Bokmål" << "Bokmål";
+ QTest::newRow("nonlatin1") << "Ελληνικά" << "Ελληνικά";
+ QTest::newRow("nonbmp") << u"\U00010000"_s << u"\U00010000"_s;
+
+ // escaped content
+ QTest::newRow("less-than") << "<" << "&lt;";
+ QTest::newRow("greater-than") << ">" << "&gt;";
+ QTest::newRow("ampersand") << "&" << "&amp;";
+ QTest::newRow("quote") << "\"" << "&quot;";
+}
+
+template <typename Execute, typename Transform>
+static void writeCharacters_common(Execute &&exec, Transform &&transform)
+{
+ QFETCH(QString, input);
+ QFETCH(QString, output);
+ QStringView utf16 = input;
+ QByteArray utf8ba = input.toUtf8();
+ QUtf8StringView utf8(utf8ba);
+
+ // may be invalid if input is not Latin1
+ QByteArray l1ba = input.toLatin1();
+ QLatin1StringView l1(l1ba);
+ if (l1 != input)
+ l1 = {};
+
+ auto write = [&](auto input) -> std::optional<QString> {
+ QString result;
+ QXmlStreamWriter writer(&result);
+ writer.writeStartElement("a");
+ exec(writer, input);
+ writer.writeEndElement();
+ if (writer.hasError())
+ return std::nullopt;
+ return result;
+ };
+
+ if (input.isNull() != output.isNull()) {
+ // error
+ QCOMPARE(write(utf16), std::nullopt);
+ QCOMPARE(write(utf8), std::nullopt);
+ if (!l1.isEmpty())
+ QCOMPARE(write(l1), std::nullopt);
+ } else {
+ output = transform(output);
+ QCOMPARE(write(utf16), output);
+ QCOMPARE(write(utf8), output);
+ if (!l1.isEmpty())
+ QCOMPARE(write(l1), output);
+ }
+}
+
+void tst_QXmlStream::writeCharacters_data() const
+{
+ writeCharacters_data_common();
+ QTest::newRow("tab") << "\t" << "\t";
+ QTest::newRow("newline") << "\n" << "\n";
+ QTest::newRow("carriage-return") << "\r" << "\r";
+}
+
+void tst_QXmlStream::writeCharacters() const
+{
+ auto exec = [](QXmlStreamWriter &writer, auto input) {
+ writer.writeCharacters(input);
+ };
+ auto transform = [](auto output) { return "<a>" + output + "</a>"; };
+ writeCharacters_common(exec, transform);
+}
+
+void tst_QXmlStream::writeAttribute_data() const
+{
+ writeCharacters_data_common();
+ QTest::newRow("tab") << "\t" << "&#9;";
+ QTest::newRow("newline") << "\n" << "&#10;";
+ QTest::newRow("carriage-return") << "\r" << "&#13;";
+}
+
+void tst_QXmlStream::writeAttribute() const
+{
+ auto exec = [](QXmlStreamWriter &writer, auto input) {
+ writer.writeAttribute("b", input);
+ };
+ auto transform = [](auto output) { return "<a b=\"" + output + "\"/>"; };
+ writeCharacters_common(exec, transform);
+}
+
+#include "../../io/qurlinternal/utf8data.cpp"
+void tst_QXmlStream::writeBadCharactersUtf8_data() const
+{
+ QTest::addColumn<QByteArray>("input");
+ loadInvalidUtf8Rows();
+}
+
+void tst_QXmlStream::writeBadCharactersUtf8() const
+{
+ QFETCH(QByteArray, input);
+ QString target;
+ QXmlStreamWriter writer(&target);
+ writer.writeTextElement("a", QUtf8StringView(input));
+ QVERIFY(writer.hasError());
+}
+
void tst_QXmlStream::entitiesAndWhitespace_1() const
{
QXmlStreamReader reader(QLatin1String("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\"><test>&extEnt;</test>"));