From a3b939ce583b925439d3b549780bbdbb68611ea6 Mon Sep 17 00:00:00 2001 From: Jay Berkenbilt Date: Wed, 27 Dec 2023 10:48:54 -0500 Subject: [PATCH] Tweak utf8 checks --- libqpdf/QUtil.cc | 12 ++++-------- libtests/qutil.cc | 29 ++++++++++++++++------------- qpdf/qtest/qpdf/weird-tokens-alt.json | 2 ++ qpdf/qtest/qpdf/weird-tokens.json | 2 ++ qpdf/qtest/qpdf/weird-tokens.pdf | 14 ++++++++------ 5 files changed, 32 insertions(+), 27 deletions(-) diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc index 25c7281..7b4b119 100644 --- a/libqpdf/QUtil.cc +++ b/libqpdf/QUtil.cc @@ -1825,16 +1825,12 @@ QUtil::analyze_encoding( bool any_errors = false; while (pos < len) { bool error = false; - auto old_pos = pos; - unsigned long codepoint = get_next_utf8_codepoint(val, pos, error); + auto o_pos = pos; + get_next_utf8_codepoint(val, pos, error); if (error) { any_errors = true; - for (auto p = old_pos; p < pos; p++) { - if (static_cast(val.at(p)) >= 128) { - has_8bit_chars = true; - } - } - } else if (codepoint >= 128) { + } + if (pos - o_pos > 1 || val[o_pos] & 0x80) { has_8bit_chars = true; } } diff --git a/libtests/qutil.cc b/libtests/qutil.cc index ca6ee31..07232de 100644 --- a/libtests/qutil.cc +++ b/libtests/qutil.cc @@ -269,20 +269,21 @@ to_utf8_test() // Overlong characters: characters represented by more bytes than necessary. size_t pos = 0; - std::string utf8 = "\xC0\x80" // 1 << 7 - "\xE0\x80\x80" // 1 << 11 - "\xF0\x80\x80\x80" // 1 << 16 - "\xF8\x80\x80\x80\x80" // 1 << 21 - "\xFC\x80\x80\x80\x80\x80"; // 1 << 26 - auto check = [&pos, &utf8](unsigned long wanted_pos) { + std::string utf8 = "\xC0\x81" // 1 << 7 + "\xE0\x80\x82" // 1 << 11 + "\xF0\x80\x80\x83" // 1 << 16 + "\xF8\x80\x80\x80\x84" // 1 << 21 + "\xFC\x80\x80\x80\x80\x85"; // 1 << 26 + auto check = [&pos, &utf8](unsigned long val, unsigned long wanted_pos) { bool error = false; - assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0 && error && pos == wanted_pos); + assert( + QUtil::get_next_utf8_codepoint(utf8, pos, error) == val && error && pos == wanted_pos); }; - check(2); - check(5); - check(9); - check(14); - check(20); + check(1, 2); + check(2, 5); + check(3, 9); + check(4, 14); + check(5, 20); } static void @@ -361,7 +362,8 @@ check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16) bool is_utf16 = false; QUtil::analyze_encoding(str, has_8bit_chars, is_valid_utf8, is_utf16); if (!((has_8bit_chars == has8bit) && (is_valid_utf8 == utf8) && (is_utf16 == utf16))) { - std::cout << "analysis failed: " << str << std::endl; + std::cout << "analysis failed: " << str << ": 8bit: " << has_8bit_chars + << ", utf8: " << is_valid_utf8 << ", utf16: " << is_utf16 << std::endl; } } @@ -389,6 +391,7 @@ transcoding_test() check_analyze("pi = \317\200", true, true, false); check_analyze("pi != \317", true, false, false); check_analyze("pi != 22/7", false, false, false); + check_analyze("\xE0\x80\x82", true, false, false); check_analyze(std::string("\xfe\xff\x00\x51", 4), true, false, true); check_analyze(std::string("\xff\xfe\x51\x00", 4), true, false, true); std::cout << "analysis done" << std::endl; diff --git a/qpdf/qtest/qpdf/weird-tokens-alt.json b/qpdf/qtest/qpdf/weird-tokens-alt.json index bd54bd1..adeb5bd 100644 --- a/qpdf/qtest/qpdf/weird-tokens-alt.json +++ b/qpdf/qtest/qpdf/weird-tokens-alt.json @@ -16,6 +16,8 @@ "n:/OVERLONG+#c0#81", "n:/OVERLONG+#e0#81#82", "n:/OVERLONG+#f0#81#82#83", + "n:/range+#01", + "n:/low+#18", "/ABCEDEF+π", "n:/one+#a0two", "n:/text#2fplain", diff --git a/qpdf/qtest/qpdf/weird-tokens.json b/qpdf/qtest/qpdf/weird-tokens.json index 92a663b..bc68573 100644 --- a/qpdf/qtest/qpdf/weird-tokens.json +++ b/qpdf/qtest/qpdf/weird-tokens.json @@ -16,6 +16,8 @@ "n:/OVERLONG+#c0#81", "n:/OVERLONG+#e0#81#82", "n:/OVERLONG+#f0#81#82#83", + "/range+\u0001", + "/low+\u0018", "/ABCEDEF+π", "n:/one+#a0two", "/text/plain", diff --git a/qpdf/qtest/qpdf/weird-tokens.pdf b/qpdf/qtest/qpdf/weird-tokens.pdf index 68f4dcf..fc1a71f 100644 --- a/qpdf/qtest/qpdf/weird-tokens.pdf +++ b/qpdf/qtest/qpdf/weird-tokens.pdf @@ -10,6 +10,8 @@ /OVERLONG+#c0#81 /OVERLONG+#e0#81#82 /OVERLONG+#f0#81#82#83 + /range+#01 + /low+#18 /ABCEDEF+#cf#80 /one+#a0two /text#2fplain @@ -83,16 +85,16 @@ xref 0 7 0000000000 65535 f 0000000025 00000 n -0000000333 00000 n -0000000415 00000 n -0000000611 00000 n -0000000710 00000 n -0000000729 00000 n +0000000361 00000 n +0000000443 00000 n +0000000639 00000 n +0000000738 00000 n +0000000757 00000 n trailer << /Root 1 0 R /Size 7 /ID [<42841c13bbf709d79a200fa1691836f8><728c020f464c3cf7e02c12605fa7d88b>] >> startxref -835 +863 %%EOF -- libgit2 0.21.4