Commit a3b939ce583b925439d3b549780bbdbb68611ea6
1 parent: f5a1e287
Tweak utf8 checks
Showing 5 changed files with 32 additions and 27 deletions
libqpdf/QUtil.cc
| @@ -1825,16 +1825,12 @@ QUtil::analyze_encoding( | @@ -1825,16 +1825,12 @@ QUtil::analyze_encoding( | ||
| 1825 | bool any_errors = false; | 1825 | bool any_errors = false; |
| 1826 | while (pos < len) { | 1826 | while (pos < len) { |
| 1827 | bool error = false; | 1827 | bool error = false; |
| 1828 | - auto old_pos = pos; | ||
| 1829 | - unsigned long codepoint = get_next_utf8_codepoint(val, pos, error); | 1828 | + auto o_pos = pos; |
| 1829 | + get_next_utf8_codepoint(val, pos, error); | ||
| 1830 | if (error) { | 1830 | if (error) { |
| 1831 | any_errors = true; | 1831 | any_errors = true; |
| 1832 | - for (auto p = old_pos; p < pos; p++) { | ||
| 1833 | - if (static_cast<unsigned char>(val.at(p)) >= 128) { | ||
| 1834 | - has_8bit_chars = true; | ||
| 1835 | - } | ||
| 1836 | - } | ||
| 1837 | - } else if (codepoint >= 128) { | 1832 | + } |
| 1833 | + if (pos - o_pos > 1 || val[o_pos] & 0x80) { | ||
| 1838 | has_8bit_chars = true; | 1834 | has_8bit_chars = true; |
| 1839 | } | 1835 | } |
| 1840 | } | 1836 | } |
libtests/qutil.cc
| @@ -269,20 +269,21 @@ to_utf8_test() | @@ -269,20 +269,21 @@ to_utf8_test() | ||
| 269 | 269 | ||
| 270 | // Overlong characters: characters represented by more bytes than necessary. | 270 | // Overlong characters: characters represented by more bytes than necessary. |
| 271 | size_t pos = 0; | 271 | size_t pos = 0; |
| 272 | - std::string utf8 = "\xC0\x80" // 1 << 7 | ||
| 273 | - "\xE0\x80\x80" // 1 << 11 | ||
| 274 | - "\xF0\x80\x80\x80" // 1 << 16 | ||
| 275 | - "\xF8\x80\x80\x80\x80" // 1 << 21 | ||
| 276 | - "\xFC\x80\x80\x80\x80\x80"; // 1 << 26 | ||
| 277 | - auto check = [&pos, &utf8](unsigned long wanted_pos) { | 272 | + std::string utf8 = "\xC0\x81" // 1 << 7 |
| 273 | + "\xE0\x80\x82" // 1 << 11 | ||
| 274 | + "\xF0\x80\x80\x83" // 1 << 16 | ||
| 275 | + "\xF8\x80\x80\x80\x84" // 1 << 21 | ||
| 276 | + "\xFC\x80\x80\x80\x80\x85"; // 1 << 26 | ||
| 277 | + auto check = [&pos, &utf8](unsigned long val, unsigned long wanted_pos) { | ||
| 278 | bool error = false; | 278 | bool error = false; |
| 279 | - assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0 && error && pos == wanted_pos); | 279 | + assert( |
| 280 | + QUtil::get_next_utf8_codepoint(utf8, pos, error) == val && error && pos == wanted_pos); | ||
| 280 | }; | 281 | }; |
| 281 | - check(2); | ||
| 282 | - check(5); | ||
| 283 | - check(9); | ||
| 284 | - check(14); | ||
| 285 | - check(20); | 282 | + check(1, 2); |
| 283 | + check(2, 5); | ||
| 284 | + check(3, 9); | ||
| 285 | + check(4, 14); | ||
| 286 | + check(5, 20); | ||
| 286 | } | 287 | } |
| 287 | 288 | ||
| 288 | static void | 289 | static void |
| @@ -361,7 +362,8 @@ check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16) | @@ -361,7 +362,8 @@ check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16) | ||
| 361 | bool is_utf16 = false; | 362 | bool is_utf16 = false; |
| 362 | QUtil::analyze_encoding(str, has_8bit_chars, is_valid_utf8, is_utf16); | 363 | QUtil::analyze_encoding(str, has_8bit_chars, is_valid_utf8, is_utf16); |
| 363 | if (!((has_8bit_chars == has8bit) && (is_valid_utf8 == utf8) && (is_utf16 == utf16))) { | 364 | if (!((has_8bit_chars == has8bit) && (is_valid_utf8 == utf8) && (is_utf16 == utf16))) { |
| 364 | - std::cout << "analysis failed: " << str << std::endl; | 365 | + std::cout << "analysis failed: " << str << ": 8bit: " << has_8bit_chars |
| 366 | + << ", utf8: " << is_valid_utf8 << ", utf16: " << is_utf16 << std::endl; | ||
| 365 | } | 367 | } |
| 366 | } | 368 | } |
| 367 | 369 | ||
| @@ -389,6 +391,7 @@ transcoding_test() | @@ -389,6 +391,7 @@ transcoding_test() | ||
| 389 | check_analyze("pi = \317\200", true, true, false); | 391 | check_analyze("pi = \317\200", true, true, false); |
| 390 | check_analyze("pi != \317", true, false, false); | 392 | check_analyze("pi != \317", true, false, false); |
| 391 | check_analyze("pi != 22/7", false, false, false); | 393 | check_analyze("pi != 22/7", false, false, false); |
| 394 | + check_analyze("\xE0\x80\x82", true, false, false); | ||
| 392 | check_analyze(std::string("\xfe\xff\x00\x51", 4), true, false, true); | 395 | check_analyze(std::string("\xfe\xff\x00\x51", 4), true, false, true); |
| 393 | check_analyze(std::string("\xff\xfe\x51\x00", 4), true, false, true); | 396 | check_analyze(std::string("\xff\xfe\x51\x00", 4), true, false, true); |
| 394 | std::cout << "analysis done" << std::endl; | 397 | std::cout << "analysis done" << std::endl; |
qpdf/qtest/qpdf/weird-tokens-alt.json
| @@ -16,6 +16,8 @@ | @@ -16,6 +16,8 @@ | ||
| 16 | "n:/OVERLONG+#c0#81", | 16 | "n:/OVERLONG+#c0#81", |
| 17 | "n:/OVERLONG+#e0#81#82", | 17 | "n:/OVERLONG+#e0#81#82", |
| 18 | "n:/OVERLONG+#f0#81#82#83", | 18 | "n:/OVERLONG+#f0#81#82#83", |
| 19 | + "n:/range+#01", | ||
| 20 | + "n:/low+#18", | ||
| 19 | "/ABCEDEF+π", | 21 | "/ABCEDEF+π", |
| 20 | "n:/one+#a0two", | 22 | "n:/one+#a0two", |
| 21 | "n:/text#2fplain", | 23 | "n:/text#2fplain", |
qpdf/qtest/qpdf/weird-tokens.json
| @@ -16,6 +16,8 @@ | @@ -16,6 +16,8 @@ | ||
| 16 | "n:/OVERLONG+#c0#81", | 16 | "n:/OVERLONG+#c0#81", |
| 17 | "n:/OVERLONG+#e0#81#82", | 17 | "n:/OVERLONG+#e0#81#82", |
| 18 | "n:/OVERLONG+#f0#81#82#83", | 18 | "n:/OVERLONG+#f0#81#82#83", |
| 19 | + "/range+\u0001", | ||
| 20 | + "/low+\u0018", | ||
| 19 | "/ABCEDEF+π", | 21 | "/ABCEDEF+π", |
| 20 | "n:/one+#a0two", | 22 | "n:/one+#a0two", |
| 21 | "/text/plain", | 23 | "/text/plain", |
qpdf/qtest/qpdf/weird-tokens.pdf
| @@ -10,6 +10,8 @@ | @@ -10,6 +10,8 @@ | ||
| 10 | /OVERLONG+#c0#81 | 10 | /OVERLONG+#c0#81 |
| 11 | /OVERLONG+#e0#81#82 | 11 | /OVERLONG+#e0#81#82 |
| 12 | /OVERLONG+#f0#81#82#83 | 12 | /OVERLONG+#f0#81#82#83 |
| 13 | + /range+#01 | ||
| 14 | + /low+#18 | ||
| 13 | /ABCEDEF+#cf#80 | 15 | /ABCEDEF+#cf#80 |
| 14 | /one+#a0two | 16 | /one+#a0two |
| 15 | /text#2fplain | 17 | /text#2fplain |
| @@ -83,16 +85,16 @@ xref | @@ -83,16 +85,16 @@ xref | ||
| 83 | 0 7 | 85 | 0 7 |
| 84 | 0000000000 65535 f | 86 | 0000000000 65535 f |
| 85 | 0000000025 00000 n | 87 | 0000000025 00000 n |
| 86 | -0000000333 00000 n | ||
| 87 | -0000000415 00000 n | ||
| 88 | -0000000611 00000 n | ||
| 89 | -0000000710 00000 n | ||
| 90 | -0000000729 00000 n | 88 | +0000000361 00000 n |
| 89 | +0000000443 00000 n | ||
| 90 | +0000000639 00000 n | ||
| 91 | +0000000738 00000 n | ||
| 92 | +0000000757 00000 n | ||
| 91 | trailer << | 93 | trailer << |
| 92 | /Root 1 0 R | 94 | /Root 1 0 R |
| 93 | /Size 7 | 95 | /Size 7 |
| 94 | /ID [<42841c13bbf709d79a200fa1691836f8><728c020f464c3cf7e02c12605fa7d88b>] | 96 | /ID [<42841c13bbf709d79a200fa1691836f8><728c020f464c3cf7e02c12605fa7d88b>] |
| 95 | >> | 97 | >> |
| 96 | startxref | 98 | startxref |
| 97 | -835 | 99 | +863 |
| 98 | %%EOF | 100 | %%EOF |