Commit a3b939ce583b925439d3b549780bbdbb68611ea6
1 parent
f5a1e287
Tweak utf8 checks
Showing
5 changed files
with
32 additions
and
27 deletions
libqpdf/QUtil.cc
| ... | ... | @@ -1825,16 +1825,12 @@ QUtil::analyze_encoding( |
| 1825 | 1825 | bool any_errors = false; |
| 1826 | 1826 | while (pos < len) { |
| 1827 | 1827 | bool error = false; |
| 1828 | - auto old_pos = pos; | |
| 1829 | - unsigned long codepoint = get_next_utf8_codepoint(val, pos, error); | |
| 1828 | + auto o_pos = pos; | |
| 1829 | + get_next_utf8_codepoint(val, pos, error); | |
| 1830 | 1830 | if (error) { |
| 1831 | 1831 | any_errors = true; |
| 1832 | - for (auto p = old_pos; p < pos; p++) { | |
| 1833 | - if (static_cast<unsigned char>(val.at(p)) >= 128) { | |
| 1834 | - has_8bit_chars = true; | |
| 1835 | - } | |
| 1836 | - } | |
| 1837 | - } else if (codepoint >= 128) { | |
| 1832 | + } | |
| 1833 | + if (pos - o_pos > 1 || val[o_pos] & 0x80) { | |
| 1838 | 1834 | has_8bit_chars = true; |
| 1839 | 1835 | } |
| 1840 | 1836 | } | ... | ... |
libtests/qutil.cc
| ... | ... | @@ -269,20 +269,21 @@ to_utf8_test() |
| 269 | 269 | |
| 270 | 270 | // Overlong characters: characters represented by more bytes than necessary. |
| 271 | 271 | size_t pos = 0; |
| 272 | - std::string utf8 = "\xC0\x80" // 1 << 7 | |
| 273 | - "\xE0\x80\x80" // 1 << 11 | |
| 274 | - "\xF0\x80\x80\x80" // 1 << 16 | |
| 275 | - "\xF8\x80\x80\x80\x80" // 1 << 21 | |
| 276 | - "\xFC\x80\x80\x80\x80\x80"; // 1 << 26 | |
| 277 | - auto check = [&pos, &utf8](unsigned long wanted_pos) { | |
| 272 | + std::string utf8 = "\xC0\x81" // 1 << 7 | |
| 273 | + "\xE0\x80\x82" // 1 << 11 | |
| 274 | + "\xF0\x80\x80\x83" // 1 << 16 | |
| 275 | + "\xF8\x80\x80\x80\x84" // 1 << 21 | |
| 276 | + "\xFC\x80\x80\x80\x80\x85"; // 1 << 26 | |
| 277 | + auto check = [&pos, &utf8](unsigned long val, unsigned long wanted_pos) { | |
| 278 | 278 | bool error = false; |
| 279 | - assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0 && error && pos == wanted_pos); | |
| 279 | + assert( | |
| 280 | + QUtil::get_next_utf8_codepoint(utf8, pos, error) == val && error && pos == wanted_pos); | |
| 280 | 281 | }; |
| 281 | - check(2); | |
| 282 | - check(5); | |
| 283 | - check(9); | |
| 284 | - check(14); | |
| 285 | - check(20); | |
| 282 | + check(1, 2); | |
| 283 | + check(2, 5); | |
| 284 | + check(3, 9); | |
| 285 | + check(4, 14); | |
| 286 | + check(5, 20); | |
| 286 | 287 | } |
| 287 | 288 | |
| 288 | 289 | static void |
| ... | ... | @@ -361,7 +362,8 @@ check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16) |
| 361 | 362 | bool is_utf16 = false; |
| 362 | 363 | QUtil::analyze_encoding(str, has_8bit_chars, is_valid_utf8, is_utf16); |
| 363 | 364 | if (!((has_8bit_chars == has8bit) && (is_valid_utf8 == utf8) && (is_utf16 == utf16))) { |
| 364 | - std::cout << "analysis failed: " << str << std::endl; | |
| 365 | + std::cout << "analysis failed: " << str << ": 8bit: " << has_8bit_chars | |
| 366 | + << ", utf8: " << is_valid_utf8 << ", utf16: " << is_utf16 << std::endl; | |
| 365 | 367 | } |
| 366 | 368 | } |
| 367 | 369 | |
| ... | ... | @@ -389,6 +391,7 @@ transcoding_test() |
| 389 | 391 | check_analyze("pi = \317\200", true, true, false); |
| 390 | 392 | check_analyze("pi != \317", true, false, false); |
| 391 | 393 | check_analyze("pi != 22/7", false, false, false); |
| 394 | + check_analyze("\xE0\x80\x82", true, false, false); | |
| 392 | 395 | check_analyze(std::string("\xfe\xff\x00\x51", 4), true, false, true); |
| 393 | 396 | check_analyze(std::string("\xff\xfe\x51\x00", 4), true, false, true); |
| 394 | 397 | std::cout << "analysis done" << std::endl; | ... | ... |
qpdf/qtest/qpdf/weird-tokens-alt.json
qpdf/qtest/qpdf/weird-tokens.json
qpdf/qtest/qpdf/weird-tokens.pdf
| ... | ... | @@ -10,6 +10,8 @@ |
| 10 | 10 | /OVERLONG+#c0#81 |
| 11 | 11 | /OVERLONG+#e0#81#82 |
| 12 | 12 | /OVERLONG+#f0#81#82#83 |
| 13 | + /range+#01 | |
| 14 | + /low+#18 | |
| 13 | 15 | /ABCEDEF+#cf#80 |
| 14 | 16 | /one+#a0two |
| 15 | 17 | /text#2fplain |
| ... | ... | @@ -83,16 +85,16 @@ xref |
| 83 | 85 | 0 7 |
| 84 | 86 | 0000000000 65535 f |
| 85 | 87 | 0000000025 00000 n |
| 86 | -0000000333 00000 n | |
| 87 | -0000000415 00000 n | |
| 88 | -0000000611 00000 n | |
| 89 | -0000000710 00000 n | |
| 90 | -0000000729 00000 n | |
| 88 | +0000000361 00000 n | |
| 89 | +0000000443 00000 n | |
| 90 | +0000000639 00000 n | |
| 91 | +0000000738 00000 n | |
| 92 | +0000000757 00000 n | |
| 91 | 93 | trailer << |
| 92 | 94 | /Root 1 0 R |
| 93 | 95 | /Size 7 |
| 94 | 96 | /ID [<42841c13bbf709d79a200fa1691836f8><728c020f464c3cf7e02c12605fa7d88b>] |
| 95 | 97 | >> |
| 96 | 98 | startxref |
| 97 | -835 | |
| 99 | +863 | |
| 98 | 100 | %%EOF | ... | ... |