Commit a3b939ce583b925439d3b549780bbdbb68611ea6

Authored by Jay Berkenbilt
1 parent f5a1e287

Tweak utf8 checks

libqpdf/QUtil.cc
@@ -1825,16 +1825,12 @@ QUtil::analyze_encoding( @@ -1825,16 +1825,12 @@ QUtil::analyze_encoding(
1825 bool any_errors = false; 1825 bool any_errors = false;
1826 while (pos < len) { 1826 while (pos < len) {
1827 bool error = false; 1827 bool error = false;
1828 - auto old_pos = pos;  
1829 - unsigned long codepoint = get_next_utf8_codepoint(val, pos, error); 1828 + auto o_pos = pos;
  1829 + get_next_utf8_codepoint(val, pos, error);
1830 if (error) { 1830 if (error) {
1831 any_errors = true; 1831 any_errors = true;
1832 - for (auto p = old_pos; p < pos; p++) {  
1833 - if (static_cast<unsigned char>(val.at(p)) >= 128) {  
1834 - has_8bit_chars = true;  
1835 - }  
1836 - }  
1837 - } else if (codepoint >= 128) { 1832 + }
  1833 + if (pos - o_pos > 1 || val[o_pos] & 0x80) {
1838 has_8bit_chars = true; 1834 has_8bit_chars = true;
1839 } 1835 }
1840 } 1836 }
libtests/qutil.cc
@@ -269,20 +269,21 @@ to_utf8_test() @@ -269,20 +269,21 @@ to_utf8_test()
269 269
270 // Overlong characters: characters represented by more bytes than necessary. 270 // Overlong characters: characters represented by more bytes than necessary.
271 size_t pos = 0; 271 size_t pos = 0;
272 - std::string utf8 = "\xC0\x80" // 1 << 7  
273 - "\xE0\x80\x80" // 1 << 11  
274 - "\xF0\x80\x80\x80" // 1 << 16  
275 - "\xF8\x80\x80\x80\x80" // 1 << 21  
276 - "\xFC\x80\x80\x80\x80\x80"; // 1 << 26  
277 - auto check = [&pos, &utf8](unsigned long wanted_pos) { 272 + std::string utf8 = "\xC0\x81" // 1 << 7
  273 + "\xE0\x80\x82" // 1 << 11
  274 + "\xF0\x80\x80\x83" // 1 << 16
  275 + "\xF8\x80\x80\x80\x84" // 1 << 21
  276 + "\xFC\x80\x80\x80\x80\x85"; // 1 << 26
  277 + auto check = [&pos, &utf8](unsigned long val, unsigned long wanted_pos) {
278 bool error = false; 278 bool error = false;
279 - assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0 && error && pos == wanted_pos); 279 + assert(
  280 + QUtil::get_next_utf8_codepoint(utf8, pos, error) == val && error && pos == wanted_pos);
280 }; 281 };
281 - check(2);  
282 - check(5);  
283 - check(9);  
284 - check(14);  
285 - check(20); 282 + check(1, 2);
  283 + check(2, 5);
  284 + check(3, 9);
  285 + check(4, 14);
  286 + check(5, 20);
286 } 287 }
287 288
288 static void 289 static void
@@ -361,7 +362,8 @@ check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16) @@ -361,7 +362,8 @@ check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16)
361 bool is_utf16 = false; 362 bool is_utf16 = false;
362 QUtil::analyze_encoding(str, has_8bit_chars, is_valid_utf8, is_utf16); 363 QUtil::analyze_encoding(str, has_8bit_chars, is_valid_utf8, is_utf16);
363 if (!((has_8bit_chars == has8bit) && (is_valid_utf8 == utf8) && (is_utf16 == utf16))) { 364 if (!((has_8bit_chars == has8bit) && (is_valid_utf8 == utf8) && (is_utf16 == utf16))) {
364 - std::cout << "analysis failed: " << str << std::endl; 365 + std::cout << "analysis failed: " << str << ": 8bit: " << has_8bit_chars
  366 + << ", utf8: " << is_valid_utf8 << ", utf16: " << is_utf16 << std::endl;
365 } 367 }
366 } 368 }
367 369
@@ -389,6 +391,7 @@ transcoding_test() @@ -389,6 +391,7 @@ transcoding_test()
389 check_analyze("pi = \317\200", true, true, false); 391 check_analyze("pi = \317\200", true, true, false);
390 check_analyze("pi != \317", true, false, false); 392 check_analyze("pi != \317", true, false, false);
391 check_analyze("pi != 22/7", false, false, false); 393 check_analyze("pi != 22/7", false, false, false);
  394 + check_analyze("\xE0\x80\x82", true, false, false);
392 check_analyze(std::string("\xfe\xff\x00\x51", 4), true, false, true); 395 check_analyze(std::string("\xfe\xff\x00\x51", 4), true, false, true);
393 check_analyze(std::string("\xff\xfe\x51\x00", 4), true, false, true); 396 check_analyze(std::string("\xff\xfe\x51\x00", 4), true, false, true);
394 std::cout << "analysis done" << std::endl; 397 std::cout << "analysis done" << std::endl;
qpdf/qtest/qpdf/weird-tokens-alt.json
@@ -16,6 +16,8 @@ @@ -16,6 +16,8 @@
16 "n:/OVERLONG+#c0#81", 16 "n:/OVERLONG+#c0#81",
17 "n:/OVERLONG+#e0#81#82", 17 "n:/OVERLONG+#e0#81#82",
18 "n:/OVERLONG+#f0#81#82#83", 18 "n:/OVERLONG+#f0#81#82#83",
  19 + "n:/range+#01",
  20 + "n:/low+#18",
19 "/ABCEDEF+π", 21 "/ABCEDEF+π",
20 "n:/one+#a0two", 22 "n:/one+#a0two",
21 "n:/text#2fplain", 23 "n:/text#2fplain",
qpdf/qtest/qpdf/weird-tokens.json
@@ -16,6 +16,8 @@ @@ -16,6 +16,8 @@
16 "n:/OVERLONG+#c0#81", 16 "n:/OVERLONG+#c0#81",
17 "n:/OVERLONG+#e0#81#82", 17 "n:/OVERLONG+#e0#81#82",
18 "n:/OVERLONG+#f0#81#82#83", 18 "n:/OVERLONG+#f0#81#82#83",
  19 + "/range+\u0001",
  20 + "/low+\u0018",
19 "/ABCEDEF+π", 21 "/ABCEDEF+π",
20 "n:/one+#a0two", 22 "n:/one+#a0two",
21 "/text/plain", 23 "/text/plain",
qpdf/qtest/qpdf/weird-tokens.pdf
@@ -10,6 +10,8 @@ @@ -10,6 +10,8 @@
10 /OVERLONG+#c0#81 10 /OVERLONG+#c0#81
11 /OVERLONG+#e0#81#82 11 /OVERLONG+#e0#81#82
12 /OVERLONG+#f0#81#82#83 12 /OVERLONG+#f0#81#82#83
  13 + /range+#01
  14 + /low+#18
13 /ABCEDEF+#cf#80 15 /ABCEDEF+#cf#80
14 /one+#a0two 16 /one+#a0two
15 /text#2fplain 17 /text#2fplain
@@ -83,16 +85,16 @@ xref @@ -83,16 +85,16 @@ xref
83 0 7 85 0 7
84 0000000000 65535 f 86 0000000000 65535 f
85 0000000025 00000 n 87 0000000025 00000 n
86 -0000000333 00000 n  
87 -0000000415 00000 n  
88 -0000000611 00000 n  
89 -0000000710 00000 n  
90 -0000000729 00000 n 88 +0000000361 00000 n
  89 +0000000443 00000 n
  90 +0000000639 00000 n
  91 +0000000738 00000 n
  92 +0000000757 00000 n
91 trailer << 93 trailer <<
92 /Root 1 0 R 94 /Root 1 0 R
93 /Size 7 95 /Size 7
94 /ID [<42841c13bbf709d79a200fa1691836f8><728c020f464c3cf7e02c12605fa7d88b>] 96 /ID [<42841c13bbf709d79a200fa1691836f8><728c020f464c3cf7e02c12605fa7d88b>]
95 >> 97 >>
96 startxref 98 startxref
97 -835 99 +863
98 %%EOF 100 %%EOF