Commit a478cbb6dc0e630b919813ad0e7ae1a72510c69d

Authored by Jay Berkenbilt
1 parent fbd3e56d

Silently/transparently recognize UTF-16LE as UTF-16 (fixes #649)

The PDF spec only allows UTF-16BE, but most readers seem to accept
UTF-16LE as well, so now qpdf does too.
ChangeLog
1 2022-02-15 Jay Berkenbilt <ejb@ql.org> 1 2022-02-15 Jay Berkenbilt <ejb@ql.org>
2 2
  3 + * When analyzing PDF strings, recognize UTF-16LE as UTF-16. The
  4 + PDF spec only allows UTF-16BE, but most readers seem to allow
  5 + both. Fixes #649.
  6 +
3 * Bug fix: 10.6.0 inadvertently removed an unknown/undocumented 7 * Bug fix: 10.6.0 inadvertently removed an unknown/undocumented
4 CLI parsing feature, which has been restored in 10.6.2. Fixes #652. 8 CLI parsing feature, which has been restored in 10.6.2. Fixes #652.
5 9
include/qpdf/QUtil.hh
@@ -267,8 +267,11 @@ namespace QUtil @@ -267,8 +267,11 @@ namespace QUtil
267 QPDF_DLL 267 QPDF_DLL
268 std::string toUTF16(unsigned long uval); 268 std::string toUTF16(unsigned long uval);
269 269
270 - // Test whether this is a UTF-16 big-endian string. This is  
271 - // indicated by first two bytes being 0xFE 0xFF. 270 + // Test whether this is a UTF-16 string. This is indicated by
  271 + // first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE
  272 + // (little-endian). Starting in qpdf 10.6.2, this detects
  273 + // little-endian as well as big-endian. Even though the PDF spec
  274 + // doesn't allow little-endian, most readers seem to accept it.
272 QPDF_DLL 275 QPDF_DLL
273 bool is_utf16(std::string const&); 276 bool is_utf16(std::string const&);
274 277
@@ -309,8 +312,8 @@ namespace QUtil @@ -309,8 +312,8 @@ namespace QUtil
309 bool utf8_to_pdf_doc( 312 bool utf8_to_pdf_doc(
310 std::string const& utf8, std::string& pdfdoc, char unknown_char = '?'); 313 std::string const& utf8, std::string& pdfdoc, char unknown_char = '?');
311 314
312 - // Convert a UTF-16 big-endian encoded string to UTF-8.  
313 - // Unrepresentable code points are converted to U+FFFD. 315 + // Convert a UTF-16 encoded string to UTF-8. Unrepresentable code
  316 + // points are converted to U+FFFD.
314 QPDF_DLL 317 QPDF_DLL
315 std::string utf16_to_utf8(std::string const& utf16); 318 std::string utf16_to_utf8(std::string const& utf16);
316 319
@@ -331,7 +334,9 @@ namespace QUtil @@ -331,7 +334,9 @@ namespace QUtil
331 // help us guess. If there are no characters with the high bit 334 // help us guess. If there are no characters with the high bit
332 // set, has_8bit_chars is false, and the other values are also 335 // set, has_8bit_chars is false, and the other values are also
333 // false, even though ASCII strings are valid UTF-8. is_valid_utf8 336 // false, even though ASCII strings are valid UTF-8. is_valid_utf8
334 - // means that the string is non-trivially valid UTF-8. 337 + // means that the string is non-trivially valid UTF-8. Although
  338 + // the PDF spec requires UTF-16 to be UTF-16BE, qpdf (and just
  339 + // about everything else) accepts UTF-16LE (as of 10.6.2).
335 QPDF_DLL 340 QPDF_DLL
336 void analyze_encoding(std::string const& str, 341 void analyze_encoding(std::string const& str,
337 bool& has_8bit_chars, 342 bool& has_8bit_chars,
libqpdf/QUtil.cc
@@ -2400,7 +2400,8 @@ bool @@ -2400,7 +2400,8 @@ bool
2400 QUtil::is_utf16(std::string const& val) 2400 QUtil::is_utf16(std::string const& val)
2401 { 2401 {
2402 return ((val.length() >= 2) && 2402 return ((val.length() >= 2) &&
2403 - (val.at(0) == '\xfe') && (val.at(1) == '\xff')); 2403 + (((val.at(0) == '\xfe') && (val.at(1) == '\xff')) ||
  2404 + ((val.at(0) == '\xff') && (val.at(1) == '\xfe'))));
2404 } 2405 }
2405 2406
2406 std::string 2407 std::string
@@ -2414,8 +2415,13 @@ QUtil::utf16_to_utf8(std::string const& val) @@ -2414,8 +2415,13 @@ QUtil::utf16_to_utf8(std::string const& val)
2414 unsigned long codepoint = 0L; 2415 unsigned long codepoint = 0L;
2415 size_t len = val.length(); 2416 size_t len = val.length();
2416 size_t start = 0; 2417 size_t start = 0;
  2418 + bool is_le = false;
2417 if (is_utf16(val)) 2419 if (is_utf16(val))
2418 { 2420 {
  2421 + if (static_cast<unsigned char>(val.at(0)) == 0xff)
  2422 + {
  2423 + is_le = true;
  2424 + }
2419 start += 2; 2425 start += 2;
2420 } 2426 }
2421 // If the string has an odd number of bytes, the last byte is 2427 // If the string has an odd number of bytes, the last byte is
@@ -2428,10 +2434,12 @@ QUtil::utf16_to_utf8(std::string const& val) @@ -2428,10 +2434,12 @@ QUtil::utf16_to_utf8(std::string const& val)
2428 // codepoint not followed by a low codepoint will be 2434 // codepoint not followed by a low codepoint will be
2429 // discarded, and a low codepoint not preceded by a high 2435 // discarded, and a low codepoint not preceded by a high
2430 // codepoint will just get its low 10 bits output. 2436 // codepoint will just get its low 10 bits output.
  2437 + auto msb = is_le ? i+1 : i;
  2438 + auto lsb = is_le ? i : i+1;
2431 unsigned short bits = 2439 unsigned short bits =
2432 QIntC::to_ushort( 2440 QIntC::to_ushort(
2433 - (static_cast<unsigned char>(val.at(i)) << 8) +  
2434 - static_cast<unsigned char>(val.at(i+1))); 2441 + (static_cast<unsigned char>(val.at(msb)) << 8) +
  2442 + static_cast<unsigned char>(val.at(lsb)));
2435 if ((bits & 0xFC00) == 0xD800) 2443 if ((bits & 0xFC00) == 0xD800)
2436 { 2444 {
2437 codepoint = 0x10000U + ((bits & 0x3FFU) << 10U); 2445 codepoint = 0x10000U + ((bits & 0x3FFU) << 10U);
libtests/qtest/qutil/qutil.out
@@ -63,6 +63,7 @@ HAGOOGAMAGOOGLE: 0 @@ -63,6 +63,7 @@ HAGOOGAMAGOOGLE: 0
63 0x80000000 -> ff fd 63 0x80000000 -> ff fd
64 π 64 π
65 π 65 π
  66 +LE: π
66 ---- utf8_to_ascii 67 ---- utf8_to_ascii
67 ¿Does π have fingers? 68 ¿Does π have fingers?
68 ?Does ? have fingers? 69 ?Does ? have fingers?
libtests/qutil.cc
@@ -303,6 +303,7 @@ void to_utf16_test() @@ -303,6 +303,7 @@ void to_utf16_test()
303 std::string s(QUtil::utf8_to_utf16("\xcf\x80")); 303 std::string s(QUtil::utf8_to_utf16("\xcf\x80"));
304 std::cout << QUtil::utf16_to_utf8(s) << std::endl; 304 std::cout << QUtil::utf16_to_utf8(s) << std::endl;
305 std::cout << QUtil::utf16_to_utf8(s + ".") << std::endl; 305 std::cout << QUtil::utf16_to_utf8(s + ".") << std::endl;
  306 + std::cout << "LE: " << QUtil::utf16_to_utf8("\xff\xfe\xc0\x03") << std::endl;
306 } 307 }
307 308
308 void utf8_to_ascii_test() 309 void utf8_to_ascii_test()
@@ -388,7 +389,8 @@ void transcoding_test() @@ -388,7 +389,8 @@ void transcoding_test()
388 check_analyze("pi = \317\200", true, true, false); 389 check_analyze("pi = \317\200", true, true, false);
389 check_analyze("pi != \317", true, false, false); 390 check_analyze("pi != \317", true, false, false);
390 check_analyze("pi != 22/7", false, false, false); 391 check_analyze("pi != 22/7", false, false, false);
391 - check_analyze(std::string("\xfe\xff\00\x51", 4), true, false, true); 392 + check_analyze(std::string("\xfe\xff\x00\x51", 4), true, false, true);
  393 + check_analyze(std::string("\xff\xfe\x51\x00", 4), true, false, true);
392 std::cout << "analysis done" << std::endl; 394 std::cout << "analysis done" << std::endl;
393 std::string input1("a\302\277b"); 395 std::string input1("a\302\277b");
394 std::string input2("a\317\200b"); 396 std::string input2("a\317\200b");
qpdf/qtest/qpdf.test
@@ -73,7 +73,7 @@ flush_tiff_cache(); @@ -73,7 +73,7 @@ flush_tiff_cache();
73 show_ntests(); 73 show_ntests();
74 # ---------- 74 # ----------
75 $td->notify("--- Character Encoding ---"); 75 $td->notify("--- Character Encoding ---");
76 -$n_tests += 3; 76 +$n_tests += 4;
77 77
78 $td->runtest("PDF doc encoding to Unicode", 78 $td->runtest("PDF doc encoding to Unicode",
79 {$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"}, 79 {$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"},
@@ -88,6 +88,13 @@ $td->runtest("UTF-16 encoding errors", @@ -88,6 +88,13 @@ $td->runtest("UTF-16 encoding errors",
88 {$td->FILE => "unicode-errors.out", $td->EXIT_STATUS => 0}, 88 {$td->FILE => "unicode-errors.out", $td->EXIT_STATUS => 0},
89 $td->NORMALIZE_NEWLINES); 89 $td->NORMALIZE_NEWLINES);
90 90
  91 +# UTF-16LE is not allowed by the PDF spec, but it seems that most
  92 +# readers accept it.
  93 +$td->runtest("UTF-16LE strings",
  94 + {$td->COMMAND => "qpdf --list-attachments --verbose utf16le.pdf"},
  95 + {$td->FILE => "utf16le-attachments.out", $td->EXIT_STATUS => 0},
  96 + $td->NORMALIZE_NEWLINES);
  97 +
91 # Tests to exercise QPDFArgParser belong in arg_parser.test in 98 # Tests to exercise QPDFArgParser belong in arg_parser.test in
92 # libtests. These tests are supposed to be specific to the qpdf cli. 99 # libtests. These tests are supposed to be specific to the qpdf cli.
93 # Since they were written prior to moving QPDFArgParser into the 100 # Since they were written prior to moving QPDFArgParser into the
qpdf/qtest/qpdf/utf16le-attachments.out 0 → 100644
  1 +potato.png -> 6,0
  2 + preferred name: π.png
  3 + all names:
  4 + /F -> π.png
  5 + /UF -> π.png
  6 + all data streams:
  7 + /F -> 6,0
  8 + /UF -> 6,0
qpdf/qtest/qpdf/utf16le.pdf 0 → 100644
No preview for this file type