Commit a478cbb6dc0e630b919813ad0e7ae1a72510c69d

Authored by Jay Berkenbilt
1 parent fbd3e56d

Silently/transparently recognize UTF-16LE as UTF-16 (fixes #649)

The PDF spec only allows UTF-16BE, but most readers seem to accept
UTF-16LE as well, so now qpdf does too.
ChangeLog
1 1 2022-02-15 Jay Berkenbilt <ejb@ql.org>
2 2  
  3 + * When analyzing PDF strings, recognize UTF-16LE as UTF-16. The
  4 + PDF spec only allows UTF-16BE, but most readers seem to allow
  5 + both. Fixes #649.
  6 +
3 7 * Bug fix: 10.6.0 inadvertently removed an unknown/undocumented
4 8 CLI parsing feature, which has been restored in 10.6.2. Fixes #652.
5 9  
... ...
include/qpdf/QUtil.hh
... ... @@ -267,8 +267,11 @@ namespace QUtil
267 267 QPDF_DLL
268 268 std::string toUTF16(unsigned long uval);
269 269  
270   - // Test whether this is a UTF-16 big-endian string. This is
271   - // indicated by first two bytes being 0xFE 0xFF.
  270 + // Test whether this is a UTF-16 string. This is indicated by
  271 + // first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE
  272 + // (little-endian). Starting in qpdf 10.6.2, this detects
  273 + // little-endian as well as big-endian. Even though the PDF spec
  274 + // doesn't allow little-endian, most readers seem to accept it.
272 275 QPDF_DLL
273 276 bool is_utf16(std::string const&);
274 277  
... ... @@ -309,8 +312,8 @@ namespace QUtil
309 312 bool utf8_to_pdf_doc(
310 313 std::string const& utf8, std::string& pdfdoc, char unknown_char = '?');
311 314  
312   - // Convert a UTF-16 big-endian encoded string to UTF-8.
313   - // Unrepresentable code points are converted to U+FFFD.
  315 + // Convert a UTF-16 encoded string to UTF-8. Unrepresentable code
  316 + // points are converted to U+FFFD.
314 317 QPDF_DLL
315 318 std::string utf16_to_utf8(std::string const& utf16);
316 319  
... ... @@ -331,7 +334,9 @@ namespace QUtil
331 334 // help us guess. If there are no characters with the high bit
332 335 // set, has_8bit_chars is false, and the other values are also
333 336 // false, even though ASCII strings are valid UTF-8. is_valid_utf8
334   - // means that the string is non-trivially valid UTF-8.
  337 + // means that the string is non-trivially valid UTF-8. Although
  338 + // the PDF spec requires UTF-16 to be UTF-16BE, qpdf (and just
  339 + // about everything else) accepts UTF-16LE (as of 10.6.2).
335 340 QPDF_DLL
336 341 void analyze_encoding(std::string const& str,
337 342 bool& has_8bit_chars,
... ...
libqpdf/QUtil.cc
... ... @@ -2400,7 +2400,8 @@ bool
2400 2400 QUtil::is_utf16(std::string const& val)
2401 2401 {
2402 2402 return ((val.length() >= 2) &&
2403   - (val.at(0) == '\xfe') && (val.at(1) == '\xff'));
  2403 + (((val.at(0) == '\xfe') && (val.at(1) == '\xff')) ||
  2404 + ((val.at(0) == '\xff') && (val.at(1) == '\xfe'))));
2404 2405 }
2405 2406  
2406 2407 std::string
... ... @@ -2414,8 +2415,13 @@ QUtil::utf16_to_utf8(std::string const& val)
2414 2415 unsigned long codepoint = 0L;
2415 2416 size_t len = val.length();
2416 2417 size_t start = 0;
  2418 + bool is_le = false;
2417 2419 if (is_utf16(val))
2418 2420 {
  2421 + if (static_cast<unsigned char>(val.at(0)) == 0xff)
  2422 + {
  2423 + is_le = true;
  2424 + }
2419 2425 start += 2;
2420 2426 }
2421 2427 // If the string has an odd number of bytes, the last byte is
... ... @@ -2428,10 +2434,12 @@ QUtil::utf16_to_utf8(std::string const& val)
2428 2434 // codepoint not followed by a low codepoint will be
2429 2435 // discarded, and a low codepoint not preceded by a high
2430 2436 // codepoint will just get its low 10 bits output.
  2437 + auto msb = is_le ? i+1 : i;
  2438 + auto lsb = is_le ? i : i+1;
2431 2439 unsigned short bits =
2432 2440 QIntC::to_ushort(
2433   - (static_cast<unsigned char>(val.at(i)) << 8) +
2434   - static_cast<unsigned char>(val.at(i+1)));
  2441 + (static_cast<unsigned char>(val.at(msb)) << 8) +
  2442 + static_cast<unsigned char>(val.at(lsb)));
2435 2443 if ((bits & 0xFC00) == 0xD800)
2436 2444 {
2437 2445 codepoint = 0x10000U + ((bits & 0x3FFU) << 10U);
... ...
libtests/qtest/qutil/qutil.out
... ... @@ -63,6 +63,7 @@ HAGOOGAMAGOOGLE: 0
63 63 0x80000000 -> ff fd
64 64 π
65 65 π
  66 +LE: π
66 67 ---- utf8_to_ascii
67 68 ¿Does π have fingers?
68 69 ?Does ? have fingers?
... ...
libtests/qutil.cc
... ... @@ -303,6 +303,7 @@ void to_utf16_test()
303 303 std::string s(QUtil::utf8_to_utf16("\xcf\x80"));
304 304 std::cout << QUtil::utf16_to_utf8(s) << std::endl;
305 305 std::cout << QUtil::utf16_to_utf8(s + ".") << std::endl;
  306 + std::cout << "LE: " << QUtil::utf16_to_utf8("\xff\xfe\xc0\x03") << std::endl;
306 307 }
307 308  
308 309 void utf8_to_ascii_test()
... ... @@ -388,7 +389,8 @@ void transcoding_test()
388 389 check_analyze("pi = \317\200", true, true, false);
389 390 check_analyze("pi != \317", true, false, false);
390 391 check_analyze("pi != 22/7", false, false, false);
391   - check_analyze(std::string("\xfe\xff\00\x51", 4), true, false, true);
  392 + check_analyze(std::string("\xfe\xff\x00\x51", 4), true, false, true);
  393 + check_analyze(std::string("\xff\xfe\x51\x00", 4), true, false, true);
392 394 std::cout << "analysis done" << std::endl;
393 395 std::string input1("a\302\277b");
394 396 std::string input2("a\317\200b");
... ...
qpdf/qtest/qpdf.test
... ... @@ -73,7 +73,7 @@ flush_tiff_cache();
73 73 show_ntests();
74 74 # ----------
75 75 $td->notify("--- Character Encoding ---");
76   -$n_tests += 3;
  76 +$n_tests += 4;
77 77  
78 78 $td->runtest("PDF doc encoding to Unicode",
79 79 {$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"},
... ... @@ -88,6 +88,13 @@ $td->runtest("UTF-16 encoding errors",
88 88 {$td->FILE => "unicode-errors.out", $td->EXIT_STATUS => 0},
89 89 $td->NORMALIZE_NEWLINES);
90 90  
  91 +# UTF-16LE is not allowed by the PDF spec, but it seems that most
  92 +# readers accept it.
  93 +$td->runtest("UTF-16LE strings",
  94 + {$td->COMMAND => "qpdf --list-attachments --verbose utf16le.pdf"},
  95 + {$td->FILE => "utf16le-attachments.out", $td->EXIT_STATUS => 0},
  96 + $td->NORMALIZE_NEWLINES);
  97 +
91 98 # Tests to exercise QPDFArgParser belong in arg_parser.test in
92 99 # libtests. These tests are supposed to be specific to the qpdf cli.
93 100 # Since they were written prior to moving QPDFArgParser into the
... ...
qpdf/qtest/qpdf/utf16le-attachments.out 0 → 100644
  1 +potato.png -> 6,0
  2 + preferred name: π.png
  3 + all names:
  4 + /F -> π.png
  5 + /UF -> π.png
  6 + all data streams:
  7 + /F -> 6,0
  8 + /UF -> 6,0
... ...
qpdf/qtest/qpdf/utf16le.pdf 0 → 100644
No preview for this file type