Commit a478cbb6dc0e630b919813ad0e7ae1a72510c69d
1 parent: fbd3e56d
Silently/transparently recognize UTF-16LE as UTF-16 (fixes #649)
The PDF spec only allows UTF-16BE, but most readers seem to accept UTF-16LE as well, so now qpdf does too.
Showing 8 changed files with 45 additions and 10 deletions
ChangeLog
| 1 | 1 | 2022-02-15 Jay Berkenbilt <ejb@ql.org> |
| 2 | 2 | |
| 3 | + * When analyzing PDF strings, recognize UTF-16LE as UTF-16. The | |
| 4 | + PDF spec only allows UTF-16BE, but most readers seem to allow | |
| 5 | + both. Fixes #649. | |
| 6 | + | |
| 3 | 7 | * Bug fix: 10.6.0 inadvertently removed an unknown/undocumented |
| 4 | 8 | CLI parsing feature, which has been restored in 10.6.2. Fixes #652. |
| 5 | 9 | ... | ... |
include/qpdf/QUtil.hh
| ... | ... | @@ -267,8 +267,11 @@ namespace QUtil |
| 267 | 267 | QPDF_DLL |
| 268 | 268 | std::string toUTF16(unsigned long uval); |
| 269 | 269 | |
| 270 | - // Test whether this is a UTF-16 big-endian string. This is | |
| 271 | - // indicated by first two bytes being 0xFE 0xFF. | |
| 270 | + // Test whether this is a UTF-16 string. This is indicated by | |
| 271 | + // first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE | |
| 272 | + // (little-endian). Starting in qpdf 10.6.2, this detects | |
| 273 | + // little-endian as well as big-endian. Even though the PDF spec | |
| 274 | + // doesn't allow little-endian, most readers seem to accept it. | |
| 272 | 275 | QPDF_DLL |
| 273 | 276 | bool is_utf16(std::string const&); |
| 274 | 277 | |
| ... | ... | @@ -309,8 +312,8 @@ namespace QUtil |
| 309 | 312 | bool utf8_to_pdf_doc( |
| 310 | 313 | std::string const& utf8, std::string& pdfdoc, char unknown_char = '?'); |
| 311 | 314 | |
| 312 | - // Convert a UTF-16 big-endian encoded string to UTF-8. | |
| 313 | - // Unrepresentable code points are converted to U+FFFD. | |
| 315 | + // Convert a UTF-16 encoded string to UTF-8. Unrepresentable code | |
| 316 | + // points are converted to U+FFFD. | |
| 314 | 317 | QPDF_DLL |
| 315 | 318 | std::string utf16_to_utf8(std::string const& utf16); |
| 316 | 319 | |
| ... | ... | @@ -331,7 +334,9 @@ namespace QUtil |
| 331 | 334 | // help us guess. If there are no characters with the high bit |
| 332 | 335 | // set, has_8bit_chars is false, and the other values are also |
| 333 | 336 | // false, even though ASCII strings are valid UTF-8. is_valid_utf8 |
| 334 | - // means that the string is non-trivially valid UTF-8. | |
| 337 | + // means that the string is non-trivially valid UTF-8. Although | |
| 338 | + // the PDF spec requires UTF-16 to be UTF-16BE, qpdf (and just | |
| 339 | + // about everything else) accepts UTF-16LE (as of 10.6.2). | |
| 335 | 340 | QPDF_DLL |
| 336 | 341 | void analyze_encoding(std::string const& str, |
| 337 | 342 | bool& has_8bit_chars, | ... | ... |
libqpdf/QUtil.cc
| ... | ... | @@ -2400,7 +2400,8 @@ bool |
| 2400 | 2400 | QUtil::is_utf16(std::string const& val) |
| 2401 | 2401 | { |
| 2402 | 2402 | return ((val.length() >= 2) && |
| 2403 | - (val.at(0) == '\xfe') && (val.at(1) == '\xff')); | |
| 2403 | + (((val.at(0) == '\xfe') && (val.at(1) == '\xff')) || | |
| 2404 | + ((val.at(0) == '\xff') && (val.at(1) == '\xfe')))); | |
| 2404 | 2405 | } |
| 2405 | 2406 | |
| 2406 | 2407 | std::string |
| ... | ... | @@ -2414,8 +2415,13 @@ QUtil::utf16_to_utf8(std::string const& val) |
| 2414 | 2415 | unsigned long codepoint = 0L; |
| 2415 | 2416 | size_t len = val.length(); |
| 2416 | 2417 | size_t start = 0; |
| 2418 | + bool is_le = false; | |
| 2417 | 2419 | if (is_utf16(val)) |
| 2418 | 2420 | { |
| 2421 | + if (static_cast<unsigned char>(val.at(0)) == 0xff) | |
| 2422 | + { | |
| 2423 | + is_le = true; | |
| 2424 | + } | |
| 2419 | 2425 | start += 2; |
| 2420 | 2426 | } |
| 2421 | 2427 | // If the string has an odd number of bytes, the last byte is |
| ... | ... | @@ -2428,10 +2434,12 @@ QUtil::utf16_to_utf8(std::string const& val) |
| 2428 | 2434 | // codepoint not followed by a low codepoint will be |
| 2429 | 2435 | // discarded, and a low codepoint not preceded by a high |
| 2430 | 2436 | // codepoint will just get its low 10 bits output. |
| 2437 | + auto msb = is_le ? i+1 : i; | |
| 2438 | + auto lsb = is_le ? i : i+1; | |
| 2431 | 2439 | unsigned short bits = |
| 2432 | 2440 | QIntC::to_ushort( |
| 2433 | - (static_cast<unsigned char>(val.at(i)) << 8) + | |
| 2434 | - static_cast<unsigned char>(val.at(i+1))); | |
| 2441 | + (static_cast<unsigned char>(val.at(msb)) << 8) + | |
| 2442 | + static_cast<unsigned char>(val.at(lsb))); | |
| 2435 | 2443 | if ((bits & 0xFC00) == 0xD800) |
| 2436 | 2444 | { |
| 2437 | 2445 | codepoint = 0x10000U + ((bits & 0x3FFU) << 10U); | ... | ... |
libtests/qtest/qutil/qutil.out
libtests/qutil.cc
| ... | ... | @@ -303,6 +303,7 @@ void to_utf16_test() |
| 303 | 303 | std::string s(QUtil::utf8_to_utf16("\xcf\x80")); |
| 304 | 304 | std::cout << QUtil::utf16_to_utf8(s) << std::endl; |
| 305 | 305 | std::cout << QUtil::utf16_to_utf8(s + ".") << std::endl; |
| 306 | + std::cout << "LE: " << QUtil::utf16_to_utf8("\xff\xfe\xc0\x03") << std::endl; | |
| 306 | 307 | } |
| 307 | 308 | |
| 308 | 309 | void utf8_to_ascii_test() |
| ... | ... | @@ -388,7 +389,8 @@ void transcoding_test() |
| 388 | 389 | check_analyze("pi = \317\200", true, true, false); |
| 389 | 390 | check_analyze("pi != \317", true, false, false); |
| 390 | 391 | check_analyze("pi != 22/7", false, false, false); |
| 391 | - check_analyze(std::string("\xfe\xff\00\x51", 4), true, false, true); | |
| 392 | + check_analyze(std::string("\xfe\xff\x00\x51", 4), true, false, true); | |
| 393 | + check_analyze(std::string("\xff\xfe\x51\x00", 4), true, false, true); | |
| 392 | 394 | std::cout << "analysis done" << std::endl; |
| 393 | 395 | std::string input1("a\302\277b"); |
| 394 | 396 | std::string input2("a\317\200b"); | ... | ... |
qpdf/qtest/qpdf.test
| ... | ... | @@ -73,7 +73,7 @@ flush_tiff_cache(); |
| 73 | 73 | show_ntests(); |
| 74 | 74 | # ---------- |
| 75 | 75 | $td->notify("--- Character Encoding ---"); |
| 76 | -$n_tests += 3; | |
| 76 | +$n_tests += 4; | |
| 77 | 77 | |
| 78 | 78 | $td->runtest("PDF doc encoding to Unicode", |
| 79 | 79 | {$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"}, |
| ... | ... | @@ -88,6 +88,13 @@ $td->runtest("UTF-16 encoding errors", |
| 88 | 88 | {$td->FILE => "unicode-errors.out", $td->EXIT_STATUS => 0}, |
| 89 | 89 | $td->NORMALIZE_NEWLINES); |
| 90 | 90 | |
| 91 | +# UTF-16LE is not allowed by the PDF spec, but it seems that most | |
| 92 | +# readers accept it. | |
| 93 | +$td->runtest("UTF-16LE strings", | |
| 94 | + {$td->COMMAND => "qpdf --list-attachments --verbose utf16le.pdf"}, | |
| 95 | + {$td->FILE => "utf16le-attachments.out", $td->EXIT_STATUS => 0}, | |
| 96 | + $td->NORMALIZE_NEWLINES); | |
| 97 | + | |
| 91 | 98 | # Tests to exercise QPDFArgParser belong in arg_parser.test in |
| 92 | 99 | # libtests. These tests are supposed to be specific to the qpdf cli. |
| 93 | 100 | # Since they were written prior to moving QPDFArgParser into the | ... | ... |
qpdf/qtest/qpdf/utf16le-attachments.out
0 → 100644
qpdf/qtest/qpdf/utf16le.pdf
0 → 100644
No preview for this file type