Commit f4ca04cec1a0c4a3c8341ff15f68c06bed89c0d7
1 parent
4fb7d133
Fix edge case in character encoding (fixes #778)
Avoid representing as PDF Doc encoding any string whose PDF Doc encoding representation starts with a UTF-16 or UTF-8 marker.
Showing
5 changed files
with
61 additions
and
2 deletions
ChangeLog
libqpdf/QUtil.cc
| ... | ... | @@ -1565,10 +1565,38 @@ transcode_utf8( |
| 1565 | 1565 | { |
| 1566 | 1566 | bool okay = true; |
| 1567 | 1567 | result.clear(); |
| 1568 | - if (encoding == e_utf16) { | |
| 1568 | + size_t len = utf8_val.length(); | |
| 1569 | + switch (encoding) { | |
| 1570 | + case e_utf16: | |
| 1569 | 1571 | result += "\xfe\xff"; |
| 1572 | + break; | |
| 1573 | + case e_pdfdoc: | |
| 1574 | + // We need to avoid having the result start with something | |
| 1575 | + // that will be interpreted as UTF-16 or UTF-8, meaning we | |
| 1576 | + // can't end up with a string that starts with "fe ff" | |
| 1577 | + // (UTF-16-BE), "ff fe" (UTF-16-LE, not officially part of the | |
| 1578 | + // PDF spec, but recognized by most readers including qpdf), | |
| 1579 | + // or "ef bb bf" (UTF-8). It's more efficient to check the | |
| 1580 | + // input string to see if it will map to one of those | |
| 1581 | + // sequences than to check the output string since all cases | |
| 1582 | + // start with the same starting character. | |
| 1583 | + if ((len >= 4) && (utf8_val[0] == '\xc3')) { | |
| 1584 | + static std::string fe_ff("\xbe\xc3\xbf"); | |
| 1585 | + static std::string ff_fe("\xbf\xc3\xbe"); | |
| 1586 | + static std::string ef_bb_bf("\xaf\xc2\xbb\xc2\xbf"); | |
| 1587 | + // C++-20 has starts_with, but when this was written, qpdf | |
| 1588 | + // had a minimum supported version of C++-17. | |
| 1589 | + if ((utf8_val.compare(1, 3, fe_ff) == 0) || | |
| 1590 | + (utf8_val.compare(1, 3, ff_fe) == 0) || | |
| 1591 | + (utf8_val.compare(1, 5, ef_bb_bf) == 0)) { | |
| 1592 | + result += unknown; | |
| 1593 | + okay = false; | |
| 1594 | + } | |
| 1595 | + } | |
| 1596 | + break; | |
| 1597 | + default: | |
| 1598 | + break; | |
| 1570 | 1599 | } |
| 1571 | - size_t len = utf8_val.length(); | |
| 1572 | 1600 | size_t pos = 0; |
| 1573 | 1601 | while (pos < len) { |
| 1574 | 1602 | bool error = false; | ... | ... |
libtests/qutil.cc
| ... | ... | @@ -436,6 +436,21 @@ transcoding_test() |
| 436 | 436 | assert(!QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8)); |
| 437 | 437 | std::cout << other_to_utf8 << std::endl; |
| 438 | 438 | std::cout << "done other characters" << std::endl; |
| 439 | + // These valid UTF8 strings when converted to PDFDoc would end up | |
| 440 | + // with a byte sequence that would be recognized as UTF-8 or | |
| 441 | + // UTF-16 rather than PDFDoc. A special case is required to store | |
| 442 | + // them as UTF-16 rather than PDFDoc. | |
| 443 | + static std::string fe_ff("\xc3\xbe\xc3\xbf potato"); | |
| 444 | + static std::string ff_fe("\xc3\xbf\xc3\xbe potato"); | |
| 445 | + static std::string ef_bb_bf("\xc3\xaf\xc2\xbb\xc2\xbf potato"); | |
| 446 | + assert(!QUtil::utf8_to_pdf_doc(fe_ff, pdfdoc)); | |
| 447 | + assert(pdfdoc == "?\xfe\xff potato"); | |
| 448 | + assert(!QUtil::utf8_to_pdf_doc(ff_fe, pdfdoc)); | |
| 449 | + assert(pdfdoc == "?\xff\xfe potato"); | |
| 450 | + assert(!QUtil::utf8_to_pdf_doc(ef_bb_bf, pdfdoc)); | |
| 451 | + assert(pdfdoc == "?\xef\xbb\xbf potato"); | |
| 452 | + assert(QUtil::utf8_to_pdf_doc("\xc3\xbe\xc3\xbe", pdfdoc)); | |
| 453 | + assert(QUtil::utf8_to_pdf_doc("\xc3\xaf\xc2\xbb\xc2\xbe", pdfdoc)); | |
| 439 | 454 | } |
| 440 | 455 | |
| 441 | 456 | void | ... | ... |
qpdf/qtest/qpdf/unicode.in
| ... | ... | @@ -5,3 +5,8 @@ If you think wwwwww is good, you should try ʬʬʬʬʬʬ. |
| 5 | 5 | 𝄞 𝄢 𝄪 𝅂 |
| 6 | 6 | This can be encoded in ASCII. |
| 7 | 7 | This can be encoded in PDFDocEncoding (€). |
| 8 | +þÿ -- PDFDoc would look like UTF-16-BE | |
| 9 | +ÿþ -- PDFDoc would look like UTF-16-LE | |
| 10 | +ï»¿ -- PDFDoc would look like UTF-8 | |
| 11 | +ï»» -- PDFDoc okay | |
| 12 | +þþ -- PDFDoc okay | ... | ... |
qpdf/qtest/qpdf/unicode.out
| ... | ... | @@ -5,3 +5,8 @@ If you think wwwwww is good, you should try ʬʬʬʬʬʬ. // <feff00490066002000 |
| 5 | 5 | 𝄞 𝄢 𝄪 𝅂 // <feffd834dd1e0020d834dd220020d834dd2a0020d834dd42> |
| 6 | 6 | This can be encoded in ASCII. // <546869732063616e20626520656e636f64656420696e2041534349492e> |
| 7 | 7 | This can be encoded in PDFDocEncoding (€). // <546869732063616e20626520656e636f64656420696e20504446446f63456e636f64696e672028a0292e> |
| 8 | +þÿ -- PDFDoc would look like UTF-16-BE // <feff00fe00ff0020002d002d00200050004400460044006f006300200077006f0075006c00640020006c006f006f006b0020006c0069006b00650020005500540046002d00310036002d00420045> | |
| 9 | +ÿþ -- PDFDoc would look like UTF-16-LE // <feff00ff00fe0020002d002d00200050004400460044006f006300200077006f0075006c00640020006c006f006f006b0020006c0069006b00650020005500540046002d00310036002d004c0045> | |
| 10 | +ï»¿ -- PDFDoc would look like UTF-8 // <feff00ef00bb00bf0020002d002d00200050004400460044006f006300200077006f0075006c00640020006c006f006f006b0020006c0069006b00650020005500540046002d0038> | |
| 11 | +ï»» -- PDFDoc okay // <efbbbb202d2d20504446446f63206f6b6179> | |
| 12 | +þþ -- PDFDoc okay // <fefe202d2d20504446446f63206f6b6179> | ... | ... |