Commit f4ca04cec1a0c4a3c8341ff15f68c06bed89c0d7
1 parent
4fb7d133
Fix edge case in character encoding (fixes #778)
Avoid representing as PDF Doc encoding any string whose PDF Doc encoding representation starts with a UTF-16 or UTF-8 marker.
Showing 5 changed files with 61 additions and 2 deletions
ChangeLog
| 1 | +2022-09-26 Jay Berkenbilt <ejb@ql.org> | ||
| 2 | + | ||
| 3 | + * Bug fix: avoid using PDF Doc encoding for strings whose PDF Doc | ||
| 4 | + encoding representation starts with UTF-16 or UTF-8 markers. Fixes | ||
| 5 | + #778. | ||
| 6 | + | ||
| 1 | 2022-09-14 Jay Berkenbilt <ejb@ql.org> | 7 | 2022-09-14 Jay Berkenbilt <ejb@ql.org> |
| 2 | 8 | ||
| 3 | * 11.1.0: release | 9 | * 11.1.0: release |
libqpdf/QUtil.cc
| @@ -1565,10 +1565,38 @@ transcode_utf8( | @@ -1565,10 +1565,38 @@ transcode_utf8( | ||
| 1565 | { | 1565 | { |
| 1566 | bool okay = true; | 1566 | bool okay = true; |
| 1567 | result.clear(); | 1567 | result.clear(); |
| 1568 | - if (encoding == e_utf16) { | 1568 | + size_t len = utf8_val.length(); |
| 1569 | + switch (encoding) { | ||
| 1570 | + case e_utf16: | ||
| 1569 | result += "\xfe\xff"; | 1571 | result += "\xfe\xff"; |
| 1572 | + break; | ||
| 1573 | + case e_pdfdoc: | ||
| 1574 | + // We need to avoid having the result start with something | ||
| 1575 | + // that will be interpreted as UTF-16 or UTF-8, meaning we | ||
| 1576 | + // can't end up with a string that starts with "fe ff" | ||
| 1577 | + // (UTF-16-BE), "ff fe" (UTF-16-LE, not officially part of the | ||
| 1578 | + // PDF spec, but recognized by most readers including qpdf), | ||
| 1579 | + // or "ef bb bf" (UTF-8). It's more efficient to check the | ||
| 1580 | + // input string to see if it will map to one of those | ||
| 1581 | + // sequences than to check the output string since all cases | ||
| 1582 | + // start with the same starting character. | ||
| 1583 | + if ((len >= 4) && (utf8_val[0] == '\xc3')) { | ||
| 1584 | + static std::string fe_ff("\xbe\xc3\xbf"); | ||
| 1585 | + static std::string ff_fe("\xbf\xc3\xbe"); | ||
| 1586 | + static std::string ef_bb_bf("\xaf\xc2\xbb\xc2\xbf"); | ||
| 1587 | + // C++-20 has starts_with, but when this was written, qpdf | ||
| 1588 | + // had a minimum supported version of C++-17. | ||
| 1589 | + if ((utf8_val.compare(1, 3, fe_ff) == 0) || | ||
| 1590 | + (utf8_val.compare(1, 3, ff_fe) == 0) || | ||
| 1591 | + (utf8_val.compare(1, 5, ef_bb_bf) == 0)) { | ||
| 1592 | + result += unknown; | ||
| 1593 | + okay = false; | ||
| 1594 | + } | ||
| 1595 | + } | ||
| 1596 | + break; | ||
| 1597 | + default: | ||
| 1598 | + break; | ||
| 1570 | } | 1599 | } |
| 1571 | - size_t len = utf8_val.length(); | ||
| 1572 | size_t pos = 0; | 1600 | size_t pos = 0; |
| 1573 | while (pos < len) { | 1601 | while (pos < len) { |
| 1574 | bool error = false; | 1602 | bool error = false; |
libtests/qutil.cc
| @@ -436,6 +436,21 @@ transcoding_test() | @@ -436,6 +436,21 @@ transcoding_test() | ||
| 436 | assert(!QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8)); | 436 | assert(!QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8)); |
| 437 | std::cout << other_to_utf8 << std::endl; | 437 | std::cout << other_to_utf8 << std::endl; |
| 438 | std::cout << "done other characters" << std::endl; | 438 | std::cout << "done other characters" << std::endl; |
| 439 | + // These valid UTF8 strings when converted to PDFDoc would end up | ||
| 440 | + // with a byte sequence that would be recognized as UTF-8 or | ||
| 441 | + // UTF-16 rather than PDFDoc. A special case is required to store | ||
| 442 | + // them as UTF-16 rather than PDFDoc. | ||
| 443 | + static std::string fe_ff("\xc3\xbe\xc3\xbf potato"); | ||
| 444 | + static std::string ff_fe("\xc3\xbf\xc3\xbe potato"); | ||
| 445 | + static std::string ef_bb_bf("\xc3\xaf\xc2\xbb\xc2\xbf potato"); | ||
| 446 | + assert(!QUtil::utf8_to_pdf_doc(fe_ff, pdfdoc)); | ||
| 447 | + assert(pdfdoc == "?\xfe\xff potato"); | ||
| 448 | + assert(!QUtil::utf8_to_pdf_doc(ff_fe, pdfdoc)); | ||
| 449 | + assert(pdfdoc == "?\xff\xfe potato"); | ||
| 450 | + assert(!QUtil::utf8_to_pdf_doc(ef_bb_bf, pdfdoc)); | ||
| 451 | + assert(pdfdoc == "?\xef\xbb\xbf potato"); | ||
| 452 | + assert(QUtil::utf8_to_pdf_doc("\xc3\xbe\xc3\xbe", pdfdoc)); | ||
| 453 | + assert(QUtil::utf8_to_pdf_doc("\xc3\xaf\xc2\xbb\xc2\xbe", pdfdoc)); | ||
| 439 | } | 454 | } |
| 440 | 455 | ||
| 441 | void | 456 | void |
qpdf/qtest/qpdf/unicode.in
| @@ -5,3 +5,8 @@ If you think wwwwww is good, you should try ʬʬʬʬʬʬ. | @@ -5,3 +5,8 @@ If you think wwwwww is good, you should try ʬʬʬʬʬʬ. | ||
| 5 | 𝄞 𝄢 𝄪 𝅂 | 5 | 𝄞 𝄢 𝄪 𝅂 |
| 6 | This can be encoded in ASCII. | 6 | This can be encoded in ASCII. |
| 7 | This can be encoded in PDFDocEncoding (€). | 7 | This can be encoded in PDFDocEncoding (€). |
| 8 | +þÿ -- PDFDoc would look like UTF-16-BE | ||
| 9 | +ÿþ -- PDFDoc would look like UTF-16-LE | ||
| 10 | + ï»¿ -- PDFDoc would look like UTF-8 | ||
| 11 | +ï»» -- PDFDoc okay | ||
| 12 | +þþ -- PDFDoc okay |
qpdf/qtest/qpdf/unicode.out
| @@ -5,3 +5,8 @@ If you think wwwwww is good, you should try ʬʬʬʬʬʬ. // <feff00490066002000 | @@ -5,3 +5,8 @@ If you think wwwwww is good, you should try ʬʬʬʬʬʬ. // <feff00490066002000 | ||
| 5 | 𝄞 𝄢 𝄪 𝅂 // <feffd834dd1e0020d834dd220020d834dd2a0020d834dd42> | 5 | 𝄞 𝄢 𝄪 𝅂 // <feffd834dd1e0020d834dd220020d834dd2a0020d834dd42> |
| 6 | This can be encoded in ASCII. // <546869732063616e20626520656e636f64656420696e2041534349492e> | 6 | This can be encoded in ASCII. // <546869732063616e20626520656e636f64656420696e2041534349492e> |
| 7 | This can be encoded in PDFDocEncoding (€). // <546869732063616e20626520656e636f64656420696e20504446446f63456e636f64696e672028a0292e> | 7 | This can be encoded in PDFDocEncoding (€). // <546869732063616e20626520656e636f64656420696e20504446446f63456e636f64696e672028a0292e> |
| 8 | +þÿ -- PDFDoc would look like UTF-16-BE // <feff00fe00ff0020002d002d00200050004400460044006f006300200077006f0075006c00640020006c006f006f006b0020006c0069006b00650020005500540046002d00310036002d00420045> | ||
| 9 | +ÿþ -- PDFDoc would look like UTF-16-LE // <feff00ff00fe0020002d002d00200050004400460044006f006300200077006f0075006c00640020006c006f006f006b0020006c0069006b00650020005500540046002d00310036002d004c0045> | ||
| 10 | + ï»¿ -- PDFDoc would look like UTF-8 // <feff00ef00bb00bf0020002d002d00200050004400460044006f006300200077006f0075006c00640020006c006f006f006b0020006c0069006b00650020005500540046002d0038> | ||
| 11 | +ï»» -- PDFDoc okay // <efbbbb202d2d20504446446f63206f6b6179> | ||
| 12 | +þþ -- PDFDoc okay // <fefe202d2d20504446446f63206f6b6179> |