Commit f4ca04cec1a0c4a3c8341ff15f68c06bed89c0d7

Authored by Jay Berkenbilt
1 parent 4fb7d133

Fix edge case in character encoding (fixes #778)

Avoid representing as PDF Doc encoding any string whose PDF Doc
encoding representation starts with a UTF-16 or UTF-8 marker.
ChangeLog
  1 +2022-09-26 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Bug fix: avoid using PDF Doc encoding for strings whose PDF Doc
  4 + encoding representation starts with UTF-16 or UTF-8 markers. Fixes
  5 + #778.
  6 +
1 2022-09-14 Jay Berkenbilt <ejb@ql.org> 7 2022-09-14 Jay Berkenbilt <ejb@ql.org>
2 8
3 * 11.1.0: release 9 * 11.1.0: release
libqpdf/QUtil.cc
@@ -1565,10 +1565,38 @@ transcode_utf8( @@ -1565,10 +1565,38 @@ transcode_utf8(
1565 { 1565 {
1566 bool okay = true; 1566 bool okay = true;
1567 result.clear(); 1567 result.clear();
1568 - if (encoding == e_utf16) { 1568 + size_t len = utf8_val.length();
  1569 + switch (encoding) {
  1570 + case e_utf16:
1569 result += "\xfe\xff"; 1571 result += "\xfe\xff";
  1572 + break;
  1573 + case e_pdfdoc:
  1574 + // We need to avoid having the result start with something
  1575 + // that will be interpreted as UTF-16 or UTF-8, meaning we
  1576 + // can't end up with a string that starts with "fe ff"
  1577 + // (UTF-16-BE), "ff fe" (UTF-16-LE, not officially part of the
  1578 + // PDF spec, but recognized by most readers including qpdf),
  1579 + // or "ef bb bf" (UTF-8). It's more efficient to check the
  1580 + // input string to see if it will map to one of those
  1581 + // sequences than to check the output string since all cases
  1582 + // start with the same starting character.
  1583 + if ((len >= 4) && (utf8_val[0] == '\xc3')) {
  1584 + static std::string fe_ff("\xbe\xc3\xbf");
  1585 + static std::string ff_fe("\xbf\xc3\xbe");
  1586 + static std::string ef_bb_bf("\xaf\xc2\xbb\xc2\xbf");
  1587 + // C++-20 has starts_with, but when this was written, qpdf
  1588 + // had a minimum supported version of C++-17.
  1589 + if ((utf8_val.compare(1, 3, fe_ff) == 0) ||
  1590 + (utf8_val.compare(1, 3, ff_fe) == 0) ||
  1591 + (utf8_val.compare(1, 5, ef_bb_bf) == 0)) {
  1592 + result += unknown;
  1593 + okay = false;
  1594 + }
  1595 + }
  1596 + break;
  1597 + default:
  1598 + break;
1570 } 1599 }
1571 - size_t len = utf8_val.length();  
1572 size_t pos = 0; 1600 size_t pos = 0;
1573 while (pos < len) { 1601 while (pos < len) {
1574 bool error = false; 1602 bool error = false;
libtests/qutil.cc
@@ -436,6 +436,21 @@ transcoding_test() @@ -436,6 +436,21 @@ transcoding_test()
436 assert(!QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8)); 436 assert(!QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8));
437 std::cout << other_to_utf8 << std::endl; 437 std::cout << other_to_utf8 << std::endl;
438 std::cout << "done other characters" << std::endl; 438 std::cout << "done other characters" << std::endl;
  439 + // These valid UTF8 strings when converted to PDFDoc would end up
  440 + // with a byte sequence that would be recognized as UTF-8 or
  441 + // UTF-16 rather than PDFDoc. A special case is required to store
  442 + // them as UTF-16 rather than PDFDoc.
  443 + static std::string fe_ff("\xc3\xbe\xc3\xbf potato");
  444 + static std::string ff_fe("\xc3\xbf\xc3\xbe potato");
  445 + static std::string ef_bb_bf("\xc3\xaf\xc2\xbb\xc2\xbf potato");
  446 + assert(!QUtil::utf8_to_pdf_doc(fe_ff, pdfdoc));
  447 + assert(pdfdoc == "?\xfe\xff potato");
  448 + assert(!QUtil::utf8_to_pdf_doc(ff_fe, pdfdoc));
  449 + assert(pdfdoc == "?\xff\xfe potato");
  450 + assert(!QUtil::utf8_to_pdf_doc(ef_bb_bf, pdfdoc));
  451 + assert(pdfdoc == "?\xef\xbb\xbf potato");
  452 + assert(QUtil::utf8_to_pdf_doc("\xc3\xbe\xc3\xbe", pdfdoc));
  453 + assert(QUtil::utf8_to_pdf_doc("\xc3\xaf\xc2\xbb\xc2\xbe", pdfdoc));
439 } 454 }
440 455
441 void 456 void
qpdf/qtest/qpdf/unicode.in
@@ -5,3 +5,8 @@ If you think wwwwww is good, you should try ʬʬʬʬʬʬ. @@ -5,3 +5,8 @@ If you think wwwwww is good, you should try ʬʬʬʬʬʬ.
5 𝄞 𝄢 𝄪 𝅂 5 𝄞 𝄢 𝄪 𝅂
6 This can be encoded in ASCII. 6 This can be encoded in ASCII.
7 This can be encoded in PDFDocEncoding (€). 7 This can be encoded in PDFDocEncoding (€).
  8 +þÿ -- PDFDoc would look like UTF-16-BE
  9 +ÿþ -- PDFDoc would look like UTF-16-LE
  10 +ï»¿ -- PDFDoc would look like UTF-8
  11 +ï»» -- PDFDoc okay
  12 +þþ -- PDFDoc okay
qpdf/qtest/qpdf/unicode.out
@@ -5,3 +5,8 @@ If you think wwwwww is good, you should try ʬʬʬʬʬʬ. // <feff00490066002000 @@ -5,3 +5,8 @@ If you think wwwwww is good, you should try ʬʬʬʬʬʬ. // <feff00490066002000
5 𝄞 𝄢 𝄪 𝅂 // <feffd834dd1e0020d834dd220020d834dd2a0020d834dd42> 5 𝄞 𝄢 𝄪 𝅂 // <feffd834dd1e0020d834dd220020d834dd2a0020d834dd42>
6 This can be encoded in ASCII. // <546869732063616e20626520656e636f64656420696e2041534349492e> 6 This can be encoded in ASCII. // <546869732063616e20626520656e636f64656420696e2041534349492e>
7 This can be encoded in PDFDocEncoding (€). // <546869732063616e20626520656e636f64656420696e20504446446f63456e636f64696e672028a0292e> 7 This can be encoded in PDFDocEncoding (€). // <546869732063616e20626520656e636f64656420696e20504446446f63456e636f64696e672028a0292e>
  8 +þÿ -- PDFDoc would look like UTF-16-BE // <feff00fe00ff0020002d002d00200050004400460044006f006300200077006f0075006c00640020006c006f006f006b0020006c0069006b00650020005500540046002d00310036002d00420045>
  9 +ÿþ -- PDFDoc would look like UTF-16-LE // <feff00ff00fe0020002d002d00200050004400460044006f006300200077006f0075006c00640020006c006f006f006b0020006c0069006b00650020005500540046002d00310036002d004c0045>
  10 +ï»¿ -- PDFDoc would look like UTF-8 // <feff00ef00bb00bf0020002d002d00200050004400460044006f006300200077006f0075006c00640020006c006f006f006b0020006c0069006b00650020005500540046002d0038>
  11 +ï»» -- PDFDoc okay // <efbbbb202d2d20504446446f63206f6b6179>
  12 +þþ -- PDFDoc okay // <fefe202d2d20504446446f63206f6b6179>