Commit 22b35c49289157204b35a851f3cb9cade9e98559

Authored by Jay Berkenbilt
1 parent 5bbb0d4c

Expose QUtil::get_next_utf8_codepoint

.dir-locals.el
1   -((nil . ((indent-tabs-mode . t)
  1 +((nil . ((indent-tabs-mode . nil)
2 2 (qpdf-cc-style
3 3 .
4 4 ("qpdf"
... ...
ChangeLog
  1 +2022-04-23 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Add new method QUtil::is_explicit_utf8 that tests whether a
  4 + string is explicitly marked as being UTF-8 encoded, as allowed by
  5 + the PDF 2.0 spec. Such a string starts with the bytes 0xEF 0xBB
  6 + 0xBF, which is the UTF-8 encoding of U+FEFF.
  7 +
  8 + * Add new method QUtil::get_next_utf8_codepoint as a low-level
  9 + helper for iterating through the UTF-8 characters in a byte
  10 + string.
  11 +
1 12 2022-04-16 Jay Berkenbilt <ejb@ql.org>
2 13  
3 14 * Breaking CLI change: the default value for --json is now
... ...
... ... @@ -11,9 +11,6 @@ In order:
11 11 Other (do in any order):
12 12  
13 13 Misc
14   -* Consider exposing get_next_utf8_codepoint in QUtil
15   -* Add QUtil::is_explicit_utf8 that does what QPDF_String::getUTF8Val
16   - does to detect UTF-8 encoded strings per PDF 2.0 spec.
17 14 * Add an option --ignore-encryption to ignore encryption information
18 15 and treat encrypted files as if they weren't encrypted. This should
19 16 make it possible to solve #598 (--show-encryption without a
... ...
include/qpdf/QUtil.hh
... ... @@ -268,14 +268,33 @@ namespace QUtil
268 268 QPDF_DLL
269 269 std::string toUTF16(unsigned long uval);
270 270  
  271 + // If utf8_val.at(pos) points to the beginning of a valid
  272 + // UTF-8-encoded character, return the codepoint of the character
  273 + // and set error to false. Otherwise, return 0xfffd and set error
  274 + // to true. In all cases, pos is advanced to the next position
  275 + // that may begin a valid character. When the string has been
  276 + // consumed, pos will be set to the string length. It is an error
  277 + // to pass a value of pos that is greater than or equal to the
  278 + // length of the string.
  279 + QPDF_DLL
  280 + unsigned long get_next_utf8_codepoint(
  281 + std::string const& utf8_val, size_t& pos, bool& error);
  282 +
271 283 // Test whether this is a UTF-16 string. This is indicated by
272 284 // first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE
273   - // (little-endian). Starting in qpdf 10.6.2, this detects
  285 + // (little-endian), each of which is the encoding of U+FEFF, the
  286 + // Unicode byte order mark. Starting in qpdf 10.6.2, this detects
274 287 // little-endian as well as big-endian. Even though the PDF spec
275 288 // doesn't allow little-endian, most readers seem to accept it.
276 289 QPDF_DLL
277 290 bool is_utf16(std::string const&);
278 291  
  292 + // Test whether this is an explicit UTF-8 string as allowed by the
  293 + // PDF 2.0 spec. This is indicated by first three bytes being 0xEF
  294 + // 0xBB 0xBF, which is the UTF-8 encoding of U+FEFF.
  295 + QPDF_DLL
  296 + bool is_explicit_utf8(std::string const&);
  297 +
279 298 // Convert a UTF-8 encoded string to UTF-16 big-endian.
280 299 // Unrepresentable code points are converted to U+FFFD.
281 300 QPDF_DLL
... ...
libqpdf/QPDF_String.cc
... ... @@ -166,11 +166,9 @@ QPDF_String::getUTF8Val() const
166 166 {
167 167 if (QUtil::is_utf16(this->val)) {
168 168 return QUtil::utf16_to_utf8(this->val);
169   - } else if (
170   - (val.length() >= 3) && (val.at(0) == '\xEF') && (val.at(1) == '\xBB') &&
171   - (val.at(2) == '\xBF')) {
  169 + } else if (QUtil::is_explicit_utf8(this->val)) {
172 170 // PDF 2.0 allows UTF-8 strings when explicitly prefixed with
173   - // the above bytes, which is just UTF-8 encoding of U+FEFF.
  171 + // the three-byte representation of U+FEFF.
174 172 return this->val.substr(3);
175 173 } else {
176 174 return QUtil::pdf_doc_to_utf8(this->val);
... ...
libqpdf/QUtil.cc
... ... @@ -1529,10 +1529,11 @@ encode_pdfdoc(unsigned long codepoint)
1529 1529 }
1530 1530  
1531 1531 unsigned long
1532   -get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
  1532 +QUtil::get_next_utf8_codepoint(
  1533 + std::string const& utf8_val, size_t& pos, bool& error)
1533 1534 {
1534 1535 size_t len = utf8_val.length();
1535   - unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos));
  1536 + unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++));
1536 1537 error = false;
1537 1538 if (ch < 128) {
1538 1539 return static_cast<unsigned long>(ch);
... ... @@ -1547,7 +1548,7 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
1547 1548 bit_check >>= 1;
1548 1549 }
1549 1550 if (((bytes_needed > 5) || (bytes_needed < 1)) ||
1550   - ((pos + bytes_needed) >= len)) {
  1551 + ((pos + bytes_needed) > len)) {
1551 1552 error = true;
1552 1553 return 0xfffd;
1553 1554 }
... ... @@ -1555,11 +1556,11 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
1555 1556 unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear);
1556 1557 while (bytes_needed > 0) {
1557 1558 --bytes_needed;
1558   - ch = static_cast<unsigned char>(utf8_val.at(++pos));
  1559 + ch = static_cast<unsigned char>(utf8_val.at(pos++));
1559 1560 if ((ch & 0xc0) != 0x80) {
1560 1561 --pos;
1561   - codepoint = 0xfffd;
1562   - break;
  1562 + error = true;
  1563 + return 0xfffd;
1563 1564 }
1564 1565 codepoint <<= 6;
1565 1566 codepoint += (ch & 0x3f);
... ... @@ -1580,9 +1581,11 @@ transcode_utf8(
1580 1581 result += "\xfe\xff";
1581 1582 }
1582 1583 size_t len = utf8_val.length();
1583   - for (size_t i = 0; i < len; ++i) {
  1584 + size_t pos = 0;
  1585 + while (pos < len) {
1584 1586 bool error = false;
1585   - unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error);
  1587 + unsigned long codepoint =
  1588 + QUtil::get_next_utf8_codepoint(utf8_val, pos, error);
1586 1589 if (error) {
1587 1590 okay = false;
1588 1591 if (encoding == e_utf16) {
... ... @@ -1710,6 +1713,15 @@ QUtil::is_utf16(std::string const& val)
1710 1713 ((val.at(0) == '\xff') && (val.at(1) == '\xfe'))));
1711 1714 }
1712 1715  
  1716 +bool
  1717 +QUtil::is_explicit_utf8(std::string const& val)
  1718 +{
  1719 + // QPDF_String.cc knows that this is a 3-byte sequence.
  1720 + return (
  1721 + (val.length() >= 3) && (val.at(0) == '\xef') && (val.at(1) == '\xbb') &&
  1722 + (val.at(2) == '\xbf'));
  1723 +}
  1724 +
1713 1725 std::string
1714 1726 QUtil::utf16_to_utf8(std::string const& val)
1715 1727 {
... ... @@ -1826,10 +1838,11 @@ QUtil::analyze_encoding(
1826 1838 return;
1827 1839 }
1828 1840 size_t len = val.length();
  1841 + size_t pos = 0;
1829 1842 bool any_errors = false;
1830   - for (size_t i = 0; i < len; ++i) {
  1843 + while (pos < len) {
1831 1844 bool error = false;
1832   - unsigned long codepoint = get_next_utf8_codepoint(val, i, error);
  1845 + unsigned long codepoint = get_next_utf8_codepoint(val, pos, error);
1833 1846 if (error) {
1834 1847 any_errors = true;
1835 1848 }
... ...
libtests/qutil.cc
... ... @@ -240,6 +240,33 @@ print_utf8(unsigned long val)
240 240 }
241 241 }
242 242 std::cout << std::endl;
  243 +
  244 + // Boundary conditions for QUtil::get_next_utf8_codepoint, which is
  245 + // also tested indirectly through test_pdf_unicode.cc.
  246 + std::string utf8 = "\xcf\x80\xcf\x30\xEF\xBF\x30\x31\xcf";
  247 + size_t pos = 0;
  248 + bool error = false;
  249 + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x3c0);
  250 + assert(pos == 2);
  251 + assert(!error);
  252 + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
  253 + assert(pos == 3);
  254 + assert(error);
  255 + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x30);
  256 + assert(pos == 4);
  257 + assert(!error);
  258 + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
  259 + assert(pos == 6);
  260 + assert(error);
  261 + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x30);
  262 + assert(pos == 7);
  263 + assert(!error);
  264 + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x31);
  265 + assert(pos == 8);
  266 + assert(!error);
  267 + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
  268 + assert(pos == 9);
  269 + assert(error);
243 270 }
244 271  
245 272 void
... ...
qpdf/qtest/qpdf/unicode-errors.out
... ... @@ -3,5 +3,5 @@ This file has utf-8 encoding errors and should be edited as a binary file. // <5
3 3 0: too many bytes: �after // <feff0030003a00200074006f006f0020006d0061006e0079002000620079007400650073003a0020fffd00610066007400650072>
4 4 1: too few bytes: �after // <feff0031003a00200074006f006f0020006600650077002000620079007400650073003a0020fffd00610066007400650072>
5 5 2: invalid codepoint (U+DEAD): �after // <feff0032003a00200069006e00760061006c0069006400200063006f006400650070006f0069006e0074002000280055002b00440045004100440029003a0020fffd00610066007400650072>
6   -3: not enough bytes for character: �!after (! included) // <333a206e6f7420656e6f75676820627974657320666f72206368617261637465723a209f21616674657220282120696e636c7564656429>
  6 +3: not enough bytes for character: �!after (! included) // <feff0033003a0020006e006f007400200065006e006f00750067006800200062007900740065007300200066006f00720020006300680061007200610063007400650072003a0020fffd00210061006600740065007200200028002100200069006e0063006c00750064006500640029>
7 7 4: not enough bytes left in file � // <feff0034003a0020006e006f007400200065006e006f0075006700680020006200790074006500730020006c00650066007400200069006e002000660069006c00650020fffd>
... ...