Commit 22b35c49289157204b35a851f3cb9cade9e98559
1 parent: 5bbb0d4c
Expose QUtil::get_next_utf8_codepoint
Showing 8 changed files with 85 additions and 20 deletions
.dir-locals.el
ChangeLog
| 1 | +2022-04-23 Jay Berkenbilt <ejb@ql.org> | |
| 2 | + | |
| 3 | + * Add new method QUtil::is_explicit_utf8 that tests whether a | |
| 4 | + string is explicitly marked as being UTF-8 encoded, as allowed by | |
| 5 | + the PDF 2.0 spec. Such a string starts with the bytes 0xEF 0xBB | |
| 6 | + 0xBF, which is the UTF-8 encoding of U+FEFF. | |
| 7 | + | |
| 8 | + * Add new method QUtil::get_next_utf8_codepoint as a low-level | |
| 9 | + helper for iterating through the UTF-8 characters in a byte | |
| 10 | + string. | |
| 11 | + | |
| 1 | 12 | 2022-04-16 Jay Berkenbilt <ejb@ql.org> |
| 2 | 13 | |
| 3 | 14 | * Breaking CLI change: the default value for --json is now | ... | ... |
TODO
| ... | ... | @@ -11,9 +11,6 @@ In order: |
| 11 | 11 | Other (do in any order): |
| 12 | 12 | |
| 13 | 13 | Misc |
| 14 | -* Consider exposing get_next_utf8_codepoint in QUtil | |
| 15 | -* Add QUtil::is_explicit_utf8 that does what QPDF_String::getUTF8Val | |
| 16 | - does to detect UTF-8 encoded strings per PDF 2.0 spec. | |
| 17 | 14 | * Add an option --ignore-encryption to ignore encryption information |
| 18 | 15 | and treat encrypted files as if they weren't encrypted. This should |
| 19 | 16 | make it possible to solve #598 (--show-encryption without a | ... | ... |
include/qpdf/QUtil.hh
| ... | ... | @@ -268,14 +268,33 @@ namespace QUtil |
| 268 | 268 | QPDF_DLL |
| 269 | 269 | std::string toUTF16(unsigned long uval); |
| 270 | 270 | |
| 271 | + // If utf8_val.at(pos) points to the beginning of a valid | |
| 272 | + // UTF-8-encoded character, return the codepoint of the character | |
| 273 | + // and set error to false. Otherwise, return 0xfffd and set error | |
| 274 | + // to true. In all cases, pos is advanced to the next position | |
| 275 | + // that may begin a valid character. When the string has been | |
| 276 | + // consumed, pos will be set to the string length. It is an error | |
| 277 | + // to pass a value of pos that is greater than or equal to the | |
| 278 | + // length of the string. | |
| 279 | + QPDF_DLL | |
| 280 | + unsigned long get_next_utf8_codepoint( | |
| 281 | + std::string const& utf8_val, size_t& pos, bool& error); | |
| 282 | + | |
| 271 | 283 | // Test whether this is a UTF-16 string. This is indicated by |
| 272 | 284 | // first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE |
| 273 | - // (little-endian). Starting in qpdf 10.6.2, this detects | |
| 285 | + // (little-endian), each of which is the encoding of U+FEFF, the | |
| 286 | + // Unicode marker. Starting in qpdf 10.6.2, this detects | |
| 274 | 287 | // little-endian as well as big-endian. Even though the PDF spec |
| 275 | 288 | // doesn't allow little-endian, most readers seem to accept it. |
| 276 | 289 | QPDF_DLL |
| 277 | 290 | bool is_utf16(std::string const&); |
| 278 | 291 | |
| 292 | + // Test whether this is an explicit UTF-8 string as allowed by the | |
| 293 | + // PDF 2.0 spec. This is indicated by first three bytes being 0xEF | |
| 294 | + // 0xBB 0xBF, which is the UTF-8 encoding of U+FEFF. | |
| 295 | + QPDF_DLL | |
| 296 | + bool is_explicit_utf8(std::string const&); | |
| 297 | + | |
| 279 | 298 | // Convert a UTF-8 encoded string to UTF-16 big-endian. |
| 280 | 299 | // Unrepresentable code points are converted to U+FFFD. |
| 281 | 300 | QPDF_DLL | ... | ... |
libqpdf/QPDF_String.cc
| ... | ... | @@ -166,11 +166,9 @@ QPDF_String::getUTF8Val() const |
| 166 | 166 | { |
| 167 | 167 | if (QUtil::is_utf16(this->val)) { |
| 168 | 168 | return QUtil::utf16_to_utf8(this->val); |
| 169 | - } else if ( | |
| 170 | - (val.length() >= 3) && (val.at(0) == '\xEF') && (val.at(1) == '\xBB') && | |
| 171 | - (val.at(2) == '\xBF')) { | |
| 169 | + } else if (QUtil::is_explicit_utf8(this->val)) { | |
| 172 | 170 | // PDF 2.0 allows UTF-8 strings when explicitly prefixed with |
| 173 | - // the above bytes, which is just UTF-8 encoding of U+FEFF. | |
| 171 | + // the three-byte representation of U+FEFF. | |
| 174 | 172 | return this->val.substr(3); |
| 175 | 173 | } else { |
| 176 | 174 | return QUtil::pdf_doc_to_utf8(this->val); | ... | ... |
libqpdf/QUtil.cc
| ... | ... | @@ -1529,10 +1529,11 @@ encode_pdfdoc(unsigned long codepoint) |
| 1529 | 1529 | } |
| 1530 | 1530 | |
| 1531 | 1531 | unsigned long |
| 1532 | -get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error) | |
| 1532 | +QUtil::get_next_utf8_codepoint( | |
| 1533 | + std::string const& utf8_val, size_t& pos, bool& error) | |
| 1533 | 1534 | { |
| 1534 | 1535 | size_t len = utf8_val.length(); |
| 1535 | - unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos)); | |
| 1536 | + unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++)); | |
| 1536 | 1537 | error = false; |
| 1537 | 1538 | if (ch < 128) { |
| 1538 | 1539 | return static_cast<unsigned long>(ch); |
| ... | ... | @@ -1547,7 +1548,7 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error) |
| 1547 | 1548 | bit_check >>= 1; |
| 1548 | 1549 | } |
| 1549 | 1550 | if (((bytes_needed > 5) || (bytes_needed < 1)) || |
| 1550 | - ((pos + bytes_needed) >= len)) { | |
| 1551 | + ((pos + bytes_needed) > len)) { | |
| 1551 | 1552 | error = true; |
| 1552 | 1553 | return 0xfffd; |
| 1553 | 1554 | } |
| ... | ... | @@ -1555,11 +1556,11 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error) |
| 1555 | 1556 | unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear); |
| 1556 | 1557 | while (bytes_needed > 0) { |
| 1557 | 1558 | --bytes_needed; |
| 1558 | - ch = static_cast<unsigned char>(utf8_val.at(++pos)); | |
| 1559 | + ch = static_cast<unsigned char>(utf8_val.at(pos++)); | |
| 1559 | 1560 | if ((ch & 0xc0) != 0x80) { |
| 1560 | 1561 | --pos; |
| 1561 | - codepoint = 0xfffd; | |
| 1562 | - break; | |
| 1562 | + error = true; | |
| 1563 | + return 0xfffd; | |
| 1563 | 1564 | } |
| 1564 | 1565 | codepoint <<= 6; |
| 1565 | 1566 | codepoint += (ch & 0x3f); |
| ... | ... | @@ -1580,9 +1581,11 @@ transcode_utf8( |
| 1580 | 1581 | result += "\xfe\xff"; |
| 1581 | 1582 | } |
| 1582 | 1583 | size_t len = utf8_val.length(); |
| 1583 | - for (size_t i = 0; i < len; ++i) { | |
| 1584 | + size_t pos = 0; | |
| 1585 | + while (pos < len) { | |
| 1584 | 1586 | bool error = false; |
| 1585 | - unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error); | |
| 1587 | + unsigned long codepoint = | |
| 1588 | + QUtil::get_next_utf8_codepoint(utf8_val, pos, error); | |
| 1586 | 1589 | if (error) { |
| 1587 | 1590 | okay = false; |
| 1588 | 1591 | if (encoding == e_utf16) { |
| ... | ... | @@ -1710,6 +1713,15 @@ QUtil::is_utf16(std::string const& val) |
| 1710 | 1713 | ((val.at(0) == '\xff') && (val.at(1) == '\xfe')))); |
| 1711 | 1714 | } |
| 1712 | 1715 | |
| 1716 | +bool | |
| 1717 | +QUtil::is_explicit_utf8(std::string const& val) | |
| 1718 | +{ | |
| 1719 | + // QPDF_String.cc knows that this is a 3-byte sequence. | |
| 1720 | + return ( | |
| 1721 | + (val.length() >= 3) && (val.at(0) == '\xef') && (val.at(1) == '\xbb') && | |
| 1722 | + (val.at(2) == '\xbf')); | |
| 1723 | +} | |
| 1724 | + | |
| 1713 | 1725 | std::string |
| 1714 | 1726 | QUtil::utf16_to_utf8(std::string const& val) |
| 1715 | 1727 | { |
| ... | ... | @@ -1826,10 +1838,11 @@ QUtil::analyze_encoding( |
| 1826 | 1838 | return; |
| 1827 | 1839 | } |
| 1828 | 1840 | size_t len = val.length(); |
| 1841 | + size_t pos = 0; | |
| 1829 | 1842 | bool any_errors = false; |
| 1830 | - for (size_t i = 0; i < len; ++i) { | |
| 1843 | + while (pos < len) { | |
| 1831 | 1844 | bool error = false; |
| 1832 | - unsigned long codepoint = get_next_utf8_codepoint(val, i, error); | |
| 1845 | + unsigned long codepoint = get_next_utf8_codepoint(val, pos, error); | |
| 1833 | 1846 | if (error) { |
| 1834 | 1847 | any_errors = true; |
| 1835 | 1848 | } | ... | ... |
libtests/qutil.cc
| ... | ... | @@ -240,6 +240,33 @@ print_utf8(unsigned long val) |
| 240 | 240 | } |
| 241 | 241 | } |
| 242 | 242 | std::cout << std::endl; |
| 243 | + | |
| 244 | + // Boundary conditions for QUtil::get_next_utf8_codepoint, which is | |
| 245 | + // also tested indirectly through test_pdf_unicode.cc. | |
| 246 | + std::string utf8 = "\xcf\x80\xcf\x30\xEF\xBF\x30\x31\xcf"; | |
| 247 | + size_t pos = 0; | |
| 248 | + bool error = false; | |
| 249 | + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x3c0); | |
| 250 | + assert(pos == 2); | |
| 251 | + assert(!error); | |
| 252 | + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd); | |
| 253 | + assert(pos == 3); | |
| 254 | + assert(error); | |
| 255 | + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x30); | |
| 256 | + assert(pos == 4); | |
| 257 | + assert(!error); | |
| 258 | + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd); | |
| 259 | + assert(pos == 6); | |
| 260 | + assert(error); | |
| 261 | + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x30); | |
| 262 | + assert(pos == 7); | |
| 263 | + assert(!error); | |
| 264 | + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x31); | |
| 265 | + assert(pos == 8); | |
| 266 | + assert(!error); | |
| 267 | + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd); | |
| 268 | + assert(pos == 9); | |
| 269 | + assert(error); | |
| 243 | 270 | } |
| 244 | 271 | |
| 245 | 272 | void | ... | ... |
qpdf/qtest/qpdf/unicode-errors.out
| ... | ... | @@ -3,5 +3,5 @@ This file has utf-8 encoding errors and should be edited as a binary file. // <5 |
| 3 | 3 | 0: too many bytes: �after // <feff0030003a00200074006f006f0020006d0061006e0079002000620079007400650073003a0020fffd00610066007400650072> |
| 4 | 4 | 1: too few bytes: �after // <feff0031003a00200074006f006f0020006600650077002000620079007400650073003a0020fffd00610066007400650072> |
| 5 | 5 | 2: invalid codepoint (U+DEAD): �after // <feff0032003a00200069006e00760061006c0069006400200063006f006400650070006f0069006e0074002000280055002b00440045004100440029003a0020fffd00610066007400650072> |
| 6 | -3: not enough bytes for character: �!after (! included) // <333a206e6f7420656e6f75676820627974657320666f72206368617261637465723a209f21616674657220282120696e636c7564656429> | |
| 6 | +3: not enough bytes for character: �!after (! included) // <feff0033003a0020006e006f007400200065006e006f00750067006800200062007900740065007300200066006f00720020006300680061007200610063007400650072003a0020fffd00210061006600740065007200200028002100200069006e0063006c00750064006500640029> | |
| 7 | 7 | 4: not enough bytes left in file � // <feff0034003a0020006e006f007400200065006e006f0075006700680020006200790074006500730020006c00650066007400200069006e002000660069006c00650020fffd> | ... | ... |