Commit 22b35c49289157204b35a851f3cb9cade9e98559

Authored by Jay Berkenbilt
1 parent 5bbb0d4c

Expose QUtil::get_next_utf8_codepoint

.dir-locals.el
1 -((nil . ((indent-tabs-mode . t) 1 +((nil . ((indent-tabs-mode . nil)
2 (qpdf-cc-style 2 (qpdf-cc-style
3 . 3 .
4 ("qpdf" 4 ("qpdf"
ChangeLog
  1 +2022-04-23 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Add new method QUtil::is_explicit_utf8 that tests whether a
  4 + string is explicitly marked as being UTF-8 encoded, as allowed by
  5 + the PDF 2.0 spec. Such a string starts with the bytes 0xEF 0xBB
  6 + 0xBF, which is the UTF-8 encoding of U+FEFF.
  7 +
  8 + * Add new method QUtil::get_next_utf8_codepoint as a low-level
  9 + helper for iterating through the UTF-8 characters in a byte
  10 + string.
  11 +
1 2022-04-16 Jay Berkenbilt <ejb@ql.org> 12 2022-04-16 Jay Berkenbilt <ejb@ql.org>
2 13
3 * Breaking CLI change: the default value for --json is now 14 * Breaking CLI change: the default value for --json is now
@@ -11,9 +11,6 @@ In order: @@ -11,9 +11,6 @@ In order:
11 Other (do in any order): 11 Other (do in any order):
12 12
13 Misc 13 Misc
14 -* Consider exposing get_next_utf8_codepoint in QUtil  
15 -* Add QUtil::is_explicit_utf8 that does what QPDF_String::getUTF8Val  
16 - does to detect UTF-8 encoded strings per PDF 2.0 spec.  
17 * Add an option --ignore-encryption to ignore encryption information 14 * Add an option --ignore-encryption to ignore encryption information
18 and treat encrypted files as if they weren't encrypted. This should 15 and treat encrypted files as if they weren't encrypted. This should
19 make it possible to solve #598 (--show-encryption without a 16 make it possible to solve #598 (--show-encryption without a
include/qpdf/QUtil.hh
@@ -268,14 +268,33 @@ namespace QUtil @@ -268,14 +268,33 @@ namespace QUtil
268 QPDF_DLL 268 QPDF_DLL
269 std::string toUTF16(unsigned long uval); 269 std::string toUTF16(unsigned long uval);
270 270
  271 + // If utf8_val.at(pos) points to the beginning of a valid
  272 + // UTF-8-encoded character, return the codepoint of the character
  273 + // and set error to false. Otherwise, return 0xfffd and set error
  274 + // to true. In all cases, pos is advanced to the next position
  275 + // that may begin a valid character. When the string has been
  276 + // consumed, pos will be set to the string length. It is an error
  277 + // to pass a value of pos that is greater than or equal to the
  278 + // length of the string.
  279 + QPDF_DLL
  280 + unsigned long get_next_utf8_codepoint(
  281 + std::string const& utf8_val, size_t& pos, bool& error);
  282 +
271 // Test whether this is a UTF-16 string. This is indicated by 283 // Test whether this is a UTF-16 string. This is indicated by
272 // first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE 284 // first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE
273 - // (little-endian). Starting in qpdf 10.6.2, this detects 285 + // (little-endian), each of which is the encoding of U+FEFF, the
  286 + // Unicode marker. Starting in qpdf 10.6.2, this detects
274 // little-endian as well as big-endian. Even though the PDF spec 287 // little-endian as well as big-endian. Even though the PDF spec
275 // doesn't allow little-endian, most readers seem to accept it. 288 // doesn't allow little-endian, most readers seem to accept it.
276 QPDF_DLL 289 QPDF_DLL
277 bool is_utf16(std::string const&); 290 bool is_utf16(std::string const&);
278 291
  292 + // Test whether this is an explicit UTF-8 string as allowed by the
  293 + // PDF 2.0 spec. This is indicated by first three bytes being 0xEF
  294 + // 0xBB 0xBF, which is the UTF-8 encoding of U+FEFF.
  295 + QPDF_DLL
  296 + bool is_explicit_utf8(std::string const&);
  297 +
279 // Convert a UTF-8 encoded string to UTF-16 big-endian. 298 // Convert a UTF-8 encoded string to UTF-16 big-endian.
280 // Unrepresentable code points are converted to U+FFFD. 299 // Unrepresentable code points are converted to U+FFFD.
281 QPDF_DLL 300 QPDF_DLL
libqpdf/QPDF_String.cc
@@ -166,11 +166,9 @@ QPDF_String::getUTF8Val() const @@ -166,11 +166,9 @@ QPDF_String::getUTF8Val() const
166 { 166 {
167 if (QUtil::is_utf16(this->val)) { 167 if (QUtil::is_utf16(this->val)) {
168 return QUtil::utf16_to_utf8(this->val); 168 return QUtil::utf16_to_utf8(this->val);
169 - } else if (  
170 - (val.length() >= 3) && (val.at(0) == '\xEF') && (val.at(1) == '\xBB') &&  
171 - (val.at(2) == '\xBF')) { 169 + } else if (QUtil::is_explicit_utf8(this->val)) {
172 // PDF 2.0 allows UTF-8 strings when explicitly prefixed with 170 // PDF 2.0 allows UTF-8 strings when explicitly prefixed with
173 - // the above bytes, which is just UTF-8 encoding of U+FEFF. 171 + // the three-byte representation of U+FEFF.
174 return this->val.substr(3); 172 return this->val.substr(3);
175 } else { 173 } else {
176 return QUtil::pdf_doc_to_utf8(this->val); 174 return QUtil::pdf_doc_to_utf8(this->val);
libqpdf/QUtil.cc
@@ -1529,10 +1529,11 @@ encode_pdfdoc(unsigned long codepoint) @@ -1529,10 +1529,11 @@ encode_pdfdoc(unsigned long codepoint)
1529 } 1529 }
1530 1530
1531 unsigned long 1531 unsigned long
1532 -get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error) 1532 +QUtil::get_next_utf8_codepoint(
  1533 + std::string const& utf8_val, size_t& pos, bool& error)
1533 { 1534 {
1534 size_t len = utf8_val.length(); 1535 size_t len = utf8_val.length();
1535 - unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos)); 1536 + unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++));
1536 error = false; 1537 error = false;
1537 if (ch < 128) { 1538 if (ch < 128) {
1538 return static_cast<unsigned long>(ch); 1539 return static_cast<unsigned long>(ch);
@@ -1547,7 +1548,7 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error) @@ -1547,7 +1548,7 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
1547 bit_check >>= 1; 1548 bit_check >>= 1;
1548 } 1549 }
1549 if (((bytes_needed > 5) || (bytes_needed < 1)) || 1550 if (((bytes_needed > 5) || (bytes_needed < 1)) ||
1550 - ((pos + bytes_needed) >= len)) { 1551 + ((pos + bytes_needed) > len)) {
1551 error = true; 1552 error = true;
1552 return 0xfffd; 1553 return 0xfffd;
1553 } 1554 }
@@ -1555,11 +1556,11 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error) @@ -1555,11 +1556,11 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
1555 unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear); 1556 unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear);
1556 while (bytes_needed > 0) { 1557 while (bytes_needed > 0) {
1557 --bytes_needed; 1558 --bytes_needed;
1558 - ch = static_cast<unsigned char>(utf8_val.at(++pos)); 1559 + ch = static_cast<unsigned char>(utf8_val.at(pos++));
1559 if ((ch & 0xc0) != 0x80) { 1560 if ((ch & 0xc0) != 0x80) {
1560 --pos; 1561 --pos;
1561 - codepoint = 0xfffd;  
1562 - break; 1562 + error = true;
  1563 + return 0xfffd;
1563 } 1564 }
1564 codepoint <<= 6; 1565 codepoint <<= 6;
1565 codepoint += (ch & 0x3f); 1566 codepoint += (ch & 0x3f);
@@ -1580,9 +1581,11 @@ transcode_utf8( @@ -1580,9 +1581,11 @@ transcode_utf8(
1580 result += "\xfe\xff"; 1581 result += "\xfe\xff";
1581 } 1582 }
1582 size_t len = utf8_val.length(); 1583 size_t len = utf8_val.length();
1583 - for (size_t i = 0; i < len; ++i) { 1584 + size_t pos = 0;
  1585 + while (pos < len) {
1584 bool error = false; 1586 bool error = false;
1585 - unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error); 1587 + unsigned long codepoint =
  1588 + QUtil::get_next_utf8_codepoint(utf8_val, pos, error);
1586 if (error) { 1589 if (error) {
1587 okay = false; 1590 okay = false;
1588 if (encoding == e_utf16) { 1591 if (encoding == e_utf16) {
@@ -1710,6 +1713,15 @@ QUtil::is_utf16(std::string const& val) @@ -1710,6 +1713,15 @@ QUtil::is_utf16(std::string const& val)
1710 ((val.at(0) == '\xff') && (val.at(1) == '\xfe')))); 1713 ((val.at(0) == '\xff') && (val.at(1) == '\xfe'))));
1711 } 1714 }
1712 1715
  1716 +bool
  1717 +QUtil::is_explicit_utf8(std::string const& val)
  1718 +{
  1719 + // QPDF_String.cc knows that this is a 3-byte sequence.
  1720 + return (
  1721 + (val.length() >= 3) && (val.at(0) == '\xef') && (val.at(1) == '\xbb') &&
  1722 + (val.at(2) == '\xbf'));
  1723 +}
  1724 +
1713 std::string 1725 std::string
1714 QUtil::utf16_to_utf8(std::string const& val) 1726 QUtil::utf16_to_utf8(std::string const& val)
1715 { 1727 {
@@ -1826,10 +1838,11 @@ QUtil::analyze_encoding( @@ -1826,10 +1838,11 @@ QUtil::analyze_encoding(
1826 return; 1838 return;
1827 } 1839 }
1828 size_t len = val.length(); 1840 size_t len = val.length();
  1841 + size_t pos = 0;
1829 bool any_errors = false; 1842 bool any_errors = false;
1830 - for (size_t i = 0; i < len; ++i) { 1843 + while (pos < len) {
1831 bool error = false; 1844 bool error = false;
1832 - unsigned long codepoint = get_next_utf8_codepoint(val, i, error); 1845 + unsigned long codepoint = get_next_utf8_codepoint(val, pos, error);
1833 if (error) { 1846 if (error) {
1834 any_errors = true; 1847 any_errors = true;
1835 } 1848 }
libtests/qutil.cc
@@ -240,6 +240,33 @@ print_utf8(unsigned long val) @@ -240,6 +240,33 @@ print_utf8(unsigned long val)
240 } 240 }
241 } 241 }
242 std::cout << std::endl; 242 std::cout << std::endl;
  243 +
  244 + // Boundary conditions for QUtil::get_next_utf8_codepoint, which is
  245 + // also tested indirectly through test_pdf_unicode.cc.
  246 + std::string utf8 = "\xcf\x80\xcf\x30\xEF\xBF\x30\x31\xcf";
  247 + size_t pos = 0;
  248 + bool error = false;
  249 + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x3c0);
  250 + assert(pos == 2);
  251 + assert(!error);
  252 + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
  253 + assert(pos == 3);
  254 + assert(error);
  255 + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x30);
  256 + assert(pos == 4);
  257 + assert(!error);
  258 + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
  259 + assert(pos == 6);
  260 + assert(error);
  261 + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x30);
  262 + assert(pos == 7);
  263 + assert(!error);
  264 + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x31);
  265 + assert(pos == 8);
  266 + assert(!error);
  267 + assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
  268 + assert(pos == 9);
  269 + assert(error);
243 } 270 }
244 271
245 void 272 void
qpdf/qtest/qpdf/unicode-errors.out
@@ -3,5 +3,5 @@ This file has utf-8 encoding errors and should be edited as a binary file. // <5 @@ -3,5 +3,5 @@ This file has utf-8 encoding errors and should be edited as a binary file. // <5
3 0: too many bytes: �after // <feff0030003a00200074006f006f0020006d0061006e0079002000620079007400650073003a0020fffd00610066007400650072> 3 0: too many bytes: �after // <feff0030003a00200074006f006f0020006d0061006e0079002000620079007400650073003a0020fffd00610066007400650072>
4 1: too few bytes: �after // <feff0031003a00200074006f006f0020006600650077002000620079007400650073003a0020fffd00610066007400650072> 4 1: too few bytes: �after // <feff0031003a00200074006f006f0020006600650077002000620079007400650073003a0020fffd00610066007400650072>
5 2: invalid codepoint (U+DEAD): �after // <feff0032003a00200069006e00760061006c0069006400200063006f006400650070006f0069006e0074002000280055002b00440045004100440029003a0020fffd00610066007400650072> 5 2: invalid codepoint (U+DEAD): �after // <feff0032003a00200069006e00760061006c0069006400200063006f006400650070006f0069006e0074002000280055002b00440045004100440029003a0020fffd00610066007400650072>
6 -3: not enough bytes for character: �!after (! included) // <333a206e6f7420656e6f75676820627974657320666f72206368617261637465723a209f21616674657220282120696e636c7564656429> 6 +3: not enough bytes for character: �!after (! included) // <feff0033003a0020006e006f007400200065006e006f00750067006800200062007900740065007300200066006f00720020006300680061007200610063007400650072003a0020fffd00210061006600740065007200200028002100200069006e0063006c00750064006500640029>
7 4: not enough bytes left in file � // <feff0034003a0020006e006f007400200065006e006f0075006700680020006200790074006500730020006c00650066007400200069006e002000660069006c00650020fffd> 7 4: not enough bytes left in file � // <feff0034003a0020006e006f007400200065006e006f0075006700680020006200790074006500730020006c00650066007400200069006e002000660069006c00650020fffd>