Expose QUtil::get_next_utf8_codepoint

Jay Berkenbilt
1 parent 5bbb0d4c
Showing 8 changed files with 85 additions and 20 deletions
.dir-locals.el
ChangeLog
TODO
include/qpdf/QUtil.hh
libqpdf/QPDF_String.cc
libqpdf/QUtil.cc
libtests/qutil.cc
qpdf/qtest/qpdf/unicode-errors.out
-((nil . ((indent-tabs-mode . t)
+((nil . ((indent-tabs-mode . nil)
          (qpdf-cc-style
           .
           ("qpdf"
+2022-04-23  Jay Berkenbilt  <ejb@ql.org>
+
+	* Add new method QUtil::is_explicit_utf8 that tests whether a
+	string is explicitly marked as being UTF-8 encoded, as allowed by
+	the PDF 2.0 spec. Such a string starts with the bytes 0xEF 0xBB
+	0xBF, which is the UTF-8 encoding of U+FEFF.
+
+	* Add new method QUtil::get_next_utf8_codepoint as a low-level
+	helper for iterating through the UTF-8 characters in a byte
+	string.
+
 2022-04-16  Jay Berkenbilt  <ejb@ql.org>
 	* Breaking CLI change: the default value for --json is now
@@ -11,9 +11,6 @@ In order:
 Other (do in any order):
 Misc
-* Consider exposing get_next_utf8_codepoint in QUtil
-* Add QUtil::is_explicit_utf8 that does what QPDF_String::getUTF8Val
-  does to detect UTF-8 encoded strings per PDF 2.0 spec.
 * Add an option --ignore-encryption to ignore encryption information
   and treat encrypted files as if they weren't encrypted. This should
   make it possible to solve #598 (--show-encryption without a
@@ -268,14 +268,33 @@ namespace QUtil
     QPDF_DLL
     std::string toUTF16(unsigned long uval);
+    // If utf8_val.at(pos) points to the beginning of a valid
+    // UTF-8-encoded character, return the codepoint of the character
+    // and set error to false. Otherwise, return 0xfffd and set error
+    // to true. In all cases, pos is advanced to the next position
+    // that may begin a valid character. When the string has been
+    // consumed, pos will be set to the string length. It is an error
+    // to pass a value of pos that is greater than or equal to the
+    // length of the string.
+    QPDF_DLL
+    unsigned long get_next_utf8_codepoint(
+        std::string const& utf8_val, size_t& pos, bool& error);
+
     // Test whether this is a UTF-16 string. This is indicated by
     // first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE
-    // (little-endian). Starting in qpdf 10.6.2, this detects
+    // (little-endian), each of which is the encoding of U+FEFF, the
+    // Unicode marker. Starting in qpdf 10.6.2, this detects
     // little-endian as well as big-endian. Even though the PDF spec
     // doesn't allow little-endian, most readers seem to accept it.
     QPDF_DLL
     bool is_utf16(std::string const&);
+    // Test whether this is an explicit UTF-8 string as allowed by the
+    // PDF 2.0 spec. This is indicated by first three bytes being 0xEF
+    // 0xBB 0xBF, which is the UTF-8 encoding of U+FEFF.
+    QPDF_DLL
+    bool is_explicit_utf8(std::string const&);
+
     // Convert a UTF-8 encoded string to UTF-16 big-endian.
     // Unrepresentable code points are converted to U+FFFD.
     QPDF_DLL
@@ -166,11 +166,9 @@ QPDF_String::getUTF8Val() const
 {
     if (QUtil::is_utf16(this->val)) {
         return QUtil::utf16_to_utf8(this->val);
-    } else if (
-        (val.length() >= 3) && (val.at(0) == '\xEF') && (val.at(1) == '\xBB') &&
-        (val.at(2) == '\xBF')) {
+    } else if (QUtil::is_explicit_utf8(this->val)) {
         // PDF 2.0 allows UTF-8 strings when explicitly prefixed with
-        // the above bytes, which is just UTF-8 encoding of U+FEFF.
+        // the three-byte representation of U+FEFF.
         return this->val.substr(3);
     } else {
         return QUtil::pdf_doc_to_utf8(this->val);
@@ -1529,10 +1529,11 @@ encode_pdfdoc(unsigned long codepoint)
 }
 unsigned long
-get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
+QUtil::get_next_utf8_codepoint(
+    std::string const& utf8_val, size_t& pos, bool& error)
 {
     size_t len = utf8_val.length();
-    unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos));
+    unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos++));
     error = false;
     if (ch < 128) {
         return static_cast<unsigned long>(ch);
@@ -1547,7 +1548,7 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
         bit_check >>= 1;
     }
     if (((bytes_needed > 5) || (bytes_needed < 1)) ||
-        ((pos + bytes_needed) >= len)) {
+        ((pos + bytes_needed) > len)) {
         error = true;
         return 0xfffd;
     }
@@ -1555,11 +1556,11 @@ get_next_utf8_codepoint(std::string const& utf8_val, size_t& pos, bool& error)
     unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear);
     while (bytes_needed > 0) {
         --bytes_needed;
-        ch = static_cast<unsigned char>(utf8_val.at(++pos));
+        ch = static_cast<unsigned char>(utf8_val.at(pos++));
         if ((ch & 0xc0) != 0x80) {
             --pos;
-            codepoint = 0xfffd;
-            break;
+            error = true;
+            return 0xfffd;
         }
         codepoint <<= 6;
         codepoint += (ch & 0x3f);
@@ -1580,9 +1581,11 @@ transcode_utf8(
         result += "\xfe\xff";
     }
     size_t len = utf8_val.length();
-    for (size_t i = 0; i < len; ++i) {
+    size_t pos = 0;
+    while (pos < len) {
         bool error = false;
-        unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error);
+        unsigned long codepoint =
+            QUtil::get_next_utf8_codepoint(utf8_val, pos, error);
         if (error) {
             okay = false;
             if (encoding == e_utf16) {
@@ -1710,6 +1713,15 @@ QUtil::is_utf16(std::string const& val)
          ((val.at(0) == '\xff') && (val.at(1) == '\xfe'))));
 }
+bool
+QUtil::is_explicit_utf8(std::string const& val)
+{
+    // QPDF_String.cc knows that this is a 3-byte sequence.
+    return (
+        (val.length() >= 3) && (val.at(0) == '\xef') && (val.at(1) == '\xbb') &&
+        (val.at(2) == '\xbf'));
+}
+
 std::string
 QUtil::utf16_to_utf8(std::string const& val)
 {
@@ -1826,10 +1838,11 @@ QUtil::analyze_encoding(
         return;
     }
     size_t len = val.length();
+    size_t pos = 0;
     bool any_errors = false;
-    for (size_t i = 0; i < len; ++i) {
+    while (pos < len) {
         bool error = false;
-        unsigned long codepoint = get_next_utf8_codepoint(val, i, error);
+        unsigned long codepoint = get_next_utf8_codepoint(val, pos, error);
         if (error) {
             any_errors = true;
         }
@@ -240,6 +240,33 @@ print_utf8(unsigned long val)
         }
     }
     std::cout << std::endl;
+
+    // Boundary conditions for QUtil::get_next_utf8_codepoint, which is
+    // also tested indirectly through test_pdf_unicode.cc.
+    std::string utf8 = "\xcf\x80\xcf\x30\xEF\xBF\x30\x31\xcf";
+    size_t pos = 0;
+    bool error = false;
+    assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x3c0);
+    assert(pos == 2);
+    assert(!error);
+    assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
+    assert(pos == 3);
+    assert(error);
+    assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x30);
+    assert(pos == 4);
+    assert(!error);
+    assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
+    assert(pos == 6);
+    assert(error);
+    assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x30);
+    assert(pos == 7);
+    assert(!error);
+    assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0x31);
+    assert(pos == 8);
+    assert(!error);
+    assert(QUtil::get_next_utf8_codepoint(utf8, pos, error) == 0xfffd);
+    assert(pos == 9);
+    assert(error);
 }
 void
@@ -3,5 +3,5 @@ This file has utf-8 encoding errors and should be edited as a binary file. // <5
 0: too many bytes: �after // <feff0030003a00200074006f006f0020006d0061006e0079002000620079007400650073003a0020fffd00610066007400650072>
 1: too few bytes: �after // <feff0031003a00200074006f006f0020006600650077002000620079007400650073003a0020fffd00610066007400650072>
 2: invalid codepoint (U+DEAD): �after // <feff0032003a00200069006e00760061006c0069006400200063006f006400650070006f0069006e0074002000280055002b00440045004100440029003a0020fffd00610066007400650072>
-3: not enough bytes for character: �!after (! included) // <333a206e6f7420656e6f75676820627974657320666f72206368617261637465723a209f21616674657220282120696e636c7564656429>
+3: not enough bytes for character: �!after (! included) // <feff0033003a0020006e006f007400200065006e006f00750067006800200062007900740065007300200066006f00720020006300680061007200610063007400650072003a0020fffd00210061006600740065007200200028002100200069006e0063006c00750064006500640029>
 4: not enough bytes left in file � // <feff0034003a0020006e006f007400200065006e006f0075006700680020006200790074006500730020006c00650066007400200069006e002000660069006c00650020fffd>