Silently/transparently recognize UTF-16LE as UTF-16 (fixes #649)

The PDF spec only allows UTF-16BE, but most readers seem to accept UTF-16LE as well, so now qpdf does too.

Silently/transparently recognize UTF-16LE as UTF-16 (fixes #649)
The PDF spec only allows UTF-16BE, but most readers seem to accept UTF-16LE as well, so now qpdf does too.
Jay Berkenbilt
1 parent fbd3e56d
Showing 8 changed files with 45 additions and 10 deletions
ChangeLog
include/qpdf/QUtil.hh
libqpdf/QUtil.cc
libtests/qtest/qutil/qutil.out
libtests/qutil.cc
qpdf/qtest/qpdf.test
qpdf/qtest/qpdf/utf16le-attachments.out
qpdf/qtest/qpdf/utf16le.pdf
 2022-02-15  Jay Berkenbilt  <ejb@ql.org>
  
+	* When analyzing PDF strings, recognize UTF-16LE as UTF-16. The
+	PDF spec only allows UTF-16BE, but most readers seem to allow
+	both. Fixes #649.
+
 	* Bug fix: 10.6.0 inadvertently removed an unknown/undocumented
 	CLI parsing feature, which has been restored in 10.6.2. Fixes #652.
  
@@ -267,8 +267,11 @@ namespace QUtil
     QPDF_DLL
     std::string toUTF16(unsigned long uval);
  
-    // Test whether this is a UTF-16 big-endian string. This is
-    // indicated by first two bytes being 0xFE 0xFF.
+    // Test whether this is a UTF-16 string. This is indicated by
+    // first two bytes being 0xFE 0xFF (big-endian) or 0xFF 0xFE
+    // (little-endian). Starting in qpdf 10.6.2, this detects
+    // little-endian as well as big-endian. Even though the PDF spec
+    // doesn't allow little-endian, most readers seem to accept it.
     QPDF_DLL
     bool is_utf16(std::string const&);
  
@@ -309,8 +312,8 @@ namespace QUtil
     bool utf8_to_pdf_doc(
         std::string const& utf8, std::string& pdfdoc, char unknown_char = '?');
  
-    // Convert a UTF-16 big-endian encoded string to UTF-8.
-    // Unrepresentable code points are converted to U+FFFD.
+    // Convert a UTF-16 encoded string to UTF-8. Unrepresentable code
+    // points are converted to U+FFFD.
     QPDF_DLL
     std::string utf16_to_utf8(std::string const& utf16);
  
@@ -331,7 +334,9 @@ namespace QUtil
     // help us guess. If there are no characters with the high bit
     // set, has_8bit_chars is false, and the other values are also
     // false, even though ASCII strings are valid UTF-8. is_valid_utf8
-    // means that the string is non-trivially valid UTF-8.
+    // means that the string is non-trivially valid UTF-8. Although
+    // the PDF spec requires UTF-16 to be UTF-16BE, qpdf (and just
+    // about everything else) accepts UTF-16LE (as of 10.6.2).
     QPDF_DLL
     void analyze_encoding(std::string const& str,
                           bool& has_8bit_chars,
@@ -2400,7 +2400,8 @@ bool
 QUtil::is_utf16(std::string const& val)
 {
     return ((val.length() >= 2) &&
-            (val.at(0) == '\xfe') && (val.at(1) == '\xff'));
+            (((val.at(0) == '\xfe') && (val.at(1) == '\xff')) ||
+             ((val.at(0) == '\xff') && (val.at(1) == '\xfe'))));
 }
  
 std::string
@@ -2414,8 +2415,13 @@ QUtil::utf16_to_utf8(std::string const& val)
     unsigned long codepoint = 0L;
     size_t len = val.length();
     size_t start = 0;
+    bool is_le = false;
     if (is_utf16(val))
     {
+        if (static_cast<unsigned char>(val.at(0)) == 0xff)
+        {
+            is_le = true;
+        }
         start += 2;
     }
     // If the string has an odd number of bytes, the last byte is
@@ -2428,10 +2434,12 @@ QUtil::utf16_to_utf8(std::string const& val)
         // codepoint not followed by a low codepoint will be
         // discarded, and a low codepoint not preceded by a high
         // codepoint will just get its low 10 bits output.
+        auto msb = is_le ? i+1 : i;
+        auto lsb = is_le ? i : i+1;
         unsigned short bits =
             QIntC::to_ushort(
-                (static_cast<unsigned char>(val.at(i)) << 8) +
-                static_cast<unsigned char>(val.at(i+1)));
+                (static_cast<unsigned char>(val.at(msb)) << 8) +
+                static_cast<unsigned char>(val.at(lsb)));
         if ((bits & 0xFC00) == 0xD800)
         {
             codepoint = 0x10000U + ((bits & 0x3FFU) << 10U);
@@ -63,6 +63,7 @@ HAGOOGAMAGOOGLE: 0
 0x80000000 -> ff fd
 π
 π
+LE: π
 ---- utf8_to_ascii
 ¿Does π have fingers?
 ?Does ? have fingers?
@@ -303,6 +303,7 @@ void to_utf16_test()
     std::string s(QUtil::utf8_to_utf16("\xcf\x80"));
     std::cout << QUtil::utf16_to_utf8(s) << std::endl;
     std::cout << QUtil::utf16_to_utf8(s + ".") << std::endl;
+    std::cout << "LE: " << QUtil::utf16_to_utf8("\xff\xfe\xc0\x03") << std::endl;
 }
  
 void utf8_to_ascii_test()
@@ -388,7 +389,8 @@ void transcoding_test()
     check_analyze("pi = \317\200", true, true, false);
     check_analyze("pi != \317", true, false, false);
     check_analyze("pi != 22/7", false, false, false);
-    check_analyze(std::string("\xfe\xff\00\x51", 4), true, false, true);
+    check_analyze(std::string("\xfe\xff\x00\x51", 4), true, false, true);
+    check_analyze(std::string("\xff\xfe\x51\x00", 4), true, false, true);
     std::cout << "analysis done" << std::endl;
     std::string input1("a\302\277b");
     std::string input2("a\317\200b");
@@ -73,7 +73,7 @@ flush_tiff_cache();
 show_ntests();
 # ----------
 $td->notify("--- Character Encoding ---");
-$n_tests += 3;
+$n_tests += 4;
  
 $td->runtest("PDF doc encoding to Unicode",
              {$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"},
@@ -88,6 +88,13 @@ $td->runtest("UTF-16 encoding errors",
              {$td->FILE => "unicode-errors.out", $td->EXIT_STATUS => 0},
              $td->NORMALIZE_NEWLINES);
  
+# UTF-16LE is not allowed by the PDF spec, but it seems that most
+# readers accept it.
+$td->runtest("UTF-16LE strings",
+             {$td->COMMAND => "qpdf --list-attachments --verbose utf16le.pdf"},
+             {$td->FILE => "utf16le-attachments.out", $td->EXIT_STATUS => 0},
+             $td->NORMALIZE_NEWLINES);
+
 # Tests to exercise QPDFArgParser belong in arg_parser.test in
 # libtests. These tests are supposed to be specific to the qpdf cli.
 # Since they were written prior to moving QPDFArgParser into the
+potato.png -> 6,0
+  preferred name: π.png
+  all names:
+    /F -> π.png
+    /UF -> π.png
+  all data streams:
+    /F -> 6,0
+    /UF -> 6,0