Commit 82497ea8cb3893459f642462fa2a0e3dd90e63c8

Authored by m-holger
1 parent 09020472

Refactor UTF detection: move logic from `QUtil` to `qpdf::util` and update references

libqpdf/QPDF_String.cc
... ... @@ -2,6 +2,7 @@
2 2  
3 3 #include <qpdf/QPDFObjectHandle_private.hh>
4 4 #include <qpdf/QUtil.hh>
  5 +#include <qpdf/Util.hh>
5 6  
6 7 // DO NOT USE ctype -- it is locale dependent for some things, and it's not worth the risk of
7 8 // including it in case it may accidentally be used.
... ... @@ -9,7 +10,7 @@
9 10 static bool
10 11 is_iso_latin1_printable(char ch)
11 12 {
12   - return (((ch >= 32) && (ch <= 126)) || (static_cast<unsigned char>(ch) >= 160));
  13 + return (ch >= 32 && ch <= 126) || static_cast<unsigned char>(ch) >= 160;
13 14 }
14 15  
15 16 std::shared_ptr<QPDFObject>
... ... @@ -30,7 +31,7 @@ QPDF_String::writeJSON(int json_version, JSON::Writer& p)
30 31 p << "\"" << JSON::Writer::encode_string(candidate) << "\"";
31 32 } else {
32 33 // See if we can unambiguously represent as Unicode.
33   - if (QUtil::is_utf16(val) || QUtil::is_explicit_utf8(val)) {
  34 + if (util::is_utf16(val) || util::is_explicit_utf8(val)) {
34 35 p << "\"u:" << JSON::Writer::encode_string(candidate) << "\"";
35 36 return;
36 37 } else if (!useHexString()) {
... ... @@ -137,13 +138,13 @@ QPDF_String::unparse(bool force_binary)
137 138 std::string
138 139 QPDF_String::getUTF8Val() const
139 140 {
140   - if (QUtil::is_utf16(val)) {
  141 + if (util::is_utf16(val)) {
141 142 return QUtil::utf16_to_utf8(val);
142   - } else if (QUtil::is_explicit_utf8(val)) {
  143 + }
  144 + if (util::is_explicit_utf8(val)) {
143 145 // PDF 2.0 allows UTF-8 strings when explicitly prefixed with the three-byte representation
144 146 // of U+FEFF.
145 147 return val.substr(3);
146   - } else {
147   - return QUtil::pdf_doc_to_utf8(val);
148 148 }
  149 + return QUtil::pdf_doc_to_utf8(val);
149 150 }
... ...
libqpdf/QUtil.cc
... ... @@ -1688,19 +1688,13 @@ QUtil::utf8_to_pdf_doc(std::string const& utf8, std::string& pdfdoc, char unknow
1688 1688 bool
1689 1689 QUtil::is_utf16(std::string const& val)
1690 1690 {
1691   - return (
1692   - (val.length() >= 2) &&
1693   - (((val.at(0) == '\xfe') && (val.at(1) == '\xff')) ||
1694   - ((val.at(0) == '\xff') && (val.at(1) == '\xfe'))));
  1691 + return util::is_utf16(val);
1695 1692 }
1696 1693  
1697 1694 bool
1698 1695 QUtil::is_explicit_utf8(std::string const& val)
1699 1696 {
1700   - // QPDF_String.cc knows that this is a 3-byte sequence.
1701   - return (
1702   - (val.length() >= 3) && (val.at(0) == '\xef') && (val.at(1) == '\xbb') &&
1703   - (val.at(2) == '\xbf'));
  1697 + return util::is_explicit_utf8(val);
1704 1698 }
1705 1699  
1706 1700 std::string
... ...
libqpdf/qpdf/Util.hh
... ... @@ -74,6 +74,19 @@ namespace qpdf::util
74 74 s.insert(0, 1, '1');
75 75 }
76 76  
  77 + inline bool
  78 + is_utf16(std::string const& str)
  79 + {
  80 + return str.starts_with("\xfe\xff") || str.starts_with("\xff\xfe");
  81 + }
  82 +
  83 + inline bool
  84 + is_explicit_utf8(std::string const& str)
  85 + {
  86 + // QPDF_String.cc knows that this is a 3-byte sequence.
  87 + return str.starts_with("\xef\xbb\xbf");
  88 + }
  89 +
77 90 std::string random_string(size_t len);
78 91  
79 92 } // namespace qpdf::util
... ...
libtests/qutil.cc
... ... @@ -367,6 +367,16 @@ check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16)
367 367 }
368 368  
369 369 void
  370 +explicit_utf8_test()
  371 +{
  372 + assert(QUtil::is_explicit_utf8("\xef\xbb\xbfnot empty"));
  373 + assert(QUtil::is_explicit_utf8("\xef\xbb\xbf"));
  374 + assert(!QUtil::is_explicit_utf8("\xef\xbb\xbenot explicit"));
  375 + assert(!QUtil::is_explicit_utf8("\xef\xbe\xbfnot explicit"));
  376 + assert(!QUtil::is_explicit_utf8("\xee\xbb\xbfnot explicit"));
  377 +}
  378 +
  379 +void
370 380 print_alternatives(std::string const& str)
371 381 {
372 382 std::vector<std::string> result = QUtil::possible_repaired_encodings(str);
... ... @@ -432,7 +442,7 @@ transcoding_test()
432 442 std::string other_to_utf8;
433 443 assert(!QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8));
434 444 std::cout << other_to_utf8 << '\n';
435   - std::cout << "done other characters" << '\n';
  445 + std::cout << "done other characters\n";
436 446 // These valid UTF8 strings when converted to PDFDoc would end up
437 447 // with a byte sequence that would be recognized as UTF-8 or
438 448 // UTF-16 rather than PDFDoc. A special case is required to store
... ... @@ -747,6 +757,7 @@ main(int argc, char* argv[])
747 757 getenv_test();
748 758 std::cout << "---- utf8" << '\n';
749 759 to_utf8_test();
  760 + explicit_utf8_test();
750 761 std::cout << "---- utf16" << '\n';
751 762 to_utf16_test();
752 763 std::cout << "---- utf8_to_ascii" << '\n';
... ...