Commit 82497ea8cb3893459f642462fa2a0e3dd90e63c8

Authored by m-holger
1 parent 09020472

Refactor UTF detection: move logic from `QUtil` to `qpdf::util` and update references

libqpdf/QPDF_String.cc
@@ -2,6 +2,7 @@ @@ -2,6 +2,7 @@
2 2
3 #include <qpdf/QPDFObjectHandle_private.hh> 3 #include <qpdf/QPDFObjectHandle_private.hh>
4 #include <qpdf/QUtil.hh> 4 #include <qpdf/QUtil.hh>
  5 +#include <qpdf/Util.hh>
5 6
6 // DO NOT USE ctype -- it is locale dependent for some things, and it's not worth the risk of 7 // DO NOT USE ctype -- it is locale dependent for some things, and it's not worth the risk of
7 // including it in case it may accidentally be used. 8 // including it in case it may accidentally be used.
@@ -9,7 +10,7 @@ @@ -9,7 +10,7 @@
9 static bool 10 static bool
10 is_iso_latin1_printable(char ch) 11 is_iso_latin1_printable(char ch)
11 { 12 {
12 - return (((ch >= 32) && (ch <= 126)) || (static_cast<unsigned char>(ch) >= 160)); 13 + return (ch >= 32 && ch <= 126) || static_cast<unsigned char>(ch) >= 160;
13 } 14 }
14 15
15 std::shared_ptr<QPDFObject> 16 std::shared_ptr<QPDFObject>
@@ -30,7 +31,7 @@ QPDF_String::writeJSON(int json_version, JSON::Writer& p) @@ -30,7 +31,7 @@ QPDF_String::writeJSON(int json_version, JSON::Writer& p)
30 p << "\"" << JSON::Writer::encode_string(candidate) << "\""; 31 p << "\"" << JSON::Writer::encode_string(candidate) << "\"";
31 } else { 32 } else {
32 // See if we can unambiguously represent as Unicode. 33 // See if we can unambiguously represent as Unicode.
33 - if (QUtil::is_utf16(val) || QUtil::is_explicit_utf8(val)) { 34 + if (util::is_utf16(val) || util::is_explicit_utf8(val)) {
34 p << "\"u:" << JSON::Writer::encode_string(candidate) << "\""; 35 p << "\"u:" << JSON::Writer::encode_string(candidate) << "\"";
35 return; 36 return;
36 } else if (!useHexString()) { 37 } else if (!useHexString()) {
@@ -137,13 +138,13 @@ QPDF_String::unparse(bool force_binary) @@ -137,13 +138,13 @@ QPDF_String::unparse(bool force_binary)
137 std::string 138 std::string
138 QPDF_String::getUTF8Val() const 139 QPDF_String::getUTF8Val() const
139 { 140 {
140 - if (QUtil::is_utf16(val)) { 141 + if (util::is_utf16(val)) {
141 return QUtil::utf16_to_utf8(val); 142 return QUtil::utf16_to_utf8(val);
142 - } else if (QUtil::is_explicit_utf8(val)) { 143 + }
  144 + if (util::is_explicit_utf8(val)) {
143 // PDF 2.0 allows UTF-8 strings when explicitly prefixed with the three-byte representation 145 // PDF 2.0 allows UTF-8 strings when explicitly prefixed with the three-byte representation
144 // of U+FEFF. 146 // of U+FEFF.
145 return val.substr(3); 147 return val.substr(3);
146 - } else {  
147 - return QUtil::pdf_doc_to_utf8(val);  
148 } 148 }
  149 + return QUtil::pdf_doc_to_utf8(val);
149 } 150 }
libqpdf/QUtil.cc
@@ -1688,19 +1688,13 @@ QUtil::utf8_to_pdf_doc(std::string const& utf8, std::string& pdfdoc, char unknow @@ -1688,19 +1688,13 @@ QUtil::utf8_to_pdf_doc(std::string const& utf8, std::string& pdfdoc, char unknow
1688 bool 1688 bool
1689 QUtil::is_utf16(std::string const& val) 1689 QUtil::is_utf16(std::string const& val)
1690 { 1690 {
1691 - return (  
1692 - (val.length() >= 2) &&  
1693 - (((val.at(0) == '\xfe') && (val.at(1) == '\xff')) ||  
1694 - ((val.at(0) == '\xff') && (val.at(1) == '\xfe')))); 1691 + return util::is_utf16(val);
1695 } 1692 }
1696 1693
1697 bool 1694 bool
1698 QUtil::is_explicit_utf8(std::string const& val) 1695 QUtil::is_explicit_utf8(std::string const& val)
1699 { 1696 {
1700 - // QPDF_String.cc knows that this is a 3-byte sequence.  
1701 - return (  
1702 - (val.length() >= 3) && (val.at(0) == '\xef') && (val.at(1) == '\xbb') &&  
1703 - (val.at(2) == '\xbf')); 1697 + return util::is_explicit_utf8(val);
1704 } 1698 }
1705 1699
1706 std::string 1700 std::string
libqpdf/qpdf/Util.hh
@@ -74,6 +74,19 @@ namespace qpdf::util @@ -74,6 +74,19 @@ namespace qpdf::util
74 s.insert(0, 1, '1'); 74 s.insert(0, 1, '1');
75 } 75 }
76 76
  77 + inline bool
  78 + is_utf16(std::string const& str)
  79 + {
  80 + return str.starts_with("\xfe\xff") || str.starts_with("\xff\xfe");
  81 + }
  82 +
  83 + inline bool
  84 + is_explicit_utf8(std::string const& str)
  85 + {
  86 + // QPDF_String.cc knows that this is a 3-byte sequence.
  87 + return str.starts_with("\xef\xbb\xbf");
  88 + }
  89 +
77 std::string random_string(size_t len); 90 std::string random_string(size_t len);
78 91
79 } // namespace qpdf::util 92 } // namespace qpdf::util
libtests/qutil.cc
@@ -367,6 +367,16 @@ check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16) @@ -367,6 +367,16 @@ check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16)
367 } 367 }
368 368
369 void 369 void
  370 +explicit_utf8_test()
  371 +{
  372 + assert(QUtil::is_explicit_utf8("\xef\xbb\xbfnot empty"));
  373 + assert(QUtil::is_explicit_utf8("\xef\xbb\xbf"));
  374 + assert(!QUtil::is_explicit_utf8("\xef\xbb\xbenot explicit"));
  375 + assert(!QUtil::is_explicit_utf8("\xef\xbe\xbfnot explicit"));
  376 + assert(!QUtil::is_explicit_utf8("\xee\xbb\xbfnot explicit"));
  377 +}
  378 +
  379 +void
370 print_alternatives(std::string const& str) 380 print_alternatives(std::string const& str)
371 { 381 {
372 std::vector<std::string> result = QUtil::possible_repaired_encodings(str); 382 std::vector<std::string> result = QUtil::possible_repaired_encodings(str);
@@ -432,7 +442,7 @@ transcoding_test() @@ -432,7 +442,7 @@ transcoding_test()
432 std::string other_to_utf8; 442 std::string other_to_utf8;
433 assert(!QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8)); 443 assert(!QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8));
434 std::cout << other_to_utf8 << '\n'; 444 std::cout << other_to_utf8 << '\n';
435 - std::cout << "done other characters" << '\n'; 445 + std::cout << "done other characters\n";
436 // These valid UTF8 strings when converted to PDFDoc would end up 446 // These valid UTF8 strings when converted to PDFDoc would end up
437 // with a byte sequence that would be recognized as UTF-8 or 447 // with a byte sequence that would be recognized as UTF-8 or
438 // UTF-16 rather than PDFDoc. A special case is required to store 448 // UTF-16 rather than PDFDoc. A special case is required to store
@@ -747,6 +757,7 @@ main(int argc, char* argv[]) @@ -747,6 +757,7 @@ main(int argc, char* argv[])
747 getenv_test(); 757 getenv_test();
748 std::cout << "---- utf8" << '\n'; 758 std::cout << "---- utf8" << '\n';
749 to_utf8_test(); 759 to_utf8_test();
  760 + explicit_utf8_test();
750 std::cout << "---- utf16" << '\n'; 761 std::cout << "---- utf16" << '\n';
751 to_utf16_test(); 762 to_utf16_test();
752 std::cout << "---- utf8_to_ascii" << '\n'; 763 std::cout << "---- utf8_to_ascii" << '\n';