Commit 82497ea8cb3893459f642462fa2a0e3dd90e63c8
1 parent: 09020472
Refactor UTF detection: move logic from `QUtil` to `qpdf::util` and update references
Showing 4 changed files with 34 additions and 15 deletions
libqpdf/QPDF_String.cc
| @@ -2,6 +2,7 @@ | @@ -2,6 +2,7 @@ | ||
| 2 | 2 | ||
| 3 | #include <qpdf/QPDFObjectHandle_private.hh> | 3 | #include <qpdf/QPDFObjectHandle_private.hh> |
| 4 | #include <qpdf/QUtil.hh> | 4 | #include <qpdf/QUtil.hh> |
| 5 | +#include <qpdf/Util.hh> | ||
| 5 | 6 | ||
| 6 | // DO NOT USE ctype -- it is locale dependent for some things, and it's not worth the risk of | 7 | // DO NOT USE ctype -- it is locale dependent for some things, and it's not worth the risk of |
| 7 | // including it in case it may accidentally be used. | 8 | // including it in case it may accidentally be used. |
| @@ -9,7 +10,7 @@ | @@ -9,7 +10,7 @@ | ||
| 9 | static bool | 10 | static bool |
| 10 | is_iso_latin1_printable(char ch) | 11 | is_iso_latin1_printable(char ch) |
| 11 | { | 12 | { |
| 12 | - return (((ch >= 32) && (ch <= 126)) || (static_cast<unsigned char>(ch) >= 160)); | 13 | + return (ch >= 32 && ch <= 126) || static_cast<unsigned char>(ch) >= 160; |
| 13 | } | 14 | } |
| 14 | 15 | ||
| 15 | std::shared_ptr<QPDFObject> | 16 | std::shared_ptr<QPDFObject> |
| @@ -30,7 +31,7 @@ QPDF_String::writeJSON(int json_version, JSON::Writer& p) | @@ -30,7 +31,7 @@ QPDF_String::writeJSON(int json_version, JSON::Writer& p) | ||
| 30 | p << "\"" << JSON::Writer::encode_string(candidate) << "\""; | 31 | p << "\"" << JSON::Writer::encode_string(candidate) << "\""; |
| 31 | } else { | 32 | } else { |
| 32 | // See if we can unambiguously represent as Unicode. | 33 | // See if we can unambiguously represent as Unicode. |
| 33 | - if (QUtil::is_utf16(val) || QUtil::is_explicit_utf8(val)) { | 34 | + if (util::is_utf16(val) || util::is_explicit_utf8(val)) { |
| 34 | p << "\"u:" << JSON::Writer::encode_string(candidate) << "\""; | 35 | p << "\"u:" << JSON::Writer::encode_string(candidate) << "\""; |
| 35 | return; | 36 | return; |
| 36 | } else if (!useHexString()) { | 37 | } else if (!useHexString()) { |
| @@ -137,13 +138,13 @@ QPDF_String::unparse(bool force_binary) | @@ -137,13 +138,13 @@ QPDF_String::unparse(bool force_binary) | ||
| 137 | std::string | 138 | std::string |
| 138 | QPDF_String::getUTF8Val() const | 139 | QPDF_String::getUTF8Val() const |
| 139 | { | 140 | { |
| 140 | - if (QUtil::is_utf16(val)) { | 141 | + if (util::is_utf16(val)) { |
| 141 | return QUtil::utf16_to_utf8(val); | 142 | return QUtil::utf16_to_utf8(val); |
| 142 | - } else if (QUtil::is_explicit_utf8(val)) { | 143 | + } |
| 144 | + if (util::is_explicit_utf8(val)) { | ||
| 143 | // PDF 2.0 allows UTF-8 strings when explicitly prefixed with the three-byte representation | 145 | // PDF 2.0 allows UTF-8 strings when explicitly prefixed with the three-byte representation |
| 144 | // of U+FEFF. | 146 | // of U+FEFF. |
| 145 | return val.substr(3); | 147 | return val.substr(3); |
| 146 | - } else { | ||
| 147 | - return QUtil::pdf_doc_to_utf8(val); | ||
| 148 | } | 148 | } |
| 149 | + return QUtil::pdf_doc_to_utf8(val); | ||
| 149 | } | 150 | } |
libqpdf/QUtil.cc
| @@ -1688,19 +1688,13 @@ QUtil::utf8_to_pdf_doc(std::string const& utf8, std::string& pdfdoc, char unknow | @@ -1688,19 +1688,13 @@ QUtil::utf8_to_pdf_doc(std::string const& utf8, std::string& pdfdoc, char unknow | ||
| 1688 | bool | 1688 | bool |
| 1689 | QUtil::is_utf16(std::string const& val) | 1689 | QUtil::is_utf16(std::string const& val) |
| 1690 | { | 1690 | { |
| 1691 | - return ( | ||
| 1692 | - (val.length() >= 2) && | ||
| 1693 | - (((val.at(0) == '\xfe') && (val.at(1) == '\xff')) || | ||
| 1694 | - ((val.at(0) == '\xff') && (val.at(1) == '\xfe')))); | 1691 | + return util::is_utf16(val); |
| 1695 | } | 1692 | } |
| 1696 | 1693 | ||
| 1697 | bool | 1694 | bool |
| 1698 | QUtil::is_explicit_utf8(std::string const& val) | 1695 | QUtil::is_explicit_utf8(std::string const& val) |
| 1699 | { | 1696 | { |
| 1700 | - // QPDF_String.cc knows that this is a 3-byte sequence. | ||
| 1701 | - return ( | ||
| 1702 | - (val.length() >= 3) && (val.at(0) == '\xef') && (val.at(1) == '\xbb') && | ||
| 1703 | - (val.at(2) == '\xbf')); | 1697 | + return util::is_explicit_utf8(val); |
| 1704 | } | 1698 | } |
| 1705 | 1699 | ||
| 1706 | std::string | 1700 | std::string |
libqpdf/qpdf/Util.hh
| @@ -74,6 +74,19 @@ namespace qpdf::util | @@ -74,6 +74,19 @@ namespace qpdf::util | ||
| 74 | s.insert(0, 1, '1'); | 74 | s.insert(0, 1, '1'); |
| 75 | } | 75 | } |
| 76 | 76 | ||
| 77 | + inline bool | ||
| 78 | + is_utf16(std::string const& str) | ||
| 79 | + { | ||
| 80 | + return str.starts_with("\xfe\xff") || str.starts_with("\xff\xfe"); | ||
| 81 | + } | ||
| 82 | + | ||
| 83 | + inline bool | ||
| 84 | + is_explicit_utf8(std::string const& str) | ||
| 85 | + { | ||
| 86 | + // QPDF_String.cc knows that this is a 3-byte sequence. | ||
| 87 | + return str.starts_with("\xef\xbb\xbf"); | ||
| 88 | + } | ||
| 89 | + | ||
| 77 | std::string random_string(size_t len); | 90 | std::string random_string(size_t len); |
| 78 | 91 | ||
| 79 | } // namespace qpdf::util | 92 | } // namespace qpdf::util |
libtests/qutil.cc
| @@ -367,6 +367,16 @@ check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16) | @@ -367,6 +367,16 @@ check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16) | ||
| 367 | } | 367 | } |
| 368 | 368 | ||
| 369 | void | 369 | void |
| 370 | +explicit_utf8_test() | ||
| 371 | +{ | ||
| 372 | + assert(QUtil::is_explicit_utf8("\xef\xbb\xbfnot empty")); | ||
| 373 | + assert(QUtil::is_explicit_utf8("\xef\xbb\xbf")); | ||
| 374 | + assert(!QUtil::is_explicit_utf8("\xef\xbb\xbenot explicit")); | ||
| 375 | + assert(!QUtil::is_explicit_utf8("\xef\xbe\xbfnot explicit")); | ||
| 376 | + assert(!QUtil::is_explicit_utf8("\xee\xbb\xbfnot explicit")); | ||
| 377 | +} | ||
| 378 | + | ||
| 379 | +void | ||
| 370 | print_alternatives(std::string const& str) | 380 | print_alternatives(std::string const& str) |
| 371 | { | 381 | { |
| 372 | std::vector<std::string> result = QUtil::possible_repaired_encodings(str); | 382 | std::vector<std::string> result = QUtil::possible_repaired_encodings(str); |
| @@ -432,7 +442,7 @@ transcoding_test() | @@ -432,7 +442,7 @@ transcoding_test() | ||
| 432 | std::string other_to_utf8; | 442 | std::string other_to_utf8; |
| 433 | assert(!QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8)); | 443 | assert(!QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8)); |
| 434 | std::cout << other_to_utf8 << '\n'; | 444 | std::cout << other_to_utf8 << '\n'; |
| 435 | - std::cout << "done other characters" << '\n'; | 445 | + std::cout << "done other characters\n"; |
| 436 | // These valid UTF8 strings when converted to PDFDoc would end up | 446 | // These valid UTF8 strings when converted to PDFDoc would end up |
| 437 | // with a byte sequence that would be recognized as UTF-8 or | 447 | // with a byte sequence that would be recognized as UTF-8 or |
| 438 | // UTF-16 rather than PDFDoc. A special case is required to store | 448 | // UTF-16 rather than PDFDoc. A special case is required to store |
| @@ -747,6 +757,7 @@ main(int argc, char* argv[]) | @@ -747,6 +757,7 @@ main(int argc, char* argv[]) | ||
| 747 | getenv_test(); | 757 | getenv_test(); |
| 748 | std::cout << "---- utf8" << '\n'; | 758 | std::cout << "---- utf8" << '\n'; |
| 749 | to_utf8_test(); | 759 | to_utf8_test(); |
| 760 | + explicit_utf8_test(); | ||
| 750 | std::cout << "---- utf16" << '\n'; | 761 | std::cout << "---- utf16" << '\n'; |
| 751 | to_utf16_test(); | 762 | to_utf16_test(); |
| 752 | std::cout << "---- utf8_to_ascii" << '\n'; | 763 | std::cout << "---- utf8_to_ascii" << '\n'; |