Commit 82497ea8cb3893459f642462fa2a0e3dd90e63c8
1 parent: 09020472
Refactor UTF detection: move logic from `QUtil` to `qpdf::util` and update references
Showing 4 changed files with 34 additions and 15 deletions
libqpdf/QPDF_String.cc
| ... | ... | @@ -2,6 +2,7 @@ |
| 2 | 2 | |
| 3 | 3 | #include <qpdf/QPDFObjectHandle_private.hh> |
| 4 | 4 | #include <qpdf/QUtil.hh> |
| 5 | +#include <qpdf/Util.hh> | |
| 5 | 6 | |
| 6 | 7 | // DO NOT USE ctype -- it is locale dependent for some things, and it's not worth the risk of |
| 7 | 8 | // including it in case it may accidentally be used. |
| ... | ... | @@ -9,7 +10,7 @@ |
| 9 | 10 | static bool |
| 10 | 11 | is_iso_latin1_printable(char ch) |
| 11 | 12 | { |
| 12 | - return (((ch >= 32) && (ch <= 126)) || (static_cast<unsigned char>(ch) >= 160)); | |
| 13 | + return (ch >= 32 && ch <= 126) || static_cast<unsigned char>(ch) >= 160; | |
| 13 | 14 | } |
| 14 | 15 | |
| 15 | 16 | std::shared_ptr<QPDFObject> |
| ... | ... | @@ -30,7 +31,7 @@ QPDF_String::writeJSON(int json_version, JSON::Writer& p) |
| 30 | 31 | p << "\"" << JSON::Writer::encode_string(candidate) << "\""; |
| 31 | 32 | } else { |
| 32 | 33 | // See if we can unambiguously represent as Unicode. |
| 33 | - if (QUtil::is_utf16(val) || QUtil::is_explicit_utf8(val)) { | |
| 34 | + if (util::is_utf16(val) || util::is_explicit_utf8(val)) { | |
| 34 | 35 | p << "\"u:" << JSON::Writer::encode_string(candidate) << "\""; |
| 35 | 36 | return; |
| 36 | 37 | } else if (!useHexString()) { |
| ... | ... | @@ -137,13 +138,13 @@ QPDF_String::unparse(bool force_binary) |
| 137 | 138 | std::string |
| 138 | 139 | QPDF_String::getUTF8Val() const |
| 139 | 140 | { |
| 140 | - if (QUtil::is_utf16(val)) { | |
| 141 | + if (util::is_utf16(val)) { | |
| 141 | 142 | return QUtil::utf16_to_utf8(val); |
| 142 | - } else if (QUtil::is_explicit_utf8(val)) { | |
| 143 | + } | |
| 144 | + if (util::is_explicit_utf8(val)) { | |
| 143 | 145 | // PDF 2.0 allows UTF-8 strings when explicitly prefixed with the three-byte representation |
| 144 | 146 | // of U+FEFF. |
| 145 | 147 | return val.substr(3); |
| 146 | - } else { | |
| 147 | - return QUtil::pdf_doc_to_utf8(val); | |
| 148 | 148 | } |
| 149 | + return QUtil::pdf_doc_to_utf8(val); | |
| 149 | 150 | } | ... | ... |
libqpdf/QUtil.cc
| ... | ... | @@ -1688,19 +1688,13 @@ QUtil::utf8_to_pdf_doc(std::string const& utf8, std::string& pdfdoc, char unknow |
| 1688 | 1688 | bool |
| 1689 | 1689 | QUtil::is_utf16(std::string const& val) |
| 1690 | 1690 | { |
| 1691 | - return ( | |
| 1692 | - (val.length() >= 2) && | |
| 1693 | - (((val.at(0) == '\xfe') && (val.at(1) == '\xff')) || | |
| 1694 | - ((val.at(0) == '\xff') && (val.at(1) == '\xfe')))); | |
| 1691 | + return util::is_utf16(val); | |
| 1695 | 1692 | } |
| 1696 | 1693 | |
| 1697 | 1694 | bool |
| 1698 | 1695 | QUtil::is_explicit_utf8(std::string const& val) |
| 1699 | 1696 | { |
| 1700 | - // QPDF_String.cc knows that this is a 3-byte sequence. | |
| 1701 | - return ( | |
| 1702 | - (val.length() >= 3) && (val.at(0) == '\xef') && (val.at(1) == '\xbb') && | |
| 1703 | - (val.at(2) == '\xbf')); | |
| 1697 | + return util::is_explicit_utf8(val); | |
| 1704 | 1698 | } |
| 1705 | 1699 | |
| 1706 | 1700 | std::string | ... | ... |
libqpdf/qpdf/Util.hh
| ... | ... | @@ -74,6 +74,19 @@ namespace qpdf::util |
| 74 | 74 | s.insert(0, 1, '1'); |
| 75 | 75 | } |
| 76 | 76 | |
| 77 | + inline bool | |
| 78 | + is_utf16(std::string const& str) | |
| 79 | + { | |
| 80 | + return str.starts_with("\xfe\xff") || str.starts_with("\xff\xfe"); | |
| 81 | + } | |
| 82 | + | |
| 83 | + inline bool | |
| 84 | + is_explicit_utf8(std::string const& str) | |
| 85 | + { | |
| 86 | + // QPDF_String.cc knows that this is a 3-byte sequence. | |
| 87 | + return str.starts_with("\xef\xbb\xbf"); | |
| 88 | + } | |
| 89 | + | |
| 77 | 90 | std::string random_string(size_t len); |
| 78 | 91 | |
| 79 | 92 | } // namespace qpdf::util | ... | ... |
libtests/qutil.cc
| ... | ... | @@ -367,6 +367,16 @@ check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16) |
| 367 | 367 | } |
| 368 | 368 | |
| 369 | 369 | void |
| 370 | +explicit_utf8_test() | |
| 371 | +{ | |
| 372 | + assert(QUtil::is_explicit_utf8("\xef\xbb\xbfnot empty")); | |
| 373 | + assert(QUtil::is_explicit_utf8("\xef\xbb\xbf")); | |
| 374 | + assert(!QUtil::is_explicit_utf8("\xef\xbb\xbenot explicit")); | |
| 375 | + assert(!QUtil::is_explicit_utf8("\xef\xbe\xbfnot explicit")); | |
| 376 | + assert(!QUtil::is_explicit_utf8("\xee\xbb\xbfnot explicit")); | |
| 377 | +} | |
| 378 | + | |
| 379 | +void | |
| 370 | 380 | print_alternatives(std::string const& str) |
| 371 | 381 | { |
| 372 | 382 | std::vector<std::string> result = QUtil::possible_repaired_encodings(str); |
| ... | ... | @@ -432,7 +442,7 @@ transcoding_test() |
| 432 | 442 | std::string other_to_utf8; |
| 433 | 443 | assert(!QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8)); |
| 434 | 444 | std::cout << other_to_utf8 << '\n'; |
| 435 | - std::cout << "done other characters" << '\n'; | |
| 445 | + std::cout << "done other characters\n"; | |
| 436 | 446 | // These valid UTF8 strings when converted to PDFDoc would end up |
| 437 | 447 | // with a byte sequence that would be recognized as UTF-8 or |
| 438 | 448 | // UTF-16 rather than PDFDoc. A special case is required to store |
| ... | ... | @@ -747,6 +757,7 @@ main(int argc, char* argv[]) |
| 747 | 757 | getenv_test(); |
| 748 | 758 | std::cout << "---- utf8" << '\n'; |
| 749 | 759 | to_utf8_test(); |
| 760 | + explicit_utf8_test(); | |
| 750 | 761 | std::cout << "---- utf16" << '\n'; |
| 751 | 762 | to_utf16_test(); |
| 752 | 763 | std::cout << "---- utf8_to_ascii" << '\n'; | ... | ... |