From 82497ea8cb3893459f642462fa2a0e3dd90e63c8 Mon Sep 17 00:00:00 2001 From: m-holger Date: Sat, 1 Nov 2025 13:10:24 +0000 Subject: [PATCH] Refactor UTF detection: move logic from `QUtil` to `qpdf::util` and update references --- libqpdf/QPDF_String.cc | 13 +++++++------ libqpdf/QUtil.cc | 10 ++-------- libqpdf/qpdf/Util.hh | 13 +++++++++++++ libtests/qutil.cc | 13 ++++++++++++- 4 files changed, 34 insertions(+), 15 deletions(-) diff --git a/libqpdf/QPDF_String.cc b/libqpdf/QPDF_String.cc index ffdcaab..568abb3 100644 --- a/libqpdf/QPDF_String.cc +++ b/libqpdf/QPDF_String.cc @@ -2,6 +2,7 @@ #include #include +#include <qpdf/Util.hh> // DO NOT USE ctype -- it is locale dependent for some things, and it's not worth the risk of // including it in case it may accidentally be used. @@ -9,7 +10,7 @@ static bool is_iso_latin1_printable(char ch) { - return (((ch >= 32) && (ch <= 126)) || (static_cast<unsigned char>(ch) >= 160)); + return (ch >= 32 && ch <= 126) || static_cast<unsigned char>(ch) >= 160; } std::shared_ptr @@ -30,7 +31,7 @@ QPDF_String::writeJSON(int json_version, JSON::Writer& p) p << "\"" << JSON::Writer::encode_string(candidate) << "\""; } else { // See if we can unambiguously represent as Unicode. - if (QUtil::is_utf16(val) || QUtil::is_explicit_utf8(val)) { + if (util::is_utf16(val) || util::is_explicit_utf8(val)) { p << "\"u:" << JSON::Writer::encode_string(candidate) << "\""; return; } else if (!useHexString()) { @@ -137,13 +138,13 @@ QPDF_String::unparse(bool force_binary) std::string QPDF_String::getUTF8Val() const { - if (QUtil::is_utf16(val)) { + if (util::is_utf16(val)) { return QUtil::utf16_to_utf8(val); - } else if (QUtil::is_explicit_utf8(val)) { + } + if (util::is_explicit_utf8(val)) { // PDF 2.0 allows UTF-8 strings when explicitly prefixed with the three-byte representation // of U+FEFF. 
return val.substr(3); - } else { - return QUtil::pdf_doc_to_utf8(val); } + return QUtil::pdf_doc_to_utf8(val); } diff --git a/libqpdf/QUtil.cc b/libqpdf/QUtil.cc index c235133..c879941 100644 --- a/libqpdf/QUtil.cc +++ b/libqpdf/QUtil.cc @@ -1688,19 +1688,13 @@ QUtil::utf8_to_pdf_doc(std::string const& utf8, std::string& pdfdoc, char unknow bool QUtil::is_utf16(std::string const& val) { - return ( - (val.length() >= 2) && - (((val.at(0) == '\xfe') && (val.at(1) == '\xff')) || - ((val.at(0) == '\xff') && (val.at(1) == '\xfe')))); + return util::is_utf16(val); } bool QUtil::is_explicit_utf8(std::string const& val) { - // QPDF_String.cc knows that this is a 3-byte sequence. - return ( - (val.length() >= 3) && (val.at(0) == '\xef') && (val.at(1) == '\xbb') && - (val.at(2) == '\xbf')); + return util::is_explicit_utf8(val); } std::string diff --git a/libqpdf/qpdf/Util.hh b/libqpdf/qpdf/Util.hh index d812fdf..ef480c7 100644 --- a/libqpdf/qpdf/Util.hh +++ b/libqpdf/qpdf/Util.hh @@ -74,6 +74,19 @@ namespace qpdf::util s.insert(0, 1, '1'); } + inline bool + is_utf16(std::string const& str) + { + return str.starts_with("\xfe\xff") || str.starts_with("\xff\xfe"); + } + + inline bool + is_explicit_utf8(std::string const& str) + { + // QPDF_String.cc knows that this is a 3-byte sequence. 
+ return str.starts_with("\xef\xbb\xbf"); + } + std::string random_string(size_t len); } // namespace qpdf::util diff --git a/libtests/qutil.cc b/libtests/qutil.cc index 30dcd1e..62cd9df 100644 --- a/libtests/qutil.cc +++ b/libtests/qutil.cc @@ -367,6 +367,16 @@ check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16) } void +explicit_utf8_test() +{ + assert(QUtil::is_explicit_utf8("\xef\xbb\xbfnot empty")); + assert(QUtil::is_explicit_utf8("\xef\xbb\xbf")); + assert(!QUtil::is_explicit_utf8("\xef\xbb\xbenot explicit")); + assert(!QUtil::is_explicit_utf8("\xef\xbe\xbfnot explicit")); + assert(!QUtil::is_explicit_utf8("\xee\xbb\xbfnot explicit")); +} + +void print_alternatives(std::string const& str) { std::vector<std::string> result = QUtil::possible_repaired_encodings(str); @@ -432,7 +442,7 @@ transcoding_test() std::string other_to_utf8; assert(!QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8)); std::cout << other_to_utf8 << '\n'; - std::cout << "done other characters" << '\n'; + std::cout << "done other characters\n"; // These valid UTF8 strings when converted to PDFDoc would end up // with a byte sequence that would be recognized as UTF-8 or // UTF-16 rather than PDFDoc. A special case is required to store @@ -747,6 +757,7 @@ main(int argc, char* argv[]) getenv_test(); std::cout << "---- utf8" << '\n'; to_utf8_test(); + explicit_utf8_test(); std::cout << "---- utf16" << '\n'; to_utf16_test(); std::cout << "---- utf8_to_ascii" << '\n'; -- libgit2 0.21.4