Commit 8f389f14c0584861c712c049bdba3ed0d9036506
1 parent
6817ca58
QUtil::analyze_encoding
Showing
5 changed files
with
155 additions
and
66 deletions
ChangeLog
| ... | ... | @@ -14,6 +14,13 @@ |
| 14 | 14 | the first bug in qpdf's history that could result in silent loss |
| 15 | 15 | of data when processing a correct input file. Fixes #276. |
| 16 | 16 | |
| 17 | +2019-01-13 Jay Berkenbilt <ejb@ql.org> | |
| 18 | + | |
| 19 | + * Add several more string transcoding and analysis methods to | |
| 20 | + QUtil for bidirectional conversion between PDF Doc, Win Ansi, Mac | |
| 21 | + Roman, UTF-8, and UTF-16 along with detection of valid UTF-8 and | |
| 22 | + UTF-16. | |
| 23 | + | |
| 17 | 24 | 2019-01-12 Jay Berkenbilt <ejb@ql.org> |
| 18 | 25 | |
| 19 | 26 | * In the --pages option, allow the same page to be specified more | ... | ... |
include/qpdf/QUtil.hh
| ... | ... | @@ -193,6 +193,20 @@ namespace QUtil |
| 193 | 193 | QPDF_DLL |
| 194 | 194 | std::string pdf_doc_to_utf8(std::string const& pdfdoc); |
| 195 | 195 | |
| 196 | + // Analyze a string for encoding. We can't tell the difference | |
| 197 | + // between any single-byte encodings, and we can't tell for sure | |
| 198 | + // whether a string that happens to be valid UTF-8 isn't a | |
| 199 | + // different encoding, but we can at least tell a few things to | |
| 200 | + // help us guess. If there are no characters with the high bit | |
| 201 | + // set, has_8bit_chars is false, and the other values are also | |
| 202 | + // false, even though ASCII strings are valid UTF-8. is_valid_utf8 | |
| 203 | + // means that the string is non-trivially valid UTF-8. | |
| 204 | + QPDF_DLL | |
| 205 | + void analyze_encoding(std::string const& str, | |
| 206 | + bool& has_8bit_chars, | |
| 207 | + bool& is_valid_utf8, | |
| 208 | + bool& is_utf16); | |
| 209 | + | |
| 196 | 210 | // If secure random number generation is supported on your |
| 197 | 211 | // platform and qpdf was not compiled with insecure random number |
| 198 | 212 | // generation, this returns a cryptographically secure random | ... | ... |
libqpdf/QUtil.cc
| ... | ... | @@ -1661,6 +1661,50 @@ encode_pdfdoc(unsigned long codepoint) |
| 1661 | 1661 | return ch; |
| 1662 | 1662 | } |
| 1663 | 1663 | |
| 1664 | +unsigned long get_next_utf8_codepoint( | |
| 1665 | + std::string const& utf8_val, size_t& pos, bool& error) | |
| 1666 | +{ | |
| 1667 | + size_t len = utf8_val.length(); | |
| 1668 | + unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos)); | |
| 1669 | + error = false; | |
| 1670 | + if (ch < 128) | |
| 1671 | + { | |
| 1672 | + return static_cast<unsigned long>(ch); | |
| 1673 | + } | |
| 1674 | + | |
| 1675 | + size_t bytes_needed = 0; | |
| 1676 | + unsigned bit_check = 0x40; | |
| 1677 | + unsigned char to_clear = 0x80; | |
| 1678 | + while (ch & bit_check) | |
| 1679 | + { | |
| 1680 | + ++bytes_needed; | |
| 1681 | + to_clear |= bit_check; | |
| 1682 | + bit_check >>= 1; | |
| 1683 | + } | |
| 1684 | + if (((bytes_needed > 5) || (bytes_needed < 1)) || | |
| 1685 | + ((pos + bytes_needed) >= len)) | |
| 1686 | + { | |
| 1687 | + error = true; | |
| 1688 | + return 0xfffd; | |
| 1689 | + } | |
| 1690 | + | |
| 1691 | + unsigned long codepoint = (ch & ~to_clear); | |
| 1692 | + while (bytes_needed > 0) | |
| 1693 | + { | |
| 1694 | + --bytes_needed; | |
| 1695 | + ch = utf8_val.at(++pos); | |
| 1696 | + if ((ch & 0xc0) != 0x80) | |
| 1697 | + { | |
| 1698 | + --pos; | |
| 1699 | + codepoint = 0xfffd; | |
| 1700 | + break; | |
| 1701 | + } | |
| 1702 | + codepoint <<= 6; | |
| 1703 | + codepoint += (ch & 0x3f); | |
| 1704 | + } | |
| 1705 | + return codepoint; | |
| 1706 | +} | |
| 1707 | + | |
| 1664 | 1708 | static std::string |
| 1665 | 1709 | transcode_utf8(std::string const& utf8_val, encoding_e encoding, |
| 1666 | 1710 | char unknown) |
| ... | ... | @@ -1673,9 +1717,22 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding, |
| 1673 | 1717 | size_t len = utf8_val.length(); |
| 1674 | 1718 | for (size_t i = 0; i < len; ++i) |
| 1675 | 1719 | { |
| 1676 | - unsigned char ch = static_cast<unsigned char>(utf8_val.at(i)); | |
| 1677 | - if (ch < 128) | |
| 1720 | + bool error = false; | |
| 1721 | + unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error); | |
| 1722 | + if (error) | |
| 1723 | + { | |
| 1724 | + if (encoding == e_utf16) | |
| 1725 | + { | |
| 1726 | + result += "\xff\xfd"; | |
| 1727 | + } | |
| 1728 | + else | |
| 1729 | + { | |
| 1730 | + result.append(1, unknown); | |
| 1731 | + } | |
| 1732 | + } | |
| 1733 | + else if (codepoint < 128) | |
| 1678 | 1734 | { |
| 1735 | + char ch = static_cast<char>(codepoint); | |
| 1679 | 1736 | if (encoding == e_utf16) |
| 1680 | 1737 | { |
| 1681 | 1738 | result += QUtil::toUTF16(ch); |
| ... | ... | @@ -1685,78 +1742,35 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding, |
| 1685 | 1742 | result.append(1, ch); |
| 1686 | 1743 | } |
| 1687 | 1744 | } |
| 1745 | + else if (encoding == e_utf16) | |
| 1746 | + { | |
| 1747 | + result += QUtil::toUTF16(codepoint); | |
| 1748 | + } | |
| 1749 | + else if ((codepoint > 160) && (codepoint < 256) && | |
| 1750 | + ((encoding == e_winansi) || (encoding == e_pdfdoc))) | |
| 1751 | + { | |
| 1752 | + result.append(1, static_cast<unsigned char>(codepoint & 0xff)); | |
| 1753 | + } | |
| 1688 | 1754 | else |
| 1689 | 1755 | { |
| 1690 | - size_t bytes_needed = 0; | |
| 1691 | - unsigned bit_check = 0x40; | |
| 1692 | - unsigned char to_clear = 0x80; | |
| 1693 | - while (ch & bit_check) | |
| 1756 | + unsigned char ch = '\0'; | |
| 1757 | + if (encoding == e_winansi) | |
| 1694 | 1758 | { |
| 1695 | - ++bytes_needed; | |
| 1696 | - to_clear |= bit_check; | |
| 1697 | - bit_check >>= 1; | |
| 1759 | + ch = encode_winansi(codepoint); | |
| 1698 | 1760 | } |
| 1699 | - | |
| 1700 | - if (((bytes_needed > 5) || (bytes_needed < 1)) || | |
| 1701 | - ((i + bytes_needed) >= len)) | |
| 1761 | + else if (encoding == e_macroman) | |
| 1702 | 1762 | { |
| 1703 | - if (encoding == e_utf16) | |
| 1704 | - { | |
| 1705 | - result += "\xff\xfd"; | |
| 1706 | - } | |
| 1707 | - else | |
| 1708 | - { | |
| 1709 | - result.append(1, unknown); | |
| 1710 | - } | |
| 1763 | + ch = encode_macroman(codepoint); | |
| 1711 | 1764 | } |
| 1712 | - else | |
| 1765 | + else if (encoding == e_pdfdoc) | |
| 1713 | 1766 | { |
| 1714 | - unsigned long codepoint = (ch & ~to_clear); | |
| 1715 | - while (bytes_needed > 0) | |
| 1716 | - { | |
| 1717 | - --bytes_needed; | |
| 1718 | - ch = utf8_val.at(++i); | |
| 1719 | - if ((ch & 0xc0) != 0x80) | |
| 1720 | - { | |
| 1721 | - --i; | |
| 1722 | - codepoint = 0xfffd; | |
| 1723 | - break; | |
| 1724 | - } | |
| 1725 | - codepoint <<= 6; | |
| 1726 | - codepoint += (ch & 0x3f); | |
| 1727 | - } | |
| 1728 | - if (encoding == e_utf16) | |
| 1729 | - { | |
| 1730 | - result += QUtil::toUTF16(codepoint); | |
| 1731 | - } | |
| 1732 | - else if ((codepoint > 160) && (codepoint < 256) && | |
| 1733 | - ((encoding == e_winansi) || (encoding == e_pdfdoc))) | |
| 1734 | - { | |
| 1735 | - ch = static_cast<unsigned char>(codepoint & 0xff); | |
| 1736 | - result.append(1, ch); | |
| 1737 | - } | |
| 1738 | - else | |
| 1739 | - { | |
| 1740 | - ch = '\0'; | |
| 1741 | - if (encoding == e_winansi) | |
| 1742 | - { | |
| 1743 | - ch = encode_winansi(codepoint); | |
| 1744 | - } | |
| 1745 | - else if (encoding == e_macroman) | |
| 1746 | - { | |
| 1747 | - ch = encode_macroman(codepoint); | |
| 1748 | - } | |
| 1749 | - else if (encoding == e_pdfdoc) | |
| 1750 | - { | |
| 1751 | - ch = encode_pdfdoc(codepoint); | |
| 1752 | - } | |
| 1753 | - if (ch == '\0') | |
| 1754 | - { | |
| 1755 | - ch = static_cast<unsigned char>(unknown); | |
| 1756 | - } | |
| 1757 | - result.append(1, ch); | |
| 1758 | - } | |
| 1767 | + ch = encode_pdfdoc(codepoint); | |
| 1759 | 1768 | } |
| 1769 | + if (ch == '\0') | |
| 1770 | + { | |
| 1771 | + ch = static_cast<unsigned char>(unknown); | |
| 1772 | + } | |
| 1773 | + result.append(1, ch); | |
| 1760 | 1774 | } |
| 1761 | 1775 | } |
| 1762 | 1776 | return result; |
| ... | ... | @@ -1904,3 +1918,37 @@ QUtil::pdf_doc_to_utf8(std::string const& val) |
| 1904 | 1918 | } |
| 1905 | 1919 | return result; |
| 1906 | 1920 | } |
| 1921 | + | |
| 1922 | +void | |
| 1923 | +QUtil::analyze_encoding(std::string const& val, | |
| 1924 | + bool& has_8bit_chars, | |
| 1925 | + bool& is_valid_utf8, | |
| 1926 | + bool& is_utf16) | |
| 1927 | +{ | |
| 1928 | + has_8bit_chars = is_utf16 = is_valid_utf8 = false; | |
| 1929 | + if (QUtil::is_utf16(val)) | |
| 1930 | + { | |
| 1931 | + has_8bit_chars = true; | |
| 1932 | + is_utf16 = true; | |
| 1933 | + return; | |
| 1934 | + } | |
| 1935 | + size_t len = val.length(); | |
| 1936 | + bool any_errors = false; | |
| 1937 | + for (size_t i = 0; i < len; ++i) | |
| 1938 | + { | |
| 1939 | + bool error = false; | |
| 1940 | + unsigned long codepoint = get_next_utf8_codepoint(val, i, error); | |
| 1941 | + if (error) | |
| 1942 | + { | |
| 1943 | + any_errors = true; | |
| 1944 | + } | |
| 1945 | + if (codepoint >= 128) | |
| 1946 | + { | |
| 1947 | + has_8bit_chars = true; | |
| 1948 | + } | |
| 1949 | + } | |
| 1950 | + if (has_8bit_chars && (! any_errors)) | |
| 1951 | + { | |
| 1952 | + is_valid_utf8 = true; | |
| 1953 | + } | |
| 1954 | +} | ... | ... |
libtests/qtest/qutil/qutil.out
libtests/qutil.cc
| ... | ... | @@ -262,6 +262,20 @@ void transcoding_test(std::string (*to_utf8)(std::string const&), |
| 262 | 262 | } |
| 263 | 263 | } |
| 264 | 264 | |
| 265 | +void check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16) | |
| 266 | +{ | |
| 267 | + bool has_8bit_chars = false; | |
| 268 | + bool is_valid_utf8 = false; | |
| 269 | + bool is_utf16 = false; | |
| 270 | + QUtil::analyze_encoding(str, has_8bit_chars, is_valid_utf8, is_utf16); | |
| 271 | + if (! ((has_8bit_chars == has8bit) && | |
| 272 | + (is_valid_utf8 == utf8) && | |
| 273 | + (is_utf16 == utf16))) | |
| 274 | + { | |
| 275 | + std::cout << "analysis failed: " << str << std::endl; | |
| 276 | + } | |
| 277 | +} | |
| 278 | + | |
| 265 | 279 | void transcoding_test() |
| 266 | 280 | { |
| 267 | 281 | transcoding_test(&QUtil::pdf_doc_to_utf8, |
| ... | ... | @@ -273,6 +287,11 @@ void transcoding_test() |
| 273 | 287 | transcoding_test(&QUtil::mac_roman_to_utf8, |
| 274 | 288 | &QUtil::utf8_to_mac_roman, 255, "?"); |
| 275 | 289 | std::cout << "bidirectional mac roman done" << std::endl; |
| 290 | + check_analyze("pi = \317\200", true, true, false); | |
| 291 | + check_analyze("pi != \317", true, false, false); | |
| 292 | + check_analyze("pi != 22/7", false, false, false); | |
| 293 | + check_analyze(std::string("\xfe\xff\00\x51", 4), true, false, true); | |
| 294 | + std::cout << "analysis done" << std::endl; | |
| 276 | 295 | } |
| 277 | 296 | |
| 278 | 297 | void print_whoami(char const* str) | ... | ... |