Commit 8f389f14c0584861c712c049bdba3ed0d9036506

Authored by Jay Berkenbilt
1 parent 6817ca58

QUtil::analyze_encoding

ChangeLog
... ... @@ -14,6 +14,13 @@
14 14 the first bug in qpdf's history that could result in silent loss
15 15 of data when processing a correct input file. Fixes #276.
16 16  
  17 +2019-01-13 Jay Berkenbilt <ejb@ql.org>
  18 +
  19 + * Add several more string transcoding and analysis methods to
  20 + QUtil for bidirectional conversion between PDF Doc, Win Ansi, Mac
  21 + Roman, UTF-8, and UTF-16 along with detection of valid UTF-8 and
  22 + UTF-16.
  23 +
17 24 2019-01-12 Jay Berkenbilt <ejb@ql.org>
18 25  
19 26 * In the --pages option, allow the same page to be specified more
... ...
include/qpdf/QUtil.hh
... ... @@ -193,6 +193,20 @@ namespace QUtil
193 193 QPDF_DLL
194 194 std::string pdf_doc_to_utf8(std::string const& pdfdoc);
195 195  
  196 + // Analyze a string for encoding. We can't tell the difference
  197 + // between any single-byte encodings, and we can't tell for sure
  198 + // whether a string that happens to be valid UTF-8 isn't a
  199 + // different encoding, but we can at least tell a few things to
  200 + // help us guess. If there are no characters with the high bit
  201 + // set, has_8bit_chars is false, and the other values are also
  202 + // false, even though ASCII strings are valid UTF-8. is_valid_utf8
  203 + // means that the string is non-trivially valid UTF-8.
  204 + QPDF_DLL
  205 + void analyze_encoding(std::string const& str,
  206 + bool& has_8bit_chars,
  207 + bool& is_valid_utf8,
  208 + bool& is_utf16);
  209 +
196 210 // If secure random number generation is supported on your
197 211 // platform and qpdf was not compiled with insecure random number
198 212 // generation, this returns a cryptographically secure random
... ...
libqpdf/QUtil.cc
... ... @@ -1661,6 +1661,50 @@ encode_pdfdoc(unsigned long codepoint)
1661 1661 return ch;
1662 1662 }
1663 1663  
  1664 +unsigned long get_next_utf8_codepoint(
  1665 + std::string const& utf8_val, size_t& pos, bool& error)
  1666 +{
  1667 + size_t len = utf8_val.length();
  1668 + unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos));
  1669 + error = false;
  1670 + if (ch < 128)
  1671 + {
  1672 + return static_cast<unsigned long>(ch);
  1673 + }
  1674 +
  1675 + size_t bytes_needed = 0;
  1676 + unsigned bit_check = 0x40;
  1677 + unsigned char to_clear = 0x80;
  1678 + while (ch & bit_check)
  1679 + {
  1680 + ++bytes_needed;
  1681 + to_clear |= bit_check;
  1682 + bit_check >>= 1;
  1683 + }
  1684 + if (((bytes_needed > 5) || (bytes_needed < 1)) ||
  1685 + ((pos + bytes_needed) >= len))
  1686 + {
  1687 + error = true;
  1688 + return 0xfffd;
  1689 + }
  1690 +
  1691 + unsigned long codepoint = (ch & ~to_clear);
  1692 + while (bytes_needed > 0)
  1693 + {
  1694 + --bytes_needed;
  1695 + ch = utf8_val.at(++pos);
  1696 + if ((ch & 0xc0) != 0x80)
  1697 + {
  1698 + --pos;
  1699 + codepoint = 0xfffd;
  1700 + break;
  1701 + }
  1702 + codepoint <<= 6;
  1703 + codepoint += (ch & 0x3f);
  1704 + }
  1705 + return codepoint;
  1706 +}
  1707 +
1664 1708 static std::string
1665 1709 transcode_utf8(std::string const& utf8_val, encoding_e encoding,
1666 1710 char unknown)
... ... @@ -1673,9 +1717,22 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
1673 1717 size_t len = utf8_val.length();
1674 1718 for (size_t i = 0; i < len; ++i)
1675 1719 {
1676   - unsigned char ch = static_cast<unsigned char>(utf8_val.at(i));
1677   - if (ch < 128)
  1720 + bool error = false;
  1721 + unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error);
  1722 + if (error)
  1723 + {
  1724 + if (encoding == e_utf16)
  1725 + {
  1726 + result += "\xff\xfd";
  1727 + }
  1728 + else
  1729 + {
  1730 + result.append(1, unknown);
  1731 + }
  1732 + }
  1733 + else if (codepoint < 128)
1678 1734 {
  1735 + char ch = static_cast<char>(codepoint);
1679 1736 if (encoding == e_utf16)
1680 1737 {
1681 1738 result += QUtil::toUTF16(ch);
... ... @@ -1685,78 +1742,35 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
1685 1742 result.append(1, ch);
1686 1743 }
1687 1744 }
  1745 + else if (encoding == e_utf16)
  1746 + {
  1747 + result += QUtil::toUTF16(codepoint);
  1748 + }
  1749 + else if ((codepoint > 160) && (codepoint < 256) &&
  1750 + ((encoding == e_winansi) || (encoding == e_pdfdoc)))
  1751 + {
  1752 + result.append(1, static_cast<unsigned char>(codepoint & 0xff));
  1753 + }
1688 1754 else
1689 1755 {
1690   - size_t bytes_needed = 0;
1691   - unsigned bit_check = 0x40;
1692   - unsigned char to_clear = 0x80;
1693   - while (ch & bit_check)
  1756 + unsigned char ch = '\0';
  1757 + if (encoding == e_winansi)
1694 1758 {
1695   - ++bytes_needed;
1696   - to_clear |= bit_check;
1697   - bit_check >>= 1;
  1759 + ch = encode_winansi(codepoint);
1698 1760 }
1699   -
1700   - if (((bytes_needed > 5) || (bytes_needed < 1)) ||
1701   - ((i + bytes_needed) >= len))
  1761 + else if (encoding == e_macroman)
1702 1762 {
1703   - if (encoding == e_utf16)
1704   - {
1705   - result += "\xff\xfd";
1706   - }
1707   - else
1708   - {
1709   - result.append(1, unknown);
1710   - }
  1763 + ch = encode_macroman(codepoint);
1711 1764 }
1712   - else
  1765 + else if (encoding == e_pdfdoc)
1713 1766 {
1714   - unsigned long codepoint = (ch & ~to_clear);
1715   - while (bytes_needed > 0)
1716   - {
1717   - --bytes_needed;
1718   - ch = utf8_val.at(++i);
1719   - if ((ch & 0xc0) != 0x80)
1720   - {
1721   - --i;
1722   - codepoint = 0xfffd;
1723   - break;
1724   - }
1725   - codepoint <<= 6;
1726   - codepoint += (ch & 0x3f);
1727   - }
1728   - if (encoding == e_utf16)
1729   - {
1730   - result += QUtil::toUTF16(codepoint);
1731   - }
1732   - else if ((codepoint > 160) && (codepoint < 256) &&
1733   - ((encoding == e_winansi) || (encoding == e_pdfdoc)))
1734   - {
1735   - ch = static_cast<unsigned char>(codepoint & 0xff);
1736   - result.append(1, ch);
1737   - }
1738   - else
1739   - {
1740   - ch = '\0';
1741   - if (encoding == e_winansi)
1742   - {
1743   - ch = encode_winansi(codepoint);
1744   - }
1745   - else if (encoding == e_macroman)
1746   - {
1747   - ch = encode_macroman(codepoint);
1748   - }
1749   - else if (encoding == e_pdfdoc)
1750   - {
1751   - ch = encode_pdfdoc(codepoint);
1752   - }
1753   - if (ch == '\0')
1754   - {
1755   - ch = static_cast<unsigned char>(unknown);
1756   - }
1757   - result.append(1, ch);
1758   - }
  1767 + ch = encode_pdfdoc(codepoint);
1759 1768 }
  1769 + if (ch == '\0')
  1770 + {
  1771 + ch = static_cast<unsigned char>(unknown);
  1772 + }
  1773 + result.append(1, ch);
1760 1774 }
1761 1775 }
1762 1776 return result;
... ... @@ -1904,3 +1918,37 @@ QUtil::pdf_doc_to_utf8(std::string const& val)
1904 1918 }
1905 1919 return result;
1906 1920 }
  1921 +
  1922 +void
  1923 +QUtil::analyze_encoding(std::string const& val,
  1924 + bool& has_8bit_chars,
  1925 + bool& is_valid_utf8,
  1926 + bool& is_utf16)
  1927 +{
  1928 + has_8bit_chars = is_utf16 = is_valid_utf8 = false;
  1929 + if (QUtil::is_utf16(val))
  1930 + {
  1931 + has_8bit_chars = true;
  1932 + is_utf16 = true;
  1933 + return;
  1934 + }
  1935 + size_t len = val.length();
  1936 + bool any_errors = false;
  1937 + for (size_t i = 0; i < len; ++i)
  1938 + {
  1939 + bool error = false;
  1940 + unsigned long codepoint = get_next_utf8_codepoint(val, i, error);
  1941 + if (error)
  1942 + {
  1943 + any_errors = true;
  1944 + }
  1945 + if (codepoint >= 128)
  1946 + {
  1947 + has_8bit_chars = true;
  1948 + }
  1949 + }
  1950 + if (has_8bit_chars && (! any_errors))
  1951 + {
  1952 + is_valid_utf8 = true;
  1953 + }
  1954 +}
... ...
libtests/qtest/qutil/qutil.out
... ... @@ -57,6 +57,7 @@ HAGOOGAMAGOOGLE: 0
57 57 bidirectional pdf doc done
58 58 bidirectional win ansi done
59 59 bidirectional mac roman done
  60 +analysis done
60 61 ---- whoami
61 62 quack1
62 63 quack2
... ...
libtests/qutil.cc
... ... @@ -262,6 +262,20 @@ void transcoding_test(std::string (*to_utf8)(std::string const&),
262 262 }
263 263 }
264 264  
  265 +void check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16)
  266 +{
  267 + bool has_8bit_chars = false;
  268 + bool is_valid_utf8 = false;
  269 + bool is_utf16 = false;
  270 + QUtil::analyze_encoding(str, has_8bit_chars, is_valid_utf8, is_utf16);
  271 + if (! ((has_8bit_chars == has8bit) &&
  272 + (is_valid_utf8 == utf8) &&
  273 + (is_utf16 == utf16)))
  274 + {
  275 + std::cout << "analysis failed: " << str << std::endl;
  276 + }
  277 +}
  278 +
265 279 void transcoding_test()
266 280 {
267 281 transcoding_test(&QUtil::pdf_doc_to_utf8,
... ... @@ -273,6 +287,11 @@ void transcoding_test()
273 287 transcoding_test(&QUtil::mac_roman_to_utf8,
274 288 &QUtil::utf8_to_mac_roman, 255, "?");
275 289 std::cout << "bidirectional mac roman done" << std::endl;
  290 + check_analyze("pi = \317\200", true, true, false);
  291 + check_analyze("pi != \317", true, false, false);
  292 + check_analyze("pi != 22/7", false, false, false);
  293 + check_analyze(std::string("\xfe\xff\00\x51", 4), true, false, true);
  294 + std::cout << "analysis done" << std::endl;
276 295 }
277 296  
278 297 void print_whoami(char const* str)
... ...