Commit 8f389f14c0584861c712c049bdba3ed0d9036506

Authored by Jay Berkenbilt
1 parent 6817ca58

QUtil::analyze_encoding

ChangeLog
@@ -14,6 +14,13 @@ @@ -14,6 +14,13 @@
14 the first bug in qpdf's history that could result in silent loss 14 the first bug in qpdf's history that could result in silent loss
15 of data when processing a correct input file. Fixes #276. 15 of data when processing a correct input file. Fixes #276.
16 16
  17 +2019-01-13 Jay Berkenbilt <ejb@ql.org>
  18 +
  19 + * Add several more string transcoding and analysis methods to
  20 + QUtil for bidirectional conversion between PDF Doc, Win Ansi, Mac
  21 + Roman, UTF-8, and UTF-16 along with detection of valid UTF-8 and
  22 + UTF-16.
  23 +
17 2019-01-12 Jay Berkenbilt <ejb@ql.org> 24 2019-01-12 Jay Berkenbilt <ejb@ql.org>
18 25
19 * In the --pages option, allow the same page to be specified more 26 * In the --pages option, allow the same page to be specified more
include/qpdf/QUtil.hh
@@ -193,6 +193,20 @@ namespace QUtil @@ -193,6 +193,20 @@ namespace QUtil
193 QPDF_DLL 193 QPDF_DLL
194 std::string pdf_doc_to_utf8(std::string const& pdfdoc); 194 std::string pdf_doc_to_utf8(std::string const& pdfdoc);
195 195
  196 + // Analyze a string for encoding. We can't tell the difference
  197 + // between any single-byte encodings, and we can't tell for sure
  198 + // whether a string that happens to be valid UTF-8 isn't a
  199 + // different encoding, but we can at least tell a few things to
  200 + // help us guess. If there are no characters with the high bit
  201 + // set, has_8bit_chars is false, and the other values are also
  202 + // false, even though ASCII strings are valid UTF-8. is_valid_utf8
  203 + // means that the string is non-trivially valid UTF-8.
  204 + QPDF_DLL
  205 + void analyze_encoding(std::string const& str,
  206 + bool& has_8bit_chars,
  207 + bool& is_valid_utf8,
  208 + bool& is_utf16);
  209 +
196 // If secure random number generation is supported on your 210 // If secure random number generation is supported on your
197 // platform and qpdf was not compiled with insecure random number 211 // platform and qpdf was not compiled with insecure random number
198 // generation, this returns a cryptographically secure random 212 // generation, this returns a cryptographically secure random
libqpdf/QUtil.cc
@@ -1661,6 +1661,50 @@ encode_pdfdoc(unsigned long codepoint) @@ -1661,6 +1661,50 @@ encode_pdfdoc(unsigned long codepoint)
1661 return ch; 1661 return ch;
1662 } 1662 }
1663 1663
// Decode the UTF-8 sequence whose first byte is utf8_val[pos] and
// return its code point.
//
// utf8_val: the bytes to decode.
// pos:      in/out index. On entry it must be a valid index into
//           utf8_val (std::string::at throws std::out_of_range
//           otherwise). After a successful decode it is the index of
//           the LAST byte consumed, so a caller iterating with ++pos
//           lands on the start of the next sequence. If the lead byte
//           is invalid or the sequence is truncated, pos is left
//           unchanged. If a continuation byte is malformed, pos is
//           left on the last byte that was accepted so that the
//           offending byte is examined again as a potential lead byte.
// error:    set to false on success and to true whenever the bytes at
//           pos do not form a complete sequence.
//
// Returns the decoded code point, or U+FFFD (the replacement
// character) when error is set.
//
// Note: only the structure of the sequence is checked (lead byte
// followed by the right number of 10xxxxxx bytes, up to five
// continuation bytes). Overlong encodings and surrogate code points
// are not rejected here.
unsigned long get_next_utf8_codepoint(
    std::string const& utf8_val, size_t& pos, bool& error)
{
    size_t len = utf8_val.length();
    unsigned char ch = static_cast<unsigned char>(utf8_val.at(pos));
    error = false;
    if (ch < 128)
    {
        // Plain ASCII: a single byte that is its own code point.
        return static_cast<unsigned long>(ch);
    }

    // The number of consecutive one bits following the top bit of the
    // lead byte is the number of continuation bytes to expect. Those
    // bits are collected in to_clear so they can be masked off to
    // leave only the payload bits of the lead byte.
    size_t bytes_needed = 0;
    unsigned bit_check = 0x40;
    unsigned char to_clear = 0x80;
    while (ch & bit_check)
    {
        ++bytes_needed;
        to_clear = static_cast<unsigned char>(to_clear | bit_check);
        bit_check >>= 1;
    }
    // bytes_needed == 0 means a stray continuation byte (10xxxxxx);
    // more than 5 means 0xfe/0xff; the last test catches a sequence
    // that runs off the end of the string.
    if ((bytes_needed < 1) || (bytes_needed > 5) ||
        ((pos + bytes_needed) >= len))
    {
        error = true;
        return 0xfffd;
    }

    unsigned long codepoint = static_cast<unsigned long>(ch & ~to_clear);
    for (; bytes_needed > 0; --bytes_needed)
    {
        ch = static_cast<unsigned char>(utf8_val.at(++pos));
        if ((ch & 0xc0) != 0x80)
        {
            // Not a continuation byte. Back up so the caller will
            // look at this byte again, and report the failure:
            // without setting error, malformed input would be
            // indistinguishable from a validly encoded U+FFFD.
            --pos;
            error = true;
            return 0xfffd;
        }
        codepoint = (codepoint << 6) + (ch & 0x3f);
    }
    return codepoint;
}
  1707 +
1664 static std::string 1708 static std::string
1665 transcode_utf8(std::string const& utf8_val, encoding_e encoding, 1709 transcode_utf8(std::string const& utf8_val, encoding_e encoding,
1666 char unknown) 1710 char unknown)
@@ -1673,9 +1717,22 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding, @@ -1673,9 +1717,22 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
1673 size_t len = utf8_val.length(); 1717 size_t len = utf8_val.length();
1674 for (size_t i = 0; i < len; ++i) 1718 for (size_t i = 0; i < len; ++i)
1675 { 1719 {
1676 - unsigned char ch = static_cast<unsigned char>(utf8_val.at(i));  
1677 - if (ch < 128) 1720 + bool error = false;
  1721 + unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error);
  1722 + if (error)
  1723 + {
  1724 + if (encoding == e_utf16)
  1725 + {
  1726 + result += "\xff\xfd";
  1727 + }
  1728 + else
  1729 + {
  1730 + result.append(1, unknown);
  1731 + }
  1732 + }
  1733 + else if (codepoint < 128)
1678 { 1734 {
  1735 + char ch = static_cast<char>(codepoint);
1679 if (encoding == e_utf16) 1736 if (encoding == e_utf16)
1680 { 1737 {
1681 result += QUtil::toUTF16(ch); 1738 result += QUtil::toUTF16(ch);
@@ -1685,78 +1742,35 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding, @@ -1685,78 +1742,35 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
1685 result.append(1, ch); 1742 result.append(1, ch);
1686 } 1743 }
1687 } 1744 }
  1745 + else if (encoding == e_utf16)
  1746 + {
  1747 + result += QUtil::toUTF16(codepoint);
  1748 + }
  1749 + else if ((codepoint > 160) && (codepoint < 256) &&
  1750 + ((encoding == e_winansi) || (encoding == e_pdfdoc)))
  1751 + {
  1752 + result.append(1, static_cast<unsigned char>(codepoint & 0xff));
  1753 + }
1688 else 1754 else
1689 { 1755 {
1690 - size_t bytes_needed = 0;  
1691 - unsigned bit_check = 0x40;  
1692 - unsigned char to_clear = 0x80;  
1693 - while (ch & bit_check) 1756 + unsigned char ch = '\0';
  1757 + if (encoding == e_winansi)
1694 { 1758 {
1695 - ++bytes_needed;  
1696 - to_clear |= bit_check;  
1697 - bit_check >>= 1; 1759 + ch = encode_winansi(codepoint);
1698 } 1760 }
1699 -  
1700 - if (((bytes_needed > 5) || (bytes_needed < 1)) ||  
1701 - ((i + bytes_needed) >= len)) 1761 + else if (encoding == e_macroman)
1702 { 1762 {
1703 - if (encoding == e_utf16)  
1704 - {  
1705 - result += "\xff\xfd";  
1706 - }  
1707 - else  
1708 - {  
1709 - result.append(1, unknown);  
1710 - } 1763 + ch = encode_macroman(codepoint);
1711 } 1764 }
1712 - else 1765 + else if (encoding == e_pdfdoc)
1713 { 1766 {
1714 - unsigned long codepoint = (ch & ~to_clear);  
1715 - while (bytes_needed > 0)  
1716 - {  
1717 - --bytes_needed;  
1718 - ch = utf8_val.at(++i);  
1719 - if ((ch & 0xc0) != 0x80)  
1720 - {  
1721 - --i;  
1722 - codepoint = 0xfffd;  
1723 - break;  
1724 - }  
1725 - codepoint <<= 6;  
1726 - codepoint += (ch & 0x3f);  
1727 - }  
1728 - if (encoding == e_utf16)  
1729 - {  
1730 - result += QUtil::toUTF16(codepoint);  
1731 - }  
1732 - else if ((codepoint > 160) && (codepoint < 256) &&  
1733 - ((encoding == e_winansi) || (encoding == e_pdfdoc)))  
1734 - {  
1735 - ch = static_cast<unsigned char>(codepoint & 0xff);  
1736 - result.append(1, ch);  
1737 - }  
1738 - else  
1739 - {  
1740 - ch = '\0';  
1741 - if (encoding == e_winansi)  
1742 - {  
1743 - ch = encode_winansi(codepoint);  
1744 - }  
1745 - else if (encoding == e_macroman)  
1746 - {  
1747 - ch = encode_macroman(codepoint);  
1748 - }  
1749 - else if (encoding == e_pdfdoc)  
1750 - {  
1751 - ch = encode_pdfdoc(codepoint);  
1752 - }  
1753 - if (ch == '\0')  
1754 - {  
1755 - ch = static_cast<unsigned char>(unknown);  
1756 - }  
1757 - result.append(1, ch);  
1758 - } 1767 + ch = encode_pdfdoc(codepoint);
1759 } 1768 }
  1769 + if (ch == '\0')
  1770 + {
  1771 + ch = static_cast<unsigned char>(unknown);
  1772 + }
  1773 + result.append(1, ch);
1760 } 1774 }
1761 } 1775 }
1762 return result; 1776 return result;
@@ -1904,3 +1918,37 @@ QUtil::pdf_doc_to_utf8(std::string const& val) @@ -1904,3 +1918,37 @@ QUtil::pdf_doc_to_utf8(std::string const& val)
1904 } 1918 }
1905 return result; 1919 return result;
1906 } 1920 }
  1921 +
  1922 +void
  1923 +QUtil::analyze_encoding(std::string const& val,
  1924 + bool& has_8bit_chars,
  1925 + bool& is_valid_utf8,
  1926 + bool& is_utf16)
  1927 +{
  1928 + has_8bit_chars = is_utf16 = is_valid_utf8 = false;
  1929 + if (QUtil::is_utf16(val))
  1930 + {
  1931 + has_8bit_chars = true;
  1932 + is_utf16 = true;
  1933 + return;
  1934 + }
  1935 + size_t len = val.length();
  1936 + bool any_errors = false;
  1937 + for (size_t i = 0; i < len; ++i)
  1938 + {
  1939 + bool error = false;
  1940 + unsigned long codepoint = get_next_utf8_codepoint(val, i, error);
  1941 + if (error)
  1942 + {
  1943 + any_errors = true;
  1944 + }
  1945 + if (codepoint >= 128)
  1946 + {
  1947 + has_8bit_chars = true;
  1948 + }
  1949 + }
  1950 + if (has_8bit_chars && (! any_errors))
  1951 + {
  1952 + is_valid_utf8 = true;
  1953 + }
  1954 +}
libtests/qtest/qutil/qutil.out
@@ -57,6 +57,7 @@ HAGOOGAMAGOOGLE: 0 @@ -57,6 +57,7 @@ HAGOOGAMAGOOGLE: 0
57 bidirectional pdf doc done 57 bidirectional pdf doc done
58 bidirectional win ansi done 58 bidirectional win ansi done
59 bidirectional mac roman done 59 bidirectional mac roman done
  60 +analysis done
60 ---- whoami 61 ---- whoami
61 quack1 62 quack1
62 quack2 63 quack2
libtests/qutil.cc
@@ -262,6 +262,20 @@ void transcoding_test(std::string (*to_utf8)(std::string const&), @@ -262,6 +262,20 @@ void transcoding_test(std::string (*to_utf8)(std::string const&),
262 } 262 }
263 } 263 }
264 264
  265 +void check_analyze(std::string const& str, bool has8bit, bool utf8, bool utf16)
  266 +{
  267 + bool has_8bit_chars = false;
  268 + bool is_valid_utf8 = false;
  269 + bool is_utf16 = false;
  270 + QUtil::analyze_encoding(str, has_8bit_chars, is_valid_utf8, is_utf16);
  271 + if (! ((has_8bit_chars == has8bit) &&
  272 + (is_valid_utf8 == utf8) &&
  273 + (is_utf16 == utf16)))
  274 + {
  275 + std::cout << "analysis failed: " << str << std::endl;
  276 + }
  277 +}
  278 +
265 void transcoding_test() 279 void transcoding_test()
266 { 280 {
267 transcoding_test(&QUtil::pdf_doc_to_utf8, 281 transcoding_test(&QUtil::pdf_doc_to_utf8,
@@ -273,6 +287,11 @@ void transcoding_test() @@ -273,6 +287,11 @@ void transcoding_test()
273 transcoding_test(&QUtil::mac_roman_to_utf8, 287 transcoding_test(&QUtil::mac_roman_to_utf8,
274 &QUtil::utf8_to_mac_roman, 255, "?"); 288 &QUtil::utf8_to_mac_roman, 255, "?");
275 std::cout << "bidirectional mac roman done" << std::endl; 289 std::cout << "bidirectional mac roman done" << std::endl;
  290 + check_analyze("pi = \317\200", true, true, false);
  291 + check_analyze("pi != \317", true, false, false);
  292 + check_analyze("pi != 22/7", false, false, false);
  293 + check_analyze(std::string("\xfe\xff\00\x51", 4), true, false, true);
  294 + std::cout << "analysis done" << std::endl;
276 } 295 }
277 296
278 void print_whoami(char const* str) 297 void print_whoami(char const* str)