Commit 463037773177eb616f2dfd9a58f039b3eebd192c

Authored by Jay Berkenbilt
1 parent 429ffcf3

Add status-reporting transcoders to QUtil

ChangeLog
... ... @@ -14,6 +14,11 @@
14 14 the first bug in qpdf's history that could result in silent loss
15 15 of data when processing a correct input file. Fixes #276.
16 16  
  17 +2019-01-14 Jay Berkenbilt <ejb@ql.org>
  18 +
  19 + * Add versions of utf8 to single-byte character transcoders that
  20 + return a success code.
  21 +
17 22 2019-01-13 Jay Berkenbilt <ejb@ql.org>
18 23  
19 24 * Add several more string transcoding and analysis methods to
... ...
include/qpdf/QUtil.hh
... ... @@ -178,6 +178,22 @@ namespace QUtil
178 178 std::string utf8_to_pdf_doc(
179 179 std::string const& utf8, char unknown_char = '?');
180 180  
  181 + // These versions return true if the conversion was successful and
  182 + // false if any unrepresentable characters were found and had to
  183 + // be substituted with the unknown character.
  184 + QPDF_DLL
  185 + bool utf8_to_ascii(
  186 + std::string const& utf8, std::string& ascii, char unknown_char = '?');
  187 + QPDF_DLL
  188 + bool utf8_to_win_ansi(
  189 + std::string const& utf8, std::string& win, char unknown_char = '?');
  190 + QPDF_DLL
  191 + bool utf8_to_mac_roman(
  192 + std::string const& utf8, std::string& mac, char unknown_char = '?');
  193 + QPDF_DLL
  194 + bool utf8_to_pdf_doc(
  195 + std::string const& utf8, std::string& pdfdoc, char unknown_char = '?');
  196 +
181 197 // Convert a UTF-16 big-endian encoded string to UTF-8.
182 198 // Unrepresentable code points are converted to U+FFFD.
183 199 QPDF_DLL
... ...
libqpdf/QUtil.cc
... ... @@ -1705,11 +1705,12 @@ unsigned long get_next_utf8_codepoint(
1705 1705 return codepoint;
1706 1706 }
1707 1707  
1708   -static std::string
1709   -transcode_utf8(std::string const& utf8_val, encoding_e encoding,
1710   - char unknown)
  1708 +static bool
  1709 +transcode_utf8(std::string const& utf8_val, std::string& result,
  1710 + encoding_e encoding, char unknown)
1711 1711 {
1712   - std::string result;
  1712 + bool okay = true;
  1713 + result.clear();
1713 1714 if (encoding == e_utf16)
1714 1715 {
1715 1716 result += "\xfe\xff";
... ... @@ -1721,6 +1722,7 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
1721 1722 unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error);
1722 1723 if (error)
1723 1724 {
  1725 + okay = false;
1724 1726 if (encoding == e_utf16)
1725 1727 {
1726 1728 result += "\xff\xfd";
... ... @@ -1768,11 +1770,21 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
1768 1770 }
1769 1771 if (ch == '\0')
1770 1772 {
  1773 + okay = false;
1771 1774 ch = static_cast<unsigned char>(unknown);
1772 1775 }
1773 1776 result.append(1, ch);
1774 1777 }
1775 1778 }
  1779 + return okay;
  1780 +}
  1781 +
  1782 +static std::string
  1783 +transcode_utf8(std::string const& utf8_val, encoding_e encoding,
  1784 + char unknown)
  1785 +{
  1786 + std::string result;
  1787 + transcode_utf8(utf8_val, result, encoding, unknown);
1776 1788 return result;
1777 1789 }
1778 1790  
... ... @@ -1807,6 +1819,34 @@ QUtil::utf8_to_pdf_doc(std::string const& utf8, char unknown_char)
1807 1819 }
1808 1820  
1809 1821 bool
  1822 +QUtil::utf8_to_ascii(std::string const& utf8, std::string& ascii,
  1823 + char unknown_char)
  1824 +{
  1825 + return transcode_utf8(utf8, ascii, e_ascii, unknown_char);
  1826 +}
  1827 +
  1828 +bool
  1829 +QUtil::utf8_to_win_ansi(std::string const& utf8, std::string& win,
  1830 + char unknown_char)
  1831 +{
  1832 + return transcode_utf8(utf8, win, e_winansi, unknown_char);
  1833 +}
  1834 +
  1835 +bool
  1836 +QUtil::utf8_to_mac_roman(std::string const& utf8, std::string& mac,
  1837 + char unknown_char)
  1838 +{
  1839 + return transcode_utf8(utf8, mac, e_macroman, unknown_char);
  1840 +}
  1841 +
  1842 +bool
  1843 +QUtil::utf8_to_pdf_doc(std::string const& utf8, std::string& pdfdoc,
  1844 + char unknown_char)
  1845 +{
  1846 + return transcode_utf8(utf8, pdfdoc, e_pdfdoc, unknown_char);
  1847 +}
  1848 +
  1849 +bool
1810 1850 QUtil::is_utf16(std::string const& val)
1811 1851 {
1812 1852 return ((val.length() >= 2) &&
... ...
libtests/qutil.cc
... ... @@ -292,6 +292,22 @@ void transcoding_test()
292 292 check_analyze("pi != 22/7", false, false, false);
293 293 check_analyze(std::string("\xfe\xff\00\x51", 4), true, false, true);
294 294 std::cout << "analysis done" << std::endl;
  295 + std::string input1("a\302\277b");
  296 + std::string input2("a\317\200b");
  297 + std::string input3("ab");
  298 + std::string output;
  299 + assert(! QUtil::utf8_to_ascii(input1, output));
  300 + assert(! QUtil::utf8_to_ascii(input2, output));
  301 + assert(QUtil::utf8_to_ascii(input3, output));
  302 + assert(QUtil::utf8_to_win_ansi(input1, output));
  303 + assert(! QUtil::utf8_to_win_ansi(input2, output));
  304 + assert(QUtil::utf8_to_win_ansi(input3, output));
  305 + assert(QUtil::utf8_to_mac_roman(input1, output));
  306 + assert(! QUtil::utf8_to_mac_roman(input2, output));
  307 + assert(QUtil::utf8_to_mac_roman(input3, output));
  308 + assert(QUtil::utf8_to_pdf_doc(input1, output));
  309 + assert(! QUtil::utf8_to_pdf_doc(input2, output));
  310 + assert(QUtil::utf8_to_pdf_doc(input3, output));
295 311 }
296 312  
297 313 void print_whoami(char const* str)
... ...