Commit 463037773177eb616f2dfd9a58f039b3eebd192c
1 parent
429ffcf3
Add status-reporting transcoders to QUtil
Showing
4 changed files
with
81 additions
and
4 deletions
ChangeLog
| ... | ... | @@ -14,6 +14,11 @@ |
| 14 | 14 | the first bug in qpdf's history that could result in silent loss |
| 15 | 15 | of data when processing a correct input file. Fixes #276. |
| 16 | 16 | |
| 17 | +2019-01-14 Jay Berkenbilt <ejb@ql.org> | |
| 18 | + | |
| 19 | + * Add versions of the UTF-8 to single-byte character transcoders | |
| 20 | + that return a success code. | |
| 21 | + | |
| 17 | 22 | 2019-01-13 Jay Berkenbilt <ejb@ql.org> |
| 18 | 23 | |
| 19 | 24 | * Add several more string transcoding and analysis methods to | ... | ... |
include/qpdf/QUtil.hh
| ... | ... | @@ -178,6 +178,22 @@ namespace QUtil |
| 178 | 178 | std::string utf8_to_pdf_doc( |
| 179 | 179 | std::string const& utf8, char unknown_char = '?'); |
| 180 | 180 | |
| 181 | + // These versions return true if the conversion was successful and | |
| 182 | + // false if decoding the UTF-8 input reported an error or if any | |
| 183 | + // character had to be substituted with the unknown character. | |
| 184 | + QPDF_DLL | |
| 185 | + bool utf8_to_ascii( | |
| 186 | + std::string const& utf8, std::string& ascii, char unknown_char = '?'); | |
| 187 | + QPDF_DLL | |
| 188 | + bool utf8_to_win_ansi( | |
| 189 | + std::string const& utf8, std::string& win, char unknown_char = '?'); | |
| 190 | + QPDF_DLL | |
| 191 | + bool utf8_to_mac_roman( | |
| 192 | + std::string const& utf8, std::string& mac, char unknown_char = '?'); | |
| 193 | + QPDF_DLL | |
| 194 | + bool utf8_to_pdf_doc( | |
| 195 | + std::string const& utf8, std::string& pdfdoc, char unknown_char = '?'); | |
| 196 | + | |
| 181 | 197 | // Convert a UTF-16 big-endian encoded string to UTF-8. |
| 182 | 198 | // Unrepresentable code points are converted to U+FFFD. |
| 183 | 199 | QPDF_DLL | ... | ... |
libqpdf/QUtil.cc
| ... | ... | @@ -1705,11 +1705,12 @@ unsigned long get_next_utf8_codepoint( |
| 1705 | 1705 | return codepoint; |
| 1706 | 1706 | } |
| 1707 | 1707 | |
| 1708 | -static std::string | |
| 1709 | -transcode_utf8(std::string const& utf8_val, encoding_e encoding, | |
| 1710 | - char unknown) | |
| 1708 | +static bool | |
| 1709 | +transcode_utf8(std::string const& utf8_val, std::string& result, | |
| 1710 | + encoding_e encoding, char unknown) | |
| 1711 | 1711 | { |
| 1712 | - std::string result; | |
| 1712 | + bool okay = true; | |
| 1713 | + result.clear(); | |
| 1713 | 1714 | if (encoding == e_utf16) |
| 1714 | 1715 | { |
| 1715 | 1716 | result += "\xfe\xff"; |
| ... | ... | @@ -1721,6 +1722,7 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding, |
| 1721 | 1722 | unsigned long codepoint = get_next_utf8_codepoint(utf8_val, i, error); |
| 1722 | 1723 | if (error) |
| 1723 | 1724 | { |
| 1725 | + okay = false; | |
| 1724 | 1726 | if (encoding == e_utf16) |
| 1725 | 1727 | { |
| 1726 | 1728 | result += "\xff\xfd"; |
| ... | ... | @@ -1768,11 +1770,21 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding, |
| 1768 | 1770 | } |
| 1769 | 1771 | if (ch == '\0') |
| 1770 | 1772 | { |
| 1773 | + okay = false; | |
| 1771 | 1774 | ch = static_cast<unsigned char>(unknown); |
| 1772 | 1775 | } |
| 1773 | 1776 | result.append(1, ch); |
| 1774 | 1777 | } |
| 1775 | 1778 | } |
| 1779 | + return okay; | |
| 1780 | +} | |
| 1781 | + | |
| 1782 | +static std::string | |
| 1783 | +transcode_utf8(std::string const& utf8_val, encoding_e encoding, | |
| 1784 | + char unknown) | |
| 1785 | +{ | |
| 1786 | + std::string result; | |
| 1787 | + transcode_utf8(utf8_val, result, encoding, unknown); | |
| 1776 | 1788 | return result; |
| 1777 | 1789 | } |
| 1778 | 1790 | |
| ... | ... | @@ -1807,6 +1819,34 @@ QUtil::utf8_to_pdf_doc(std::string const& utf8, char unknown_char) |
| 1807 | 1819 | } |
| 1808 | 1820 | |
| 1809 | 1821 | bool |
| 1822 | +QUtil::utf8_to_ascii(std::string const& utf8, std::string& ascii, | |
| 1823 | + char unknown_char) | |
| 1824 | +{ | |
| 1825 | + return transcode_utf8(utf8, ascii, e_ascii, unknown_char); | |
| 1826 | +} | |
| 1827 | + | |
| 1828 | +bool | |
| 1829 | +QUtil::utf8_to_win_ansi(std::string const& utf8, std::string& win, | |
| 1830 | + char unknown_char) | |
| 1831 | +{ | |
| 1832 | + return transcode_utf8(utf8, win, e_winansi, unknown_char); | |
| 1833 | +} | |
| 1834 | + | |
| 1835 | +bool | |
| 1836 | +QUtil::utf8_to_mac_roman(std::string const& utf8, std::string& mac, | |
| 1837 | + char unknown_char) | |
| 1838 | +{ | |
| 1839 | + return transcode_utf8(utf8, mac, e_macroman, unknown_char); | |
| 1840 | +} | |
| 1841 | + | |
| 1842 | +bool | |
| 1843 | +QUtil::utf8_to_pdf_doc(std::string const& utf8, std::string& pdfdoc, | |
| 1844 | + char unknown_char) | |
| 1845 | +{ | |
| 1846 | + return transcode_utf8(utf8, pdfdoc, e_pdfdoc, unknown_char); | |
| 1847 | +} | |
| 1848 | + | |
| 1849 | +bool | |
| 1810 | 1850 | QUtil::is_utf16(std::string const& val) |
| 1811 | 1851 | { |
| 1812 | 1852 | return ((val.length() >= 2) && | ... | ... |
libtests/qutil.cc
| ... | ... | @@ -292,6 +292,22 @@ void transcoding_test() |
| 292 | 292 | check_analyze("pi != 22/7", false, false, false); |
| 293 | 293 | check_analyze(std::string("\xfe\xff\00\x51", 4), true, false, true); |
| 294 | 294 | std::cout << "analysis done" << std::endl; |
| 295 | + std::string input1("a\302\277b"); | |
| 296 | + std::string input2("a\317\200b"); | |
| 297 | + std::string input3("ab"); | |
| 298 | + std::string output; | |
| 299 | + assert(! QUtil::utf8_to_ascii(input1, output)); | |
| 300 | + assert(! QUtil::utf8_to_ascii(input2, output)); | |
| 301 | + assert(QUtil::utf8_to_ascii(input3, output)); | |
| 302 | + assert(QUtil::utf8_to_win_ansi(input1, output)); | |
| 303 | + assert(! QUtil::utf8_to_win_ansi(input2, output)); | |
| 304 | + assert(QUtil::utf8_to_win_ansi(input3, output)); | |
| 305 | + assert(QUtil::utf8_to_mac_roman(input1, output)); | |
| 306 | + assert(! QUtil::utf8_to_mac_roman(input2, output)); | |
| 307 | + assert(QUtil::utf8_to_mac_roman(input3, output)); | |
| 308 | + assert(QUtil::utf8_to_pdf_doc(input1, output)); | |
| 309 | + assert(! QUtil::utf8_to_pdf_doc(input2, output)); | |
| 310 | + assert(QUtil::utf8_to_pdf_doc(input3, output)); | |
| 295 | 311 | } |
| 296 | 312 | |
| 297 | 313 | void print_whoami(char const* str) | ... | ... |