Commit 698485468a8b7d0f38d817d6055898932f46cc26

Authored by Jay Berkenbilt
1 parent 5cfcd4f3

Move remaining existing transcoding to QUtil

include/qpdf/QUtil.hh
@@ -147,13 +147,18 @@ namespace QUtil @@ -147,13 +147,18 @@ namespace QUtil
147 std::string toUTF8(unsigned long uval); 147 std::string toUTF8(unsigned long uval);
148 148
149 // Return a string containing the byte representation of the 149 // Return a string containing the byte representation of the
150 - // UTF-16 BE encoding for the unicode value passed in. 150 + // UTF-16 big-endian encoding for the unicode value passed in.
151 // Unrepresentable code points are converted to U+FFFD. 151 // Unrepresentable code points are converted to U+FFFD.
152 QPDF_DLL 152 QPDF_DLL
153 std::string toUTF16(unsigned long uval); 153 std::string toUTF16(unsigned long uval);
154 154
155 - // Convert a UTF-8 encoded string to UTF-16. Unrepresentable code  
156 - // points are converted to U+FFFD. 155 + // Test whether this is a UTF-16 big-endian string. This is
  156 + // indicated by first two bytes being 0xFE 0xFF.
  157 + QPDF_DLL
  158 + bool is_utf16(std::string const&);
  159 +
  160 + // Convert a UTF-8 encoded string to UTF-16 big-endian.
  161 + // Unrepresentable code points are converted to U+FFFD.
157 QPDF_DLL 162 QPDF_DLL
158 std::string utf8_to_utf16(std::string const& utf8); 163 std::string utf8_to_utf16(std::string const& utf8);
159 164
@@ -169,6 +174,24 @@ namespace QUtil @@ -169,6 +174,24 @@ namespace QUtil
169 QPDF_DLL 174 QPDF_DLL
170 std::string utf8_to_mac_roman( 175 std::string utf8_to_mac_roman(
171 std::string const& utf8, char unknown_char = '?'); 176 std::string const& utf8, char unknown_char = '?');
  177 + QPDF_DLL
  178 + std::string utf8_to_pdf_doc(
  179 + std::string const& utf8, char unknown_char = '?');
  180 +
  181 + // Convert a UTF-16 big-endian encoded string to UTF-8.
  182 + // Unrepresentable code points are converted to U+FFFD.
  183 + QPDF_DLL
  184 + std::string utf16_to_utf8(std::string const& utf16);
  185 +
  186 + // Convert from the specified single-byte encoding system to
  187 + // UTF-8. There is no ascii_to_utf8 because all ASCII strings are
  188 + // already valid UTF-8.
  189 + QPDF_DLL
  190 + std::string win_ansi_to_utf8(std::string const& win);
  191 + QPDF_DLL
  192 + std::string mac_roman_to_utf8(std::string const& mac);
  193 + QPDF_DLL
  194 + std::string pdf_doc_to_utf8(std::string const& pdfdoc);
172 195
173 // If secure random number generation is supported on your 196 // If secure random number generation is supported on your
174 // platform and qpdf was not compiled with insecure random number 197 // platform and qpdf was not compiled with insecure random number
libqpdf/QPDF_String.cc
@@ -8,43 +8,6 @@ @@ -8,43 +8,6 @@
8 // be used. 8 // be used.
9 #include <string.h> 9 #include <string.h>
10 10
11 -// First element is 128  
12 -static unsigned short pdf_doc_to_unicode[] = {  
13 - 0x2022, // 0x80 BULLET  
14 - 0x2020, // 0x81 DAGGER  
15 - 0x2021, // 0x82 DOUBLE DAGGER  
16 - 0x2026, // 0x83 HORIZONTAL ELLIPSIS  
17 - 0x2014, // 0x84 EM DASH  
18 - 0x2013, // 0x85 EN DASH  
19 - 0x0192, // 0x86 SMALL LETTER F WITH HOOK  
20 - 0x2044, // 0x87 FRACTION SLASH (solidus)  
21 - 0x2039, // 0x88 SINGLE LEFT-POINTING ANGLE QUOTATION MARK  
22 - 0x203a, // 0x89 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK  
23 - 0x2212, // 0x8a MINUS SIGN  
24 - 0x2030, // 0x8b PER MILLE SIGN  
25 - 0x201e, // 0x8c DOUBLE LOW-9 QUOTATION MARK (quotedblbase)  
26 - 0x201c, // 0x8d LEFT DOUBLE QUOTATION MARK (double quote left)  
27 - 0x201d, // 0x8e RIGHT DOUBLE QUOTATION MARK (quotedblright)  
28 - 0x2018, // 0x8f LEFT SINGLE QUOTATION MARK (quoteleft)  
29 - 0x2019, // 0x90 RIGHT SINGLE QUOTATION MARK (quoteright)  
30 - 0x201a, // 0x91 SINGLE LOW-9 QUOTATION MARK (quotesinglbase)  
31 - 0x2122, // 0x92 TRADE MARK SIGN  
32 - 0xfb01, // 0x93 LATIN SMALL LIGATURE FI  
33 - 0xfb02, // 0x94 LATIN SMALL LIGATURE FL  
34 - 0x0141, // 0x95 LATIN CAPITAL LETTER L WITH STROKE  
35 - 0x0152, // 0x96 LATIN CAPITAL LIGATURE OE  
36 - 0x0160, // 0x97 LATIN CAPITAL LETTER S WITH CARON  
37 - 0x0178, // 0x98 LATIN CAPITAL LETTER Y WITH DIAERESIS  
38 - 0x017d, // 0x99 LATIN CAPITAL LETTER Z WITH CARON  
39 - 0x0131, // 0x9a LATIN SMALL LETTER DOTLESS I  
40 - 0x0142, // 0x9b LATIN SMALL LETTER L WITH STROKE  
41 - 0x0153, // 0x9c LATIN SMALL LIGATURE OE  
42 - 0x0161, // 0x9d LATIN SMALL LETTER S WITH CARON  
43 - 0x017e, // 0x9e LATIN SMALL LETTER Z WITH CARON  
44 - 0xfffd, // 0x9f UNDEFINED  
45 - 0x20ac, // 0xa0 EURO SIGN  
46 -};  
47 -  
48 // See above about ctype. 11 // See above about ctype.
49 static bool is_ascii_printable(unsigned char ch) 12 static bool is_ascii_printable(unsigned char ch)
50 { 13 {
@@ -210,62 +173,12 @@ QPDF_String::getVal() const @@ -210,62 +173,12 @@ QPDF_String::getVal() const
210 std::string 173 std::string
211 QPDF_String::getUTF8Val() const 174 QPDF_String::getUTF8Val() const
212 { 175 {
213 - std::string result;  
214 - size_t len = this->val.length();  
215 - if ((len >= 2) && (len % 2 == 0) &&  
216 - (this->val.at(0) == '\xfe') && (this->val.at(1) == '\xff')) 176 + if (QUtil::is_utf16(this->val))
217 { 177 {
218 - // This is a Unicode string using big-endian UTF-16. This  
219 - // code uses unsigned long and unsigned short to hold  
220 - // codepoint values. It requires unsigned long to be at least  
221 - // 32 bits and unsigned short to be at least 16 bits, but it  
222 - // will work fine if they are larger.  
223 - unsigned long codepoint = 0L;  
224 - for (unsigned int i = 2; i < len; i += 2)  
225 - {  
226 - // Convert from UTF16-BE. If we get a malformed  
227 - // codepoint, this code will generate incorrect output  
228 - // without giving a warning. Specifically, a high  
229 - // codepoint not followed by a low codepoint will be  
230 - // discarded, and a low codepoint not preceded by a high  
231 - // codepoint will just get its low 10 bits output.  
232 - unsigned short bits =  
233 - (static_cast<unsigned char>(this->val.at(i)) << 8) +  
234 - static_cast<unsigned char>(this->val.at(i+1));  
235 - if ((bits & 0xFC00) == 0xD800)  
236 - {  
237 - codepoint = 0x10000 + ((bits & 0x3FF) << 10);  
238 - continue;  
239 - }  
240 - else if ((bits & 0xFC00) == 0xDC00)  
241 - {  
242 - if (codepoint != 0)  
243 - {  
244 - QTC::TC("qpdf", "QPDF_String non-trivial UTF-16");  
245 - }  
246 - codepoint += bits & 0x3FF;  
247 - }  
248 - else  
249 - {  
250 - codepoint = bits;  
251 - }  
252 -  
253 - result += QUtil::toUTF8(codepoint);  
254 - codepoint = 0;  
255 - } 178 + return QUtil::utf16_to_utf8(this->val);
256 } 179 }
257 else 180 else
258 { 181 {
259 - for (unsigned int i = 0; i < len; ++i)  
260 - {  
261 - unsigned char ch = static_cast<unsigned char>(this->val.at(i));  
262 - unsigned short val = ch;  
263 - if ((ch >= 128) && (ch <= 160))  
264 - {  
265 - val = pdf_doc_to_unicode[ch - 128];  
266 - }  
267 - result += QUtil::toUTF8(val);  
268 - } 182 + return QUtil::pdf_doc_to_utf8(this->val);
269 } 183 }
270 - return result;  
271 } 184 }
libqpdf/QUtil.cc
@@ -8,6 +8,7 @@ @@ -8,6 +8,7 @@
8 #endif 8 #endif
9 #include <qpdf/SecureRandomDataProvider.hh> 9 #include <qpdf/SecureRandomDataProvider.hh>
10 #include <qpdf/QPDFSystemError.hh> 10 #include <qpdf/QPDFSystemError.hh>
  11 +#include <qpdf/QTC.hh>
11 12
12 #include <cmath> 13 #include <cmath>
13 #include <iomanip> 14 #include <iomanip>
@@ -29,6 +30,43 @@ @@ -29,6 +30,43 @@
29 #include <sys/stat.h> 30 #include <sys/stat.h>
30 #endif 31 #endif
31 32
  33 +// First element is 128
  34 +static unsigned short pdf_doc_to_unicode[] = {
  35 + 0x2022, // 0x80 BULLET
  36 + 0x2020, // 0x81 DAGGER
  37 + 0x2021, // 0x82 DOUBLE DAGGER
  38 + 0x2026, // 0x83 HORIZONTAL ELLIPSIS
  39 + 0x2014, // 0x84 EM DASH
  40 + 0x2013, // 0x85 EN DASH
  41 + 0x0192, // 0x86 SMALL LETTER F WITH HOOK
  42 + 0x2044, // 0x87 FRACTION SLASH (solidus)
  43 + 0x2039, // 0x88 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
  44 + 0x203a, // 0x89 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
  45 + 0x2212, // 0x8a MINUS SIGN
  46 + 0x2030, // 0x8b PER MILLE SIGN
  47 + 0x201e, // 0x8c DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
  48 + 0x201c, // 0x8d LEFT DOUBLE QUOTATION MARK (double quote left)
  49 + 0x201d, // 0x8e RIGHT DOUBLE QUOTATION MARK (quotedblright)
  50 + 0x2018, // 0x8f LEFT SINGLE QUOTATION MARK (quoteleft)
  51 + 0x2019, // 0x90 RIGHT SINGLE QUOTATION MARK (quoteright)
  52 + 0x201a, // 0x91 SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
  53 + 0x2122, // 0x92 TRADE MARK SIGN
  54 + 0xfb01, // 0x93 LATIN SMALL LIGATURE FI
  55 + 0xfb02, // 0x94 LATIN SMALL LIGATURE FL
  56 + 0x0141, // 0x95 LATIN CAPITAL LETTER L WITH STROKE
  57 + 0x0152, // 0x96 LATIN CAPITAL LIGATURE OE
  58 + 0x0160, // 0x97 LATIN CAPITAL LETTER S WITH CARON
  59 + 0x0178, // 0x98 LATIN CAPITAL LETTER Y WITH DIAERESIS
  60 + 0x017d, // 0x99 LATIN CAPITAL LETTER Z WITH CARON
  61 + 0x0131, // 0x9a LATIN SMALL LETTER DOTLESS I
  62 + 0x0142, // 0x9b LATIN SMALL LETTER L WITH STROKE
  63 + 0x0153, // 0x9c LATIN SMALL LIGATURE OE
  64 + 0x0161, // 0x9d LATIN SMALL LETTER S WITH CARON
  65 + 0x017e, // 0x9e LATIN SMALL LETTER Z WITH CARON
  66 + 0xfffd, // 0x9f UNDEFINED
  67 + 0x20ac, // 0xa0 EURO SIGN
  68 +};
  69 +
32 std::string 70 std::string
33 QUtil::int_to_string(long long num, int length) 71 QUtil::int_to_string(long long num, int length)
34 { 72 {
@@ -895,7 +933,7 @@ QUtil::parse_numrange(char const* range, int max) @@ -895,7 +933,7 @@ QUtil::parse_numrange(char const* range, int max)
895 return result; 933 return result;
896 } 934 }
897 935
898 -enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman }; 936 +enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman, e_pdfdoc };
899 937
900 static unsigned char 938 static unsigned char
901 encode_winansi(unsigned long codepoint) 939 encode_winansi(unsigned long codepoint)
@@ -1342,6 +1380,119 @@ encode_macroman(unsigned long codepoint) @@ -1342,6 +1380,119 @@ encode_macroman(unsigned long codepoint)
1342 return ch; 1380 return ch;
1343 } 1381 }
1344 1382
  1383 +static unsigned char
  1384 +encode_pdfdoc(unsigned long codepoint)
  1385 +{
  1386 + // Use this ugly switch statement to avoid a static, which is not
  1387 + // thread-safe.
  1388 + unsigned char ch = '\0';
  1389 + switch (codepoint)
  1390 + {
  1391 + case 0x2022:
  1392 + ch = 0x80;
  1393 + break;
  1394 + case 0x2020:
  1395 + ch = 0x81;
  1396 + break;
  1397 + case 0x2021:
  1398 + ch = 0x82;
  1399 + break;
  1400 + case 0x2026:
  1401 + ch = 0x83;
  1402 + break;
  1403 + case 0x2014:
  1404 + ch = 0x84;
  1405 + break;
  1406 + case 0x2013:
  1407 + ch = 0x85;
  1408 + break;
  1409 + case 0x0192:
  1410 + ch = 0x86;
  1411 + break;
  1412 + case 0x2044:
  1413 + ch = 0x87;
  1414 + break;
  1415 + case 0x2039:
  1416 + ch = 0x88;
  1417 + break;
  1418 + case 0x203a:
  1419 + ch = 0x89;
  1420 + break;
  1421 + case 0x2212:
  1422 + ch = 0x8a;
  1423 + break;
  1424 + case 0x2030:
  1425 + ch = 0x8b;
  1426 + break;
  1427 + case 0x201e:
  1428 + ch = 0x8c;
  1429 + break;
  1430 + case 0x201c:
  1431 + ch = 0x8d;
  1432 + break;
  1433 + case 0x201d:
  1434 + ch = 0x8e;
  1435 + break;
  1436 + case 0x2018:
  1437 + ch = 0x8f;
  1438 + break;
  1439 + case 0x2019:
  1440 + ch = 0x90;
  1441 + break;
  1442 + case 0x201a:
  1443 + ch = 0x91;
  1444 + break;
  1445 + case 0x2122:
  1446 + ch = 0x92;
  1447 + break;
  1448 + case 0xfb01:
  1449 + ch = 0x93;
  1450 + break;
  1451 + case 0xfb02:
  1452 + ch = 0x94;
  1453 + break;
  1454 + case 0x0141:
  1455 + ch = 0x95;
  1456 + break;
  1457 + case 0x0152:
  1458 + ch = 0x96;
  1459 + break;
  1460 + case 0x0160:
  1461 + ch = 0x97;
  1462 + break;
  1463 + case 0x0178:
  1464 + ch = 0x98;
  1465 + break;
  1466 + case 0x017d:
  1467 + ch = 0x99;
  1468 + break;
  1469 + case 0x0131:
  1470 + ch = 0x9a;
  1471 + break;
  1472 + case 0x0142:
  1473 + ch = 0x9b;
  1474 + break;
  1475 + case 0x0153:
  1476 + ch = 0x9c;
  1477 + break;
  1478 + case 0x0161:
  1479 + ch = 0x9d;
  1480 + break;
  1481 + case 0x017e:
  1482 + ch = 0x9e;
  1483 + break;
  1484 + case 0xfffd:
  1485 + ch = 0x9f;
  1486 + break;
  1487 + case 0x20ac:
  1488 + ch = 0xa0;
  1489 + break;
  1490 + default:
  1491 + break;
  1492 + }
  1493 + return ch;
  1494 +}
  1495 +
1345 static std::string 1496 static std::string
1346 transcode_utf8(std::string const& utf8_val, encoding_e encoding, 1497 transcode_utf8(std::string const& utf8_val, encoding_e encoding,
1347 char unknown) 1498 char unknown)
@@ -1410,24 +1561,27 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding, @@ -1410,24 +1561,27 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
1410 { 1561 {
1411 result += QUtil::toUTF16(codepoint); 1562 result += QUtil::toUTF16(codepoint);
1412 } 1563 }
  1564 + else if ((codepoint >= 160) && (codepoint < 256) &&
  1565 + ((encoding == e_winansi) || (encoding == e_pdfdoc)))
  1566 + {
  1567 + ch = static_cast<unsigned char>(codepoint & 0xff);
  1568 + result.append(1, ch);
  1569 + }
1413 else 1570 else
1414 { 1571 {
1415 ch = '\0'; 1572 ch = '\0';
1416 if (encoding == e_winansi) 1573 if (encoding == e_winansi)
1417 { 1574 {
1418 - if ((codepoint >= 160) && (codepoint < 256))  
1419 - {  
1420 - ch = static_cast<unsigned char>(codepoint & 0xff);  
1421 - }  
1422 - else  
1423 - {  
1424 - ch = encode_winansi(codepoint);  
1425 - } 1575 + ch = encode_winansi(codepoint);
1426 } 1576 }
1427 else if (encoding == e_macroman) 1577 else if (encoding == e_macroman)
1428 { 1578 {
1429 ch = encode_macroman(codepoint); 1579 ch = encode_macroman(codepoint);
1430 } 1580 }
  1581 + else if (encoding == e_pdfdoc)
  1582 + {
  1583 + ch = encode_pdfdoc(codepoint);
  1584 + }
1431 if (ch == '\0') 1585 if (ch == '\0')
1432 { 1586 {
1433 ch = static_cast<unsigned char>(unknown); 1587 ch = static_cast<unsigned char>(unknown);
@@ -1463,3 +1617,98 @@ QUtil::utf8_to_mac_roman(std::string const& utf8, char unknown_char) @@ -1463,3 +1617,98 @@ QUtil::utf8_to_mac_roman(std::string const& utf8, char unknown_char)
1463 { 1617 {
1464 return transcode_utf8(utf8, e_macroman, unknown_char); 1618 return transcode_utf8(utf8, e_macroman, unknown_char);
1465 } 1619 }
  1620 +
  1621 +std::string
  1622 +QUtil::utf8_to_pdf_doc(std::string const& utf8, char unknown_char)
  1623 +{
  1624 + return transcode_utf8(utf8, e_pdfdoc, unknown_char);
  1625 +}
  1626 +
  1627 +bool
  1628 +QUtil::is_utf16(std::string const& val)
  1629 +{
  1630 + return ((val.length() >= 2) &&
  1631 + (val.at(0) == '\xfe') && (val.at(1) == '\xff'));
  1632 +}
  1633 +
  1634 +std::string
  1635 +QUtil::utf16_to_utf8(std::string const& val)
  1636 +{
  1637 + std::string result;
  1638 + // This code uses unsigned long and unsigned short to hold
  1639 + // codepoint values. It requires unsigned long to be at least
  1640 + // 32 bits and unsigned short to be at least 16 bits, but it
  1641 + // will work fine if they are larger.
  1642 + unsigned long codepoint = 0L;
  1643 + size_t len = val.length();
  1644 + size_t start = 0;
  1645 + if (is_utf16(val))
  1646 + {
  1647 + start += 2;
  1648 + }
  1649 + // If the string has an odd number of bytes, the last byte is
  1650 + // ignored.
  1651 + for (unsigned int i = start; i < len; i += 2)
  1652 + {
  1653 + // Convert from UTF16-BE. If we get a malformed
  1654 + // codepoint, this code will generate incorrect output
  1655 + // without giving a warning. Specifically, a high
  1656 + // codepoint not followed by a low codepoint will be
  1657 + // discarded, and a low codepoint not preceded by a high
  1658 + // codepoint will just get its low 10 bits output.
  1659 + unsigned short bits =
  1660 + (static_cast<unsigned char>(val.at(i)) << 8) +
  1661 + static_cast<unsigned char>(val.at(i+1));
  1662 + if ((bits & 0xFC00) == 0xD800)
  1663 + {
  1664 + codepoint = 0x10000 + ((bits & 0x3FF) << 10);
  1665 + continue;
  1666 + }
  1667 + else if ((bits & 0xFC00) == 0xDC00)
  1668 + {
  1669 + if (codepoint != 0)
  1670 + {
  1671 + QTC::TC("qpdf", "QUtil non-trivial UTF-16");
  1672 + }
  1673 + codepoint += bits & 0x3FF;
  1674 + }
  1675 + else
  1676 + {
  1677 + codepoint = bits;
  1678 + }
  1679 +
  1680 + result += QUtil::toUTF8(codepoint);
  1681 + codepoint = 0;
  1682 + }
  1683 + return result;
  1684 +}
  1685 +
  1686 +std::string
  1687 +QUtil::win_ansi_to_utf8(std::string const& val)
  1688 +{
  1689 + return "QXXXQ";
  1690 +}
  1691 +
  1692 +std::string
  1693 +QUtil::mac_roman_to_utf8(std::string const& val)
  1694 +{
  1695 + return "QXXXQ";
  1696 +}
  1697 +
  1698 +std::string
  1699 +QUtil::pdf_doc_to_utf8(std::string const& val)
  1700 +{
  1701 + std::string result;
  1702 + size_t len = val.length();
  1703 + for (unsigned int i = 0; i < len; ++i)
  1704 + {
  1705 + unsigned char ch = static_cast<unsigned char>(val.at(i));
  1706 + unsigned short val = ch;
  1707 + if ((ch >= 128) && (ch <= 160))
  1708 + {
  1709 + val = pdf_doc_to_unicode[ch - 128];
  1710 + }
  1711 + result += QUtil::toUTF8(val);
  1712 + }
  1713 + return result;
  1714 +}
qpdf/qpdf.testcov
@@ -108,7 +108,7 @@ QPDF_Stream pipeStreamData with null pipeline 0 @@ -108,7 +108,7 @@ QPDF_Stream pipeStreamData with null pipeline 0
108 QPDFWriter not recompressing /FlateDecode 0 108 QPDFWriter not recompressing /FlateDecode 0
109 QPDF_encryption xref stream from encrypted file 0 109 QPDF_encryption xref stream from encrypted file 0
110 qpdf unable to filter 0 110 qpdf unable to filter 0
111 -QPDF_String non-trivial UTF-16 0 111 +QUtil non-trivial UTF-16 0
112 QPDF xref overwrite object 0 112 QPDF xref overwrite object 0
113 QPDF decoding error warning 0 113 QPDF decoding error warning 0
114 qpdf-c called qpdf_init 0 114 qpdf-c called qpdf_init 0