Commit 698485468a8b7d0f38d817d6055898932f46cc26

Authored by Jay Berkenbilt
1 parent 5cfcd4f3

Move remaining existing transcoding to QUtil

include/qpdf/QUtil.hh
... ... @@ -147,13 +147,18 @@ namespace QUtil
147 147 std::string toUTF8(unsigned long uval);
148 148  
149 149 // Return a string containing the byte representation of the
150   - // UTF-16 BE encoding for the unicode value passed in.
  150 + // UTF-16 big-endian encoding for the unicode value passed in.
151 151 // Unrepresentable code points are converted to U+FFFD.
152 152 QPDF_DLL
153 153 std::string toUTF16(unsigned long uval);
154 154  
155   - // Convert a UTF-8 encoded string to UTF-16. Unrepresentable code
156   - // points are converted to U+FFFD.
  155 + // Test whether this is a UTF-16 big-endian string. This is
  156 + // indicated by the first two bytes being 0xFE 0xFF.
  157 + QPDF_DLL
  158 + bool is_utf16(std::string const&);
  159 +
  160 + // Convert a UTF-8 encoded string to UTF-16 big-endian.
  161 + // Unrepresentable code points are converted to U+FFFD.
157 162 QPDF_DLL
158 163 std::string utf8_to_utf16(std::string const& utf8);
159 164  
... ... @@ -169,6 +174,24 @@ namespace QUtil
169 174 QPDF_DLL
170 175 std::string utf8_to_mac_roman(
171 176 std::string const& utf8, char unknown_char = '?');
  177 + QPDF_DLL
  178 + std::string utf8_to_pdf_doc(
  179 + std::string const& utf8, char unknown_char = '?');
  180 +
  181 + // Convert a UTF-16 big-endian encoded string to UTF-8.
  182 + // Unrepresentable code points are converted to U+FFFD.
  183 + QPDF_DLL
  184 + std::string utf16_to_utf8(std::string const& utf16);
  185 +
  186 + // Convert from the specified single-byte encoding system to
  187 + // UTF-8. There is no ascii_to_utf8 because all ASCII strings are
  188 + // already valid UTF-8.
  189 + QPDF_DLL
  190 + std::string win_ansi_to_utf8(std::string const& win);
  191 + QPDF_DLL
  192 + std::string mac_roman_to_utf8(std::string const& mac);
  193 + QPDF_DLL
  194 + std::string pdf_doc_to_utf8(std::string const& pdfdoc);
172 195  
173 196 // If secure random number generation is supported on your
174 197 // platform and qpdf was not compiled with insecure random number
... ...
libqpdf/QPDF_String.cc
... ... @@ -8,43 +8,6 @@
8 8 // be used.
9 9 #include <string.h>
10 10  
11   -// First element is 128
12   -static unsigned short pdf_doc_to_unicode[] = {
13   - 0x2022, // 0x80 BULLET
14   - 0x2020, // 0x81 DAGGER
15   - 0x2021, // 0x82 DOUBLE DAGGER
16   - 0x2026, // 0x83 HORIZONTAL ELLIPSIS
17   - 0x2014, // 0x84 EM DASH
18   - 0x2013, // 0x85 EN DASH
19   - 0x0192, // 0x86 SMALL LETTER F WITH HOOK
20   - 0x2044, // 0x87 FRACTION SLASH (solidus)
21   - 0x2039, // 0x88 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
22   - 0x203a, // 0x89 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
23   - 0x2212, // 0x8a MINUS SIGN
24   - 0x2030, // 0x8b PER MILLE SIGN
25   - 0x201e, // 0x8c DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
26   - 0x201c, // 0x8d LEFT DOUBLE QUOTATION MARK (double quote left)
27   - 0x201d, // 0x8e RIGHT DOUBLE QUOTATION MARK (quotedblright)
28   - 0x2018, // 0x8f LEFT SINGLE QUOTATION MARK (quoteleft)
29   - 0x2019, // 0x90 RIGHT SINGLE QUOTATION MARK (quoteright)
30   - 0x201a, // 0x91 SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
31   - 0x2122, // 0x92 TRADE MARK SIGN
32   - 0xfb01, // 0x93 LATIN SMALL LIGATURE FI
33   - 0xfb02, // 0x94 LATIN SMALL LIGATURE FL
34   - 0x0141, // 0x95 LATIN CAPITAL LETTER L WITH STROKE
35   - 0x0152, // 0x96 LATIN CAPITAL LIGATURE OE
36   - 0x0160, // 0x97 LATIN CAPITAL LETTER S WITH CARON
37   - 0x0178, // 0x98 LATIN CAPITAL LETTER Y WITH DIAERESIS
38   - 0x017d, // 0x99 LATIN CAPITAL LETTER Z WITH CARON
39   - 0x0131, // 0x9a LATIN SMALL LETTER DOTLESS I
40   - 0x0142, // 0x9b LATIN SMALL LETTER L WITH STROKE
41   - 0x0153, // 0x9c LATIN SMALL LIGATURE OE
42   - 0x0161, // 0x9d LATIN SMALL LETTER S WITH CARON
43   - 0x017e, // 0x9e LATIN SMALL LETTER Z WITH CARON
44   - 0xfffd, // 0x9f UNDEFINED
45   - 0x20ac, // 0xa0 EURO SIGN
46   -};
47   -
48 11 // See above about ctype.
49 12 static bool is_ascii_printable(unsigned char ch)
50 13 {
... ... @@ -210,62 +173,12 @@ QPDF_String::getVal() const
210 173 std::string
211 174 QPDF_String::getUTF8Val() const
212 175 {
213   - std::string result;
214   - size_t len = this->val.length();
215   - if ((len >= 2) && (len % 2 == 0) &&
216   - (this->val.at(0) == '\xfe') && (this->val.at(1) == '\xff'))
  176 + if (QUtil::is_utf16(this->val))
217 177 {
218   - // This is a Unicode string using big-endian UTF-16. This
219   - // code uses unsigned long and unsigned short to hold
220   - // codepoint values. It requires unsigned long to be at least
221   - // 32 bits and unsigned short to be at least 16 bits, but it
222   - // will work fine if they are larger.
223   - unsigned long codepoint = 0L;
224   - for (unsigned int i = 2; i < len; i += 2)
225   - {
226   - // Convert from UTF16-BE. If we get a malformed
227   - // codepoint, this code will generate incorrect output
228   - // without giving a warning. Specifically, a high
229   - // codepoint not followed by a low codepoint will be
230   - // discarded, and a low codepoint not preceded by a high
231   - // codepoint will just get its low 10 bits output.
232   - unsigned short bits =
233   - (static_cast<unsigned char>(this->val.at(i)) << 8) +
234   - static_cast<unsigned char>(this->val.at(i+1));
235   - if ((bits & 0xFC00) == 0xD800)
236   - {
237   - codepoint = 0x10000 + ((bits & 0x3FF) << 10);
238   - continue;
239   - }
240   - else if ((bits & 0xFC00) == 0xDC00)
241   - {
242   - if (codepoint != 0)
243   - {
244   - QTC::TC("qpdf", "QPDF_String non-trivial UTF-16");
245   - }
246   - codepoint += bits & 0x3FF;
247   - }
248   - else
249   - {
250   - codepoint = bits;
251   - }
252   -
253   - result += QUtil::toUTF8(codepoint);
254   - codepoint = 0;
255   - }
  178 + return QUtil::utf16_to_utf8(this->val);
256 179 }
257 180 else
258 181 {
259   - for (unsigned int i = 0; i < len; ++i)
260   - {
261   - unsigned char ch = static_cast<unsigned char>(this->val.at(i));
262   - unsigned short val = ch;
263   - if ((ch >= 128) && (ch <= 160))
264   - {
265   - val = pdf_doc_to_unicode[ch - 128];
266   - }
267   - result += QUtil::toUTF8(val);
268   - }
  182 + return QUtil::pdf_doc_to_utf8(this->val);
269 183 }
270   - return result;
271 184 }
... ...
libqpdf/QUtil.cc
... ... @@ -8,6 +8,7 @@
8 8 #endif
9 9 #include <qpdf/SecureRandomDataProvider.hh>
10 10 #include <qpdf/QPDFSystemError.hh>
  11 +#include <qpdf/QTC.hh>
11 12  
12 13 #include <cmath>
13 14 #include <iomanip>
... ... @@ -29,6 +30,43 @@
29 30 #include <sys/stat.h>
30 31 #endif
31 32  
  33 +// First element is 128
  34 +static unsigned short pdf_doc_to_unicode[] = {
  35 + 0x2022, // 0x80 BULLET
  36 + 0x2020, // 0x81 DAGGER
  37 + 0x2021, // 0x82 DOUBLE DAGGER
  38 + 0x2026, // 0x83 HORIZONTAL ELLIPSIS
  39 + 0x2014, // 0x84 EM DASH
  40 + 0x2013, // 0x85 EN DASH
  41 + 0x0192, // 0x86 SMALL LETTER F WITH HOOK
  42 + 0x2044, // 0x87 FRACTION SLASH (solidus)
  43 + 0x2039, // 0x88 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
  44 + 0x203a, // 0x89 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
  45 + 0x2212, // 0x8a MINUS SIGN
  46 + 0x2030, // 0x8b PER MILLE SIGN
  47 + 0x201e, // 0x8c DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
  48 + 0x201c, // 0x8d LEFT DOUBLE QUOTATION MARK (double quote left)
  49 + 0x201d, // 0x8e RIGHT DOUBLE QUOTATION MARK (quotedblright)
  50 + 0x2018, // 0x8f LEFT SINGLE QUOTATION MARK (quoteleft)
  51 + 0x2019, // 0x90 RIGHT SINGLE QUOTATION MARK (quoteright)
  52 + 0x201a, // 0x91 SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
  53 + 0x2122, // 0x92 TRADE MARK SIGN
  54 + 0xfb01, // 0x93 LATIN SMALL LIGATURE FI
  55 + 0xfb02, // 0x94 LATIN SMALL LIGATURE FL
  56 + 0x0141, // 0x95 LATIN CAPITAL LETTER L WITH STROKE
  57 + 0x0152, // 0x96 LATIN CAPITAL LIGATURE OE
  58 + 0x0160, // 0x97 LATIN CAPITAL LETTER S WITH CARON
  59 + 0x0178, // 0x98 LATIN CAPITAL LETTER Y WITH DIAERESIS
  60 + 0x017d, // 0x99 LATIN CAPITAL LETTER Z WITH CARON
  61 + 0x0131, // 0x9a LATIN SMALL LETTER DOTLESS I
  62 + 0x0142, // 0x9b LATIN SMALL LETTER L WITH STROKE
  63 + 0x0153, // 0x9c LATIN SMALL LIGATURE OE
  64 + 0x0161, // 0x9d LATIN SMALL LETTER S WITH CARON
  65 + 0x017e, // 0x9e LATIN SMALL LETTER Z WITH CARON
  66 + 0xfffd, // 0x9f UNDEFINED
  67 + 0x20ac, // 0xa0 EURO SIGN
  68 +};
  69 +
32 70 std::string
33 71 QUtil::int_to_string(long long num, int length)
34 72 {
... ... @@ -895,7 +933,7 @@ QUtil::parse_numrange(char const* range, int max)
895 933 return result;
896 934 }
897 935  
898   -enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman };
  936 +enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman, e_pdfdoc };
899 937  
900 938 static unsigned char
901 939 encode_winansi(unsigned long codepoint)
... ... @@ -1342,6 +1380,119 @@ encode_macroman(unsigned long codepoint)
1342 1380 return ch;
1343 1381 }
1344 1382  
  1383 +static unsigned char
  1384 +encode_pdfdoc(unsigned long codepoint)
  1385 +{
  1386 + // Return the byte (0x80-0xa0) this switch assigns to codepoint, or '\0'
  1387 + // if none. A switch is used to avoid a static, which is not thread-safe.
  1388 + unsigned char ch = '\0'; // stays '\0' when no case matches
  1389 + switch (codepoint)
  1390 + {
  1391 + case 0x2022:
  1392 + ch = 0x80;
  1393 + break;
  1394 + case 0x2020:
  1395 + ch = 0x81;
  1396 + break;
  1397 + case 0x2021:
  1398 + ch = 0x82;
  1399 + break;
  1400 + case 0x2026:
  1401 + ch = 0x83;
  1402 + break;
  1403 + case 0x2014:
  1404 + ch = 0x84;
  1405 + break;
  1406 + case 0x2013:
  1407 + ch = 0x85;
  1408 + break;
  1409 + case 0x0192:
  1410 + ch = 0x86;
  1411 + break;
  1412 + case 0x2044:
  1413 + ch = 0x87;
  1414 + break;
  1415 + case 0x2039:
  1416 + ch = 0x88;
  1417 + break;
  1418 + case 0x203a:
  1419 + ch = 0x89;
  1420 + break;
  1421 + case 0x2212:
  1422 + ch = 0x8a;
  1423 + break;
  1424 + case 0x2030:
  1425 + ch = 0x8b;
  1426 + break;
  1427 + case 0x201e:
  1428 + ch = 0x8c;
  1429 + break;
  1430 + case 0x201c:
  1431 + ch = 0x8d;
  1432 + break;
  1433 + case 0x201d:
  1434 + ch = 0x8e;
  1435 + break;
  1436 + case 0x2018:
  1437 + ch = 0x8f;
  1438 + break;
  1439 + case 0x2019:
  1440 + ch = 0x90;
  1441 + break;
  1442 + case 0x201a:
  1443 + ch = 0x91;
  1444 + break;
  1445 + case 0x2122:
  1446 + ch = 0x92;
  1447 + break;
  1448 + case 0xfb01:
  1449 + ch = 0x93;
  1450 + break;
  1451 + case 0xfb02:
  1452 + ch = 0x94;
  1453 + break;
  1454 + case 0x0141:
  1455 + ch = 0x95;
  1456 + break;
  1457 + case 0x0152:
  1458 + ch = 0x96;
  1459 + break;
  1460 + case 0x0160:
  1461 + ch = 0x97;
  1462 + break;
  1463 + case 0x0178:
  1464 + ch = 0x98;
  1465 + break;
  1466 + case 0x017d:
  1467 + ch = 0x99;
  1468 + break;
  1469 + case 0x0131:
  1470 + ch = 0x9a;
  1471 + break;
  1472 + case 0x0142:
  1473 + ch = 0x9b;
  1474 + break;
  1475 + case 0x0153:
  1476 + ch = 0x9c;
  1477 + break;
  1478 + case 0x0161:
  1479 + ch = 0x9d;
  1480 + break;
  1481 + case 0x017e:
  1482 + ch = 0x9e;
  1483 + break;
  1484 + case 0xfffd: // U+FFFD is given 0x9f, the slot pdf_doc_to_unicode labels UNDEFINED
  1485 + ch = 0x9f;
  1486 + break;
  1487 + case 0x20ac:
  1488 + ch = 0xa0;
  1489 + break;
  1490 + default: // no byte assigned to this code point: ch keeps '\0'
  1491 + break;
  1492 + }
  1493 + return ch;
  1494 +}
  1495 +
1345 1496 static std::string
1346 1497 transcode_utf8(std::string const& utf8_val, encoding_e encoding,
1347 1498 char unknown)
... ... @@ -1410,24 +1561,27 @@ transcode_utf8(std::string const&amp; utf8_val, encoding_e encoding,
1410 1561 {
1411 1562 result += QUtil::toUTF16(codepoint);
1412 1563 }
  1564 + else if ((codepoint >= 160) && (codepoint < 256) &&
  1565 + ((encoding == e_winansi) || (encoding == e_pdfdoc)))
  1566 + {
  1567 + ch = static_cast<unsigned char>(codepoint & 0xff);
  1568 + result.append(1, ch);
  1569 + }
1413 1570 else
1414 1571 {
1415 1572 ch = '\0';
1416 1573 if (encoding == e_winansi)
1417 1574 {
1418   - if ((codepoint >= 160) && (codepoint < 256))
1419   - {
1420   - ch = static_cast<unsigned char>(codepoint & 0xff);
1421   - }
1422   - else
1423   - {
1424   - ch = encode_winansi(codepoint);
1425   - }
  1575 + ch = encode_winansi(codepoint);
1426 1576 }
1427 1577 else if (encoding == e_macroman)
1428 1578 {
1429 1579 ch = encode_macroman(codepoint);
1430 1580 }
  1581 + else if (encoding == e_pdfdoc)
  1582 + {
  1583 + ch = encode_pdfdoc(codepoint);
  1584 + }
1431 1585 if (ch == '\0')
1432 1586 {
1433 1587 ch = static_cast<unsigned char>(unknown);
... ... @@ -1463,3 +1617,98 @@ QUtil::utf8_to_mac_roman(std::string const&amp; utf8, char unknown_char)
1463 1617 {
1464 1618 return transcode_utf8(utf8, e_macroman, unknown_char);
1465 1619 }
  1620 +
  1621 +std::string
  1622 +QUtil::utf8_to_pdf_doc(std::string const& utf8, char unknown_char)
  1623 +{
  1624 + return transcode_utf8(utf8, e_pdfdoc, unknown_char); // e_pdfdoc makes transcode_utf8 use encode_pdfdoc; unknown_char stands in for code points it cannot map
  1625 +}
  1626 +
  1627 +bool
  1628 +QUtil::is_utf16(std::string const& val)
  1629 +{
  1630 + return ((val.length() >= 2) &&
  1631 + (val.at(0) == '\xfe') && (val.at(1) == '\xff'));
  1632 +}
  1633 +
  1634 +std::string
  1635 +QUtil::utf16_to_utf8(std::string const& val)
  1636 +{
  1637 + std::string result;
  1638 + // This code uses unsigned long and unsigned short to hold
  1639 + // codepoint values. It requires unsigned long to be at least
  1640 + // 32 bits and unsigned short to be at least 16 bits, but it
  1641 + // will work fine if they are larger.
  1642 + unsigned long codepoint = 0L;
  1643 + size_t len = val.length();
  1644 + size_t start = 0;
  1645 + if (is_utf16(val))
  1646 + {
  1647 + start += 2;
  1648 + }
  1649 + // If the string has an odd number of bytes, the last byte is
  1650 + // ignored.
  1651 + for (unsigned int i = start; i < len; i += 2)
  1652 + {
  1653 + // Convert from UTF16-BE. If we get a malformed
  1654 + // codepoint, this code will generate incorrect output
  1655 + // without giving a warning. Specifically, a high
  1656 + // codepoint not followed by a low codepoint will be
  1657 + // discarded, and a low codepoint not preceded by a high
  1658 + // codepoint will just get its low 10 bits output.
  1659 + unsigned short bits =
  1660 + (static_cast<unsigned char>(val.at(i)) << 8) +
  1661 + static_cast<unsigned char>(val.at(i+1));
  1662 + if ((bits & 0xFC00) == 0xD800)
  1663 + {
  1664 + codepoint = 0x10000 + ((bits & 0x3FF) << 10);
  1665 + continue;
  1666 + }
  1667 + else if ((bits & 0xFC00) == 0xDC00)
  1668 + {
  1669 + if (codepoint != 0)
  1670 + {
  1671 + QTC::TC("qpdf", "QUtil non-trivial UTF-16");
  1672 + }
  1673 + codepoint += bits & 0x3FF;
  1674 + }
  1675 + else
  1676 + {
  1677 + codepoint = bits;
  1678 + }
  1679 +
  1680 + result += QUtil::toUTF8(codepoint);
  1681 + codepoint = 0;
  1682 + }
  1683 + return result;
  1684 +}
  1685 +
  1686 +std::string
  1687 +QUtil::win_ansi_to_utf8(std::string const& val) // NOTE(review): val is never read
  1688 +{
  1689 + return "QXXXQ"; // NOTE(review): looks like a placeholder; no conversion is performed -- TODO implement
  1690 +}
  1691 +
  1692 +std::string
  1693 +QUtil::mac_roman_to_utf8(std::string const& val) // NOTE(review): val is never read
  1694 +{
  1695 + return "QXXXQ"; // NOTE(review): looks like a placeholder; no conversion is performed -- TODO implement
  1696 +}
  1697 +
  1698 +std::string
  1699 +QUtil::pdf_doc_to_utf8(std::string const& val)
  1700 +{
  1701 + std::string result;
  1702 + size_t len = val.length();
  1703 + for (unsigned int i = 0; i < len; ++i)
  1704 + {
  1705 + unsigned char ch = static_cast<unsigned char>(val.at(i));
  1706 + unsigned short val = ch;
  1707 + if ((ch >= 128) && (ch <= 160))
  1708 + {
  1709 + val = pdf_doc_to_unicode[ch - 128];
  1710 + }
  1711 + result += QUtil::toUTF8(val);
  1712 + }
  1713 + return result;
  1714 +}
... ...
qpdf/qpdf.testcov
... ... @@ -108,7 +108,7 @@ QPDF_Stream pipeStreamData with null pipeline 0
108 108 QPDFWriter not recompressing /FlateDecode 0
109 109 QPDF_encryption xref stream from encrypted file 0
110 110 qpdf unable to filter 0
111   -QPDF_String non-trivial UTF-16 0
  111 +QUtil non-trivial UTF-16 0
112 112 QPDF xref overwrite object 0
113 113 QPDF decoding error warning 0
114 114 qpdf-c called qpdf_init 0
... ...