Commit 698485468a8b7d0f38d817d6055898932f46cc26
1 parent
5cfcd4f3
Move remaining existing transcoding to QUtil
Showing
4 changed files
with
288 additions
and
103 deletions
include/qpdf/QUtil.hh
| @@ -147,13 +147,18 @@ namespace QUtil | @@ -147,13 +147,18 @@ namespace QUtil | ||
| 147 | std::string toUTF8(unsigned long uval); | 147 | std::string toUTF8(unsigned long uval); |
| 148 | 148 | ||
| 149 | // Return a string containing the byte representation of the | 149 | // Return a string containing the byte representation of the |
| 150 | - // UTF-16 BE encoding for the unicode value passed in. | 150 | + // UTF-16 big-endian encoding for the unicode value passed in. |
| 151 | // Unrepresentable code points are converted to U+FFFD. | 151 | // Unrepresentable code points are converted to U+FFFD. |
| 152 | QPDF_DLL | 152 | QPDF_DLL |
| 153 | std::string toUTF16(unsigned long uval); | 153 | std::string toUTF16(unsigned long uval); |
| 154 | 154 | ||
| 155 | - // Convert a UTF-8 encoded string to UTF-16. Unrepresentable code | ||
| 156 | - // points are converted to U+FFFD. | 155 | + // Test whether this is a UTF-16 big-endian string. This is |
| 156 | + // indicated by the first two bytes being 0xFE 0xFF. | ||
| 157 | + QPDF_DLL | ||
| 158 | + bool is_utf16(std::string const&); | ||
| 159 | + | ||
| 160 | + // Convert a UTF-8 encoded string to UTF-16 big-endian. | ||
| 161 | + // Unrepresentable code points are converted to U+FFFD. | ||
| 157 | QPDF_DLL | 162 | QPDF_DLL |
| 158 | std::string utf8_to_utf16(std::string const& utf8); | 163 | std::string utf8_to_utf16(std::string const& utf8); |
| 159 | 164 | ||
| @@ -169,6 +174,24 @@ namespace QUtil | @@ -169,6 +174,24 @@ namespace QUtil | ||
| 169 | QPDF_DLL | 174 | QPDF_DLL |
| 170 | std::string utf8_to_mac_roman( | 175 | std::string utf8_to_mac_roman( |
| 171 | std::string const& utf8, char unknown_char = '?'); | 176 | std::string const& utf8, char unknown_char = '?'); |
| 177 | + QPDF_DLL | ||
| 178 | + std::string utf8_to_pdf_doc( | ||
| 179 | + std::string const& utf8, char unknown_char = '?'); | ||
| 180 | + | ||
| 181 | + // Convert a UTF-16 big-endian encoded string to UTF-8. | ||
| 182 | + // Unrepresentable code points are converted to U+FFFD. | ||
| 183 | + QPDF_DLL | ||
| 184 | + std::string utf16_to_utf8(std::string const& utf16); | ||
| 185 | + | ||
| 186 | + // Convert from the specified single-byte encoding system to | ||
| 187 | + // UTF-8. There is no ascii_to_utf8 because all ASCII strings are | ||
| 188 | + // already valid UTF-8. | ||
| 189 | + QPDF_DLL | ||
| 190 | + std::string win_ansi_to_utf8(std::string const& win); | ||
| 191 | + QPDF_DLL | ||
| 192 | + std::string mac_roman_to_utf8(std::string const& mac); | ||
| 193 | + QPDF_DLL | ||
| 194 | + std::string pdf_doc_to_utf8(std::string const& pdfdoc); | ||
| 172 | 195 | ||
| 173 | // If secure random number generation is supported on your | 196 | // If secure random number generation is supported on your |
| 174 | // platform and qpdf was not compiled with insecure random number | 197 | // platform and qpdf was not compiled with insecure random number |
libqpdf/QPDF_String.cc
| @@ -8,43 +8,6 @@ | @@ -8,43 +8,6 @@ | ||
| 8 | // be used. | 8 | // be used. |
| 9 | #include <string.h> | 9 | #include <string.h> |
| 10 | 10 | ||
| 11 | -// First element is 128 | ||
| 12 | -static unsigned short pdf_doc_to_unicode[] = { | ||
| 13 | - 0x2022, // 0x80 BULLET | ||
| 14 | - 0x2020, // 0x81 DAGGER | ||
| 15 | - 0x2021, // 0x82 DOUBLE DAGGER | ||
| 16 | - 0x2026, // 0x83 HORIZONTAL ELLIPSIS | ||
| 17 | - 0x2014, // 0x84 EM DASH | ||
| 18 | - 0x2013, // 0x85 EN DASH | ||
| 19 | - 0x0192, // 0x86 SMALL LETTER F WITH HOOK | ||
| 20 | - 0x2044, // 0x87 FRACTION SLASH (solidus) | ||
| 21 | - 0x2039, // 0x88 SINGLE LEFT-POINTING ANGLE QUOTATION MARK | ||
| 22 | - 0x203a, // 0x89 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK | ||
| 23 | - 0x2212, // 0x8a MINUS SIGN | ||
| 24 | - 0x2030, // 0x8b PER MILLE SIGN | ||
| 25 | - 0x201e, // 0x8c DOUBLE LOW-9 QUOTATION MARK (quotedblbase) | ||
| 26 | - 0x201c, // 0x8d LEFT DOUBLE QUOTATION MARK (double quote left) | ||
| 27 | - 0x201d, // 0x8e RIGHT DOUBLE QUOTATION MARK (quotedblright) | ||
| 28 | - 0x2018, // 0x8f LEFT SINGLE QUOTATION MARK (quoteleft) | ||
| 29 | - 0x2019, // 0x90 RIGHT SINGLE QUOTATION MARK (quoteright) | ||
| 30 | - 0x201a, // 0x91 SINGLE LOW-9 QUOTATION MARK (quotesinglbase) | ||
| 31 | - 0x2122, // 0x92 TRADE MARK SIGN | ||
| 32 | - 0xfb01, // 0x93 LATIN SMALL LIGATURE FI | ||
| 33 | - 0xfb02, // 0x94 LATIN SMALL LIGATURE FL | ||
| 34 | - 0x0141, // 0x95 LATIN CAPITAL LETTER L WITH STROKE | ||
| 35 | - 0x0152, // 0x96 LATIN CAPITAL LIGATURE OE | ||
| 36 | - 0x0160, // 0x97 LATIN CAPITAL LETTER S WITH CARON | ||
| 37 | - 0x0178, // 0x98 LATIN CAPITAL LETTER Y WITH DIAERESIS | ||
| 38 | - 0x017d, // 0x99 LATIN CAPITAL LETTER Z WITH CARON | ||
| 39 | - 0x0131, // 0x9a LATIN SMALL LETTER DOTLESS I | ||
| 40 | - 0x0142, // 0x9b LATIN SMALL LETTER L WITH STROKE | ||
| 41 | - 0x0153, // 0x9c LATIN SMALL LIGATURE OE | ||
| 42 | - 0x0161, // 0x9d LATIN SMALL LETTER S WITH CARON | ||
| 43 | - 0x017e, // 0x9e LATIN SMALL LETTER Z WITH CARON | ||
| 44 | - 0xfffd, // 0x9f UNDEFINED | ||
| 45 | - 0x20ac, // 0xa0 EURO SIGN | ||
| 46 | -}; | ||
| 47 | - | ||
| 48 | // See above about ctype. | 11 | // See above about ctype. |
| 49 | static bool is_ascii_printable(unsigned char ch) | 12 | static bool is_ascii_printable(unsigned char ch) |
| 50 | { | 13 | { |
| @@ -210,62 +173,12 @@ QPDF_String::getVal() const | @@ -210,62 +173,12 @@ QPDF_String::getVal() const | ||
| 210 | std::string | 173 | std::string |
| 211 | QPDF_String::getUTF8Val() const | 174 | QPDF_String::getUTF8Val() const |
| 212 | { | 175 | { |
| 213 | - std::string result; | ||
| 214 | - size_t len = this->val.length(); | ||
| 215 | - if ((len >= 2) && (len % 2 == 0) && | ||
| 216 | - (this->val.at(0) == '\xfe') && (this->val.at(1) == '\xff')) | 176 | + if (QUtil::is_utf16(this->val)) |
| 217 | { | 177 | { |
| 218 | - // This is a Unicode string using big-endian UTF-16. This | ||
| 219 | - // code uses unsigned long and unsigned short to hold | ||
| 220 | - // codepoint values. It requires unsigned long to be at least | ||
| 221 | - // 32 bits and unsigned short to be at least 16 bits, but it | ||
| 222 | - // will work fine if they are larger. | ||
| 223 | - unsigned long codepoint = 0L; | ||
| 224 | - for (unsigned int i = 2; i < len; i += 2) | ||
| 225 | - { | ||
| 226 | - // Convert from UTF16-BE. If we get a malformed | ||
| 227 | - // codepoint, this code will generate incorrect output | ||
| 228 | - // without giving a warning. Specifically, a high | ||
| 229 | - // codepoint not followed by a low codepoint will be | ||
| 230 | - // discarded, and a low codepoint not preceded by a high | ||
| 231 | - // codepoint will just get its low 10 bits output. | ||
| 232 | - unsigned short bits = | ||
| 233 | - (static_cast<unsigned char>(this->val.at(i)) << 8) + | ||
| 234 | - static_cast<unsigned char>(this->val.at(i+1)); | ||
| 235 | - if ((bits & 0xFC00) == 0xD800) | ||
| 236 | - { | ||
| 237 | - codepoint = 0x10000 + ((bits & 0x3FF) << 10); | ||
| 238 | - continue; | ||
| 239 | - } | ||
| 240 | - else if ((bits & 0xFC00) == 0xDC00) | ||
| 241 | - { | ||
| 242 | - if (codepoint != 0) | ||
| 243 | - { | ||
| 244 | - QTC::TC("qpdf", "QPDF_String non-trivial UTF-16"); | ||
| 245 | - } | ||
| 246 | - codepoint += bits & 0x3FF; | ||
| 247 | - } | ||
| 248 | - else | ||
| 249 | - { | ||
| 250 | - codepoint = bits; | ||
| 251 | - } | ||
| 252 | - | ||
| 253 | - result += QUtil::toUTF8(codepoint); | ||
| 254 | - codepoint = 0; | ||
| 255 | - } | 178 | + return QUtil::utf16_to_utf8(this->val); |
| 256 | } | 179 | } |
| 257 | else | 180 | else |
| 258 | { | 181 | { |
| 259 | - for (unsigned int i = 0; i < len; ++i) | ||
| 260 | - { | ||
| 261 | - unsigned char ch = static_cast<unsigned char>(this->val.at(i)); | ||
| 262 | - unsigned short val = ch; | ||
| 263 | - if ((ch >= 128) && (ch <= 160)) | ||
| 264 | - { | ||
| 265 | - val = pdf_doc_to_unicode[ch - 128]; | ||
| 266 | - } | ||
| 267 | - result += QUtil::toUTF8(val); | ||
| 268 | - } | 182 | + return QUtil::pdf_doc_to_utf8(this->val); |
| 269 | } | 183 | } |
| 270 | - return result; | ||
| 271 | } | 184 | } |
libqpdf/QUtil.cc
| @@ -8,6 +8,7 @@ | @@ -8,6 +8,7 @@ | ||
| 8 | #endif | 8 | #endif |
| 9 | #include <qpdf/SecureRandomDataProvider.hh> | 9 | #include <qpdf/SecureRandomDataProvider.hh> |
| 10 | #include <qpdf/QPDFSystemError.hh> | 10 | #include <qpdf/QPDFSystemError.hh> |
| 11 | +#include <qpdf/QTC.hh> | ||
| 11 | 12 | ||
| 12 | #include <cmath> | 13 | #include <cmath> |
| 13 | #include <iomanip> | 14 | #include <iomanip> |
| @@ -29,6 +30,43 @@ | @@ -29,6 +30,43 @@ | ||
| 29 | #include <sys/stat.h> | 30 | #include <sys/stat.h> |
| 30 | #endif | 31 | #endif |
| 31 | 32 | ||
// Unicode code points for the PDFDocEncoding bytes 0x80 through 0xa0.
// The table is indexed by (byte - 128), so element 0 describes byte
// 0x80 and the last element describes byte 0xa0. Byte 0x9f has no
// character assigned and is represented here by U+FFFD. The table is
// lookup data only, so it is declared const.
static unsigned short const pdf_doc_to_unicode[] = {
    0x2022,    // 0x80    BULLET
    0x2020,    // 0x81    DAGGER
    0x2021,    // 0x82    DOUBLE DAGGER
    0x2026,    // 0x83    HORIZONTAL ELLIPSIS
    0x2014,    // 0x84    EM DASH
    0x2013,    // 0x85    EN DASH
    0x0192,    // 0x86    SMALL LETTER F WITH HOOK
    0x2044,    // 0x87    FRACTION SLASH (solidus)
    0x2039,    // 0x88    SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x203a,    // 0x89    SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x2212,    // 0x8a    MINUS SIGN
    0x2030,    // 0x8b    PER MILLE SIGN
    0x201e,    // 0x8c    DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
    0x201c,    // 0x8d    LEFT DOUBLE QUOTATION MARK (double quote left)
    0x201d,    // 0x8e    RIGHT DOUBLE QUOTATION MARK (quotedblright)
    0x2018,    // 0x8f    LEFT SINGLE QUOTATION MARK (quoteleft)
    0x2019,    // 0x90    RIGHT SINGLE QUOTATION MARK (quoteright)
    0x201a,    // 0x91    SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
    0x2122,    // 0x92    TRADE MARK SIGN
    0xfb01,    // 0x93    LATIN SMALL LIGATURE FI
    0xfb02,    // 0x94    LATIN SMALL LIGATURE FL
    0x0141,    // 0x95    LATIN CAPITAL LETTER L WITH STROKE
    0x0152,    // 0x96    LATIN CAPITAL LIGATURE OE
    0x0160,    // 0x97    LATIN CAPITAL LETTER S WITH CARON
    0x0178,    // 0x98    LATIN CAPITAL LETTER Y WITH DIAERESIS
    0x017d,    // 0x99    LATIN CAPITAL LETTER Z WITH CARON
    0x0131,    // 0x9a    LATIN SMALL LETTER DOTLESS I
    0x0142,    // 0x9b    LATIN SMALL LETTER L WITH STROKE
    0x0153,    // 0x9c    LATIN SMALL LIGATURE OE
    0x0161,    // 0x9d    LATIN SMALL LETTER S WITH CARON
    0x017e,    // 0x9e    LATIN SMALL LETTER Z WITH CARON
    0xfffd,    // 0x9f    UNDEFINED
    0x20ac,    // 0xa0    EURO SIGN
};
| 69 | + | ||
| 32 | std::string | 70 | std::string |
| 33 | QUtil::int_to_string(long long num, int length) | 71 | QUtil::int_to_string(long long num, int length) |
| 34 | { | 72 | { |
| @@ -895,7 +933,7 @@ QUtil::parse_numrange(char const* range, int max) | @@ -895,7 +933,7 @@ QUtil::parse_numrange(char const* range, int max) | ||
| 895 | return result; | 933 | return result; |
| 896 | } | 934 | } |
| 897 | 935 | ||
| 898 | -enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman }; | 936 | +enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman, e_pdfdoc }; |
| 899 | 937 | ||
| 900 | static unsigned char | 938 | static unsigned char |
| 901 | encode_winansi(unsigned long codepoint) | 939 | encode_winansi(unsigned long codepoint) |
| @@ -1342,6 +1380,119 @@ encode_macroman(unsigned long codepoint) | @@ -1342,6 +1380,119 @@ encode_macroman(unsigned long codepoint) | ||
| 1342 | return ch; | 1380 | return ch; |
| 1343 | } | 1381 | } |
| 1344 | 1382 | ||
// Map a Unicode code point to the PDFDocEncoding byte in the range
// 0x80 through 0xa0 that stands for it. Any code point without an
// entry below yields '\0'.
static unsigned char
encode_pdfdoc(unsigned long codepoint)
{
    // A switch statement is used rather than a static lookup
    // structure because a static would not be thread-safe.
    switch (codepoint)
    {
      case 0x2022: return 0x80;
      case 0x2020: return 0x81;
      case 0x2021: return 0x82;
      case 0x2026: return 0x83;
      case 0x2014: return 0x84;
      case 0x2013: return 0x85;
      case 0x0192: return 0x86;
      case 0x2044: return 0x87;
      case 0x2039: return 0x88;
      case 0x203a: return 0x89;
      case 0x2212: return 0x8a;
      case 0x2030: return 0x8b;
      case 0x201e: return 0x8c;
      case 0x201c: return 0x8d;
      case 0x201d: return 0x8e;
      case 0x2018: return 0x8f;
      case 0x2019: return 0x90;
      case 0x201a: return 0x91;
      case 0x2122: return 0x92;
      case 0xfb01: return 0x93;
      case 0xfb02: return 0x94;
      case 0x0141: return 0x95;
      case 0x0152: return 0x96;
      case 0x0160: return 0x97;
      case 0x0178: return 0x98;
      case 0x017d: return 0x99;
      case 0x0131: return 0x9a;
      case 0x0142: return 0x9b;
      case 0x0153: return 0x9c;
      case 0x0161: return 0x9d;
      case 0x017e: return 0x9e;
      case 0xfffd: return 0x9f;
      case 0x20ac: return 0xa0;
      default:
        break;
    }
    // No PDFDocEncoding byte in this range represents the code point.
    return '\0';
}
| 1495 | + | ||
| 1345 | static std::string | 1496 | static std::string |
| 1346 | transcode_utf8(std::string const& utf8_val, encoding_e encoding, | 1497 | transcode_utf8(std::string const& utf8_val, encoding_e encoding, |
| 1347 | char unknown) | 1498 | char unknown) |
| @@ -1410,24 +1561,27 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding, | @@ -1410,24 +1561,27 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding, | ||
| 1410 | { | 1561 | { |
| 1411 | result += QUtil::toUTF16(codepoint); | 1562 | result += QUtil::toUTF16(codepoint); |
| 1412 | } | 1563 | } |
| 1564 | + else if ((codepoint >= 160) && (codepoint < 256) && | ||
| 1565 | + ((encoding == e_winansi) || (encoding == e_pdfdoc))) | ||
| 1566 | + { | ||
| 1567 | + ch = static_cast<unsigned char>(codepoint & 0xff); | ||
| 1568 | + result.append(1, ch); | ||
| 1569 | + } | ||
| 1413 | else | 1570 | else |
| 1414 | { | 1571 | { |
| 1415 | ch = '\0'; | 1572 | ch = '\0'; |
| 1416 | if (encoding == e_winansi) | 1573 | if (encoding == e_winansi) |
| 1417 | { | 1574 | { |
| 1418 | - if ((codepoint >= 160) && (codepoint < 256)) | ||
| 1419 | - { | ||
| 1420 | - ch = static_cast<unsigned char>(codepoint & 0xff); | ||
| 1421 | - } | ||
| 1422 | - else | ||
| 1423 | - { | ||
| 1424 | - ch = encode_winansi(codepoint); | ||
| 1425 | - } | 1575 | + ch = encode_winansi(codepoint); |
| 1426 | } | 1576 | } |
| 1427 | else if (encoding == e_macroman) | 1577 | else if (encoding == e_macroman) |
| 1428 | { | 1578 | { |
| 1429 | ch = encode_macroman(codepoint); | 1579 | ch = encode_macroman(codepoint); |
| 1430 | } | 1580 | } |
| 1581 | + else if (encoding == e_pdfdoc) | ||
| 1582 | + { | ||
| 1583 | + ch = encode_pdfdoc(codepoint); | ||
| 1584 | + } | ||
| 1431 | if (ch == '\0') | 1585 | if (ch == '\0') |
| 1432 | { | 1586 | { |
| 1433 | ch = static_cast<unsigned char>(unknown); | 1587 | ch = static_cast<unsigned char>(unknown); |
| @@ -1463,3 +1617,98 @@ QUtil::utf8_to_mac_roman(std::string const& utf8, char unknown_char) | @@ -1463,3 +1617,98 @@ QUtil::utf8_to_mac_roman(std::string const& utf8, char unknown_char) | ||
| 1463 | { | 1617 | { |
| 1464 | return transcode_utf8(utf8, e_macroman, unknown_char); | 1618 | return transcode_utf8(utf8, e_macroman, unknown_char); |
| 1465 | } | 1619 | } |
| 1620 | + | ||
| 1621 | +std::string | ||
| 1622 | +QUtil::utf8_to_pdf_doc(std::string const& utf8, char unknown_char) | ||
| 1623 | +{ | ||
| 1624 | + return transcode_utf8(utf8, e_pdfdoc, unknown_char); | ||
| 1625 | +} | ||
| 1626 | + | ||
| 1627 | +bool | ||
| 1628 | +QUtil::is_utf16(std::string const& val) | ||
| 1629 | +{ | ||
| 1630 | + return ((val.length() >= 2) && | ||
| 1631 | + (val.at(0) == '\xfe') && (val.at(1) == '\xff')); | ||
| 1632 | +} | ||
| 1633 | + | ||
| 1634 | +std::string | ||
| 1635 | +QUtil::utf16_to_utf8(std::string const& val) | ||
| 1636 | +{ | ||
| 1637 | + std::string result; | ||
| 1638 | + // This code uses unsigned long and unsigned short to hold | ||
| 1639 | + // codepoint values. It requires unsigned long to be at least | ||
| 1640 | + // 32 bits and unsigned short to be at least 16 bits, but it | ||
| 1641 | + // will work fine if they are larger. | ||
| 1642 | + unsigned long codepoint = 0L; | ||
| 1643 | + size_t len = val.length(); | ||
| 1644 | + size_t start = 0; | ||
| 1645 | + if (is_utf16(val)) | ||
| 1646 | + { | ||
| 1647 | + start += 2; | ||
| 1648 | + } | ||
| 1649 | + // If the string has an odd number of bytes, the last byte is | ||
| 1650 | + // ignored. | ||
| 1651 | + for (unsigned int i = start; i < len; i += 2) | ||
| 1652 | + { | ||
| 1653 | + // Convert from UTF16-BE. If we get a malformed | ||
| 1654 | + // codepoint, this code will generate incorrect output | ||
| 1655 | + // without giving a warning. Specifically, a high | ||
| 1656 | + // codepoint not followed by a low codepoint will be | ||
| 1657 | + // discarded, and a low codepoint not preceded by a high | ||
| 1658 | + // codepoint will just get its low 10 bits output. | ||
| 1659 | + unsigned short bits = | ||
| 1660 | + (static_cast<unsigned char>(val.at(i)) << 8) + | ||
| 1661 | + static_cast<unsigned char>(val.at(i+1)); | ||
| 1662 | + if ((bits & 0xFC00) == 0xD800) | ||
| 1663 | + { | ||
| 1664 | + codepoint = 0x10000 + ((bits & 0x3FF) << 10); | ||
| 1665 | + continue; | ||
| 1666 | + } | ||
| 1667 | + else if ((bits & 0xFC00) == 0xDC00) | ||
| 1668 | + { | ||
| 1669 | + if (codepoint != 0) | ||
| 1670 | + { | ||
| 1671 | + QTC::TC("qpdf", "QUtil non-trivial UTF-16"); | ||
| 1672 | + } | ||
| 1673 | + codepoint += bits & 0x3FF; | ||
| 1674 | + } | ||
| 1675 | + else | ||
| 1676 | + { | ||
| 1677 | + codepoint = bits; | ||
| 1678 | + } | ||
| 1679 | + | ||
| 1680 | + result += QUtil::toUTF8(codepoint); | ||
| 1681 | + codepoint = 0; | ||
| 1682 | + } | ||
| 1683 | + return result; | ||
| 1684 | +} | ||
| 1685 | + | ||
| 1686 | +std::string | ||
| 1687 | +QUtil::win_ansi_to_utf8(std::string const& val) | ||
| 1688 | +{ | ||
| 1689 | + return "QXXXQ"; | ||
| 1690 | +} | ||
| 1691 | + | ||
| 1692 | +std::string | ||
| 1693 | +QUtil::mac_roman_to_utf8(std::string const& val) | ||
| 1694 | +{ | ||
| 1695 | + return "QXXXQ"; | ||
| 1696 | +} | ||
| 1697 | + | ||
| 1698 | +std::string | ||
| 1699 | +QUtil::pdf_doc_to_utf8(std::string const& val) | ||
| 1700 | +{ | ||
| 1701 | + std::string result; | ||
| 1702 | + size_t len = val.length(); | ||
| 1703 | + for (unsigned int i = 0; i < len; ++i) | ||
| 1704 | + { | ||
| 1705 | + unsigned char ch = static_cast<unsigned char>(val.at(i)); | ||
| 1706 | + unsigned short val = ch; | ||
| 1707 | + if ((ch >= 128) && (ch <= 160)) | ||
| 1708 | + { | ||
| 1709 | + val = pdf_doc_to_unicode[ch - 128]; | ||
| 1710 | + } | ||
| 1711 | + result += QUtil::toUTF8(val); | ||
| 1712 | + } | ||
| 1713 | + return result; | ||
| 1714 | +} |
qpdf/qpdf.testcov
| @@ -108,7 +108,7 @@ QPDF_Stream pipeStreamData with null pipeline 0 | @@ -108,7 +108,7 @@ QPDF_Stream pipeStreamData with null pipeline 0 | ||
| 108 | QPDFWriter not recompressing /FlateDecode 0 | 108 | QPDFWriter not recompressing /FlateDecode 0 |
| 109 | QPDF_encryption xref stream from encrypted file 0 | 109 | QPDF_encryption xref stream from encrypted file 0 |
| 110 | qpdf unable to filter 0 | 110 | qpdf unable to filter 0 |
| 111 | -QPDF_String non-trivial UTF-16 0 | 111 | +QUtil non-trivial UTF-16 0 |
| 112 | QPDF xref overwrite object 0 | 112 | QPDF xref overwrite object 0 |
| 113 | QPDF decoding error warning 0 | 113 | QPDF decoding error warning 0 |
| 114 | qpdf-c called qpdf_init 0 | 114 | qpdf-c called qpdf_init 0 |