Commit 370710657a7e7c771668107d1b6407fc350a2891
Committed by Jay Berkenbilt
1 parent: 77c31305
Add missing characters from PDF doc encoding (fixes #606)
Showing 10 changed files with 72 additions and 17 deletions
ChangeLog
libqpdf/QUtil.cc
| @@ -37,8 +37,20 @@ | @@ -37,8 +37,20 @@ | ||
| 37 | # include <sys/stat.h> | 37 | # include <sys/stat.h> |
| 38 | #endif | 38 | #endif |
| 39 | 39 | ||
| 40 | -// First element is 128 | 40 | +// First element is 24 |
| 41 | +static unsigned short pdf_doc_low_to_unicode[] = { | ||
| 42 | + 0x02d8, // 0x18 BREVE | ||
| 43 | + 0x02c7, // 0x19 CARON | ||
| 44 | + 0x02c6, // 0x1a MODIFIER LETTER CIRCUMFLEX ACCENT | ||
| 45 | + 0x02d9, // 0x1b DOT ABOVE | ||
| 46 | + 0x02dd, // 0x1c DOUBLE ACUTE ACCENT | ||
| 47 | + 0x02db, // 0x1d OGONEK | ||
| 48 | + 0x02da, // 0x1e RING ABOVE | ||
| 49 | + 0x02dc, // 0x1f SMALL TILDE | ||
| 50 | +}; | ||
| 51 | +// First element is 127 | ||
| 41 | static unsigned short pdf_doc_to_unicode[] = { | 52 | static unsigned short pdf_doc_to_unicode[] = { |
| 53 | + 0xfffd, // 0x7f UNDEFINED | ||
| 42 | 0x2022, // 0x80 BULLET | 54 | 0x2022, // 0x80 BULLET |
| 43 | 0x2020, // 0x81 DAGGER | 55 | 0x2020, // 0x81 DAGGER |
| 44 | 0x2021, // 0x82 DOUBLE DAGGER | 56 | 0x2021, // 0x82 DOUBLE DAGGER |
| @@ -2032,6 +2044,30 @@ encode_pdfdoc(unsigned long codepoint) | @@ -2032,6 +2044,30 @@ encode_pdfdoc(unsigned long codepoint) | ||
| 2032 | unsigned char ch = '\0'; | 2044 | unsigned char ch = '\0'; |
| 2033 | switch (codepoint) | 2045 | switch (codepoint) |
| 2034 | { | 2046 | { |
| 2047 | + case 0x02d8: | ||
| 2048 | + ch = 0x18; | ||
| 2049 | + break; | ||
| 2050 | + case 0x02c7: | ||
| 2051 | + ch = 0x19; | ||
| 2052 | + break; | ||
| 2053 | + case 0x02c6: | ||
| 2054 | + ch = 0x1a; | ||
| 2055 | + break; | ||
| 2056 | + case 0x02d9: | ||
| 2057 | + ch = 0x1b; | ||
| 2058 | + break; | ||
| 2059 | + case 0x02dd: | ||
| 2060 | + ch = 0x1c; | ||
| 2061 | + break; | ||
| 2062 | + case 0x02db: | ||
| 2063 | + ch = 0x1d; | ||
| 2064 | + break; | ||
| 2065 | + case 0x02da: | ||
| 2066 | + ch = 0x1e; | ||
| 2067 | + break; | ||
| 2068 | + case 0x02dc: | ||
| 2069 | + ch = 0x1f; | ||
| 2070 | + break; | ||
| 2035 | case 0x2022: | 2071 | case 0x2022: |
| 2036 | ch = 0x80; | 2072 | ch = 0x80; |
| 2037 | break; | 2073 | break; |
| @@ -2427,9 +2463,13 @@ QUtil::pdf_doc_to_utf8(std::string const& val) | @@ -2427,9 +2463,13 @@ QUtil::pdf_doc_to_utf8(std::string const& val) | ||
| 2427 | { | 2463 | { |
| 2428 | unsigned char ch = static_cast<unsigned char>(val.at(i)); | 2464 | unsigned char ch = static_cast<unsigned char>(val.at(i)); |
| 2429 | unsigned short ch_short = ch; | 2465 | unsigned short ch_short = ch; |
| 2430 | - if ((ch >= 128) && (ch <= 160)) | 2466 | + if ((ch >= 127) && (ch <= 160)) |
| 2467 | + { | ||
| 2468 | + ch_short = pdf_doc_to_unicode[ch - 127]; | ||
| 2469 | + } | ||
| 2470 | + else if ((ch >= 24) && (ch <= 31)) | ||
| 2431 | { | 2471 | { |
| 2432 | - ch_short = pdf_doc_to_unicode[ch - 128]; | 2472 | + ch_short = pdf_doc_low_to_unicode[ch - 24]; |
| 2433 | } | 2473 | } |
| 2434 | result += QUtil::toUTF8(ch_short); | 2474 | result += QUtil::toUTF8(ch_short); |
| 2435 | } | 2475 | } |
libtests/qtest/qutil/qutil.out
| @@ -69,6 +69,7 @@ HAGOOGAMAGOOGLE: 0 | @@ -69,6 +69,7 @@ HAGOOGAMAGOOGLE: 0 | ||
| 69 | <c0>Does * have fingers? | 69 | <c0>Does * have fingers? |
| 70 | ---- transcoding | 70 | ---- transcoding |
| 71 | bidirectional pdf doc done | 71 | bidirectional pdf doc done |
| 72 | +bidirectional pdf doc low done | ||
| 72 | bidirectional win ansi done | 73 | bidirectional win ansi done |
| 73 | bidirectional mac roman done | 74 | bidirectional mac roman done |
| 74 | analysis done | 75 | analysis done |
| @@ -85,6 +86,8 @@ alternatives | @@ -85,6 +86,8 @@ alternatives | ||
| 85 | 2: 83a9e99e | 86 | 2: 83a9e99e |
| 86 | 0: 717561636b | 87 | 0: 717561636b |
| 87 | done alternatives | 88 | done alternatives |
| 89 | +w˘wˇwˆw˙w˝w˛w˚w˜w�w | ||
| 90 | +done low characters | ||
| 88 | ---- whoami | 91 | ---- whoami |
| 89 | quack1 | 92 | quack1 |
| 90 | quack2 | 93 | quack2 |
libtests/qutil.cc
| @@ -308,12 +308,12 @@ void utf8_to_ascii_test() | @@ -308,12 +308,12 @@ void utf8_to_ascii_test() | ||
| 308 | 308 | ||
| 309 | void transcoding_test(std::string (*to_utf8)(std::string const&), | 309 | void transcoding_test(std::string (*to_utf8)(std::string const&), |
| 310 | std::string (*from_utf8)(std::string const&, char), | 310 | std::string (*from_utf8)(std::string const&, char), |
| 311 | - int last, std::string unknown) | 311 | + int first, int last, std::string unknown) |
| 312 | { | 312 | { |
| 313 | std::string in(" "); | 313 | std::string in(" "); |
| 314 | std::string out; | 314 | std::string out; |
| 315 | std::string back; | 315 | std::string back; |
| 316 | - for (int i = 128; i <= last; ++i) | 316 | + for (int i = first; i <= last; ++i) |
| 317 | { | 317 | { |
| 318 | in.at(0) = static_cast<char>(static_cast<unsigned char>(i)); | 318 | in.at(0) = static_cast<char>(static_cast<unsigned char>(i)); |
| 319 | out = (*to_utf8)(in); | 319 | out = (*to_utf8)(in); |
| @@ -355,13 +355,16 @@ void print_alternatives(std::string const& str) | @@ -355,13 +355,16 @@ void print_alternatives(std::string const& str) | ||
| 355 | void transcoding_test() | 355 | void transcoding_test() |
| 356 | { | 356 | { |
| 357 | transcoding_test(&QUtil::pdf_doc_to_utf8, | 357 | transcoding_test(&QUtil::pdf_doc_to_utf8, |
| 358 | - &QUtil::utf8_to_pdf_doc, 160, "\x9f"); | 358 | + &QUtil::utf8_to_pdf_doc, 127, 160, "\x9f"); |
| 359 | std::cout << "bidirectional pdf doc done" << std::endl; | 359 | std::cout << "bidirectional pdf doc done" << std::endl; |
| 360 | + transcoding_test(&QUtil::pdf_doc_to_utf8, | ||
| 361 | + &QUtil::utf8_to_pdf_doc, 24, 31, "?"); | ||
| 362 | + std::cout << "bidirectional pdf doc low done" << std::endl; | ||
| 360 | transcoding_test(&QUtil::win_ansi_to_utf8, | 363 | transcoding_test(&QUtil::win_ansi_to_utf8, |
| 361 | - &QUtil::utf8_to_win_ansi, 160, "?"); | 364 | + &QUtil::utf8_to_win_ansi, 128, 160, "?"); |
| 362 | std::cout << "bidirectional win ansi done" << std::endl; | 365 | std::cout << "bidirectional win ansi done" << std::endl; |
| 363 | transcoding_test(&QUtil::mac_roman_to_utf8, | 366 | transcoding_test(&QUtil::mac_roman_to_utf8, |
| 364 | - &QUtil::utf8_to_mac_roman, 255, "?"); | 367 | + &QUtil::utf8_to_mac_roman, 128, 255, "?"); |
| 365 | std::cout << "bidirectional mac roman done" << std::endl; | 368 | std::cout << "bidirectional mac roman done" << std::endl; |
| 366 | check_analyze("pi = \317\200", true, true, false); | 369 | check_analyze("pi = \317\200", true, true, false); |
| 367 | check_analyze("pi != \317", true, false, false); | 370 | check_analyze("pi != \317", true, false, false); |
| @@ -396,6 +399,10 @@ void transcoding_test() | @@ -396,6 +399,10 @@ void transcoding_test() | ||
| 396 | print_alternatives(utf8); | 399 | print_alternatives(utf8); |
| 397 | print_alternatives("quack"); | 400 | print_alternatives("quack"); |
| 398 | std::cout << "done alternatives" << std::endl; | 401 | std::cout << "done alternatives" << std::endl; |
| 402 | + std::string low = QUtil::pdf_doc_to_utf8( | ||
| 403 | + "w\030w\031w\032w\033w\034w\035w\036w\037w\177w"); | ||
| 404 | + std::cout << low << std::endl; | ||
| 405 | + std::cout << "done low characters" << std::endl; | ||
| 399 | } | 406 | } |
| 400 | 407 | ||
| 401 | void print_whoami(char const* str) | 408 | void print_whoami(char const* str) |
qpdf/qtest/qpdf/json-image-streams-all.out
| @@ -604,7 +604,7 @@ | @@ -604,7 +604,7 @@ | ||
| 604 | "trailer": { | 604 | "trailer": { |
| 605 | "/ID": [ | 605 | "/ID": [ |
| 606 | "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶", | 606 | "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶", |
| 607 | - "'+“‰¤V2«PP ç`m\"\u001d" | 607 | + "'+“‰¤V2«PP ç`m\"˛" |
| 608 | ], | 608 | ], |
| 609 | "/Root": "1 0 R", | 609 | "/Root": "1 0 R", |
| 610 | "/Size": 31 | 610 | "/Size": 31 |
qpdf/qtest/qpdf/json-image-streams-small.out
| @@ -615,8 +615,8 @@ | @@ -615,8 +615,8 @@ | ||
| 615 | }, | 615 | }, |
| 616 | "trailer": { | 616 | "trailer": { |
| 617 | "/ID": [ | 617 | "/ID": [ |
| 618 | - "Z§¯•Py»’~’46\u001dı\u0011¢", | ||
| 619 | - "Z§¯•Py»’~’46\u001dı\u0011¢" | 618 | + "Z§¯•Py»’~’46˛ı\u0011¢", |
| 619 | + "Z§¯•Py»’~’46˛ı\u0011¢" | ||
| 620 | ], | 620 | ], |
| 621 | "/Root": "1 0 R", | 621 | "/Root": "1 0 R", |
| 622 | "/Size": 31 | 622 | "/Size": 31 |
qpdf/qtest/qpdf/json-image-streams-specialized.out
| @@ -604,7 +604,7 @@ | @@ -604,7 +604,7 @@ | ||
| 604 | "trailer": { | 604 | "trailer": { |
| 605 | "/ID": [ | 605 | "/ID": [ |
| 606 | "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶", | 606 | "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶", |
| 607 | - "'+“‰¤V2«PP ç`m\"\u001d" | 607 | + "'+“‰¤V2«PP ç`m\"˛" |
| 608 | ], | 608 | ], |
| 609 | "/Root": "1 0 R", | 609 | "/Root": "1 0 R", |
| 610 | "/Size": 31 | 610 | "/Size": 31 |
qpdf/qtest/qpdf/json-image-streams.out
| @@ -604,7 +604,7 @@ | @@ -604,7 +604,7 @@ | ||
| 604 | "trailer": { | 604 | "trailer": { |
| 605 | "/ID": [ | 605 | "/ID": [ |
| 606 | "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶", | 606 | "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶", |
| 607 | - "'+“‰¤V2«PP ç`m\"\u001d" | 607 | + "'+“‰¤V2«PP ç`m\"˛" |
| 608 | ], | 608 | ], |
| 609 | "/Root": "1 0 R", | 609 | "/Root": "1 0 R", |
| 610 | "/Size": 31 | 610 | "/Size": 31 |
qpdf/qtest/qpdf/json-page-labels-num-tree.out
| @@ -1518,8 +1518,8 @@ | @@ -1518,8 +1518,8 @@ | ||
| 1518 | "99 0 R": 47, | 1518 | "99 0 R": 47, |
| 1519 | "trailer": { | 1519 | "trailer": { |
| 1520 | "/ID": [ | 1520 | "/ID": [ |
| 1521 | - "’ù\u0019Þxtó¼\\·¯½\u001eŁ7»", | ||
| 1522 | - "\rþ\u0018©LÞ\u000fKýÈl\u0003¯\u0019\u0001\u000e" | 1521 | + "’ùˇÞxtó¼\\·¯½˚Ł7»", |
| 1522 | + "\rþ˘©LÞ\u000fKýÈl\u0003¯ˇ\u0001\u000e" | ||
| 1523 | ], | 1523 | ], |
| 1524 | "/Root": "1 0 R", | 1524 | "/Root": "1 0 R", |
| 1525 | "/Size": 100 | 1525 | "/Size": 100 |
qpdf/qtest/qpdf/page_api_2-json.out
| @@ -178,8 +178,8 @@ | @@ -178,8 +178,8 @@ | ||
| 178 | }, | 178 | }, |
| 179 | "trailer": { | 179 | "trailer": { |
| 180 | "/ID": [ | 180 | "/ID": [ |
| 181 | - "û\u0018·ƒÿ{5⁄\u0005Ú−S*º‘o", | ||
| 182 | - "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý\u001f\u0002" | 181 | + "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o", |
| 182 | + "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002" | ||
| 183 | ], | 183 | ], |
| 184 | "/Info": "2 0 R", | 184 | "/Info": "2 0 R", |
| 185 | "/Root": "1 0 R", | 185 | "/Root": "1 0 R", |