Commit 370710657a7e7c771668107d1b6407fc350a2891
Committed by Jay Berkenbilt
1 parent: 77c31305
Add missing characters from PDF doc encoding (fixes #606)
Showing 10 changed files with 72 additions and 17 deletions
ChangeLog
libqpdf/QUtil.cc
| ... | ... | @@ -37,8 +37,20 @@ |
| 37 | 37 | # include <sys/stat.h> |
| 38 | 38 | #endif |
| 39 | 39 | |
| 40 | -// First element is 128 | |
| 40 | +// First element is 24 | |
| 41 | +static unsigned short pdf_doc_low_to_unicode[] = { | |
| 42 | + 0x02d8, // 0x18 BREVE | |
| 43 | + 0x02c7, // 0x19 CARON | |
| 44 | + 0x02c6, // 0x1a MODIFIER LETTER CIRCUMFLEX ACCENT | |
| 45 | + 0x02d9, // 0x1b DOT ABOVE | |
| 46 | + 0x02dd, // 0x1c DOUBLE ACUTE ACCENT | |
| 47 | + 0x02db, // 0x1d OGONEK | |
| 48 | + 0x02da, // 0x1e RING ABOVE | |
| 49 | + 0x02dc, // 0x1f SMALL TILDE | |
| 50 | +}; | |
| 51 | +// First element is 127 | |
| 41 | 52 | static unsigned short pdf_doc_to_unicode[] = { |
| 53 | + 0xfffd, // 0x7f UNDEFINED | |
| 42 | 54 | 0x2022, // 0x80 BULLET |
| 43 | 55 | 0x2020, // 0x81 DAGGER |
| 44 | 56 | 0x2021, // 0x82 DOUBLE DAGGER |
| ... | ... | @@ -2032,6 +2044,30 @@ encode_pdfdoc(unsigned long codepoint) |
| 2032 | 2044 | unsigned char ch = '\0'; |
| 2033 | 2045 | switch (codepoint) |
| 2034 | 2046 | { |
| 2047 | + case 0x02d8: | |
| 2048 | + ch = 0x18; | |
| 2049 | + break; | |
| 2050 | + case 0x02c7: | |
| 2051 | + ch = 0x19; | |
| 2052 | + break; | |
| 2053 | + case 0x02c6: | |
| 2054 | + ch = 0x1a; | |
| 2055 | + break; | |
| 2056 | + case 0x02d9: | |
| 2057 | + ch = 0x1b; | |
| 2058 | + break; | |
| 2059 | + case 0x02dd: | |
| 2060 | + ch = 0x1c; | |
| 2061 | + break; | |
| 2062 | + case 0x02db: | |
| 2063 | + ch = 0x1d; | |
| 2064 | + break; | |
| 2065 | + case 0x02da: | |
| 2066 | + ch = 0x1e; | |
| 2067 | + break; | |
| 2068 | + case 0x02dc: | |
| 2069 | + ch = 0x1f; | |
| 2070 | + break; | |
| 2035 | 2071 | case 0x2022: |
| 2036 | 2072 | ch = 0x80; |
| 2037 | 2073 | break; |
| ... | ... | @@ -2427,9 +2463,13 @@ QUtil::pdf_doc_to_utf8(std::string const& val) |
| 2427 | 2463 | { |
| 2428 | 2464 | unsigned char ch = static_cast<unsigned char>(val.at(i)); |
| 2429 | 2465 | unsigned short ch_short = ch; |
| 2430 | - if ((ch >= 128) && (ch <= 160)) | |
| 2466 | + if ((ch >= 127) && (ch <= 160)) | |
| 2467 | + { | |
| 2468 | + ch_short = pdf_doc_to_unicode[ch - 127]; | |
| 2469 | + } | |
| 2470 | + else if ((ch >= 24) && (ch <= 31)) | |
| 2431 | 2471 | { |
| 2432 | - ch_short = pdf_doc_to_unicode[ch - 128]; | |
| 2472 | + ch_short = pdf_doc_low_to_unicode[ch - 24]; | |
| 2433 | 2473 | } |
| 2434 | 2474 | result += QUtil::toUTF8(ch_short); |
| 2435 | 2475 | } | ... | ... |
libtests/qtest/qutil/qutil.out
| ... | ... | @@ -69,6 +69,7 @@ HAGOOGAMAGOOGLE: 0 |
| 69 | 69 | <c0>Does * have fingers? |
| 70 | 70 | ---- transcoding |
| 71 | 71 | bidirectional pdf doc done |
| 72 | +bidirectional pdf doc low done | |
| 72 | 73 | bidirectional win ansi done |
| 73 | 74 | bidirectional mac roman done |
| 74 | 75 | analysis done |
| ... | ... | @@ -85,6 +86,8 @@ alternatives |
| 85 | 86 | 2: 83a9e99e |
| 86 | 87 | 0: 717561636b |
| 87 | 88 | done alternatives |
| 89 | +w˘wˇwˆw˙w˝w˛w˚w˜w�w | |
| 90 | +done low characters | |
| 88 | 91 | ---- whoami |
| 89 | 92 | quack1 |
| 90 | 93 | quack2 | ... | ... |
libtests/qutil.cc
| ... | ... | @@ -308,12 +308,12 @@ void utf8_to_ascii_test() |
| 308 | 308 | |
| 309 | 309 | void transcoding_test(std::string (*to_utf8)(std::string const&), |
| 310 | 310 | std::string (*from_utf8)(std::string const&, char), |
| 311 | - int last, std::string unknown) | |
| 311 | + int first, int last, std::string unknown) | |
| 312 | 312 | { |
| 313 | 313 | std::string in(" "); |
| 314 | 314 | std::string out; |
| 315 | 315 | std::string back; |
| 316 | - for (int i = 128; i <= last; ++i) | |
| 316 | + for (int i = first; i <= last; ++i) | |
| 317 | 317 | { |
| 318 | 318 | in.at(0) = static_cast<char>(static_cast<unsigned char>(i)); |
| 319 | 319 | out = (*to_utf8)(in); |
| ... | ... | @@ -355,13 +355,16 @@ void print_alternatives(std::string const& str) |
| 355 | 355 | void transcoding_test() |
| 356 | 356 | { |
| 357 | 357 | transcoding_test(&QUtil::pdf_doc_to_utf8, |
| 358 | - &QUtil::utf8_to_pdf_doc, 160, "\x9f"); | |
| 358 | + &QUtil::utf8_to_pdf_doc, 127, 160, "\x9f"); | |
| 359 | 359 | std::cout << "bidirectional pdf doc done" << std::endl; |
| 360 | + transcoding_test(&QUtil::pdf_doc_to_utf8, | |
| 361 | + &QUtil::utf8_to_pdf_doc, 24, 31, "?"); | |
| 362 | + std::cout << "bidirectional pdf doc low done" << std::endl; | |
| 360 | 363 | transcoding_test(&QUtil::win_ansi_to_utf8, |
| 361 | - &QUtil::utf8_to_win_ansi, 160, "?"); | |
| 364 | + &QUtil::utf8_to_win_ansi, 128, 160, "?"); | |
| 362 | 365 | std::cout << "bidirectional win ansi done" << std::endl; |
| 363 | 366 | transcoding_test(&QUtil::mac_roman_to_utf8, |
| 364 | - &QUtil::utf8_to_mac_roman, 255, "?"); | |
| 367 | + &QUtil::utf8_to_mac_roman, 128, 255, "?"); | |
| 365 | 368 | std::cout << "bidirectional mac roman done" << std::endl; |
| 366 | 369 | check_analyze("pi = \317\200", true, true, false); |
| 367 | 370 | check_analyze("pi != \317", true, false, false); |
| ... | ... | @@ -396,6 +399,10 @@ void transcoding_test() |
| 396 | 399 | print_alternatives(utf8); |
| 397 | 400 | print_alternatives("quack"); |
| 398 | 401 | std::cout << "done alternatives" << std::endl; |
| 402 | + std::string low = QUtil::pdf_doc_to_utf8( | |
| 403 | + "w\030w\031w\032w\033w\034w\035w\036w\037w\177w"); | |
| 404 | + std::cout << low << std::endl; | |
| 405 | + std::cout << "done low characters" << std::endl; | |
| 399 | 406 | } |
| 400 | 407 | |
| 401 | 408 | void print_whoami(char const* str) | ... | ... |
qpdf/qtest/qpdf/json-image-streams-all.out
qpdf/qtest/qpdf/json-image-streams-small.out
qpdf/qtest/qpdf/json-image-streams-specialized.out
qpdf/qtest/qpdf/json-image-streams.out
qpdf/qtest/qpdf/json-page-labels-num-tree.out
| ... | ... | @@ -1518,8 +1518,8 @@ |
| 1518 | 1518 | "99 0 R": 47, |
| 1519 | 1519 | "trailer": { |
| 1520 | 1520 | "/ID": [ |
| 1521 | - "’ù\u0019Þxtó¼\\·¯½\u001eŁ7»", | |
| 1522 | - "\rþ\u0018©LÞ\u000fKýÈl\u0003¯\u0019\u0001\u000e" | |
| 1521 | + "’ùˇÞxtó¼\\·¯½˚Ł7»", | |
| 1522 | + "\rþ˘©LÞ\u000fKýÈl\u0003¯ˇ\u0001\u000e" | |
| 1523 | 1523 | ], |
| 1524 | 1524 | "/Root": "1 0 R", |
| 1525 | 1525 | "/Size": 100 | ... | ... |
qpdf/qtest/qpdf/page_api_2-json.out
| ... | ... | @@ -178,8 +178,8 @@ |
| 178 | 178 | }, |
| 179 | 179 | "trailer": { |
| 180 | 180 | "/ID": [ |
| 181 | - "û\u0018·ƒÿ{5⁄\u0005Ú−S*º‘o", | |
| 182 | - "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý\u001f\u0002" | |
| 181 | + "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o", | |
| 182 | + "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002" | |
| 183 | 183 | ], |
| 184 | 184 | "/Info": "2 0 R", |
| 185 | 185 | "/Root": "1 0 R", | ... | ... |