Commit 370710657a7e7c771668107d1b6407fc350a2891

Authored by Jay Berkenbilt
Committed by Jay Berkenbilt
1 parent 77c31305

Add missing characters from PDF doc encoding (fixes #606)

ChangeLog
  1 +2022-01-11 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Bug fix: add missing characters from PDF doc encoding.
  4 + Fixes #606.
  5 +
1 6 2021-12-29 Jay Berkenbilt <ejb@ql.org>
2 7  
3 8 * Add method QUtil::file_can_be_opened
... ...
libqpdf/QUtil.cc
... ... @@ -37,8 +37,20 @@
37 37 # include <sys/stat.h>
38 38 #endif
39 39  
40   -// First element is 128
  40 +// First element is 24
  41 +static unsigned short pdf_doc_low_to_unicode[] = {
  42 + 0x02d8, // 0x18 BREVE
  43 + 0x02c7, // 0x19 CARON
  44 + 0x02c6, // 0x1a MODIFIER LETTER CIRCUMFLEX ACCENT
  45 + 0x02d9, // 0x1b DOT ABOVE
  46 + 0x02dd, // 0x1c DOUBLE ACUTE ACCENT
  47 + 0x02db, // 0x1d OGONEK
  48 + 0x02da, // 0x1e RING ABOVE
  49 + 0x02dc, // 0x1f SMALL TILDE
  50 +};
  51 +// First element is 127
41 52 static unsigned short pdf_doc_to_unicode[] = {
  53 + 0xfffd, // 0x7f UNDEFINED
42 54 0x2022, // 0x80 BULLET
43 55 0x2020, // 0x81 DAGGER
44 56 0x2021, // 0x82 DOUBLE DAGGER
... ... @@ -2032,6 +2044,30 @@ encode_pdfdoc(unsigned long codepoint)
2032 2044 unsigned char ch = '\0';
2033 2045 switch (codepoint)
2034 2046 {
  2047 + case 0x02d8:
  2048 + ch = 0x18;
  2049 + break;
  2050 + case 0x02c7:
  2051 + ch = 0x19;
  2052 + break;
  2053 + case 0x02c6:
  2054 + ch = 0x1a;
  2055 + break;
  2056 + case 0x02d9:
  2057 + ch = 0x1b;
  2058 + break;
  2059 + case 0x02dd:
  2060 + ch = 0x1c;
  2061 + break;
  2062 + case 0x02db:
  2063 + ch = 0x1d;
  2064 + break;
  2065 + case 0x02da:
  2066 + ch = 0x1e;
  2067 + break;
  2068 + case 0x02dc:
  2069 + ch = 0x1f;
  2070 + break;
2035 2071 case 0x2022:
2036 2072 ch = 0x80;
2037 2073 break;
... ... @@ -2427,9 +2463,13 @@ QUtil::pdf_doc_to_utf8(std::string const& val)
2427 2463 {
2428 2464 unsigned char ch = static_cast<unsigned char>(val.at(i));
2429 2465 unsigned short ch_short = ch;
2430   - if ((ch >= 128) && (ch <= 160))
  2466 + if ((ch >= 127) && (ch <= 160))
  2467 + {
  2468 + ch_short = pdf_doc_to_unicode[ch - 127];
  2469 + }
  2470 + else if ((ch >= 24) && (ch <= 31))
2431 2471 {
2432   - ch_short = pdf_doc_to_unicode[ch - 128];
  2472 + ch_short = pdf_doc_low_to_unicode[ch - 24];
2433 2473 }
2434 2474 result += QUtil::toUTF8(ch_short);
2435 2475 }
... ...
libtests/qtest/qutil/qutil.out
... ... @@ -69,6 +69,7 @@ HAGOOGAMAGOOGLE: 0
69 69 <c0>Does * have fingers?
70 70 ---- transcoding
71 71 bidirectional pdf doc done
  72 +bidirectional pdf doc low done
72 73 bidirectional win ansi done
73 74 bidirectional mac roman done
74 75 analysis done
... ... @@ -85,6 +86,8 @@ alternatives
85 86 2: 83a9e99e
86 87 0: 717561636b
87 88 done alternatives
  89 +w˘wˇwˆw˙w˝w˛w˚w˜w�w
  90 +done low characters
88 91 ---- whoami
89 92 quack1
90 93 quack2
... ...
libtests/qutil.cc
... ... @@ -308,12 +308,12 @@ void utf8_to_ascii_test()
308 308  
309 309 void transcoding_test(std::string (*to_utf8)(std::string const&),
310 310 std::string (*from_utf8)(std::string const&, char),
311   - int last, std::string unknown)
  311 + int first, int last, std::string unknown)
312 312 {
313 313 std::string in(" ");
314 314 std::string out;
315 315 std::string back;
316   - for (int i = 128; i <= last; ++i)
  316 + for (int i = first; i <= last; ++i)
317 317 {
318 318 in.at(0) = static_cast<char>(static_cast<unsigned char>(i));
319 319 out = (*to_utf8)(in);
... ... @@ -355,13 +355,16 @@ void print_alternatives(std::string const& str)
355 355 void transcoding_test()
356 356 {
357 357 transcoding_test(&QUtil::pdf_doc_to_utf8,
358   - &QUtil::utf8_to_pdf_doc, 160, "\x9f");
  358 + &QUtil::utf8_to_pdf_doc, 127, 160, "\x9f");
359 359 std::cout << "bidirectional pdf doc done" << std::endl;
  360 + transcoding_test(&QUtil::pdf_doc_to_utf8,
  361 + &QUtil::utf8_to_pdf_doc, 24, 31, "?");
  362 + std::cout << "bidirectional pdf doc low done" << std::endl;
360 363 transcoding_test(&QUtil::win_ansi_to_utf8,
361   - &QUtil::utf8_to_win_ansi, 160, "?");
  364 + &QUtil::utf8_to_win_ansi, 128, 160, "?");
362 365 std::cout << "bidirectional win ansi done" << std::endl;
363 366 transcoding_test(&QUtil::mac_roman_to_utf8,
364   - &QUtil::utf8_to_mac_roman, 255, "?");
  367 + &QUtil::utf8_to_mac_roman, 128, 255, "?");
365 368 std::cout << "bidirectional mac roman done" << std::endl;
366 369 check_analyze("pi = \317\200", true, true, false);
367 370 check_analyze("pi != \317", true, false, false);
... ... @@ -396,6 +399,10 @@ void transcoding_test()
396 399 print_alternatives(utf8);
397 400 print_alternatives("quack");
398 401 std::cout << "done alternatives" << std::endl;
  402 + std::string low = QUtil::pdf_doc_to_utf8(
  403 + "w\030w\031w\032w\033w\034w\035w\036w\037w\177w");
  404 + std::cout << low << std::endl;
  405 + std::cout << "done low characters" << std::endl;
399 406 }
400 407  
401 408 void print_whoami(char const* str)
... ...
qpdf/qtest/qpdf/json-image-streams-all.out
... ... @@ -604,7 +604,7 @@
604 604 "trailer": {
605 605 "/ID": [
606 606 "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶",
607   - "'+“‰¤V2«PP ç`m\"\u001d"
  607 + "'+“‰¤V2«PP ç`m\"˛"
608 608 ],
609 609 "/Root": "1 0 R",
610 610 "/Size": 31
... ...
qpdf/qtest/qpdf/json-image-streams-small.out
... ... @@ -615,8 +615,8 @@
615 615 },
616 616 "trailer": {
617 617 "/ID": [
618   - "Z§¯•Py»’~’46\u001dı\u0011¢",
619   - "Z§¯•Py»’~’46\u001dı\u0011¢"
  618 + "Z§¯•Py»’~’46˛ı\u0011¢",
  619 + "Z§¯•Py»’~’46˛ı\u0011¢"
620 620 ],
621 621 "/Root": "1 0 R",
622 622 "/Size": 31
... ...
qpdf/qtest/qpdf/json-image-streams-specialized.out
... ... @@ -604,7 +604,7 @@
604 604 "trailer": {
605 605 "/ID": [
606 606 "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶",
607   - "'+“‰¤V2«PP ç`m\"\u001d"
  607 + "'+“‰¤V2«PP ç`m\"˛"
608 608 ],
609 609 "/Root": "1 0 R",
610 610 "/Size": 31
... ...
qpdf/qtest/qpdf/json-image-streams.out
... ... @@ -604,7 +604,7 @@
604 604 "trailer": {
605 605 "/ID": [
606 606 "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶",
607   - "'+“‰¤V2«PP ç`m\"\u001d"
  607 + "'+“‰¤V2«PP ç`m\"˛"
608 608 ],
609 609 "/Root": "1 0 R",
610 610 "/Size": 31
... ...
qpdf/qtest/qpdf/json-page-labels-num-tree.out
... ... @@ -1518,8 +1518,8 @@
1518 1518 "99 0 R": 47,
1519 1519 "trailer": {
1520 1520 "/ID": [
1521   - "’ù\u0019Þxtó¼\\·¯½\u001eŁ7»",
1522   - "\rþ\u0018©LÞ\u000fKýÈl\u0003¯\u0019\u0001\u000e"
  1521 + "’ùˇÞxtó¼\\·¯½˚Ł7»",
  1522 + "\rþ˘©LÞ\u000fKýÈl\u0003¯ˇ\u0001\u000e"
1523 1523 ],
1524 1524 "/Root": "1 0 R",
1525 1525 "/Size": 100
... ...
qpdf/qtest/qpdf/page_api_2-json.out
... ... @@ -178,8 +178,8 @@
178 178 },
179 179 "trailer": {
180 180 "/ID": [
181   - "û\u0018·ƒÿ{5⁄\u0005Ú−S*º‘o",
182   - "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý\u001f\u0002"
  181 + "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o",
  182 + "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002"
183 183 ],
184 184 "/Info": "2 0 R",
185 185 "/Root": "1 0 R",
... ...