Commit 370710657a7e7c771668107d1b6407fc350a2891

Authored by Jay Berkenbilt
Committed by Jay Berkenbilt
1 parent 77c31305

Add missing characters from PDF doc encoding (fixes #606)

ChangeLog
  1 +2022-01-11 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Bug fix: add missing characters from PDF doc encoding.
  4 + Fixes #606.
  5 +
1 2021-12-29 Jay Berkenbilt <ejb@ql.org> 6 2021-12-29 Jay Berkenbilt <ejb@ql.org>
2 7
3 * Add method QUtil::file_can_be_opened 8 * Add method QUtil::file_can_be_opened
libqpdf/QUtil.cc
@@ -37,8 +37,20 @@ @@ -37,8 +37,20 @@
37 # include <sys/stat.h> 37 # include <sys/stat.h>
38 #endif 38 #endif
39 39
40 -// First element is 128 40 +// First element is 24
  41 +static unsigned short pdf_doc_low_to_unicode[] = {
  42 + 0x02d8, // 0x18 BREVE
  43 + 0x02c7, // 0x19 CARON
  44 + 0x02c6, // 0x1a MODIFIER LETTER CIRCUMFLEX ACCENT
  45 + 0x02d9, // 0x1b DOT ABOVE
  46 + 0x02dd, // 0x1c DOUBLE ACUTE ACCENT
  47 + 0x02db, // 0x1d OGONEK
  48 + 0x02da, // 0x1e RING ABOVE
  49 + 0x02dc, // 0x1f SMALL TILDE
  50 +};
  51 +// First element is 127
41 static unsigned short pdf_doc_to_unicode[] = { 52 static unsigned short pdf_doc_to_unicode[] = {
  53 + 0xfffd, // 0x7f UNDEFINED
42 0x2022, // 0x80 BULLET 54 0x2022, // 0x80 BULLET
43 0x2020, // 0x81 DAGGER 55 0x2020, // 0x81 DAGGER
44 0x2021, // 0x82 DOUBLE DAGGER 56 0x2021, // 0x82 DOUBLE DAGGER
@@ -2032,6 +2044,30 @@ encode_pdfdoc(unsigned long codepoint) @@ -2032,6 +2044,30 @@ encode_pdfdoc(unsigned long codepoint)
2032 unsigned char ch = '\0'; 2044 unsigned char ch = '\0';
2033 switch (codepoint) 2045 switch (codepoint)
2034 { 2046 {
  2047 + case 0x02d8:
  2048 + ch = 0x18;
  2049 + break;
  2050 + case 0x02c7:
  2051 + ch = 0x19;
  2052 + break;
  2053 + case 0x02c6:
  2054 + ch = 0x1a;
  2055 + break;
  2056 + case 0x02d9:
  2057 + ch = 0x1b;
  2058 + break;
  2059 + case 0x02dd:
  2060 + ch = 0x1c;
  2061 + break;
  2062 + case 0x02db:
  2063 + ch = 0x1d;
  2064 + break;
  2065 + case 0x02da:
  2066 + ch = 0x1e;
  2067 + break;
  2068 + case 0x02dc:
  2069 + ch = 0x1f;
  2070 + break;
2035 case 0x2022: 2071 case 0x2022:
2036 ch = 0x80; 2072 ch = 0x80;
2037 break; 2073 break;
@@ -2427,9 +2463,13 @@ QUtil::pdf_doc_to_utf8(std::string const& val) @@ -2427,9 +2463,13 @@ QUtil::pdf_doc_to_utf8(std::string const& val)
2427 { 2463 {
2428 unsigned char ch = static_cast<unsigned char>(val.at(i)); 2464 unsigned char ch = static_cast<unsigned char>(val.at(i));
2429 unsigned short ch_short = ch; 2465 unsigned short ch_short = ch;
2430 - if ((ch >= 128) && (ch <= 160)) 2466 + if ((ch >= 127) && (ch <= 160))
  2467 + {
  2468 + ch_short = pdf_doc_to_unicode[ch - 127];
  2469 + }
  2470 + else if ((ch >= 24) && (ch <= 31))
2431 { 2471 {
2432 - ch_short = pdf_doc_to_unicode[ch - 128]; 2472 + ch_short = pdf_doc_low_to_unicode[ch - 24];
2433 } 2473 }
2434 result += QUtil::toUTF8(ch_short); 2474 result += QUtil::toUTF8(ch_short);
2435 } 2475 }
libtests/qtest/qutil/qutil.out
@@ -69,6 +69,7 @@ HAGOOGAMAGOOGLE: 0 @@ -69,6 +69,7 @@ HAGOOGAMAGOOGLE: 0
69 <c0>Does * have fingers? 69 <c0>Does * have fingers?
70 ---- transcoding 70 ---- transcoding
71 bidirectional pdf doc done 71 bidirectional pdf doc done
  72 +bidirectional pdf doc low done
72 bidirectional win ansi done 73 bidirectional win ansi done
73 bidirectional mac roman done 74 bidirectional mac roman done
74 analysis done 75 analysis done
@@ -85,6 +86,8 @@ alternatives @@ -85,6 +86,8 @@ alternatives
85 2: 83a9e99e 86 2: 83a9e99e
86 0: 717561636b 87 0: 717561636b
87 done alternatives 88 done alternatives
  89 +w˘wˇwˆw˙w˝w˛w˚w˜w�w
  90 +done low characters
88 ---- whoami 91 ---- whoami
89 quack1 92 quack1
90 quack2 93 quack2
libtests/qutil.cc
@@ -308,12 +308,12 @@ void utf8_to_ascii_test() @@ -308,12 +308,12 @@ void utf8_to_ascii_test()
308 308
309 void transcoding_test(std::string (*to_utf8)(std::string const&), 309 void transcoding_test(std::string (*to_utf8)(std::string const&),
310 std::string (*from_utf8)(std::string const&, char), 310 std::string (*from_utf8)(std::string const&, char),
311 - int last, std::string unknown) 311 + int first, int last, std::string unknown)
312 { 312 {
313 std::string in(" "); 313 std::string in(" ");
314 std::string out; 314 std::string out;
315 std::string back; 315 std::string back;
316 - for (int i = 128; i <= last; ++i) 316 + for (int i = first; i <= last; ++i)
317 { 317 {
318 in.at(0) = static_cast<char>(static_cast<unsigned char>(i)); 318 in.at(0) = static_cast<char>(static_cast<unsigned char>(i));
319 out = (*to_utf8)(in); 319 out = (*to_utf8)(in);
@@ -355,13 +355,16 @@ void print_alternatives(std::string const& str) @@ -355,13 +355,16 @@ void print_alternatives(std::string const& str)
355 void transcoding_test() 355 void transcoding_test()
356 { 356 {
357 transcoding_test(&QUtil::pdf_doc_to_utf8, 357 transcoding_test(&QUtil::pdf_doc_to_utf8,
358 - &QUtil::utf8_to_pdf_doc, 160, "\x9f"); 358 + &QUtil::utf8_to_pdf_doc, 127, 160, "\x9f");
359 std::cout << "bidirectional pdf doc done" << std::endl; 359 std::cout << "bidirectional pdf doc done" << std::endl;
  360 + transcoding_test(&QUtil::pdf_doc_to_utf8,
  361 + &QUtil::utf8_to_pdf_doc, 24, 31, "?");
  362 + std::cout << "bidirectional pdf doc low done" << std::endl;
360 transcoding_test(&QUtil::win_ansi_to_utf8, 363 transcoding_test(&QUtil::win_ansi_to_utf8,
361 - &QUtil::utf8_to_win_ansi, 160, "?"); 364 + &QUtil::utf8_to_win_ansi, 128, 160, "?");
362 std::cout << "bidirectional win ansi done" << std::endl; 365 std::cout << "bidirectional win ansi done" << std::endl;
363 transcoding_test(&QUtil::mac_roman_to_utf8, 366 transcoding_test(&QUtil::mac_roman_to_utf8,
364 - &QUtil::utf8_to_mac_roman, 255, "?"); 367 + &QUtil::utf8_to_mac_roman, 128, 255, "?");
365 std::cout << "bidirectional mac roman done" << std::endl; 368 std::cout << "bidirectional mac roman done" << std::endl;
366 check_analyze("pi = \317\200", true, true, false); 369 check_analyze("pi = \317\200", true, true, false);
367 check_analyze("pi != \317", true, false, false); 370 check_analyze("pi != \317", true, false, false);
@@ -396,6 +399,10 @@ void transcoding_test() @@ -396,6 +399,10 @@ void transcoding_test()
396 print_alternatives(utf8); 399 print_alternatives(utf8);
397 print_alternatives("quack"); 400 print_alternatives("quack");
398 std::cout << "done alternatives" << std::endl; 401 std::cout << "done alternatives" << std::endl;
  402 + std::string low = QUtil::pdf_doc_to_utf8(
  403 + "w\030w\031w\032w\033w\034w\035w\036w\037w\177w");
  404 + std::cout << low << std::endl;
  405 + std::cout << "done low characters" << std::endl;
399 } 406 }
400 407
401 void print_whoami(char const* str) 408 void print_whoami(char const* str)
qpdf/qtest/qpdf/json-image-streams-all.out
@@ -604,7 +604,7 @@ @@ -604,7 +604,7 @@
604 "trailer": { 604 "trailer": {
605 "/ID": [ 605 "/ID": [
606 "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶", 606 "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶",
607 - "'+“‰¤V2«PP ç`m\"\u001d" 607 + "'+“‰¤V2«PP ç`m\"˛"
608 ], 608 ],
609 "/Root": "1 0 R", 609 "/Root": "1 0 R",
610 "/Size": 31 610 "/Size": 31
qpdf/qtest/qpdf/json-image-streams-small.out
@@ -615,8 +615,8 @@ @@ -615,8 +615,8 @@
615 }, 615 },
616 "trailer": { 616 "trailer": {
617 "/ID": [ 617 "/ID": [
618 - "Z§¯•Py»’~’46\u001dı\u0011¢",  
619 - "Z§¯•Py»’~’46\u001dı\u0011¢" 618 + "Z§¯•Py»’~’46˛ı\u0011¢",
  619 + "Z§¯•Py»’~’46˛ı\u0011¢"
620 ], 620 ],
621 "/Root": "1 0 R", 621 "/Root": "1 0 R",
622 "/Size": 31 622 "/Size": 31
qpdf/qtest/qpdf/json-image-streams-specialized.out
@@ -604,7 +604,7 @@ @@ -604,7 +604,7 @@
604 "trailer": { 604 "trailer": {
605 "/ID": [ 605 "/ID": [
606 "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶", 606 "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶",
607 - "'+“‰¤V2«PP ç`m\"\u001d" 607 + "'+“‰¤V2«PP ç`m\"˛"
608 ], 608 ],
609 "/Root": "1 0 R", 609 "/Root": "1 0 R",
610 "/Size": 31 610 "/Size": 31
qpdf/qtest/qpdf/json-image-streams.out
@@ -604,7 +604,7 @@ @@ -604,7 +604,7 @@
604 "trailer": { 604 "trailer": {
605 "/ID": [ 605 "/ID": [
606 "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶", 606 "S¶Ł”łîð\u000e¢¬\u0007}_)\u0012¶",
607 - "'+“‰¤V2«PP ç`m\"\u001d" 607 + "'+“‰¤V2«PP ç`m\"˛"
608 ], 608 ],
609 "/Root": "1 0 R", 609 "/Root": "1 0 R",
610 "/Size": 31 610 "/Size": 31
qpdf/qtest/qpdf/json-page-labels-num-tree.out
@@ -1518,8 +1518,8 @@ @@ -1518,8 +1518,8 @@
1518 "99 0 R": 47, 1518 "99 0 R": 47,
1519 "trailer": { 1519 "trailer": {
1520 "/ID": [ 1520 "/ID": [
1521 - "’ù\u0019Þxtó¼\\·¯½\u001eŁ7»",  
1522 - "\rþ\u0018©LÞ\u000fKýÈl\u0003¯\u0019\u0001\u000e" 1521 + "’ùˇÞxtó¼\\·¯½˚Ł7»",
  1522 + "\rþ˘©LÞ\u000fKýÈl\u0003¯ˇ\u0001\u000e"
1523 ], 1523 ],
1524 "/Root": "1 0 R", 1524 "/Root": "1 0 R",
1525 "/Size": 100 1525 "/Size": 100
qpdf/qtest/qpdf/page_api_2-json.out
@@ -178,8 +178,8 @@ @@ -178,8 +178,8 @@
178 }, 178 },
179 "trailer": { 179 "trailer": {
180 "/ID": [ 180 "/ID": [
181 - "û\u0018·ƒÿ{5⁄\u0005Ú−S*º‘o",  
182 - "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý\u001f\u0002" 181 + "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o",
  182 + "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002"
183 ], 183 ],
184 "/Info": "2 0 R", 184 "/Info": "2 0 R",
185 "/Root": "1 0 R", 185 "/Root": "1 0 R",