Commit 4bb3046f0b139337a00e9182c9b47d1a3f8f8bb3

Authored by Jay Berkenbilt
1 parent 2780a187

Properly handle strings with PDF Doc Encoding (fixes #179)

The QPDF_String::getUTF8Val() method was not treating strings that
weren't explicitly Unicode as PDF Doc Encoded. This only affects
characters in the range 0x80 through 0xa0.
ChangeLog
1 2018-02-17 Jay Berkenbilt <ejb@ql.org> 1 2018-02-17 Jay Berkenbilt <ejb@ql.org>
2 2
  3 + * Fix QPDFObjectHandle::getUTF8Value() to properly handle strings
  4 + that are encoded with PDF Doc Encoding. Fixes #179.
  5 +
3 * Add qpdf_check_pdf to the "C" API. This method just attempts to 6 * Add qpdf_check_pdf to the "C" API. This method just attempts to
4 read the entire file and produce no output, making it possible to 7 read the entire file and produce no output, making it possible to
5 assess whether the file has any errors that qpdf can detect. 8 assess whether the file has any errors that qpdf can detect.
examples/qtest/bookmarks.test
@@ -48,4 +48,10 @@ $td->runtest("bookmarks deleted", @@ -48,4 +48,10 @@ $td->runtest("bookmarks deleted",
48 $td->EXIT_STATUS => 0}, 48 $td->EXIT_STATUS => 0},
49 $td->NORMALIZE_NEWLINES); 49 $td->NORMALIZE_NEWLINES);
50 50
51 -$td->report(10); 51 +$td->runtest("non-trivial pdf doc to unicode",
  52 + {$td->COMMAND => "pdf-bookmarks issue-179.pdf"},
  53 + {$td->FILE => "issue-179.out",
  54 + $td->EXIT_STATUS => 0},
  55 + $td->NORMALIZE_NEWLINES);
  56 +
  57 +$td->report(11);
examples/qtest/bookmarks/issue-179.out 0 → 100644
  1
  2 +žč
  3 +žđ
  4 +žć
  5 +žš
  6 +ž ajklyghvbnmxcseqwuioprtzdf
  7
  8 +šč
  9 +šđ
  10 +šć
  11 +šž
  12 +š ajklyghvbnmxcseqwuioprtzdf
examples/qtest/bookmarks/issue-179.pdf 0 → 100644
No preview for this file type
include/qpdf/QPDFObjectHandle.hh
@@ -442,6 +442,13 @@ class QPDFObjectHandle @@ -442,6 +442,13 @@ class QPDFObjectHandle
442 // Methods for string objects 442 // Methods for string objects
443 QPDF_DLL 443 QPDF_DLL
444 std::string getStringValue(); 444 std::string getStringValue();
  445 + // If a string starts with the UTF-16 marker, it is converted from
  446 + // UTF-16 to UTF-8. Otherwise, it is treated as a string encoded
  447 + // with PDF Doc Encoding. PDF Doc Encoding is identical to
  448 + // ISO-8859-1 except in the range from 0200 through 0240, where
  449 + // there is a mapping of characters to Unicode. QPDF versions
  450 + // prior to this fix erroneously left characters in that range
  451 + // unmapped.
445 QPDF_DLL 452 QPDF_DLL
446 std::string getUTF8Value(); 453 std::string getUTF8Value();
447 454
libqpdf/QPDF_String.cc
@@ -8,6 +8,43 @@ @@ -8,6 +8,43 @@
8 // be used. 8 // be used.
9 #include <string.h> 9 #include <string.h>
10 10
  11 +// Indexed by (byte value - 128): the first element maps byte 0x80
  12 +static unsigned short pdf_doc_to_unicode[] = {
  13 + 0x2022, // 0x80 BULLET
  14 + 0x2020, // 0x81 DAGGER
  15 + 0x2021, // 0x82 DOUBLE DAGGER
  16 + 0x2026, // 0x83 HORIZONTAL ELLIPSIS
  17 + 0x2014, // 0x84 EM DASH
  18 + 0x2013, // 0x85 EN DASH
  19 + 0x0192, // 0x86 SMALL LETTER F WITH HOOK
  20 + 0x2044, // 0x87 FRACTION SLASH (solidus)
  21 + 0x2039, // 0x88 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
  22 + 0x203a, // 0x89 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
  23 + 0x2212, // 0x8a MINUS SIGN
  24 + 0x2030, // 0x8b PER MILLE SIGN
  25 + 0x201e, // 0x8c DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
  26 + 0x201c, // 0x8d LEFT DOUBLE QUOTATION MARK (double quote left)
  27 + 0x201d, // 0x8e RIGHT DOUBLE QUOTATION MARK (quotedblright)
  28 + 0x2018, // 0x8f LEFT SINGLE QUOTATION MARK (quoteleft)
  29 + 0x2019, // 0x90 RIGHT SINGLE QUOTATION MARK (quoteright)
  30 + 0x201a, // 0x91 SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
  31 + 0x2122, // 0x92 TRADE MARK SIGN
  32 + 0xfb01, // 0x93 LATIN SMALL LIGATURE FI
  33 + 0xfb02, // 0x94 LATIN SMALL LIGATURE FL
  34 + 0x0141, // 0x95 LATIN CAPITAL LETTER L WITH STROKE
  35 + 0x0152, // 0x96 LATIN CAPITAL LIGATURE OE
  36 + 0x0160, // 0x97 LATIN CAPITAL LETTER S WITH CARON
  37 + 0x0178, // 0x98 LATIN CAPITAL LETTER Y WITH DIAERESIS
  38 + 0x017d, // 0x99 LATIN CAPITAL LETTER Z WITH CARON
  39 + 0x0131, // 0x9a LATIN SMALL LETTER DOTLESS I
  40 + 0x0142, // 0x9b LATIN SMALL LETTER L WITH STROKE
  41 + 0x0153, // 0x9c LATIN SMALL LIGATURE OE
  42 + 0x0161, // 0x9d LATIN SMALL LETTER S WITH CARON
  43 + 0x017e, // 0x9e LATIN SMALL LETTER Z WITH CARON
  44 + 0xfffd, // 0x9f UNDEFINED
  45 + 0x20ac, // 0xa0 EURO SIGN
  46 +};
  47 +
11 // See above about ctype. 48 // See above about ctype.
12 static bool is_ascii_printable(unsigned char ch) 49 static bool is_ascii_printable(unsigned char ch)
13 { 50 {
@@ -209,7 +246,13 @@ QPDF_String::getUTF8Val() const @@ -209,7 +246,13 @@ QPDF_String::getUTF8Val() const
209 { 246 {
210 for (unsigned int i = 0; i < len; ++i) 247 for (unsigned int i = 0; i < len; ++i)
211 { 248 {
212 - result += QUtil::toUTF8(static_cast<unsigned char>(this->val.at(i))); 249 + unsigned char ch = static_cast<unsigned char>(this->val.at(i));
  250 + unsigned short val = ch;
  251 + if ((ch >= 128) && (ch <= 160))
  252 + {
  253 + val = pdf_doc_to_unicode[ch - 128];
  254 + }
  255 + result += QUtil::toUTF8(val);
213 } 256 }
214 } 257 }
215 return result; 258 return result;
qpdf/build.mk
1 -BINS_qpdf = qpdf test_driver pdf_from_scratch test_large_file test_tokenizer 1 +BINS_qpdf = \
  2 + qpdf \
  3 + pdf_from_scratch \
  4 + test_driver \
  5 + test_large_file \
  6 + test_pdf_doc_encoding \
  7 + test_tokenizer
2 CBINS_qpdf = qpdf-ctest 8 CBINS_qpdf = qpdf-ctest
3 9
4 TARGETS_qpdf = $(foreach B,$(BINS_qpdf) $(CBINS_qpdf),qpdf/$(OUTPUT_DIR)/$(call binname,$(B))) 10 TARGETS_qpdf = $(foreach B,$(BINS_qpdf) $(CBINS_qpdf),qpdf/$(OUTPUT_DIR)/$(call binname,$(B)))
qpdf/qtest/qpdf.test
@@ -84,6 +84,16 @@ flush_tiff_cache(); @@ -84,6 +84,16 @@ flush_tiff_cache();
84 84
85 show_ntests(); 85 show_ntests();
86 # ---------- 86 # ----------
  87 +$td->notify("--- PDF Doc Encoding ---");
  88 +$n_tests += 1;
  89 +
  90 +$td->runtest("PDF doc encoding to Unicode",
  91 + {$td->COMMAND => "test_pdf_doc_encoding pdf-doc-to-utf8.in"},
  92 + {$td->FILE => "pdf-doc-to-utf8.out", $td->EXIT_STATUS => 0},
  93 + $td->NORMALIZE_NEWLINES);
  94 +
  95 +show_ntests();
  96 +# ----------
87 $td->notify("--- Stream Replacement Tests ---"); 97 $td->notify("--- Stream Replacement Tests ---");
88 $n_tests += 8; 98 $n_tests += 8;
89 99
qpdf/qtest/qpdf/pdf-doc-to-utf8.in 0 → 100644
  1 +€ 128 0x80 0200 U+2022 BULLET
  2 + 129 0x81 0201 U+2020 DAGGER
  3 +‚ 130 0x82 0202 U+2021 DOUBLE DAGGER
  4 +ƒ 131 0x83 0203 U+2026 HORIZONTAL ELLIPSIS
  5 +„ 132 0x84 0204 U+2014 EM DASH
  6 +… 133 0x85 0205 U+2013 EN DASH
  7 +† 134 0x86 0206 U+0192 SMALL LETTER F WITH HOOK
  8 +‡ 135 0x87 0207 U+2044 FRACTION SLASH (solidus)
  9 +ˆ 136 0x88 0210 U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
  10 +‰ 137 0x89 0211 U+203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
  11 +Š 138 0x8a 0212 U+2212 MINUS SIGN
  12 +‹ 139 0x8b 0213 U+2030 PER MILLE SIGN
  13 +Œ 140 0x8c 0214 U+201E DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
  14 + 141 0x8d 0215 U+201C LEFT DOUBLE QUOTATION MARK (double quote left)
  15 +Ž 142 0x8e 0216 U+201D RIGHT DOUBLE QUOTATION MARK (quotedblright)
  16 + 143 0x8f 0217 U+2018 LEFT SINGLE QUOTATION MARK (quoteleft)
  17 + 144 0x90 0220 U+2019 RIGHT SINGLE QUOTATION MARK (quoteright)
  18 +‘ 145 0x91 0221 U+201A SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
  19 +’ 146 0x92 0222 U+2122 TRADE MARK SIGN
  20 +“ 147 0x93 0223 U+FB01 LATIN SMALL LIGATURE FI
  21 +” 148 0x94 0224 U+FB02 LATIN SMALL LIGATURE FL
  22 +• 149 0x95 0225 U+0141 LATIN CAPITAL LETTER L WITH STROKE
  23 +– 150 0x96 0226 U+0152 LATIN CAPITAL LIGATURE OE
  24 +— 151 0x97 0227 U+0160 LATIN CAPITAL LETTER S WITH CARON
  25 +˜ 152 0x98 0230 U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
  26 +™ 153 0x99 0231 U+017D LATIN CAPITAL LETTER Z WITH CARON
  27 +š 154 0x9a 0232 U+0131 LATIN SMALL LETTER DOTLESS I
  28 +› 155 0x9b 0233 U+0142 LATIN SMALL LETTER L WITH STROKE
  29 +œ 156 0x9c 0234 U+0153 LATIN SMALL LIGATURE OE
  30 + 157 0x9d 0235 U+0161 LATIN SMALL LETTER S WITH CARON
  31 +ž 158 0x9e 0236 U+017E LATIN SMALL LETTER Z WITH CARON
  32 +Ÿ 159 0x9f 0237 U+FFFD UNDEFINED
  33 +  160 0xa0 0240 U+20AC EURO SIGN
qpdf/qtest/qpdf/pdf-doc-to-utf8.out 0 → 100644
  1 +• 128 0x80 0200 U+2022 BULLET
  2 +† 129 0x81 0201 U+2020 DAGGER
  3 +‡ 130 0x82 0202 U+2021 DOUBLE DAGGER
  4 +… 131 0x83 0203 U+2026 HORIZONTAL ELLIPSIS
  5 +— 132 0x84 0204 U+2014 EM DASH
  6 +– 133 0x85 0205 U+2013 EN DASH
  7 +ƒ 134 0x86 0206 U+0192 SMALL LETTER F WITH HOOK
  8 +⁄ 135 0x87 0207 U+2044 FRACTION SLASH (solidus)
  9 +‹ 136 0x88 0210 U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
  10 +› 137 0x89 0211 U+203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
  11 +− 138 0x8a 0212 U+2212 MINUS SIGN
  12 +‰ 139 0x8b 0213 U+2030 PER MILLE SIGN
  13 +„ 140 0x8c 0214 U+201E DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
  14 +“ 141 0x8d 0215 U+201C LEFT DOUBLE QUOTATION MARK (double quote left)
  15 +” 142 0x8e 0216 U+201D RIGHT DOUBLE QUOTATION MARK (quotedblright)
  16 +‘ 143 0x8f 0217 U+2018 LEFT SINGLE QUOTATION MARK (quoteleft)
  17 +’ 144 0x90 0220 U+2019 RIGHT SINGLE QUOTATION MARK (quoteright)
  18 +‚ 145 0x91 0221 U+201A SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
  19 +™ 146 0x92 0222 U+2122 TRADE MARK SIGN
  20 +fi 147 0x93 0223 U+FB01 LATIN SMALL LIGATURE FI
  21 +fl 148 0x94 0224 U+FB02 LATIN SMALL LIGATURE FL
  22 +Ł 149 0x95 0225 U+0141 LATIN CAPITAL LETTER L WITH STROKE
  23 +Œ 150 0x96 0226 U+0152 LATIN CAPITAL LIGATURE OE
  24 +Š 151 0x97 0227 U+0160 LATIN CAPITAL LETTER S WITH CARON
  25 +Ÿ 152 0x98 0230 U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
  26 +Ž 153 0x99 0231 U+017D LATIN CAPITAL LETTER Z WITH CARON
  27 +ı 154 0x9a 0232 U+0131 LATIN SMALL LETTER DOTLESS I
  28 +ł 155 0x9b 0233 U+0142 LATIN SMALL LETTER L WITH STROKE
  29 +œ 156 0x9c 0234 U+0153 LATIN SMALL LIGATURE OE
  30 +š 157 0x9d 0235 U+0161 LATIN SMALL LETTER S WITH CARON
  31 +ž 158 0x9e 0236 U+017E LATIN SMALL LETTER Z WITH CARON
  32 +� 159 0x9f 0237 U+FFFD UNDEFINED
  33 +€ 160 0xa0 0240 U+20AC EURO SIGN
qpdf/test_pdf_doc_encoding.cc 0 → 100644
  1 +#include <qpdf/QUtil.hh>
  2 +#include <qpdf/QPDFObjectHandle.hh>
  3 +#include <iostream>
  4 +#include <stdlib.h>
  5 +#include <string.h>
  6 +
  7 +static char const* whoami = 0;
  8 +
  9 +void usage()
  10 +{
  11 + std::cerr << "Usage: " << whoami << " infile" << std::endl;
  12 + exit(2);
  13 +}
  14 +
  15 +int main(int argc, char* argv[])
  16 +{
  17 + if ((whoami = strrchr(argv[0], '/')) == NULL)
  18 + {
  19 + whoami = argv[0];
  20 + }
  21 + else
  22 + {
  23 + ++whoami;
  24 + }
  25 + // For libtool's sake....
  26 + if (strncmp(whoami, "lt-", 3) == 0)
  27 + {
  28 + whoami += 3;
  29 + }
  30 +
  31 + if (argc != 2)
  32 + {
  33 + usage();
  34 + }
  35 + char const* infilename = argv[1];
  36 + std::list<std::string> lines =
  37 + QUtil::read_lines_from_file(infilename);
  38 + for (std::list<std::string>::iterator iter = lines.begin();
  39 + iter != lines.end(); ++iter)
  40 + {
  41 + QPDFObjectHandle str = QPDFObjectHandle::newString(*iter);
  42 + std::cout << str.getUTF8Value() << std::endl;
  43 + }
  44 + return 0;
  45 +}