Commit f7ac5915909c7197acf84265f8d8ad41b95a36a8
1 parent
07a2bb33
Recognize explicit UTF-8 strings (fixes #654)
Showing
6 changed files
with
38 additions
and
11 deletions
ChangeLog
| 1 | +2022-02-22 Jay Berkenbilt <ejb@ql.org> | ||
| 2 | + | ||
| 3 | + * Recognize PDF strings explicitly marked as UTF-8, as allowed by | ||
| 4 | + the PDF 2.0 spec. Fixes #654. | ||
| 5 | + | ||
| 1 | 2022-02-18 Jay Berkenbilt <ejb@ql.org> | 6 | 2022-02-18 Jay Berkenbilt <ejb@ql.org> |
| 2 | 7 | ||
| 3 | * Bug fix: when generating appearance streams, the font size was | 8 | * Bug fix: when generating appearance streams, the font size was |
TODO
| @@ -10,6 +10,14 @@ Priorities for 11: | @@ -10,6 +10,14 @@ Priorities for 11: | ||
| 10 | * PointerHolder -> shared_ptr | 10 | * PointerHolder -> shared_ptr |
| 11 | * ABI | 11 | * ABI |
| 12 | 12 | ||
| 13 | +Misc | ||
| 14 | +* Get rid of "ugly switch statements" in QUtil.cc -- replace with | ||
| 15 | + static map initializers. (Search for "ugly switch statements" below | ||
| 16 | + as well.) | ||
| 17 | +* Consider exposing get_next_utf8_codepoint in QUtil | ||
| 18 | +* Add QUtil::is_explicit_utf8 that does what QPDF_String::getUTF8Val | ||
| 19 | + does to detect UTF-8 encoded strings per PDF 2.0 spec. | ||
| 20 | + | ||
| 13 | Soon: Break ground on "Document-level work" | 21 | Soon: Break ground on "Document-level work" |
| 14 | 22 | ||
| 15 | Code Formatting | 23 | Code Formatting |
libqpdf/QPDF_String.cc
| @@ -183,6 +183,15 @@ QPDF_String::getUTF8Val() const | @@ -183,6 +183,15 @@ QPDF_String::getUTF8Val() const | ||
| 183 | { | 183 | { |
| 184 | return QUtil::utf16_to_utf8(this->val); | 184 | return QUtil::utf16_to_utf8(this->val); |
| 185 | } | 185 | } |
| 186 | + else if ((val.length() >= 3) && | ||
| 187 | + (val[0] == '\xEF') && | ||
| 188 | + (val[1] == '\xBB') && | ||
| 189 | + (val[2] == '\xBF')) | ||
| 190 | + { | ||
| 191 | + // PDF 2.0 allows UTF-8 strings when explicitly prefixed with | ||
| 192 | + // the above bytes, which are just the UTF-8 encoding of U+FEFF. | ||
| 193 | + return this->val.substr(3); | ||
| 194 | + } | ||
| 186 | else | 195 | else |
| 187 | { | 196 | { |
| 188 | return QUtil::pdf_doc_to_utf8(this->val); | 197 | return QUtil::pdf_doc_to_utf8(this->val); |
manual/release-notes.rst
| @@ -9,6 +9,9 @@ For a detailed list of changes, please see the file | @@ -9,6 +9,9 @@ For a detailed list of changes, please see the file | ||
| 9 | 10.6.3: XXX | 9 | 10.6.3: XXX |
| 10 | - Bug fixes: | 10 | - Bug fixes: |
| 11 | 11 | ||
| 12 | + - Recognize strings explicitly encoded as UTF-8, as allowed by the | ||
| 13 | + PDF 2.0 spec. | ||
| 14 | + | ||
| 12 | - Fix edge cases with appearance stream generation for form fields | 15 | - Fix edge cases with appearance stream generation for form fields |
| 13 | whose ``/DA`` field lacks proper font size specification or that | 16 | whose ``/DA`` field lacks proper font size specification or that |
| 14 | specifies auto sizing. At this time, qpdf does not support auto | 17 | specifies auto sizing. At this time, qpdf does not support auto |
qpdf/qtest/qpdf/numeric-and-string-3.out
| @@ -7,8 +7,9 @@ end page 1 | @@ -7,8 +7,9 @@ end page 1 | ||
| 7 | QStrings: | 7 | QStrings: |
| 8 | No Special Characters | 8 | No Special Characters |
| 9 | These: ¿÷¢þ and no more | 9 | These: ¿÷¢þ and no more |
| 10 | +Explicit utf-8 with π | ||
| 10 | πωτατω | 11 | πωτατω |
| 11 | -treble clef: 𝄠; sixteenth note: 𝅘𝅥𝅮 | 12 | +treble clef: 𝄠; sixteenth note: 𝅘𝅥𝅯 |
| 12 | QNumbers: | 13 | QNumbers: |
| 13 | 1.000 | 14 | 1.000 |
| 14 | 3.142 | 15 | 3.142 |
qpdf/qtest/qpdf/numeric-and-string-3.pdf
| @@ -12,8 +12,9 @@ | @@ -12,8 +12,9 @@ | ||
| 12 | /QStrings [ | 12 | /QStrings [ |
| 13 | (No Special Characters) | 13 | (No Special Characters) |
| 14 | (These: ¿÷¢þ and no more) | 14 | (These: ¿÷¢þ and no more) |
| 15 | + (\357\273\277Explicit utf-8 with \317\200) | ||
| 15 | <feff03c003c903c403b103c403c9> | 16 | <feff03c003c903c403b103c403c9> |
| 16 | - <feff0074007200650062006c006500200063006c00650066003a0020d834dd20003b0020007300690078007400650065006e007400680020006e006f00740065003a0020d834dd60> | 17 | + <feff0074007200650062006c006500200063006c00650066003a0020d834dd20003b0020007300690078007400650065006e007400680020006e006f00740065003a0020d834dd61> |
| 17 | ] | 18 | ] |
| 18 | /Type /Catalog | 19 | /Type /Catalog |
| 19 | >> | 20 | >> |
| @@ -110,19 +111,19 @@ xref | @@ -110,19 +111,19 @@ xref | ||
| 110 | 0 10 | 111 | 0 10 |
| 111 | 0000000000 65535 f | 112 | 0000000000 65535 f |
| 112 | 0000000025 00000 n | 113 | 0000000025 00000 n |
| 113 | -0000000377 00000 n | ||
| 114 | -0000000459 00000 n | ||
| 115 | -0000000694 00000 n | ||
| 116 | -0000000793 00000 n | ||
| 117 | -0000000835 00000 n | ||
| 118 | -0000000933 00000 n | ||
| 119 | -0000000952 00000 n | ||
| 120 | -0000001070 00000 n | 114 | +0000000424 00000 n |
| 115 | +0000000506 00000 n | ||
| 116 | +0000000741 00000 n | ||
| 117 | +0000000840 00000 n | ||
| 118 | +0000000882 00000 n | ||
| 119 | +0000000980 00000 n | ||
| 120 | +0000000999 00000 n | ||
| 121 | +0000001117 00000 n | ||
| 121 | trailer << | 122 | trailer << |
| 122 | /Root 1 0 R | 123 | /Root 1 0 R |
| 123 | /Size 10 | 124 | /Size 10 |
| 124 | /ID [<e017d8dc1fe53a81e40aa79bcb43fdec><76269ee0b6579446b731e060af8ef436>] | 125 | /ID [<e017d8dc1fe53a81e40aa79bcb43fdec><76269ee0b6579446b731e060af8ef436>] |
| 125 | >> | 126 | >> |
| 126 | startxref | 127 | startxref |
| 127 | -1105 | 128 | +1152 |
| 128 | %%EOF | 129 | %%EOF |