Commit 4400ce84eeb204cdcb35950dd8fde094fc249051
1 parent
bb12a7ff
Add "n:/pdf-name" to qpdf JSON for binary names (fixes #1072)
Showing
8 changed files
with
234 additions
and
2 deletions
ChangeLog
| 1 | 1 | 2023-12-21 Jay Berkenbilt <ejb@ql.org> |
| 2 | 2 | |
| 3 | + * Fix to QPDF JSON: the syntax "n:/pdf-syntax" is now accepted as | |
| 4 | + an alternative way to represent names. This can be used for any | |
| 5 | + name (e.g. "n:/text#2fplain"), but it is necessary when the name | |
| 6 | + contains binary characters. For example, /one#a0two must be | |
| 7 | + represented as "n:/one#a0two" since the single byte a0 is not | |
| 8 | + valid UTF-8 and so cannot appear in JSON. Fixes #1072. | |
| 9 | + | |
| 3 | 10 | * From M. Holger: Refactor QPDFParser for performance. See #1059 |
| 4 | 11 | for a discussion. |
| 5 | 12 | ... | ... |
libqpdf/QPDF_Name.cc
| ... | ... | @@ -57,6 +57,14 @@ QPDF_Name::getJSON(int json_version) |
| 57 | 57 | if (json_version == 1) { |
| 58 | 58 | return JSON::makeString(normalizeName(this->name)); |
| 59 | 59 | } else { |
| 60 | - return JSON::makeString(this->name); | |
| 60 | + bool has_8bit_chars; | |
| 61 | + bool is_valid_utf8; | |
| 62 | + bool is_utf16; | |
| 63 | + QUtil::analyze_encoding(this->name, has_8bit_chars, is_valid_utf8, is_utf16); | |
| 64 | + if (!has_8bit_chars || is_valid_utf8) { | |
| 65 | + return JSON::makeString(this->name); | |
| 66 | + } else { | |
| 67 | + return JSON::makeString("n:" + normalizeName(this->name)); | |
| 68 | + } | |
| 61 | 69 | } |
| 62 | 70 | } | ... | ... |
libqpdf/QPDF_json.cc
| ... | ... | @@ -144,6 +144,12 @@ is_name(std::string const& v) |
| 144 | 144 | return ((v.length() > 1) && (v.at(0) == '/')); |
| 145 | 145 | } |
| 146 | 146 | |
| 147 | +static bool | |
| 148 | +is_pdf_name(std::string const& v) | |
| 149 | +{ | |
| 150 | + return ((v.length() > 3) && (v.substr(0, 3) == "n:/")); | |
| 151 | +} | |
| 152 | + | |
| 147 | 153 | bool |
| 148 | 154 | QPDF::test_json_validators() |
| 149 | 155 | { |
| ... | ... | @@ -740,6 +746,8 @@ QPDF::JSONReactor::makeObject(JSON const& value) |
| 740 | 746 | result = QPDFObjectHandle::newString(QUtil::hex_decode(str)); |
| 741 | 747 | } else if (is_name(str_v)) { |
| 742 | 748 | result = QPDFObjectHandle::newName(str_v); |
| 749 | + } else if (is_pdf_name(str_v)) { | |
| 750 | + result = QPDFObjectHandle::parse(str_v.substr(2)); | |
| 743 | 751 | } else { |
| 744 | 752 | QTC::TC("qpdf", "QPDF_json unrecognized string value"); |
| 745 | 753 | error(value.getStart(), "unrecognized string value"); | ... | ... |
manual/json.rst
| ... | ... | @@ -258,6 +258,12 @@ Object Values |
| 258 | 258 | syntax resolved. For example, the name whose canonical form (per |
| 259 | 259 | the PDF specification) is ``text/plain`` would be represented in |
| 260 | 260 | JSON as ``"/text/plain"`` and in PDF as ``"/text#2fplain"``. |
| 261 | + Starting with qpdf 11.7.0, the syntax ``"n:/pdf-syntax"`` is | |
| 262 | + accepted as an alternative. This can be used for any name (e.g. | |
| 263 | + ``"n:/text#2fplain"``), but it is necessary when the name contains | |
| 264 | + binary characters. For example, ``/one#a0two`` must be represented | |
| 265 | + as ``"n:/one#a0two"`` since the single byte ``a0`` is not valid | |
| 266 | + UTF-8 and so cannot appear in a JSON string. | |
| 261 | 267 | |
| 262 | 268 | - Indirect object references are represented as JSON strings that |
| 263 | 269 | look like a PDF indirect object reference and have the form |
| ... | ... | @@ -824,7 +830,8 @@ version 2. |
| 824 | 830 | - Names are shown in qpdf's canonical form rather than in PDF |
| 825 | 831 | syntax. (Example: the PDF-syntax name ``/text#2fplain`` appeared |
| 826 | 832 | as ``"/text#2fplain"`` in v1 but appears as ``"/text/plain"`` in |
| 827 | - v2. | |
| 833 | + v2.) In qpdf 11.7.0, a fix was made to accept ``"n:/pdf-syntax"`` | |
| 834 | + for names containing binary characters. | |
| 828 | 835 | |
| 829 | 836 | - The top-level representation of an object in ``"objects"`` is a |
| 830 | 837 | dictionary containing either a ``"value"`` key or a ``"stream"`` | ... | ... |
manual/release-notes.rst
| ... | ... | @@ -45,6 +45,13 @@ Planned changes for future 12.x (subject to change): |
| 45 | 45 | reference streams, linearization hint streams, and object |
| 46 | 46 | streams. This has been fixed. |
| 47 | 47 | |
| 48 | + - Fix to QPDF JSON: the syntax ``"n:/pdf-syntax"`` is now accepted | |
| 49 | + as an alternative way to represent names. This can be used for | |
| 50 | + any name (e.g. ``"n:/text#2fplain"``), but it is necessary when | |
| 51 | + the name contains binary characters. For example, ``/one#a0two`` | |
| 52 | + must be represented as ``"n:/one#a0two"`` since the single byte | |
| 53 | + ``a0`` is not valid UTF-8 and so cannot appear in JSON. | |
| 54 | + | |
| 48 | 55 | - Build Enhancements: |
| 49 | 56 | |
| 50 | 57 | - The qpdf test suite now passes when qpdf is linked with an | ... | ... |
qpdf/qtest/qpdf-json.test
| ... | ... | @@ -61,6 +61,7 @@ my @goodfiles = ( |
| 61 | 61 | 'form-fields-and-annotations.pdf', |
| 62 | 62 | 'need-appearances.pdf', |
| 63 | 63 | 'fxo-blue.pdf', |
| 64 | + 'weird-tokens.pdf', | |
| 64 | 65 | ); |
| 65 | 66 | $n_tests += 6 * scalar(@goodfiles); |
| 66 | 67 | |
| ... | ... | @@ -341,5 +342,21 @@ $td->runtest("check C API write to JSON stream", |
| 341 | 342 | {$td->FILE => "auto-4"}, |
| 342 | 343 | {$td->FILE => "qpdf-ctest-47-4"}); |
| 343 | 344 | |
| 345 | +# Bugs #1072 and #1079 illustrate cases that qpdf-json got wrong. In | |
| 346 | +# #1072, it was noticed that name tokens containing binary characters | |
| 347 | +# (using #xx) would generate invalid JSON, even though qpdf's own JSON | |
| 348 | +# parser would accept it. Also, the JSON spec allows real numbers in | |
| 349 | +# scientific notation, but the PDF spec does not. | |
| 350 | +$n_tests += 2; | |
| 351 | +$td->runtest("handle binary names", | |
| 352 | + {$td->COMMAND => | |
| 353 | + "qpdf --json-output weird-tokens.pdf a.json"}, | |
| 354 | + {$td->STRING => "", $td->EXIT_STATUS => 0}); | |
| 355 | +# Round-trip is tested above. | |
| 356 | +$td->runtest("check json", | |
| 357 | + {$td->FILE => "a.json"}, | |
| 358 | + {$td->FILE => "weird-tokens.json"}, | |
| 359 | + $td->NORMALIZE_NEWLINES); | |
| 360 | + | |
| 344 | 361 | cleanup(); |
| 345 | 362 | $td->report($n_tests); | ... | ... |
qpdf/qtest/qpdf/weird-tokens.json
0 → 100644
| 1 | +{ | |
| 2 | + "qpdf": [ | |
| 3 | + { | |
| 4 | + "jsonversion": 2, | |
| 5 | + "pdfversion": "2.0", | |
| 6 | + "pushedinheritedpageresources": false, | |
| 7 | + "calledgetallpages": false, | |
| 8 | + "maxobjectid": 6 | |
| 9 | + }, | |
| 10 | + { | |
| 11 | + "obj:1 0 R": { | |
| 12 | + "value": { | |
| 13 | + "/Extra": [ | |
| 14 | + "u:Names with binary data", | |
| 15 | + "n:/ABCDEF+#ba#da#cc#e5", | |
| 16 | + "/ABCEDEF+π", | |
| 17 | + "n:/one+#a0two", | |
| 18 | + "/text/plain", | |
| 19 | + "u:Very small/large reals", | |
| 20 | + 0.00001, | |
| 21 | + 1000000000000 | |
| 22 | + ], | |
| 23 | + "/Pages": "2 0 R", | |
| 24 | + "/Type": "/Catalog" | |
| 25 | + } | |
| 26 | + }, | |
| 27 | + "obj:2 0 R": { | |
| 28 | + "value": { | |
| 29 | + "/Count": 1, | |
| 30 | + "/Kids": [ | |
| 31 | + "3 0 R" | |
| 32 | + ], | |
| 33 | + "/Type": "/Pages" | |
| 34 | + } | |
| 35 | + }, | |
| 36 | + "obj:3 0 R": { | |
| 37 | + "value": { | |
| 38 | + "/Contents": "4 0 R", | |
| 39 | + "/MediaBox": [ | |
| 40 | + 0, | |
| 41 | + 0, | |
| 42 | + 612, | |
| 43 | + 792 | |
| 44 | + ], | |
| 45 | + "/Parent": "2 0 R", | |
| 46 | + "/Resources": { | |
| 47 | + "/Font": { | |
| 48 | + "/F1": "6 0 R" | |
| 49 | + } | |
| 50 | + }, | |
| 51 | + "/Type": "/Page" | |
| 52 | + } | |
| 53 | + }, | |
| 54 | + "obj:4 0 R": { | |
| 55 | + "stream": { | |
| 56 | + "data": "QlQKICAvRjEgMjQgVGYKICA3MiA3MjAgVGQKICAoUG90YXRvKSBUagpFVAo=", | |
| 57 | + "dict": {} | |
| 58 | + } | |
| 59 | + }, | |
| 60 | + "obj:5 0 R": { | |
| 61 | + "value": 44 | |
| 62 | + }, | |
| 63 | + "obj:6 0 R": { | |
| 64 | + "value": { | |
| 65 | + "/BaseFont": "/Helvetica", | |
| 66 | + "/Encoding": "/WinAnsiEncoding", | |
| 67 | + "/Subtype": "/Type1", | |
| 68 | + "/Type": "/Font" | |
| 69 | + } | |
| 70 | + }, | |
| 71 | + "trailer": { | |
| 72 | + "value": { | |
| 73 | + "/ID": [ | |
| 74 | + "b:42841c13bbf709d79a200fa1691836f8", | |
| 75 | + "b:728c020f464c3cf7e02c12605fa7d88b" | |
| 76 | + ], | |
| 77 | + "/Root": "1 0 R", | |
| 78 | + "/Size": 7 | |
| 79 | + } | |
| 80 | + } | |
| 81 | + } | |
| 82 | + ] | |
| 83 | +} | ... | ... |
qpdf/qtest/qpdf/weird-tokens.pdf
0 → 100644
| 1 | +%PDF-2.0 | |
| 2 | +%¿÷¢þ | |
| 3 | +%QDF-1.0 | |
| 4 | + | |
| 5 | +1 0 obj | |
| 6 | +<< | |
| 7 | + /Extra [ | |
| 8 | + (Names with binary data) | |
| 9 | + /ABCDEF+#ba#da#cc#e5 | |
| 10 | + /ABCEDEF+#cf#80 | |
| 11 | + /one+#a0two | |
| 12 | + /text#2fplain | |
| 13 | + (Very small/large reals) | |
| 14 | + 0.00001 | |
| 15 | + 1000000000000 | |
| 16 | + ] | |
| 17 | + /Pages 2 0 R | |
| 18 | + /Type /Catalog | |
| 19 | +>> | |
| 20 | +endobj | |
| 21 | + | |
| 22 | +2 0 obj | |
| 23 | +<< | |
| 24 | + /Count 1 | |
| 25 | + /Kids [ | |
| 26 | + 3 0 R | |
| 27 | + ] | |
| 28 | + /Type /Pages | |
| 29 | +>> | |
| 30 | +endobj | |
| 31 | + | |
| 32 | +%% Page 1 | |
| 33 | +3 0 obj | |
| 34 | +<< | |
| 35 | + /Contents 4 0 R | |
| 36 | + /MediaBox [ | |
| 37 | + 0 | |
| 38 | + 0 | |
| 39 | + 612 | |
| 40 | + 792 | |
| 41 | + ] | |
| 42 | + /Parent 2 0 R | |
| 43 | + /Resources << | |
| 44 | + /Font << | |
| 45 | + /F1 6 0 R | |
| 46 | + >> | |
| 47 | + >> | |
| 48 | + /Type /Page | |
| 49 | +>> | |
| 50 | +endobj | |
| 51 | + | |
| 52 | +%% Contents for page 1 | |
| 53 | +4 0 obj | |
| 54 | +<< | |
| 55 | + /Length 5 0 R | |
| 56 | +>> | |
| 57 | +stream | |
| 58 | +BT | |
| 59 | + /F1 24 Tf | |
| 60 | + 72 720 Td | |
| 61 | + (Potato) Tj | |
| 62 | +ET | |
| 63 | +endstream | |
| 64 | +endobj | |
| 65 | + | |
| 66 | +5 0 obj | |
| 67 | +44 | |
| 68 | +endobj | |
| 69 | + | |
| 70 | +6 0 obj | |
| 71 | +<< | |
| 72 | + /BaseFont /Helvetica | |
| 73 | + /Encoding /WinAnsiEncoding | |
| 74 | + /Subtype /Type1 | |
| 75 | + /Type /Font | |
| 76 | +>> | |
| 77 | +endobj | |
| 78 | + | |
| 79 | +xref | |
| 80 | +0 7 | |
| 81 | +0000000000 65535 f | |
| 82 | +0000000025 00000 n | |
| 83 | +0000000261 00000 n | |
| 84 | +0000000343 00000 n | |
| 85 | +0000000539 00000 n | |
| 86 | +0000000638 00000 n | |
| 87 | +0000000657 00000 n | |
| 88 | +trailer << | |
| 89 | + /Root 1 0 R | |
| 90 | + /Size 7 | |
| 91 | + /ID [<42841c13bbf709d79a200fa1691836f8><728c020f464c3cf7e02c12605fa7d88b>] | |
| 92 | +>> | |
| 93 | +startxref | |
| 94 | +763 | |
| 95 | +%%EOF | ... | ... |