Commit 4400ce84eeb204cdcb35950dd8fde094fc249051

Authored by Jay Berkenbilt
1 parent bb12a7ff

Add "n:/pdf-name" to qpdf JSON for binary names (fixes #1072)

ChangeLog
1 2023-12-21 Jay Berkenbilt <ejb@ql.org> 1 2023-12-21 Jay Berkenbilt <ejb@ql.org>
2 2
  3 + * Fix to QPDF JSON: the syntax "n:/pdf-syntax" is now accepted as
  4 + an alternative way to represent names. This can be used for any
  5 + name (e.g. "n:/text#2fplain"), but it is necessary when the name
  6 + contains binary characters. For example, /one#a0two must be
  7 + represented as "n:/one#a0two" since the single byte a0 is not
  8 + valid in JSON. Fixes #1072.
  9 +
3 * From M. Holger: Refactor QPDFParser for performance. See #1059 10 * From M. Holger: Refactor QPDFParser for performance. See #1059
4 for a discussion. 11 for a discussion.
5 12
libqpdf/QPDF_Name.cc
@@ -57,6 +57,14 @@ QPDF_Name::getJSON(int json_version) @@ -57,6 +57,14 @@ QPDF_Name::getJSON(int json_version)
57 if (json_version == 1) { 57 if (json_version == 1) {
58 return JSON::makeString(normalizeName(this->name)); 58 return JSON::makeString(normalizeName(this->name));
59 } else { 59 } else {
60 - return JSON::makeString(this->name); 60 + bool has_8bit_chars;
  61 + bool is_valid_utf8;
  62 + bool is_utf16;
  63 + QUtil::analyze_encoding(this->name, has_8bit_chars, is_valid_utf8, is_utf16);
  64 + if (!has_8bit_chars || is_valid_utf8) {
  65 + return JSON::makeString(this->name);
  66 + } else {
  67 + return JSON::makeString("n:" + normalizeName(this->name));
  68 + }
61 } 69 }
62 } 70 }
libqpdf/QPDF_json.cc
@@ -144,6 +144,12 @@ is_name(std::string const& v) @@ -144,6 +144,12 @@ is_name(std::string const& v)
144 return ((v.length() > 1) && (v.at(0) == '/')); 144 return ((v.length() > 1) && (v.at(0) == '/'));
145 } 145 }
146 146
  147 +static bool
  148 +is_pdf_name(std::string const& v)
  149 +{
  150 + return ((v.length() > 3) && (v.substr(0, 3) == "n:/"));
  151 +}
  152 +
147 bool 153 bool
148 QPDF::test_json_validators() 154 QPDF::test_json_validators()
149 { 155 {
@@ -740,6 +746,8 @@ QPDF::JSONReactor::makeObject(JSON const& value) @@ -740,6 +746,8 @@ QPDF::JSONReactor::makeObject(JSON const& value)
740 result = QPDFObjectHandle::newString(QUtil::hex_decode(str)); 746 result = QPDFObjectHandle::newString(QUtil::hex_decode(str));
741 } else if (is_name(str_v)) { 747 } else if (is_name(str_v)) {
742 result = QPDFObjectHandle::newName(str_v); 748 result = QPDFObjectHandle::newName(str_v);
  749 + } else if (is_pdf_name(str_v)) {
  750 + result = QPDFObjectHandle::parse(str_v.substr(2));
743 } else { 751 } else {
744 QTC::TC("qpdf", "QPDF_json unrecognized string value"); 752 QTC::TC("qpdf", "QPDF_json unrecognized string value");
745 error(value.getStart(), "unrecognized string value"); 753 error(value.getStart(), "unrecognized string value");
manual/json.rst
@@ -258,6 +258,12 @@ Object Values @@ -258,6 +258,12 @@ Object Values
258 syntax resolved. For example, the name whose canonical form (per 258 syntax resolved. For example, the name whose canonical form (per
259 the PDF specification) is ``text/plain`` would be represented in 259 the PDF specification) is ``text/plain`` would be represented in
260 JSON as ``"/text/plain"`` and in PDF as ``"/text#2fplain"``. 260 JSON as ``"/text/plain"`` and in PDF as ``"/text#2fplain"``.
  261 + Starting with qpdf 11.7.0, the syntax ``"n:/pdf-syntax"`` is
  262 + accepted as an alternative. This can be used for any name (e.g.
  263 + ``"n:/text#2fplain"``), but it is necessary when the name contains
  264 + binary characters. For example, ``/one#a0two`` must be represented
  265 + as ``"n:/one#a0two"`` since the single byte ``a0`` is not valid in
  266 + JSON.
261 267
262 - Indirect object references are represented as JSON strings that 268 - Indirect object references are represented as JSON strings that
263 look like a PDF indirect object reference and have the form 269 look like a PDF indirect object reference and have the form
@@ -824,7 +830,8 @@ version 2. @@ -824,7 +830,8 @@ version 2.
824 - Names are shown in qpdf's canonical form rather than in PDF 830 - Names are shown in qpdf's canonical form rather than in PDF
825 syntax. (Example: the PDF-syntax name ``/text#2fplain`` appeared 831 syntax. (Example: the PDF-syntax name ``/text#2fplain`` appeared
826 as ``"/text#2fplain"`` in v1 but appears as ``"/text/plain"`` in 832 as ``"/text#2fplain"`` in v1 but appears as ``"/text/plain"`` in
827 - v2. 833 + v2. In qpdf 11.7.0, a fix was made to accept ``"n:/pdf-syntax"``
  834 + for names containing binary characters.
828 835
829 - The top-level representation of an object in ``"objects"`` is a 836 - The top-level representation of an object in ``"objects"`` is a
830 dictionary containing either a ``"value"`` key or a ``"stream"`` 837 dictionary containing either a ``"value"`` key or a ``"stream"``
manual/release-notes.rst
@@ -45,6 +45,13 @@ Planned changes for future 12.x (subject to change): @@ -45,6 +45,13 @@ Planned changes for future 12.x (subject to change):
45 reference streams, linearization hint streams, and object 45 reference streams, linearization hint streams, and object
46 streams. This has been fixed. 46 streams. This has been fixed.
47 47
  48 + - Fix to QPDF JSON: the syntax ``"n:/pdf-syntax"`` is now accepted
  49 + as an alternative way to represent names. This can be used for
  50 + any name (e.g. ``"n:/text#2fplain"``), but it is necessary when
  51 + the name contains binary characters. For example, ``/one#a0two``
  52 + must be represented as ``"n:/one#a0two"`` since the single byte
  53 + ``a0`` is not valid in JSON.
  54 +
48 - Build Enhancements: 55 - Build Enhancements:
49 56
50 - The qpdf test suite now passes when qpdf is linked with an 57 - The qpdf test suite now passes when qpdf is linked with an
qpdf/qtest/qpdf-json.test
@@ -61,6 +61,7 @@ my @goodfiles = ( @@ -61,6 +61,7 @@ my @goodfiles = (
61 'form-fields-and-annotations.pdf', 61 'form-fields-and-annotations.pdf',
62 'need-appearances.pdf', 62 'need-appearances.pdf',
63 'fxo-blue.pdf', 63 'fxo-blue.pdf',
  64 + 'weird-tokens.pdf',
64 ); 65 );
65 $n_tests += 6 * scalar(@goodfiles); 66 $n_tests += 6 * scalar(@goodfiles);
66 67
@@ -341,5 +342,21 @@ $td->runtest("check C API write to JSON stream", @@ -341,5 +342,21 @@ $td->runtest("check C API write to JSON stream",
341 {$td->FILE => "auto-4"}, 342 {$td->FILE => "auto-4"},
342 {$td->FILE => "qpdf-ctest-47-4"}); 343 {$td->FILE => "qpdf-ctest-47-4"});
343 344
  345 +# Bugs #1072 and #1079 illustrate cases that qpdf-json got wrong. In
  346 +# #1072, it was noticed that name tokens containing binary characters
  347 +# (using #xx) would generate invalid JSON, even though qpdf's own JSON
  348 +# parser would accept it. Also, the JSON spec allows real numbers in
  349 +# scientific notation, but the PDF spec does not.
  350 +$n_tests += 2;
  351 +$td->runtest("handle binary names",
  352 + {$td->COMMAND =>
  353 + "qpdf --json-output weird-tokens.pdf a.json"},
  354 + {$td->STRING => "", $td->EXIT_STATUS => 0});
  355 +# Round-trip is tested above.
  356 +$td->runtest("check json",
  357 + {$td->FILE => "a.json"},
  358 + {$td->FILE => "weird-tokens.json"},
  359 + $td->NORMALIZE_NEWLINES);
  360 +
344 cleanup(); 361 cleanup();
345 $td->report($n_tests); 362 $td->report($n_tests);
qpdf/qtest/qpdf/weird-tokens.json 0 → 100644
  1 +{
  2 + "qpdf": [
  3 + {
  4 + "jsonversion": 2,
  5 + "pdfversion": "2.0",
  6 + "pushedinheritedpageresources": false,
  7 + "calledgetallpages": false,
  8 + "maxobjectid": 6
  9 + },
  10 + {
  11 + "obj:1 0 R": {
  12 + "value": {
  13 + "/Extra": [
  14 + "u:Names with binary data",
  15 + "n:/ABCDEF+#ba#da#cc#e5",
  16 + "/ABCEDEF+π",
  17 + "n:/one+#a0two",
  18 + "/text/plain",
  19 + "u:Very small/large reals",
  20 + 0.00001,
  21 + 1000000000000
  22 + ],
  23 + "/Pages": "2 0 R",
  24 + "/Type": "/Catalog"
  25 + }
  26 + },
  27 + "obj:2 0 R": {
  28 + "value": {
  29 + "/Count": 1,
  30 + "/Kids": [
  31 + "3 0 R"
  32 + ],
  33 + "/Type": "/Pages"
  34 + }
  35 + },
  36 + "obj:3 0 R": {
  37 + "value": {
  38 + "/Contents": "4 0 R",
  39 + "/MediaBox": [
  40 + 0,
  41 + 0,
  42 + 612,
  43 + 792
  44 + ],
  45 + "/Parent": "2 0 R",
  46 + "/Resources": {
  47 + "/Font": {
  48 + "/F1": "6 0 R"
  49 + }
  50 + },
  51 + "/Type": "/Page"
  52 + }
  53 + },
  54 + "obj:4 0 R": {
  55 + "stream": {
  56 + "data": "QlQKICAvRjEgMjQgVGYKICA3MiA3MjAgVGQKICAoUG90YXRvKSBUagpFVAo=",
  57 + "dict": {}
  58 + }
  59 + },
  60 + "obj:5 0 R": {
  61 + "value": 44
  62 + },
  63 + "obj:6 0 R": {
  64 + "value": {
  65 + "/BaseFont": "/Helvetica",
  66 + "/Encoding": "/WinAnsiEncoding",
  67 + "/Subtype": "/Type1",
  68 + "/Type": "/Font"
  69 + }
  70 + },
  71 + "trailer": {
  72 + "value": {
  73 + "/ID": [
  74 + "b:42841c13bbf709d79a200fa1691836f8",
  75 + "b:728c020f464c3cf7e02c12605fa7d88b"
  76 + ],
  77 + "/Root": "1 0 R",
  78 + "/Size": 7
  79 + }
  80 + }
  81 + }
  82 + ]
  83 +}
qpdf/qtest/qpdf/weird-tokens.pdf 0 → 100644
  1 +%PDF-2.0
  2 +%¿÷¢þ
  3 +%QDF-1.0
  4 +
  5 +1 0 obj
  6 +<<
  7 + /Extra [
  8 + (Names with binary data)
  9 + /ABCDEF+#ba#da#cc#e5
  10 + /ABCEDEF+#cf#80
  11 + /one+#a0two
  12 + /text#2fplain
  13 + (Very small/large reals)
  14 + 0.00001
  15 + 1000000000000
  16 + ]
  17 + /Pages 2 0 R
  18 + /Type /Catalog
  19 +>>
  20 +endobj
  21 +
  22 +2 0 obj
  23 +<<
  24 + /Count 1
  25 + /Kids [
  26 + 3 0 R
  27 + ]
  28 + /Type /Pages
  29 +>>
  30 +endobj
  31 +
  32 +%% Page 1
  33 +3 0 obj
  34 +<<
  35 + /Contents 4 0 R
  36 + /MediaBox [
  37 + 0
  38 + 0
  39 + 612
  40 + 792
  41 + ]
  42 + /Parent 2 0 R
  43 + /Resources <<
  44 + /Font <<
  45 + /F1 6 0 R
  46 + >>
  47 + >>
  48 + /Type /Page
  49 +>>
  50 +endobj
  51 +
  52 +%% Contents for page 1
  53 +4 0 obj
  54 +<<
  55 + /Length 5 0 R
  56 +>>
  57 +stream
  58 +BT
  59 + /F1 24 Tf
  60 + 72 720 Td
  61 + (Potato) Tj
  62 +ET
  63 +endstream
  64 +endobj
  65 +
  66 +5 0 obj
  67 +44
  68 +endobj
  69 +
  70 +6 0 obj
  71 +<<
  72 + /BaseFont /Helvetica
  73 + /Encoding /WinAnsiEncoding
  74 + /Subtype /Type1
  75 + /Type /Font
  76 +>>
  77 +endobj
  78 +
  79 +xref
  80 +0 7
  81 +0000000000 65535 f
  82 +0000000025 00000 n
  83 +0000000261 00000 n
  84 +0000000343 00000 n
  85 +0000000539 00000 n
  86 +0000000638 00000 n
  87 +0000000657 00000 n
  88 +trailer <<
  89 + /Root 1 0 R
  90 + /Size 7
  91 + /ID [<42841c13bbf709d79a200fa1691836f8><728c020f464c3cf7e02c12605fa7d88b>]
  92 +>>
  93 +startxref
  94 +763
  95 +%%EOF