Commit 8b25de24c9b1e6acba042ea9ecdee783839e20a6

Authored by Jay Berkenbilt
1 parent 6b576797

Make "objects" and "pages" consistent in JSON output

ChangeLog
  1 +2022-05-04 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * json v1 output: make "pages" and "objects" consistent.
  4 + Previously, "objects" always reflected the objects exactly as they
  5 + appeared in the original file, while "pages" reflected objects
  6 + after repair of the pages tree. This could be misleading. Now, if
  7 + "pages" is specified, "objects" shows the effects of repairing the
  8 + page tree, and if not, it doesn't. This makes no difference for
  9 + correct PDF files that don't have problems in the pages tree. JSON
  10 + v2 will behave in a similar way.
  11 +
1 12 2022-05-03 Jay Berkenbilt <ejb@ql.org>
2 13  
3 14 * Add new Pipeline class Pl_String which appends to a std::string&
... ...
cSpell.json
... ... @@ -511,6 +511,7 @@
511 511 "unfilterable",
512 512 "unparse",
513 513 "unpickling",
  514 + "unrepaired",
514 515 "unretrieved",
515 516 "unversioned",
516 517 "upages",
... ...
libqpdf/QPDFJob.cc
... ... @@ -1618,15 +1618,7 @@ QPDFJob::doJSON(QPDF& pdf)
1618 1618 bool all_keys = m->json_keys.empty();
1619 1619 // The list of selectable top-level keys is duplicated in the
1620 1620 // following places: job.yml, QPDFJob::json_schema, and
1621   - // QPDFJob::doJSON. We do objects and objectinfo first so they
1622   - // reflect the original file without any side effects caused by
1623   - // other operations, such as repairing the pages tree.
1624   - if (all_keys || m->json_keys.count("objects")) {
1625   - doJSONObjects(pdf, j);
1626   - }
1627   - if (all_keys || m->json_keys.count("objectinfo")) {
1628   - doJSONObjectinfo(pdf, j);
1629   - }
  1621 + // QPDFJob::doJSON.
1630 1622 if (all_keys || m->json_keys.count("pages")) {
1631 1623 doJSONPages(pdf, j);
1632 1624 }
... ... @@ -1646,6 +1638,17 @@ QPDFJob::doJSON(QPDF& pdf)
1646 1638 doJSONAttachments(pdf, j);
1647 1639 }
1648 1640  
  1641 + // We do objects and objectinfo last so their information is
  1642 + // consistent with repairing the page tree. To see the original
  1643 + // file with any page tree problems and the page tree not
  1644 + // flattened, select objects/objectinfo without other keys.
  1645 + if (all_keys || m->json_keys.count("objects")) {
  1646 + doJSONObjects(pdf, j);
  1647 + }
  1648 + if (all_keys || m->json_keys.count("objectinfo")) {
  1649 + doJSONObjectinfo(pdf, j);
  1650 + }
  1651 +
1649 1652 // Check against schema
1650 1653  
1651 1654 JSON schema = json_schema(&m->json_keys);
... ...
manual/json.rst
... ... @@ -147,6 +147,16 @@ For the most part, the built-in JSON help tells you everything you need
147 147 to know about the JSON format, but there are a few non-obvious things to
148 148 be aware of:
149 149  
  150 +- If a PDF file has certain types of errors in its pages tree (such as
  151 + page objects that are direct or multiple pages sharing the same
  152 + object ID), qpdf will automatically repair the pages tree. If you
  153 + specify ``"objects"`` and/or ``"objectinfo"`` without any other
  154 + keys, you will see the original pages tree without any corrections.
  155 + If you specify any of the keys that require page tree traversal (for
  156 + example, ``"pages"``, ``"outlines"``, or ``"pagelabels"``), then
  157 + ``"objects"`` and ``"objectinfo"`` will show the repaired page tree
  158 + so that object references will be consistent throughout the file.
  159 +
150 160 - While qpdf guarantees that keys present in the help will be present
151 161 in the output, those fields may be null or empty if the information
152 162 is not known or absent in the file. Also, if you specify
... ...
manual/release-notes.rst
... ... @@ -125,6 +125,13 @@ For a detailed list of changes, please see the file
125 125  
126 126 - Other changes
127 127  
  128 + - In JSON v1 mode, the ``"objects"`` key now reflects the repaired
  129 + pages tree if ``"pages"`` (or any other key that has the side
  130 + effect of repairing the page tree) is specified. To see the
  131 + original objects with any unrepaired page tree errors, specify
  132 + ``"objects"`` and/or ``"objectinfo"`` by themselves. This is
  133 + consistent with how JSON v2 behaves.
  134 +
128 135 - A new chapter on contributing to qpdf has been added to the
129 136 documentation. See :ref:`contributing`.
130 137  
... ...
qpdf/qtest/qpdf.test
... ... @@ -2829,7 +2829,7 @@ $td->runtest("check output",
2829 2829 show_ntests();
2830 2830 # ----------
2831 2831 $td->notify("--- Page Tree Issues ---");
2832   -$n_tests += 9;
  2832 +$n_tests += 11;
2833 2833  
2834 2834 $td->runtest("linearize duplicated pages",
2835 2835 {$td->COMMAND =>
... ... @@ -2864,14 +2864,22 @@ $td->runtest("show direct pages",
2864 2864 $td->NORMALIZE_NEWLINES);
2865 2865  
2866 2866 # Json mode for direct and duplicated pages illustrates that the
2867   -# "objects" section still shows the original objects before correction
2868   -# but the "pages" section shows the pages with their new object
2869   -# numbers.
  2867 +# "objects" section shows the original objects before correction when
  2868 +# "pages" is not output but after correction when it is.
2870 2869 foreach my $f (qw(page_api_2 direct-pages))
2871 2870 {
2872   - $td->runtest("json for $f",
2873   - {$td->COMMAND => "qpdf --json=latest $f.pdf"},
2874   - {$td->FILE => "$f-json.out", $td->EXIT_STATUS => 0},
  2871 + $td->runtest("json for $f (objects only)",
  2872 + {$td->COMMAND =>
  2873 + "qpdf --json=latest $f.pdf" .
  2874 + " --json-key=objects --json-key=objectinfo"},
  2875 + {$td->FILE => "$f-json-objects.out", $td->EXIT_STATUS => 0},
  2876 + $td->NORMALIZE_NEWLINES);
  2877 + $td->runtest("json for $f (with pages)",
  2878 + {$td->COMMAND =>
  2879 + "qpdf --json=latest $f.pdf" .
  2880 + " --json-key=objects --json-key=objectinfo" .
  2881 + " --json-key=pages"},
  2882 + {$td->FILE => "$f-json-pages.out", $td->EXIT_STATUS => 0},
2875 2883 $td->NORMALIZE_NEWLINES);
2876 2884 }
2877 2885  
... ...
qpdf/qtest/qpdf/direct-pages-json.out renamed to qpdf/qtest/qpdf/direct-pages-json-objects.out
1 1 {
2   - "acroform": {
3   - "fields": [],
4   - "hasacroform": false,
5   - "needappearances": false
6   - },
7   - "attachments": {},
8   - "encrypt": {
9   - "capabilities": {
10   - "accessibility": true,
11   - "extract": true,
12   - "moddifyannotations": true,
13   - "modify": true,
14   - "modifyassembly": true,
15   - "modifyforms": true,
16   - "modifyother": true,
17   - "printhigh": true,
18   - "printlow": true
19   - },
20   - "encrypted": false,
21   - "ownerpasswordmatched": false,
22   - "parameters": {
23   - "P": 0,
24   - "R": 0,
25   - "V": 0,
26   - "bits": 0,
27   - "filemethod": "none",
28   - "key": null,
29   - "method": "none",
30   - "streammethod": "none",
31   - "stringmethod": "none"
32   - },
33   - "userpasswordmatched": false
34   - },
35 2 "objectinfo": {
36 3 "1 0 R": {
37 4 "stream": {
... ... @@ -145,30 +112,6 @@
145 112 "/Size": 7
146 113 }
147 114 },
148   - "outlines": [],
149   - "pagelabels": [],
150   - "pages": [
151   - {
152   - "contents": [
153   - "3 0 R"
154   - ],
155   - "images": [],
156   - "label": null,
157   - "object": "7 0 R",
158   - "outlines": [],
159   - "pageposfrom1": 1
160   - },
161   - {
162   - "contents": [
163   - "3 0 R"
164   - ],
165   - "images": [],
166   - "label": null,
167   - "object": "8 0 R",
168   - "outlines": [],
169   - "pageposfrom1": 2
170   - }
171   - ],
172 115 "parameters": {
173 116 "decodelevel": "generalized"
174 117 },
... ...
qpdf/qtest/qpdf/direct-pages-json-pages.out 0 → 100644
  1 +{
  2 + "objectinfo": {
  3 + "1 0 R": {
  4 + "stream": {
  5 + "filter": null,
  6 + "is": false,
  7 + "length": null
  8 + }
  9 + },
  10 + "2 0 R": {
  11 + "stream": {
  12 + "filter": null,
  13 + "is": false,
  14 + "length": null
  15 + }
  16 + },
  17 + "3 0 R": {
  18 + "stream": {
  19 + "filter": null,
  20 + "is": true,
  21 + "length": 44
  22 + }
  23 + },
  24 + "4 0 R": {
  25 + "stream": {
  26 + "filter": null,
  27 + "is": false,
  28 + "length": null
  29 + }
  30 + },
  31 + "5 0 R": {
  32 + "stream": {
  33 + "filter": null,
  34 + "is": false,
  35 + "length": null
  36 + }
  37 + },
  38 + "6 0 R": {
  39 + "stream": {
  40 + "filter": null,
  41 + "is": false,
  42 + "length": null
  43 + }
  44 + },
  45 + "7 0 R": {
  46 + "stream": {
  47 + "filter": null,
  48 + "is": false,
  49 + "length": null
  50 + }
  51 + },
  52 + "8 0 R": {
  53 + "stream": {
  54 + "filter": null,
  55 + "is": false,
  56 + "length": null
  57 + }
  58 + }
  59 + },
  60 + "objects": {
  61 + "1 0 R": {
  62 + "/Pages": "2 0 R",
  63 + "/Type": "/Catalog"
  64 + },
  65 + "2 0 R": {
  66 + "/Count": 2,
  67 + "/Kids": [
  68 + "7 0 R",
  69 + "8 0 R"
  70 + ],
  71 + "/Type": "/Pages"
  72 + },
  73 + "3 0 R": {
  74 + "/Length": "4 0 R"
  75 + },
  76 + "4 0 R": 44,
  77 + "5 0 R": {
  78 + "/BaseFont": "/Helvetica",
  79 + "/Encoding": "/WinAnsiEncoding",
  80 + "/Name": "/F1",
  81 + "/Subtype": "/Type1",
  82 + "/Type": "/Font"
  83 + },
  84 + "6 0 R": [
  85 + "/PDF",
  86 + "/Text"
  87 + ],
  88 + "7 0 R": {
  89 + "/Contents": "3 0 R",
  90 + "/MediaBox": [
  91 + 0,
  92 + 0,
  93 + 612,
  94 + 792
  95 + ],
  96 + "/Parent": "2 0 R",
  97 + "/Resources": {
  98 + "/Font": {
  99 + "/F1": "5 0 R"
  100 + },
  101 + "/ProcSet": "6 0 R"
  102 + },
  103 + "/Type": "/Page"
  104 + },
  105 + "8 0 R": {
  106 + "/Contents": "3 0 R",
  107 + "/MediaBox": [
  108 + 0,
  109 + 0,
  110 + 612,
  111 + 792
  112 + ],
  113 + "/Parent": "2 0 R",
  114 + "/Resources": {
  115 + "/Font": {
  116 + "/F1": "5 0 R"
  117 + },
  118 + "/ProcSet": "6 0 R"
  119 + },
  120 + "/Type": "/Page"
  121 + },
  122 + "trailer": {
  123 + "/ID": [
  124 + "\u0013#¥fi|WzfsU…©6ŸÎ<",
  125 + "7,¿DöÛ‹«`Ù&<\u000f\u000bÒj"
  126 + ],
  127 + "/Root": "1 0 R",
  128 + "/Size": 7
  129 + }
  130 + },
  131 + "pages": [
  132 + {
  133 + "contents": [
  134 + "3 0 R"
  135 + ],
  136 + "images": [],
  137 + "label": null,
  138 + "object": "7 0 R",
  139 + "outlines": [],
  140 + "pageposfrom1": 1
  141 + },
  142 + {
  143 + "contents": [
  144 + "3 0 R"
  145 + ],
  146 + "images": [],
  147 + "label": null,
  148 + "object": "8 0 R",
  149 + "outlines": [],
  150 + "pageposfrom1": 2
  151 + }
  152 + ],
  153 + "parameters": {
  154 + "decodelevel": "generalized"
  155 + },
  156 + "version": 1
  157 +}
... ...
qpdf/qtest/qpdf/page_api_2-json-objects.out 0 → 100644
  1 +{
  2 + "objectinfo": {
  3 + "1 0 R": {
  4 + "stream": {
  5 + "filter": null,
  6 + "is": false,
  7 + "length": null
  8 + }
  9 + },
  10 + "10 0 R": {
  11 + "stream": {
  12 + "filter": null,
  13 + "is": false,
  14 + "length": null
  15 + }
  16 + },
  17 + "2 0 R": {
  18 + "stream": {
  19 + "filter": null,
  20 + "is": false,
  21 + "length": null
  22 + }
  23 + },
  24 + "3 0 R": {
  25 + "stream": {
  26 + "filter": null,
  27 + "is": false,
  28 + "length": null
  29 + }
  30 + },
  31 + "4 0 R": {
  32 + "stream": {
  33 + "filter": null,
  34 + "is": false,
  35 + "length": null
  36 + }
  37 + },
  38 + "5 0 R": {
  39 + "stream": {
  40 + "filter": null,
  41 + "is": false,
  42 + "length": null
  43 + }
  44 + },
  45 + "6 0 R": {
  46 + "stream": {
  47 + "filter": null,
  48 + "is": true,
  49 + "length": 47
  50 + }
  51 + },
  52 + "7 0 R": {
  53 + "stream": {
  54 + "filter": null,
  55 + "is": false,
  56 + "length": null
  57 + }
  58 + },
  59 + "8 0 R": {
  60 + "stream": {
  61 + "filter": null,
  62 + "is": false,
  63 + "length": null
  64 + }
  65 + },
  66 + "9 0 R": {
  67 + "stream": {
  68 + "filter": null,
  69 + "is": true,
  70 + "length": 47
  71 + }
  72 + }
  73 + },
  74 + "objects": {
  75 + "1 0 R": {
  76 + "/Pages": "3 0 R",
  77 + "/Type": "/Catalog"
  78 + },
  79 + "10 0 R": 47,
  80 + "2 0 R": {
  81 + "/CreationDate": "D:20120621124041",
  82 + "/Producer": "Apex PDFWriter"
  83 + },
  84 + "3 0 R": {
  85 + "/Count": 3,
  86 + "/Kids": [
  87 + "4 0 R",
  88 + "4 0 R",
  89 + "5 0 R"
  90 + ],
  91 + "/Type": "/Pages"
  92 + },
  93 + "4 0 R": {
  94 + "/Contents": "6 0 R",
  95 + "/MediaBox": [
  96 + 0,
  97 + 0,
  98 + 612,
  99 + 792
  100 + ],
  101 + "/Parent": "3 0 R",
  102 + "/Resources": {
  103 + "/Font": {
  104 + "/F1": "8 0 R"
  105 + },
  106 + "/ProcSet": [
  107 + "/PDF",
  108 + "/Text"
  109 + ]
  110 + },
  111 + "/Type": "/Page"
  112 + },
  113 + "5 0 R": {
  114 + "/Contents": "9 0 R",
  115 + "/MediaBox": [
  116 + 0,
  117 + 0,
  118 + 612,
  119 + 792
  120 + ],
  121 + "/Parent": "3 0 R",
  122 + "/Resources": {
  123 + "/Font": {
  124 + "/F1": "8 0 R"
  125 + },
  126 + "/ProcSet": [
  127 + "/PDF",
  128 + "/Text"
  129 + ]
  130 + },
  131 + "/Type": "/Page"
  132 + },
  133 + "6 0 R": {
  134 + "/Length": "7 0 R"
  135 + },
  136 + "7 0 R": 47,
  137 + "8 0 R": {
  138 + "/BaseFont": "/Times-Roman",
  139 + "/Encoding": "/WinAnsiEncoding",
  140 + "/Subtype": "/Type1",
  141 + "/Type": "/Font"
  142 + },
  143 + "9 0 R": {
  144 + "/Length": "10 0 R"
  145 + },
  146 + "trailer": {
  147 + "/ID": [
  148 + "û˘·ƒÿ{5⁄\u0005Ú−S*º‘o",
  149 + "÷\u0017ž³QY¿ÔÀ\u000f\u0012−¼ý˜\u0002"
  150 + ],
  151 + "/Info": "2 0 R",
  152 + "/Root": "1 0 R",
  153 + "/Size": 11
  154 + }
  155 + },
  156 + "parameters": {
  157 + "decodelevel": "generalized"
  158 + },
  159 + "version": 1
  160 +}
... ...
qpdf/qtest/qpdf/page_api_2-json.out renamed to qpdf/qtest/qpdf/page_api_2-json-pages.out
1 1 {
2   - "acroform": {
3   - "fields": [],
4   - "hasacroform": false,
5   - "needappearances": false
6   - },
7   - "attachments": {},
8   - "encrypt": {
9   - "capabilities": {
10   - "accessibility": true,
11   - "extract": true,
12   - "moddifyannotations": true,
13   - "modify": true,
14   - "modifyassembly": true,
15   - "modifyforms": true,
16   - "modifyother": true,
17   - "printhigh": true,
18   - "printlow": true
19   - },
20   - "encrypted": false,
21   - "ownerpasswordmatched": false,
22   - "parameters": {
23   - "P": 0,
24   - "R": 0,
25   - "V": 0,
26   - "bits": 0,
27   - "filemethod": "none",
28   - "key": null,
29   - "method": "none",
30   - "streammethod": "none",
31   - "stringmethod": "none"
32   - },
33   - "userpasswordmatched": false
34   - },
35 2 "objectinfo": {
36 3 "1 0 R": {
37 4 "stream": {
... ... @@ -47,6 +14,13 @@
47 14 "length": null
48 15 }
49 16 },
  17 + "11 0 R": {
  18 + "stream": {
  19 + "filter": null,
  20 + "is": false,
  21 + "length": null
  22 + }
  23 + },
50 24 "2 0 R": {
51 25 "stream": {
52 26 "filter": null,
... ... @@ -110,6 +84,26 @@
110 84 "/Type": "/Catalog"
111 85 },
112 86 "10 0 R": 47,
  87 + "11 0 R": {
  88 + "/Contents": "6 0 R",
  89 + "/MediaBox": [
  90 + 0,
  91 + 0,
  92 + 612,
  93 + 792
  94 + ],
  95 + "/Parent": "3 0 R",
  96 + "/Resources": {
  97 + "/Font": {
  98 + "/F1": "8 0 R"
  99 + },
  100 + "/ProcSet": [
  101 + "/PDF",
  102 + "/Text"
  103 + ]
  104 + },
  105 + "/Type": "/Page"
  106 + },
113 107 "2 0 R": {
114 108 "/CreationDate": "D:20120621124041",
115 109 "/Producer": "Apex PDFWriter"
... ... @@ -118,7 +112,7 @@
118 112 "/Count": 3,
119 113 "/Kids": [
120 114 "4 0 R",
121   - "4 0 R",
  115 + "11 0 R",
122 116 "5 0 R"
123 117 ],
124 118 "/Type": "/Pages"
... ... @@ -186,8 +180,6 @@
186 180 "/Size": 11
187 181 }
188 182 },
189   - "outlines": [],
190   - "pagelabels": [],
191 183 "pages": [
192 184 {
193 185 "contents": [
... ...