Commit 8b25de24c9b1e6acba042ea9ecdee783839e20a6

Authored by Jay Berkenbilt
1 parent 6b576797

Make "objects" and "pages" consistent in JSON output

ChangeLog
  1 +2022-05-04 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * json v1 output: make "pages" and "objects" consistent.
  4 + Previously, "objects" always reflected the objects exactly as they
  5 + appeared in the original file, while "pages" reflected objects
  6 + after repair of the pages tree. This could be misleading. Now, if
  7 + "pages" is specified, "objects" shows the effects of repairing the
  8 + page tree, and if not, it doesn't. This makes no difference for
  9 + correct PDF files that don't have problems in the pages tree. JSON
  10 + v2 will behave in a similar way.
  11 +
1 2022-05-03 Jay Berkenbilt <ejb@ql.org> 12 2022-05-03 Jay Berkenbilt <ejb@ql.org>
2 13
3 * Add new Pipeline class Pl_String which appends to a std::string& 14 * Add new Pipeline class Pl_String which appends to a std::string&
cSpell.json
@@ -511,6 +511,7 @@ @@ -511,6 +511,7 @@
511 "unfilterable", 511 "unfilterable",
512 "unparse", 512 "unparse",
513 "unpickling", 513 "unpickling",
  514 + "unrepaired",
514 "unretrieved", 515 "unretrieved",
515 "unversioned", 516 "unversioned",
516 "upages", 517 "upages",
libqpdf/QPDFJob.cc
@@ -1618,15 +1618,7 @@ QPDFJob::doJSON(QPDF& pdf) @@ -1618,15 +1618,7 @@ QPDFJob::doJSON(QPDF& pdf)
1618 bool all_keys = m->json_keys.empty(); 1618 bool all_keys = m->json_keys.empty();
1619 // The list of selectable top-level keys is duplicated in the 1619 // The list of selectable top-level keys is duplicated in the
1620 // following places: job.yml, QPDFJob::json_schema, and 1620 // following places: job.yml, QPDFJob::json_schema, and
1621 - // QPDFJob::doJSON. We do objects and objectinfo first so they  
1622 - // reflect the original file without any side effects caused by  
1623 - // other operations, such as repairing the pages tree.  
1624 - if (all_keys || m->json_keys.count("objects")) {  
1625 - doJSONObjects(pdf, j);  
1626 - }  
1627 - if (all_keys || m->json_keys.count("objectinfo")) {  
1628 - doJSONObjectinfo(pdf, j);  
1629 - } 1621 + // QPDFJob::doJSON.
1630 if (all_keys || m->json_keys.count("pages")) { 1622 if (all_keys || m->json_keys.count("pages")) {
1631 doJSONPages(pdf, j); 1623 doJSONPages(pdf, j);
1632 } 1624 }
@@ -1646,6 +1638,17 @@ QPDFJob::doJSON(QPDF& pdf) @@ -1646,6 +1638,17 @@ QPDFJob::doJSON(QPDF& pdf)
1646 doJSONAttachments(pdf, j); 1638 doJSONAttachments(pdf, j);
1647 } 1639 }
1648 1640
  1641 + // We do objects and objectinfo last so their information is
  1642 + // consistent with repairing the page tree. To see the original
  1643 + // file with any page tree problems and the page tree not
  1644 + // flattened, select objects/objectinfo without other keys.
  1645 + if (all_keys || m->json_keys.count("objects")) {
  1646 + doJSONObjects(pdf, j);
  1647 + }
  1648 + if (all_keys || m->json_keys.count("objectinfo")) {
  1649 + doJSONObjectinfo(pdf, j);
  1650 + }
  1651 +
1649 // Check against schema 1652 // Check against schema
1650 1653
1651 JSON schema = json_schema(&m->json_keys); 1654 JSON schema = json_schema(&m->json_keys);
manual/json.rst
@@ -147,6 +147,16 @@ For the most part, the built-in JSON help tells you everything you need @@ -147,6 +147,16 @@ For the most part, the built-in JSON help tells you everything you need
147 to know about the JSON format, but there are a few non-obvious things to 147 to know about the JSON format, but there are a few non-obvious things to
148 be aware of: 148 be aware of:
149 149
  150 +- If a PDF file has certain types of errors in its pages tree (such as
  151 + page objects that are direct or multiple pages sharing the same
  152 + object ID), qpdf will automatically repair the pages tree. If you
  153 + specify ``"objects"`` and/or ``"objectinfo"`` without any other
  154 + keys, you will see the original pages tree without any corrections.
  155 + If you specify any of the keys that require page tree traversal (for
  156 + example, ``"pages"``, ``"outlines"``, or ``"pagelabels"``), then
  157 + ``"objects"`` and ``"objectinfo"`` will show the repaired page tree
  158 + so that object references will be consistent throughout the file.
  159 +
150 - While qpdf guarantees that keys present in the help will be present 160 - While qpdf guarantees that keys present in the help will be present
151 in the output, those fields may be null or empty if the information 161 in the output, those fields may be null or empty if the information
152 is not known or absent in the file. Also, if you specify 162 is not known or absent in the file. Also, if you specify
manual/release-notes.rst
@@ -125,6 +125,13 @@ For a detailed list of changes, please see the file @@ -125,6 +125,13 @@ For a detailed list of changes, please see the file
125 125
126 - Other changes 126 - Other changes
127 127
  128 + - In JSON v1 mode, the ``"objects"`` key now reflects the repaired
  129 + pages tree if ``"pages"`` (or any other key that has the side
  130 + effect of repairing the page tree) is specified. To see the
  131 + original objects with any unrepaired page tree errors, specify
  132 + ``"objects"`` and/or ``"objectinfo"`` by themselves. This is
  133 + consistent with how JSON v2 behaves.
  134 +
128 - A new chapter on contributing to qpdf has been added to the 135 - A new chapter on contributing to qpdf has been added to the
129 documentation. See :ref:`contributing`. 136 documentation. See :ref:`contributing`.
130 137
qpdf/qtest/qpdf.test
@@ -2829,7 +2829,7 @@ $td->runtest("check output", @@ -2829,7 +2829,7 @@ $td->runtest("check output",
2829 show_ntests(); 2829 show_ntests();
2830 # ---------- 2830 # ----------
2831 $td->notify("--- Page Tree Issues ---"); 2831 $td->notify("--- Page Tree Issues ---");
2832 -$n_tests += 9; 2832 +$n_tests += 11;
2833 2833
2834 $td->runtest("linearize duplicated pages", 2834 $td->runtest("linearize duplicated pages",
2835 {$td->COMMAND => 2835 {$td->COMMAND =>
@@ -2864,14 +2864,22 @@ $td->runtest("show direct pages", @@ -2864,14 +2864,22 @@ $td->runtest("show direct pages",
2864 $td->NORMALIZE_NEWLINES); 2864 $td->NORMALIZE_NEWLINES);
2865 2865
2866 # Json mode for direct and duplicated pages illustrates that the 2866 # Json mode for direct and duplicated pages illustrates that the
2867 -# "objects" section still shows the original objects before correction  
2868 -# but the "pages" section shows the pages with their new object  
2869 -# numbers. 2867 +# "objects" section shows the original objects before correction when
  2868 +# "pages" is not output but shows them after correction when it is.
2870 foreach my $f (qw(page_api_2 direct-pages)) 2869 foreach my $f (qw(page_api_2 direct-pages))
2871 { 2870 {
2872 - $td->runtest("json for $f",  
2873 - {$td->COMMAND => "qpdf --json=latest $f.pdf"},  
2874 - {$td->FILE => "$f-json.out", $td->EXIT_STATUS => 0}, 2871 + $td->runtest("json for $f (objects only)",
  2872 + {$td->COMMAND =>
  2873 + "qpdf --json=latest $f.pdf" .
  2874 + " --json-key=objects --json-key=objectinfo"},
  2875 + {$td->FILE => "$f-json-objects.out", $td->EXIT_STATUS => 0},
  2876 + $td->NORMALIZE_NEWLINES);
  2877 + $td->runtest("json for $f (with pages)",
  2878 + {$td->COMMAND =>
  2879 + "qpdf --json=latest $f.pdf" .
  2880 + " --json-key=objects --json-key=objectinfo" .
  2881 + " --json-key=pages"},
  2882 + {$td->FILE => "$f-json-pages.out", $td->EXIT_STATUS => 0},
2875 $td->NORMALIZE_NEWLINES); 2883 $td->NORMALIZE_NEWLINES);
2876 } 2884 }
2877 2885
qpdf/qtest/qpdf/direct-pages-json.out renamed to qpdf/qtest/qpdf/direct-pages-json-objects.out
1 { 1 {
2 - "acroform": {  
3 - "fields": [],  
4 - "hasacroform": false,  
5 - "needappearances": false  
6 - },  
7 - "attachments": {},  
8 - "encrypt": {  
9 - "capabilities": {  
10 - "accessibility": true,  
11 - "extract": true,  
12 - "moddifyannotations": true,  
13 - "modify": true,  
14 - "modifyassembly": true,  
15 - "modifyforms": true,  
16 - "modifyother": true,  
17 - "printhigh": true,  
18 - "printlow": true  
19 - },  
20 - "encrypted": false,  
21 - "ownerpasswordmatched": false,  
22 - "parameters": {  
23 - "P": 0,  
24 - "R": 0,  
25 - "V": 0,  
26 - "bits": 0,  
27 - "filemethod": "none",  
28 - "key": null,  
29 - "method": "none",  
30 - "streammethod": "none",  
31 - "stringmethod": "none"  
32 - },  
33 - "userpasswordmatched": false  
34 - },  
35 "objectinfo": { 2 "objectinfo": {
36 "1 0 R": { 3 "1 0 R": {
37 "stream": { 4 "stream": {
@@ -145,30 +112,6 @@ @@ -145,30 +112,6 @@
145 "/Size": 7 112 "/Size": 7
146 } 113 }
147 }, 114 },
148 - "outlines": [],  
149 - "pagelabels": [],  
150 - "pages": [  
151 - {  
152 - "contents": [  
153 - "3 0 R"  
154 - ],  
155 - "images": [],  
156 - "label": null,  
157 - "object": "7 0 R",  
158 - "outlines": [],  
159 - "pageposfrom1": 1  
160 - },  
161 - {  
162 - "contents": [  
163 - "3 0 R"  
164 - ],  
165 - "images": [],  
166 - "label": null,  
167 - "object": "8 0 R",  
168 - "outlines": [],  
169 - "pageposfrom1": 2  
170 - }  
171 - ],  
172 "parameters": { 115 "parameters": {
173 "decodelevel": "generalized" 116 "decodelevel": "generalized"
174 }, 117 },
qpdf/qtest/qpdf/direct-pages-json-pages.out 0 → 100644
  1 +{
  2 + "objectinfo": {
  3 + "1 0 R": {
  4 + "stream": {
  5 + "filter": null,
  6 + "is": false,
  7 + "length": null
  8 + }
  9 + },
  10 + "2 0 R": {
  11 + "stream": {
  12 + "filter": null,
  13 + "is": false,
  14 + "length": null
  15 + }
  16 + },
  17 + "3 0 R": {
  18 + "stream": {
  19 + "filter": null,
  20 + "is": true,
  21 + "length": 44
  22 + }
  23 + },
  24 + "4 0 R": {
  25 + "stream": {
  26 + "filter": null,
  27 + "is": false,
  28 + "length": null
  29 + }
  30 + },
  31 + "5 0 R": {
  32 + "stream": {
  33 + "filter": null,
  34 + "is": false,
  35 + "length": null
  36 + }
  37 + },
  38 + "6 0 R": {
  39 + "stream": {
  40 + "filter": null,
  41 + "is": false,
  42 + "length": null
  43 + }
  44 + },
  45 + "7 0 R": {
  46 + "stream": {
  47 + "filter": null,
  48 + "is": false,
  49 + "length": null
  50 + }
  51 + },
  52 + "8 0 R": {
  53 + "stream": {
  54 + "filter": null,
  55 + "is": false,
  56 + "length": null
  57 + }
  58 + }
  59 + },
  60 + "objects": {
  61 + "1 0 R": {
  62 + "/Pages": "2 0 R",
  63 + "/Type": "/Catalog"
  64 + },
  65 + "2 0 R": {
  66 + "/Count": 2,
  67 + "/Kids": [
  68 + "7 0 R",
  69 + "8 0 R"
  70 + ],
  71 + "/Type": "/Pages"
  72 + },
  73 + "3 0 R": {
  74 + "/Length": "4 0 R"
  75 + },
  76 + "4 0 R": 44,
  77 + "5 0 R": {
  78 + "/BaseFont": "/Helvetica",
  79 + "/Encoding": "/WinAnsiEncoding",
  80 + "/Name": "/F1",
  81 + "/Subtype": "/Type1",
  82 + "/Type": "/Font"
  83 + },
  84 + "6 0 R": [
  85 + "/PDF",
  86 + "/Text"
  87 + ],
  88 + "7 0 R": {
  89 + "/Contents": "3 0 R",
  90 + "/MediaBox": [
  91 + 0,
  92 + 0,
  93 + 612,
  94 + 792
  95 + ],
  96 + "/Parent": "2 0 R",
  97 + "/Resources": {
  98 + "/Font": {
  99 + "/F1": "5 0 R"
  100 + },
  101 + "/ProcSet": "6 0 R"
  102 + },
  103 + "/Type": "/Page"
  104 + },
  105 + "8 0 R": {
  106 + "/Contents": "3 0 R",
  107 + "/MediaBox": [
  108 + 0,
  109 + 0,
  110 + 612,
  111 + 792
  112 + ],
  113 + "/Parent": "2 0 R",
  114 + "/Resources": {
  115 + "/Font": {
  116 + "/F1": "5 0 R"
  117 + },
  118 + "/ProcSet": "6 0 R"
  119 + },
  120 + "/Type": "/Page"
  121 + },
  122 + "trailer": {
  123 + "/ID": [
  124 + "\u0013#ยฅ๏ฌ|WzfsUโ€ฆยฉ6ลธรŽ<",
  125 + "7,ยฟDรถร›โ€นยซ`ร™&<\u000f\u000bร’j"
  126 + ],
  127 + "/Root": "1 0 R",
  128 + "/Size": 7
  129 + }
  130 + },
  131 + "pages": [
  132 + {
  133 + "contents": [
  134 + "3 0 R"
  135 + ],
  136 + "images": [],
  137 + "label": null,
  138 + "object": "7 0 R",
  139 + "outlines": [],
  140 + "pageposfrom1": 1
  141 + },
  142 + {
  143 + "contents": [
  144 + "3 0 R"
  145 + ],
  146 + "images": [],
  147 + "label": null,
  148 + "object": "8 0 R",
  149 + "outlines": [],
  150 + "pageposfrom1": 2
  151 + }
  152 + ],
  153 + "parameters": {
  154 + "decodelevel": "generalized"
  155 + },
  156 + "version": 1
  157 +}
qpdf/qtest/qpdf/page_api_2-json-objects.out 0 → 100644
  1 +{
  2 + "objectinfo": {
  3 + "1 0 R": {
  4 + "stream": {
  5 + "filter": null,
  6 + "is": false,
  7 + "length": null
  8 + }
  9 + },
  10 + "10 0 R": {
  11 + "stream": {
  12 + "filter": null,
  13 + "is": false,
  14 + "length": null
  15 + }
  16 + },
  17 + "2 0 R": {
  18 + "stream": {
  19 + "filter": null,
  20 + "is": false,
  21 + "length": null
  22 + }
  23 + },
  24 + "3 0 R": {
  25 + "stream": {
  26 + "filter": null,
  27 + "is": false,
  28 + "length": null
  29 + }
  30 + },
  31 + "4 0 R": {
  32 + "stream": {
  33 + "filter": null,
  34 + "is": false,
  35 + "length": null
  36 + }
  37 + },
  38 + "5 0 R": {
  39 + "stream": {
  40 + "filter": null,
  41 + "is": false,
  42 + "length": null
  43 + }
  44 + },
  45 + "6 0 R": {
  46 + "stream": {
  47 + "filter": null,
  48 + "is": true,
  49 + "length": 47
  50 + }
  51 + },
  52 + "7 0 R": {
  53 + "stream": {
  54 + "filter": null,
  55 + "is": false,
  56 + "length": null
  57 + }
  58 + },
  59 + "8 0 R": {
  60 + "stream": {
  61 + "filter": null,
  62 + "is": false,
  63 + "length": null
  64 + }
  65 + },
  66 + "9 0 R": {
  67 + "stream": {
  68 + "filter": null,
  69 + "is": true,
  70 + "length": 47
  71 + }
  72 + }
  73 + },
  74 + "objects": {
  75 + "1 0 R": {
  76 + "/Pages": "3 0 R",
  77 + "/Type": "/Catalog"
  78 + },
  79 + "10 0 R": 47,
  80 + "2 0 R": {
  81 + "/CreationDate": "D:20120621124041",
  82 + "/Producer": "Apex PDFWriter"
  83 + },
  84 + "3 0 R": {
  85 + "/Count": 3,
  86 + "/Kids": [
  87 + "4 0 R",
  88 + "4 0 R",
  89 + "5 0 R"
  90 + ],
  91 + "/Type": "/Pages"
  92 + },
  93 + "4 0 R": {
  94 + "/Contents": "6 0 R",
  95 + "/MediaBox": [
  96 + 0,
  97 + 0,
  98 + 612,
  99 + 792
  100 + ],
  101 + "/Parent": "3 0 R",
  102 + "/Resources": {
  103 + "/Font": {
  104 + "/F1": "8 0 R"
  105 + },
  106 + "/ProcSet": [
  107 + "/PDF",
  108 + "/Text"
  109 + ]
  110 + },
  111 + "/Type": "/Page"
  112 + },
  113 + "5 0 R": {
  114 + "/Contents": "9 0 R",
  115 + "/MediaBox": [
  116 + 0,
  117 + 0,
  118 + 612,
  119 + 792
  120 + ],
  121 + "/Parent": "3 0 R",
  122 + "/Resources": {
  123 + "/Font": {
  124 + "/F1": "8 0 R"
  125 + },
  126 + "/ProcSet": [
  127 + "/PDF",
  128 + "/Text"
  129 + ]
  130 + },
  131 + "/Type": "/Page"
  132 + },
  133 + "6 0 R": {
  134 + "/Length": "7 0 R"
  135 + },
  136 + "7 0 R": 47,
  137 + "8 0 R": {
  138 + "/BaseFont": "/Times-Roman",
  139 + "/Encoding": "/WinAnsiEncoding",
  140 + "/Subtype": "/Type1",
  141 + "/Type": "/Font"
  142 + },
  143 + "9 0 R": {
  144 + "/Length": "10 0 R"
  145 + },
  146 + "trailer": {
  147 + "/ID": [
  148 + "รปห˜ยทฦ’รฟ{5โ„\u0005รšโˆ’S*ยบโ€˜o",
  149 + "รท\u0017ลพยณQYยฟร”ร€\u000f\u0012โˆ’ยผรฝหœ\u0002"
  150 + ],
  151 + "/Info": "2 0 R",
  152 + "/Root": "1 0 R",
  153 + "/Size": 11
  154 + }
  155 + },
  156 + "parameters": {
  157 + "decodelevel": "generalized"
  158 + },
  159 + "version": 1
  160 +}
qpdf/qtest/qpdf/page_api_2-json.out renamed to qpdf/qtest/qpdf/page_api_2-json-pages.out
1 { 1 {
2 - "acroform": {  
3 - "fields": [],  
4 - "hasacroform": false,  
5 - "needappearances": false  
6 - },  
7 - "attachments": {},  
8 - "encrypt": {  
9 - "capabilities": {  
10 - "accessibility": true,  
11 - "extract": true,  
12 - "moddifyannotations": true,  
13 - "modify": true,  
14 - "modifyassembly": true,  
15 - "modifyforms": true,  
16 - "modifyother": true,  
17 - "printhigh": true,  
18 - "printlow": true  
19 - },  
20 - "encrypted": false,  
21 - "ownerpasswordmatched": false,  
22 - "parameters": {  
23 - "P": 0,  
24 - "R": 0,  
25 - "V": 0,  
26 - "bits": 0,  
27 - "filemethod": "none",  
28 - "key": null,  
29 - "method": "none",  
30 - "streammethod": "none",  
31 - "stringmethod": "none"  
32 - },  
33 - "userpasswordmatched": false  
34 - },  
35 "objectinfo": { 2 "objectinfo": {
36 "1 0 R": { 3 "1 0 R": {
37 "stream": { 4 "stream": {
@@ -47,6 +14,13 @@ @@ -47,6 +14,13 @@
47 "length": null 14 "length": null
48 } 15 }
49 }, 16 },
  17 + "11 0 R": {
  18 + "stream": {
  19 + "filter": null,
  20 + "is": false,
  21 + "length": null
  22 + }
  23 + },
50 "2 0 R": { 24 "2 0 R": {
51 "stream": { 25 "stream": {
52 "filter": null, 26 "filter": null,
@@ -110,6 +84,26 @@ @@ -110,6 +84,26 @@
110 "/Type": "/Catalog" 84 "/Type": "/Catalog"
111 }, 85 },
112 "10 0 R": 47, 86 "10 0 R": 47,
  87 + "11 0 R": {
  88 + "/Contents": "6 0 R",
  89 + "/MediaBox": [
  90 + 0,
  91 + 0,
  92 + 612,
  93 + 792
  94 + ],
  95 + "/Parent": "3 0 R",
  96 + "/Resources": {
  97 + "/Font": {
  98 + "/F1": "8 0 R"
  99 + },
  100 + "/ProcSet": [
  101 + "/PDF",
  102 + "/Text"
  103 + ]
  104 + },
  105 + "/Type": "/Page"
  106 + },
113 "2 0 R": { 107 "2 0 R": {
114 "/CreationDate": "D:20120621124041", 108 "/CreationDate": "D:20120621124041",
115 "/Producer": "Apex PDFWriter" 109 "/Producer": "Apex PDFWriter"
@@ -118,7 +112,7 @@ @@ -118,7 +112,7 @@
118 "/Count": 3, 112 "/Count": 3,
119 "/Kids": [ 113 "/Kids": [
120 "4 0 R", 114 "4 0 R",
121 - "4 0 R", 115 + "11 0 R",
122 "5 0 R" 116 "5 0 R"
123 ], 117 ],
124 "/Type": "/Pages" 118 "/Type": "/Pages"
@@ -186,8 +180,6 @@ @@ -186,8 +180,6 @@
186 "/Size": 11 180 "/Size": 11
187 } 181 }
188 }, 182 },
189 - "outlines": [],  
190 - "pagelabels": [],  
191 "pages": [ 183 "pages": [
192 { 184 {
193 "contents": [ 185 "contents": [