Commit e4be0d5871b06b52d5f059ce71ad3fed81814048

Authored by m-holger
1 parent 3427df9b

During xref recovery reject /Page objects with multiple errors

When validating the pages tree after xref recovery do not tree dictionaries
as page objects if more than 2 of the following are true:

- the dictionary is direct
- the /Type entry is missing or is not /Page
- the dictionary does not contain a valid /Parent entry
- the dictionary does not contain or inherit a valid /MediaBox
- the dictionary does not contain or inherit a /Resources dictionary

Such dictionaries are very unlikely to be page objects and trying to
process them may cause excessive run time and memory usage.
include/qpdf/QPDF.hh
... ... @@ -772,8 +772,9 @@ class QPDF
772 772 bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
773 773 bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
774 774 qpdf_offset_t read_xrefTable(qpdf_offset_t offset);
775   - qpdf_offset_t read_xrefStream(qpdf_offset_t offset, bool in_stream_recovery=false);
776   - qpdf_offset_t processXRefStream(qpdf_offset_t offset, QPDFObjectHandle& xref_stream, bool in_stream_recovery=false);
  775 + qpdf_offset_t read_xrefStream(qpdf_offset_t offset, bool in_stream_recovery = false);
  776 + qpdf_offset_t processXRefStream(
  777 + qpdf_offset_t offset, QPDFObjectHandle& xref_stream, bool in_stream_recovery = false);
777 778 std::pair<int, std::array<int, 3>>
778 779 processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged);
779 780 int processXRefSize(
... ... @@ -897,7 +898,8 @@ class QPDF
897 898 QPDFObjectHandle cur_pages,
898 899 QPDFObjGen::set& visited,
899 900 QPDFObjGen::set& seen,
900   - bool media_box);
  901 + bool media_box,
  902 + bool resources);
901 903 void insertPage(QPDFObjectHandle newpage, int pos);
902 904 void flattenPagesTree();
903 905 void insertPageobjToPage(QPDFObjectHandle const& obj, int pos, bool check_duplicate);
... ...
libqpdf/QPDF_pages.cc
... ... @@ -43,9 +43,10 @@ QPDF::getAllPages()
43 43 // Note that pushInheritedAttributesToPage may also be used to initialize m->all_pages.
44 44 if (m->all_pages.empty() && !m->invalid_page_found) {
45 45 m->ever_called_get_all_pages = true;
  46 + auto root = getRoot();
46 47 QPDFObjGen::set visited;
47 48 QPDFObjGen::set seen;
48   - QPDFObjectHandle pages = getRoot().getKey("/Pages");
  49 + QPDFObjectHandle pages = root.getKey("/Pages");
49 50 bool warned = false;
50 51 bool changed_pages = false;
51 52 while (pages.isDictionary() && pages.hasKey("/Parent")) {
... ... @@ -56,7 +57,7 @@ QPDF::getAllPages()
56 57 // Files have been found in the wild where /Pages in the catalog points to the first
57 58 // page. Try to work around this and similar cases with this heuristic.
58 59 if (!warned) {
59   - getRoot().warnIfPossible(
  60 + root.warnIfPossible(
60 61 "document page tree root (root -> /Pages) doesn't point"
61 62 " to the root of the page tree; attempting to correct");
62 63 warned = true;
... ... @@ -65,7 +66,7 @@ QPDF::getAllPages()
65 66 pages = pages.getKey("/Parent");
66 67 }
67 68 if (changed_pages) {
68   - getRoot().replaceKey("/Pages", pages);
  69 + root.replaceKey("/Pages", pages);
69 70 }
70 71 seen.clear();
71 72 if (!pages.hasKey("/Kids")) {
... ... @@ -74,7 +75,7 @@ QPDF::getAllPages()
74 75 qpdf_e_pages, m->file->getName(), "", 0, "root of pages tree has no /Kids array");
75 76 }
76 77 try {
77   - getAllPagesInternal(pages, visited, seen, false);
  78 + getAllPagesInternal(pages, visited, seen, false, false);
78 79 } catch (...) {
79 80 m->all_pages.clear();
80 81 m->invalid_page_found = false;
... ... @@ -90,7 +91,11 @@ QPDF::getAllPages()
90 91  
91 92 void
92 93 QPDF::getAllPagesInternal(
93   - QPDFObjectHandle cur_node, QPDFObjGen::set& visited, QPDFObjGen::set& seen, bool media_box)
  94 + QPDFObjectHandle cur_node,
  95 + QPDFObjGen::set& visited,
  96 + QPDFObjGen::set& seen,
  97 + bool media_box,
  98 + bool resources)
94 99 {
95 100 if (!visited.add(cur_node)) {
96 101 throw QPDFExc(
... ... @@ -101,6 +106,9 @@ QPDF::getAllPagesInternal(
101 106 "Loop detected in /Pages structure (getAllPages)");
102 107 }
103 108 if (!cur_node.isDictionaryOfType("/Pages")) {
  109 + // During fuzzing files were encountered where the root object appeared in the pages tree.
  110 + // Unconditionally setting the /Type to /Pages could cause problems, but trying to
  111 + // accommodate the possibility may be excessive.
104 112 cur_node.warnIfPossible("/Type key should be /Pages but is not; overriding");
105 113 cur_node.replaceKey("/Type", "/Pages"_qpdf);
106 114 }
... ... @@ -108,6 +116,9 @@ QPDF::getAllPagesInternal(
108 116 media_box = cur_node.getKey("/MediaBox").isRectangle();
109 117 QTC::TC("qpdf", "QPDF inherit mediabox", media_box ? 0 : 1);
110 118 }
  119 + if (!resources) {
  120 + resources = cur_node.getKey("/Resources").isDictionary();
  121 + }
111 122 auto kids = cur_node.getKey("/Kids");
112 123 if (!visited.add(kids)) {
113 124 throw QPDFExc(
... ... @@ -120,13 +131,22 @@ QPDF::getAllPagesInternal(
120 131 int i = -1;
121 132 for (auto& kid: kids.as_array()) {
122 133 ++i;
  134 + int errors = 0;
  135 +
123 136 if (!kid.isDictionary()) {
124 137 kid.warnIfPossible("Pages tree includes non-dictionary object; ignoring");
125 138 m->invalid_page_found = true;
126 139 continue;
127 140 }
  141 + if (!kid.isIndirect()) {
  142 + QTC::TC("qpdf", "QPDF handle direct page object");
  143 + cur_node.warnIfPossible(
  144 + "kid " + std::to_string(i) + " (from 0) is direct; converting to indirect");
  145 + kid = makeIndirectObject(kid);
  146 + ++errors;
  147 + }
128 148 if (kid.hasKey("/Kids")) {
129   - getAllPagesInternal(kid, visited, seen, media_box);
  149 + getAllPagesInternal(kid, visited, seen, media_box, resources);
130 150 } else {
131 151 if (!media_box && !kid.getKey("/MediaBox").isRectangle()) {
132 152 QTC::TC("qpdf", "QPDF missing mediabox");
... ... @@ -136,13 +156,13 @@ QPDF::getAllPagesInternal(
136 156 kid.replaceKey(
137 157 "/MediaBox",
138 158 QPDFObjectHandle::newArray(QPDFObjectHandle::Rectangle(0, 0, 612, 792)));
  159 + ++errors;
139 160 }
140   - if (!kid.isIndirect()) {
141   - QTC::TC("qpdf", "QPDF handle direct page object");
142   - cur_node.warnIfPossible(
143   - "kid " + std::to_string(i) + " (from 0) is direct; converting to indirect");
144   - kid = makeIndirectObject(kid);
145   - } else if (!seen.add(kid)) {
  161 + if (!resources && !kid.getKey("/Resources").isDictionary()) {
  162 + // Consider adding an information message
  163 + ++errors;
  164 + }
  165 + if (!seen.add(kid)) {
146 166 // Make a copy of the page. This does the same as shallowCopyPage in
147 167 // QPDFPageObjectHelper.
148 168 QTC::TC("qpdf", "QPDF resolve duplicated page object");
... ... @@ -151,6 +171,8 @@ QPDF::getAllPagesInternal(
151 171 "kid " + std::to_string(i) +
152 172 " (from 0) appears more than once in the pages tree;"
153 173 " creating a new page object as a copy");
  174 + // This needs to be fixed. shallowCopy does not necessarily produce a valid
  175 + // page.
154 176 kid = makeIndirectObject(QPDFObjectHandle(kid).shallowCopy());
155 177 seen.add(kid);
156 178 } else {
... ... @@ -161,12 +183,24 @@ QPDF::getAllPagesInternal(
161 183 kid = QPDFObjectHandle::newNull();
162 184 continue;
163 185 }
  186 + if (!kid.getKey("/Parent").isSameObjectAs(cur_node)) {
  187 + // Consider fixing and adding an information message.
  188 + ++errors;
  189 + }
164 190 }
165 191 if (!kid.isDictionaryOfType("/Page")) {
166 192 kid.warnIfPossible("/Type key should be /Page but is not; overriding");
167 193 kid.replaceKey("/Type", "/Page"_qpdf);
  194 + ++errors;
  195 + }
  196 + if (m->in_xref_reconstruction && errors > 2) {
  197 + cur_node.warnIfPossible(
  198 + "kid " + std::to_string(i) + " (from 0) has too many errors; ignoring page");
  199 + m->invalid_page_found = true;
  200 + kid = QPDFObjectHandle::newNull();
  201 + continue;
168 202 }
169   - m->all_pages.push_back(kid);
  203 + m->all_pages.emplace_back(kid);
170 204 }
171 205 }
172 206 }
... ...