Commit b8fd18ae562ab8bae1d2e67c1ab63ff4ea62124b

Authored by Jay Berkenbilt
2 parents 64c840b1 1285f976

Merge branch 'parse_ref' into work

libqpdf/QPDFParser.cc
... ... @@ -21,22 +21,7 @@
21 21  
22 22 #include <memory>
23 23  
24   -namespace
25   -{
26   - struct StackFrame
27   - {
28   - StackFrame(std::shared_ptr<InputSource> input) :
29   - offset(input->tell())
30   - {
31   - }
32   -
33   - std::vector<std::shared_ptr<QPDFObject>> olist;
34   - qpdf_offset_t offset;
35   - std::string contents_string{""};
36   - qpdf_offset_t contents_offset{-1};
37   - int null_count{0};
38   - };
39   -} // namespace
  24 +using ObjectPtr = std::shared_ptr<QPDFObject>;
40 25  
41 26 QPDFObjectHandle
42 27 QPDFParser::parse(bool& empty, bool content_stream)
... ... @@ -46,371 +31,457 @@ QPDFParser::parse(bool& empty, bool content_stream)
46 31 // effect of reading the object and changing the file pointer. If you do this, it will cause a
47 32 // logic error to be thrown from QPDF::inParse().
48 33  
49   - const static std::shared_ptr<QPDFObject> null_oh = QPDF_Null::create();
50 34 QPDF::ParseGuard pg(context);
51   -
52 35 empty = false;
  36 + start = input->tell();
53 37  
54   - std::shared_ptr<QPDFObject> object;
55   - bool set_offset = false;
56   -
57   - std::vector<StackFrame> stack;
58   - stack.emplace_back(input);
59   - std::vector<parser_state_e> state_stack;
60   - state_stack.push_back(st_top);
61   - qpdf_offset_t offset;
62   - bool done = false;
63   - int bad_count = 0;
64   - int good_count = 0;
65   - bool b_contents = false;
66   - bool is_null = false;
  38 + if (!tokenizer.nextToken(*input, object_description)) {
  39 + warn(tokenizer.getErrorMessage());
  40 + }
  41 +
  42 + switch (tokenizer.getType()) {
  43 + case QPDFTokenizer::tt_eof:
  44 + if (content_stream) {
  45 + // In content stream mode, leave object uninitialized to indicate EOF
  46 + return {};
  47 + }
  48 + QTC::TC("qpdf", "QPDFParser eof in parse");
  49 + warn("unexpected EOF");
  50 + return {QPDF_Null::create()};
  51 +
  52 + case QPDFTokenizer::tt_bad:
  53 + QTC::TC("qpdf", "QPDFParser bad token in parse");
  54 + return {QPDF_Null::create()};
  55 +
  56 + case QPDFTokenizer::tt_brace_open:
  57 + case QPDFTokenizer::tt_brace_close:
  58 + QTC::TC("qpdf", "QPDFParser bad brace");
  59 + warn("treating unexpected brace token as null");
  60 + return {QPDF_Null::create()};
  61 +
  62 + case QPDFTokenizer::tt_array_close:
  63 + QTC::TC("qpdf", "QPDFParser bad array close");
  64 + warn("treating unexpected array close token as null");
  65 + return {QPDF_Null::create()};
  66 +
  67 + case QPDFTokenizer::tt_dict_close:
  68 + QTC::TC("qpdf", "QPDFParser bad dictionary close");
  69 + warn("unexpected dictionary close token");
  70 + return {QPDF_Null::create()};
  71 +
  72 + case QPDFTokenizer::tt_array_open:
  73 + case QPDFTokenizer::tt_dict_open:
  74 + stack.clear();
  75 + stack.emplace_back(
  76 + input,
  77 + (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array : st_dictionary_key);
  78 + frame = &stack.back();
  79 + return parseRemainder(content_stream);
  80 +
  81 + case QPDFTokenizer::tt_bool:
  82 + return withDescription<QPDF_Bool>(tokenizer.getValue() == "true");
  83 +
  84 + case QPDFTokenizer::tt_null:
  85 + return {QPDF_Null::create()};
  86 +
  87 + case QPDFTokenizer::tt_integer:
  88 + return withDescription<QPDF_Integer>(QUtil::string_to_ll(tokenizer.getValue().c_str()));
  89 +
  90 + case QPDFTokenizer::tt_real:
  91 + return withDescription<QPDF_Real>(tokenizer.getValue());
  92 +
  93 + case QPDFTokenizer::tt_name:
  94 + return withDescription<QPDF_Name>(tokenizer.getValue());
  95 +
  96 + case QPDFTokenizer::tt_word:
  97 + {
  98 + auto const& value = tokenizer.getValue();
  99 + if (content_stream) {
  100 + return withDescription<QPDF_Operator>(value);
  101 + } else if (value == "endobj") {
  102 + // We just saw endobj without having read anything. Treat this as a null and do
  103 + // not move the input source's offset.
  104 + input->seek(input->getLastOffset(), SEEK_SET);
  105 + empty = true;
  106 + return {QPDF_Null::create()};
  107 + } else {
  108 + QTC::TC("qpdf", "QPDFParser treat word as string");
  109 + warn("unknown token while reading object; treating as string");
  110 + return withDescription<QPDF_String>(value);
  111 + }
  112 + }
  113 +
  114 + case QPDFTokenizer::tt_string:
  115 + if (decrypter) {
  116 + std::string s{tokenizer.getValue()};
  117 + decrypter->decryptString(s);
  118 + return withDescription<QPDF_String>(s);
  119 + } else {
  120 + return withDescription<QPDF_String>(tokenizer.getValue());
  121 + }
  122 +
  123 + default:
  124 + warn("treating unknown token type as null while reading object");
  125 + return {QPDF_Null::create()};
  126 + }
  127 +}
67 128  
68   - while (!done) {
69   - bool bad = false;
70   - bool indirect_ref = false;
71   - is_null = false;
72   - auto& frame = stack.back();
73   - auto& olist = frame.olist;
74   - parser_state_e state = state_stack.back();
75   - offset = frame.offset;
  129 +QPDFObjectHandle
  130 +QPDFParser::parseRemainder(bool content_stream)
  131 +{
  132 + // This method must take care not to resolve any objects. Don't check the type of any object
  133 + // without first ensuring that it is a direct object. Otherwise, doing so may have the side
  134 + // effect of reading the object and changing the file pointer. If you do this, it will cause a
  135 + // logic error to be thrown from QPDF::inParse().
76 136  
77   - object = nullptr;
78   - set_offset = false;
  137 + bad_count = 0;
  138 + bool b_contents = false;
79 139  
  140 + while (true) {
80 141 if (!tokenizer.nextToken(*input, object_description)) {
81 142 warn(tokenizer.getErrorMessage());
82 143 }
  144 + ++good_count; // optimistically
  145 +
  146 + if (int_count != 0) {
  147 + // Special handling of indirect references. Treat integer tokens as part of an indirect
  148 + // reference until proven otherwise.
  149 + if (tokenizer.getType() == QPDFTokenizer::tt_integer) {
  150 + if (++int_count > 2) {
  151 + // Process the oldest buffered integer.
  152 + addInt(int_count);
  153 + }
  154 + last_offset_buffer[int_count % 2] = input->getLastOffset();
  155 + int_buffer[int_count % 2] = QUtil::string_to_ll(tokenizer.getValue().c_str());
  156 + continue;
  157 +
  158 + } else if (
  159 + int_count >= 2 && tokenizer.getType() == QPDFTokenizer::tt_word &&
  160 + tokenizer.getValue() == "R") {
  161 + if (context == nullptr) {
  162 + QTC::TC("qpdf", "QPDFParser indirect without context");
  163 + throw std::logic_error("QPDFParser::parse called without context on an object "
  164 + "with indirect references");
  165 + }
  166 + auto ref_og = QPDFObjGen(
  167 + QIntC::to_int(int_buffer[(int_count - 1) % 2]),
  168 + QIntC::to_int(int_buffer[(int_count) % 2]));
  169 + if (ref_og.isIndirect()) {
  170 + // This action has the desirable side effect of causing dangling references
  171 + // (references to indirect objects that don't appear in the PDF) in any parsed
  172 + // object to appear in the object cache.
  173 + add(std::move(context->getObject(ref_og).obj));
  174 + } else {
  175 + QTC::TC("qpdf", "QPDFParser indirect with 0 objid");
  176 + addNull();
  177 + }
  178 + int_count = 0;
  179 + continue;
  180 +
  181 + } else if (int_count > 0) {
  182 + // Process the buffered integers before processing the current token.
  183 + if (int_count > 1) {
  184 + addInt(int_count - 1);
  185 + }
  186 + addInt(int_count);
  187 + int_count = 0;
  188 + }
  189 + }
83 190  
84 191 switch (tokenizer.getType()) {
85 192 case QPDFTokenizer::tt_eof:
86   - if (!content_stream) {
87   - QTC::TC("qpdf", "QPDFParser eof in parse");
88   - warn("unexpected EOF");
  193 + warn("parse error while reading object");
  194 + if (content_stream) {
  195 + // In content stream mode, leave object uninitialized to indicate EOF
  196 + return {};
89 197 }
90   - bad = true;
91   - state = st_eof;
92   - break;
  198 + QTC::TC("qpdf", "QPDFParser eof in parseRemainder");
  199 + warn("unexpected EOF");
  200 + return {QPDF_Null::create()};
93 201  
94 202 case QPDFTokenizer::tt_bad:
95   - QTC::TC("qpdf", "QPDFParser bad token in parse");
96   - bad = true;
97   - is_null = true;
98   - break;
  203 + QTC::TC("qpdf", "QPDFParser bad token in parseRemainder");
  204 + if (tooManyBadTokens()) {
  205 + return {QPDF_Null::create()};
  206 + }
  207 + addNull();
  208 + continue;
99 209  
100 210 case QPDFTokenizer::tt_brace_open:
101 211 case QPDFTokenizer::tt_brace_close:
102   - QTC::TC("qpdf", "QPDFParser bad brace");
  212 + QTC::TC("qpdf", "QPDFParser bad brace in parseRemainder");
103 213 warn("treating unexpected brace token as null");
104   - bad = true;
105   - is_null = true;
106   - break;
  214 + if (tooManyBadTokens()) {
  215 + return {QPDF_Null::create()};
  216 + }
  217 + addNull();
  218 + continue;
107 219  
108 220 case QPDFTokenizer::tt_array_close:
109   - if (state == st_array) {
110   - state = st_stop;
  221 + if (frame->state == st_array) {
  222 + auto object = QPDF_Array::create(std::move(frame->olist), frame->null_count > 100);
  223 + setDescription(object, frame->offset - 1);
  224 + // The `offset` points to the next of "[". Set the rewind offset to point to the
  225 + // beginning of "[". This has been explicitly tested with whitespace surrounding the
  226 + // array start delimiter. getLastOffset points to the array end token and therefore
  227 + // can't be used here.
  228 + if (stack.size() <= 1) {
  229 + return object;
  230 + }
  231 + stack.pop_back();
  232 + frame = &stack.back();
  233 + add(std::move(object));
111 234 } else {
112   - QTC::TC("qpdf", "QPDFParser bad array close");
  235 + QTC::TC("qpdf", "QPDFParser bad array close in parseRemainder");
113 236 warn("treating unexpected array close token as null");
114   - bad = true;
115   - is_null = true;
  237 + if (tooManyBadTokens()) {
  238 + return {QPDF_Null::create()};
  239 + }
  240 + addNull();
116 241 }
117   - break;
  242 + continue;
118 243  
119 244 case QPDFTokenizer::tt_dict_close:
120   - if (state == st_dictionary) {
121   - state = st_stop;
  245 + if (frame->state <= st_dictionary_value) {
  246 + // Attempt to recover more or less gracefully from invalid dictionaries.
  247 + auto& dict = frame->dict;
  248 +
  249 + if (frame->state == st_dictionary_value) {
  250 + QTC::TC("qpdf", "QPDFParser no val for last key");
  251 + warn(
  252 + frame->offset,
  253 + "dictionary ended prematurely; using null as value for last key");
  254 + dict[frame->key] = QPDF_Null::create();
  255 + }
  256 +
  257 + if (!frame->olist.empty())
  258 + fixMissingKeys();
  259 +
  260 + if (!frame->contents_string.empty() && dict.count("/Type") &&
  261 + dict["/Type"].isNameAndEquals("/Sig") && dict.count("/ByteRange") &&
  262 + dict.count("/Contents") && dict["/Contents"].isString()) {
  263 + dict["/Contents"] = QPDFObjectHandle::newString(frame->contents_string);
  264 + dict["/Contents"].setParsedOffset(frame->contents_offset);
  265 + }
  266 + auto object = QPDF_Dictionary::create(std::move(dict));
  267 + setDescription(object, frame->offset - 2);
  268 + // The `offset` points to the next of "<<". Set the rewind offset to point to the
  269 + // beginning of "<<". This has been explicitly tested with whitespace surrounding
  270 + // the dictionary start delimiter. getLastOffset points to the dictionary end token
  271 + // and therefore can't be used here.
  272 + if (stack.size() <= 1) {
  273 + return object;
  274 + }
  275 + stack.pop_back();
  276 + frame = &stack.back();
  277 + add(std::move(object));
122 278 } else {
123   - QTC::TC("qpdf", "QPDFParser bad dictionary close");
  279 + QTC::TC("qpdf", "QPDFParser bad dictionary close in parseRemainder");
124 280 warn("unexpected dictionary close token");
125   - bad = true;
126   - is_null = true;
  281 + if (tooManyBadTokens()) {
  282 + return {QPDF_Null::create()};
  283 + }
  284 + addNull();
127 285 }
128   - break;
  286 + continue;
129 287  
130 288 case QPDFTokenizer::tt_array_open:
131 289 case QPDFTokenizer::tt_dict_open:
132   - if (stack.size() > 500) {
  290 + if (stack.size() > 499) {
133 291 QTC::TC("qpdf", "QPDFParser too deep");
134 292 warn("ignoring excessively deeply nested data structure");
135   - bad = true;
136   - is_null = true;
137   - state = st_top;
  293 + return {QPDF_Null::create()};
138 294 } else {
139   - state = st_start;
140   - state_stack.push_back(
141   - (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array
142   - : st_dictionary);
143 295 b_contents = false;
144   - stack.emplace_back(input);
  296 + stack.emplace_back(
  297 + input,
  298 + (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array
  299 + : st_dictionary_key);
  300 + frame = &stack.back();
  301 + continue;
145 302 }
146   - break;
147 303  
148 304 case QPDFTokenizer::tt_bool:
149   - object = QPDF_Bool::create((tokenizer.getValue() == "true"));
150   - break;
  305 + addScalar<QPDF_Bool>(tokenizer.getValue() == "true");
  306 + continue;
151 307  
152 308 case QPDFTokenizer::tt_null:
153   - is_null = true;
154   - ++frame.null_count;
155   -
156   - break;
  309 + addNull();
  310 + continue;
157 311  
158 312 case QPDFTokenizer::tt_integer:
159   - object = QPDF_Integer::create(
160   - QUtil::string_to_ll(std::string(tokenizer.getValue()).c_str()));
161   - break;
  313 + if (!content_stream) {
  314 + // Buffer token in case it is part of an indirect reference.
  315 + last_offset_buffer[1] = input->getLastOffset();
  316 + int_buffer[1] = QUtil::string_to_ll(tokenizer.getValue().c_str());
  317 + int_count = 1;
  318 + } else {
  319 + addScalar<QPDF_Integer>(QUtil::string_to_ll(tokenizer.getValue().c_str()));
  320 + }
  321 + continue;
162 322  
163 323 case QPDFTokenizer::tt_real:
164   - object = QPDF_Real::create(tokenizer.getValue());
165   - break;
  324 + addScalar<QPDF_Real>(tokenizer.getValue());
  325 + continue;
166 326  
167 327 case QPDFTokenizer::tt_name:
168   - {
169   - auto name = tokenizer.getValue();
170   - object = QPDF_Name::create(name);
171   -
172   - if (name == "/Contents") {
173   - b_contents = true;
174   - } else {
175   - b_contents = false;
176   - }
  328 + if (frame->state == st_dictionary_key) {
  329 + frame->key = tokenizer.getValue();
  330 + frame->state = st_dictionary_value;
  331 + b_contents = decrypter && frame->key == "/Contents";
  332 + continue;
  333 + } else {
  334 + addScalar<QPDF_Name>(tokenizer.getValue());
177 335 }
178   - break;
  336 + continue;
179 337  
180 338 case QPDFTokenizer::tt_word:
181   - {
182   - auto value = tokenizer.getValue();
183   - auto size = olist.size();
184   - if (content_stream) {
185   - object = QPDF_Operator::create(value);
186   - } else if (
187   - value == "R" && state != st_top && size >= 2 && olist.back() &&
188   - olist.back()->getTypeCode() == ::ot_integer &&
189   - !olist.back()->getObjGen().isIndirect() && olist.at(size - 2) &&
190   - olist.at(size - 2)->getTypeCode() == ::ot_integer &&
191   - !olist.at(size - 2)->getObjGen().isIndirect()) {
192   - if (context == nullptr) {
193   - QTC::TC("qpdf", "QPDFParser indirect without context");
194   - throw std::logic_error("QPDFObjectHandle::parse called without context on "
195   - "an object with indirect references");
196   - }
197   - auto ref_og = QPDFObjGen(
198   - QPDFObjectHandle(olist.at(size - 2)).getIntValueAsInt(),
199   - QPDFObjectHandle(olist.back()).getIntValueAsInt());
200   - if (ref_og.isIndirect()) {
201   - // This action has the desirable side effect of causing dangling references
202   - // (references to indirect objects that don't appear in the PDF) in any
203   - // parsed object to appear in the object cache.
204   - object = context->getObject(ref_og).obj;
205   - indirect_ref = true;
206   - } else {
207   - QTC::TC("qpdf", "QPDFParser indirect with 0 objid");
208   - is_null = true;
209   - }
210   - olist.pop_back();
211   - olist.pop_back();
212   - } else if ((value == "endobj") && (state == st_top)) {
213   - // We just saw endobj without having read anything. Treat this as a null and do
214   - // not move the input source's offset.
215   - is_null = true;
216   - input->seek(input->getLastOffset(), SEEK_SET);
217   - empty = true;
218   - } else {
219   - QTC::TC("qpdf", "QPDFParser treat word as string");
220   - warn("unknown token while reading object; treating as string");
221   - bad = true;
222   - object = QPDF_String::create(value);
  339 + if (content_stream) {
  340 + addScalar<QPDF_Operator>(tokenizer.getValue());
  341 + } else {
  342 + QTC::TC("qpdf", "QPDFParser treat word as string in parseRemainder");
  343 + warn("unknown token while reading object; treating as string");
  344 + if (tooManyBadTokens()) {
  345 + return {QPDF_Null::create()};
223 346 }
  347 + addScalar<QPDF_String>(tokenizer.getValue());
224 348 }
225   - break;
  349 + continue;
226 350  
227 351 case QPDFTokenizer::tt_string:
228 352 {
229   - auto val = tokenizer.getValue();
  353 + auto const& val = tokenizer.getValue();
230 354 if (decrypter) {
231 355 if (b_contents) {
232   - frame.contents_string = val;
233   - frame.contents_offset = input->getLastOffset();
  356 + frame->contents_string = val;
  357 + frame->contents_offset = input->getLastOffset();
234 358 b_contents = false;
235 359 }
236 360 std::string s{val};
237 361 decrypter->decryptString(s);
238   - object = QPDF_String::create(s);
  362 + addScalar<QPDF_String>(s);
239 363 } else {
240   - object = QPDF_String::create(val);
  364 + addScalar<QPDF_String>(val);
241 365 }
242 366 }
243   -
244   - break;
  367 + continue;
245 368  
246 369 default:
247 370 warn("treating unknown token type as null while reading object");
248   - bad = true;
249   - is_null = true;
250   - break;
251   - }
252   -
253   - if (object == nullptr && !is_null &&
254   - (!((state == st_start) || (state == st_stop) || (state == st_eof)))) {
255   - throw std::logic_error("QPDFParser:parseInternal: unexpected uninitialized object");
256   - is_null = true;
257   - }
258   -
259   - if (bad) {
260   - ++bad_count;
261   - good_count = 0;
262   - } else {
263   - ++good_count;
264   - if (good_count > 3) {
265   - bad_count = 0;
  371 + if (tooManyBadTokens()) {
  372 + return {QPDF_Null::create()};
266 373 }
  374 + addNull();
267 375 }
268   - if (bad_count > 5) {
269   - // We had too many consecutive errors without enough intervening successful objects.
270   - // Give up.
271   - warn("too many errors; giving up on reading object");
272   - state = st_top;
273   - is_null = true;
274   - }
  376 + }
  377 +}
275 378  
276   - switch (state) {
277   - case st_eof:
278   - if (state_stack.size() > 1) {
279   - warn("parse error while reading object");
280   - }
281   - done = true;
282   - // In content stream mode, leave object uninitialized to indicate EOF
283   - if (!content_stream) {
284   - is_null = true;
285   - }
286   - break;
287   -
288   - case st_dictionary:
289   - case st_array:
290   - if (is_null) {
291   - object = null_oh;
292   - // No need to set description for direct nulls - they probably will become implicit.
293   - } else if (!indirect_ref) {
294   - setDescription(object, input->getLastOffset());
295   - }
296   - set_offset = true;
297   - olist.push_back(object);
298   - break;
  379 +void
  380 +QPDFParser::add(std::shared_ptr<QPDFObject>&& obj)
  381 +{
  382 + if (frame->state != st_dictionary_value) {
  383 + // If state is st_dictionary_key then there is a missing key. Push onto olist for
  384 + // processing once the tt_dict_close token has been found.
  385 + frame->olist.emplace_back(std::move(obj));
  386 + } else {
  387 + if (auto res = frame->dict.insert_or_assign(frame->key, std::move(obj)); !res.second) {
  388 + warnDuplicateKey();
  389 + }
  390 + frame->state = st_dictionary_key;
  391 + }
  392 +}
299 393  
300   - case st_top:
301   - done = true;
302   - break;
  394 +void
  395 +QPDFParser::addNull()
  396 +{
  397 + const static ObjectPtr null_obj = QPDF_Null::create();
303 398  
304   - case st_start:
305   - break;
  399 + if (frame->state != st_dictionary_value) {
  400 + // If state is st_dictionary_key then there is a missing key. Push onto olist for
  401 + // processing once the tt_dict_close token has been found.
  402 + frame->olist.emplace_back(null_obj);
  403 + } else {
  404 + if (auto res = frame->dict.insert_or_assign(frame->key, null_obj); !res.second) {
  405 + warnDuplicateKey();
  406 + }
  407 + frame->state = st_dictionary_key;
  408 + }
  409 + ++frame->null_count;
  410 +}
306 411  
307   - case st_stop:
308   - if ((state_stack.size() < 2) || (stack.size() < 2)) {
309   - throw std::logic_error("QPDFParser::parseInternal: st_stop encountered with "
310   - "insufficient elements in stack");
311   - }
312   - parser_state_e old_state = state_stack.back();
313   - state_stack.pop_back();
314   - if (old_state == st_array) {
315   - object = QPDF_Array::create(std::move(olist), frame.null_count > 100);
316   - setDescription(object, offset - 1);
317   - // The `offset` points to the next of "[". Set the rewind offset to point to the
318   - // beginning of "[". This has been explicitly tested with whitespace surrounding the
319   - // array start delimiter. getLastOffset points to the array end token and therefore
320   - // can't be used here.
321   - set_offset = true;
322   - } else if (old_state == st_dictionary) {
323   - // Convert list to map. Alternating elements are keys. Attempt to recover more or
324   - // less gracefully from invalid dictionaries.
325   - std::set<std::string> names;
326   - for (auto& obj: olist) {
327   - if (obj) {
328   - if (obj->getTypeCode() == ::ot_name) {
329   - names.insert(obj->getStringValue());
330   - }
331   - }
332   - }
  412 +void
  413 +QPDFParser::addInt(int count)
  414 +{
  415 + auto obj = QPDF_Integer::create(int_buffer[count % 2]);
  416 + obj->setDescription(context, description, last_offset_buffer[count % 2]);
  417 + add(std::move(obj));
  418 +}
333 419  
334   - std::map<std::string, QPDFObjectHandle> dict;
335   - int next_fake_key = 1;
336   - for (auto iter = olist.begin(); iter != olist.end();) {
337   - // Calculate key.
338   - std::string key;
339   - if (*iter && (*iter)->getTypeCode() == ::ot_name) {
340   - key = (*iter)->getStringValue();
341   - ++iter;
342   - } else {
343   - for (bool found_fake = false; !found_fake;) {
344   - key = "/QPDFFake" + std::to_string(next_fake_key++);
345   - found_fake = (names.count(key) == 0);
346   - QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1));
347   - }
348   - warn(
349   - offset,
350   - "expected dictionary key but found non-name object; inserting key " +
351   - key);
352   - }
353   - if (dict.count(key) > 0) {
354   - QTC::TC("qpdf", "QPDFParser duplicate dict key");
355   - warn(
356   - offset,
357   - "dictionary has duplicated key " + key +
358   - "; last occurrence overrides earlier ones");
359   - }
  420 +template <typename T, typename... Args>
  421 +void
  422 +QPDFParser::addScalar(Args&&... args)
  423 +{
  424 + auto obj = T::create(args...);
  425 + obj->setDescription(context, description, input->getLastOffset());
  426 + add(std::move(obj));
  427 +}
360 428  
361   - // Calculate value.
362   - std::shared_ptr<QPDFObject> val;
363   - if (iter != olist.end()) {
364   - val = *iter;
365   - ++iter;
366   - } else {
367   - QTC::TC("qpdf", "QPDFParser no val for last key");
368   - warn(
369   - offset,
370   - "dictionary ended prematurely; using null as value for last key");
371   - val = QPDF_Null::create();
372   - }
  429 +template <typename T, typename... Args>
  430 +QPDFObjectHandle
  431 +QPDFParser::withDescription(Args&&... args)
  432 +{
  433 + auto obj = T::create(args...);
  434 + obj->setDescription(context, description, start);
  435 + return {obj};
  436 +}
373 437  
374   - dict[std::move(key)] = std::move(val);
375   - }
376   - if (!frame.contents_string.empty() && dict.count("/Type") &&
377   - dict["/Type"].isNameAndEquals("/Sig") && dict.count("/ByteRange") &&
378   - dict.count("/Contents") && dict["/Contents"].isString()) {
379   - dict["/Contents"] = QPDFObjectHandle::newString(frame.contents_string);
380   - dict["/Contents"].setParsedOffset(frame.contents_offset);
381   - }
382   - object = QPDF_Dictionary::create(std::move(dict));
383   - setDescription(object, offset - 2);
384   - // The `offset` points to the next of "<<". Set the rewind offset to point to the
385   - // beginning of "<<". This has been explicitly tested with whitespace surrounding
386   - // the dictionary start delimiter. getLastOffset points to the dictionary end token
387   - // and therefore can't be used here.
388   - set_offset = true;
389   - }
390   - stack.pop_back();
391   - if (state_stack.back() == st_top) {
392   - done = true;
393   - } else {
394   - stack.back().olist.push_back(object);
395   - }
396   - }
  438 +void
  439 +QPDFParser::setDescription(ObjectPtr& obj, qpdf_offset_t parsed_offset)
  440 +{
  441 + if (obj) {
  442 + obj->setDescription(context, description, parsed_offset);
397 443 }
  444 +}
398 445  
399   - if (is_null) {
400   - object = QPDF_Null::create();
  446 +void
  447 +QPDFParser::fixMissingKeys()
  448 +{
  449 + std::set<std::string> names;
  450 + for (auto& obj: frame->olist) {
  451 + if (obj->getTypeCode() == ::ot_name) {
  452 + names.insert(obj->getStringValue());
  453 + }
401 454 }
402   - if (!set_offset) {
403   - setDescription(object, offset);
  455 + int next_fake_key = 1;
  456 + for (auto const& item: frame->olist) {
  457 + while (true) {
  458 + const std::string key = "/QPDFFake" + std::to_string(next_fake_key++);
  459 + const bool found_fake = frame->dict.count(key) == 0 && names.count(key) == 0;
  460 + QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1));
  461 + if (found_fake) {
  462 + warn(
  463 + frame->offset,
  464 + "expected dictionary key but found non-name object; inserting key " + key);
  465 + frame->dict[key] = item;
  466 + break;
  467 + }
  468 + }
404 469 }
405   - return object;
406 470 }
407 471  
408   -void
409   -QPDFParser::setDescription(std::shared_ptr<QPDFObject>& obj, qpdf_offset_t parsed_offset)
  472 +bool
  473 +QPDFParser::tooManyBadTokens()
410 474 {
411   - if (obj) {
412   - obj->setDescription(context, description, parsed_offset);
  475 + if (good_count <= 4) {
  476 + if (++bad_count > 5) {
  477 + warn("too many errors; giving up on reading object");
  478 + return true;
  479 + }
  480 + } else {
  481 + bad_count = 1;
413 482 }
  483 + good_count = 0;
  484 + return false;
414 485 }
415 486  
416 487 void
... ... @@ -427,6 +498,15 @@ QPDFParser::warn(QPDFExc const& e) const
427 498 }
428 499  
429 500 void
  501 +QPDFParser::warnDuplicateKey()
  502 +{
  503 + QTC::TC("qpdf", "QPDFParser duplicate dict key");
  504 + warn(
  505 + frame->offset,
  506 + "dictionary has duplicated key " + frame->key + "; last occurrence overrides earlier ones");
  507 +}
  508 +
  509 +void
430 510 QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const
431 511 {
432 512 warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(), object_description, offset, msg));
... ...
libqpdf/qpdf/QPDFParser.hh
... ... @@ -31,11 +31,44 @@ class QPDFParser
31 31 QPDFObjectHandle parse(bool& empty, bool content_stream);
32 32  
33 33 private:
34   - enum parser_state_e { st_top, st_start, st_stop, st_eof, st_dictionary, st_array };
  34 + // Parser state. Note:
  35 + // state <= st_dictionary_value == (state == st_dictionary_key || state == st_dictionary_value)
  36 + enum parser_state_e { st_dictionary_key, st_dictionary_value, st_array };
35 37  
  38 + struct StackFrame
  39 + {
  40 + StackFrame(std::shared_ptr<InputSource> const& input, parser_state_e state) :
  41 + state(state),
  42 + offset(input->tell())
  43 + {
  44 + }
  45 +
  46 + std::vector<std::shared_ptr<QPDFObject>> olist;
  47 + std::map<std::string, QPDFObjectHandle> dict;
  48 + parser_state_e state;
  49 + std::string key;
  50 + qpdf_offset_t offset;
  51 + std::string contents_string;
  52 + qpdf_offset_t contents_offset{-1};
  53 + int null_count{0};
  54 + };
  55 +
  56 + QPDFObjectHandle parseRemainder(bool content_stream);
  57 + void add(std::shared_ptr<QPDFObject>&& obj);
  58 + void addNull();
  59 + void addInt(int count);
  60 + template <typename T, typename... Args>
  61 + void addScalar(Args&&... args);
  62 + bool tooManyBadTokens();
  63 + void warnDuplicateKey();
  64 + void fixMissingKeys();
36 65 void warn(qpdf_offset_t offset, std::string const& msg) const;
37 66 void warn(std::string const& msg) const;
38 67 void warn(QPDFExc const&) const;
  68 + template <typename T, typename... Args>
  69 + // Create a new scalar object complete with parsed offset and description.
  70 + // NB the offset includes any leading whitespace.
  71 + QPDFObjectHandle withDescription(Args&&... args);
39 72 void setDescription(std::shared_ptr<QPDFObject>& obj, qpdf_offset_t parsed_offset);
40 73 std::shared_ptr<InputSource> input;
41 74 std::string const& object_description;
... ... @@ -43,6 +76,18 @@ class QPDFParser
43 76 QPDFObjectHandle::StringDecrypter* decrypter;
44 77 QPDF* context;
45 78 std::shared_ptr<QPDFValue::Description> description;
  79 + std::vector<StackFrame> stack;
  80 + StackFrame* frame;
  81 + // Number of recent bad tokens.
  82 + int bad_count = 0;
  83 + // Number of good tokens since last bad token. Irrelevant if bad_count == 0.
  84 + int good_count = 0;
  85 + // Start offset including any leading whitespace.
  86 + qpdf_offset_t start;
  87 + // Number of successive integer tokens.
  88 + int int_count = 0;
  89 + long long int_buffer[2]{0, 0};
  90 + qpdf_offset_t last_offset_buffer[2]{0, 0};
46 91 };
47 92  
48 93 #endif // QPDFPARSER_HH
... ...
qpdf/qpdf.testcov
... ... @@ -57,11 +57,14 @@ QPDF trailer lacks size 0
57 57 QPDF trailer size not integer 0
58 58 QPDF trailer prev not integer 0
59 59 QPDFParser bad brace 0
  60 +QPDFParser bad brace in parseRemainder 0
60 61 QPDFParser bad array close 0
  62 +QPDFParser bad array close in parseRemainder 0
61 63 QPDF stream without length 0
62 64 QPDF stream length not integer 0
63 65 QPDF missing endstream 0
64 66 QPDFParser bad dictionary close 0
  67 +QPDFParser bad dictionary close in parseRemainder 0
65 68 QPDF can't find xref 0
66 69 QPDFTokenizer bad ) 0
67 70 QPDFTokenizer bad > 0
... ... @@ -258,6 +261,7 @@ QPDFParser indirect with 0 objid 0
258 261 QPDF object id 0 0
259 262 QPDF recursion loop in resolve 0
260 263 QPDFParser treat word as string 0
  264 +QPDFParser treat word as string in parseRemainder 0
261 265 QPDFParser found fake 1
262 266 QPDFParser no val for last key 0
263 267 QPDF resolve failure to null 0
... ... @@ -289,7 +293,9 @@ QPDFObjectHandle coalesce called on stream 0
289 293 QPDFObjectHandle coalesce provide stream data 0
290 294 QPDF_Stream bad token at end during normalize 0
291 295 QPDFParser bad token in parse 0
  296 +QPDFParser bad token in parseRemainder 0
292 297 QPDFParser eof in parse 0
  298 +QPDFParser eof in parseRemainder 0
293 299 QPDFObjectHandle array bounds 0
294 300 QPDFObjectHandle boolean returning false 0
295 301 QPDFObjectHandle integer returning 0 0
... ...
qpdf/qtest/parsing.test
... ... @@ -17,7 +17,7 @@ my $td = new TestDriver('parsing');
17 17 my $n_tests = 17;
18 18  
19 19 $td->runtest("parse objects from string",
20   - {$td->COMMAND => "test_driver 31 good1.qdf"},
  20 + {$td->COMMAND => "test_driver 31 bad39.qdf"},
21 21 {$td->FILE => "parse-object.out", $td->EXIT_STATUS => 0},
22 22 $td->NORMALIZE_NEWLINES);
23 23 $td->runtest("EOF terminating literal tokens",
... ...
qpdf/qtest/qpdf/bad16-recover.out
1 1 WARNING: bad16.pdf (trailer, offset 753): unexpected dictionary close token
2 2 WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token
3 3 WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string
4   -WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
5 4 WARNING: bad16.pdf (trailer, offset 779): parse error while reading object
  5 +WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
6 6 WARNING: bad16.pdf: file is damaged
7 7 WARNING: bad16.pdf (offset 712): expected trailer dictionary
8 8 WARNING: bad16.pdf: Attempting to reconstruct cross-reference table
9 9 WARNING: bad16.pdf (trailer, offset 753): unexpected dictionary close token
10 10 WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token
11 11 WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string
12   -WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
13 12 WARNING: bad16.pdf (trailer, offset 779): parse error while reading object
  13 +WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
14 14 bad16.pdf: unable to find trailer dictionary while recovering damaged file
... ...
qpdf/qtest/qpdf/bad16.out
1 1 WARNING: bad16.pdf (trailer, offset 753): unexpected dictionary close token
2 2 WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token
3 3 WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string
4   -WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
5 4 WARNING: bad16.pdf (trailer, offset 779): parse error while reading object
  5 +WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
6 6 bad16.pdf (offset 712): expected trailer dictionary
... ...
qpdf/qtest/qpdf/bad36-recover.out
1 1 WARNING: bad36.pdf (trailer, offset 764): unknown token while reading object; treating as string
2   -WARNING: bad36.pdf (trailer, offset 715): expected dictionary key but found non-name object; inserting key /QPDFFake2
3 2 WARNING: bad36.pdf (trailer, offset 715): dictionary ended prematurely; using null as value for last key
  3 +WARNING: bad36.pdf (trailer, offset 715): expected dictionary key but found non-name object; inserting key /QPDFFake2
4 4 /QTest is implicit
5 5 /QTest is direct and has type null (2)
6 6 /QTest is null
... ...
qpdf/qtest/qpdf/bad36.out
1 1 WARNING: bad36.pdf (trailer, offset 764): unknown token while reading object; treating as string
2   -WARNING: bad36.pdf (trailer, offset 715): expected dictionary key but found non-name object; inserting key /QPDFFake2
3 2 WARNING: bad36.pdf (trailer, offset 715): dictionary ended prematurely; using null as value for last key
  3 +WARNING: bad36.pdf (trailer, offset 715): expected dictionary key but found non-name object; inserting key /QPDFFake2
4 4 /QTest is implicit
5 5 /QTest is direct and has type null (2)
6 6 /QTest is null
... ...
qpdf/qtest/qpdf/bad39.qdf 0 → 100644
  1 +%PDF-1.3
  2 +%¿÷¢þ
  3 +%QDF-1.0
  4 +
  5 +%% Original object ID: 1 0
  6 +1 0 obj
  7 +<<
  8 + /Pages 2 0 R
  9 + /Type /Catalog
  10 +>>
  11 +endobj
  12 +
  13 +%% Original object ID: 2 0
  14 +2 0 obj
  15 +<<
  16 + /Count 1
  17 + /Kids [
  18 + 3 0 R
  19 + ]
  20 + /Type /Pages
  21 +>>
  22 +endobj
  23 +
  24 +%% Page 1
  25 +%% Original object ID: 3 0
  26 +3 0 obj
  27 +<<
  28 + /Contents 4 0 R
  29 + /MediaBox [
  30 + 0
  31 + 0
  32 + 612
  33 + 792
  34 + ]
  35 + /Parent 2 0 R
  36 + /Resources <<
  37 + /Font <<
  38 + /F1 6 0 R
  39 + >>
  40 + /ProcSet 7 0 R
  41 + >>
  42 + /Type /Page
  43 +>>
  44 +endobj
  45 +
  46 +%% Contents for page 1
  47 +%% Original object ID: 4 0
  48 +4 0 obj
  49 +<<
  50 + /Length 5 0 R
  51 +>>
  52 +stream
  53 +BT
  54 + /F1 24 Tf
  55 + 72 720 Td
  56 + (Potato) Tj
  57 +ET
  58 +endstream
  59 +endobj
  60 +
  61 +5 0 obj
  62 +44
  63 +endobj
  64 +
  65 +%% Original object ID: 6 0
  66 +6 0 obj
  67 +<<
  68 + /BaseFont /Helvetica
  69 + /Encoding /WinAnsiEncoding
  70 + /Name /F1
  71 + /Subtype /Type1
  72 + /Type /Font
  73 +>>
  74 +endobj
  75 +
  76 +%% Original object ID: 5 0
  77 +7 0 obj
  78 +[
  79 + /PDF
  80 + /Text
  81 +]
  82 +endobj
  83 +
  84 +xref
  85 +0 8
  86 +0000000000 65535 f
  87 +0000000052 00000 n
  88 +0000000133 00000 n
  89 +0000000242 00000 n
  90 +0000000484 00000 n
  91 +0000000583 00000 n
  92 +0000000629 00000 n
  93 +0000001113 00000 n
  94 +trailer <<
  95 + /Root 1 0 R
  96 + /Size 8
  97 + /ID [<31415926535897932384626433832795><31415926535897932384626433832795>]
  98 +>>
  99 +startxref
  100 +809
  101 +%%EOF
  102 +7 0 obj
... ...
qpdf/qtest/qpdf/issue-335a.out
... ... @@ -51,6 +51,7 @@ WARNING: issue-335a.pdf (trailer, offset 563): unexpected )
51 51 WARNING: issue-335a.pdf (trailer, offset 596): unexpected )
52 52 WARNING: issue-335a.pdf (trailer, offset 597): name with stray # will not work with PDF >= 1.2
53 53 WARNING: issue-335a.pdf (trailer, offset 600): unexpected )
  54 +WARNING: issue-335a.pdf (trailer, offset 134): dictionary has duplicated key /L
54 55 WARNING: issue-335a.pdf (trailer, offset 601): unexpected )
55 56 WARNING: issue-335a.pdf (trailer, offset 648): unexpected )
56 57 WARNING: issue-335a.pdf (trailer, offset 649): name with stray # will not work with PDF >= 1.2
... ... @@ -74,6 +75,7 @@ WARNING: issue-335a.pdf (trailer, offset 563): unexpected )
74 75 WARNING: issue-335a.pdf (trailer, offset 596): unexpected )
75 76 WARNING: issue-335a.pdf (trailer, offset 597): name with stray # will not work with PDF >= 1.2
76 77 WARNING: issue-335a.pdf (trailer, offset 600): unexpected )
  78 +WARNING: issue-335a.pdf (trailer, offset 164): dictionary has duplicated key /L
77 79 WARNING: issue-335a.pdf (trailer, offset 601): unexpected )
78 80 WARNING: issue-335a.pdf (trailer, offset 648): unexpected )
79 81 WARNING: issue-335a.pdf (trailer, offset 649): name with stray # will not work with PDF >= 1.2
... ... @@ -97,6 +99,7 @@ WARNING: issue-335a.pdf (trailer, offset 563): unexpected )
97 99 WARNING: issue-335a.pdf (trailer, offset 596): unexpected )
98 100 WARNING: issue-335a.pdf (trailer, offset 597): name with stray # will not work with PDF >= 1.2
99 101 WARNING: issue-335a.pdf (trailer, offset 600): unexpected )
  102 +WARNING: issue-335a.pdf (trailer, offset 231): dictionary has duplicated key /L
100 103 WARNING: issue-335a.pdf (trailer, offset 601): unexpected )
101 104 WARNING: issue-335a.pdf (trailer, offset 648): unexpected )
102 105 WARNING: issue-335a.pdf (trailer, offset 649): name with stray # will not work with PDF >= 1.2
... ... @@ -448,6 +451,7 @@ WARNING: issue-335a.pdf (trailer, offset 1168): unexpected )
448 451 WARNING: issue-335a.pdf (trailer, offset 1328): unexpected )
449 452 WARNING: issue-335a.pdf (trailer, offset 1329): name with stray # will not work with PDF >= 1.2
450 453 WARNING: issue-335a.pdf (trailer, offset 1332): unexpected )
  454 +WARNING: issue-335a.pdf (trailer, offset 1033): dictionary has duplicated key /L
451 455 WARNING: issue-335a.pdf (trailer, offset 1333): unexpected )
452 456 WARNING: issue-335a.pdf (trailer, offset 1344): unexpected )
453 457 WARNING: issue-335a.pdf (trailer, offset 1428): unexpected )
... ...
qpdf/qtest/qpdf/parse-object.out
1 1 [ /name 16059 3.14159 false << /key true /other [ (string1) (string2) ] >> null ]
2   -logic error parsing indirect: QPDFObjectHandle::parse called without context on an object with indirect references
  2 +logic error parsing indirect: QPDFParser::parse called without context on an object with indirect references
3 3 trailing data: parsed object (trailing test): trailing data found parsing object from string
4 4 WARNING: parsed object (offset 9): unknown token while reading object; treating as string
  5 +WARNING: parsed object: treating unexpected brace token as null
  6 +WARNING: parsed object: treating unexpected brace token as null
  7 +WARNING: parsed object: unexpected dictionary close token
  8 +WARNING: bad39.qdf (object 7 0, offset 1121): unexpected EOF
  9 +WARNING: bad39.qdf (object 7 0, offset 1121): expected endobj
  10 +WARNING: bad39.qdf (object 7 0, offset 1121): EOF after endobj
5 11 test 31 done
... ...
qpdf/test_driver.cc
... ... @@ -1195,6 +1195,13 @@ test_31(QPDF& pdf, char const* arg2)
1195 1195 // mistakenly parsed as an indirect object.
1196 1196 assert(QPDFObjectHandle::parse(&pdf, "[5 0 R 0 R /X]").unparse() == "[ 5 0 R 0 (R) /X ]");
1197 1197 assert(QPDFObjectHandle::parse(&pdf, "[1 0 R]", "indirect test").unparse() == "[ 1 0 R ]");
  1198 + // TC:QPDFParser bad brace
  1199 + assert(QPDFObjectHandle::parse(&pdf, "}").unparse() == "null");
  1200 + assert(QPDFObjectHandle::parse(&pdf, "{").unparse() == "null");
  1201 + // TC:QPDFParser bad dictionary close
  1202 + assert(QPDFObjectHandle::parse(&pdf, ">>").unparse() == "null");
  1203 + // TC:QPDFParser eof in parse
  1204 + assert(QPDFObjectHandle::parse(&pdf, "[7 0 R]").getArrayItem(0).isNull());
1198 1205 }
1199 1206  
1200 1207 static void
... ...