Commit 51932fd91bca1cf2a155ac5c376a13dbd71da546

Authored by m-holger
Committed by GitHub
2 parents 843e2b45 44cd31b9

Merge pull request #1651 from m-holger/parser

Refactor QPDFParser
include/qpdf/QPDFObjectHandle.hh
... ... @@ -61,11 +61,14 @@ class QPDFTokenizer;
61 61 class QPDFExc;
62 62 class Pl_QPDFTokenizer;
63 63 class QPDFMatrix;
64   -class QPDFParser;
  64 +namespace qpdf::impl
  65 +{
  66 + class Parser;
  67 +}
65 68  
66 69 class QPDFObjectHandle: public qpdf::BaseHandle
67 70 {
68   - friend class QPDFParser;
  71 + friend class qpdf::impl::Parser;
69 72  
70 73 public:
71 74 // This class is used by replaceStreamData. It provides an alternative way of associating
... ...
include/qpdf/QPDFTokenizer.hh
... ... @@ -31,6 +31,10 @@
31 31 namespace qpdf
32 32 {
33 33 class Tokenizer;
  34 + namespace impl
  35 + {
  36 + class Parser;
  37 + }
34 38 } // namespace qpdf
35 39  
36 40 class QPDFTokenizer
... ... @@ -203,7 +207,7 @@ class QPDFTokenizer
203 207 void expectInlineImage(InputSource& input);
204 208  
205 209 private:
206   - friend class QPDFParser;
  210 + friend class qpdf::impl::Parser;
207 211  
208 212 QPDFTokenizer(QPDFTokenizer const&) = delete;
209 213 QPDFTokenizer& operator=(QPDFTokenizer const&) = delete;
... ...
libqpdf/QPDFObjectHandle.cc
... ... @@ -25,6 +25,8 @@
25 25 using namespace std::literals;
26 26 using namespace qpdf;
27 27  
  28 +using Parser = impl::Parser;
  29 +
28 30 const Null Null::temp_;
29 31  
30 32 BaseHandle::
... ... @@ -1540,7 +1542,7 @@ QPDFObjectHandle::parse(
1540 1542 QPDF* context, std::string const& object_str, std::string const& object_description)
1541 1543 {
1542 1544 auto input = is::OffsetBuffer("parsed object", object_str);
1543   - auto result = QPDFParser::parse(input, object_description, context);
  1545 + auto result = Parser::parse(input, object_description, context);
1544 1546 size_t offset = QIntC::to_size(input.tell());
1545 1547 while (offset < object_str.length()) {
1546 1548 if (!isspace(object_str.at(offset))) {
... ... @@ -1661,7 +1663,7 @@ QPDFObjectHandle::parseContentStream_data(
1661 1663 auto input = is::OffsetBuffer(description, stream_data);
1662 1664 Tokenizer tokenizer;
1663 1665 tokenizer.allowEOF();
1664   - auto sp_description = QPDFParser::make_description(description, "content");
  1666 + auto sp_description = Parser::make_description(description, "content");
1665 1667 while (QIntC::to_size(input.tell()) < stream_length) {
1666 1668 // Read a token and seek to the beginning. The offset we get from this process is the
1667 1669 // beginning of the next non-ignorable (space, comment) token. This way, the offset and
... ... @@ -1669,7 +1671,7 @@ QPDFObjectHandle::parseContentStream_data(
1669 1671 tokenizer.nextToken(input, "content", true);
1670 1672 qpdf_offset_t offset = input.getLastOffset();
1671 1673 input.seek(offset, SEEK_SET);
1672   - auto obj = QPDFParser::parse_content(input, sp_description, tokenizer, context);
  1674 + auto obj = Parser::parse_content(input, sp_description, tokenizer, context);
1673 1675 if (!obj) {
1674 1676 // EOF
1675 1677 break;
... ... @@ -1678,7 +1680,7 @@ QPDFObjectHandle::parseContentStream_data(
1678 1680 if (callbacks) {
1679 1681 callbacks->handleObject(obj, QIntC::to_size(offset), length);
1680 1682 }
1681   - if (obj.isOperator() && (obj.getOperatorValue() == "ID")) {
  1683 + if (obj.isOperator() && obj.getOperatorValue() == "ID") {
1682 1684 // Discard next character; it is the space after ID that terminated the token. Read
1683 1685 // until end of inline image.
1684 1686 char ch;
... ... @@ -1731,7 +1733,7 @@ QPDFObjectHandle::parse(
1731 1733 StringDecrypter* decrypter,
1732 1734 QPDF* context)
1733 1735 {
1734   - return QPDFParser::parse(*input, object_description, tokenizer, empty, decrypter, context);
  1736 + return Parser::parse(*input, object_description, tokenizer, empty, decrypter, context);
1735 1737 }
1736 1738  
1737 1739 qpdf_offset_t
... ...
libqpdf/QPDFParser.cc
... ... @@ -46,12 +46,13 @@ class QPDF::Doc::ParseGuard
46 46 };
47 47  
48 48 using ParseGuard = QPDF::Doc::ParseGuard;
  49 +using Parser = qpdf::impl::Parser;
49 50  
50 51 QPDFObjectHandle
51   -QPDFParser::parse(InputSource& input, std::string const& object_description, QPDF* context)
  52 +Parser::parse(InputSource& input, std::string const& object_description, QPDF* context)
52 53 {
53 54 qpdf::Tokenizer tokenizer;
54   - if (auto result = QPDFParser(
  55 + if (auto result = Parser(
55 56 input,
56 57 make_description(input.getName(), object_description),
57 58 object_description,
... ... @@ -66,14 +67,14 @@ QPDFParser::parse(InputSource& input, std::string const& object_description, QPD
66 67 }
67 68  
68 69 QPDFObjectHandle
69   -QPDFParser::parse_content(
  70 +Parser::parse_content(
70 71 InputSource& input,
71 72 std::shared_ptr<QPDFObject::Description> sp_description,
72 73 qpdf::Tokenizer& tokenizer,
73 74 QPDF* context)
74 75 {
75 76 static const std::string content("content"); // GCC12 - make constexpr
76   - auto p = QPDFParser(
  77 + auto p = Parser(
77 78 input,
78 79 std::move(sp_description),
79 80 content,
... ... @@ -93,7 +94,7 @@ QPDFParser::parse_content(
93 94 }
94 95  
95 96 QPDFObjectHandle
96   -QPDFParser::parse(
  97 +Parser::parse(
97 98 InputSource& input,
98 99 std::string const& object_description,
99 100 QPDFTokenizer& tokenizer,
... ... @@ -103,7 +104,7 @@ QPDFParser::parse(
103 104 {
104 105 // ABI: This parse overload is only used by the deprecated QPDFObjectHandle::parse. It is the
105 106 // only user of the 'empty' member. When removing this overload also remove 'empty'.
106   - auto p = QPDFParser(
  107 + auto p = Parser(
107 108 input,
108 109 make_description(input.getName(), object_description),
109 110 object_description,
... ... @@ -120,7 +121,7 @@ QPDFParser::parse(
120 121 }
121 122  
122 123 QPDFObjectHandle
123   -QPDFParser::parse(
  124 +Parser::parse(
124 125 InputSource& input,
125 126 std::string const& object_description,
126 127 qpdf::Tokenizer& tokenizer,
... ... @@ -128,7 +129,7 @@ QPDFParser::parse(
128 129 QPDF& context,
129 130 bool sanity_checks)
130 131 {
131   - return QPDFParser(
  132 + return Parser(
132 133 input,
133 134 make_description(input.getName(), object_description),
134 135 object_description,
... ... @@ -143,10 +144,10 @@ QPDFParser::parse(
143 144 }
144 145  
145 146 QPDFObjectHandle
146   -QPDFParser::parse(
  147 +Parser::parse(
147 148 is::OffsetBuffer& input, int stream_id, int obj_id, qpdf::Tokenizer& tokenizer, QPDF& context)
148 149 {
149   - return QPDFParser(
  150 + return Parser(
150 151 input,
151 152 std::make_shared<QPDFObject::Description>(
152 153 QPDFObject::ObjStreamDescr(stream_id, obj_id)),
... ... @@ -161,7 +162,7 @@ QPDFParser::parse(
161 162 }
162 163  
163 164 QPDFObjectHandle
164   -QPDFParser::parse(bool content_stream)
  165 +Parser::parse(bool content_stream)
165 166 {
166 167 try {
167 168 return parse_first(content_stream);
... ... @@ -178,20 +179,20 @@ QPDFParser::parse(bool content_stream)
178 179 }
179 180  
180 181 QPDFObjectHandle
181   -QPDFParser::parse_first(bool content_stream)
  182 +Parser::parse_first(bool content_stream)
182 183 {
183 184 // This method must take care not to resolve any objects. Don't check the type of any object
184 185 // without first ensuring that it is a direct object. Otherwise, doing so may have the side
185 186 // effect of reading the object and changing the file pointer. If you do this, it will cause a
186 187 // logic error to be thrown from QPDF::inParse().
187 188  
188   - QPDF::Doc::ParseGuard pg(context);
189   - start = input.tell();
190   - if (!tokenizer.nextToken(input, object_description)) {
191   - warn(tokenizer.getErrorMessage());
  189 + QPDF::Doc::ParseGuard pg(context_);
  190 + start_ = input_.tell();
  191 + if (!tokenizer_.nextToken(input_, object_description_)) {
  192 + warn(tokenizer_.getErrorMessage());
192 193 }
193 194  
194   - switch (tokenizer.getType()) {
  195 + switch (tokenizer_.getType()) {
195 196 case QPDFTokenizer::tt_eof:
196 197 if (content_stream) {
197 198 // In content stream mode, leave object uninitialized to indicate EOF
... ... @@ -219,57 +220,57 @@ QPDFParser::parse_first(bool content_stream)
219 220  
220 221 case QPDFTokenizer::tt_array_open:
221 222 case QPDFTokenizer::tt_dict_open:
222   - stack.clear();
223   - stack.emplace_back(
224   - input,
225   - (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array : st_dictionary_key);
226   - frame = &stack.back();
227   - return parseRemainder(content_stream);
  223 + stack_.clear();
  224 + stack_.emplace_back(
  225 + input_,
  226 + (tokenizer_.getType() == QPDFTokenizer::tt_array_open) ? st_array : st_dictionary_key);
  227 + frame_ = &stack_.back();
  228 + return parse_remainder(content_stream);
228 229  
229 230 case QPDFTokenizer::tt_bool:
230   - return withDescription<QPDF_Bool>(tokenizer.getValue() == "true");
  231 + return with_description<QPDF_Bool>(tokenizer_.getValue() == "true");
231 232  
232 233 case QPDFTokenizer::tt_null:
233 234 return {QPDFObject::create<QPDF_Null>()};
234 235  
235 236 case QPDFTokenizer::tt_integer:
236   - return withDescription<QPDF_Integer>(QUtil::string_to_ll(tokenizer.getValue().c_str()));
  237 + return with_description<QPDF_Integer>(QUtil::string_to_ll(tokenizer_.getValue().c_str()));
237 238  
238 239 case QPDFTokenizer::tt_real:
239   - return withDescription<QPDF_Real>(tokenizer.getValue());
  240 + return with_description<QPDF_Real>(tokenizer_.getValue());
240 241  
241 242 case QPDFTokenizer::tt_name:
242   - return withDescription<QPDF_Name>(tokenizer.getValue());
  243 + return with_description<QPDF_Name>(tokenizer_.getValue());
243 244  
244 245 case QPDFTokenizer::tt_word:
245 246 {
246   - auto const& value = tokenizer.getValue();
  247 + auto const& value = tokenizer_.getValue();
247 248 if (content_stream) {
248   - return withDescription<QPDF_Operator>(value);
  249 + return with_description<QPDF_Operator>(value);
249 250 } else if (value == "endobj") {
250 251 // We just saw endobj without having read anything. Nothing in the PDF spec appears
251 252 // to allow empty objects, but they have been encountered in actual PDF files and
252 253 // Adobe Reader appears to ignore them. Treat this as a null and do not move the
253 254 // input source's offset.
254 255 empty_ = true;
255   - input.seek(input.getLastOffset(), SEEK_SET);
  256 + input_.seek(input_.getLastOffset(), SEEK_SET);
256 257 if (!content_stream) {
257 258 warn("empty object treated as null");
258 259 }
259 260 return {};
260 261 } else {
261 262 warn("unknown token while reading object; treating as string");
262   - return withDescription<QPDF_String>(value);
  263 + return with_description<QPDF_String>(value);
263 264 }
264 265 }
265 266  
266 267 case QPDFTokenizer::tt_string:
267   - if (decrypter) {
268   - std::string s{tokenizer.getValue()};
269   - decrypter->decryptString(s);
270   - return withDescription<QPDF_String>(s);
  268 + if (decrypter_) {
  269 + std::string s{tokenizer_.getValue()};
  270 + decrypter_->decryptString(s);
  271 + return with_description<QPDF_String>(s);
271 272 } else {
272   - return withDescription<QPDF_String>(tokenizer.getValue());
  273 + return with_description<QPDF_String>(tokenizer_.getValue());
273 274 }
274 275  
275 276 default:
... ... @@ -279,65 +280,65 @@ QPDFParser::parse_first(bool content_stream)
279 280 }
280 281  
281 282 QPDFObjectHandle
282   -QPDFParser::parseRemainder(bool content_stream)
  283 +Parser::parse_remainder(bool content_stream)
283 284 {
284 285 // This method must take care not to resolve any objects. Don't check the type of any object
285 286 // without first ensuring that it is a direct object. Otherwise, doing so may have the side
286 287 // effect of reading the object and changing the file pointer. If you do this, it will cause a
287 288 // logic error to be thrown from QPDF::inParse().
288 289  
289   - bad_count = 0;
  290 + bad_count_ = 0;
290 291 bool b_contents = false;
291 292  
292 293 while (true) {
293   - if (!tokenizer.nextToken(input, object_description)) {
294   - warn(tokenizer.getErrorMessage());
  294 + if (!tokenizer_.nextToken(input_, object_description_)) {
  295 + warn(tokenizer_.getErrorMessage());
295 296 }
296   - ++good_count; // optimistically
  297 + ++good_count_; // optimistically
297 298  
298   - if (int_count != 0) {
  299 + if (int_count_ != 0) {
299 300 // Special handling of indirect references. Treat integer tokens as part of an indirect
300 301 // reference until proven otherwise.
301   - if (tokenizer.getType() == QPDFTokenizer::tt_integer) {
302   - if (++int_count > 2) {
  302 + if (tokenizer_.getType() == QPDFTokenizer::tt_integer) {
  303 + if (++int_count_ > 2) {
303 304 // Process the oldest buffered integer.
304   - addInt(int_count);
  305 + add_int(int_count_);
305 306 }
306   - last_offset_buffer[int_count % 2] = input.getLastOffset();
307   - int_buffer[int_count % 2] = QUtil::string_to_ll(tokenizer.getValue().c_str());
  307 + last_offset_buffer_[int_count_ % 2] = input_.getLastOffset();
  308 + int_buffer_[int_count_ % 2] = QUtil::string_to_ll(tokenizer_.getValue().c_str());
308 309 continue;
309 310  
310 311 } else if (
311   - int_count >= 2 && tokenizer.getType() == QPDFTokenizer::tt_word &&
312   - tokenizer.getValue() == "R") {
313   - if (!context) {
  312 + int_count_ >= 2 && tokenizer_.getType() == QPDFTokenizer::tt_word &&
  313 + tokenizer_.getValue() == "R") {
  314 + if (!context_) {
314 315 throw std::logic_error(
315   - "QPDFParser::parse called without context on an object with indirect "
  316 + "Parser::parse called without context on an object with indirect "
316 317 "references");
317 318 }
318   - auto id = QIntC::to_int(int_buffer[(int_count - 1) % 2]);
319   - auto gen = QIntC::to_int(int_buffer[(int_count) % 2]);
  319 + auto id = QIntC::to_int(int_buffer_[(int_count_ - 1) % 2]);
  320 + auto gen = QIntC::to_int(int_buffer_[(int_count_) % 2]);
320 321 if (!(id < 1 || gen < 0 || gen >= 65535)) {
321   - add(ParseGuard::getObject(context, id, gen, parse_pdf));
  322 + add(ParseGuard::getObject(context_, id, gen, parse_pdf_));
322 323 } else {
323 324 add_bad_null(
324 325 "treating bad indirect reference (" + std::to_string(id) + " " +
325 326 std::to_string(gen) + " R) as null");
326 327 }
327   - int_count = 0;
  328 + int_count_ = 0;
328 329 continue;
329 330  
330   - } else if (int_count > 0) {
  331 + } else if (int_count_ > 0) {
331 332 // Process the buffered integers before processing the current token.
332   - if (int_count > 1) {
333   - addInt(int_count - 1);
  333 + if (int_count_ > 1) {
  334 + add_int(int_count_ - 1);
334 335 }
335   - addInt(int_count);
336   - int_count = 0;
  336 + add_int(int_count_);
  337 + int_count_ = 0;
337 338 }
338 339 }
339 340  
340   - switch (tokenizer.getType()) {
  341 + switch (tokenizer_.getType()) {
341 342 case QPDFTokenizer::tt_eof:
342 343 warn("parse error while reading object");
343 344 if (content_stream) {
... ... @@ -349,7 +350,7 @@ QPDFParser::parseRemainder(bool content_stream)
349 350  
350 351 case QPDFTokenizer::tt_bad:
351 352 check_too_many_bad_tokens();
352   - addNull();
  353 + add_null();
353 354 continue;
354 355  
355 356 case QPDFTokenizer::tt_brace_open:
... ... @@ -358,23 +359,23 @@ QPDFParser::parseRemainder(bool content_stream)
358 359 continue;
359 360  
360 361 case QPDFTokenizer::tt_array_close:
361   - if (frame->state == st_array) {
362   - auto object = frame->null_count > 100
363   - ? QPDFObject::create<QPDF_Array>(std::move(frame->olist), true)
364   - : QPDFObject::create<QPDF_Array>(std::move(frame->olist));
365   - setDescription(object, frame->offset - 1);
  362 + if (frame_->state == st_array) {
  363 + auto object = frame_->null_count > 100
  364 + ? QPDFObject::create<QPDF_Array>(std::move(frame_->olist), true)
  365 + : QPDFObject::create<QPDF_Array>(std::move(frame_->olist));
  366 + set_description(object, frame_->offset - 1);
366 367 // The `offset` points to the next of "[". Set the rewind offset to point to the
367 368 // beginning of "[". This has been explicitly tested with whitespace surrounding the
368 369 // array start delimiter. getLastOffset points to the array end token and therefore
369 370 // can't be used here.
370   - if (stack.size() <= 1) {
  371 + if (stack_.size() <= 1) {
371 372 return object;
372 373 }
373   - stack.pop_back();
374   - frame = &stack.back();
  374 + stack_.pop_back();
  375 + frame_ = &stack_.back();
375 376 add(std::move(object));
376 377 } else {
377   - if (sanity_checks) {
  378 + if (sanity_checks_) {
378 379 // During sanity checks, assume nesting of containers is corrupt and object is
379 380 // unusable.
380 381 warn("unexpected array close token; giving up on reading object");
... ... @@ -385,46 +386,46 @@ QPDFParser::parseRemainder(bool content_stream)
385 386 continue;
386 387  
387 388 case QPDFTokenizer::tt_dict_close:
388   - if (frame->state <= st_dictionary_value) {
  389 + if (frame_->state <= st_dictionary_value) {
389 390 // Attempt to recover more or less gracefully from invalid dictionaries.
390   - auto& dict = frame->dict;
  391 + auto& dict = frame_->dict;
391 392  
392   - if (frame->state == st_dictionary_value) {
  393 + if (frame_->state == st_dictionary_value) {
393 394 warn(
394   - frame->offset,
  395 + frame_->offset,
395 396 "dictionary ended prematurely; using null as value for last key");
396   - dict[frame->key] = QPDFObject::create<QPDF_Null>();
  397 + dict[frame_->key] = QPDFObject::create<QPDF_Null>();
397 398 }
398   - if (!frame->olist.empty()) {
399   - if (sanity_checks) {
  399 + if (!frame_->olist.empty()) {
  400 + if (sanity_checks_) {
400 401 warn(
401   - frame->offset,
  402 + frame_->offset,
402 403 "expected dictionary keys but found non-name objects; ignoring");
403 404 } else {
404   - fixMissingKeys();
  405 + fix_missing_keys();
405 406 }
406 407 }
407 408  
408   - if (!frame->contents_string.empty() && dict.contains("/Type") &&
  409 + if (!frame_->contents_string.empty() && dict.contains("/Type") &&
409 410 dict["/Type"].isNameAndEquals("/Sig") && dict.contains("/ByteRange") &&
410 411 dict.contains("/Contents") && dict["/Contents"].isString()) {
411   - dict["/Contents"] = QPDFObjectHandle::newString(frame->contents_string);
412   - dict["/Contents"].setParsedOffset(frame->contents_offset);
  412 + dict["/Contents"] = QPDFObjectHandle::newString(frame_->contents_string);
  413 + dict["/Contents"].setParsedOffset(frame_->contents_offset);
413 414 }
414 415 auto object = QPDFObject::create<QPDF_Dictionary>(std::move(dict));
415   - setDescription(object, frame->offset - 2);
  416 + set_description(object, frame_->offset - 2);
416 417 // The `offset` points to the next of "<<". Set the rewind offset to point to the
417 418 // beginning of "<<". This has been explicitly tested with whitespace surrounding
418 419 // the dictionary start delimiter. getLastOffset points to the dictionary end token
419 420 // and therefore can't be used here.
420   - if (stack.size() <= 1) {
  421 + if (stack_.size() <= 1) {
421 422 return object;
422 423 }
423   - stack.pop_back();
424   - frame = &stack.back();
  424 + stack_.pop_back();
  425 + frame_ = &stack_.back();
425 426 add(std::move(object));
426 427 } else {
427   - if (sanity_checks) {
  428 + if (sanity_checks_) {
428 429 // During sanity checks, assume nesting of containers is corrupt and object is
429 430 // unusable.
430 431 warn("unexpected dictionary close token; giving up on reading object");
... ... @@ -436,60 +437,60 @@ QPDFParser::parseRemainder(bool content_stream)
436 437  
437 438 case QPDFTokenizer::tt_array_open:
438 439 case QPDFTokenizer::tt_dict_open:
439   - if (stack.size() > max_nesting) {
  440 + if (stack_.size() > max_nesting) {
440 441 limits_error(
441 442 "parser-max-nesting", "ignoring excessively deeply nested data structure");
442 443 }
443 444 b_contents = false;
444   - stack.emplace_back(
445   - input,
446   - (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array
447   - : st_dictionary_key);
448   - frame = &stack.back();
  445 + stack_.emplace_back(
  446 + input_,
  447 + (tokenizer_.getType() == QPDFTokenizer::tt_array_open) ? st_array
  448 + : st_dictionary_key);
  449 + frame_ = &stack_.back();
449 450 continue;
450 451  
451 452 case QPDFTokenizer::tt_bool:
452   - addScalar<QPDF_Bool>(tokenizer.getValue() == "true");
  453 + add_scalar<QPDF_Bool>(tokenizer_.getValue() == "true");
453 454 continue;
454 455  
455 456 case QPDFTokenizer::tt_null:
456   - addNull();
  457 + add_null();
457 458 continue;
458 459  
459 460 case QPDFTokenizer::tt_integer:
460 461 if (!content_stream) {
461 462 // Buffer token in case it is part of an indirect reference.
462   - last_offset_buffer[1] = input.getLastOffset();
463   - int_buffer[1] = QUtil::string_to_ll(tokenizer.getValue().c_str());
464   - int_count = 1;
  463 + last_offset_buffer_[1] = input_.getLastOffset();
  464 + int_buffer_[1] = QUtil::string_to_ll(tokenizer_.getValue().c_str());
  465 + int_count_ = 1;
465 466 } else {
466   - addScalar<QPDF_Integer>(QUtil::string_to_ll(tokenizer.getValue().c_str()));
  467 + add_scalar<QPDF_Integer>(QUtil::string_to_ll(tokenizer_.getValue().c_str()));
467 468 }
468 469 continue;
469 470  
470 471 case QPDFTokenizer::tt_real:
471   - addScalar<QPDF_Real>(tokenizer.getValue());
  472 + add_scalar<QPDF_Real>(tokenizer_.getValue());
472 473 continue;
473 474  
474 475 case QPDFTokenizer::tt_name:
475   - if (frame->state == st_dictionary_key) {
476   - frame->key = tokenizer.getValue();
477   - frame->state = st_dictionary_value;
478   - b_contents = decrypter && frame->key == "/Contents";
  476 + if (frame_->state == st_dictionary_key) {
  477 + frame_->key = tokenizer_.getValue();
  478 + frame_->state = st_dictionary_value;
  479 + b_contents = decrypter_ && frame_->key == "/Contents";
479 480 continue;
480 481 } else {
481   - addScalar<QPDF_Name>(tokenizer.getValue());
  482 + add_scalar<QPDF_Name>(tokenizer_.getValue());
482 483 }
483 484 continue;
484 485  
485 486 case QPDFTokenizer::tt_word:
486 487 if (content_stream) {
487   - addScalar<QPDF_Operator>(tokenizer.getValue());
  488 + add_scalar<QPDF_Operator>(tokenizer_.getValue());
488 489 continue;
489 490 }
490 491  
491   - if (sanity_checks) {
492   - if (tokenizer.getValue() == "endobj" || tokenizer.getValue() == "endstream") {
  492 + if (sanity_checks_) {
  493 + if (tokenizer_.getValue() == "endobj" || tokenizer_.getValue() == "endstream") {
493 494 // During sanity checks, assume an unexpected endobj or endstream indicates that
494 495 // we are parsing past the end of the object.
495 496 warn(
... ... @@ -504,24 +505,24 @@ QPDFParser::parseRemainder(bool content_stream)
504 505  
505 506 warn("unknown token while reading object; treating as string");
506 507 check_too_many_bad_tokens();
507   - addScalar<QPDF_String>(tokenizer.getValue());
  508 + add_scalar<QPDF_String>(tokenizer_.getValue());
508 509  
509 510 continue;
510 511  
511 512 case QPDFTokenizer::tt_string:
512 513 {
513   - auto const& val = tokenizer.getValue();
514   - if (decrypter) {
  514 + auto const& val = tokenizer_.getValue();
  515 + if (decrypter_) {
515 516 if (b_contents) {
516   - frame->contents_string = val;
517   - frame->contents_offset = input.getLastOffset();
  517 + frame_->contents_string = val;
  518 + frame_->contents_offset = input_.getLastOffset();
518 519 b_contents = false;
519 520 }
520 521 std::string s{val};
521   - decrypter->decryptString(s);
522   - addScalar<QPDF_String>(s);
  522 + decrypter_->decryptString(s);
  523 + add_scalar<QPDF_String>(s);
523 524 } else {
524   - addScalar<QPDF_String>(val);
  525 + add_scalar<QPDF_String>(val);
525 526 }
526 527 }
527 528 continue;
... ... @@ -533,107 +534,107 @@ QPDFParser::parseRemainder(bool content_stream)
533 534 }
534 535  
535 536 void
536   -QPDFParser::add(std::shared_ptr<QPDFObject>&& obj)
  537 +Parser::add(std::shared_ptr<QPDFObject>&& obj)
537 538 {
538   - if (frame->state != st_dictionary_value) {
  539 + if (frame_->state != st_dictionary_value) {
539 540 // If state is st_dictionary_key then there is a missing key. Push onto olist for
540 541 // processing once the tt_dict_close token has been found.
541   - frame->olist.emplace_back(std::move(obj));
  542 + frame_->olist.emplace_back(std::move(obj));
542 543 } else {
543   - if (auto res = frame->dict.insert_or_assign(frame->key, std::move(obj)); !res.second) {
544   - warnDuplicateKey();
  544 + if (auto res = frame_->dict.insert_or_assign(frame_->key, std::move(obj)); !res.second) {
  545 + warn_duplicate_key();
545 546 }
546   - frame->state = st_dictionary_key;
  547 + frame_->state = st_dictionary_key;
547 548 }
548 549 }
549 550  
550 551 void
551   -QPDFParser::addNull()
  552 +Parser::add_null()
552 553 {
553 554 const static ObjectPtr null_obj = QPDFObject::create<QPDF_Null>();
554 555  
555   - if (frame->state != st_dictionary_value) {
  556 + if (frame_->state != st_dictionary_value) {
556 557 // If state is st_dictionary_key then there is a missing key. Push onto olist for
557 558 // processing once the tt_dict_close token has been found.
558   - frame->olist.emplace_back(null_obj);
  559 + frame_->olist.emplace_back(null_obj);
559 560 } else {
560   - if (auto res = frame->dict.insert_or_assign(frame->key, null_obj); !res.second) {
561   - warnDuplicateKey();
  561 + if (auto res = frame_->dict.insert_or_assign(frame_->key, null_obj); !res.second) {
  562 + warn_duplicate_key();
562 563 }
563   - frame->state = st_dictionary_key;
  564 + frame_->state = st_dictionary_key;
564 565 }
565   - ++frame->null_count;
  566 + ++frame_->null_count;
566 567 }
567 568  
568 569 void
569   -QPDFParser::add_bad_null(std::string const& msg)
  570 +Parser::add_bad_null(std::string const& msg)
570 571 {
571 572 warn(msg);
572 573 check_too_many_bad_tokens();
573   - addNull();
  574 + add_null();
574 575 }
575 576  
576 577 void
577   -QPDFParser::addInt(int count)
  578 +Parser::add_int(int count)
578 579 {
579   - auto obj = QPDFObject::create<QPDF_Integer>(int_buffer[count % 2]);
580   - obj->setDescription(context, description, last_offset_buffer[count % 2]);
  580 + auto obj = QPDFObject::create<QPDF_Integer>(int_buffer_[count % 2]);
  581 + obj->setDescription(context_, description_, last_offset_buffer_[count % 2]);
581 582 add(std::move(obj));
582 583 }
583 584  
584 585 template <typename T, typename... Args>
585 586 void
586   -QPDFParser::addScalar(Args&&... args)
  587 +Parser::add_scalar(Args&&... args)
587 588 {
588   - auto limit = Limits::parser_max_container_size(bad_count || sanity_checks);
589   - if (frame->olist.size() >= limit || frame->dict.size() >= limit) {
  589 + auto limit = Limits::parser_max_container_size(bad_count_ || sanity_checks_);
  590 + if (frame_->olist.size() >= limit || frame_->dict.size() >= limit) {
590 591 // Stop adding scalars. We are going to abort when the close token or a bad token is
591 592 // encountered.
592   - max_bad_count = 1;
  593 + max_bad_count_ = 1;
593 594 check_too_many_bad_tokens(); // always throws Error()
594 595 }
595 596 auto obj = QPDFObject::create<T>(std::forward<Args>(args)...);
596   - obj->setDescription(context, description, input.getLastOffset());
  597 + obj->setDescription(context_, description_, input_.getLastOffset());
597 598 add(std::move(obj));
598 599 }
599 600  
600 601 template <typename T, typename... Args>
601 602 QPDFObjectHandle
602   -QPDFParser::withDescription(Args&&... args)
  603 +Parser::with_description(Args&&... args)
603 604 {
604 605 auto obj = QPDFObject::create<T>(std::forward<Args>(args)...);
605   - obj->setDescription(context, description, start);
  606 + obj->setDescription(context_, description_, start_);
606 607 return {obj};
607 608 }
608 609  
609 610 void
610   -QPDFParser::setDescription(ObjectPtr& obj, qpdf_offset_t parsed_offset)
  611 +Parser::set_description(ObjectPtr& obj, qpdf_offset_t parsed_offset)
611 612 {
612 613 if (obj) {
613   - obj->setDescription(context, description, parsed_offset);
  614 + obj->setDescription(context_, description_, parsed_offset);
614 615 }
615 616 }
616 617  
617 618 void
618   -QPDFParser::fixMissingKeys()
  619 +Parser::fix_missing_keys()
619 620 {
620 621 std::set<std::string> names;
621   - for (auto& obj: frame->olist) {
  622 + for (auto& obj: frame_->olist) {
622 623 if (obj.raw_type_code() == ::ot_name) {
623 624 names.insert(obj.obj_sp()->getStringValue());
624 625 }
625 626 }
626 627 int next_fake_key = 1;
627   - for (auto const& item: frame->olist) {
  628 + for (auto const& item: frame_->olist) {
628 629 while (true) {
629 630 const std::string key = "/QPDFFake" + std::to_string(next_fake_key++);
630   - const bool found_fake = !frame->dict.contains(key) && !names.contains(key);
  631 + const bool found_fake = !frame_->dict.contains(key) && !names.contains(key);
631 632 QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1));
632 633 if (found_fake) {
633 634 warn(
634   - frame->offset,
  635 + frame_->offset,
635 636 "expected dictionary key but found non-name object; inserting key " + key);
636   - frame->dict[key] = item;
  637 + frame_->dict[key] = item;
637 638 break;
638 639 }
639 640 }
... ... @@ -641,11 +642,11 @@ QPDFParser::fixMissingKeys()
641 642 }
642 643  
643 644 void
644   -QPDFParser::check_too_many_bad_tokens()
  645 +Parser::check_too_many_bad_tokens()
645 646 {
646   - auto limit = Limits::parser_max_container_size(bad_count || sanity_checks);
647   - if (frame->olist.size() >= limit || frame->dict.size() >= limit) {
648   - if (bad_count) {
  647 + auto limit = Limits::parser_max_container_size(bad_count_ || sanity_checks_);
  648 + if (frame_->olist.size() >= limit || frame_->dict.size() >= limit) {
  649 + if (bad_count_) {
649 650 limits_error(
650 651 "parser-max-container-size-damaged",
651 652 "encountered errors while parsing an array or dictionary with more than " +
... ... @@ -656,27 +657,27 @@ QPDFParser::check_too_many_bad_tokens()
656 657 "encountered an array or dictionary with more than " + std::to_string(limit) +
657 658 " elements during xref recovery; giving up on reading object");
658 659 }
659   - if (max_bad_count && --max_bad_count == 0) {
  660 + if (max_bad_count_ && --max_bad_count_ == 0) {
660 661 limits_error(
661 662 "parser-max-errors", "too many errors during parsing; treating object as null");
662 663 }
663   - if (good_count > 4) {
664   - good_count = 0;
665   - bad_count = 1;
  664 + if (good_count_ > 4) {
  665 + good_count_ = 0;
  666 + bad_count_ = 1;
666 667 return;
667 668 }
668   - if (++bad_count > 5 ||
669   - (frame->state != st_array && std::cmp_less(max_bad_count, frame->olist.size()))) {
  669 + if (++bad_count_ > 5 ||
  670 + (frame_->state != st_array && std::cmp_less(max_bad_count_, frame_->olist.size()))) {
670 671 // Give up after 5 errors in close proximity or if the number of missing dictionary keys
671 672 // exceeds the remaining number of allowable total errors.
672 673 warn("too many errors; giving up on reading object");
673 674 throw Error();
674 675 }
675   - good_count = 0;
  676 + good_count_ = 0;
676 677 }
677 678  
678 679 void
679   -QPDFParser::limits_error(std::string const& limit, std::string const& msg)
  680 +Parser::limits_error(std::string const& limit, std::string const& msg)
680 681 {
681 682 Limits::error();
682 683 warn("limits error("s + limit + "): " + msg);
... ... @@ -684,40 +685,41 @@ QPDFParser::limits_error(std::string const& limit, std::string const& msg)
684 685 }
685 686  
686 687 void
687   -QPDFParser::warn(QPDFExc const& e) const
  688 +Parser::warn(QPDFExc const& e) const
688 689 {
689 690 // If parsing on behalf of a QPDF object and want to give a warning, we can warn through the
690 691 // object. If parsing for some other reason, such as an explicit creation of an object from a
691 692 // string, then just throw the exception.
692   - if (context) {
693   - context->warn(e);
  693 + if (context_) {
  694 + context_->warn(e);
694 695 } else {
695 696 throw e;
696 697 }
697 698 }
698 699  
699 700 void
700   -QPDFParser::warnDuplicateKey()
  701 +Parser::warn_duplicate_key()
701 702 {
702 703 warn(
703   - frame->offset,
704   - "dictionary has duplicated key " + frame->key + "; last occurrence overrides earlier ones");
  704 + frame_->offset,
  705 + "dictionary has duplicated key " + frame_->key +
  706 + "; last occurrence overrides earlier ones");
705 707 }
706 708  
707 709 void
708   -QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const
  710 +Parser::warn(qpdf_offset_t offset, std::string const& msg) const
709 711 {
710   - if (stream_id) {
711   - std::string descr = "object "s + std::to_string(obj_id) + " 0";
712   - std::string name = context->getFilename() + " object stream " + std::to_string(stream_id);
  712 + if (stream_id_) {
  713 + std::string descr = "object "s + std::to_string(obj_id_) + " 0";
  714 + std::string name = context_->getFilename() + " object stream " + std::to_string(stream_id_);
713 715 warn(QPDFExc(qpdf_e_damaged_pdf, name, descr, offset, msg));
714 716 } else {
715   - warn(QPDFExc(qpdf_e_damaged_pdf, input.getName(), object_description, offset, msg));
  717 + warn(QPDFExc(qpdf_e_damaged_pdf, input_.getName(), object_description_, offset, msg));
716 718 }
717 719 }
718 720  
719 721 void
720   -QPDFParser::warn(std::string const& msg) const
  722 +Parser::warn(std::string const& msg) const
721 723 {
722   - warn(input.getLastOffset(), msg);
  724 + warn(input_.getLastOffset(), msg);
723 725 }
... ...
libqpdf/QPDF_objects.cc
... ... @@ -25,6 +25,7 @@ using namespace qpdf;
25 25 using namespace std::literals;
26 26  
27 27 using Objects = QPDF::Doc::Objects;
  28 +using Parser = impl::Parser;
28 29  
29 30 QPDFXRefEntry::QPDFXRefEntry() = default;
30 31  
... ... @@ -1287,7 +1288,7 @@ Objects::readTrailer()
1287 1288 {
1288 1289 qpdf_offset_t offset = m->file->tell();
1289 1290 auto object =
1290   - QPDFParser::parse(*m->file, "trailer", m->tokenizer, nullptr, qpdf, m->reconstructed_xref);
  1291 + Parser::parse(*m->file, "trailer", m->tokenizer, nullptr, qpdf, m->reconstructed_xref);
1291 1292 if (object.isDictionary() && m->objects.readToken(*m->file).isWord("stream")) {
1292 1293 warn(damagedPDF("trailer", m->file->tell(), "stream keyword found in trailer"));
1293 1294 }
... ... @@ -1304,7 +1305,7 @@ Objects::readObject(std::string const& description, QPDFObjGen og)
1304 1305  
1305 1306 StringDecrypter decrypter{&qpdf, og};
1306 1307 StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr;
1307   - auto object = QPDFParser::parse(
  1308 + auto object = Parser::parse(
1308 1309 *m->file,
1309 1310 m->last_object_description,
1310 1311 m->tokenizer,
... ... @@ -1834,7 +1835,7 @@ Objects::resolveObjectsInStream(int obj_stream_number)
1834 1835 if (entry != m->xref_table.end() && entry->second.getType() == 2 &&
1835 1836 entry->second.getObjStreamNumber() == obj_stream_number) {
1836 1837 is::OffsetBuffer in("", {b_start + obj_offset, obj_size}, obj_offset);
1837   - if (auto oh = QPDFParser::parse(in, obj_stream_number, obj_id, m->tokenizer, qpdf)) {
  1838 + if (auto oh = Parser::parse(in, obj_stream_number, obj_id, m->tokenizer, qpdf)) {
1838 1839 updateCache(og, oh.obj_sp(), end_before_space, end_after_space);
1839 1840 }
1840 1841 } else {
... ...
libqpdf/qpdf/QPDFParser.hh
... ... @@ -13,153 +13,277 @@
13 13 using namespace qpdf;
14 14 using namespace qpdf::global;
15 15  
16   -class QPDFParser
  16 +namespace qpdf::impl
17 17 {
18   - public:
19   - class Error: public std::exception
  18 + /// @class Parser
  19 + /// @brief Internal parser for PDF objects and content streams.
  20 + /// @par
  21 + /// The Parser class provides static methods for parsing PDF objects from input sources.
  22 + /// It handles tokenization, error recovery, and object construction with proper offset
  23 + /// tracking and description for error reporting.
  24 + class Parser
20 25 {
21 26 public:
22   - Error() = default;
23   - virtual ~Error() noexcept = default;
24   - };
  27 + /// @brief Exception thrown when parser encounters an unrecoverable error.
  28 + class Error: public std::exception
  29 + {
  30 + public:
  31 + Error() = default;
  32 + virtual ~Error() noexcept = default;
  33 + };
25 34  
26   - static QPDFObjectHandle
27   - parse(InputSource& input, std::string const& object_description, QPDF* context);
28   -
29   - static QPDFObjectHandle parse_content(
30   - InputSource& input,
31   - std::shared_ptr<QPDFObject::Description> sp_description,
32   - qpdf::Tokenizer& tokenizer,
33   - QPDF* context);
34   -
35   - // For use by deprecated QPDFObjectHandle::parse.
36   - static QPDFObjectHandle parse(
37   - InputSource& input,
38   - std::string const& object_description,
39   - QPDFTokenizer& tokenizer,
40   - bool& empty,
41   - QPDFObjectHandle::StringDecrypter* decrypter,
42   - QPDF* context);
43   -
44   - // For use by QPDF.
45   - static QPDFObjectHandle parse(
46   - InputSource& input,
47   - std::string const& object_description,
48   - qpdf::Tokenizer& tokenizer,
49   - QPDFObjectHandle::StringDecrypter* decrypter,
50   - QPDF& context,
51   - bool sanity_checks);
52   -
53   - static QPDFObjectHandle parse(
54   - qpdf::is::OffsetBuffer& input,
55   - int stream_id,
56   - int obj_id,
57   - qpdf::Tokenizer& tokenizer,
58   - QPDF& context);
59   -
60   - static std::shared_ptr<QPDFObject::Description>
61   - make_description(std::string const& input_name, std::string const& object_description)
62   - {
63   - using namespace std::literals;
64   - return std::make_shared<QPDFObject::Description>(
65   - input_name + ", " + object_description + " at offset $PO");
66   - }
67   -
68   - private:
69   - QPDFParser(
70   - InputSource& input,
71   - std::shared_ptr<QPDFObject::Description> sp_description,
72   - std::string const& object_description,
73   - qpdf::Tokenizer& tokenizer,
74   - QPDFObjectHandle::StringDecrypter* decrypter,
75   - QPDF* context,
76   - bool parse_pdf,
77   - int stream_id = 0,
78   - int obj_id = 0,
79   - bool sanity_checks = false) :
80   - input(input),
81   - object_description(object_description),
82   - tokenizer(tokenizer),
83   - decrypter(decrypter),
84   - context(context),
85   - description(std::move(sp_description)),
86   - parse_pdf(parse_pdf),
87   - stream_id(stream_id),
88   - obj_id(obj_id),
89   - sanity_checks(sanity_checks)
90   - {
91   - }
  35 + /// @brief Parse a PDF object from an input source.
  36 + /// @param input The input source to read from.
  37 + /// @param object_description Description of the object for error messages.
  38 + /// @param context The QPDF context, or nullptr if parsing standalone.
  39 + /// @return The parsed QPDFObjectHandle, or null if parsing fails.
  40 + static QPDFObjectHandle
  41 + parse(InputSource& input, std::string const& object_description, QPDF* context);
92 42  
93   - // Parser state. Note:
94   - // state <= st_dictionary_value == (state = st_dictionary_key || state = st_dictionary_value)
95   - enum parser_state_e { st_dictionary_key, st_dictionary_value, st_array };
  43 + /// @brief Parse a content stream from an input source.
  44 + /// @param input The input source to read from.
  45 + /// @param sp_description Shared pointer to object description.
  46 + /// @param tokenizer The tokenizer to use for parsing.
  47 + /// @param context The QPDF context.
  48 + /// @return The parsed QPDFObjectHandle, or uninitialized handle on EOF.
  49 + static QPDFObjectHandle parse_content(
  50 + InputSource& input,
  51 + std::shared_ptr<QPDFObject::Description> sp_description,
  52 + qpdf::Tokenizer& tokenizer,
  53 + QPDF* context);
96 54  
97   - struct StackFrame
98   - {
99   - StackFrame(InputSource& input, parser_state_e state) :
100   - state(state),
101   - offset(input.tell())
  55 + /// @brief Parse a PDF object (interface for deprecated QPDFObjectHandle::parse).
  56 + /// @param input The input source to read from.
  57 + /// @param object_description Description of the object for error messages.
  58 + /// @param tokenizer The tokenizer to use for parsing.
  59 + /// @param empty Output parameter indicating if object was empty.
  60 + /// @param decrypter String decrypter for encrypted strings, or nullptr.
  61 + /// @param context The QPDF context, or nullptr if parsing standalone.
  62 + /// @return The parsed QPDFObjectHandle.
  63 + static QPDFObjectHandle parse(
  64 + InputSource& input,
  65 + std::string const& object_description,
  66 + QPDFTokenizer& tokenizer,
  67 + bool& empty,
  68 + QPDFObjectHandle::StringDecrypter* decrypter,
  69 + QPDF* context);
  70 +
  71 + /// @brief Parse a PDF object for use by QPDF.
  72 + /// @param input The input source to read from.
  73 + /// @param object_description Description of the object for error messages.
  74 + /// @param tokenizer The tokenizer to use for parsing.
  75 + /// @param decrypter String decrypter for encrypted strings, or nullptr.
  76 + /// @param context The QPDF context.
  77 + /// @param sanity_checks Enable additional sanity checks during parsing.
  78 + /// @return The parsed QPDFObjectHandle.
  79 + static QPDFObjectHandle parse(
  80 + InputSource& input,
  81 + std::string const& object_description,
  82 + qpdf::Tokenizer& tokenizer,
  83 + QPDFObjectHandle::StringDecrypter* decrypter,
  84 + QPDF& context,
  85 + bool sanity_checks);
  86 +
  87 + /// @brief Parse an object from an object stream.
  88 + /// @param input The offset buffer containing the object data.
  89 + /// @param stream_id The object stream number.
  90 + /// @param obj_id The object ID within the stream.
  91 + /// @param tokenizer The tokenizer to use for parsing.
  92 + /// @param context The QPDF context.
  93 + /// @return The parsed QPDFObjectHandle.
  94 + static QPDFObjectHandle parse(
  95 + qpdf::is::OffsetBuffer& input,
  96 + int stream_id,
  97 + int obj_id,
  98 + qpdf::Tokenizer& tokenizer,
  99 + QPDF& context);
  100 +
  101 + /// @brief Create a description for a parsed object.
  102 + /// @param input_name The name of the input source.
  103 + /// @param object_description Description of the object being parsed.
  104 + /// @return Shared pointer to object description with offset placeholder.
  105 + static std::shared_ptr<QPDFObject::Description>
  106 + make_description(std::string const& input_name, std::string const& object_description)
102 107 {
  108 + using namespace std::literals;
  109 + return std::make_shared<QPDFObject::Description>(
  110 + input_name + ", " + object_description + " at offset $PO");
103 111 }
104 112  
105   - std::vector<QPDFObjectHandle> olist;
106   - std::map<std::string, QPDFObjectHandle> dict;
107   - parser_state_e state;
108   - std::string key;
109   - qpdf_offset_t offset;
110   - std::string contents_string;
111   - qpdf_offset_t contents_offset{-1};
112   - int null_count{0};
113   - };
  113 + private:
  114 + /// @brief Construct a parser instance.
  115 + /// @param input The input source to read from.
  116 + /// @param sp_description Shared pointer to object description.
  117 + /// @param object_description Description string for error messages.
  118 + /// @param tokenizer The tokenizer to use for parsing.
  119 + /// @param decrypter String decrypter for encrypted content.
  120 + /// @param context The QPDF context.
  121 + /// @param parse_pdf Whether parsing PDF objects (vs content streams).
  122 + /// @param stream_id Object stream ID for object stream parsing.
  123 + /// @param obj_id Object ID within object stream.
  124 + /// @param sanity_checks Enable additional sanity checks.
  125 + Parser(
  126 + InputSource& input,
  127 + std::shared_ptr<QPDFObject::Description> sp_description,
  128 + std::string const& object_description,
  129 + qpdf::Tokenizer& tokenizer,
  130 + QPDFObjectHandle::StringDecrypter* decrypter,
  131 + QPDF* context,
  132 + bool parse_pdf,
  133 + int stream_id = 0,
  134 + int obj_id = 0,
  135 + bool sanity_checks = false) :
  136 + input_(input),
  137 + object_description_(object_description),
  138 + tokenizer_(tokenizer),
  139 + decrypter_(decrypter),
  140 + context_(context),
  141 + description_(std::move(sp_description)),
  142 + parse_pdf_(parse_pdf),
  143 + stream_id_(stream_id),
  144 + obj_id_(obj_id),
  145 + sanity_checks_(sanity_checks)
  146 + {
  147 + }
  148 +
  149 + /// @brief Parser state enumeration.
  150 + /// @note state <= st_dictionary_value indicates we're in a dictionary context.
  151 + enum parser_state_e { st_dictionary_key, st_dictionary_value, st_array };
  152 +
  153 + /// @brief Stack frame for tracking nested arrays and dictionaries.
  154 + struct StackFrame
  155 + {
  156 + StackFrame(InputSource& input, parser_state_e state) :
  157 + state(state),
  158 + offset(input.tell())
  159 + {
  160 + }
  161 +
  162 + std::vector<QPDFObjectHandle> olist; ///< Object list for arrays/dict values
  163 + std::map<std::string, QPDFObjectHandle> dict; ///< Dictionary entries
  164 + parser_state_e state; ///< Current parser state
  165 + std::string key; ///< Current dictionary key
  166 + qpdf_offset_t offset; ///< Offset of container start
  167 + std::string contents_string; ///< For /Contents field in signatures
  168 + qpdf_offset_t contents_offset{-1}; ///< Offset of /Contents value
  169 + int null_count{0}; ///< Count of null values in container
  170 + };
  171 +
  172 + /// @brief Parse an object, handling exceptions and returning null on error.
  173 + /// @param content_stream True if parsing a content stream.
  174 + /// @return The parsed object handle, or null/uninitialized on error.
  175 + QPDFObjectHandle parse(bool content_stream = false);
  176 +
  177 + /// @brief Parse the first token and dispatch to appropriate handler.
  178 + /// @param content_stream True if parsing a content stream.
  179 + /// @return The parsed object handle.
  180 + QPDFObjectHandle parse_first(bool content_stream);
  181 +
  182 + /// @brief Parse the remainder of a composite object (array/dict/reference).
  183 + /// @param content_stream True if parsing a content stream.
  184 + /// @return The completed object handle.
  185 + QPDFObjectHandle parse_remainder(bool content_stream);
  186 +
  187 + /// @brief Add an object to the current container.
  188 + /// @param obj The object to add.
  189 + void add(std::shared_ptr<QPDFObject>&& obj);
114 190  
115   - QPDFObjectHandle parse(bool content_stream = false);
116   - QPDFObjectHandle parse_first(bool content_stream);
117   - QPDFObjectHandle parseRemainder(bool content_stream);
118   - void add(std::shared_ptr<QPDFObject>&& obj);
119   - void addNull();
120   - void add_bad_null(std::string const& msg);
121   - void addInt(int count);
122   - template <typename T, typename... Args>
123   - void addScalar(Args&&... args);
124   - void check_too_many_bad_tokens();
125   - void warnDuplicateKey();
126   - void fixMissingKeys();
127   - [[noreturn]] void limits_error(std::string const& limit, std::string const& msg);
128   - void warn(qpdf_offset_t offset, std::string const& msg) const;
129   - void warn(std::string const& msg) const;
130   - void warn(QPDFExc const&) const;
131   - template <typename T, typename... Args>
132   - // Create a new scalar object complete with parsed offset and description.
133   - // NB the offset includes any leading whitespace.
134   - QPDFObjectHandle withDescription(Args&&... args);
135   - void setDescription(std::shared_ptr<QPDFObject>& obj, qpdf_offset_t parsed_offset);
136   - InputSource& input;
137   - std::string const& object_description;
138   - qpdf::Tokenizer& tokenizer;
139   - QPDFObjectHandle::StringDecrypter* decrypter;
140   - QPDF* context;
141   - std::shared_ptr<QPDFObject::Description> description;
142   - bool parse_pdf{false};
143   - int stream_id{0};
144   - int obj_id{0};
145   - bool sanity_checks{false};
146   -
147   - std::vector<StackFrame> stack;
148   - StackFrame* frame{nullptr};
149   - // Number of recent bad tokens. This will always be > 0 once a bad token has been encountered as
150   - // it only gets incremented or reset when a bad token is encountered.
151   - int bad_count{0};
152   - // Number of bad tokens (remaining) before giving up.
153   - uint32_t max_bad_count{Limits::parser_max_errors()};
154   - // Number of good tokens since last bad token. Irrelevant if bad_count == 0.
155   - int good_count{0};
156   - // Start offset including any leading whitespace.
157   - qpdf_offset_t start{0};
158   - // Number of successive integer tokens.
159   - int int_count{0};
160   - long long int_buffer[2]{0, 0};
161   - qpdf_offset_t last_offset_buffer[2]{0, 0};
162   - bool empty_{false};
163   -};
  191 + /// @brief Add a null object to the current container.
  192 + void add_null();
  193 +
  194 + /// @brief Add a null with a warning message.
  195 + /// @param msg Warning message describing the error.
  196 + void add_bad_null(std::string const& msg);
  197 +
  198 + /// @brief Add a buffered integer from int_buffer_.
  199 + /// @param count Buffer index (1 or 2) to read from.
  200 + void add_int(int count);
  201 +
  202 + /// @brief Create and add a scalar object to the current container.
  203 + /// @tparam T The scalar object type (e.g., QPDF_Integer, QPDF_String).
  204 + /// @tparam Args Constructor argument types.
  205 + /// @param args Arguments to forward to the object constructor.
  206 + template <typename T, typename... Args>
  207 + void add_scalar(Args&&... args);
  208 +
  209 + /// @brief Check if too many bad tokens have been encountered and throw if so.
  210 + void check_too_many_bad_tokens();
  211 +
  212 + /// @brief Issue a warning about a duplicate dictionary key.
  213 + void warn_duplicate_key();
  214 +
  215 + /// @brief Fix dictionaries with missing keys by generating fake keys.
  216 + void fix_missing_keys();
  217 +
  218 + /// @brief Report a limits error and throw.
  219 + /// @param limit The limit identifier.
  220 + /// @param msg Error message.
  221 + [[noreturn]] void limits_error(std::string const& limit, std::string const& msg);
  222 +
  223 + /// @brief Issue a warning at a specific offset.
  224 + /// @param offset File offset for the warning.
  225 + /// @param msg Warning message.
  226 + void warn(qpdf_offset_t offset, std::string const& msg) const;
  227 +
  228 + /// @brief Issue a warning at the current offset.
  229 + /// @param msg Warning message.
  230 + void warn(std::string const& msg) const;
  231 +
  232 + /// @brief Issue a warning from a QPDFExc exception.
  233 + /// @param e The exception to report.
  234 + void warn(QPDFExc const& e) const;
  235 +
  236 + /// @brief Create a scalar object with description and parsed offset.
  237 + /// @tparam T The scalar object type.
  238 + /// @tparam Args Constructor argument types.
  239 + /// @param args Arguments to forward to the object constructor.
  240 + /// @return Object handle with description and offset set.
  241 + /// @note The offset includes any leading whitespace.
  242 + template <typename T, typename... Args>
  243 + QPDFObjectHandle with_description(Args&&... args);
  244 +
  245 + /// @brief Set the description and offset on an existing object.
  246 + /// @param obj The object to update.
  247 + /// @param parsed_offset The file offset where the object was parsed.
  248 + void set_description(std::shared_ptr<QPDFObject>& obj, qpdf_offset_t parsed_offset);
  249 +
  250 + // Core parsing state
  251 + InputSource& input_; ///< Input source to read from
  252 + std::string const& object_description_; ///< Description for error messages
  253 + qpdf::Tokenizer& tokenizer_; ///< Tokenizer for lexical analysis
  254 + QPDFObjectHandle::StringDecrypter* decrypter_; ///< Decrypter for encrypted strings
  255 + QPDF* context_; ///< QPDF context for object resolution
  256 + std::shared_ptr<QPDFObject::Description> description_; ///< Shared description for objects
  257 + bool parse_pdf_{false}; ///< True if parsing PDF objects vs content streams
  258 + int stream_id_{0}; ///< Object stream ID (for object stream parsing)
  259 + int obj_id_{0}; ///< Object ID within object stream
  260 + bool sanity_checks_{false}; ///< Enable additional validation checks
  261 +
  262 + // Composite object parsing state
  263 + std::vector<StackFrame> stack_; ///< Stack of nested containers
  264 + StackFrame* frame_{nullptr}; ///< Current stack frame pointer
  265 +
  266 + // Error tracking state
  267 + /// Number of recent bad tokens. Always > 0 after first bad token encountered.
  268 + int bad_count_{0};
  269 + /// Number of bad tokens remaining before giving up.
  270 + uint32_t max_bad_count_{Limits::parser_max_errors()};
  271 + /// Number of good tokens since last bad token. Irrelevant if bad_count == 0.
  272 + int good_count_{0};
  273 +
  274 + // Token buffering state
  275 + /// Start offset of current object, including any leading whitespace.
  276 + qpdf_offset_t start_{0};
  277 + /// Number of successive integer tokens (for indirect reference detection).
  278 + int int_count_{0};
  279 + /// Buffer for up to 2 integer tokens.
  280 + long long int_buffer_[2]{0, 0};
  281 + /// Offsets corresponding to buffered integers.
  282 + qpdf_offset_t last_offset_buffer_[2]{0, 0};
  283 +
  284 + /// True if object was empty (endobj without content).
  285 + bool empty_{false};
  286 + };
  287 +} // namespace qpdf::impl
164 288  
165 289 #endif // QPDFPARSER_HH
... ...
qpdf/qtest/qpdf/parse-object.out
1 1 [ /name 16059 3.14159 false << /key true /other [ (string1) (string2) ] >> null ]
2   -logic error parsing indirect: QPDFParser::parse called without context on an object with indirect references
  2 +logic error parsing indirect: Parser::parse called without context on an object with indirect references
3 3 trailing data: parsed object (trailing test): trailing data found parsing object from string
4 4 WARNING: parsed object (offset 9): unknown token while reading object; treating as string
5 5 WARNING: parsed object: treating unexpected brace token as null
... ...