Commit 51932fd91bca1cf2a155ac5c376a13dbd71da546

Authored by m-holger
Committed by GitHub
2 parents 843e2b45 44cd31b9

Merge pull request #1651 from m-holger/parser

Refactor QPDFParser
include/qpdf/QPDFObjectHandle.hh
@@ -61,11 +61,14 @@ class QPDFTokenizer; @@ -61,11 +61,14 @@ class QPDFTokenizer;
61 class QPDFExc; 61 class QPDFExc;
62 class Pl_QPDFTokenizer; 62 class Pl_QPDFTokenizer;
63 class QPDFMatrix; 63 class QPDFMatrix;
64 -class QPDFParser; 64 +namespace qpdf::impl
  65 +{
  66 + class Parser;
  67 +}
65 68
66 class QPDFObjectHandle: public qpdf::BaseHandle 69 class QPDFObjectHandle: public qpdf::BaseHandle
67 { 70 {
68 - friend class QPDFParser; 71 + friend class qpdf::impl::Parser;
69 72
70 public: 73 public:
71 // This class is used by replaceStreamData. It provides an alternative way of associating 74 // This class is used by replaceStreamData. It provides an alternative way of associating
include/qpdf/QPDFTokenizer.hh
@@ -31,6 +31,10 @@ @@ -31,6 +31,10 @@
31 namespace qpdf 31 namespace qpdf
32 { 32 {
33 class Tokenizer; 33 class Tokenizer;
  34 + namespace impl
  35 + {
  36 + class Parser;
  37 + }
34 } // namespace qpdf 38 } // namespace qpdf
35 39
36 class QPDFTokenizer 40 class QPDFTokenizer
@@ -203,7 +207,7 @@ class QPDFTokenizer @@ -203,7 +207,7 @@ class QPDFTokenizer
203 void expectInlineImage(InputSource& input); 207 void expectInlineImage(InputSource& input);
204 208
205 private: 209 private:
206 - friend class QPDFParser; 210 + friend class qpdf::impl::Parser;
207 211
208 QPDFTokenizer(QPDFTokenizer const&) = delete; 212 QPDFTokenizer(QPDFTokenizer const&) = delete;
209 QPDFTokenizer& operator=(QPDFTokenizer const&) = delete; 213 QPDFTokenizer& operator=(QPDFTokenizer const&) = delete;
libqpdf/QPDFObjectHandle.cc
@@ -25,6 +25,8 @@ @@ -25,6 +25,8 @@
25 using namespace std::literals; 25 using namespace std::literals;
26 using namespace qpdf; 26 using namespace qpdf;
27 27
  28 +using Parser = impl::Parser;
  29 +
28 const Null Null::temp_; 30 const Null Null::temp_;
29 31
30 BaseHandle:: 32 BaseHandle::
@@ -1540,7 +1542,7 @@ QPDFObjectHandle::parse( @@ -1540,7 +1542,7 @@ QPDFObjectHandle::parse(
1540 QPDF* context, std::string const& object_str, std::string const& object_description) 1542 QPDF* context, std::string const& object_str, std::string const& object_description)
1541 { 1543 {
1542 auto input = is::OffsetBuffer("parsed object", object_str); 1544 auto input = is::OffsetBuffer("parsed object", object_str);
1543 - auto result = QPDFParser::parse(input, object_description, context); 1545 + auto result = Parser::parse(input, object_description, context);
1544 size_t offset = QIntC::to_size(input.tell()); 1546 size_t offset = QIntC::to_size(input.tell());
1545 while (offset < object_str.length()) { 1547 while (offset < object_str.length()) {
1546 if (!isspace(object_str.at(offset))) { 1548 if (!isspace(object_str.at(offset))) {
@@ -1661,7 +1663,7 @@ QPDFObjectHandle::parseContentStream_data( @@ -1661,7 +1663,7 @@ QPDFObjectHandle::parseContentStream_data(
1661 auto input = is::OffsetBuffer(description, stream_data); 1663 auto input = is::OffsetBuffer(description, stream_data);
1662 Tokenizer tokenizer; 1664 Tokenizer tokenizer;
1663 tokenizer.allowEOF(); 1665 tokenizer.allowEOF();
1664 - auto sp_description = QPDFParser::make_description(description, "content"); 1666 + auto sp_description = Parser::make_description(description, "content");
1665 while (QIntC::to_size(input.tell()) < stream_length) { 1667 while (QIntC::to_size(input.tell()) < stream_length) {
1666 // Read a token and seek to the beginning. The offset we get from this process is the 1668 // Read a token and seek to the beginning. The offset we get from this process is the
1667 // beginning of the next non-ignorable (space, comment) token. This way, the offset and 1669 // beginning of the next non-ignorable (space, comment) token. This way, the offset and
@@ -1669,7 +1671,7 @@ QPDFObjectHandle::parseContentStream_data( @@ -1669,7 +1671,7 @@ QPDFObjectHandle::parseContentStream_data(
1669 tokenizer.nextToken(input, "content", true); 1671 tokenizer.nextToken(input, "content", true);
1670 qpdf_offset_t offset = input.getLastOffset(); 1672 qpdf_offset_t offset = input.getLastOffset();
1671 input.seek(offset, SEEK_SET); 1673 input.seek(offset, SEEK_SET);
1672 - auto obj = QPDFParser::parse_content(input, sp_description, tokenizer, context); 1674 + auto obj = Parser::parse_content(input, sp_description, tokenizer, context);
1673 if (!obj) { 1675 if (!obj) {
1674 // EOF 1676 // EOF
1675 break; 1677 break;
@@ -1678,7 +1680,7 @@ QPDFObjectHandle::parseContentStream_data( @@ -1678,7 +1680,7 @@ QPDFObjectHandle::parseContentStream_data(
1678 if (callbacks) { 1680 if (callbacks) {
1679 callbacks->handleObject(obj, QIntC::to_size(offset), length); 1681 callbacks->handleObject(obj, QIntC::to_size(offset), length);
1680 } 1682 }
1681 - if (obj.isOperator() && (obj.getOperatorValue() == "ID")) { 1683 + if (obj.isOperator() && obj.getOperatorValue() == "ID") {
1682 // Discard next character; it is the space after ID that terminated the token. Read 1684 // Discard next character; it is the space after ID that terminated the token. Read
1683 // until end of inline image. 1685 // until end of inline image.
1684 char ch; 1686 char ch;
@@ -1731,7 +1733,7 @@ QPDFObjectHandle::parse( @@ -1731,7 +1733,7 @@ QPDFObjectHandle::parse(
1731 StringDecrypter* decrypter, 1733 StringDecrypter* decrypter,
1732 QPDF* context) 1734 QPDF* context)
1733 { 1735 {
1734 - return QPDFParser::parse(*input, object_description, tokenizer, empty, decrypter, context); 1736 + return Parser::parse(*input, object_description, tokenizer, empty, decrypter, context);
1735 } 1737 }
1736 1738
1737 qpdf_offset_t 1739 qpdf_offset_t
libqpdf/QPDFParser.cc
@@ -46,12 +46,13 @@ class QPDF::Doc::ParseGuard @@ -46,12 +46,13 @@ class QPDF::Doc::ParseGuard
46 }; 46 };
47 47
48 using ParseGuard = QPDF::Doc::ParseGuard; 48 using ParseGuard = QPDF::Doc::ParseGuard;
  49 +using Parser = qpdf::impl::Parser;
49 50
50 QPDFObjectHandle 51 QPDFObjectHandle
51 -QPDFParser::parse(InputSource& input, std::string const& object_description, QPDF* context) 52 +Parser::parse(InputSource& input, std::string const& object_description, QPDF* context)
52 { 53 {
53 qpdf::Tokenizer tokenizer; 54 qpdf::Tokenizer tokenizer;
54 - if (auto result = QPDFParser( 55 + if (auto result = Parser(
55 input, 56 input,
56 make_description(input.getName(), object_description), 57 make_description(input.getName(), object_description),
57 object_description, 58 object_description,
@@ -66,14 +67,14 @@ QPDFParser::parse(InputSource& input, std::string const& object_description, QPD @@ -66,14 +67,14 @@ QPDFParser::parse(InputSource& input, std::string const& object_description, QPD
66 } 67 }
67 68
68 QPDFObjectHandle 69 QPDFObjectHandle
69 -QPDFParser::parse_content( 70 +Parser::parse_content(
70 InputSource& input, 71 InputSource& input,
71 std::shared_ptr<QPDFObject::Description> sp_description, 72 std::shared_ptr<QPDFObject::Description> sp_description,
72 qpdf::Tokenizer& tokenizer, 73 qpdf::Tokenizer& tokenizer,
73 QPDF* context) 74 QPDF* context)
74 { 75 {
75 static const std::string content("content"); // GCC12 - make constexpr 76 static const std::string content("content"); // GCC12 - make constexpr
76 - auto p = QPDFParser( 77 + auto p = Parser(
77 input, 78 input,
78 std::move(sp_description), 79 std::move(sp_description),
79 content, 80 content,
@@ -93,7 +94,7 @@ QPDFParser::parse_content( @@ -93,7 +94,7 @@ QPDFParser::parse_content(
93 } 94 }
94 95
95 QPDFObjectHandle 96 QPDFObjectHandle
96 -QPDFParser::parse( 97 +Parser::parse(
97 InputSource& input, 98 InputSource& input,
98 std::string const& object_description, 99 std::string const& object_description,
99 QPDFTokenizer& tokenizer, 100 QPDFTokenizer& tokenizer,
@@ -103,7 +104,7 @@ QPDFParser::parse( @@ -103,7 +104,7 @@ QPDFParser::parse(
103 { 104 {
104 // ABI: This parse overload is only used by the deprecated QPDFObjectHandle::parse. It is the 105 // ABI: This parse overload is only used by the deprecated QPDFObjectHandle::parse. It is the
105 // only user of the 'empty' member. When removing this overload also remove 'empty'. 106 // only user of the 'empty' member. When removing this overload also remove 'empty'.
106 - auto p = QPDFParser( 107 + auto p = Parser(
107 input, 108 input,
108 make_description(input.getName(), object_description), 109 make_description(input.getName(), object_description),
109 object_description, 110 object_description,
@@ -120,7 +121,7 @@ QPDFParser::parse( @@ -120,7 +121,7 @@ QPDFParser::parse(
120 } 121 }
121 122
122 QPDFObjectHandle 123 QPDFObjectHandle
123 -QPDFParser::parse( 124 +Parser::parse(
124 InputSource& input, 125 InputSource& input,
125 std::string const& object_description, 126 std::string const& object_description,
126 qpdf::Tokenizer& tokenizer, 127 qpdf::Tokenizer& tokenizer,
@@ -128,7 +129,7 @@ QPDFParser::parse( @@ -128,7 +129,7 @@ QPDFParser::parse(
128 QPDF& context, 129 QPDF& context,
129 bool sanity_checks) 130 bool sanity_checks)
130 { 131 {
131 - return QPDFParser( 132 + return Parser(
132 input, 133 input,
133 make_description(input.getName(), object_description), 134 make_description(input.getName(), object_description),
134 object_description, 135 object_description,
@@ -143,10 +144,10 @@ QPDFParser::parse( @@ -143,10 +144,10 @@ QPDFParser::parse(
143 } 144 }
144 145
145 QPDFObjectHandle 146 QPDFObjectHandle
146 -QPDFParser::parse( 147 +Parser::parse(
147 is::OffsetBuffer& input, int stream_id, int obj_id, qpdf::Tokenizer& tokenizer, QPDF& context) 148 is::OffsetBuffer& input, int stream_id, int obj_id, qpdf::Tokenizer& tokenizer, QPDF& context)
148 { 149 {
149 - return QPDFParser( 150 + return Parser(
150 input, 151 input,
151 std::make_shared<QPDFObject::Description>( 152 std::make_shared<QPDFObject::Description>(
152 QPDFObject::ObjStreamDescr(stream_id, obj_id)), 153 QPDFObject::ObjStreamDescr(stream_id, obj_id)),
@@ -161,7 +162,7 @@ QPDFParser::parse( @@ -161,7 +162,7 @@ QPDFParser::parse(
161 } 162 }
162 163
163 QPDFObjectHandle 164 QPDFObjectHandle
164 -QPDFParser::parse(bool content_stream) 165 +Parser::parse(bool content_stream)
165 { 166 {
166 try { 167 try {
167 return parse_first(content_stream); 168 return parse_first(content_stream);
@@ -178,20 +179,20 @@ QPDFParser::parse(bool content_stream) @@ -178,20 +179,20 @@ QPDFParser::parse(bool content_stream)
178 } 179 }
179 180
180 QPDFObjectHandle 181 QPDFObjectHandle
181 -QPDFParser::parse_first(bool content_stream) 182 +Parser::parse_first(bool content_stream)
182 { 183 {
183 // This method must take care not to resolve any objects. Don't check the type of any object 184 // This method must take care not to resolve any objects. Don't check the type of any object
184 // without first ensuring that it is a direct object. Otherwise, doing so may have the side 185 // without first ensuring that it is a direct object. Otherwise, doing so may have the side
185 // effect of reading the object and changing the file pointer. If you do this, it will cause a 186 // effect of reading the object and changing the file pointer. If you do this, it will cause a
186 // logic error to be thrown from QPDF::inParse(). 187 // logic error to be thrown from QPDF::inParse().
187 188
188 - QPDF::Doc::ParseGuard pg(context);  
189 - start = input.tell();  
190 - if (!tokenizer.nextToken(input, object_description)) {  
191 - warn(tokenizer.getErrorMessage()); 189 + QPDF::Doc::ParseGuard pg(context_);
  190 + start_ = input_.tell();
  191 + if (!tokenizer_.nextToken(input_, object_description_)) {
  192 + warn(tokenizer_.getErrorMessage());
192 } 193 }
193 194
194 - switch (tokenizer.getType()) { 195 + switch (tokenizer_.getType()) {
195 case QPDFTokenizer::tt_eof: 196 case QPDFTokenizer::tt_eof:
196 if (content_stream) { 197 if (content_stream) {
197 // In content stream mode, leave object uninitialized to indicate EOF 198 // In content stream mode, leave object uninitialized to indicate EOF
@@ -219,57 +220,57 @@ QPDFParser::parse_first(bool content_stream) @@ -219,57 +220,57 @@ QPDFParser::parse_first(bool content_stream)
219 220
220 case QPDFTokenizer::tt_array_open: 221 case QPDFTokenizer::tt_array_open:
221 case QPDFTokenizer::tt_dict_open: 222 case QPDFTokenizer::tt_dict_open:
222 - stack.clear();  
223 - stack.emplace_back(  
224 - input,  
225 - (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array : st_dictionary_key);  
226 - frame = &stack.back();  
227 - return parseRemainder(content_stream); 223 + stack_.clear();
  224 + stack_.emplace_back(
  225 + input_,
  226 + (tokenizer_.getType() == QPDFTokenizer::tt_array_open) ? st_array : st_dictionary_key);
  227 + frame_ = &stack_.back();
  228 + return parse_remainder(content_stream);
228 229
229 case QPDFTokenizer::tt_bool: 230 case QPDFTokenizer::tt_bool:
230 - return withDescription<QPDF_Bool>(tokenizer.getValue() == "true"); 231 + return with_description<QPDF_Bool>(tokenizer_.getValue() == "true");
231 232
232 case QPDFTokenizer::tt_null: 233 case QPDFTokenizer::tt_null:
233 return {QPDFObject::create<QPDF_Null>()}; 234 return {QPDFObject::create<QPDF_Null>()};
234 235
235 case QPDFTokenizer::tt_integer: 236 case QPDFTokenizer::tt_integer:
236 - return withDescription<QPDF_Integer>(QUtil::string_to_ll(tokenizer.getValue().c_str())); 237 + return with_description<QPDF_Integer>(QUtil::string_to_ll(tokenizer_.getValue().c_str()));
237 238
238 case QPDFTokenizer::tt_real: 239 case QPDFTokenizer::tt_real:
239 - return withDescription<QPDF_Real>(tokenizer.getValue()); 240 + return with_description<QPDF_Real>(tokenizer_.getValue());
240 241
241 case QPDFTokenizer::tt_name: 242 case QPDFTokenizer::tt_name:
242 - return withDescription<QPDF_Name>(tokenizer.getValue()); 243 + return with_description<QPDF_Name>(tokenizer_.getValue());
243 244
244 case QPDFTokenizer::tt_word: 245 case QPDFTokenizer::tt_word:
245 { 246 {
246 - auto const& value = tokenizer.getValue(); 247 + auto const& value = tokenizer_.getValue();
247 if (content_stream) { 248 if (content_stream) {
248 - return withDescription<QPDF_Operator>(value); 249 + return with_description<QPDF_Operator>(value);
249 } else if (value == "endobj") { 250 } else if (value == "endobj") {
250 // We just saw endobj without having read anything. Nothing in the PDF spec appears 251 // We just saw endobj without having read anything. Nothing in the PDF spec appears
251 // to allow empty objects, but they have been encountered in actual PDF files and 252 // to allow empty objects, but they have been encountered in actual PDF files and
252 // Adobe Reader appears to ignore them. Treat this as a null and do not move the 253 // Adobe Reader appears to ignore them. Treat this as a null and do not move the
253 // input source's offset. 254 // input source's offset.
254 empty_ = true; 255 empty_ = true;
255 - input.seek(input.getLastOffset(), SEEK_SET); 256 + input_.seek(input_.getLastOffset(), SEEK_SET);
256 if (!content_stream) { 257 if (!content_stream) {
257 warn("empty object treated as null"); 258 warn("empty object treated as null");
258 } 259 }
259 return {}; 260 return {};
260 } else { 261 } else {
261 warn("unknown token while reading object; treating as string"); 262 warn("unknown token while reading object; treating as string");
262 - return withDescription<QPDF_String>(value); 263 + return with_description<QPDF_String>(value);
263 } 264 }
264 } 265 }
265 266
266 case QPDFTokenizer::tt_string: 267 case QPDFTokenizer::tt_string:
267 - if (decrypter) {  
268 - std::string s{tokenizer.getValue()};  
269 - decrypter->decryptString(s);  
270 - return withDescription<QPDF_String>(s); 268 + if (decrypter_) {
  269 + std::string s{tokenizer_.getValue()};
  270 + decrypter_->decryptString(s);
  271 + return with_description<QPDF_String>(s);
271 } else { 272 } else {
272 - return withDescription<QPDF_String>(tokenizer.getValue()); 273 + return with_description<QPDF_String>(tokenizer_.getValue());
273 } 274 }
274 275
275 default: 276 default:
@@ -279,65 +280,65 @@ QPDFParser::parse_first(bool content_stream) @@ -279,65 +280,65 @@ QPDFParser::parse_first(bool content_stream)
279 } 280 }
280 281
281 QPDFObjectHandle 282 QPDFObjectHandle
282 -QPDFParser::parseRemainder(bool content_stream) 283 +Parser::parse_remainder(bool content_stream)
283 { 284 {
284 // This method must take care not to resolve any objects. Don't check the type of any object 285 // This method must take care not to resolve any objects. Don't check the type of any object
285 // without first ensuring that it is a direct object. Otherwise, doing so may have the side 286 // without first ensuring that it is a direct object. Otherwise, doing so may have the side
286 // effect of reading the object and changing the file pointer. If you do this, it will cause a 287 // effect of reading the object and changing the file pointer. If you do this, it will cause a
287 // logic error to be thrown from QPDF::inParse(). 288 // logic error to be thrown from QPDF::inParse().
288 289
289 - bad_count = 0; 290 + bad_count_ = 0;
290 bool b_contents = false; 291 bool b_contents = false;
291 292
292 while (true) { 293 while (true) {
293 - if (!tokenizer.nextToken(input, object_description)) {  
294 - warn(tokenizer.getErrorMessage()); 294 + if (!tokenizer_.nextToken(input_, object_description_)) {
  295 + warn(tokenizer_.getErrorMessage());
295 } 296 }
296 - ++good_count; // optimistically 297 + ++good_count_; // optimistically
297 298
298 - if (int_count != 0) { 299 + if (int_count_ != 0) {
299 // Special handling of indirect references. Treat integer tokens as part of an indirect 300 // Special handling of indirect references. Treat integer tokens as part of an indirect
300 // reference until proven otherwise. 301 // reference until proven otherwise.
301 - if (tokenizer.getType() == QPDFTokenizer::tt_integer) {  
302 - if (++int_count > 2) { 302 + if (tokenizer_.getType() == QPDFTokenizer::tt_integer) {
  303 + if (++int_count_ > 2) {
303 // Process the oldest buffered integer. 304 // Process the oldest buffered integer.
304 - addInt(int_count); 305 + add_int(int_count_);
305 } 306 }
306 - last_offset_buffer[int_count % 2] = input.getLastOffset();  
307 - int_buffer[int_count % 2] = QUtil::string_to_ll(tokenizer.getValue().c_str()); 307 + last_offset_buffer_[int_count_ % 2] = input_.getLastOffset();
  308 + int_buffer_[int_count_ % 2] = QUtil::string_to_ll(tokenizer_.getValue().c_str());
308 continue; 309 continue;
309 310
310 } else if ( 311 } else if (
311 - int_count >= 2 && tokenizer.getType() == QPDFTokenizer::tt_word &&  
312 - tokenizer.getValue() == "R") {  
313 - if (!context) { 312 + int_count_ >= 2 && tokenizer_.getType() == QPDFTokenizer::tt_word &&
  313 + tokenizer_.getValue() == "R") {
  314 + if (!context_) {
314 throw std::logic_error( 315 throw std::logic_error(
315 - "QPDFParser::parse called without context on an object with indirect " 316 + "Parser::parse called without context on an object with indirect "
316 "references"); 317 "references");
317 } 318 }
318 - auto id = QIntC::to_int(int_buffer[(int_count - 1) % 2]);  
319 - auto gen = QIntC::to_int(int_buffer[(int_count) % 2]); 319 + auto id = QIntC::to_int(int_buffer_[(int_count_ - 1) % 2]);
  320 + auto gen = QIntC::to_int(int_buffer_[(int_count_) % 2]);
320 if (!(id < 1 || gen < 0 || gen >= 65535)) { 321 if (!(id < 1 || gen < 0 || gen >= 65535)) {
321 - add(ParseGuard::getObject(context, id, gen, parse_pdf)); 322 + add(ParseGuard::getObject(context_, id, gen, parse_pdf_));
322 } else { 323 } else {
323 add_bad_null( 324 add_bad_null(
324 "treating bad indirect reference (" + std::to_string(id) + " " + 325 "treating bad indirect reference (" + std::to_string(id) + " " +
325 std::to_string(gen) + " R) as null"); 326 std::to_string(gen) + " R) as null");
326 } 327 }
327 - int_count = 0; 328 + int_count_ = 0;
328 continue; 329 continue;
329 330
330 - } else if (int_count > 0) { 331 + } else if (int_count_ > 0) {
331 // Process the buffered integers before processing the current token. 332 // Process the buffered integers before processing the current token.
332 - if (int_count > 1) {  
333 - addInt(int_count - 1); 333 + if (int_count_ > 1) {
  334 + add_int(int_count_ - 1);
334 } 335 }
335 - addInt(int_count);  
336 - int_count = 0; 336 + add_int(int_count_);
  337 + int_count_ = 0;
337 } 338 }
338 } 339 }
339 340
340 - switch (tokenizer.getType()) { 341 + switch (tokenizer_.getType()) {
341 case QPDFTokenizer::tt_eof: 342 case QPDFTokenizer::tt_eof:
342 warn("parse error while reading object"); 343 warn("parse error while reading object");
343 if (content_stream) { 344 if (content_stream) {
@@ -349,7 +350,7 @@ QPDFParser::parseRemainder(bool content_stream) @@ -349,7 +350,7 @@ QPDFParser::parseRemainder(bool content_stream)
349 350
350 case QPDFTokenizer::tt_bad: 351 case QPDFTokenizer::tt_bad:
351 check_too_many_bad_tokens(); 352 check_too_many_bad_tokens();
352 - addNull(); 353 + add_null();
353 continue; 354 continue;
354 355
355 case QPDFTokenizer::tt_brace_open: 356 case QPDFTokenizer::tt_brace_open:
@@ -358,23 +359,23 @@ QPDFParser::parseRemainder(bool content_stream) @@ -358,23 +359,23 @@ QPDFParser::parseRemainder(bool content_stream)
358 continue; 359 continue;
359 360
360 case QPDFTokenizer::tt_array_close: 361 case QPDFTokenizer::tt_array_close:
361 - if (frame->state == st_array) {  
362 - auto object = frame->null_count > 100  
363 - ? QPDFObject::create<QPDF_Array>(std::move(frame->olist), true)  
364 - : QPDFObject::create<QPDF_Array>(std::move(frame->olist));  
365 - setDescription(object, frame->offset - 1); 362 + if (frame_->state == st_array) {
  363 + auto object = frame_->null_count > 100
  364 + ? QPDFObject::create<QPDF_Array>(std::move(frame_->olist), true)
  365 + : QPDFObject::create<QPDF_Array>(std::move(frame_->olist));
  366 + set_description(object, frame_->offset - 1);
366 // The `offset` points to the next of "[". Set the rewind offset to point to the 367 // The `offset` points to the next of "[". Set the rewind offset to point to the
367 // beginning of "[". This has been explicitly tested with whitespace surrounding the 368 // beginning of "[". This has been explicitly tested with whitespace surrounding the
368 // array start delimiter. getLastOffset points to the array end token and therefore 369 // array start delimiter. getLastOffset points to the array end token and therefore
369 // can't be used here. 370 // can't be used here.
370 - if (stack.size() <= 1) { 371 + if (stack_.size() <= 1) {
371 return object; 372 return object;
372 } 373 }
373 - stack.pop_back();  
374 - frame = &stack.back(); 374 + stack_.pop_back();
  375 + frame_ = &stack_.back();
375 add(std::move(object)); 376 add(std::move(object));
376 } else { 377 } else {
377 - if (sanity_checks) { 378 + if (sanity_checks_) {
378 // During sanity checks, assume nesting of containers is corrupt and object is 379 // During sanity checks, assume nesting of containers is corrupt and object is
379 // unusable. 380 // unusable.
380 warn("unexpected array close token; giving up on reading object"); 381 warn("unexpected array close token; giving up on reading object");
@@ -385,46 +386,46 @@ QPDFParser::parseRemainder(bool content_stream) @@ -385,46 +386,46 @@ QPDFParser::parseRemainder(bool content_stream)
385 continue; 386 continue;
386 387
387 case QPDFTokenizer::tt_dict_close: 388 case QPDFTokenizer::tt_dict_close:
388 - if (frame->state <= st_dictionary_value) { 389 + if (frame_->state <= st_dictionary_value) {
389 // Attempt to recover more or less gracefully from invalid dictionaries. 390 // Attempt to recover more or less gracefully from invalid dictionaries.
390 - auto& dict = frame->dict; 391 + auto& dict = frame_->dict;
391 392
392 - if (frame->state == st_dictionary_value) { 393 + if (frame_->state == st_dictionary_value) {
393 warn( 394 warn(
394 - frame->offset, 395 + frame_->offset,
395 "dictionary ended prematurely; using null as value for last key"); 396 "dictionary ended prematurely; using null as value for last key");
396 - dict[frame->key] = QPDFObject::create<QPDF_Null>(); 397 + dict[frame_->key] = QPDFObject::create<QPDF_Null>();
397 } 398 }
398 - if (!frame->olist.empty()) {  
399 - if (sanity_checks) { 399 + if (!frame_->olist.empty()) {
  400 + if (sanity_checks_) {
400 warn( 401 warn(
401 - frame->offset, 402 + frame_->offset,
402 "expected dictionary keys but found non-name objects; ignoring"); 403 "expected dictionary keys but found non-name objects; ignoring");
403 } else { 404 } else {
404 - fixMissingKeys(); 405 + fix_missing_keys();
405 } 406 }
406 } 407 }
407 408
408 - if (!frame->contents_string.empty() && dict.contains("/Type") && 409 + if (!frame_->contents_string.empty() && dict.contains("/Type") &&
409 dict["/Type"].isNameAndEquals("/Sig") && dict.contains("/ByteRange") && 410 dict["/Type"].isNameAndEquals("/Sig") && dict.contains("/ByteRange") &&
410 dict.contains("/Contents") && dict["/Contents"].isString()) { 411 dict.contains("/Contents") && dict["/Contents"].isString()) {
411 - dict["/Contents"] = QPDFObjectHandle::newString(frame->contents_string);  
412 - dict["/Contents"].setParsedOffset(frame->contents_offset); 412 + dict["/Contents"] = QPDFObjectHandle::newString(frame_->contents_string);
  413 + dict["/Contents"].setParsedOffset(frame_->contents_offset);
413 } 414 }
414 auto object = QPDFObject::create<QPDF_Dictionary>(std::move(dict)); 415 auto object = QPDFObject::create<QPDF_Dictionary>(std::move(dict));
415 - setDescription(object, frame->offset - 2); 416 + set_description(object, frame_->offset - 2);
416 // The `offset` points to the next of "<<". Set the rewind offset to point to the 417 // The `offset` points to the next of "<<". Set the rewind offset to point to the
417 // beginning of "<<". This has been explicitly tested with whitespace surrounding 418 // beginning of "<<". This has been explicitly tested with whitespace surrounding
418 // the dictionary start delimiter. getLastOffset points to the dictionary end token 419 // the dictionary start delimiter. getLastOffset points to the dictionary end token
419 // and therefore can't be used here. 420 // and therefore can't be used here.
420 - if (stack.size() <= 1) { 421 + if (stack_.size() <= 1) {
421 return object; 422 return object;
422 } 423 }
423 - stack.pop_back();  
424 - frame = &stack.back(); 424 + stack_.pop_back();
  425 + frame_ = &stack_.back();
425 add(std::move(object)); 426 add(std::move(object));
426 } else { 427 } else {
427 - if (sanity_checks) { 428 + if (sanity_checks_) {
428 // During sanity checks, assume nesting of containers is corrupt and object is 429 // During sanity checks, assume nesting of containers is corrupt and object is
429 // unusable. 430 // unusable.
430 warn("unexpected dictionary close token; giving up on reading object"); 431 warn("unexpected dictionary close token; giving up on reading object");
@@ -436,60 +437,60 @@ QPDFParser::parseRemainder(bool content_stream) @@ -436,60 +437,60 @@ QPDFParser::parseRemainder(bool content_stream)
436 437
437 case QPDFTokenizer::tt_array_open: 438 case QPDFTokenizer::tt_array_open:
438 case QPDFTokenizer::tt_dict_open: 439 case QPDFTokenizer::tt_dict_open:
439 - if (stack.size() > max_nesting) { 440 + if (stack_.size() > max_nesting) {
440 limits_error( 441 limits_error(
441 "parser-max-nesting", "ignoring excessively deeply nested data structure"); 442 "parser-max-nesting", "ignoring excessively deeply nested data structure");
442 } 443 }
443 b_contents = false; 444 b_contents = false;
444 - stack.emplace_back(  
445 - input,  
446 - (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array  
447 - : st_dictionary_key);  
448 - frame = &stack.back(); 445 + stack_.emplace_back(
  446 + input_,
  447 + (tokenizer_.getType() == QPDFTokenizer::tt_array_open) ? st_array
  448 + : st_dictionary_key);
  449 + frame_ = &stack_.back();
449 continue; 450 continue;
450 451
451 case QPDFTokenizer::tt_bool: 452 case QPDFTokenizer::tt_bool:
452 - addScalar<QPDF_Bool>(tokenizer.getValue() == "true"); 453 + add_scalar<QPDF_Bool>(tokenizer_.getValue() == "true");
453 continue; 454 continue;
454 455
455 case QPDFTokenizer::tt_null: 456 case QPDFTokenizer::tt_null:
456 - addNull(); 457 + add_null();
457 continue; 458 continue;
458 459
459 case QPDFTokenizer::tt_integer: 460 case QPDFTokenizer::tt_integer:
460 if (!content_stream) { 461 if (!content_stream) {
461 // Buffer token in case it is part of an indirect reference. 462 // Buffer token in case it is part of an indirect reference.
462 - last_offset_buffer[1] = input.getLastOffset();  
463 - int_buffer[1] = QUtil::string_to_ll(tokenizer.getValue().c_str());  
464 - int_count = 1; 463 + last_offset_buffer_[1] = input_.getLastOffset();
  464 + int_buffer_[1] = QUtil::string_to_ll(tokenizer_.getValue().c_str());
  465 + int_count_ = 1;
465 } else { 466 } else {
466 - addScalar<QPDF_Integer>(QUtil::string_to_ll(tokenizer.getValue().c_str())); 467 + add_scalar<QPDF_Integer>(QUtil::string_to_ll(tokenizer_.getValue().c_str()));
467 } 468 }
468 continue; 469 continue;
469 470
470 case QPDFTokenizer::tt_real: 471 case QPDFTokenizer::tt_real:
471 - addScalar<QPDF_Real>(tokenizer.getValue()); 472 + add_scalar<QPDF_Real>(tokenizer_.getValue());
472 continue; 473 continue;
473 474
474 case QPDFTokenizer::tt_name: 475 case QPDFTokenizer::tt_name:
475 - if (frame->state == st_dictionary_key) {  
476 - frame->key = tokenizer.getValue();  
477 - frame->state = st_dictionary_value;  
478 - b_contents = decrypter && frame->key == "/Contents"; 476 + if (frame_->state == st_dictionary_key) {
  477 + frame_->key = tokenizer_.getValue();
  478 + frame_->state = st_dictionary_value;
  479 + b_contents = decrypter_ && frame_->key == "/Contents";
479 continue; 480 continue;
480 } else { 481 } else {
481 - addScalar<QPDF_Name>(tokenizer.getValue()); 482 + add_scalar<QPDF_Name>(tokenizer_.getValue());
482 } 483 }
483 continue; 484 continue;
484 485
485 case QPDFTokenizer::tt_word: 486 case QPDFTokenizer::tt_word:
486 if (content_stream) { 487 if (content_stream) {
487 - addScalar<QPDF_Operator>(tokenizer.getValue()); 488 + add_scalar<QPDF_Operator>(tokenizer_.getValue());
488 continue; 489 continue;
489 } 490 }
490 491
491 - if (sanity_checks) {  
492 - if (tokenizer.getValue() == "endobj" || tokenizer.getValue() == "endstream") { 492 + if (sanity_checks_) {
  493 + if (tokenizer_.getValue() == "endobj" || tokenizer_.getValue() == "endstream") {
493 // During sanity checks, assume an unexpected endobj or endstream indicates that 494 // During sanity checks, assume an unexpected endobj or endstream indicates that
494 // we are parsing past the end of the object. 495 // we are parsing past the end of the object.
495 warn( 496 warn(
@@ -504,24 +505,24 @@ QPDFParser::parseRemainder(bool content_stream) @@ -504,24 +505,24 @@ QPDFParser::parseRemainder(bool content_stream)
504 505
505 warn("unknown token while reading object; treating as string"); 506 warn("unknown token while reading object; treating as string");
506 check_too_many_bad_tokens(); 507 check_too_many_bad_tokens();
507 - addScalar<QPDF_String>(tokenizer.getValue()); 508 + add_scalar<QPDF_String>(tokenizer_.getValue());
508 509
509 continue; 510 continue;
510 511
511 case QPDFTokenizer::tt_string: 512 case QPDFTokenizer::tt_string:
512 { 513 {
513 - auto const& val = tokenizer.getValue();  
514 - if (decrypter) { 514 + auto const& val = tokenizer_.getValue();
  515 + if (decrypter_) {
515 if (b_contents) { 516 if (b_contents) {
516 - frame->contents_string = val;  
517 - frame->contents_offset = input.getLastOffset(); 517 + frame_->contents_string = val;
  518 + frame_->contents_offset = input_.getLastOffset();
518 b_contents = false; 519 b_contents = false;
519 } 520 }
520 std::string s{val}; 521 std::string s{val};
521 - decrypter->decryptString(s);  
522 - addScalar<QPDF_String>(s); 522 + decrypter_->decryptString(s);
  523 + add_scalar<QPDF_String>(s);
523 } else { 524 } else {
524 - addScalar<QPDF_String>(val); 525 + add_scalar<QPDF_String>(val);
525 } 526 }
526 } 527 }
527 continue; 528 continue;
@@ -533,107 +534,107 @@ QPDFParser::parseRemainder(bool content_stream) @@ -533,107 +534,107 @@ QPDFParser::parseRemainder(bool content_stream)
533 } 534 }
534 535
535 void 536 void
536 -QPDFParser::add(std::shared_ptr<QPDFObject>&& obj) 537 +Parser::add(std::shared_ptr<QPDFObject>&& obj)
537 { 538 {
538 - if (frame->state != st_dictionary_value) { 539 + if (frame_->state != st_dictionary_value) {
539 // If state is st_dictionary_key then there is a missing key. Push onto olist for 540 // If state is st_dictionary_key then there is a missing key. Push onto olist for
540 // processing once the tt_dict_close token has been found. 541 // processing once the tt_dict_close token has been found.
541 - frame->olist.emplace_back(std::move(obj)); 542 + frame_->olist.emplace_back(std::move(obj));
542 } else { 543 } else {
543 - if (auto res = frame->dict.insert_or_assign(frame->key, std::move(obj)); !res.second) {  
544 - warnDuplicateKey(); 544 + if (auto res = frame_->dict.insert_or_assign(frame_->key, std::move(obj)); !res.second) {
  545 + warn_duplicate_key();
545 } 546 }
546 - frame->state = st_dictionary_key; 547 + frame_->state = st_dictionary_key;
547 } 548 }
548 } 549 }
549 550
550 void 551 void
551 -QPDFParser::addNull() 552 +Parser::add_null()
552 { 553 {
553 const static ObjectPtr null_obj = QPDFObject::create<QPDF_Null>(); 554 const static ObjectPtr null_obj = QPDFObject::create<QPDF_Null>();
554 555
555 - if (frame->state != st_dictionary_value) { 556 + if (frame_->state != st_dictionary_value) {
556 // If state is st_dictionary_key then there is a missing key. Push onto olist for 557 // If state is st_dictionary_key then there is a missing key. Push onto olist for
557 // processing once the tt_dict_close token has been found. 558 // processing once the tt_dict_close token has been found.
558 - frame->olist.emplace_back(null_obj); 559 + frame_->olist.emplace_back(null_obj);
559 } else { 560 } else {
560 - if (auto res = frame->dict.insert_or_assign(frame->key, null_obj); !res.second) {  
561 - warnDuplicateKey(); 561 + if (auto res = frame_->dict.insert_or_assign(frame_->key, null_obj); !res.second) {
  562 + warn_duplicate_key();
562 } 563 }
563 - frame->state = st_dictionary_key; 564 + frame_->state = st_dictionary_key;
564 } 565 }
565 - ++frame->null_count; 566 + ++frame_->null_count;
566 } 567 }
567 568
568 void 569 void
569 -QPDFParser::add_bad_null(std::string const& msg) 570 +Parser::add_bad_null(std::string const& msg)
570 { 571 {
571 warn(msg); 572 warn(msg);
572 check_too_many_bad_tokens(); 573 check_too_many_bad_tokens();
573 - addNull(); 574 + add_null();
574 } 575 }
575 576
576 void 577 void
577 -QPDFParser::addInt(int count) 578 +Parser::add_int(int count)
578 { 579 {
579 - auto obj = QPDFObject::create<QPDF_Integer>(int_buffer[count % 2]);  
580 - obj->setDescription(context, description, last_offset_buffer[count % 2]); 580 + auto obj = QPDFObject::create<QPDF_Integer>(int_buffer_[count % 2]);
  581 + obj->setDescription(context_, description_, last_offset_buffer_[count % 2]);
581 add(std::move(obj)); 582 add(std::move(obj));
582 } 583 }
583 584
584 template <typename T, typename... Args> 585 template <typename T, typename... Args>
585 void 586 void
586 -QPDFParser::addScalar(Args&&... args) 587 +Parser::add_scalar(Args&&... args)
587 { 588 {
588 - auto limit = Limits::parser_max_container_size(bad_count || sanity_checks);  
589 - if (frame->olist.size() >= limit || frame->dict.size() >= limit) { 589 + auto limit = Limits::parser_max_container_size(bad_count_ || sanity_checks_);
  590 + if (frame_->olist.size() >= limit || frame_->dict.size() >= limit) {
590 // Stop adding scalars. We are going to abort when the close token or a bad token is 591 // Stop adding scalars. We are going to abort when the close token or a bad token is
591 // encountered. 592 // encountered.
592 - max_bad_count = 1; 593 + max_bad_count_ = 1;
593 check_too_many_bad_tokens(); // always throws Error() 594 check_too_many_bad_tokens(); // always throws Error()
594 } 595 }
595 auto obj = QPDFObject::create<T>(std::forward<Args>(args)...); 596 auto obj = QPDFObject::create<T>(std::forward<Args>(args)...);
596 - obj->setDescription(context, description, input.getLastOffset()); 597 + obj->setDescription(context_, description_, input_.getLastOffset());
597 add(std::move(obj)); 598 add(std::move(obj));
598 } 599 }
599 600
600 template <typename T, typename... Args> 601 template <typename T, typename... Args>
601 QPDFObjectHandle 602 QPDFObjectHandle
602 -QPDFParser::withDescription(Args&&... args) 603 +Parser::with_description(Args&&... args)
603 { 604 {
604 auto obj = QPDFObject::create<T>(std::forward<Args>(args)...); 605 auto obj = QPDFObject::create<T>(std::forward<Args>(args)...);
605 - obj->setDescription(context, description, start); 606 + obj->setDescription(context_, description_, start_);
606 return {obj}; 607 return {obj};
607 } 608 }
608 609
609 void 610 void
610 -QPDFParser::setDescription(ObjectPtr& obj, qpdf_offset_t parsed_offset) 611 +Parser::set_description(ObjectPtr& obj, qpdf_offset_t parsed_offset)
611 { 612 {
612 if (obj) { 613 if (obj) {
613 - obj->setDescription(context, description, parsed_offset); 614 + obj->setDescription(context_, description_, parsed_offset);
614 } 615 }
615 } 616 }
616 617
617 void 618 void
618 -QPDFParser::fixMissingKeys() 619 +Parser::fix_missing_keys()
619 { 620 {
620 std::set<std::string> names; 621 std::set<std::string> names;
621 - for (auto& obj: frame->olist) { 622 + for (auto& obj: frame_->olist) {
622 if (obj.raw_type_code() == ::ot_name) { 623 if (obj.raw_type_code() == ::ot_name) {
623 names.insert(obj.obj_sp()->getStringValue()); 624 names.insert(obj.obj_sp()->getStringValue());
624 } 625 }
625 } 626 }
626 int next_fake_key = 1; 627 int next_fake_key = 1;
627 - for (auto const& item: frame->olist) { 628 + for (auto const& item: frame_->olist) {
628 while (true) { 629 while (true) {
629 const std::string key = "/QPDFFake" + std::to_string(next_fake_key++); 630 const std::string key = "/QPDFFake" + std::to_string(next_fake_key++);
630 - const bool found_fake = !frame->dict.contains(key) && !names.contains(key); 631 + const bool found_fake = !frame_->dict.contains(key) && !names.contains(key);
631 QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1)); 632 QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1));
632 if (found_fake) { 633 if (found_fake) {
633 warn( 634 warn(
634 - frame->offset, 635 + frame_->offset,
635 "expected dictionary key but found non-name object; inserting key " + key); 636 "expected dictionary key but found non-name object; inserting key " + key);
636 - frame->dict[key] = item; 637 + frame_->dict[key] = item;
637 break; 638 break;
638 } 639 }
639 } 640 }
@@ -641,11 +642,11 @@ QPDFParser::fixMissingKeys() @@ -641,11 +642,11 @@ QPDFParser::fixMissingKeys()
641 } 642 }
642 643
643 void 644 void
644 -QPDFParser::check_too_many_bad_tokens() 645 +Parser::check_too_many_bad_tokens()
645 { 646 {
646 - auto limit = Limits::parser_max_container_size(bad_count || sanity_checks);  
647 - if (frame->olist.size() >= limit || frame->dict.size() >= limit) {  
648 - if (bad_count) { 647 + auto limit = Limits::parser_max_container_size(bad_count_ || sanity_checks_);
  648 + if (frame_->olist.size() >= limit || frame_->dict.size() >= limit) {
  649 + if (bad_count_) {
649 limits_error( 650 limits_error(
650 "parser-max-container-size-damaged", 651 "parser-max-container-size-damaged",
651 "encountered errors while parsing an array or dictionary with more than " + 652 "encountered errors while parsing an array or dictionary with more than " +
@@ -656,27 +657,27 @@ QPDFParser::check_too_many_bad_tokens() @@ -656,27 +657,27 @@ QPDFParser::check_too_many_bad_tokens()
656 "encountered an array or dictionary with more than " + std::to_string(limit) + 657 "encountered an array or dictionary with more than " + std::to_string(limit) +
657 " elements during xref recovery; giving up on reading object"); 658 " elements during xref recovery; giving up on reading object");
658 } 659 }
659 - if (max_bad_count && --max_bad_count == 0) { 660 + if (max_bad_count_ && --max_bad_count_ == 0) {
660 limits_error( 661 limits_error(
661 "parser-max-errors", "too many errors during parsing; treating object as null"); 662 "parser-max-errors", "too many errors during parsing; treating object as null");
662 } 663 }
663 - if (good_count > 4) {  
664 - good_count = 0;  
665 - bad_count = 1; 664 + if (good_count_ > 4) {
  665 + good_count_ = 0;
  666 + bad_count_ = 1;
666 return; 667 return;
667 } 668 }
668 - if (++bad_count > 5 ||  
669 - (frame->state != st_array && std::cmp_less(max_bad_count, frame->olist.size()))) { 669 + if (++bad_count_ > 5 ||
  670 + (frame_->state != st_array && std::cmp_less(max_bad_count_, frame_->olist.size()))) {
670 // Give up after 5 errors in close proximity or if the number of missing dictionary keys 671 // Give up after 5 errors in close proximity or if the number of missing dictionary keys
671 // exceeds the remaining number of allowable total errors. 672 // exceeds the remaining number of allowable total errors.
672 warn("too many errors; giving up on reading object"); 673 warn("too many errors; giving up on reading object");
673 throw Error(); 674 throw Error();
674 } 675 }
675 - good_count = 0; 676 + good_count_ = 0;
676 } 677 }
677 678
678 void 679 void
679 -QPDFParser::limits_error(std::string const& limit, std::string const& msg) 680 +Parser::limits_error(std::string const& limit, std::string const& msg)
680 { 681 {
681 Limits::error(); 682 Limits::error();
682 warn("limits error("s + limit + "): " + msg); 683 warn("limits error("s + limit + "): " + msg);
@@ -684,40 +685,41 @@ QPDFParser::limits_error(std::string const& limit, std::string const& msg) @@ -684,40 +685,41 @@ QPDFParser::limits_error(std::string const& limit, std::string const& msg)
684 } 685 }
685 686
686 void 687 void
687 -QPDFParser::warn(QPDFExc const& e) const 688 +Parser::warn(QPDFExc const& e) const
688 { 689 {
689 // If parsing on behalf of a QPDF object and want to give a warning, we can warn through the 690 // If parsing on behalf of a QPDF object and want to give a warning, we can warn through the
690 // object. If parsing for some other reason, such as an explicit creation of an object from a 691 // object. If parsing for some other reason, such as an explicit creation of an object from a
691 // string, then just throw the exception. 692 // string, then just throw the exception.
692 - if (context) {  
693 - context->warn(e); 693 + if (context_) {
  694 + context_->warn(e);
694 } else { 695 } else {
695 throw e; 696 throw e;
696 } 697 }
697 } 698 }
698 699
699 void 700 void
700 -QPDFParser::warnDuplicateKey() 701 +Parser::warn_duplicate_key()
701 { 702 {
702 warn( 703 warn(
703 - frame->offset,  
704 - "dictionary has duplicated key " + frame->key + "; last occurrence overrides earlier ones"); 704 + frame_->offset,
  705 + "dictionary has duplicated key " + frame_->key +
  706 + "; last occurrence overrides earlier ones");
705 } 707 }
706 708
707 void 709 void
708 -QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const 710 +Parser::warn(qpdf_offset_t offset, std::string const& msg) const
709 { 711 {
710 - if (stream_id) {  
711 - std::string descr = "object "s + std::to_string(obj_id) + " 0";  
712 - std::string name = context->getFilename() + " object stream " + std::to_string(stream_id); 712 + if (stream_id_) {
  713 + std::string descr = "object "s + std::to_string(obj_id_) + " 0";
  714 + std::string name = context_->getFilename() + " object stream " + std::to_string(stream_id_);
713 warn(QPDFExc(qpdf_e_damaged_pdf, name, descr, offset, msg)); 715 warn(QPDFExc(qpdf_e_damaged_pdf, name, descr, offset, msg));
714 } else { 716 } else {
715 - warn(QPDFExc(qpdf_e_damaged_pdf, input.getName(), object_description, offset, msg)); 717 + warn(QPDFExc(qpdf_e_damaged_pdf, input_.getName(), object_description_, offset, msg));
716 } 718 }
717 } 719 }
718 720
719 void 721 void
720 -QPDFParser::warn(std::string const& msg) const 722 +Parser::warn(std::string const& msg) const
721 { 723 {
722 - warn(input.getLastOffset(), msg); 724 + warn(input_.getLastOffset(), msg);
723 } 725 }
libqpdf/QPDF_objects.cc
@@ -25,6 +25,7 @@ using namespace qpdf; @@ -25,6 +25,7 @@ using namespace qpdf;
25 using namespace std::literals; 25 using namespace std::literals;
26 26
27 using Objects = QPDF::Doc::Objects; 27 using Objects = QPDF::Doc::Objects;
  28 +using Parser = impl::Parser;
28 29
29 QPDFXRefEntry::QPDFXRefEntry() = default; 30 QPDFXRefEntry::QPDFXRefEntry() = default;
30 31
@@ -1287,7 +1288,7 @@ Objects::readTrailer() @@ -1287,7 +1288,7 @@ Objects::readTrailer()
1287 { 1288 {
1288 qpdf_offset_t offset = m->file->tell(); 1289 qpdf_offset_t offset = m->file->tell();
1289 auto object = 1290 auto object =
1290 - QPDFParser::parse(*m->file, "trailer", m->tokenizer, nullptr, qpdf, m->reconstructed_xref); 1291 + Parser::parse(*m->file, "trailer", m->tokenizer, nullptr, qpdf, m->reconstructed_xref);
1291 if (object.isDictionary() && m->objects.readToken(*m->file).isWord("stream")) { 1292 if (object.isDictionary() && m->objects.readToken(*m->file).isWord("stream")) {
1292 warn(damagedPDF("trailer", m->file->tell(), "stream keyword found in trailer")); 1293 warn(damagedPDF("trailer", m->file->tell(), "stream keyword found in trailer"));
1293 } 1294 }
@@ -1304,7 +1305,7 @@ Objects::readObject(std::string const& description, QPDFObjGen og) @@ -1304,7 +1305,7 @@ Objects::readObject(std::string const& description, QPDFObjGen og)
1304 1305
1305 StringDecrypter decrypter{&qpdf, og}; 1306 StringDecrypter decrypter{&qpdf, og};
1306 StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr; 1307 StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr;
1307 - auto object = QPDFParser::parse( 1308 + auto object = Parser::parse(
1308 *m->file, 1309 *m->file,
1309 m->last_object_description, 1310 m->last_object_description,
1310 m->tokenizer, 1311 m->tokenizer,
@@ -1834,7 +1835,7 @@ Objects::resolveObjectsInStream(int obj_stream_number) @@ -1834,7 +1835,7 @@ Objects::resolveObjectsInStream(int obj_stream_number)
1834 if (entry != m->xref_table.end() && entry->second.getType() == 2 && 1835 if (entry != m->xref_table.end() && entry->second.getType() == 2 &&
1835 entry->second.getObjStreamNumber() == obj_stream_number) { 1836 entry->second.getObjStreamNumber() == obj_stream_number) {
1836 is::OffsetBuffer in("", {b_start + obj_offset, obj_size}, obj_offset); 1837 is::OffsetBuffer in("", {b_start + obj_offset, obj_size}, obj_offset);
1837 - if (auto oh = QPDFParser::parse(in, obj_stream_number, obj_id, m->tokenizer, qpdf)) { 1838 + if (auto oh = Parser::parse(in, obj_stream_number, obj_id, m->tokenizer, qpdf)) {
1838 updateCache(og, oh.obj_sp(), end_before_space, end_after_space); 1839 updateCache(og, oh.obj_sp(), end_before_space, end_after_space);
1839 } 1840 }
1840 } else { 1841 } else {
libqpdf/qpdf/QPDFParser.hh
@@ -13,153 +13,277 @@ @@ -13,153 +13,277 @@
13 using namespace qpdf; 13 using namespace qpdf;
14 using namespace qpdf::global; 14 using namespace qpdf::global;
15 15
16 -class QPDFParser 16 +namespace qpdf::impl
17 { 17 {
18 - public:  
19 - class Error: public std::exception 18 + /// @class Parser
  19 + /// @brief Internal parser for PDF objects and content streams.
  20 + /// @par
  21 + /// The Parser class provides static methods for parsing PDF objects from input sources.
  22 + /// It handles tokenization, error recovery, and object construction with proper offset
  23 + /// tracking and description for error reporting.
  24 + class Parser
20 { 25 {
21 public: 26 public:
22 - Error() = default;  
23 - virtual ~Error() noexcept = default;  
24 - }; 27 + /// @brief Exception thrown when parser encounters an unrecoverable error.
  28 + class Error: public std::exception
  29 + {
  30 + public:
  31 + Error() = default;
  32 + virtual ~Error() noexcept = default;
  33 + };
25 34
26 - static QPDFObjectHandle  
27 - parse(InputSource& input, std::string const& object_description, QPDF* context);  
28 -  
29 - static QPDFObjectHandle parse_content(  
30 - InputSource& input,  
31 - std::shared_ptr<QPDFObject::Description> sp_description,  
32 - qpdf::Tokenizer& tokenizer,  
33 - QPDF* context);  
34 -  
35 - // For use by deprecated QPDFObjectHandle::parse.  
36 - static QPDFObjectHandle parse(  
37 - InputSource& input,  
38 - std::string const& object_description,  
39 - QPDFTokenizer& tokenizer,  
40 - bool& empty,  
41 - QPDFObjectHandle::StringDecrypter* decrypter,  
42 - QPDF* context);  
43 -  
44 - // For use by QPDF.  
45 - static QPDFObjectHandle parse(  
46 - InputSource& input,  
47 - std::string const& object_description,  
48 - qpdf::Tokenizer& tokenizer,  
49 - QPDFObjectHandle::StringDecrypter* decrypter,  
50 - QPDF& context,  
51 - bool sanity_checks);  
52 -  
53 - static QPDFObjectHandle parse(  
54 - qpdf::is::OffsetBuffer& input,  
55 - int stream_id,  
56 - int obj_id,  
57 - qpdf::Tokenizer& tokenizer,  
58 - QPDF& context);  
59 -  
60 - static std::shared_ptr<QPDFObject::Description>  
61 - make_description(std::string const& input_name, std::string const& object_description)  
62 - {  
63 - using namespace std::literals;  
64 - return std::make_shared<QPDFObject::Description>(  
65 - input_name + ", " + object_description + " at offset $PO");  
66 - }  
67 -  
68 - private:  
69 - QPDFParser(  
70 - InputSource& input,  
71 - std::shared_ptr<QPDFObject::Description> sp_description,  
72 - std::string const& object_description,  
73 - qpdf::Tokenizer& tokenizer,  
74 - QPDFObjectHandle::StringDecrypter* decrypter,  
75 - QPDF* context,  
76 - bool parse_pdf,  
77 - int stream_id = 0,  
78 - int obj_id = 0,  
79 - bool sanity_checks = false) :  
80 - input(input),  
81 - object_description(object_description),  
82 - tokenizer(tokenizer),  
83 - decrypter(decrypter),  
84 - context(context),  
85 - description(std::move(sp_description)),  
86 - parse_pdf(parse_pdf),  
87 - stream_id(stream_id),  
88 - obj_id(obj_id),  
89 - sanity_checks(sanity_checks)  
90 - {  
91 - } 35 + /// @brief Parse a PDF object from an input source.
  36 + /// @param input The input source to read from.
  37 + /// @param object_description Description of the object for error messages.
  38 + /// @param context The QPDF context, or nullptr if parsing standalone.
  39 + /// @return The parsed QPDFObjectHandle, or null if parsing fails.
  40 + static QPDFObjectHandle
  41 + parse(InputSource& input, std::string const& object_description, QPDF* context);
92 42
93 - // Parser state. Note:  
94 - // state <= st_dictionary_value == (state = st_dictionary_key || state = st_dictionary_value)  
95 - enum parser_state_e { st_dictionary_key, st_dictionary_value, st_array }; 43 + /// @brief Parse a content stream from an input source.
  44 + /// @param input The input source to read from.
  45 + /// @param sp_description Shared pointer to object description.
  46 + /// @param tokenizer The tokenizer to use for parsing.
  47 + /// @param context The QPDF context.
  48 + /// @return The parsed QPDFObjectHandle, or uninitialized handle on EOF.
  49 + static QPDFObjectHandle parse_content(
  50 + InputSource& input,
  51 + std::shared_ptr<QPDFObject::Description> sp_description,
  52 + qpdf::Tokenizer& tokenizer,
  53 + QPDF* context);
96 54
97 - struct StackFrame  
98 - {  
99 - StackFrame(InputSource& input, parser_state_e state) :  
100 - state(state),  
101 - offset(input.tell()) 55 + /// @brief Parse a PDF object (interface for deprecated QPDFObjectHandle::parse).
  56 + /// @param input The input source to read from.
  57 + /// @param object_description Description of the object for error messages.
  58 + /// @param tokenizer The tokenizer to use for parsing.
  59 + /// @param empty Output parameter indicating if object was empty.
  60 + /// @param decrypter String decrypter for encrypted strings, or nullptr.
  61 + /// @param context The QPDF context, or nullptr if parsing standalone.
  62 + /// @return The parsed QPDFObjectHandle.
  63 + static QPDFObjectHandle parse(
  64 + InputSource& input,
  65 + std::string const& object_description,
  66 + QPDFTokenizer& tokenizer,
  67 + bool& empty,
  68 + QPDFObjectHandle::StringDecrypter* decrypter,
  69 + QPDF* context);
  70 +
  71 + /// @brief Parse a PDF object for use by QPDF.
  72 + /// @param input The input source to read from.
  73 + /// @param object_description Description of the object for error messages.
  74 + /// @param tokenizer The tokenizer to use for parsing.
  75 + /// @param decrypter String decrypter for encrypted strings, or nullptr.
  76 + /// @param context The QPDF context.
  77 + /// @param sanity_checks Enable additional sanity checks during parsing.
  78 + /// @return The parsed QPDFObjectHandle.
  79 + static QPDFObjectHandle parse(
  80 + InputSource& input,
  81 + std::string const& object_description,
  82 + qpdf::Tokenizer& tokenizer,
  83 + QPDFObjectHandle::StringDecrypter* decrypter,
  84 + QPDF& context,
  85 + bool sanity_checks);
  86 +
  87 + /// @brief Parse an object from an object stream.
  88 + /// @param input The offset buffer containing the object data.
  89 + /// @param stream_id The object stream number.
  90 + /// @param obj_id The object ID within the stream.
  91 + /// @param tokenizer The tokenizer to use for parsing.
  92 + /// @param context The QPDF context.
  93 + /// @return The parsed QPDFObjectHandle.
  94 + static QPDFObjectHandle parse(
  95 + qpdf::is::OffsetBuffer& input,
  96 + int stream_id,
  97 + int obj_id,
  98 + qpdf::Tokenizer& tokenizer,
  99 + QPDF& context);
  100 +
  101 + /// @brief Create a description for a parsed object.
  102 + /// @param input_name The name of the input source.
  103 + /// @param object_description Description of the object being parsed.
  104 + /// @return Shared pointer to object description with offset placeholder.
  105 + static std::shared_ptr<QPDFObject::Description>
  106 + make_description(std::string const& input_name, std::string const& object_description)
102 { 107 {
  108 + using namespace std::literals;
  109 + return std::make_shared<QPDFObject::Description>(
  110 + input_name + ", " + object_description + " at offset $PO");
103 } 111 }
104 112
105 - std::vector<QPDFObjectHandle> olist;  
106 - std::map<std::string, QPDFObjectHandle> dict;  
107 - parser_state_e state;  
108 - std::string key;  
109 - qpdf_offset_t offset;  
110 - std::string contents_string;  
111 - qpdf_offset_t contents_offset{-1};  
112 - int null_count{0};  
113 - }; 113 + private:
  114 + /// @brief Construct a parser instance.
  115 + /// @param input The input source to read from.
  116 + /// @param sp_description Shared pointer to object description.
  117 + /// @param object_description Description string for error messages.
  118 + /// @param tokenizer The tokenizer to use for parsing.
  119 + /// @param decrypter String decrypter for encrypted content.
  120 + /// @param context The QPDF context.
  121 + /// @param parse_pdf Whether parsing PDF objects (vs content streams).
  122 + /// @param stream_id Object stream ID for object stream parsing.
  123 + /// @param obj_id Object ID within object stream.
  124 + /// @param sanity_checks Enable additional sanity checks.
  125 + Parser(
  126 + InputSource& input,
  127 + std::shared_ptr<QPDFObject::Description> sp_description,
  128 + std::string const& object_description,
  129 + qpdf::Tokenizer& tokenizer,
  130 + QPDFObjectHandle::StringDecrypter* decrypter,
  131 + QPDF* context,
  132 + bool parse_pdf,
  133 + int stream_id = 0,
  134 + int obj_id = 0,
  135 + bool sanity_checks = false) :
  136 + input_(input),
  137 + object_description_(object_description),
  138 + tokenizer_(tokenizer),
  139 + decrypter_(decrypter),
  140 + context_(context),
  141 + description_(std::move(sp_description)),
  142 + parse_pdf_(parse_pdf),
  143 + stream_id_(stream_id),
  144 + obj_id_(obj_id),
  145 + sanity_checks_(sanity_checks)
  146 + {
  147 + }
  148 +
  149 + /// @brief Parser state enumeration.
  150 + /// @note state <= st_dictionary_value indicates we're in a dictionary context.
  151 + enum parser_state_e { st_dictionary_key, st_dictionary_value, st_array };
  152 +
  153 + /// @brief Stack frame for tracking nested arrays and dictionaries.
  154 + struct StackFrame
  155 + {
  156 + StackFrame(InputSource& input, parser_state_e state) :
  157 + state(state),
  158 + offset(input.tell())
  159 + {
  160 + }
  161 +
  162 + std::vector<QPDFObjectHandle> olist; ///< Object list for arrays/dict values
  163 + std::map<std::string, QPDFObjectHandle> dict; ///< Dictionary entries
  164 + parser_state_e state; ///< Current parser state
  165 + std::string key; ///< Current dictionary key
  166 + qpdf_offset_t offset; ///< Offset of container start
  167 + std::string contents_string; ///< For /Contents field in signatures
  168 + qpdf_offset_t contents_offset{-1}; ///< Offset of /Contents value
  169 + int null_count{0}; ///< Count of null values in container
  170 + };
  171 +
  172 + /// @brief Parse an object, handling exceptions and returning null on error.
  173 + /// @param content_stream True if parsing a content stream.
  174 + /// @return The parsed object handle, or null/uninitialized on error.
  175 + QPDFObjectHandle parse(bool content_stream = false);
  176 +
  177 + /// @brief Parse the first token and dispatch to appropriate handler.
  178 + /// @param content_stream True if parsing a content stream.
  179 + /// @return The parsed object handle.
  180 + QPDFObjectHandle parse_first(bool content_stream);
  181 +
  182 + /// @brief Parse the remainder of a composite object (array/dict/reference).
  183 + /// @param content_stream True if parsing a content stream.
  184 + /// @return The completed object handle.
  185 + QPDFObjectHandle parse_remainder(bool content_stream);
  186 +
  187 + /// @brief Add an object to the current container.
  188 + /// @param obj The object to add.
  189 + void add(std::shared_ptr<QPDFObject>&& obj);
114 190
115 - QPDFObjectHandle parse(bool content_stream = false);  
116 - QPDFObjectHandle parse_first(bool content_stream);  
117 - QPDFObjectHandle parseRemainder(bool content_stream);  
118 - void add(std::shared_ptr<QPDFObject>&& obj);  
119 - void addNull();  
120 - void add_bad_null(std::string const& msg);  
121 - void addInt(int count);  
122 - template <typename T, typename... Args>  
123 - void addScalar(Args&&... args);  
124 - void check_too_many_bad_tokens();  
125 - void warnDuplicateKey();  
126 - void fixMissingKeys();  
127 - [[noreturn]] void limits_error(std::string const& limit, std::string const& msg);  
128 - void warn(qpdf_offset_t offset, std::string const& msg) const;  
129 - void warn(std::string const& msg) const;  
130 - void warn(QPDFExc const&) const;  
131 - template <typename T, typename... Args>  
132 - // Create a new scalar object complete with parsed offset and description.  
133 - // NB the offset includes any leading whitespace.  
134 - QPDFObjectHandle withDescription(Args&&... args);  
135 - void setDescription(std::shared_ptr<QPDFObject>& obj, qpdf_offset_t parsed_offset);  
136 - InputSource& input;  
137 - std::string const& object_description;  
138 - qpdf::Tokenizer& tokenizer;  
139 - QPDFObjectHandle::StringDecrypter* decrypter;  
140 - QPDF* context;  
141 - std::shared_ptr<QPDFObject::Description> description;  
142 - bool parse_pdf{false};  
143 - int stream_id{0};  
144 - int obj_id{0};  
145 - bool sanity_checks{false};  
146 -  
147 - std::vector<StackFrame> stack;  
148 - StackFrame* frame{nullptr};  
149 - // Number of recent bad tokens. This will always be > 0 once a bad token has been encountered as  
150 - // it only gets incremented or reset when a bad token is encountered.  
151 - int bad_count{0};  
152 - // Number of bad tokens (remaining) before giving up.  
153 - uint32_t max_bad_count{Limits::parser_max_errors()};  
154 - // Number of good tokens since last bad token. Irrelevant if bad_count == 0.  
155 - int good_count{0};  
156 - // Start offset including any leading whitespace.  
157 - qpdf_offset_t start{0};  
158 - // Number of successive integer tokens.  
159 - int int_count{0};  
160 - long long int_buffer[2]{0, 0};  
161 - qpdf_offset_t last_offset_buffer[2]{0, 0};  
162 - bool empty_{false};  
163 -}; 191 + /// @brief Add a null object to the current container.
  192 + void add_null();
  193 +
  194 + /// @brief Add a null with a warning message.
  195 + /// @param msg Warning message describing the error.
  196 + void add_bad_null(std::string const& msg);
  197 +
  198 + /// @brief Add a buffered integer from int_buffer_.
  199 + /// @param count Buffer index (1 or 2) to read from.
  200 + void add_int(int count);
  201 +
  202 + /// @brief Create and add a scalar object to the current container.
  203 + /// @tparam T The scalar object type (e.g., QPDF_Integer, QPDF_String).
  204 + /// @tparam Args Constructor argument types.
  205 + /// @param args Arguments to forward to the object constructor.
  206 + template <typename T, typename... Args>
  207 + void add_scalar(Args&&... args);
  208 +
  209 + /// @brief Check if too many bad tokens have been encountered and throw if so.
  210 + void check_too_many_bad_tokens();
  211 +
  212 + /// @brief Issue a warning about a duplicate dictionary key.
  213 + void warn_duplicate_key();
  214 +
  215 + /// @brief Fix dictionaries with missing keys by generating fake keys.
  216 + void fix_missing_keys();
  217 +
  218 + /// @brief Report a limits error and throw.
  219 + /// @param limit The limit identifier.
  220 + /// @param msg Error message.
  221 + [[noreturn]] void limits_error(std::string const& limit, std::string const& msg);
  222 +
  223 + /// @brief Issue a warning at a specific offset.
  224 + /// @param offset File offset for the warning.
  225 + /// @param msg Warning message.
  226 + void warn(qpdf_offset_t offset, std::string const& msg) const;
  227 +
  228 + /// @brief Issue a warning at the current offset.
  229 + /// @param msg Warning message.
  230 + void warn(std::string const& msg) const;
  231 +
  232 + /// @brief Issue a warning from a QPDFExc exception.
  233 + /// @param e The exception to report.
  234 + void warn(QPDFExc const& e) const;
  235 +
  236 + /// @brief Create a scalar object with description and parsed offset.
  237 + /// @tparam T The scalar object type.
  238 + /// @tparam Args Constructor argument types.
  239 + /// @param args Arguments to forward to the object constructor.
  240 + /// @return Object handle with description and offset set.
  241 + /// @note The offset includes any leading whitespace.
  242 + template <typename T, typename... Args>
  243 + QPDFObjectHandle with_description(Args&&... args);
  244 +
  245 + /// @brief Set the description and offset on an existing object.
  246 + /// @param obj The object to update.
  247 + /// @param parsed_offset The file offset where the object was parsed.
  248 + void set_description(std::shared_ptr<QPDFObject>& obj, qpdf_offset_t parsed_offset);
  249 +
  250 + // Core parsing state
  251 + InputSource& input_; ///< Input source to read from
  252 + std::string const& object_description_; ///< Description for error messages
  253 + qpdf::Tokenizer& tokenizer_; ///< Tokenizer for lexical analysis
  254 + QPDFObjectHandle::StringDecrypter* decrypter_; ///< Decrypter for encrypted strings
  255 + QPDF* context_; ///< QPDF context for object resolution
  256 + std::shared_ptr<QPDFObject::Description> description_; ///< Shared description for objects
  257 + bool parse_pdf_{false}; ///< True if parsing PDF objects vs content streams
  258 + int stream_id_{0}; ///< Object stream ID (for object stream parsing)
  259 + int obj_id_{0}; ///< Object ID within object stream
  260 + bool sanity_checks_{false}; ///< Enable additional validation checks
  261 +
  262 + // Composite object parsing state
  263 + std::vector<StackFrame> stack_; ///< Stack of nested containers
  264 + StackFrame* frame_{nullptr}; ///< Current stack frame pointer
  265 +
  266 + // Error tracking state
  267 + /// Number of recent bad tokens. Always > 0 after first bad token encountered.
  268 + int bad_count_{0};
  269 + /// Number of bad tokens remaining before giving up.
  270 + uint32_t max_bad_count_{Limits::parser_max_errors()};
  271 + /// Number of good tokens since last bad token. Irrelevant if bad_count_ == 0.
  272 + int good_count_{0};
  273 +
  274 + // Token buffering state
  275 + /// Start offset of current object, including any leading whitespace.
  276 + qpdf_offset_t start_{0};
  277 + /// Number of successive integer tokens (for indirect reference detection).
  278 + int int_count_{0};
  279 + /// Buffer for up to 2 integer tokens.
  280 + long long int_buffer_[2]{0, 0};
  281 + /// Offsets corresponding to buffered integers.
  282 + qpdf_offset_t last_offset_buffer_[2]{0, 0};
  283 +
  284 + /// True if object was empty (endobj without content).
  285 + bool empty_{false};
  286 + };
  287 +} // namespace qpdf::impl
164 288
165 #endif // QPDFPARSER_HH 289 #endif // QPDFPARSER_HH
qpdf/qtest/qpdf/parse-object.out
1 [ /name 16059 3.14159 false << /key true /other [ (string1) (string2) ] >> null ] 1 [ /name 16059 3.14159 false << /key true /other [ (string1) (string2) ] >> null ]
2 -logic error parsing indirect: QPDFParser::parse called without context on an object with indirect references 2 +logic error parsing indirect: Parser::parse called without context on an object with indirect references
3 trailing data: parsed object (trailing test): trailing data found parsing object from string 3 trailing data: parsed object (trailing test): trailing data found parsing object from string
4 WARNING: parsed object (offset 9): unknown token while reading object; treating as string 4 WARNING: parsed object (offset 9): unknown token while reading object; treating as string
5 WARNING: parsed object: treating unexpected brace token as null 5 WARNING: parsed object: treating unexpected brace token as null