diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh index 10a5244..9fef4e6 100644 --- a/include/qpdf/QPDFObjectHandle.hh +++ b/include/qpdf/QPDFObjectHandle.hh @@ -61,11 +61,14 @@ class QPDFTokenizer; class QPDFExc; class Pl_QPDFTokenizer; class QPDFMatrix; -class QPDFParser; +namespace qpdf::impl +{ + class Parser; +} class QPDFObjectHandle: public qpdf::BaseHandle { - friend class QPDFParser; + friend class qpdf::impl::Parser; public: // This class is used by replaceStreamData. It provides an alternative way of associating diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh index d94eb02..94dae1a 100644 --- a/include/qpdf/QPDFTokenizer.hh +++ b/include/qpdf/QPDFTokenizer.hh @@ -31,6 +31,10 @@ namespace qpdf { class Tokenizer; + namespace impl + { + class Parser; + } } // namespace qpdf class QPDFTokenizer @@ -203,7 +207,7 @@ class QPDFTokenizer void expectInlineImage(InputSource& input); private: - friend class QPDFParser; + friend class qpdf::impl::Parser; QPDFTokenizer(QPDFTokenizer const&) = delete; QPDFTokenizer& operator=(QPDFTokenizer const&) = delete; diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc index 2cafbab..5bad81b 100644 --- a/libqpdf/QPDFObjectHandle.cc +++ b/libqpdf/QPDFObjectHandle.cc @@ -25,6 +25,8 @@ using namespace std::literals; using namespace qpdf; +using Parser = impl::Parser; + const Null Null::temp_; BaseHandle:: @@ -1540,7 +1542,7 @@ QPDFObjectHandle::parse( QPDF* context, std::string const& object_str, std::string const& object_description) { auto input = is::OffsetBuffer("parsed object", object_str); - auto result = QPDFParser::parse(input, object_description, context); + auto result = Parser::parse(input, object_description, context); size_t offset = QIntC::to_size(input.tell()); while (offset < object_str.length()) { if (!isspace(object_str.at(offset))) { @@ -1661,7 +1663,7 @@ QPDFObjectHandle::parseContentStream_data( auto input = is::OffsetBuffer(description, stream_data); Tokenizer tokenizer; tokenizer.allowEOF(); - auto sp_description = QPDFParser::make_description(description, "content"); + auto sp_description = Parser::make_description(description, "content"); while (QIntC::to_size(input.tell()) < stream_length) { // Read a token and seek to the beginning. The offset we get from this process is the // beginning of the next non-ignorable (space, comment) token. This way, the offset and @@ -1669,7 +1671,7 @@ QPDFObjectHandle::parseContentStream_data( tokenizer.nextToken(input, "content", true); qpdf_offset_t offset = input.getLastOffset(); input.seek(offset, SEEK_SET); - auto obj = QPDFParser::parse_content(input, sp_description, tokenizer, context); + auto obj = Parser::parse_content(input, sp_description, tokenizer, context); if (!obj) { // EOF break; @@ -1678,7 +1680,7 @@ QPDFObjectHandle::parseContentStream_data( if (callbacks) { callbacks->handleObject(obj, QIntC::to_size(offset), length); } - if (obj.isOperator() && (obj.getOperatorValue() == "ID")) { + if (obj.isOperator() && obj.getOperatorValue() == "ID") { // Discard next character; it is the space after ID that terminated the token. Read // until end of inline image. char ch; @@ -1731,7 +1733,7 @@ QPDFObjectHandle::parse( StringDecrypter* decrypter, QPDF* context) { - return QPDFParser::parse(*input, object_description, tokenizer, empty, decrypter, context); + return Parser::parse(*input, object_description, tokenizer, empty, decrypter, context); } qpdf_offset_t diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index 6f53159..6189eb4 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -46,12 +46,13 @@ class QPDF::Doc::ParseGuard }; using ParseGuard = QPDF::Doc::ParseGuard; +using Parser = qpdf::impl::Parser; QPDFObjectHandle -QPDFParser::parse(InputSource& input, std::string const& object_description, QPDF* context) +Parser::parse(InputSource& input, std::string const& object_description, QPDF* context) { qpdf::Tokenizer tokenizer; - if (auto result = QPDFParser( + if (auto result = Parser( input, make_description(input.getName(), object_description), object_description, @@ -66,14 +67,14 @@ QPDFParser::parse(InputSource& input, std::string const& object_description, QPD } QPDFObjectHandle -QPDFParser::parse_content( +Parser::parse_content( InputSource& input, std::shared_ptr sp_description, qpdf::Tokenizer& tokenizer, QPDF* context) { static const std::string content("content"); // GCC12 - make constexpr - auto p = QPDFParser( + auto p = Parser( input, std::move(sp_description), content, @@ -93,7 +94,7 @@ QPDFParser::parse_content( } QPDFObjectHandle -QPDFParser::parse( +Parser::parse( InputSource& input, std::string const& object_description, QPDFTokenizer& tokenizer, @@ -103,7 +104,7 @@ QPDFParser::parse( { // ABI: This parse overload is only used by the deprecated QPDFObjectHandle::parse. It is the // only user of the 'empty' member. When removing this overload also remove 'empty'. - auto p = QPDFParser( + auto p = Parser( input, make_description(input.getName(), object_description), object_description, @@ -120,7 +121,7 @@ QPDFParser::parse( } QPDFObjectHandle -QPDFParser::parse( +Parser::parse( InputSource& input, std::string const& object_description, qpdf::Tokenizer& tokenizer, @@ -128,7 +129,7 @@ QPDFParser::parse( QPDF& context, bool sanity_checks) { - return QPDFParser( + return Parser( input, make_description(input.getName(), object_description), object_description, @@ -143,10 +144,10 @@ QPDFParser::parse( } QPDFObjectHandle -QPDFParser::parse( +Parser::parse( is::OffsetBuffer& input, int stream_id, int obj_id, qpdf::Tokenizer& tokenizer, QPDF& context) { - return QPDFParser( + return Parser( input, std::make_shared( QPDFObject::ObjStreamDescr(stream_id, obj_id)), @@ -161,7 +162,7 @@ QPDFParser::parse( } QPDFObjectHandle -QPDFParser::parse(bool content_stream) +Parser::parse(bool content_stream) { try { return parse_first(content_stream); @@ -178,20 +179,20 @@ QPDFParser::parse(bool content_stream) } QPDFObjectHandle -QPDFParser::parse_first(bool content_stream) +Parser::parse_first(bool content_stream) { // This method must take care not to resolve any objects. Don't check the type of any object // without first ensuring that it is a direct object. Otherwise, doing so may have the side // effect of reading the object and changing the file pointer. If you do this, it will cause a // logic error to be thrown from QPDF::inParse(). - QPDF::Doc::ParseGuard pg(context); - start = input.tell(); - if (!tokenizer.nextToken(input, object_description)) { - warn(tokenizer.getErrorMessage()); + QPDF::Doc::ParseGuard pg(context_); + start_ = input_.tell(); + if (!tokenizer_.nextToken(input_, object_description_)) { + warn(tokenizer_.getErrorMessage()); } - switch (tokenizer.getType()) { + switch (tokenizer_.getType()) { case QPDFTokenizer::tt_eof: if (content_stream) { // In content stream mode, leave object uninitialized to indicate EOF @@ -219,57 +220,57 @@ QPDFParser::parse_first(bool content_stream) case QPDFTokenizer::tt_array_open: case QPDFTokenizer::tt_dict_open: - stack.clear(); - stack.emplace_back( - input, - (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array : st_dictionary_key); - frame = &stack.back(); - return parseRemainder(content_stream); + stack_.clear(); + stack_.emplace_back( + input_, + (tokenizer_.getType() == QPDFTokenizer::tt_array_open) ? st_array : st_dictionary_key); + frame_ = &stack_.back(); + return parse_remainder(content_stream); case QPDFTokenizer::tt_bool: - return withDescription(tokenizer.getValue() == "true"); + return with_description(tokenizer_.getValue() == "true"); case QPDFTokenizer::tt_null: return {QPDFObject::create()}; case QPDFTokenizer::tt_integer: - return withDescription(QUtil::string_to_ll(tokenizer.getValue().c_str())); + return with_description(QUtil::string_to_ll(tokenizer_.getValue().c_str())); case QPDFTokenizer::tt_real: - return withDescription(tokenizer.getValue()); + return with_description(tokenizer_.getValue()); case QPDFTokenizer::tt_name: - return withDescription(tokenizer.getValue()); + return with_description(tokenizer_.getValue()); case QPDFTokenizer::tt_word: { - auto const& value = tokenizer.getValue(); + auto const& value = tokenizer_.getValue(); if (content_stream) { - return withDescription(value); + return with_description(value); } else if (value == "endobj") { // We just saw endobj without having read anything. Nothing in the PDF spec appears // to allow empty objects, but they have been encountered in actual PDF files and // Adobe Reader appears to ignore them. Treat this as a null and do not move the // input source's offset. empty_ = true; - input.seek(input.getLastOffset(), SEEK_SET); + input_.seek(input_.getLastOffset(), SEEK_SET); if (!content_stream) { warn("empty object treated as null"); } return {}; } else { warn("unknown token while reading object; treating as string"); - return withDescription(value); + return with_description(value); } } case QPDFTokenizer::tt_string: - if (decrypter) { - std::string s{tokenizer.getValue()}; - decrypter->decryptString(s); - return withDescription(s); + if (decrypter_) { + std::string s{tokenizer_.getValue()}; + decrypter_->decryptString(s); + return with_description(s); } else { - return withDescription(tokenizer.getValue()); + return with_description(tokenizer_.getValue()); } default: @@ -279,65 +280,65 @@ QPDFParser::parse_first(bool content_stream) } QPDFObjectHandle -QPDFParser::parseRemainder(bool content_stream) +Parser::parse_remainder(bool content_stream) { // This method must take care not to resolve any objects. Don't check the type of any object // without first ensuring that it is a direct object. Otherwise, doing so may have the side // effect of reading the object and changing the file pointer. If you do this, it will cause a // logic error to be thrown from QPDF::inParse(). - bad_count = 0; + bad_count_ = 0; bool b_contents = false; while (true) { - if (!tokenizer.nextToken(input, object_description)) { - warn(tokenizer.getErrorMessage()); + if (!tokenizer_.nextToken(input_, object_description_)) { + warn(tokenizer_.getErrorMessage()); } - ++good_count; // optimistically + ++good_count_; // optimistically - if (int_count != 0) { + if (int_count_ != 0) { // Special handling of indirect references. Treat integer tokens as part of an indirect // reference until proven otherwise. - if (tokenizer.getType() == QPDFTokenizer::tt_integer) { - if (++int_count > 2) { + if (tokenizer_.getType() == QPDFTokenizer::tt_integer) { + if (++int_count_ > 2) { // Process the oldest buffered integer. - addInt(int_count); + add_int(int_count_); } - last_offset_buffer[int_count % 2] = input.getLastOffset(); - int_buffer[int_count % 2] = QUtil::string_to_ll(tokenizer.getValue().c_str()); + last_offset_buffer_[int_count_ % 2] = input_.getLastOffset(); + int_buffer_[int_count_ % 2] = QUtil::string_to_ll(tokenizer_.getValue().c_str()); continue; } else if ( - int_count >= 2 && tokenizer.getType() == QPDFTokenizer::tt_word && - tokenizer.getValue() == "R") { - if (!context) { + int_count_ >= 2 && tokenizer_.getType() == QPDFTokenizer::tt_word && + tokenizer_.getValue() == "R") { + if (!context_) { throw std::logic_error( - "QPDFParser::parse called without context on an object with indirect " + "Parser::parse called without context on an object with indirect " "references"); } - auto id = QIntC::to_int(int_buffer[(int_count - 1) % 2]); - auto gen = QIntC::to_int(int_buffer[(int_count) % 2]); + auto id = QIntC::to_int(int_buffer_[(int_count_ - 1) % 2]); + auto gen = QIntC::to_int(int_buffer_[(int_count_) % 2]); if (!(id < 1 || gen < 0 || gen >= 65535)) { - add(ParseGuard::getObject(context, id, gen, parse_pdf)); + add(ParseGuard::getObject(context_, id, gen, parse_pdf_)); } else { add_bad_null( "treating bad indirect reference (" + std::to_string(id) + " " + std::to_string(gen) + " R) as null"); } - int_count = 0; + int_count_ = 0; continue; - } else if (int_count > 0) { + } else if (int_count_ > 0) { // Process the buffered integers before processing the current token. - if (int_count > 1) { - addInt(int_count - 1); + if (int_count_ > 1) { + add_int(int_count_ - 1); } - addInt(int_count); - int_count = 0; + add_int(int_count_); + int_count_ = 0; } } - switch (tokenizer.getType()) { + switch (tokenizer_.getType()) { case QPDFTokenizer::tt_eof: warn("parse error while reading object"); if (content_stream) { @@ -349,7 +350,7 @@ QPDFParser::parseRemainder(bool content_stream) case QPDFTokenizer::tt_bad: check_too_many_bad_tokens(); - addNull(); + add_null(); continue; case QPDFTokenizer::tt_brace_open: @@ -358,23 +359,23 @@ QPDFParser::parseRemainder(bool content_stream) continue; case QPDFTokenizer::tt_array_close: - if (frame->state == st_array) { - auto object = frame->null_count > 100 - ? QPDFObject::create(std::move(frame->olist), true) - : QPDFObject::create(std::move(frame->olist)); - setDescription(object, frame->offset - 1); + if (frame_->state == st_array) { + auto object = frame_->null_count > 100 + ? QPDFObject::create(std::move(frame_->olist), true) + : QPDFObject::create(std::move(frame_->olist)); + set_description(object, frame_->offset - 1); // The `offset` points to the next of "[". Set the rewind offset to point to the // beginning of "[". This has been explicitly tested with whitespace surrounding the // array start delimiter. getLastOffset points to the array end token and therefore // can't be used here. - if (stack.size() <= 1) { + if (stack_.size() <= 1) { return object; } - stack.pop_back(); - frame = &stack.back(); + stack_.pop_back(); + frame_ = &stack_.back(); add(std::move(object)); } else { - if (sanity_checks) { + if (sanity_checks_) { // During sanity checks, assume nesting of containers is corrupt and object is // unusable. warn("unexpected array close token; giving up on reading object"); @@ -385,46 +386,46 @@ QPDFParser::parseRemainder(bool content_stream) continue; case QPDFTokenizer::tt_dict_close: - if (frame->state <= st_dictionary_value) { + if (frame_->state <= st_dictionary_value) { // Attempt to recover more or less gracefully from invalid dictionaries. - auto& dict = frame->dict; + auto& dict = frame_->dict; - if (frame->state == st_dictionary_value) { + if (frame_->state == st_dictionary_value) { warn( - frame->offset, + frame_->offset, "dictionary ended prematurely; using null as value for last key"); - dict[frame->key] = QPDFObject::create(); + dict[frame_->key] = QPDFObject::create(); } - if (!frame->olist.empty()) { - if (sanity_checks) { + if (!frame_->olist.empty()) { + if (sanity_checks_) { warn( - frame->offset, + frame_->offset, "expected dictionary keys but found non-name objects; ignoring"); } else { - fixMissingKeys(); + fix_missing_keys(); } } - if (!frame->contents_string.empty() && dict.contains("/Type") && + if (!frame_->contents_string.empty() && dict.contains("/Type") && dict["/Type"].isNameAndEquals("/Sig") && dict.contains("/ByteRange") && dict.contains("/Contents") && dict["/Contents"].isString()) { - dict["/Contents"] = QPDFObjectHandle::newString(frame->contents_string); - dict["/Contents"].setParsedOffset(frame->contents_offset); + dict["/Contents"] = QPDFObjectHandle::newString(frame_->contents_string); + dict["/Contents"].setParsedOffset(frame_->contents_offset); } auto object = QPDFObject::create(std::move(dict)); - setDescription(object, frame->offset - 2); + set_description(object, frame_->offset - 2); // The `offset` points to the next of "<<". Set the rewind offset to point to the // beginning of "<<". This has been explicitly tested with whitespace surrounding // the dictionary start delimiter. getLastOffset points to the dictionary end token // and therefore can't be used here. - if (stack.size() <= 1) { + if (stack_.size() <= 1) { return object; } - stack.pop_back(); - frame = &stack.back(); + stack_.pop_back(); + frame_ = &stack_.back(); add(std::move(object)); } else { - if (sanity_checks) { + if (sanity_checks_) { // During sanity checks, assume nesting of containers is corrupt and object is // unusable. warn("unexpected dictionary close token; giving up on reading object"); @@ -436,60 +437,60 @@ QPDFParser::parseRemainder(bool content_stream) case QPDFTokenizer::tt_array_open: case QPDFTokenizer::tt_dict_open: - if (stack.size() > max_nesting) { + if (stack_.size() > max_nesting) { limits_error( "parser-max-nesting", "ignoring excessively deeply nested data structure"); } b_contents = false; - stack.emplace_back( - input, - (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array - : st_dictionary_key); - frame = &stack.back(); + stack_.emplace_back( + input_, + (tokenizer_.getType() == QPDFTokenizer::tt_array_open) ? st_array + : st_dictionary_key); + frame_ = &stack_.back(); continue; case QPDFTokenizer::tt_bool: - addScalar(tokenizer.getValue() == "true"); + add_scalar(tokenizer_.getValue() == "true"); continue; case QPDFTokenizer::tt_null: - addNull(); + add_null(); continue; case QPDFTokenizer::tt_integer: if (!content_stream) { // Buffer token in case it is part of an indirect reference. - last_offset_buffer[1] = input.getLastOffset(); - int_buffer[1] = QUtil::string_to_ll(tokenizer.getValue().c_str()); - int_count = 1; + last_offset_buffer_[1] = input_.getLastOffset(); + int_buffer_[1] = QUtil::string_to_ll(tokenizer_.getValue().c_str()); + int_count_ = 1; } else { - addScalar(QUtil::string_to_ll(tokenizer.getValue().c_str())); + add_scalar(QUtil::string_to_ll(tokenizer_.getValue().c_str())); } continue; case QPDFTokenizer::tt_real: - addScalar(tokenizer.getValue()); + add_scalar(tokenizer_.getValue()); continue; case QPDFTokenizer::tt_name: - if (frame->state == st_dictionary_key) { - frame->key = tokenizer.getValue(); - frame->state = st_dictionary_value; - b_contents = decrypter && frame->key == "/Contents"; + if (frame_->state == st_dictionary_key) { + frame_->key = tokenizer_.getValue(); + frame_->state = st_dictionary_value; + b_contents = decrypter_ && frame_->key == "/Contents"; continue; } else { - addScalar(tokenizer.getValue()); + add_scalar(tokenizer_.getValue()); } continue; case QPDFTokenizer::tt_word: if (content_stream) { - addScalar(tokenizer.getValue()); + add_scalar(tokenizer_.getValue()); continue; } - if (sanity_checks) { - if (tokenizer.getValue() == "endobj" || tokenizer.getValue() == "endstream") { + if (sanity_checks_) { + if (tokenizer_.getValue() == "endobj" || tokenizer_.getValue() == "endstream") { // During sanity checks, assume an unexpected endobj or endstream indicates that // we are parsing past the end of the object. warn( @@ -504,24 +505,24 @@ QPDFParser::parseRemainder(bool content_stream) warn("unknown token while reading object; treating as string"); check_too_many_bad_tokens(); - addScalar(tokenizer.getValue()); + add_scalar(tokenizer_.getValue()); continue; case QPDFTokenizer::tt_string: { - auto const& val = tokenizer.getValue(); - if (decrypter) { + auto const& val = tokenizer_.getValue(); + if (decrypter_) { if (b_contents) { - frame->contents_string = val; - frame->contents_offset = input.getLastOffset(); + frame_->contents_string = val; + frame_->contents_offset = input_.getLastOffset(); b_contents = false; } std::string s{val}; - decrypter->decryptString(s); - addScalar(s); + decrypter_->decryptString(s); + add_scalar(s); } else { - addScalar(val); + add_scalar(val); } } continue; @@ -533,107 +534,107 @@ QPDFParser::parseRemainder(bool content_stream) } void -QPDFParser::add(std::shared_ptr&& obj) +Parser::add(std::shared_ptr&& obj) { - if (frame->state != st_dictionary_value) { + if (frame_->state != st_dictionary_value) { // If state is st_dictionary_key then there is a missing key. Push onto olist for // processing once the tt_dict_close token has been found. - frame->olist.emplace_back(std::move(obj)); + frame_->olist.emplace_back(std::move(obj)); } else { - if (auto res = frame->dict.insert_or_assign(frame->key, std::move(obj)); !res.second) { - warnDuplicateKey(); + if (auto res = frame_->dict.insert_or_assign(frame_->key, std::move(obj)); !res.second) { + warn_duplicate_key(); } - frame->state = st_dictionary_key; + frame_->state = st_dictionary_key; } } void -QPDFParser::addNull() +Parser::add_null() { const static ObjectPtr null_obj = QPDFObject::create(); - if (frame->state != st_dictionary_value) { + if (frame_->state != st_dictionary_value) { // If state is st_dictionary_key then there is a missing key. Push onto olist for // processing once the tt_dict_close token has been found. - frame->olist.emplace_back(null_obj); + frame_->olist.emplace_back(null_obj); } else { - if (auto res = frame->dict.insert_or_assign(frame->key, null_obj); !res.second) { - warnDuplicateKey(); + if (auto res = frame_->dict.insert_or_assign(frame_->key, null_obj); !res.second) { + warn_duplicate_key(); } - frame->state = st_dictionary_key; + frame_->state = st_dictionary_key; } - ++frame->null_count; + ++frame_->null_count; } void -QPDFParser::add_bad_null(std::string const& msg) +Parser::add_bad_null(std::string const& msg) { warn(msg); check_too_many_bad_tokens(); - addNull(); + add_null(); } void -QPDFParser::addInt(int count) +Parser::add_int(int count) { - auto obj = QPDFObject::create(int_buffer[count % 2]); - obj->setDescription(context, description, last_offset_buffer[count % 2]); + auto obj = QPDFObject::create(int_buffer_[count % 2]); + obj->setDescription(context_, description_, last_offset_buffer_[count % 2]); add(std::move(obj)); } template void -QPDFParser::addScalar(Args&&... args) +Parser::add_scalar(Args&&... args) { - auto limit = Limits::parser_max_container_size(bad_count || sanity_checks); - if (frame->olist.size() >= limit || frame->dict.size() >= limit) { + auto limit = Limits::parser_max_container_size(bad_count_ || sanity_checks_); + if (frame_->olist.size() >= limit || frame_->dict.size() >= limit) { // Stop adding scalars. We are going to abort when the close token or a bad token is // encountered. - max_bad_count = 1; + max_bad_count_ = 1; check_too_many_bad_tokens(); // always throws Error() } auto obj = QPDFObject::create(std::forward(args)...); - obj->setDescription(context, description, input.getLastOffset()); + obj->setDescription(context_, description_, input_.getLastOffset()); add(std::move(obj)); } template QPDFObjectHandle -QPDFParser::withDescription(Args&&... args) +Parser::with_description(Args&&... args) { auto obj = QPDFObject::create(std::forward(args)...); - obj->setDescription(context, description, start); + obj->setDescription(context_, description_, start_); return {obj}; } void -QPDFParser::setDescription(ObjectPtr& obj, qpdf_offset_t parsed_offset) +Parser::set_description(ObjectPtr& obj, qpdf_offset_t parsed_offset) { if (obj) { - obj->setDescription(context, description, parsed_offset); + obj->setDescription(context_, description_, parsed_offset); } } void -QPDFParser::fixMissingKeys() +Parser::fix_missing_keys() { std::set names; - for (auto& obj: frame->olist) { + for (auto& obj: frame_->olist) { if (obj.raw_type_code() == ::ot_name) { names.insert(obj.obj_sp()->getStringValue()); } } int next_fake_key = 1; - for (auto const& item: frame->olist) { + for (auto const& item: frame_->olist) { while (true) { const std::string key = "/QPDFFake" + std::to_string(next_fake_key++); - const bool found_fake = !frame->dict.contains(key) && !names.contains(key); + const bool found_fake = !frame_->dict.contains(key) && !names.contains(key); QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1)); if (found_fake) { warn( - frame->offset, + frame_->offset, "expected dictionary key but found non-name object; inserting key " + key); - frame->dict[key] = item; + frame_->dict[key] = item; break; } } @@ -641,11 +642,11 @@ QPDFParser::fixMissingKeys() } void -QPDFParser::check_too_many_bad_tokens() +Parser::check_too_many_bad_tokens() { - auto limit = Limits::parser_max_container_size(bad_count || sanity_checks); - if (frame->olist.size() >= limit || frame->dict.size() >= limit) { - if (bad_count) { + auto limit = Limits::parser_max_container_size(bad_count_ || sanity_checks_); + if (frame_->olist.size() >= limit || frame_->dict.size() >= limit) { + if (bad_count_) { limits_error( "parser-max-container-size-damaged", "encountered errors while parsing an array or dictionary with more than " + @@ -656,27 +657,27 @@ QPDFParser::check_too_many_bad_tokens() "encountered an array or dictionary with more than " + std::to_string(limit) + " elements during xref recovery; giving up on reading object"); } - if (max_bad_count && --max_bad_count == 0) { + if (max_bad_count_ && --max_bad_count_ == 0) { limits_error( "parser-max-errors", "too many errors during parsing; treating object as null"); } - if (good_count > 4) { - good_count = 0; - bad_count = 1; + if (good_count_ > 4) { + good_count_ = 0; + bad_count_ = 1; return; } - if (++bad_count > 5 || - (frame->state != st_array && std::cmp_less(max_bad_count, frame->olist.size()))) { + if (++bad_count_ > 5 || + (frame_->state != st_array && std::cmp_less(max_bad_count_, frame_->olist.size()))) { // Give up after 5 errors in close proximity or if the number of missing dictionary keys // exceeds the remaining number of allowable total errors. warn("too many errors; giving up on reading object"); throw Error(); } - good_count = 0; + good_count_ = 0; } void -QPDFParser::limits_error(std::string const& limit, std::string const& msg) +Parser::limits_error(std::string const& limit, std::string const& msg) { Limits::error(); warn("limits error("s + limit + "): " + msg); @@ -684,40 +685,41 @@ QPDFParser::limits_error(std::string const& limit, std::string const& msg) } void -QPDFParser::warn(QPDFExc const& e) const +Parser::warn(QPDFExc const& e) const { // If parsing on behalf of a QPDF object and want to give a warning, we can warn through the // object. If parsing for some other reason, such as an explicit creation of an object from a // string, then just throw the exception. - if (context) { - context->warn(e); + if (context_) { + context_->warn(e); } else { throw e; } } void -QPDFParser::warnDuplicateKey() +Parser::warn_duplicate_key() { warn( - frame->offset, - "dictionary has duplicated key " + frame->key + "; last occurrence overrides earlier ones"); + frame_->offset, + "dictionary has duplicated key " + frame_->key + + "; last occurrence overrides earlier ones"); } void -QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const +Parser::warn(qpdf_offset_t offset, std::string const& msg) const { - if (stream_id) { - std::string descr = "object "s + std::to_string(obj_id) + " 0"; - std::string name = context->getFilename() + " object stream " + std::to_string(stream_id); + if (stream_id_) { + std::string descr = "object "s + std::to_string(obj_id_) + " 0"; + std::string name = context_->getFilename() + " object stream " + std::to_string(stream_id_); warn(QPDFExc(qpdf_e_damaged_pdf, name, descr, offset, msg)); } else { - warn(QPDFExc(qpdf_e_damaged_pdf, input.getName(), object_description, offset, msg)); + warn(QPDFExc(qpdf_e_damaged_pdf, input_.getName(), object_description_, offset, msg)); } } void -QPDFParser::warn(std::string const& msg) const +Parser::warn(std::string const& msg) const { - warn(input.getLastOffset(), msg); + warn(input_.getLastOffset(), msg); } diff --git a/libqpdf/QPDF_objects.cc b/libqpdf/QPDF_objects.cc index 6dcf265..f1e5f64 100644 --- a/libqpdf/QPDF_objects.cc +++ b/libqpdf/QPDF_objects.cc @@ -25,6 +25,7 @@ using namespace qpdf; using namespace std::literals; using Objects = QPDF::Doc::Objects; +using Parser = impl::Parser; QPDFXRefEntry::QPDFXRefEntry() = default; @@ -1287,7 +1288,7 @@ Objects::readTrailer() { qpdf_offset_t offset = m->file->tell(); auto object = - QPDFParser::parse(*m->file, "trailer", m->tokenizer, nullptr, qpdf, m->reconstructed_xref); + Parser::parse(*m->file, "trailer", m->tokenizer, nullptr, qpdf, m->reconstructed_xref); if (object.isDictionary() && m->objects.readToken(*m->file).isWord("stream")) { warn(damagedPDF("trailer", m->file->tell(), "stream keyword found in trailer")); } @@ -1304,7 +1305,7 @@ Objects::readObject(std::string const& description, QPDFObjGen og) StringDecrypter decrypter{&qpdf, og}; StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr; - auto object = QPDFParser::parse( + auto object = Parser::parse( *m->file, m->last_object_description, m->tokenizer, @@ -1834,7 +1835,7 @@ Objects::resolveObjectsInStream(int obj_stream_number) if (entry != m->xref_table.end() && entry->second.getType() == 2 && entry->second.getObjStreamNumber() == obj_stream_number) { is::OffsetBuffer in("", {b_start + obj_offset, obj_size}, obj_offset); - if (auto oh = QPDFParser::parse(in, obj_stream_number, obj_id, m->tokenizer, qpdf)) { + if (auto oh = Parser::parse(in, obj_stream_number, obj_id, m->tokenizer, qpdf)) { updateCache(og, oh.obj_sp(), end_before_space, end_after_space); } } else { diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh index e108a20..306026e 100644 --- a/libqpdf/qpdf/QPDFParser.hh +++ b/libqpdf/qpdf/QPDFParser.hh @@ -13,153 +13,277 @@ using namespace qpdf; using namespace qpdf::global; -class QPDFParser +namespace qpdf::impl { - public: - class Error: public std::exception + /// @class Parser + /// @brief Internal parser for PDF objects and content streams. + /// @par + /// The Parser class provides static methods for parsing PDF objects from input sources. + /// It handles tokenization, error recovery, and object construction with proper offset + /// tracking and description for error reporting. + class Parser { public: - Error() = default; - virtual ~Error() noexcept = default; - }; + /// @brief Exception thrown when parser encounters an unrecoverable error. + class Error: public std::exception + { + public: + Error() = default; + virtual ~Error() noexcept = default; + }; - static QPDFObjectHandle - parse(InputSource& input, std::string const& object_description, QPDF* context); - - static QPDFObjectHandle parse_content( - InputSource& input, - std::shared_ptr sp_description, - qpdf::Tokenizer& tokenizer, - QPDF* context); - - // For use by deprecated QPDFObjectHandle::parse. - static QPDFObjectHandle parse( - InputSource& input, - std::string const& object_description, - QPDFTokenizer& tokenizer, - bool& empty, - QPDFObjectHandle::StringDecrypter* decrypter, - QPDF* context); - - // For use by QPDF. - static QPDFObjectHandle parse( - InputSource& input, - std::string const& object_description, - qpdf::Tokenizer& tokenizer, - QPDFObjectHandle::StringDecrypter* decrypter, - QPDF& context, - bool sanity_checks); - - static QPDFObjectHandle parse( - qpdf::is::OffsetBuffer& input, - int stream_id, - int obj_id, - qpdf::Tokenizer& tokenizer, - QPDF& context); - - static std::shared_ptr - make_description(std::string const& input_name, std::string const& object_description) - { - using namespace std::literals; - return std::make_shared( - input_name + ", " + object_description + " at offset $PO"); - } - - private: - QPDFParser( - InputSource& input, - std::shared_ptr sp_description, - std::string const& object_description, - qpdf::Tokenizer& tokenizer, - QPDFObjectHandle::StringDecrypter* decrypter, - QPDF* context, - bool parse_pdf, - int stream_id = 0, - int obj_id = 0, - bool sanity_checks = false) : - input(input), - object_description(object_description), - tokenizer(tokenizer), - decrypter(decrypter), - context(context), - description(std::move(sp_description)), - parse_pdf(parse_pdf), - stream_id(stream_id), - obj_id(obj_id), - sanity_checks(sanity_checks) - { - } + /// @brief Parse a PDF object from an input source. + /// @param input The input source to read from. + /// @param object_description Description of the object for error messages. + /// @param context The QPDF context, or nullptr if parsing standalone. + /// @return The parsed QPDFObjectHandle, or null if parsing fails. + static QPDFObjectHandle + parse(InputSource& input, std::string const& object_description, QPDF* context); - // Parser state. Note: - // state <= st_dictionary_value == (state = st_dictionary_key || state = st_dictionary_value) - enum parser_state_e { st_dictionary_key, st_dictionary_value, st_array }; + /// @brief Parse a content stream from an input source. + /// @param input The input source to read from. + /// @param sp_description Shared pointer to object description. + /// @param tokenizer The tokenizer to use for parsing. + /// @param context The QPDF context. + /// @return The parsed QPDFObjectHandle, or uninitialized handle on EOF. + static QPDFObjectHandle parse_content( + InputSource& input, + std::shared_ptr sp_description, + qpdf::Tokenizer& tokenizer, + QPDF* context); - struct StackFrame - { - StackFrame(InputSource& input, parser_state_e state) : - state(state), - offset(input.tell()) + /// @brief Parse a PDF object (interface for deprecated QPDFObjectHandle::parse). + /// @param input The input source to read from. + /// @param object_description Description of the object for error messages. + /// @param tokenizer The tokenizer to use for parsing. + /// @param empty Output parameter indicating if object was empty. + /// @param decrypter String decrypter for encrypted strings, or nullptr. + /// @param context The QPDF context, or nullptr if parsing standalone. + /// @return The parsed QPDFObjectHandle. + static QPDFObjectHandle parse( + InputSource& input, + std::string const& object_description, + QPDFTokenizer& tokenizer, + bool& empty, + QPDFObjectHandle::StringDecrypter* decrypter, + QPDF* context); + + /// @brief Parse a PDF object for use by QPDF. + /// @param input The input source to read from. + /// @param object_description Description of the object for error messages. + /// @param tokenizer The tokenizer to use for parsing. + /// @param decrypter String decrypter for encrypted strings, or nullptr. + /// @param context The QPDF context. + /// @param sanity_checks Enable additional sanity checks during parsing. + /// @return The parsed QPDFObjectHandle. + static QPDFObjectHandle parse( + InputSource& input, + std::string const& object_description, + qpdf::Tokenizer& tokenizer, + QPDFObjectHandle::StringDecrypter* decrypter, + QPDF& context, + bool sanity_checks); + + /// @brief Parse an object from an object stream. + /// @param input The offset buffer containing the object data. + /// @param stream_id The object stream number. + /// @param obj_id The object ID within the stream. + /// @param tokenizer The tokenizer to use for parsing. + /// @param context The QPDF context. + /// @return The parsed QPDFObjectHandle. + static QPDFObjectHandle parse( + qpdf::is::OffsetBuffer& input, + int stream_id, + int obj_id, + qpdf::Tokenizer& tokenizer, + QPDF& context); + + /// @brief Create a description for a parsed object. + /// @param input_name The name of the input source. + /// @param object_description Description of the object being parsed. + /// @return Shared pointer to object description with offset placeholder. + static std::shared_ptr + make_description(std::string const& input_name, std::string const& object_description) { + using namespace std::literals; + return std::make_shared( + input_name + ", " + object_description + " at offset $PO"); } - std::vector olist; - std::map dict; - parser_state_e state; - std::string key; - qpdf_offset_t offset; - std::string contents_string; - qpdf_offset_t contents_offset{-1}; - int null_count{0}; - }; + private: + /// @brief Construct a parser instance. + /// @param input The input source to read from. + /// @param sp_description Shared pointer to object description. + /// @param object_description Description string for error messages. + /// @param tokenizer The tokenizer to use for parsing. + /// @param decrypter String decrypter for encrypted content. + /// @param context The QPDF context. + /// @param parse_pdf Whether parsing PDF objects (vs content streams). + /// @param stream_id Object stream ID for object stream parsing. + /// @param obj_id Object ID within object stream. + /// @param sanity_checks Enable additional sanity checks. + Parser( + InputSource& input, + std::shared_ptr sp_description, + std::string const& object_description, + qpdf::Tokenizer& tokenizer, + QPDFObjectHandle::StringDecrypter* decrypter, + QPDF* context, + bool parse_pdf, + int stream_id = 0, + int obj_id = 0, + bool sanity_checks = false) : + input_(input), + object_description_(object_description), + tokenizer_(tokenizer), + decrypter_(decrypter), + context_(context), + description_(std::move(sp_description)), + parse_pdf_(parse_pdf), + stream_id_(stream_id), + obj_id_(obj_id), + sanity_checks_(sanity_checks) + { + } + + /// @brief Parser state enumeration. + /// @note state <= st_dictionary_value indicates we're in a dictionary context. + enum parser_state_e { st_dictionary_key, st_dictionary_value, st_array }; + + /// @brief Stack frame for tracking nested arrays and dictionaries. + struct StackFrame + { + StackFrame(InputSource& input, parser_state_e state) : + state(state), + offset(input.tell()) + { + } + + std::vector olist; ///< Object list for arrays/dict values + std::map dict; ///< Dictionary entries + parser_state_e state; ///< Current parser state + std::string key; ///< Current dictionary key + qpdf_offset_t offset; ///< Offset of container start + std::string contents_string; ///< For /Contents field in signatures + qpdf_offset_t contents_offset{-1}; ///< Offset of /Contents value + int null_count{0}; ///< Count of null values in container + }; + + /// @brief Parse an object, handling exceptions and returning null on error. + /// @param content_stream True if parsing a content stream. + /// @return The parsed object handle, or null/uninitialized on error. + QPDFObjectHandle parse(bool content_stream = false); + + /// @brief Parse the first token and dispatch to appropriate handler. + /// @param content_stream True if parsing a content stream. + /// @return The parsed object handle. + QPDFObjectHandle parse_first(bool content_stream); + + /// @brief Parse the remainder of a composite object (array/dict/reference). + /// @param content_stream True if parsing a content stream. + /// @return The completed object handle. + QPDFObjectHandle parse_remainder(bool content_stream); + + /// @brief Add an object to the current container. + /// @param obj The object to add. + void add(std::shared_ptr&& obj); - QPDFObjectHandle parse(bool content_stream = false); - QPDFObjectHandle parse_first(bool content_stream); - QPDFObjectHandle parseRemainder(bool content_stream); - void add(std::shared_ptr&& obj); - void addNull(); - void add_bad_null(std::string const& msg); - void addInt(int count); - template - void addScalar(Args&&... args); - void check_too_many_bad_tokens(); - void warnDuplicateKey(); - void fixMissingKeys(); - [[noreturn]] void limits_error(std::string const& limit, std::string const& msg); - void warn(qpdf_offset_t offset, std::string const& msg) const; - void warn(std::string const& msg) const; - void warn(QPDFExc const&) const; - template - // Create a new scalar object complete with parsed offset and description. - // NB the offset includes any leading whitespace. - QPDFObjectHandle withDescription(Args&&... args); - void setDescription(std::shared_ptr& obj, qpdf_offset_t parsed_offset); - InputSource& input; - std::string const& object_description; - qpdf::Tokenizer& tokenizer; - QPDFObjectHandle::StringDecrypter* decrypter; - QPDF* context; - std::shared_ptr description; - bool parse_pdf{false}; - int stream_id{0}; - int obj_id{0}; - bool sanity_checks{false}; - - std::vector stack; - StackFrame* frame{nullptr}; - // Number of recent bad tokens. This will always be > 0 once a bad token has been encountered as - // it only gets incremented or reset when a bad token is encountered. - int bad_count{0}; - // Number of bad tokens (remaining) before giving up. - uint32_t max_bad_count{Limits::parser_max_errors()}; - // Number of good tokens since last bad token. Irrelevant if bad_count == 0. - int good_count{0}; - // Start offset including any leading whitespace. - qpdf_offset_t start{0}; - // Number of successive integer tokens. - int int_count{0}; - long long int_buffer[2]{0, 0}; - qpdf_offset_t last_offset_buffer[2]{0, 0}; - bool empty_{false}; -}; + /// @brief Add a null object to the current container. + void add_null(); + + /// @brief Add a null with a warning message. + /// @param msg Warning message describing the error. + void add_bad_null(std::string const& msg); + + /// @brief Add a buffered integer from int_buffer_. + /// @param count Buffer index (1 or 2) to read from. + void add_int(int count); + + /// @brief Create and add a scalar object to the current container. + /// @tparam T The scalar object type (e.g., QPDF_Integer, QPDF_String). + /// @tparam Args Constructor argument types. + /// @param args Arguments to forward to the object constructor. + template + void add_scalar(Args&&... args); + + /// @brief Check if too many bad tokens have been encountered and throw if so. + void check_too_many_bad_tokens(); + + /// @brief Issue a warning about a duplicate dictionary key. + void warn_duplicate_key(); + + /// @brief Fix dictionaries with missing keys by generating fake keys. + void fix_missing_keys(); + + /// @brief Report a limits error and throw. + /// @param limit The limit identifier. + /// @param msg Error message. + [[noreturn]] void limits_error(std::string const& limit, std::string const& msg); + + /// @brief Issue a warning at a specific offset. + /// @param offset File offset for the warning. + /// @param msg Warning message. + void warn(qpdf_offset_t offset, std::string const& msg) const; + + /// @brief Issue a warning at the current offset. + /// @param msg Warning message. + void warn(std::string const& msg) const; + + /// @brief Issue a warning from a QPDFExc exception. + /// @param e The exception to report. + void warn(QPDFExc const& e) const; + + /// @brief Create a scalar object with description and parsed offset. + /// @tparam T The scalar object type. + /// @tparam Args Constructor argument types. + /// @param args Arguments to forward to the object constructor. + /// @return Object handle with description and offset set. + /// @note The offset includes any leading whitespace. + template + QPDFObjectHandle with_description(Args&&... args); + + /// @brief Set the description and offset on an existing object. + /// @param obj The object to update. + /// @param parsed_offset The file offset where the object was parsed. + void set_description(std::shared_ptr& obj, qpdf_offset_t parsed_offset); + + // Core parsing state + InputSource& input_; ///< Input source to read from + std::string const& object_description_; ///< Description for error messages + qpdf::Tokenizer& tokenizer_; ///< Tokenizer for lexical analysis + QPDFObjectHandle::StringDecrypter* decrypter_; ///< Decrypter for encrypted strings + QPDF* context_; ///< QPDF context for object resolution + std::shared_ptr description_; ///< Shared description for objects + bool parse_pdf_{false}; ///< True if parsing PDF objects vs content streams + int stream_id_{0}; ///< Object stream ID (for object stream parsing) + int obj_id_{0}; ///< Object ID within object stream + bool sanity_checks_{false}; ///< Enable additional validation checks + + // Composite object parsing state + std::vector stack_; ///< Stack of nested containers + StackFrame* frame_{nullptr}; ///< Current stack frame pointer + + // Error tracking state + /// Number of recent bad tokens. Always > 0 after first bad token encountered. + int bad_count_{0}; + /// Number of bad tokens remaining before giving up. + uint32_t max_bad_count_{Limits::parser_max_errors()}; + /// Number of good tokens since last bad token. Irrelevant if bad_count == 0. + int good_count_{0}; + + // Token buffering state + /// Start offset of current object, including any leading whitespace. + qpdf_offset_t start_{0}; + /// Number of successive integer tokens (for indirect reference detection). + int int_count_{0}; + /// Buffer for up to 2 integer tokens. + long long int_buffer_[2]{0, 0}; + /// Offsets corresponding to buffered integers. + qpdf_offset_t last_offset_buffer_[2]{0, 0}; + + /// True if object was empty (endobj without content). + bool empty_{false}; + }; +} // namespace qpdf::impl #endif // QPDFPARSER_HH diff --git a/qpdf/qtest/qpdf/parse-object.out b/qpdf/qtest/qpdf/parse-object.out index 16efee5..13a0d36 100644 --- a/qpdf/qtest/qpdf/parse-object.out +++ b/qpdf/qtest/qpdf/parse-object.out @@ -1,5 +1,5 @@ [ /name 16059 3.14159 false << /key true /other [ (string1) (string2) ] >> null ] -logic error parsing indirect: QPDFParser::parse called without context on an object with indirect references +logic error parsing indirect: Parser::parse called without context on an object with indirect references trailing data: parsed object (trailing test): trailing data found parsing object from string WARNING: parsed object (offset 9): unknown token while reading object; treating as string WARNING: parsed object: treating unexpected brace token as null