diff --git a/include/qpdf/QPDFObjectHandle.hh b/include/qpdf/QPDFObjectHandle.hh index 10a5244..9fef4e6 100644 --- a/include/qpdf/QPDFObjectHandle.hh +++ b/include/qpdf/QPDFObjectHandle.hh @@ -61,11 +61,14 @@ class QPDFTokenizer; class QPDFExc; class Pl_QPDFTokenizer; class QPDFMatrix; -class QPDFParser; +namespace qpdf::impl +{ + class Parser; +} class QPDFObjectHandle: public qpdf::BaseHandle { - friend class QPDFParser; + friend class qpdf::impl::Parser; public: // This class is used by replaceStreamData. It provides an alternative way of associating diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh index d94eb02..94dae1a 100644 --- a/include/qpdf/QPDFTokenizer.hh +++ b/include/qpdf/QPDFTokenizer.hh @@ -31,6 +31,10 @@ namespace qpdf { class Tokenizer; + namespace impl + { + class Parser; + } } // namespace qpdf class QPDFTokenizer @@ -203,7 +207,7 @@ class QPDFTokenizer void expectInlineImage(InputSource& input); private: - friend class QPDFParser; + friend class qpdf::impl::Parser; QPDFTokenizer(QPDFTokenizer const&) = delete; QPDFTokenizer& operator=(QPDFTokenizer const&) = delete; diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc index 2cafbab..5bad81b 100644 --- a/libqpdf/QPDFObjectHandle.cc +++ b/libqpdf/QPDFObjectHandle.cc @@ -25,6 +25,8 @@ using namespace std::literals; using namespace qpdf; +using Parser = impl::Parser; + const Null Null::temp_; BaseHandle:: @@ -1540,7 +1542,7 @@ QPDFObjectHandle::parse( QPDF* context, std::string const& object_str, std::string const& object_description) { auto input = is::OffsetBuffer("parsed object", object_str); - auto result = QPDFParser::parse(input, object_description, context); + auto result = Parser::parse(input, object_description, context); size_t offset = QIntC::to_size(input.tell()); while (offset < object_str.length()) { if (!isspace(object_str.at(offset))) { @@ -1661,7 +1663,7 @@ QPDFObjectHandle::parseContentStream_data( auto input = is::OffsetBuffer(description, stream_data); Tokenizer tokenizer; tokenizer.allowEOF(); - auto sp_description = QPDFParser::make_description(description, "content"); + auto sp_description = Parser::make_description(description, "content"); while (QIntC::to_size(input.tell()) < stream_length) { // Read a token and seek to the beginning. The offset we get from this process is the // beginning of the next non-ignorable (space, comment) token. This way, the offset and @@ -1669,7 +1671,7 @@ QPDFObjectHandle::parseContentStream_data( tokenizer.nextToken(input, "content", true); qpdf_offset_t offset = input.getLastOffset(); input.seek(offset, SEEK_SET); - auto obj = QPDFParser::parse_content(input, sp_description, tokenizer, context); + auto obj = Parser::parse_content(input, sp_description, tokenizer, context); if (!obj) { // EOF break; @@ -1678,7 +1680,7 @@ QPDFObjectHandle::parseContentStream_data( if (callbacks) { callbacks->handleObject(obj, QIntC::to_size(offset), length); } - if (obj.isOperator() && (obj.getOperatorValue() == "ID")) { + if (obj.isOperator() && obj.getOperatorValue() == "ID") { // Discard next character; it is the space after ID that terminated the token. Read // until end of inline image. char ch; @@ -1731,7 +1733,7 @@ QPDFObjectHandle::parse( StringDecrypter* decrypter, QPDF* context) { - return QPDFParser::parse(*input, object_description, tokenizer, empty, decrypter, context); + return Parser::parse(*input, object_description, tokenizer, empty, decrypter, context); } qpdf_offset_t diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index 4a48519..6189eb4 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -46,12 +46,13 @@ class QPDF::Doc::ParseGuard }; using ParseGuard = QPDF::Doc::ParseGuard; +using Parser = qpdf::impl::Parser; QPDFObjectHandle -QPDFParser::parse(InputSource& input, std::string const& object_description, QPDF* context) +Parser::parse(InputSource& input, std::string const& object_description, QPDF* context) { qpdf::Tokenizer tokenizer; - if (auto result = QPDFParser( + if (auto result = Parser( input, make_description(input.getName(), object_description), object_description, @@ -66,14 +67,14 @@ QPDFParser::parse(InputSource& input, std::string const& object_description, QPD } QPDFObjectHandle -QPDFParser::parse_content( +Parser::parse_content( InputSource& input, std::shared_ptr sp_description, qpdf::Tokenizer& tokenizer, QPDF* context) { static const std::string content("content"); // GCC12 - make constexpr - auto p = QPDFParser( + auto p = Parser( input, std::move(sp_description), content, @@ -93,7 +94,7 @@ QPDFParser::parse_content( } QPDFObjectHandle -QPDFParser::parse( +Parser::parse( InputSource& input, std::string const& object_description, QPDFTokenizer& tokenizer, @@ -103,7 +104,7 @@ QPDFParser::parse( { // ABI: This parse overload is only used by the deprecated QPDFObjectHandle::parse. It is the // only user of the 'empty' member. When removing this overload also remove 'empty'. - auto p = QPDFParser( + auto p = Parser( input, make_description(input.getName(), object_description), object_description, @@ -120,7 +121,7 @@ QPDFParser::parse( } QPDFObjectHandle -QPDFParser::parse( +Parser::parse( InputSource& input, std::string const& object_description, qpdf::Tokenizer& tokenizer, @@ -128,7 +129,7 @@ QPDFParser::parse( QPDF& context, bool sanity_checks) { - return QPDFParser( + return Parser( input, make_description(input.getName(), object_description), object_description, @@ -143,10 +144,10 @@ QPDFParser::parse( } QPDFObjectHandle -QPDFParser::parse( +Parser::parse( is::OffsetBuffer& input, int stream_id, int obj_id, qpdf::Tokenizer& tokenizer, QPDF& context) { - return QPDFParser( + return Parser( input, std::make_shared( QPDFObject::ObjStreamDescr(stream_id, obj_id)), @@ -161,7 +162,7 @@ QPDFParser::parse( } QPDFObjectHandle -QPDFParser::parse(bool content_stream) +Parser::parse(bool content_stream) { try { return parse_first(content_stream); @@ -178,7 +179,7 @@ QPDFParser::parse(bool content_stream) } QPDFObjectHandle -QPDFParser::parse_first(bool content_stream) +Parser::parse_first(bool content_stream) { // This method must take care not to resolve any objects. Don't check the type of any object // without first ensuring that it is a direct object. Otherwise, doing so may have the side @@ -279,7 +280,7 @@ QPDFParser::parse_first(bool content_stream) } QPDFObjectHandle -QPDFParser::parse_remainder(bool content_stream) +Parser::parse_remainder(bool content_stream) { // This method must take care not to resolve any objects. Don't check the type of any object // without first ensuring that it is a direct object. Otherwise, doing so may have the side @@ -312,7 +313,7 @@ QPDFParser::parse_remainder(bool content_stream) tokenizer_.getValue() == "R") { if (!context_) { throw std::logic_error( - "QPDFParser::parse called without context on an object with indirect " + "Parser::parse called without context on an object with indirect " "references"); } auto id = QIntC::to_int(int_buffer_[(int_count_ - 1) % 2]); @@ -533,7 +534,7 @@ QPDFParser::parse_remainder(bool content_stream) } void -QPDFParser::add(std::shared_ptr&& obj) +Parser::add(std::shared_ptr&& obj) { if (frame_->state != st_dictionary_value) { // If state is st_dictionary_key then there is a missing key. Push onto olist for @@ -548,7 +549,7 @@ QPDFParser::add(std::shared_ptr&& obj) } void -QPDFParser::add_null() +Parser::add_null() { const static ObjectPtr null_obj = QPDFObject::create(); @@ -566,7 +567,7 @@ QPDFParser::add_null() } void -QPDFParser::add_bad_null(std::string const& msg) +Parser::add_bad_null(std::string const& msg) { warn(msg); check_too_many_bad_tokens(); @@ -574,7 +575,7 @@ QPDFParser::add_bad_null(std::string const& msg) } void -QPDFParser::add_int(int count) +Parser::add_int(int count) { auto obj = QPDFObject::create(int_buffer_[count % 2]); obj->setDescription(context_, description_, last_offset_buffer_[count % 2]); @@ -583,7 +584,7 @@ QPDFParser::add_int(int count) template void -QPDFParser::add_scalar(Args&&... args) +Parser::add_scalar(Args&&... args) { auto limit = Limits::parser_max_container_size(bad_count_ || sanity_checks_); if (frame_->olist.size() >= limit || frame_->dict.size() >= limit) { @@ -599,7 +600,7 @@ QPDFParser::add_scalar(Args&&... args) template QPDFObjectHandle -QPDFParser::with_description(Args&&... args) +Parser::with_description(Args&&... args) { auto obj = QPDFObject::create(std::forward(args)...); obj->setDescription(context_, description_, start_); @@ -607,7 +608,7 @@ QPDFParser::with_description(Args&&... args) } void -QPDFParser::set_description(ObjectPtr& obj, qpdf_offset_t parsed_offset) +Parser::set_description(ObjectPtr& obj, qpdf_offset_t parsed_offset) { if (obj) { obj->setDescription(context_, description_, parsed_offset); @@ -615,7 +616,7 @@ QPDFParser::set_description(ObjectPtr& obj, qpdf_offset_t parsed_offset) } void -QPDFParser::fix_missing_keys() +Parser::fix_missing_keys() { std::set names; for (auto& obj: frame_->olist) { @@ -641,7 +642,7 @@ QPDFParser::fix_missing_keys() } void -QPDFParser::check_too_many_bad_tokens() +Parser::check_too_many_bad_tokens() { auto limit = Limits::parser_max_container_size(bad_count_ || sanity_checks_); if (frame_->olist.size() >= limit || frame_->dict.size() >= limit) { @@ -676,7 +677,7 @@ QPDFParser::check_too_many_bad_tokens() } void -QPDFParser::limits_error(std::string const& limit, std::string const& msg) +Parser::limits_error(std::string const& limit, std::string const& msg) { Limits::error(); warn("limits error("s + limit + "): " + msg); @@ -684,7 +685,7 @@ QPDFParser::limits_error(std::string const& limit, std::string const& msg) } void -QPDFParser::warn(QPDFExc const& e) const +Parser::warn(QPDFExc const& e) const { // If parsing on behalf of a QPDF object and want to give a warning, we can warn through the // object. If parsing for some other reason, such as an explicit creation of an object from a @@ -697,7 +698,7 @@ QPDFParser::warn(QPDFExc const& e) const } void -QPDFParser::warn_duplicate_key() +Parser::warn_duplicate_key() { warn( frame_->offset, @@ -706,7 +707,7 @@ QPDFParser::warn_duplicate_key() } void -QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const +Parser::warn(qpdf_offset_t offset, std::string const& msg) const { if (stream_id_) { std::string descr = "object "s + std::to_string(obj_id_) + " 0"; @@ -718,7 +719,7 @@ QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const } void -QPDFParser::warn(std::string const& msg) const +Parser::warn(std::string const& msg) const { warn(input_.getLastOffset(), msg); } diff --git a/libqpdf/QPDF_objects.cc b/libqpdf/QPDF_objects.cc index 6dcf265..f1e5f64 100644 --- a/libqpdf/QPDF_objects.cc +++ b/libqpdf/QPDF_objects.cc @@ -25,6 +25,7 @@ using namespace qpdf; using namespace std::literals; using Objects = QPDF::Doc::Objects; +using Parser = impl::Parser; QPDFXRefEntry::QPDFXRefEntry() = default; @@ -1287,7 +1288,7 @@ Objects::readTrailer() { qpdf_offset_t offset = m->file->tell(); auto object = - QPDFParser::parse(*m->file, "trailer", m->tokenizer, nullptr, qpdf, m->reconstructed_xref); + Parser::parse(*m->file, "trailer", m->tokenizer, nullptr, qpdf, m->reconstructed_xref); if (object.isDictionary() && m->objects.readToken(*m->file).isWord("stream")) { warn(damagedPDF("trailer", m->file->tell(), "stream keyword found in trailer")); } @@ -1304,7 +1305,7 @@ Objects::readObject(std::string const& description, QPDFObjGen og) StringDecrypter decrypter{&qpdf, og}; StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr; - auto object = QPDFParser::parse( + auto object = Parser::parse( *m->file, m->last_object_description, m->tokenizer, @@ -1834,7 +1835,7 @@ Objects::resolveObjectsInStream(int obj_stream_number) if (entry != m->xref_table.end() && entry->second.getType() == 2 && entry->second.getObjStreamNumber() == obj_stream_number) { is::OffsetBuffer in("", {b_start + obj_offset, obj_size}, obj_offset); - if (auto oh = QPDFParser::parse(in, obj_stream_number, obj_id, m->tokenizer, qpdf)) { + if (auto oh = Parser::parse(in, obj_stream_number, obj_id, m->tokenizer, qpdf)) { updateCache(og, oh.obj_sp(), end_before_space, end_after_space); } } else { diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh index dacc57a..572d1e3 100644 --- a/libqpdf/qpdf/QPDFParser.hh +++ b/libqpdf/qpdf/QPDFParser.hh @@ -13,153 +13,157 @@ using namespace qpdf; using namespace qpdf::global; -class QPDFParser +namespace qpdf::impl { - public: - class Error: public std::exception + class Parser { public: - Error() = default; - virtual ~Error() noexcept = default; - }; - - static QPDFObjectHandle - parse(InputSource& input, std::string const& object_description, QPDF* context); - - static QPDFObjectHandle parse_content( - InputSource& input, - std::shared_ptr sp_description, - qpdf::Tokenizer& tokenizer, - QPDF* context); - - // For use by deprecated QPDFObjectHandle::parse. - static QPDFObjectHandle parse( - InputSource& input, - std::string const& object_description, - QPDFTokenizer& tokenizer, - bool& empty, - QPDFObjectHandle::StringDecrypter* decrypter, - QPDF* context); - - // For use by QPDF. - static QPDFObjectHandle parse( - InputSource& input, - std::string const& object_description, - qpdf::Tokenizer& tokenizer, - QPDFObjectHandle::StringDecrypter* decrypter, - QPDF& context, - bool sanity_checks); - - static QPDFObjectHandle parse( - qpdf::is::OffsetBuffer& input, - int stream_id, - int obj_id, - qpdf::Tokenizer& tokenizer, - QPDF& context); - - static std::shared_ptr - make_description(std::string const& input_name, std::string const& object_description) - { - using namespace std::literals; - return std::make_shared( - input_name + ", " + object_description + " at offset $PO"); - } - - private: - QPDFParser( - InputSource& input, - std::shared_ptr sp_description, - std::string const& object_description, - qpdf::Tokenizer& tokenizer, - QPDFObjectHandle::StringDecrypter* decrypter, - QPDF* context, - bool parse_pdf, - int stream_id = 0, - int obj_id = 0, - bool sanity_checks = false) : - input_(input), - object_description_(object_description), - tokenizer_(tokenizer), - decrypter_(decrypter), - context_(context), - description_(std::move(sp_description)), - parse_pdf_(parse_pdf), - stream_id_(stream_id), - obj_id_(obj_id), - sanity_checks_(sanity_checks) - { - } - - // Parser state. Note: - // state <= st_dictionary_value == (state = st_dictionary_key || state = st_dictionary_value) - enum parser_state_e { st_dictionary_key, st_dictionary_value, st_array }; + class Error: public std::exception + { + public: + Error() = default; + virtual ~Error() noexcept = default; + }; + + static QPDFObjectHandle + parse(InputSource& input, std::string const& object_description, QPDF* context); + + static QPDFObjectHandle parse_content( + InputSource& input, + std::shared_ptr sp_description, + qpdf::Tokenizer& tokenizer, + QPDF* context); + + // For use by deprecated QPDFObjectHandle::parse. + static QPDFObjectHandle parse( + InputSource& input, + std::string const& object_description, + QPDFTokenizer& tokenizer, + bool& empty, + QPDFObjectHandle::StringDecrypter* decrypter, + QPDF* context); + + // For use by QPDF. + static QPDFObjectHandle parse( + InputSource& input, + std::string const& object_description, + qpdf::Tokenizer& tokenizer, + QPDFObjectHandle::StringDecrypter* decrypter, + QPDF& context, + bool sanity_checks); + + static QPDFObjectHandle parse( + qpdf::is::OffsetBuffer& input, + int stream_id, + int obj_id, + qpdf::Tokenizer& tokenizer, + QPDF& context); + + static std::shared_ptr + make_description(std::string const& input_name, std::string const& object_description) + { + using namespace std::literals; + return std::make_shared( + input_name + ", " + object_description + " at offset $PO"); + } - struct StackFrame - { - StackFrame(InputSource& input, parser_state_e state) : - state(state), - offset(input.tell()) + private: + Parser( + InputSource& input, + std::shared_ptr sp_description, + std::string const& object_description, + qpdf::Tokenizer& tokenizer, + QPDFObjectHandle::StringDecrypter* decrypter, + QPDF* context, + bool parse_pdf, + int stream_id = 0, + int obj_id = 0, + bool sanity_checks = false) : + input_(input), + object_description_(object_description), + tokenizer_(tokenizer), + decrypter_(decrypter), + context_(context), + description_(std::move(sp_description)), + parse_pdf_(parse_pdf), + stream_id_(stream_id), + obj_id_(obj_id), + sanity_checks_(sanity_checks) { } - std::vector olist; - std::map dict; - parser_state_e state; - std::string key; - qpdf_offset_t offset; - std::string contents_string; - qpdf_offset_t contents_offset{-1}; - int null_count{0}; - }; + // Parser state. Note: + // state <= st_dictionary_value == (state = st_dictionary_key || state = + // st_dictionary_value) + enum parser_state_e { st_dictionary_key, st_dictionary_value, st_array }; - QPDFObjectHandle parse(bool content_stream = false); - QPDFObjectHandle parse_first(bool content_stream); - QPDFObjectHandle parse_remainder(bool content_stream); - void add(std::shared_ptr&& obj); - void add_null(); - void add_bad_null(std::string const& msg); - void add_int(int count); - template - void add_scalar(Args&&... args); - void check_too_many_bad_tokens(); - void warn_duplicate_key(); - void fix_missing_keys(); - [[noreturn]] void limits_error(std::string const& limit, std::string const& msg); - void warn(qpdf_offset_t offset, std::string const& msg) const; - void warn(std::string const& msg) const; - void warn(QPDFExc const&) const; - template - // Create a new scalar object complete with parsed offset and description. - // NB the offset includes any leading whitespace. - QPDFObjectHandle with_description(Args&&... args); - void set_description(std::shared_ptr& obj, qpdf_offset_t parsed_offset); - InputSource& input_; - std::string const& object_description_; - qpdf::Tokenizer& tokenizer_; - QPDFObjectHandle::StringDecrypter* decrypter_; - QPDF* context_; - std::shared_ptr description_; - bool parse_pdf_{false}; - int stream_id_{0}; - int obj_id_{0}; - bool sanity_checks_{false}; - - std::vector stack_; - StackFrame* frame_{nullptr}; - // Number of recent bad tokens. This will always be > 0 once a bad token has been encountered as - // it only gets incremented or reset when a bad token is encountered. - int bad_count_{0}; - // Number of bad tokens (remaining) before giving up. - uint32_t max_bad_count_{Limits::parser_max_errors()}; - // Number of good tokens since last bad token. Irrelevant if bad_count == 0. - int good_count_{0}; - // Start offset including any leading whitespace. - qpdf_offset_t start_{0}; - // Number of successive integer tokens. - int int_count_{0}; - long long int_buffer_[2]{0, 0}; - qpdf_offset_t last_offset_buffer_[2]{0, 0}; - bool empty_{false}; -}; + struct StackFrame + { + StackFrame(InputSource& input, parser_state_e state) : + state(state), + offset(input.tell()) + { + } + + std::vector olist; + std::map dict; + parser_state_e state; + std::string key; + qpdf_offset_t offset; + std::string contents_string; + qpdf_offset_t contents_offset{-1}; + int null_count{0}; + }; + + QPDFObjectHandle parse(bool content_stream = false); + QPDFObjectHandle parse_first(bool content_stream); + QPDFObjectHandle parse_remainder(bool content_stream); + void add(std::shared_ptr&& obj); + void add_null(); + void add_bad_null(std::string const& msg); + void add_int(int count); + template + void add_scalar(Args&&... args); + void check_too_many_bad_tokens(); + void warn_duplicate_key(); + void fix_missing_keys(); + [[noreturn]] void limits_error(std::string const& limit, std::string const& msg); + void warn(qpdf_offset_t offset, std::string const& msg) const; + void warn(std::string const& msg) const; + void warn(QPDFExc const&) const; + template + // Create a new scalar object complete with parsed offset and description. + // NB the offset includes any leading whitespace. + QPDFObjectHandle with_description(Args&&... args); + void set_description(std::shared_ptr& obj, qpdf_offset_t parsed_offset); + InputSource& input_; + std::string const& object_description_; + qpdf::Tokenizer& tokenizer_; + QPDFObjectHandle::StringDecrypter* decrypter_; + QPDF* context_; + std::shared_ptr description_; + bool parse_pdf_{false}; + int stream_id_{0}; + int obj_id_{0}; + bool sanity_checks_{false}; + + std::vector stack_; + StackFrame* frame_{nullptr}; + // Number of recent bad tokens. This will always be > 0 once a bad token has been + // encountered as it only gets incremented or reset when a bad token is encountered. + int bad_count_{0}; + // Number of bad tokens (remaining) before giving up. + uint32_t max_bad_count_{Limits::parser_max_errors()}; + // Number of good tokens since last bad token. Irrelevant if bad_count == 0. + int good_count_{0}; + // Start offset including any leading whitespace. + qpdf_offset_t start_{0}; + // Number of successive integer tokens. + int int_count_{0}; + long long int_buffer_[2]{0, 0}; + qpdf_offset_t last_offset_buffer_[2]{0, 0}; + bool empty_{false}; + }; +} // namespace qpdf::impl #endif // QPDFPARSER_HH diff --git a/qpdf/qtest/qpdf/parse-object.out b/qpdf/qtest/qpdf/parse-object.out index 16efee5..13a0d36 100644 --- a/qpdf/qtest/qpdf/parse-object.out +++ b/qpdf/qtest/qpdf/parse-object.out @@ -1,5 +1,5 @@ [ /name 16059 3.14159 false << /key true /other [ (string1) (string2) ] >> null ] -logic error parsing indirect: QPDFParser::parse called without context on an object with indirect references +logic error parsing indirect: Parser::parse called without context on an object with indirect references trailing data: parsed object (trailing test): trailing data found parsing object from string WARNING: parsed object (offset 9): unknown token while reading object; treating as string WARNING: parsed object: treating unexpected brace token as null