From 0518d585c95c0da6544ce23e8268df1b8dec86e4 Mon Sep 17 00:00:00 2001 From: m-holger Date: Wed, 5 Mar 2025 14:59:38 +0000 Subject: [PATCH] Use Tokenizer instead of QPDFTokenizer internally in qpdf --- include/qpdf/BufferInputSource.hh | 2 ++ libqpdf/QPDFObjectHandle.cc | 41 ++++++++++++++++++++++------------------- libqpdf/QPDFTokenizer.cc | 10 +++++----- libqpdf/qpdf/QPDFParser.hh | 29 +++++++++++++++++++++++++---- libqpdf/qpdf/QPDF_private.hh | 4 +++- 5 files changed, 57 insertions(+), 29 deletions(-) diff --git a/include/qpdf/BufferInputSource.hh b/include/qpdf/BufferInputSource.hh index 53b221c..33adc5a 100644 --- a/include/qpdf/BufferInputSource.hh +++ b/include/qpdf/BufferInputSource.hh @@ -30,6 +30,8 @@ class QPDF_DLL_CLASS BufferInputSource: public InputSource // Otherwise, the caller owns the memory. QPDF_DLL BufferInputSource(std::string const& description, Buffer* buf, bool own_memory = false); + + // NB This overload copies the string contents. QPDF_DLL BufferInputSource(std::string const& description, std::string const& contents); QPDF_DLL diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc index 4c001cd..b5b52b7 100644 --- a/libqpdf/QPDFObjectHandle.cc +++ b/libqpdf/QPDFObjectHandle.cc @@ -1495,19 +1495,23 @@ QPDFObjectHandle QPDFObjectHandle::parse( QPDF* context, std::string const& object_str, std::string const& object_description) { - auto input = std::shared_ptr(new BufferInputSource("parsed object", object_str)); - QPDFTokenizer tokenizer; + // BufferInputSource does not modify the input, but Buffer either requires a string& or copies + // the string. + Buffer buf(const_cast(object_str)); + auto input = BufferInputSource("parsed object", &buf); + qpdf::Tokenizer tokenizer; bool empty = false; - QPDFObjectHandle result = parse(input, object_description, tokenizer, empty, nullptr, context); - size_t offset = QIntC::to_size(input->tell()); + auto result = QPDFParser(input, object_description, tokenizer, nullptr, context, false) + .parse(empty, false); + size_t offset = QIntC::to_size(input.tell()); while (offset < object_str.length()) { if (!isspace(object_str.at(offset))) { QTC::TC("qpdf", "QPDFObjectHandle trailing data in parse"); throw QPDFExc( qpdf_e_damaged_pdf, - input->getName(), + "parsed object", object_description, - input->getLastOffset(), + input.getLastOffset(), "trailing data found parsing object from string"); } ++offset; @@ -1614,45 +1618,44 @@ QPDFObjectHandle::parseContentStream_data( QPDF* context) { size_t stream_length = stream_data->getSize(); - auto input = - std::shared_ptr(new BufferInputSource(description, stream_data.get())); - QPDFTokenizer tokenizer; + auto input = BufferInputSource(description, stream_data.get()); + Tokenizer tokenizer; tokenizer.allowEOF(); bool empty = false; - while (QIntC::to_size(input->tell()) < stream_length) { + while (QIntC::to_size(input.tell()) < stream_length) { // Read a token and seek to the beginning. The offset we get from this process is the // beginning of the next non-ignorable (space, comment) token. This way, the offset and // don't including ignorable content. tokenizer.readToken(input, "content", true); - qpdf_offset_t offset = input->getLastOffset(); - input->seek(offset, SEEK_SET); + qpdf_offset_t offset = input.getLastOffset(); + input.seek(offset, SEEK_SET); auto obj = - QPDFParser(*input, "content", tokenizer, nullptr, context, false).parse(empty, true); + QPDFParser(input, "content", tokenizer, nullptr, context, false).parse(empty, true); if (!obj) { // EOF break; } - size_t length = QIntC::to_size(input->tell() - offset); + size_t length = QIntC::to_size(input.tell() - offset); callbacks->handleObject(obj, QIntC::to_size(offset), length); if (obj.isOperator() && (obj.getOperatorValue() == "ID")) { // Discard next character; it is the space after ID that terminated the token. Read // until end of inline image. char ch; - input->read(&ch, 1); + input.read(&ch, 1); tokenizer.expectInlineImage(input); QPDFTokenizer::Token t = tokenizer.readToken(input, description, true); - offset = input->getLastOffset(); - length = QIntC::to_size(input->tell() - offset); + offset = input.getLastOffset(); + length = QIntC::to_size(input.tell() - offset); if (t.getType() == QPDFTokenizer::tt_bad) { QTC::TC("qpdf", "QPDFObjectHandle EOF in inline image"); warn( context, QPDFExc( qpdf_e_damaged_pdf, - input->getName(), + description, "stream data", - input->tell(), + input.tell(), "EOF found while reading inline image")); } else { std::string inline_image = t.getValue(); diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc index 7ca4664..4f2d2e8 100644 --- a/libqpdf/QPDFTokenizer.cc +++ b/libqpdf/QPDFTokenizer.cc @@ -52,8 +52,8 @@ QPDFWordTokenFinder::check() { // Find a word token matching the given string, preceded by a delimiter, and followed by a // delimiter or EOF. - QPDFTokenizer tokenizer; - QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true, str.size() + 2); + Tokenizer tokenizer; + auto t = tokenizer.readToken(is, "finder", true, str.size() + 2); qpdf_offset_t pos = is.tell(); if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) { QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); @@ -845,7 +845,7 @@ Tokenizer::findEI(InputSource& input) } inline_image_bytes = QIntC::to_size(input.tell() - pos - 2); - QPDFTokenizer check; + Tokenizer check; bool found_bad = false; // Look at the next 10 tokens or up to EOF. The next inline image's image data would look // like bad tokens, but there will always be at least 10 tokens between one inline image's @@ -853,8 +853,8 @@ Tokenizer::findEI(InputSource& input) // all required as well as a BI and ID. If we get 10 good tokens in a row or hit EOF, we can // be pretty sure we've found the actual EI. for (int i = 0; i < 10; ++i) { - QPDFTokenizer::Token t = check.readToken(input, "checker", true); - QPDFTokenizer::token_type_e type = t.getType(); + auto t = check.readToken(input, "checker", true); + auto type = t.getType(); if (type == tt::tt_eof) { okay = true; } else if (type == tt::tt_bad) { diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh index 6033f30..c12c3a8 100644 --- a/libqpdf/qpdf/QPDFParser.hh +++ b/libqpdf/qpdf/QPDFParser.hh @@ -12,6 +12,8 @@ class QPDFParser { public: QPDFParser() = delete; + + // This constructor is only used by QPDFObjectHandle::parse overload taking a QPDFTokenizer. QPDFParser( InputSource& input, std::string const& object_description, @@ -30,7 +32,26 @@ class QPDFParser parse_pdf(parse_pdf) { } - virtual ~QPDFParser() = default; + + QPDFParser( + InputSource& input, + std::string const& object_description, + qpdf::Tokenizer& tokenizer, + QPDFObjectHandle::StringDecrypter* decrypter, + QPDF* context, + bool parse_pdf) : + input(input), + object_description(object_description), + tokenizer(tokenizer), + decrypter(decrypter), + context(context), + description( + std::make_shared( + std::string(input.getName() + ", " + object_description + " at offset $PO"))), + parse_pdf(parse_pdf) + { + } + ~QPDFParser() = default; QPDFObjectHandle parse(bool& empty, bool content_stream); @@ -83,7 +104,7 @@ class QPDFParser bool parse_pdf; std::vector stack; - StackFrame* frame; + StackFrame* frame{nullptr}; // Number of recent bad tokens. This will always be > 0 once a bad token has been encountered as // it only gets incremented or reset when a bad token is encountered. int bad_count{0}; @@ -92,9 +113,9 @@ class QPDFParser // Number of good tokens since last bad token. Irrelevant if bad_count == 0. int good_count{0}; // Start offset including any leading whitespace. - qpdf_offset_t start; + qpdf_offset_t start{0}; // Number of successive integer tokens. - int int_count = 0; + int int_count{0}; long long int_buffer[2]{0, 0}; qpdf_offset_t last_offset_buffer[2]{0, 0}; }; diff --git a/libqpdf/qpdf/QPDF_private.hh b/libqpdf/qpdf/QPDF_private.hh index 07d6cba..a7d9989 100644 --- a/libqpdf/qpdf/QPDF_private.hh +++ b/libqpdf/qpdf/QPDF_private.hh @@ -3,6 +3,8 @@ #include +#include + // Writer class is restricted to QPDFWriter so that only it can call certain methods. class QPDF::Writer { @@ -452,7 +454,7 @@ class QPDF::Members private: std::shared_ptr log; unsigned long long unique_id{0}; - QPDFTokenizer tokenizer; + qpdf::Tokenizer tokenizer; std::shared_ptr file; std::string last_object_description; bool provided_password_is_hex_key{false}; -- libgit2 0.21.4