Use Tokenizer instead of QPDFTokenizer internally in qpdf

Also remove some shared pointers and use std::string instead of Pl_Buffer in Pl_QPDFTokenizer.

Use Tokenizer instead of QPDFTokenizer internally in qpdf
Also remove some shared pointers and use std::string instead of Pl_Buffer in Pl_QPDFTokenizer.
m-holger
1 parent a64215e6
Showing 5 changed files with 57 additions and 29 deletions
include/qpdf/BufferInputSource.hh
libqpdf/QPDFObjectHandle.cc
libqpdf/QPDFTokenizer.cc
libqpdf/qpdf/QPDFParser.hh
libqpdf/qpdf/QPDF_private.hh
@@ -30,6 +30,8 @@ class QPDF_DLL_CLASS BufferInputSource: public InputSource
     // Otherwise, the caller owns the memory.
     QPDF_DLL
     BufferInputSource(std::string const& description, Buffer* buf, bool own_memory = false);
+
+    // NB This overload copies the string contents.
     QPDF_DLL
     BufferInputSource(std::string const& description, std::string const& contents);
     QPDF_DLL
@@ -1495,19 +1495,23 @@ QPDFObjectHandle
 QPDFObjectHandle::parse(
     QPDF* context, std::string const& object_str, std::string const& object_description)
 {
-    auto input = std::shared_ptr<InputSource>(new BufferInputSource("parsed object", object_str));
-    QPDFTokenizer tokenizer;
+    // BufferInputSource does not modify the input, but Buffer either requires a string& or copies
+    // the string.
+    Buffer buf(const_cast<std::string&>(object_str));
+    auto input = BufferInputSource("parsed object", &buf);
+    qpdf::Tokenizer tokenizer;
     bool empty = false;
-    QPDFObjectHandle result = parse(input, object_description, tokenizer, empty, nullptr, context);
-    size_t offset = QIntC::to_size(input->tell());
+    auto result = QPDFParser(input, object_description, tokenizer, nullptr, context, false)
+                      .parse(empty, false);
+    size_t offset = QIntC::to_size(input.tell());
     while (offset < object_str.length()) {
         if (!isspace(object_str.at(offset))) {
             QTC::TC("qpdf", "QPDFObjectHandle trailing data in parse");
             throw QPDFExc(
                 qpdf_e_damaged_pdf,
-                input->getName(),
+                "parsed object",
                 object_description,
-                input->getLastOffset(),
+                input.getLastOffset(),
                 "trailing data found parsing object from string");
         }
         ++offset;
@@ -1614,45 +1618,44 @@ QPDFObjectHandle::parseContentStream_data(
     QPDF* context)
 {
     size_t stream_length = stream_data->getSize();
-    auto input =
-        std::shared_ptr<InputSource>(new BufferInputSource(description, stream_data.get()));
-    QPDFTokenizer tokenizer;
+    auto input = BufferInputSource(description, stream_data.get());
+    Tokenizer tokenizer;
     tokenizer.allowEOF();
     bool empty = false;
-    while (QIntC::to_size(input->tell()) < stream_length) {
+    while (QIntC::to_size(input.tell()) < stream_length) {
         // Read a token and seek to the beginning. The offset we get from this process is the
         // beginning of the next non-ignorable (space, comment) token. This way, the offset and
         // length don't include ignorable content.
         tokenizer.readToken(input, "content", true);
-        qpdf_offset_t offset = input->getLastOffset();
-        input->seek(offset, SEEK_SET);
+        qpdf_offset_t offset = input.getLastOffset();
+        input.seek(offset, SEEK_SET);
         auto obj =
-            QPDFParser(*input, "content", tokenizer, nullptr, context, false).parse(empty, true);
+            QPDFParser(input, "content", tokenizer, nullptr, context, false).parse(empty, true);
         if (!obj) {
             // EOF
             break;
         }
-        size_t length = QIntC::to_size(input->tell() - offset);
+        size_t length = QIntC::to_size(input.tell() - offset);
         callbacks->handleObject(obj, QIntC::to_size(offset), length);
         if (obj.isOperator() && (obj.getOperatorValue() == "ID")) {
             // Discard next character; it is the space after ID that terminated the token.  Read
             // until end of inline image.
             char ch;
-            input->read(&ch, 1);
+            input.read(&ch, 1);
             tokenizer.expectInlineImage(input);
             QPDFTokenizer::Token t = tokenizer.readToken(input, description, true);
-            offset = input->getLastOffset();
-            length = QIntC::to_size(input->tell() - offset);
+            offset = input.getLastOffset();
+            length = QIntC::to_size(input.tell() - offset);
             if (t.getType() == QPDFTokenizer::tt_bad) {
                 QTC::TC("qpdf", "QPDFObjectHandle EOF in inline image");
                 warn(
                     context,
                     QPDFExc(
                         qpdf_e_damaged_pdf,
-                        input->getName(),
+                        description,
                         "stream data",
-                        input->tell(),
+                        input.tell(),
                         "EOF found while reading inline image"));
             } else {
                 std::string inline_image = t.getValue();
@@ -52,8 +52,8 @@ QPDFWordTokenFinder::check()
 {
     // Find a word token matching the given string, preceded by a delimiter, and followed by a
     // delimiter or EOF.
-    QPDFTokenizer tokenizer;
-    QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true, str.size() + 2);
+    Tokenizer tokenizer;
+    auto t = tokenizer.readToken(is, "finder", true, str.size() + 2);
     qpdf_offset_t pos = is.tell();
     if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) {
         QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
@@ -845,7 +845,7 @@ Tokenizer::findEI(InputSource& input)
         }
         inline_image_bytes = QIntC::to_size(input.tell() - pos - 2);
-        QPDFTokenizer check;
+        Tokenizer check;
         bool found_bad = false;
         // Look at the next 10 tokens or up to EOF. The next inline image's image data would look
         // like bad tokens, but there will always be at least 10 tokens between one inline image's
@@ -853,8 +853,8 @@ Tokenizer::findEI(InputSource& input)
         // all required as well as a BI and ID. If we get 10 good tokens in a row or hit EOF, we can
         // be pretty sure we've found the actual EI.
         for (int i = 0; i < 10; ++i) {
-            QPDFTokenizer::Token t = check.readToken(input, "checker", true);
-            QPDFTokenizer::token_type_e type = t.getType();
+            auto t = check.readToken(input, "checker", true);
+            auto type = t.getType();
             if (type == tt::tt_eof) {
                 okay = true;
             } else if (type == tt::tt_bad) {
@@ -12,6 +12,8 @@ class QPDFParser
 {
   public:
     QPDFParser() = delete;
+
+    // This constructor is only used by QPDFObjectHandle::parse overload taking a QPDFTokenizer.
     QPDFParser(
         InputSource& input,
         std::string const& object_description,
@@ -30,7 +32,26 @@ class QPDFParser
         parse_pdf(parse_pdf)
     {
     }
-    virtual ~QPDFParser() = default;
+
+    QPDFParser(
+        InputSource& input,
+        std::string const& object_description,
+        qpdf::Tokenizer& tokenizer,
+        QPDFObjectHandle::StringDecrypter* decrypter,
+        QPDF* context,
+        bool parse_pdf) :
+        input(input),
+        object_description(object_description),
+        tokenizer(tokenizer),
+        decrypter(decrypter),
+        context(context),
+        description(
+            std::make_shared<QPDFObject::Description>(
+                std::string(input.getName() + ", " + object_description + " at offset $PO"))),
+        parse_pdf(parse_pdf)
+    {
+    }
+    ~QPDFParser() = default;
     QPDFObjectHandle parse(bool& empty, bool content_stream);
@@ -83,7 +104,7 @@ class QPDFParser
     bool parse_pdf;
     std::vector<StackFrame> stack;
-    StackFrame* frame;
+    StackFrame* frame{nullptr};
     // Number of recent bad tokens. This will always be > 0 once a bad token has been encountered as
     // it only gets incremented or reset when a bad token is encountered.
     int bad_count{0};
@@ -92,9 +113,9 @@ class QPDFParser
     // Number of good tokens since last bad token. Irrelevant if bad_count == 0.
     int good_count{0};
     // Start offset including any leading whitespace.
-    qpdf_offset_t start;
+    qpdf_offset_t start{0};
     // Number of successive integer tokens.
-    int int_count = 0;
+    int int_count{0};
     long long int_buffer[2]{0, 0};
     qpdf_offset_t last_offset_buffer[2]{0, 0};
 };
@@ -3,6 +3,8 @@
 #include <qpdf/QPDF.hh>
+#include <qpdf/QPDFTokenizer_private.hh>
+
 // Writer class is restricted to QPDFWriter so that only it can call certain methods.
 class QPDF::Writer
 {
@@ -452,7 +454,7 @@ class QPDF::Members
   private:
     std::shared_ptr<QPDFLogger> log;
     unsigned long long unique_id{0};
-    QPDFTokenizer tokenizer;
+    qpdf::Tokenizer tokenizer;
     std::shared_ptr<InputSource> file;
     std::string last_object_description;
     bool provided_password_is_hex_key{false};