Commit 0518d585c95c0da6544ce23e8268df1b8dec86e4

Authored by m-holger
1 parent a64215e6

Use Tokenizer instead of QPDFTokenizer internally in qpdf

Also remove some shared pointers and use std::string instead of Pl_Buffer
in Pl_QPDFTokenizer.
include/qpdf/BufferInputSource.hh
... ... @@ -30,6 +30,8 @@ class QPDF_DLL_CLASS BufferInputSource: public InputSource
30 30 // Otherwise, the caller owns the memory.
31 31 QPDF_DLL
32 32 BufferInputSource(std::string const& description, Buffer* buf, bool own_memory = false);
  33 +
  34 + // NB This overload copies the string contents.
33 35 QPDF_DLL
34 36 BufferInputSource(std::string const& description, std::string const& contents);
35 37 QPDF_DLL
... ...
libqpdf/QPDFObjectHandle.cc
... ... @@ -1495,19 +1495,23 @@ QPDFObjectHandle
1495 1495 QPDFObjectHandle::parse(
1496 1496 QPDF* context, std::string const& object_str, std::string const& object_description)
1497 1497 {
1498   - auto input = std::shared_ptr<InputSource>(new BufferInputSource("parsed object", object_str));
1499   - QPDFTokenizer tokenizer;
  1498 + // BufferInputSource does not modify the input, but Buffer either requires a string& or copies
  1499 + // the string.
  1500 + Buffer buf(const_cast<std::string&>(object_str));
  1501 + auto input = BufferInputSource("parsed object", &buf);
  1502 + qpdf::Tokenizer tokenizer;
1500 1503 bool empty = false;
1501   - QPDFObjectHandle result = parse(input, object_description, tokenizer, empty, nullptr, context);
1502   - size_t offset = QIntC::to_size(input->tell());
  1504 + auto result = QPDFParser(input, object_description, tokenizer, nullptr, context, false)
  1505 + .parse(empty, false);
  1506 + size_t offset = QIntC::to_size(input.tell());
1503 1507 while (offset < object_str.length()) {
1504 1508 if (!isspace(object_str.at(offset))) {
1505 1509 QTC::TC("qpdf", "QPDFObjectHandle trailing data in parse");
1506 1510 throw QPDFExc(
1507 1511 qpdf_e_damaged_pdf,
1508   - input->getName(),
  1512 + "parsed object",
1509 1513 object_description,
1510   - input->getLastOffset(),
  1514 + input.getLastOffset(),
1511 1515 "trailing data found parsing object from string");
1512 1516 }
1513 1517 ++offset;
... ... @@ -1614,45 +1618,44 @@ QPDFObjectHandle::parseContentStream_data(
1614 1618 QPDF* context)
1615 1619 {
1616 1620 size_t stream_length = stream_data->getSize();
1617   - auto input =
1618   - std::shared_ptr<InputSource>(new BufferInputSource(description, stream_data.get()));
1619   - QPDFTokenizer tokenizer;
  1621 + auto input = BufferInputSource(description, stream_data.get());
  1622 + Tokenizer tokenizer;
1620 1623 tokenizer.allowEOF();
1621 1624 bool empty = false;
1622   - while (QIntC::to_size(input->tell()) < stream_length) {
  1625 + while (QIntC::to_size(input.tell()) < stream_length) {
1623 1626 // Read a token and seek to the beginning. The offset we get from this process is the
1624 1627 // beginning of the next non-ignorable (space, comment) token. This way, the offset and
1625 1628 // length don't include ignorable content.
1626 1629 tokenizer.readToken(input, "content", true);
1627   - qpdf_offset_t offset = input->getLastOffset();
1628   - input->seek(offset, SEEK_SET);
  1630 + qpdf_offset_t offset = input.getLastOffset();
  1631 + input.seek(offset, SEEK_SET);
1629 1632 auto obj =
1630   - QPDFParser(*input, "content", tokenizer, nullptr, context, false).parse(empty, true);
  1633 + QPDFParser(input, "content", tokenizer, nullptr, context, false).parse(empty, true);
1631 1634 if (!obj) {
1632 1635 // EOF
1633 1636 break;
1634 1637 }
1635   - size_t length = QIntC::to_size(input->tell() - offset);
  1638 + size_t length = QIntC::to_size(input.tell() - offset);
1636 1639  
1637 1640 callbacks->handleObject(obj, QIntC::to_size(offset), length);
1638 1641 if (obj.isOperator() && (obj.getOperatorValue() == "ID")) {
1639 1642 // Discard next character; it is the space after ID that terminated the token. Read
1640 1643 // until end of inline image.
1641 1644 char ch;
1642   - input->read(&ch, 1);
  1645 + input.read(&ch, 1);
1643 1646 tokenizer.expectInlineImage(input);
1644 1647 QPDFTokenizer::Token t = tokenizer.readToken(input, description, true);
1645   - offset = input->getLastOffset();
1646   - length = QIntC::to_size(input->tell() - offset);
  1648 + offset = input.getLastOffset();
  1649 + length = QIntC::to_size(input.tell() - offset);
1647 1650 if (t.getType() == QPDFTokenizer::tt_bad) {
1648 1651 QTC::TC("qpdf", "QPDFObjectHandle EOF in inline image");
1649 1652 warn(
1650 1653 context,
1651 1654 QPDFExc(
1652 1655 qpdf_e_damaged_pdf,
1653   - input->getName(),
  1656 + description,
1654 1657 "stream data",
1655   - input->tell(),
  1658 + input.tell(),
1656 1659 "EOF found while reading inline image"));
1657 1660 } else {
1658 1661 std::string inline_image = t.getValue();
... ...
libqpdf/QPDFTokenizer.cc
... ... @@ -52,8 +52,8 @@ QPDFWordTokenFinder::check()
52 52 {
53 53 // Find a word token matching the given string, preceded by a delimiter, and followed by a
54 54 // delimiter or EOF.
55   - QPDFTokenizer tokenizer;
56   - QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true, str.size() + 2);
  55 + Tokenizer tokenizer;
  56 + auto t = tokenizer.readToken(is, "finder", true, str.size() + 2);
57 57 qpdf_offset_t pos = is.tell();
58 58 if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) {
59 59 QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
... ... @@ -845,7 +845,7 @@ Tokenizer::findEI(InputSource& input)
845 845 }
846 846 inline_image_bytes = QIntC::to_size(input.tell() - pos - 2);
847 847  
848   - QPDFTokenizer check;
  848 + Tokenizer check;
849 849 bool found_bad = false;
850 850 // Look at the next 10 tokens or up to EOF. The next inline image's image data would look
851 851 // like bad tokens, but there will always be at least 10 tokens between one inline image's
... ... @@ -853,8 +853,8 @@ Tokenizer::findEI(InputSource& input)
853 853 // all required as well as a BI and ID. If we get 10 good tokens in a row or hit EOF, we can
854 854 // be pretty sure we've found the actual EI.
855 855 for (int i = 0; i < 10; ++i) {
856   - QPDFTokenizer::Token t = check.readToken(input, "checker", true);
857   - QPDFTokenizer::token_type_e type = t.getType();
  856 + auto t = check.readToken(input, "checker", true);
  857 + auto type = t.getType();
858 858 if (type == tt::tt_eof) {
859 859 okay = true;
860 860 } else if (type == tt::tt_bad) {
... ...
libqpdf/qpdf/QPDFParser.hh
... ... @@ -12,6 +12,8 @@ class QPDFParser
12 12 {
13 13 public:
14 14 QPDFParser() = delete;
  15 +
  16 + // This constructor is only used by QPDFObjectHandle::parse overload taking a QPDFTokenizer.
15 17 QPDFParser(
16 18 InputSource& input,
17 19 std::string const& object_description,
... ... @@ -30,7 +32,26 @@ class QPDFParser
30 32 parse_pdf(parse_pdf)
31 33 {
32 34 }
33   - virtual ~QPDFParser() = default;
  35 +
  36 + QPDFParser(
  37 + InputSource& input,
  38 + std::string const& object_description,
  39 + qpdf::Tokenizer& tokenizer,
  40 + QPDFObjectHandle::StringDecrypter* decrypter,
  41 + QPDF* context,
  42 + bool parse_pdf) :
  43 + input(input),
  44 + object_description(object_description),
  45 + tokenizer(tokenizer),
  46 + decrypter(decrypter),
  47 + context(context),
  48 + description(
  49 + std::make_shared<QPDFObject::Description>(
  50 + std::string(input.getName() + ", " + object_description + " at offset $PO"))),
  51 + parse_pdf(parse_pdf)
  52 + {
  53 + }
  54 + ~QPDFParser() = default;
34 55  
35 56 QPDFObjectHandle parse(bool& empty, bool content_stream);
36 57  
... ... @@ -83,7 +104,7 @@ class QPDFParser
83 104 bool parse_pdf;
84 105  
85 106 std::vector<StackFrame> stack;
86   - StackFrame* frame;
  107 + StackFrame* frame{nullptr};
87 108 // Number of recent bad tokens. This will always be > 0 once a bad token has been encountered as
88 109 // it only gets incremented or reset when a bad token is encountered.
89 110 int bad_count{0};
... ... @@ -92,9 +113,9 @@ class QPDFParser
92 113 // Number of good tokens since last bad token. Irrelevant if bad_count == 0.
93 114 int good_count{0};
94 115 // Start offset including any leading whitespace.
95   - qpdf_offset_t start;
  116 + qpdf_offset_t start{0};
96 117 // Number of successive integer tokens.
97   - int int_count = 0;
  118 + int int_count{0};
98 119 long long int_buffer[2]{0, 0};
99 120 qpdf_offset_t last_offset_buffer[2]{0, 0};
100 121 };
... ...
libqpdf/qpdf/QPDF_private.hh
... ... @@ -3,6 +3,8 @@
3 3  
4 4 #include <qpdf/QPDF.hh>
5 5  
  6 +#include <qpdf/QPDFTokenizer_private.hh>
  7 +
6 8 // Writer class is restricted to QPDFWriter so that only it can call certain methods.
7 9 class QPDF::Writer
8 10 {
... ... @@ -452,7 +454,7 @@ class QPDF::Members
452 454 private:
453 455 std::shared_ptr<QPDFLogger> log;
454 456 unsigned long long unique_id{0};
455   - QPDFTokenizer tokenizer;
  457 + qpdf::Tokenizer tokenizer;
456 458 std::shared_ptr<InputSource> file;
457 459 std::string last_object_description;
458 460 bool provided_password_is_hex_key{false};
... ...