Commit 0518d585c95c0da6544ce23e8268df1b8dec86e4

Authored by m-holger
1 parent a64215e6

Use Tokenizer instead of QPDFTokenizer internally in qpdf

Also remove some shared pointers and use std::string instead of Pl_Buffer
in Pl_QPDFTokenizer.
include/qpdf/BufferInputSource.hh
@@ -30,6 +30,8 @@ class QPDF_DLL_CLASS BufferInputSource: public InputSource @@ -30,6 +30,8 @@ class QPDF_DLL_CLASS BufferInputSource: public InputSource
30 // Otherwise, the caller owns the memory. 30 // Otherwise, the caller owns the memory.
31 QPDF_DLL 31 QPDF_DLL
32 BufferInputSource(std::string const& description, Buffer* buf, bool own_memory = false); 32 BufferInputSource(std::string const& description, Buffer* buf, bool own_memory = false);
  33 +
  34 + // NB This overload copies the string contents.
33 QPDF_DLL 35 QPDF_DLL
34 BufferInputSource(std::string const& description, std::string const& contents); 36 BufferInputSource(std::string const& description, std::string const& contents);
35 QPDF_DLL 37 QPDF_DLL
libqpdf/QPDFObjectHandle.cc
@@ -1495,19 +1495,23 @@ QPDFObjectHandle @@ -1495,19 +1495,23 @@ QPDFObjectHandle
1495 QPDFObjectHandle::parse( 1495 QPDFObjectHandle::parse(
1496 QPDF* context, std::string const& object_str, std::string const& object_description) 1496 QPDF* context, std::string const& object_str, std::string const& object_description)
1497 { 1497 {
1498 - auto input = std::shared_ptr<InputSource>(new BufferInputSource("parsed object", object_str));  
1499 - QPDFTokenizer tokenizer; 1498 + // BufferInputSource does not modify the input, but Buffer either requires a string& or copies
  1499 + // the string.
  1500 + Buffer buf(const_cast<std::string&>(object_str));
  1501 + auto input = BufferInputSource("parsed object", &buf);
  1502 + qpdf::Tokenizer tokenizer;
1500 bool empty = false; 1503 bool empty = false;
1501 - QPDFObjectHandle result = parse(input, object_description, tokenizer, empty, nullptr, context);  
1502 - size_t offset = QIntC::to_size(input->tell()); 1504 + auto result = QPDFParser(input, object_description, tokenizer, nullptr, context, false)
  1505 + .parse(empty, false);
  1506 + size_t offset = QIntC::to_size(input.tell());
1503 while (offset < object_str.length()) { 1507 while (offset < object_str.length()) {
1504 if (!isspace(object_str.at(offset))) { 1508 if (!isspace(object_str.at(offset))) {
1505 QTC::TC("qpdf", "QPDFObjectHandle trailing data in parse"); 1509 QTC::TC("qpdf", "QPDFObjectHandle trailing data in parse");
1506 throw QPDFExc( 1510 throw QPDFExc(
1507 qpdf_e_damaged_pdf, 1511 qpdf_e_damaged_pdf,
1508 - input->getName(), 1512 + "parsed object",
1509 object_description, 1513 object_description,
1510 - input->getLastOffset(), 1514 + input.getLastOffset(),
1511 "trailing data found parsing object from string"); 1515 "trailing data found parsing object from string");
1512 } 1516 }
1513 ++offset; 1517 ++offset;
@@ -1614,45 +1618,44 @@ QPDFObjectHandle::parseContentStream_data( @@ -1614,45 +1618,44 @@ QPDFObjectHandle::parseContentStream_data(
1614 QPDF* context) 1618 QPDF* context)
1615 { 1619 {
1616 size_t stream_length = stream_data->getSize(); 1620 size_t stream_length = stream_data->getSize();
1617 - auto input =  
1618 - std::shared_ptr<InputSource>(new BufferInputSource(description, stream_data.get()));  
1619 - QPDFTokenizer tokenizer; 1621 + auto input = BufferInputSource(description, stream_data.get());
  1622 + Tokenizer tokenizer;
1620 tokenizer.allowEOF(); 1623 tokenizer.allowEOF();
1621 bool empty = false; 1624 bool empty = false;
1622 - while (QIntC::to_size(input->tell()) < stream_length) { 1625 + while (QIntC::to_size(input.tell()) < stream_length) {
1623 // Read a token and seek to the beginning. The offset we get from this process is the 1626 // Read a token and seek to the beginning. The offset we get from this process is the
1624 // beginning of the next non-ignorable (space, comment) token. This way, the offset and 1627 // beginning of the next non-ignorable (space, comment) token. This way, the offset and
1625 // length don't include ignorable content. 1628 // length don't include ignorable content.
1626 tokenizer.readToken(input, "content", true); 1629 tokenizer.readToken(input, "content", true);
1627 - qpdf_offset_t offset = input->getLastOffset();  
1628 - input->seek(offset, SEEK_SET); 1630 + qpdf_offset_t offset = input.getLastOffset();
  1631 + input.seek(offset, SEEK_SET);
1629 auto obj = 1632 auto obj =
1630 - QPDFParser(*input, "content", tokenizer, nullptr, context, false).parse(empty, true); 1633 + QPDFParser(input, "content", tokenizer, nullptr, context, false).parse(empty, true);
1631 if (!obj) { 1634 if (!obj) {
1632 // EOF 1635 // EOF
1633 break; 1636 break;
1634 } 1637 }
1635 - size_t length = QIntC::to_size(input->tell() - offset); 1638 + size_t length = QIntC::to_size(input.tell() - offset);
1636 1639
1637 callbacks->handleObject(obj, QIntC::to_size(offset), length); 1640 callbacks->handleObject(obj, QIntC::to_size(offset), length);
1638 if (obj.isOperator() && (obj.getOperatorValue() == "ID")) { 1641 if (obj.isOperator() && (obj.getOperatorValue() == "ID")) {
1639 // Discard next character; it is the space after ID that terminated the token. Read 1642 // Discard next character; it is the space after ID that terminated the token. Read
1640 // until end of inline image. 1643 // until end of inline image.
1641 char ch; 1644 char ch;
1642 - input->read(&ch, 1); 1645 + input.read(&ch, 1);
1643 tokenizer.expectInlineImage(input); 1646 tokenizer.expectInlineImage(input);
1644 QPDFTokenizer::Token t = tokenizer.readToken(input, description, true); 1647 QPDFTokenizer::Token t = tokenizer.readToken(input, description, true);
1645 - offset = input->getLastOffset();  
1646 - length = QIntC::to_size(input->tell() - offset); 1648 + offset = input.getLastOffset();
  1649 + length = QIntC::to_size(input.tell() - offset);
1647 if (t.getType() == QPDFTokenizer::tt_bad) { 1650 if (t.getType() == QPDFTokenizer::tt_bad) {
1648 QTC::TC("qpdf", "QPDFObjectHandle EOF in inline image"); 1651 QTC::TC("qpdf", "QPDFObjectHandle EOF in inline image");
1649 warn( 1652 warn(
1650 context, 1653 context,
1651 QPDFExc( 1654 QPDFExc(
1652 qpdf_e_damaged_pdf, 1655 qpdf_e_damaged_pdf,
1653 - input->getName(), 1656 + description,
1654 "stream data", 1657 "stream data",
1655 - input->tell(), 1658 + input.tell(),
1656 "EOF found while reading inline image")); 1659 "EOF found while reading inline image"));
1657 } else { 1660 } else {
1658 std::string inline_image = t.getValue(); 1661 std::string inline_image = t.getValue();
libqpdf/QPDFTokenizer.cc
@@ -52,8 +52,8 @@ QPDFWordTokenFinder::check() @@ -52,8 +52,8 @@ QPDFWordTokenFinder::check()
52 { 52 {
53 // Find a word token matching the given string, preceded by a delimiter, and followed by a 53 // Find a word token matching the given string, preceded by a delimiter, and followed by a
54 // delimiter or EOF. 54 // delimiter or EOF.
55 - QPDFTokenizer tokenizer;  
56 - QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true, str.size() + 2); 55 + Tokenizer tokenizer;
  56 + auto t = tokenizer.readToken(is, "finder", true, str.size() + 2);
57 qpdf_offset_t pos = is.tell(); 57 qpdf_offset_t pos = is.tell();
58 if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) { 58 if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) {
59 QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); 59 QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
@@ -845,7 +845,7 @@ Tokenizer::findEI(InputSource& input) @@ -845,7 +845,7 @@ Tokenizer::findEI(InputSource& input)
845 } 845 }
846 inline_image_bytes = QIntC::to_size(input.tell() - pos - 2); 846 inline_image_bytes = QIntC::to_size(input.tell() - pos - 2);
847 847
848 - QPDFTokenizer check; 848 + Tokenizer check;
849 bool found_bad = false; 849 bool found_bad = false;
850 // Look at the next 10 tokens or up to EOF. The next inline image's image data would look 850 // Look at the next 10 tokens or up to EOF. The next inline image's image data would look
851 // like bad tokens, but there will always be at least 10 tokens between one inline image's 851 // like bad tokens, but there will always be at least 10 tokens between one inline image's
@@ -853,8 +853,8 @@ Tokenizer::findEI(InputSource& input) @@ -853,8 +853,8 @@ Tokenizer::findEI(InputSource& input)
853 // all required as well as a BI and ID. If we get 10 good tokens in a row or hit EOF, we can 853 // all required as well as a BI and ID. If we get 10 good tokens in a row or hit EOF, we can
854 // be pretty sure we've found the actual EI. 854 // be pretty sure we've found the actual EI.
855 for (int i = 0; i < 10; ++i) { 855 for (int i = 0; i < 10; ++i) {
856 - QPDFTokenizer::Token t = check.readToken(input, "checker", true);  
857 - QPDFTokenizer::token_type_e type = t.getType(); 856 + auto t = check.readToken(input, "checker", true);
  857 + auto type = t.getType();
858 if (type == tt::tt_eof) { 858 if (type == tt::tt_eof) {
859 okay = true; 859 okay = true;
860 } else if (type == tt::tt_bad) { 860 } else if (type == tt::tt_bad) {
libqpdf/qpdf/QPDFParser.hh
@@ -12,6 +12,8 @@ class QPDFParser @@ -12,6 +12,8 @@ class QPDFParser
12 { 12 {
13 public: 13 public:
14 QPDFParser() = delete; 14 QPDFParser() = delete;
  15 +
  16 + // This constructor is only used by QPDFObjectHandle::parse overload taking a QPDFTokenizer.
15 QPDFParser( 17 QPDFParser(
16 InputSource& input, 18 InputSource& input,
17 std::string const& object_description, 19 std::string const& object_description,
@@ -30,7 +32,26 @@ class QPDFParser @@ -30,7 +32,26 @@ class QPDFParser
30 parse_pdf(parse_pdf) 32 parse_pdf(parse_pdf)
31 { 33 {
32 } 34 }
33 - virtual ~QPDFParser() = default; 35 +
  36 + QPDFParser(
  37 + InputSource& input,
  38 + std::string const& object_description,
  39 + qpdf::Tokenizer& tokenizer,
  40 + QPDFObjectHandle::StringDecrypter* decrypter,
  41 + QPDF* context,
  42 + bool parse_pdf) :
  43 + input(input),
  44 + object_description(object_description),
  45 + tokenizer(tokenizer),
  46 + decrypter(decrypter),
  47 + context(context),
  48 + description(
  49 + std::make_shared<QPDFObject::Description>(
  50 + std::string(input.getName() + ", " + object_description + " at offset $PO"))),
  51 + parse_pdf(parse_pdf)
  52 + {
  53 + }
  54 + ~QPDFParser() = default;
34 55
35 QPDFObjectHandle parse(bool& empty, bool content_stream); 56 QPDFObjectHandle parse(bool& empty, bool content_stream);
36 57
@@ -83,7 +104,7 @@ class QPDFParser @@ -83,7 +104,7 @@ class QPDFParser
83 bool parse_pdf; 104 bool parse_pdf;
84 105
85 std::vector<StackFrame> stack; 106 std::vector<StackFrame> stack;
86 - StackFrame* frame; 107 + StackFrame* frame{nullptr};
87 // Number of recent bad tokens. This will always be > 0 once a bad token has been encountered as 108 // Number of recent bad tokens. This will always be > 0 once a bad token has been encountered as
88 // it only gets incremented or reset when a bad token is encountered. 109 // it only gets incremented or reset when a bad token is encountered.
89 int bad_count{0}; 110 int bad_count{0};
@@ -92,9 +113,9 @@ class QPDFParser @@ -92,9 +113,9 @@ class QPDFParser
92 // Number of good tokens since last bad token. Irrelevant if bad_count == 0. 113 // Number of good tokens since last bad token. Irrelevant if bad_count == 0.
93 int good_count{0}; 114 int good_count{0};
94 // Start offset including any leading whitespace. 115 // Start offset including any leading whitespace.
95 - qpdf_offset_t start; 116 + qpdf_offset_t start{0};
96 // Number of successive integer tokens. 117 // Number of successive integer tokens.
97 - int int_count = 0; 118 + int int_count{0};
98 long long int_buffer[2]{0, 0}; 119 long long int_buffer[2]{0, 0};
99 qpdf_offset_t last_offset_buffer[2]{0, 0}; 120 qpdf_offset_t last_offset_buffer[2]{0, 0};
100 }; 121 };
libqpdf/qpdf/QPDF_private.hh
@@ -3,6 +3,8 @@ @@ -3,6 +3,8 @@
3 3
4 #include <qpdf/QPDF.hh> 4 #include <qpdf/QPDF.hh>
5 5
  6 +#include <qpdf/QPDFTokenizer_private.hh>
  7 +
6 // Writer class is restricted to QPDFWriter so that only it can call certain methods. 8 // Writer class is restricted to QPDFWriter so that only it can call certain methods.
7 class QPDF::Writer 9 class QPDF::Writer
8 { 10 {
@@ -452,7 +454,7 @@ class QPDF::Members @@ -452,7 +454,7 @@ class QPDF::Members
452 private: 454 private:
453 std::shared_ptr<QPDFLogger> log; 455 std::shared_ptr<QPDFLogger> log;
454 unsigned long long unique_id{0}; 456 unsigned long long unique_id{0};
455 - QPDFTokenizer tokenizer; 457 + qpdf::Tokenizer tokenizer;
456 std::shared_ptr<InputSource> file; 458 std::shared_ptr<InputSource> file;
457 std::string last_object_description; 459 std::string last_object_description;
458 bool provided_password_is_hex_key{false}; 460 bool provided_password_is_hex_key{false};