Commit 0518d585c95c0da6544ce23e8268df1b8dec86e4
1 parent a64215e6
Use Tokenizer instead of QPDFTokenizer internally in qpdf
Also remove some shared pointers and use std::string instead of Pl_Buffer in Pl_QPDFTokenizer.
Showing 5 changed files with 57 additions and 29 deletions
include/qpdf/BufferInputSource.hh
| ... | ... | @@ -30,6 +30,8 @@ class QPDF_DLL_CLASS BufferInputSource: public InputSource |
| 30 | 30 | // Otherwise, the caller owns the memory. |
| 31 | 31 | QPDF_DLL |
| 32 | 32 | BufferInputSource(std::string const& description, Buffer* buf, bool own_memory = false); |
| 33 | + | |
| 34 | + // NB This overload copies the string contents. | |
| 33 | 35 | QPDF_DLL |
| 34 | 36 | BufferInputSource(std::string const& description, std::string const& contents); |
| 35 | 37 | QPDF_DLL | ... | ... |
libqpdf/QPDFObjectHandle.cc
| ... | ... | @@ -1495,19 +1495,23 @@ QPDFObjectHandle |
| 1495 | 1495 | QPDFObjectHandle::parse( |
| 1496 | 1496 | QPDF* context, std::string const& object_str, std::string const& object_description) |
| 1497 | 1497 | { |
| 1498 | - auto input = std::shared_ptr<InputSource>(new BufferInputSource("parsed object", object_str)); | |
| 1499 | - QPDFTokenizer tokenizer; | |
| 1498 | + // BufferInputSource does not modify the input, but Buffer either requires a string& or copies | |
| 1499 | + // the string. | |
| 1500 | + Buffer buf(const_cast<std::string&>(object_str)); | |
| 1501 | + auto input = BufferInputSource("parsed object", &buf); | |
| 1502 | + qpdf::Tokenizer tokenizer; | |
| 1500 | 1503 | bool empty = false; |
| 1501 | - QPDFObjectHandle result = parse(input, object_description, tokenizer, empty, nullptr, context); | |
| 1502 | - size_t offset = QIntC::to_size(input->tell()); | |
| 1504 | + auto result = QPDFParser(input, object_description, tokenizer, nullptr, context, false) | |
| 1505 | + .parse(empty, false); | |
| 1506 | + size_t offset = QIntC::to_size(input.tell()); | |
| 1503 | 1507 | while (offset < object_str.length()) { |
| 1504 | 1508 | if (!isspace(object_str.at(offset))) { |
| 1505 | 1509 | QTC::TC("qpdf", "QPDFObjectHandle trailing data in parse"); |
| 1506 | 1510 | throw QPDFExc( |
| 1507 | 1511 | qpdf_e_damaged_pdf, |
| 1508 | - input->getName(), | |
| 1512 | + "parsed object", | |
| 1509 | 1513 | object_description, |
| 1510 | - input->getLastOffset(), | |
| 1514 | + input.getLastOffset(), | |
| 1511 | 1515 | "trailing data found parsing object from string"); |
| 1512 | 1516 | } |
| 1513 | 1517 | ++offset; |
| ... | ... | @@ -1614,45 +1618,44 @@ QPDFObjectHandle::parseContentStream_data( |
| 1614 | 1618 | QPDF* context) |
| 1615 | 1619 | { |
| 1616 | 1620 | size_t stream_length = stream_data->getSize(); |
| 1617 | - auto input = | |
| 1618 | - std::shared_ptr<InputSource>(new BufferInputSource(description, stream_data.get())); | |
| 1619 | - QPDFTokenizer tokenizer; | |
| 1621 | + auto input = BufferInputSource(description, stream_data.get()); | |
| 1622 | + Tokenizer tokenizer; | |
| 1620 | 1623 | tokenizer.allowEOF(); |
| 1621 | 1624 | bool empty = false; |
| 1622 | - while (QIntC::to_size(input->tell()) < stream_length) { | |
| 1625 | + while (QIntC::to_size(input.tell()) < stream_length) { | |
| 1623 | 1626 | // Read a token and seek to the beginning. The offset we get from this process is the |
| 1624 | 1627 | // beginning of the next non-ignorable (space, comment) token. This way, the offset and |
| 1625 | 1628 | // length don't include ignorable content. |
| 1626 | 1629 | tokenizer.readToken(input, "content", true); |
| 1627 | - qpdf_offset_t offset = input->getLastOffset(); | |
| 1628 | - input->seek(offset, SEEK_SET); | |
| 1630 | + qpdf_offset_t offset = input.getLastOffset(); | |
| 1631 | + input.seek(offset, SEEK_SET); | |
| 1629 | 1632 | auto obj = |
| 1630 | - QPDFParser(*input, "content", tokenizer, nullptr, context, false).parse(empty, true); | |
| 1633 | + QPDFParser(input, "content", tokenizer, nullptr, context, false).parse(empty, true); | |
| 1631 | 1634 | if (!obj) { |
| 1632 | 1635 | // EOF |
| 1633 | 1636 | break; |
| 1634 | 1637 | } |
| 1635 | - size_t length = QIntC::to_size(input->tell() - offset); | |
| 1638 | + size_t length = QIntC::to_size(input.tell() - offset); | |
| 1636 | 1639 | |
| 1637 | 1640 | callbacks->handleObject(obj, QIntC::to_size(offset), length); |
| 1638 | 1641 | if (obj.isOperator() && (obj.getOperatorValue() == "ID")) { |
| 1639 | 1642 | // Discard next character; it is the space after ID that terminated the token. Read |
| 1640 | 1643 | // until end of inline image. |
| 1641 | 1644 | char ch; |
| 1642 | - input->read(&ch, 1); | |
| 1645 | + input.read(&ch, 1); | |
| 1643 | 1646 | tokenizer.expectInlineImage(input); |
| 1644 | 1647 | QPDFTokenizer::Token t = tokenizer.readToken(input, description, true); |
| 1645 | - offset = input->getLastOffset(); | |
| 1646 | - length = QIntC::to_size(input->tell() - offset); | |
| 1648 | + offset = input.getLastOffset(); | |
| 1649 | + length = QIntC::to_size(input.tell() - offset); | |
| 1647 | 1650 | if (t.getType() == QPDFTokenizer::tt_bad) { |
| 1648 | 1651 | QTC::TC("qpdf", "QPDFObjectHandle EOF in inline image"); |
| 1649 | 1652 | warn( |
| 1650 | 1653 | context, |
| 1651 | 1654 | QPDFExc( |
| 1652 | 1655 | qpdf_e_damaged_pdf, |
| 1653 | - input->getName(), | |
| 1656 | + description, | |
| 1654 | 1657 | "stream data", |
| 1655 | - input->tell(), | |
| 1658 | + input.tell(), | |
| 1656 | 1659 | "EOF found while reading inline image")); |
| 1657 | 1660 | } else { |
| 1658 | 1661 | std::string inline_image = t.getValue(); | ... | ... |
libqpdf/QPDFTokenizer.cc
| ... | ... | @@ -52,8 +52,8 @@ QPDFWordTokenFinder::check() |
| 52 | 52 | { |
| 53 | 53 | // Find a word token matching the given string, preceded by a delimiter, and followed by a |
| 54 | 54 | // delimiter or EOF. |
| 55 | - QPDFTokenizer tokenizer; | |
| 56 | - QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true, str.size() + 2); | |
| 55 | + Tokenizer tokenizer; | |
| 56 | + auto t = tokenizer.readToken(is, "finder", true, str.size() + 2); | |
| 57 | 57 | qpdf_offset_t pos = is.tell(); |
| 58 | 58 | if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) { |
| 59 | 59 | QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); |
| ... | ... | @@ -845,7 +845,7 @@ Tokenizer::findEI(InputSource& input) |
| 845 | 845 | } |
| 846 | 846 | inline_image_bytes = QIntC::to_size(input.tell() - pos - 2); |
| 847 | 847 | |
| 848 | - QPDFTokenizer check; | |
| 848 | + Tokenizer check; | |
| 849 | 849 | bool found_bad = false; |
| 850 | 850 | // Look at the next 10 tokens or up to EOF. The next inline image's image data would look |
| 851 | 851 | // like bad tokens, but there will always be at least 10 tokens between one inline image's |
| ... | ... | @@ -853,8 +853,8 @@ Tokenizer::findEI(InputSource& input) |
| 853 | 853 | // all required as well as a BI and ID. If we get 10 good tokens in a row or hit EOF, we can |
| 854 | 854 | // be pretty sure we've found the actual EI. |
| 855 | 855 | for (int i = 0; i < 10; ++i) { |
| 856 | - QPDFTokenizer::Token t = check.readToken(input, "checker", true); | |
| 857 | - QPDFTokenizer::token_type_e type = t.getType(); | |
| 856 | + auto t = check.readToken(input, "checker", true); | |
| 857 | + auto type = t.getType(); | |
| 858 | 858 | if (type == tt::tt_eof) { |
| 859 | 859 | okay = true; |
| 860 | 860 | } else if (type == tt::tt_bad) { | ... | ... |
libqpdf/qpdf/QPDFParser.hh
| ... | ... | @@ -12,6 +12,8 @@ class QPDFParser |
| 12 | 12 | { |
| 13 | 13 | public: |
| 14 | 14 | QPDFParser() = delete; |
| 15 | + | |
| 16 | + // This constructor is only used by the QPDFObjectHandle::parse overload taking a QPDFTokenizer. | |
| 15 | 17 | QPDFParser( |
| 16 | 18 | InputSource& input, |
| 17 | 19 | std::string const& object_description, |
| ... | ... | @@ -30,7 +32,26 @@ class QPDFParser |
| 30 | 32 | parse_pdf(parse_pdf) |
| 31 | 33 | { |
| 32 | 34 | } |
| 33 | - virtual ~QPDFParser() = default; | |
| 35 | + | |
| 36 | + QPDFParser( | |
| 37 | + InputSource& input, | |
| 38 | + std::string const& object_description, | |
| 39 | + qpdf::Tokenizer& tokenizer, | |
| 40 | + QPDFObjectHandle::StringDecrypter* decrypter, | |
| 41 | + QPDF* context, | |
| 42 | + bool parse_pdf) : | |
| 43 | + input(input), | |
| 44 | + object_description(object_description), | |
| 45 | + tokenizer(tokenizer), | |
| 46 | + decrypter(decrypter), | |
| 47 | + context(context), | |
| 48 | + description( | |
| 49 | + std::make_shared<QPDFObject::Description>( | |
| 50 | + std::string(input.getName() + ", " + object_description + " at offset $PO"))), | |
| 51 | + parse_pdf(parse_pdf) | |
| 52 | + { | |
| 53 | + } | |
| 54 | + ~QPDFParser() = default; | |
| 34 | 55 | |
| 35 | 56 | QPDFObjectHandle parse(bool& empty, bool content_stream); |
| 36 | 57 | |
| ... | ... | @@ -83,7 +104,7 @@ class QPDFParser |
| 83 | 104 | bool parse_pdf; |
| 84 | 105 | |
| 85 | 106 | std::vector<StackFrame> stack; |
| 86 | - StackFrame* frame; | |
| 107 | + StackFrame* frame{nullptr}; | |
| 87 | 108 | // Number of recent bad tokens. This will always be > 0 once a bad token has been encountered as |
| 88 | 109 | // it only gets incremented or reset when a bad token is encountered. |
| 89 | 110 | int bad_count{0}; |
| ... | ... | @@ -92,9 +113,9 @@ class QPDFParser |
| 92 | 113 | // Number of good tokens since last bad token. Irrelevant if bad_count == 0. |
| 93 | 114 | int good_count{0}; |
| 94 | 115 | // Start offset including any leading whitespace. |
| 95 | - qpdf_offset_t start; | |
| 116 | + qpdf_offset_t start{0}; | |
| 96 | 117 | // Number of successive integer tokens. |
| 97 | - int int_count = 0; | |
| 118 | + int int_count{0}; | |
| 98 | 119 | long long int_buffer[2]{0, 0}; |
| 99 | 120 | qpdf_offset_t last_offset_buffer[2]{0, 0}; |
| 100 | 121 | }; | ... | ... |
libqpdf/qpdf/QPDF_private.hh
| ... | ... | @@ -3,6 +3,8 @@ |
| 3 | 3 | |
| 4 | 4 | #include <qpdf/QPDF.hh> |
| 5 | 5 | |
| 6 | +#include <qpdf/QPDFTokenizer_private.hh> | |
| 7 | + | |
| 6 | 8 | // Writer class is restricted to QPDFWriter so that only it can call certain methods. |
| 7 | 9 | class QPDF::Writer |
| 8 | 10 | { |
| ... | ... | @@ -452,7 +454,7 @@ class QPDF::Members |
| 452 | 454 | private: |
| 453 | 455 | std::shared_ptr<QPDFLogger> log; |
| 454 | 456 | unsigned long long unique_id{0}; |
| 455 | - QPDFTokenizer tokenizer; | |
| 457 | + qpdf::Tokenizer tokenizer; | |
| 456 | 458 | std::shared_ptr<InputSource> file; |
| 457 | 459 | std::string last_object_description; |
| 458 | 460 | bool provided_password_is_hex_key{false}; | ... | ... |