Commit 0518d585c95c0da6544ce23e8268df1b8dec86e4
1 parent
a64215e6
Use Tokenizer instead of QPDFTokenizer internally in qpdf
Also remove some shared pointers and use std::string instead of Pl_Buffer in Pl_QPDFTokenizer.
Showing 5 changed files with 57 additions and 29 deletions
include/qpdf/BufferInputSource.hh
| @@ -30,6 +30,8 @@ class QPDF_DLL_CLASS BufferInputSource: public InputSource | @@ -30,6 +30,8 @@ class QPDF_DLL_CLASS BufferInputSource: public InputSource | ||
| 30 | // Otherwise, the caller owns the memory. | 30 | // Otherwise, the caller owns the memory. |
| 31 | QPDF_DLL | 31 | QPDF_DLL |
| 32 | BufferInputSource(std::string const& description, Buffer* buf, bool own_memory = false); | 32 | BufferInputSource(std::string const& description, Buffer* buf, bool own_memory = false); |
| 33 | + | ||
| 34 | + // NB This overload copies the string contents. | ||
| 33 | QPDF_DLL | 35 | QPDF_DLL |
| 34 | BufferInputSource(std::string const& description, std::string const& contents); | 36 | BufferInputSource(std::string const& description, std::string const& contents); |
| 35 | QPDF_DLL | 37 | QPDF_DLL |
libqpdf/QPDFObjectHandle.cc
| @@ -1495,19 +1495,23 @@ QPDFObjectHandle | @@ -1495,19 +1495,23 @@ QPDFObjectHandle | ||
| 1495 | QPDFObjectHandle::parse( | 1495 | QPDFObjectHandle::parse( |
| 1496 | QPDF* context, std::string const& object_str, std::string const& object_description) | 1496 | QPDF* context, std::string const& object_str, std::string const& object_description) |
| 1497 | { | 1497 | { |
| 1498 | - auto input = std::shared_ptr<InputSource>(new BufferInputSource("parsed object", object_str)); | ||
| 1499 | - QPDFTokenizer tokenizer; | 1498 | + // BufferInputSource does not modify the input, but Buffer either requires a string& or copies |
| 1499 | + // the string. | ||
| 1500 | + Buffer buf(const_cast<std::string&>(object_str)); | ||
| 1501 | + auto input = BufferInputSource("parsed object", &buf); | ||
| 1502 | + qpdf::Tokenizer tokenizer; | ||
| 1500 | bool empty = false; | 1503 | bool empty = false; |
| 1501 | - QPDFObjectHandle result = parse(input, object_description, tokenizer, empty, nullptr, context); | ||
| 1502 | - size_t offset = QIntC::to_size(input->tell()); | 1504 | + auto result = QPDFParser(input, object_description, tokenizer, nullptr, context, false) |
| 1505 | + .parse(empty, false); | ||
| 1506 | + size_t offset = QIntC::to_size(input.tell()); | ||
| 1503 | while (offset < object_str.length()) { | 1507 | while (offset < object_str.length()) { |
| 1504 | if (!isspace(object_str.at(offset))) { | 1508 | if (!isspace(object_str.at(offset))) { |
| 1505 | QTC::TC("qpdf", "QPDFObjectHandle trailing data in parse"); | 1509 | QTC::TC("qpdf", "QPDFObjectHandle trailing data in parse"); |
| 1506 | throw QPDFExc( | 1510 | throw QPDFExc( |
| 1507 | qpdf_e_damaged_pdf, | 1511 | qpdf_e_damaged_pdf, |
| 1508 | - input->getName(), | 1512 | + "parsed object", |
| 1509 | object_description, | 1513 | object_description, |
| 1510 | - input->getLastOffset(), | 1514 | + input.getLastOffset(), |
| 1511 | "trailing data found parsing object from string"); | 1515 | "trailing data found parsing object from string"); |
| 1512 | } | 1516 | } |
| 1513 | ++offset; | 1517 | ++offset; |
| @@ -1614,45 +1618,44 @@ QPDFObjectHandle::parseContentStream_data( | @@ -1614,45 +1618,44 @@ QPDFObjectHandle::parseContentStream_data( | ||
| 1614 | QPDF* context) | 1618 | QPDF* context) |
| 1615 | { | 1619 | { |
| 1616 | size_t stream_length = stream_data->getSize(); | 1620 | size_t stream_length = stream_data->getSize(); |
| 1617 | - auto input = | ||
| 1618 | - std::shared_ptr<InputSource>(new BufferInputSource(description, stream_data.get())); | ||
| 1619 | - QPDFTokenizer tokenizer; | 1621 | + auto input = BufferInputSource(description, stream_data.get()); |
| 1622 | + Tokenizer tokenizer; | ||
| 1620 | tokenizer.allowEOF(); | 1623 | tokenizer.allowEOF(); |
| 1621 | bool empty = false; | 1624 | bool empty = false; |
| 1622 | - while (QIntC::to_size(input->tell()) < stream_length) { | 1625 | + while (QIntC::to_size(input.tell()) < stream_length) { |
| 1623 | // Read a token and seek to the beginning. The offset we get from this process is the | 1626 | // Read a token and seek to the beginning. The offset we get from this process is the |
| 1624 | // beginning of the next non-ignorable (space, comment) token. This way, the offset and | 1627 | // beginning of the next non-ignorable (space, comment) token. This way, the offset and |
| 1625 | // length don't include ignorable content. | 1628 | // length don't include ignorable content. |
| 1626 | tokenizer.readToken(input, "content", true); | 1629 | tokenizer.readToken(input, "content", true); |
| 1627 | - qpdf_offset_t offset = input->getLastOffset(); | ||
| 1628 | - input->seek(offset, SEEK_SET); | 1630 | + qpdf_offset_t offset = input.getLastOffset(); |
| 1631 | + input.seek(offset, SEEK_SET); | ||
| 1629 | auto obj = | 1632 | auto obj = |
| 1630 | - QPDFParser(*input, "content", tokenizer, nullptr, context, false).parse(empty, true); | 1633 | + QPDFParser(input, "content", tokenizer, nullptr, context, false).parse(empty, true); |
| 1631 | if (!obj) { | 1634 | if (!obj) { |
| 1632 | // EOF | 1635 | // EOF |
| 1633 | break; | 1636 | break; |
| 1634 | } | 1637 | } |
| 1635 | - size_t length = QIntC::to_size(input->tell() - offset); | 1638 | + size_t length = QIntC::to_size(input.tell() - offset); |
| 1636 | 1639 | ||
| 1637 | callbacks->handleObject(obj, QIntC::to_size(offset), length); | 1640 | callbacks->handleObject(obj, QIntC::to_size(offset), length); |
| 1638 | if (obj.isOperator() && (obj.getOperatorValue() == "ID")) { | 1641 | if (obj.isOperator() && (obj.getOperatorValue() == "ID")) { |
| 1639 | // Discard next character; it is the space after ID that terminated the token. Read | 1642 | // Discard next character; it is the space after ID that terminated the token. Read |
| 1640 | // until end of inline image. | 1643 | // until end of inline image. |
| 1641 | char ch; | 1644 | char ch; |
| 1642 | - input->read(&ch, 1); | 1645 | + input.read(&ch, 1); |
| 1643 | tokenizer.expectInlineImage(input); | 1646 | tokenizer.expectInlineImage(input); |
| 1644 | QPDFTokenizer::Token t = tokenizer.readToken(input, description, true); | 1647 | QPDFTokenizer::Token t = tokenizer.readToken(input, description, true); |
| 1645 | - offset = input->getLastOffset(); | ||
| 1646 | - length = QIntC::to_size(input->tell() - offset); | 1648 | + offset = input.getLastOffset(); |
| 1649 | + length = QIntC::to_size(input.tell() - offset); | ||
| 1647 | if (t.getType() == QPDFTokenizer::tt_bad) { | 1650 | if (t.getType() == QPDFTokenizer::tt_bad) { |
| 1648 | QTC::TC("qpdf", "QPDFObjectHandle EOF in inline image"); | 1651 | QTC::TC("qpdf", "QPDFObjectHandle EOF in inline image"); |
| 1649 | warn( | 1652 | warn( |
| 1650 | context, | 1653 | context, |
| 1651 | QPDFExc( | 1654 | QPDFExc( |
| 1652 | qpdf_e_damaged_pdf, | 1655 | qpdf_e_damaged_pdf, |
| 1653 | - input->getName(), | 1656 | + description, |
| 1654 | "stream data", | 1657 | "stream data", |
| 1655 | - input->tell(), | 1658 | + input.tell(), |
| 1656 | "EOF found while reading inline image")); | 1659 | "EOF found while reading inline image")); |
| 1657 | } else { | 1660 | } else { |
| 1658 | std::string inline_image = t.getValue(); | 1661 | std::string inline_image = t.getValue(); |
libqpdf/QPDFTokenizer.cc
| @@ -52,8 +52,8 @@ QPDFWordTokenFinder::check() | @@ -52,8 +52,8 @@ QPDFWordTokenFinder::check() | ||
| 52 | { | 52 | { |
| 53 | // Find a word token matching the given string, preceded by a delimiter, and followed by a | 53 | // Find a word token matching the given string, preceded by a delimiter, and followed by a |
| 54 | // delimiter or EOF. | 54 | // delimiter or EOF. |
| 55 | - QPDFTokenizer tokenizer; | ||
| 56 | - QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true, str.size() + 2); | 55 | + Tokenizer tokenizer; |
| 56 | + auto t = tokenizer.readToken(is, "finder", true, str.size() + 2); | ||
| 57 | qpdf_offset_t pos = is.tell(); | 57 | qpdf_offset_t pos = is.tell(); |
| 58 | if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) { | 58 | if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) { |
| 59 | QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); | 59 | QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); |
| @@ -845,7 +845,7 @@ Tokenizer::findEI(InputSource& input) | @@ -845,7 +845,7 @@ Tokenizer::findEI(InputSource& input) | ||
| 845 | } | 845 | } |
| 846 | inline_image_bytes = QIntC::to_size(input.tell() - pos - 2); | 846 | inline_image_bytes = QIntC::to_size(input.tell() - pos - 2); |
| 847 | 847 | ||
| 848 | - QPDFTokenizer check; | 848 | + Tokenizer check; |
| 849 | bool found_bad = false; | 849 | bool found_bad = false; |
| 850 | // Look at the next 10 tokens or up to EOF. The next inline image's image data would look | 850 | // Look at the next 10 tokens or up to EOF. The next inline image's image data would look |
| 851 | // like bad tokens, but there will always be at least 10 tokens between one inline image's | 851 | // like bad tokens, but there will always be at least 10 tokens between one inline image's |
| @@ -853,8 +853,8 @@ Tokenizer::findEI(InputSource& input) | @@ -853,8 +853,8 @@ Tokenizer::findEI(InputSource& input) | ||
| 853 | // all required as well as a BI and ID. If we get 10 good tokens in a row or hit EOF, we can | 853 | // all required as well as a BI and ID. If we get 10 good tokens in a row or hit EOF, we can |
| 854 | // be pretty sure we've found the actual EI. | 854 | // be pretty sure we've found the actual EI. |
| 855 | for (int i = 0; i < 10; ++i) { | 855 | for (int i = 0; i < 10; ++i) { |
| 856 | - QPDFTokenizer::Token t = check.readToken(input, "checker", true); | ||
| 857 | - QPDFTokenizer::token_type_e type = t.getType(); | 856 | + auto t = check.readToken(input, "checker", true); |
| 857 | + auto type = t.getType(); | ||
| 858 | if (type == tt::tt_eof) { | 858 | if (type == tt::tt_eof) { |
| 859 | okay = true; | 859 | okay = true; |
| 860 | } else if (type == tt::tt_bad) { | 860 | } else if (type == tt::tt_bad) { |
libqpdf/qpdf/QPDFParser.hh
| @@ -12,6 +12,8 @@ class QPDFParser | @@ -12,6 +12,8 @@ class QPDFParser | ||
| 12 | { | 12 | { |
| 13 | public: | 13 | public: |
| 14 | QPDFParser() = delete; | 14 | QPDFParser() = delete; |
| 15 | + | ||
| 16 | + // This constructor is only used by the QPDFObjectHandle::parse overload taking a QPDFTokenizer. | ||
| 15 | QPDFParser( | 17 | QPDFParser( |
| 16 | InputSource& input, | 18 | InputSource& input, |
| 17 | std::string const& object_description, | 19 | std::string const& object_description, |
| @@ -30,7 +32,26 @@ class QPDFParser | @@ -30,7 +32,26 @@ class QPDFParser | ||
| 30 | parse_pdf(parse_pdf) | 32 | parse_pdf(parse_pdf) |
| 31 | { | 33 | { |
| 32 | } | 34 | } |
| 33 | - virtual ~QPDFParser() = default; | 35 | + |
| 36 | + QPDFParser( | ||
| 37 | + InputSource& input, | ||
| 38 | + std::string const& object_description, | ||
| 39 | + qpdf::Tokenizer& tokenizer, | ||
| 40 | + QPDFObjectHandle::StringDecrypter* decrypter, | ||
| 41 | + QPDF* context, | ||
| 42 | + bool parse_pdf) : | ||
| 43 | + input(input), | ||
| 44 | + object_description(object_description), | ||
| 45 | + tokenizer(tokenizer), | ||
| 46 | + decrypter(decrypter), | ||
| 47 | + context(context), | ||
| 48 | + description( | ||
| 49 | + std::make_shared<QPDFObject::Description>( | ||
| 50 | + std::string(input.getName() + ", " + object_description + " at offset $PO"))), | ||
| 51 | + parse_pdf(parse_pdf) | ||
| 52 | + { | ||
| 53 | + } | ||
| 54 | + ~QPDFParser() = default; | ||
| 34 | 55 | ||
| 35 | QPDFObjectHandle parse(bool& empty, bool content_stream); | 56 | QPDFObjectHandle parse(bool& empty, bool content_stream); |
| 36 | 57 | ||
| @@ -83,7 +104,7 @@ class QPDFParser | @@ -83,7 +104,7 @@ class QPDFParser | ||
| 83 | bool parse_pdf; | 104 | bool parse_pdf; |
| 84 | 105 | ||
| 85 | std::vector<StackFrame> stack; | 106 | std::vector<StackFrame> stack; |
| 86 | - StackFrame* frame; | 107 | + StackFrame* frame{nullptr}; |
| 87 | // Number of recent bad tokens. This will always be > 0 once a bad token has been encountered as | 108 | // Number of recent bad tokens. This will always be > 0 once a bad token has been encountered as |
| 88 | // it only gets incremented or reset when a bad token is encountered. | 109 | // it only gets incremented or reset when a bad token is encountered. |
| 89 | int bad_count{0}; | 110 | int bad_count{0}; |
| @@ -92,9 +113,9 @@ class QPDFParser | @@ -92,9 +113,9 @@ class QPDFParser | ||
| 92 | // Number of good tokens since last bad token. Irrelevant if bad_count == 0. | 113 | // Number of good tokens since last bad token. Irrelevant if bad_count == 0. |
| 93 | int good_count{0}; | 114 | int good_count{0}; |
| 94 | // Start offset including any leading whitespace. | 115 | // Start offset including any leading whitespace. |
| 95 | - qpdf_offset_t start; | 116 | + qpdf_offset_t start{0}; |
| 96 | // Number of successive integer tokens. | 117 | // Number of successive integer tokens. |
| 97 | - int int_count = 0; | 118 | + int int_count{0}; |
| 98 | long long int_buffer[2]{0, 0}; | 119 | long long int_buffer[2]{0, 0}; |
| 99 | qpdf_offset_t last_offset_buffer[2]{0, 0}; | 120 | qpdf_offset_t last_offset_buffer[2]{0, 0}; |
| 100 | }; | 121 | }; |
libqpdf/qpdf/QPDF_private.hh
| @@ -3,6 +3,8 @@ | @@ -3,6 +3,8 @@ | ||
| 3 | 3 | ||
| 4 | #include <qpdf/QPDF.hh> | 4 | #include <qpdf/QPDF.hh> |
| 5 | 5 | ||
| 6 | +#include <qpdf/QPDFTokenizer_private.hh> | ||
| 7 | + | ||
| 6 | // Writer class is restricted to QPDFWriter so that only it can call certain methods. | 8 | // Writer class is restricted to QPDFWriter so that only it can call certain methods. |
| 7 | class QPDF::Writer | 9 | class QPDF::Writer |
| 8 | { | 10 | { |
| @@ -452,7 +454,7 @@ class QPDF::Members | @@ -452,7 +454,7 @@ class QPDF::Members | ||
| 452 | private: | 454 | private: |
| 453 | std::shared_ptr<QPDFLogger> log; | 455 | std::shared_ptr<QPDFLogger> log; |
| 454 | unsigned long long unique_id{0}; | 456 | unsigned long long unique_id{0}; |
| 455 | - QPDFTokenizer tokenizer; | 457 | + qpdf::Tokenizer tokenizer; |
| 456 | std::shared_ptr<InputSource> file; | 458 | std::shared_ptr<InputSource> file; |
| 457 | std::string last_object_description; | 459 | std::string last_object_description; |
| 458 | bool provided_password_is_hex_key{false}; | 460 | bool provided_password_is_hex_key{false}; |