Commit 0b3debaf86eda5ecd1dd6447bcf2ac62abb6dd16
Committed by GitHub
Merge pull request #1253 from m-holger/pl_t
Refactor Pl_QPDFTokenizer
Showing 5 changed files with 57 additions and 48 deletions
include/qpdf/QPDFTokenizer.hh
| ... | ... | @@ -191,6 +191,8 @@ class QPDFTokenizer |
| 191 | 191 | // returns a tt_inline_image token. |
| 192 | 192 | QPDF_DLL |
| 193 | 193 | void expectInlineImage(std::shared_ptr<InputSource> input); |
| 194 | + QPDF_DLL | |
| 195 | + void expectInlineImage(InputSource& input); | |
| 194 | 196 | |
| 195 | 197 | private: |
| 196 | 198 | friend class QPDFParser; |
| ... | ... | @@ -217,7 +219,7 @@ class QPDFTokenizer |
| 217 | 219 | |
| 218 | 220 | bool isSpace(char); |
| 219 | 221 | bool isDelimiter(char); |
| 220 | - void findEI(std::shared_ptr<InputSource> input); | |
| 222 | + void findEI(InputSource& input); | |
| 221 | 223 | |
| 222 | 224 | enum state_e { |
| 223 | 225 | st_top, | ... | ... |
libqpdf/ContentNormalizer.cc
| 1 | 1 | #include <qpdf/ContentNormalizer.hh> |
| 2 | 2 | |
| 3 | +#include <qpdf/QPDF_Name.hh> | |
| 3 | 4 | #include <qpdf/QUtil.hh> |
| 4 | 5 | |
| 5 | 6 | ContentNormalizer::ContentNormalizer() : |
| ... | ... | @@ -11,7 +12,6 @@ ContentNormalizer::ContentNormalizer() : |
| 11 | 12 | void |
| 12 | 13 | ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) |
| 13 | 14 | { |
| 14 | - std::string value = token.getRawValue(); | |
| 15 | 15 | QPDFTokenizer::token_type_e token_type = token.getType(); |
| 16 | 16 | |
| 17 | 17 | if (token_type == QPDFTokenizer::tt_bad) { |
| ... | ... | @@ -24,40 +24,48 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) |
| 24 | 24 | switch (token_type) { |
| 25 | 25 | case QPDFTokenizer::tt_space: |
| 26 | 26 | { |
| 27 | - size_t len = value.length(); | |
| 28 | - for (size_t i = 0; i < len; ++i) { | |
| 29 | - char ch = value.at(i); | |
| 30 | - if (ch == '\r') { | |
| 31 | - if ((i + 1 < len) && (value.at(i + 1) == '\n')) { | |
| 32 | - // ignore | |
| 33 | - } else { | |
| 34 | - write("\n"); | |
| 35 | - } | |
| 36 | - } else { | |
| 37 | - write(&ch, 1); | |
| 27 | + std::string const& value = token.getRawValue(); | |
| 28 | + auto size = value.size(); | |
| 29 | + size_t pos = 0; | |
| 30 | + auto r_pos = value.find('\r'); | |
| 31 | + while (r_pos != std::string::npos) { | |
| 32 | + if (pos != r_pos) { | |
| 33 | + write(&value[pos], r_pos - pos); | |
| 38 | 34 | } |
| 35 | + if (++r_pos >= size) { | |
| 36 | + write("\n"); | |
| 37 | + return; | |
| 38 | + } | |
| 39 | + if (value[r_pos] != '\n') { | |
| 40 | + write("\n"); | |
| 41 | + } | |
| 42 | + pos = r_pos; | |
| 43 | + r_pos = value.find('\r', pos); | |
| 44 | + } | |
| 45 | + if (pos < size) { | |
| 46 | + write(&value[pos], size - pos); | |
| 39 | 47 | } |
| 40 | 48 | } |
| 41 | - break; | |
| 49 | + return; | |
| 42 | 50 | |
| 43 | 51 | case QPDFTokenizer::tt_string: |
| 44 | 52 | // Replacing string and name tokens in this way normalizes their representation as this will |
| 45 | 53 | // automatically handle quoting of unprintable characters, etc. |
| 46 | - writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_string, token.getValue())); | |
| 54 | + write(QPDFObjectHandle::newString(token.getValue()).unparse()); | |
| 47 | 55 | break; |
| 48 | 56 | |
| 49 | 57 | case QPDFTokenizer::tt_name: |
| 50 | - writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_name, token.getValue())); | |
| 58 | + write(QPDF_Name::normalizeName(token.getValue())); | |
| 51 | 59 | break; |
| 52 | 60 | |
| 53 | 61 | default: |
| 54 | 62 | writeToken(token); |
| 55 | - break; | |
| 63 | + return; | |
| 56 | 64 | } |
| 57 | 65 | |
| 58 | - value = token.getRawValue(); | |
| 59 | - if (((token_type == QPDFTokenizer::tt_string) || (token_type == QPDFTokenizer::tt_name)) && | |
| 60 | - ((value.find('\r') != std::string::npos) || (value.find('\n') != std::string::npos))) { | |
| 66 | + // tt_string or tt_name | |
| 67 | + std::string const& value = token.getRawValue(); | |
| 68 | + if (value.find('\r') != std::string::npos || value.find('\n') != std::string::npos) { | |
| 61 | 69 | write("\n"); |
| 62 | 70 | } |
| 63 | 71 | } | ... | ... |
libqpdf/Pl_QPDFTokenizer.cc
| ... | ... | @@ -36,20 +36,17 @@ void |
| 36 | 36 | Pl_QPDFTokenizer::finish() |
| 37 | 37 | { |
| 38 | 38 | m->buf.finish(); |
| 39 | - auto input = std::shared_ptr<InputSource>( | |
| 40 | - // line-break | |
| 41 | - new BufferInputSource("tokenizer data", m->buf.getBuffer(), true)); | |
| 42 | - | |
| 39 | + auto input = BufferInputSource("tokenizer data", m->buf.getBuffer(), true); | |
| 40 | + std::string empty; | |
| 43 | 41 | while (true) { |
| 44 | - QPDFTokenizer::Token token = | |
| 45 | - m->tokenizer.readToken(input, "offset " + std::to_string(input->tell()), true); | |
| 42 | + auto token = m->tokenizer.readToken(input, empty, true); | |
| 46 | 43 | m->filter->handleToken(token); |
| 47 | 44 | if (token.getType() == QPDFTokenizer::tt_eof) { |
| 48 | 45 | break; |
| 49 | 46 | } else if (token.isWord("ID")) { |
| 50 | 47 | // Read the space after the ID. |
| 51 | 48 | char ch = ' '; |
| 52 | - input->read(&ch, 1); | |
| 49 | + input.read(&ch, 1); | |
| 53 | 50 | m->filter->handleToken( |
| 54 | 51 | // line-break |
| 55 | 52 | QPDFTokenizer::Token(QPDFTokenizer::tt_space, std::string(1, ch))); | ... | ... |
libqpdf/QPDFObjectHandle.cc
| ... | ... | @@ -148,7 +148,7 @@ QPDFObjectHandle::TokenFilter::write(std::string const& str) |
| 148 | 148 | void |
| 149 | 149 | QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token) |
| 150 | 150 | { |
| 151 | - std::string value = token.getRawValue(); | |
| 151 | + std::string const& value = token.getRawValue(); | |
| 152 | 152 | write(value.c_str(), value.length()); |
| 153 | 153 | } |
| 154 | 154 | ... | ... |
libqpdf/QPDFTokenizer.cc
| ... | ... | @@ -27,7 +27,7 @@ namespace |
| 27 | 27 | class QPDFWordTokenFinder: public InputSource::Finder |
| 28 | 28 | { |
| 29 | 29 | public: |
| 30 | - QPDFWordTokenFinder(std::shared_ptr<InputSource> is, std::string const& str) : | |
| 30 | + QPDFWordTokenFinder(InputSource& is, std::string const& str) : | |
| 31 | 31 | is(is), |
| 32 | 32 | str(str) |
| 33 | 33 | { |
| ... | ... | @@ -36,7 +36,7 @@ namespace |
| 36 | 36 | bool check() override; |
| 37 | 37 | |
| 38 | 38 | private: |
| 39 | - std::shared_ptr<InputSource> is; | |
| 39 | + InputSource& is; | |
| 40 | 40 | std::string str; |
| 41 | 41 | }; |
| 42 | 42 | } // namespace |
| ... | ... | @@ -48,21 +48,21 @@ QPDFWordTokenFinder::check() |
| 48 | 48 | // delimiter or EOF. |
| 49 | 49 | QPDFTokenizer tokenizer; |
| 50 | 50 | QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true); |
| 51 | - qpdf_offset_t pos = is->tell(); | |
| 51 | + qpdf_offset_t pos = is.tell(); | |
| 52 | 52 | if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) { |
| 53 | 53 | QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); |
| 54 | 54 | return false; |
| 55 | 55 | } |
| 56 | - qpdf_offset_t token_start = is->getLastOffset(); | |
| 56 | + qpdf_offset_t token_start = is.getLastOffset(); | |
| 57 | 57 | char next; |
| 58 | 58 | bool next_okay = false; |
| 59 | - if (is->read(&next, 1) == 0) { | |
| 59 | + if (is.read(&next, 1) == 0) { | |
| 60 | 60 | QTC::TC("qpdf", "QPDFTokenizer inline image at EOF"); |
| 61 | 61 | next_okay = true; |
| 62 | 62 | } else { |
| 63 | 63 | next_okay = is_delimiter(next); |
| 64 | 64 | } |
| 65 | - is->seek(pos, SEEK_SET); | |
| 65 | + is.seek(pos, SEEK_SET); | |
| 66 | 66 | if (!next_okay) { |
| 67 | 67 | return false; |
| 68 | 68 | } |
| ... | ... | @@ -764,11 +764,17 @@ QPDFTokenizer::presentEOF() |
| 764 | 764 | void |
| 765 | 765 | QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input) |
| 766 | 766 | { |
| 767 | + expectInlineImage(*input); | |
| 768 | +} | |
| 769 | + | |
| 770 | +void | |
| 771 | +QPDFTokenizer::expectInlineImage(InputSource& input) | |
| 772 | +{ | |
| 767 | 773 | if (this->state == st_token_ready) { |
| 768 | 774 | reset(); |
| 769 | 775 | } else if (this->state != st_before_token) { |
| 770 | - throw std::logic_error("QPDFTokenizer::expectInlineImage called" | |
| 771 | - " when tokenizer is in improper state"); | |
| 776 | + throw std::logic_error( | |
| 777 | + "QPDFTokenizer::expectInlineImage called when tokenizer is in improper state"); | |
| 772 | 778 | } |
| 773 | 779 | findEI(input); |
| 774 | 780 | this->before_token = false; |
| ... | ... | @@ -777,14 +783,10 @@ QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input) |
| 777 | 783 | } |
| 778 | 784 | |
| 779 | 785 | void |
| 780 | -QPDFTokenizer::findEI(std::shared_ptr<InputSource> input) | |
| 786 | +QPDFTokenizer::findEI(InputSource& input) | |
| 781 | 787 | { |
| 782 | - if (!input.get()) { | |
| 783 | - return; | |
| 784 | - } | |
| 785 | - | |
| 786 | - qpdf_offset_t last_offset = input->getLastOffset(); | |
| 787 | - qpdf_offset_t pos = input->tell(); | |
| 788 | + qpdf_offset_t last_offset = input.getLastOffset(); | |
| 789 | + qpdf_offset_t pos = input.tell(); | |
| 788 | 790 | |
| 789 | 791 | // Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several |
| 790 | 792 | // tokens or up to EOF. If we find any suspicious-looking tokens, this is probably still part |
| ... | ... | @@ -797,10 +799,10 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input) |
| 797 | 799 | bool first_try = true; |
| 798 | 800 | while (!okay) { |
| 799 | 801 | QPDFWordTokenFinder f(input, "EI"); |
| 800 | - if (!input->findFirst("EI", input->tell(), 0, f)) { | |
| 802 | + if (!input.findFirst("EI", input.tell(), 0, f)) { | |
| 801 | 803 | break; |
| 802 | 804 | } |
| 803 | - this->inline_image_bytes = QIntC::to_size(input->tell() - pos - 2); | |
| 805 | + inline_image_bytes = QIntC::to_size(input.tell() - pos - 2); | |
| 804 | 806 | |
| 805 | 807 | QPDFTokenizer check; |
| 806 | 808 | bool found_bad = false; |
| ... | ... | @@ -858,8 +860,8 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input) |
| 858 | 860 | QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try"); |
| 859 | 861 | } |
| 860 | 862 | |
| 861 | - input->seek(pos, SEEK_SET); | |
| 862 | - input->setLastOffset(last_offset); | |
| 863 | + input.seek(pos, SEEK_SET); | |
| 864 | + input.setLastOffset(last_offset); | |
| 863 | 865 | } |
| 864 | 866 | |
| 865 | 867 | bool |
| ... | ... | @@ -902,7 +904,7 @@ QPDFTokenizer::readToken( |
| 902 | 904 | throw QPDFExc( |
| 903 | 905 | qpdf_e_damaged_pdf, |
| 904 | 906 | input.getName(), |
| 905 | - context, | |
| 907 | + context.empty() ? "offset " + std::to_string(input.getLastOffset()) : context, | |
| 906 | 908 | input.getLastOffset(), |
| 907 | 909 | token.getErrorMessage()); |
| 908 | 910 | } | ... | ... |