Commit 0b3debaf86eda5ecd1dd6447bcf2ac62abb6dd16
Committed by GitHub
Merge pull request #1253 from m-holger/pl_t
Refactor Pl_QPDFTokenizer
Showing 5 changed files with 57 additions and 48 deletions
include/qpdf/QPDFTokenizer.hh
| @@ -191,6 +191,8 @@ class QPDFTokenizer | @@ -191,6 +191,8 @@ class QPDFTokenizer | ||
| 191 | // returns a tt_inline_image token. | 191 | // returns a tt_inline_image token. |
| 192 | QPDF_DLL | 192 | QPDF_DLL |
| 193 | void expectInlineImage(std::shared_ptr<InputSource> input); | 193 | void expectInlineImage(std::shared_ptr<InputSource> input); |
| 194 | + QPDF_DLL | ||
| 195 | + void expectInlineImage(InputSource& input); | ||
| 194 | 196 | ||
| 195 | private: | 197 | private: |
| 196 | friend class QPDFParser; | 198 | friend class QPDFParser; |
| @@ -217,7 +219,7 @@ class QPDFTokenizer | @@ -217,7 +219,7 @@ class QPDFTokenizer | ||
| 217 | 219 | ||
| 218 | bool isSpace(char); | 220 | bool isSpace(char); |
| 219 | bool isDelimiter(char); | 221 | bool isDelimiter(char); |
| 220 | - void findEI(std::shared_ptr<InputSource> input); | 222 | + void findEI(InputSource& input); |
| 221 | 223 | ||
| 222 | enum state_e { | 224 | enum state_e { |
| 223 | st_top, | 225 | st_top, |
libqpdf/ContentNormalizer.cc
| 1 | #include <qpdf/ContentNormalizer.hh> | 1 | #include <qpdf/ContentNormalizer.hh> |
| 2 | 2 | ||
| 3 | +#include <qpdf/QPDF_Name.hh> | ||
| 3 | #include <qpdf/QUtil.hh> | 4 | #include <qpdf/QUtil.hh> |
| 4 | 5 | ||
| 5 | ContentNormalizer::ContentNormalizer() : | 6 | ContentNormalizer::ContentNormalizer() : |
| @@ -11,7 +12,6 @@ ContentNormalizer::ContentNormalizer() : | @@ -11,7 +12,6 @@ ContentNormalizer::ContentNormalizer() : | ||
| 11 | void | 12 | void |
| 12 | ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) | 13 | ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) |
| 13 | { | 14 | { |
| 14 | - std::string value = token.getRawValue(); | ||
| 15 | QPDFTokenizer::token_type_e token_type = token.getType(); | 15 | QPDFTokenizer::token_type_e token_type = token.getType(); |
| 16 | 16 | ||
| 17 | if (token_type == QPDFTokenizer::tt_bad) { | 17 | if (token_type == QPDFTokenizer::tt_bad) { |
| @@ -24,40 +24,48 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) | @@ -24,40 +24,48 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) | ||
| 24 | switch (token_type) { | 24 | switch (token_type) { |
| 25 | case QPDFTokenizer::tt_space: | 25 | case QPDFTokenizer::tt_space: |
| 26 | { | 26 | { |
| 27 | - size_t len = value.length(); | ||
| 28 | - for (size_t i = 0; i < len; ++i) { | ||
| 29 | - char ch = value.at(i); | ||
| 30 | - if (ch == '\r') { | ||
| 31 | - if ((i + 1 < len) && (value.at(i + 1) == '\n')) { | ||
| 32 | - // ignore | ||
| 33 | - } else { | ||
| 34 | - write("\n"); | ||
| 35 | - } | ||
| 36 | - } else { | ||
| 37 | - write(&ch, 1); | 27 | + std::string const& value = token.getRawValue(); |
| 28 | + auto size = value.size(); | ||
| 29 | + size_t pos = 0; | ||
| 30 | + auto r_pos = value.find('\r'); | ||
| 31 | + while (r_pos != std::string::npos) { | ||
| 32 | + if (pos != r_pos) { | ||
| 33 | + write(&value[pos], r_pos - pos); | ||
| 38 | } | 34 | } |
| 35 | + if (++r_pos >= size) { | ||
| 36 | + write("\n"); | ||
| 37 | + return; | ||
| 38 | + } | ||
| 39 | + if (value[r_pos] != '\n') { | ||
| 40 | + write("\n"); | ||
| 41 | + } | ||
| 42 | + pos = r_pos; | ||
| 43 | + r_pos = value.find('\r', pos); | ||
| 44 | + } | ||
| 45 | + if (pos < size) { | ||
| 46 | + write(&value[pos], size - pos); | ||
| 39 | } | 47 | } |
| 40 | } | 48 | } |
| 41 | - break; | 49 | + return; |
| 42 | 50 | ||
| 43 | case QPDFTokenizer::tt_string: | 51 | case QPDFTokenizer::tt_string: |
| 44 | // Replacing string and name tokens in this way normalizes their representation as this will | 52 | // Replacing string and name tokens in this way normalizes their representation as this will |
| 45 | // automatically handle quoting of unprintable characters, etc. | 53 | // automatically handle quoting of unprintable characters, etc. |
| 46 | - writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_string, token.getValue())); | 54 | + write(QPDFObjectHandle::newString(token.getValue()).unparse()); |
| 47 | break; | 55 | break; |
| 48 | 56 | ||
| 49 | case QPDFTokenizer::tt_name: | 57 | case QPDFTokenizer::tt_name: |
| 50 | - writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_name, token.getValue())); | 58 | + write(QPDF_Name::normalizeName(token.getValue())); |
| 51 | break; | 59 | break; |
| 52 | 60 | ||
| 53 | default: | 61 | default: |
| 54 | writeToken(token); | 62 | writeToken(token); |
| 55 | - break; | 63 | + return; |
| 56 | } | 64 | } |
| 57 | 65 | ||
| 58 | - value = token.getRawValue(); | ||
| 59 | - if (((token_type == QPDFTokenizer::tt_string) || (token_type == QPDFTokenizer::tt_name)) && | ||
| 60 | - ((value.find('\r') != std::string::npos) || (value.find('\n') != std::string::npos))) { | 66 | + // tt_string or tt_name |
| 67 | + std::string const& value = token.getRawValue(); | ||
| 68 | + if (value.find('\r') != std::string::npos || value.find('\n') != std::string::npos) { | ||
| 61 | write("\n"); | 69 | write("\n"); |
| 62 | } | 70 | } |
| 63 | } | 71 | } |
libqpdf/Pl_QPDFTokenizer.cc
| @@ -36,20 +36,17 @@ void | @@ -36,20 +36,17 @@ void | ||
| 36 | Pl_QPDFTokenizer::finish() | 36 | Pl_QPDFTokenizer::finish() |
| 37 | { | 37 | { |
| 38 | m->buf.finish(); | 38 | m->buf.finish(); |
| 39 | - auto input = std::shared_ptr<InputSource>( | ||
| 40 | - // line-break | ||
| 41 | - new BufferInputSource("tokenizer data", m->buf.getBuffer(), true)); | ||
| 42 | - | 39 | + auto input = BufferInputSource("tokenizer data", m->buf.getBuffer(), true); |
| 40 | + std::string empty; | ||
| 43 | while (true) { | 41 | while (true) { |
| 44 | - QPDFTokenizer::Token token = | ||
| 45 | - m->tokenizer.readToken(input, "offset " + std::to_string(input->tell()), true); | 42 | + auto token = m->tokenizer.readToken(input, empty, true); |
| 46 | m->filter->handleToken(token); | 43 | m->filter->handleToken(token); |
| 47 | if (token.getType() == QPDFTokenizer::tt_eof) { | 44 | if (token.getType() == QPDFTokenizer::tt_eof) { |
| 48 | break; | 45 | break; |
| 49 | } else if (token.isWord("ID")) { | 46 | } else if (token.isWord("ID")) { |
| 50 | // Read the space after the ID. | 47 | // Read the space after the ID. |
| 51 | char ch = ' '; | 48 | char ch = ' '; |
| 52 | - input->read(&ch, 1); | 49 | + input.read(&ch, 1); |
| 53 | m->filter->handleToken( | 50 | m->filter->handleToken( |
| 54 | // line-break | 51 | // line-break |
| 55 | QPDFTokenizer::Token(QPDFTokenizer::tt_space, std::string(1, ch))); | 52 | QPDFTokenizer::Token(QPDFTokenizer::tt_space, std::string(1, ch))); |
libqpdf/QPDFObjectHandle.cc
| @@ -148,7 +148,7 @@ QPDFObjectHandle::TokenFilter::write(std::string const& str) | @@ -148,7 +148,7 @@ QPDFObjectHandle::TokenFilter::write(std::string const& str) | ||
| 148 | void | 148 | void |
| 149 | QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token) | 149 | QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token) |
| 150 | { | 150 | { |
| 151 | - std::string value = token.getRawValue(); | 151 | + std::string const& value = token.getRawValue(); |
| 152 | write(value.c_str(), value.length()); | 152 | write(value.c_str(), value.length()); |
| 153 | } | 153 | } |
| 154 | 154 |
libqpdf/QPDFTokenizer.cc
| @@ -27,7 +27,7 @@ namespace | @@ -27,7 +27,7 @@ namespace | ||
| 27 | class QPDFWordTokenFinder: public InputSource::Finder | 27 | class QPDFWordTokenFinder: public InputSource::Finder |
| 28 | { | 28 | { |
| 29 | public: | 29 | public: |
| 30 | - QPDFWordTokenFinder(std::shared_ptr<InputSource> is, std::string const& str) : | 30 | + QPDFWordTokenFinder(InputSource& is, std::string const& str) : |
| 31 | is(is), | 31 | is(is), |
| 32 | str(str) | 32 | str(str) |
| 33 | { | 33 | { |
| @@ -36,7 +36,7 @@ namespace | @@ -36,7 +36,7 @@ namespace | ||
| 36 | bool check() override; | 36 | bool check() override; |
| 37 | 37 | ||
| 38 | private: | 38 | private: |
| 39 | - std::shared_ptr<InputSource> is; | 39 | + InputSource& is; |
| 40 | std::string str; | 40 | std::string str; |
| 41 | }; | 41 | }; |
| 42 | } // namespace | 42 | } // namespace |
| @@ -48,21 +48,21 @@ QPDFWordTokenFinder::check() | @@ -48,21 +48,21 @@ QPDFWordTokenFinder::check() | ||
| 48 | // delimiter or EOF. | 48 | // delimiter or EOF. |
| 49 | QPDFTokenizer tokenizer; | 49 | QPDFTokenizer tokenizer; |
| 50 | QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true); | 50 | QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true); |
| 51 | - qpdf_offset_t pos = is->tell(); | 51 | + qpdf_offset_t pos = is.tell(); |
| 52 | if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) { | 52 | if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) { |
| 53 | QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); | 53 | QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); |
| 54 | return false; | 54 | return false; |
| 55 | } | 55 | } |
| 56 | - qpdf_offset_t token_start = is->getLastOffset(); | 56 | + qpdf_offset_t token_start = is.getLastOffset(); |
| 57 | char next; | 57 | char next; |
| 58 | bool next_okay = false; | 58 | bool next_okay = false; |
| 59 | - if (is->read(&next, 1) == 0) { | 59 | + if (is.read(&next, 1) == 0) { |
| 60 | QTC::TC("qpdf", "QPDFTokenizer inline image at EOF"); | 60 | QTC::TC("qpdf", "QPDFTokenizer inline image at EOF"); |
| 61 | next_okay = true; | 61 | next_okay = true; |
| 62 | } else { | 62 | } else { |
| 63 | next_okay = is_delimiter(next); | 63 | next_okay = is_delimiter(next); |
| 64 | } | 64 | } |
| 65 | - is->seek(pos, SEEK_SET); | 65 | + is.seek(pos, SEEK_SET); |
| 66 | if (!next_okay) { | 66 | if (!next_okay) { |
| 67 | return false; | 67 | return false; |
| 68 | } | 68 | } |
| @@ -764,11 +764,17 @@ QPDFTokenizer::presentEOF() | @@ -764,11 +764,17 @@ QPDFTokenizer::presentEOF() | ||
| 764 | void | 764 | void |
| 765 | QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input) | 765 | QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input) |
| 766 | { | 766 | { |
| 767 | + expectInlineImage(*input); | ||
| 768 | +} | ||
| 769 | + | ||
| 770 | +void | ||
| 771 | +QPDFTokenizer::expectInlineImage(InputSource& input) | ||
| 772 | +{ | ||
| 767 | if (this->state == st_token_ready) { | 773 | if (this->state == st_token_ready) { |
| 768 | reset(); | 774 | reset(); |
| 769 | } else if (this->state != st_before_token) { | 775 | } else if (this->state != st_before_token) { |
| 770 | - throw std::logic_error("QPDFTokenizer::expectInlineImage called" | ||
| 771 | - " when tokenizer is in improper state"); | 776 | + throw std::logic_error( |
| 777 | + "QPDFTokenizer::expectInlineImage called when tokenizer is in improper state"); | ||
| 772 | } | 778 | } |
| 773 | findEI(input); | 779 | findEI(input); |
| 774 | this->before_token = false; | 780 | this->before_token = false; |
| @@ -777,14 +783,10 @@ QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input) | @@ -777,14 +783,10 @@ QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input) | ||
| 777 | } | 783 | } |
| 778 | 784 | ||
| 779 | void | 785 | void |
| 780 | -QPDFTokenizer::findEI(std::shared_ptr<InputSource> input) | 786 | +QPDFTokenizer::findEI(InputSource& input) |
| 781 | { | 787 | { |
| 782 | - if (!input.get()) { | ||
| 783 | - return; | ||
| 784 | - } | ||
| 785 | - | ||
| 786 | - qpdf_offset_t last_offset = input->getLastOffset(); | ||
| 787 | - qpdf_offset_t pos = input->tell(); | 788 | + qpdf_offset_t last_offset = input.getLastOffset(); |
| 789 | + qpdf_offset_t pos = input.tell(); | ||
| 788 | 790 | ||
| 789 | // Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several | 791 | // Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several |
| 790 | // tokens or up to EOF. If we find any suspicious-looking or tokens, this is probably still part | 792 | // tokens or up to EOF. If we find any suspicious-looking or tokens, this is probably still part |
| @@ -797,10 +799,10 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input) | @@ -797,10 +799,10 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input) | ||
| 797 | bool first_try = true; | 799 | bool first_try = true; |
| 798 | while (!okay) { | 800 | while (!okay) { |
| 799 | QPDFWordTokenFinder f(input, "EI"); | 801 | QPDFWordTokenFinder f(input, "EI"); |
| 800 | - if (!input->findFirst("EI", input->tell(), 0, f)) { | 802 | + if (!input.findFirst("EI", input.tell(), 0, f)) { |
| 801 | break; | 803 | break; |
| 802 | } | 804 | } |
| 803 | - this->inline_image_bytes = QIntC::to_size(input->tell() - pos - 2); | 805 | + inline_image_bytes = QIntC::to_size(input.tell() - pos - 2); |
| 804 | 806 | ||
| 805 | QPDFTokenizer check; | 807 | QPDFTokenizer check; |
| 806 | bool found_bad = false; | 808 | bool found_bad = false; |
| @@ -858,8 +860,8 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input) | @@ -858,8 +860,8 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input) | ||
| 858 | QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try"); | 860 | QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try"); |
| 859 | } | 861 | } |
| 860 | 862 | ||
| 861 | - input->seek(pos, SEEK_SET); | ||
| 862 | - input->setLastOffset(last_offset); | 863 | + input.seek(pos, SEEK_SET); |
| 864 | + input.setLastOffset(last_offset); | ||
| 863 | } | 865 | } |
| 864 | 866 | ||
| 865 | bool | 867 | bool |
| @@ -902,7 +904,7 @@ QPDFTokenizer::readToken( | @@ -902,7 +904,7 @@ QPDFTokenizer::readToken( | ||
| 902 | throw QPDFExc( | 904 | throw QPDFExc( |
| 903 | qpdf_e_damaged_pdf, | 905 | qpdf_e_damaged_pdf, |
| 904 | input.getName(), | 906 | input.getName(), |
| 905 | - context, | 907 | + context.empty() ? "offset " + std::to_string(input.getLastOffset()) : context, |
| 906 | input.getLastOffset(), | 908 | input.getLastOffset(), |
| 907 | token.getErrorMessage()); | 909 | token.getErrorMessage()); |
| 908 | } | 910 | } |