diff --git a/include/qpdf/QPDFTokenizer.hh b/include/qpdf/QPDFTokenizer.hh index 15f7a77..ec9bbc1 100644 --- a/include/qpdf/QPDFTokenizer.hh +++ b/include/qpdf/QPDFTokenizer.hh @@ -191,6 +191,8 @@ class QPDFTokenizer // returns a tt_inline_image token. QPDF_DLL void expectInlineImage(std::shared_ptr input); + QPDF_DLL + void expectInlineImage(InputSource& input); private: friend class QPDFParser; @@ -217,7 +219,7 @@ class QPDFTokenizer bool isSpace(char); bool isDelimiter(char); - void findEI(std::shared_ptr input); + void findEI(InputSource& input); enum state_e { st_top, diff --git a/libqpdf/ContentNormalizer.cc b/libqpdf/ContentNormalizer.cc index 47830f4..bca8ad6 100644 --- a/libqpdf/ContentNormalizer.cc +++ b/libqpdf/ContentNormalizer.cc @@ -1,5 +1,6 @@ #include +#include #include ContentNormalizer::ContentNormalizer() : @@ -11,7 +12,6 @@ ContentNormalizer::ContentNormalizer() : void ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) { - std::string value = token.getRawValue(); QPDFTokenizer::token_type_e token_type = token.getType(); if (token_type == QPDFTokenizer::tt_bad) { @@ -24,40 +24,48 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) switch (token_type) { case QPDFTokenizer::tt_space: { - size_t len = value.length(); - for (size_t i = 0; i < len; ++i) { - char ch = value.at(i); - if (ch == '\r') { - if ((i + 1 < len) && (value.at(i + 1) == '\n')) { - // ignore - } else { - write("\n"); - } - } else { - write(&ch, 1); + std::string const& value = token.getRawValue(); + auto size = value.size(); + size_t pos = 0; + auto r_pos = value.find('\r'); + while (r_pos != std::string::npos) { + if (pos != r_pos) { + write(&value[pos], r_pos - pos); } + if (++r_pos >= size) { + write("\n"); + return; + } + if (value[r_pos] != '\n') { + write("\n"); + } + pos = r_pos; + r_pos = value.find('\r', pos); + } + if (pos < size) { + write(&value[pos], size - pos); } } - break; + return; case QPDFTokenizer::tt_string: // Replacing string and name tokens in this way normalizes their representation as this will // automatically handle quoting of unprintable characters, etc. - writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_string, token.getValue())); + write(QPDFObjectHandle::newString(token.getValue()).unparse()); break; case QPDFTokenizer::tt_name: - writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_name, token.getValue())); + write(QPDF_Name::normalizeName(token.getValue())); break; default: writeToken(token); - break; + return; } - value = token.getRawValue(); - if (((token_type == QPDFTokenizer::tt_string) || (token_type == QPDFTokenizer::tt_name)) && - ((value.find('\r') != std::string::npos) || (value.find('\n') != std::string::npos))) { + // tt_string or tt_name + std::string const& value = token.getRawValue(); + if (value.find('\r') != std::string::npos || value.find('\n') != std::string::npos) { write("\n"); } } diff --git a/libqpdf/Pl_QPDFTokenizer.cc b/libqpdf/Pl_QPDFTokenizer.cc index 7bb86d5..91973fc 100644 --- a/libqpdf/Pl_QPDFTokenizer.cc +++ b/libqpdf/Pl_QPDFTokenizer.cc @@ -36,20 +36,17 @@ void Pl_QPDFTokenizer::finish() { m->buf.finish(); - auto input = std::shared_ptr( - // line-break - new BufferInputSource("tokenizer data", m->buf.getBuffer(), true)); - + auto input = BufferInputSource("tokenizer data", m->buf.getBuffer(), true); + std::string empty; while (true) { - QPDFTokenizer::Token token = - m->tokenizer.readToken(input, "offset " + std::to_string(input->tell()), true); + auto token = m->tokenizer.readToken(input, empty, true); m->filter->handleToken(token); if (token.getType() == QPDFTokenizer::tt_eof) { break; } else if (token.isWord("ID")) { // Read the space after the ID. char ch = ' '; - input->read(&ch, 1); + input.read(&ch, 1); m->filter->handleToken( // line-break QPDFTokenizer::Token(QPDFTokenizer::tt_space, std::string(1, ch))); diff --git a/libqpdf/QPDFObjectHandle.cc b/libqpdf/QPDFObjectHandle.cc index 65b3c83..a082020 100644 --- a/libqpdf/QPDFObjectHandle.cc +++ b/libqpdf/QPDFObjectHandle.cc @@ -148,7 +148,7 @@ QPDFObjectHandle::TokenFilter::write(std::string const& str) void QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token) { - std::string value = token.getRawValue(); + std::string const& value = token.getRawValue(); write(value.c_str(), value.length()); } diff --git a/libqpdf/QPDFTokenizer.cc b/libqpdf/QPDFTokenizer.cc index ca09708..d48abd3 100644 --- a/libqpdf/QPDFTokenizer.cc +++ b/libqpdf/QPDFTokenizer.cc @@ -27,7 +27,7 @@ namespace class QPDFWordTokenFinder: public InputSource::Finder { public: - QPDFWordTokenFinder(std::shared_ptr is, std::string const& str) : + QPDFWordTokenFinder(InputSource& is, std::string const& str) : is(is), str(str) { @@ -36,7 +36,7 @@ namespace bool check() override; private: - std::shared_ptr is; + InputSource& is; std::string str; }; } // namespace @@ -48,21 +48,21 @@ QPDFWordTokenFinder::check() // delimiter or EOF. QPDFTokenizer tokenizer; QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true); - qpdf_offset_t pos = is->tell(); + qpdf_offset_t pos = is.tell(); if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) { QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); return false; } - qpdf_offset_t token_start = is->getLastOffset(); + qpdf_offset_t token_start = is.getLastOffset(); char next; bool next_okay = false; - if (is->read(&next, 1) == 0) { + if (is.read(&next, 1) == 0) { QTC::TC("qpdf", "QPDFTokenizer inline image at EOF"); next_okay = true; } else { next_okay = is_delimiter(next); } - is->seek(pos, SEEK_SET); + is.seek(pos, SEEK_SET); if (!next_okay) { return false; } @@ -764,11 +764,17 @@ QPDFTokenizer::presentEOF() void QPDFTokenizer::expectInlineImage(std::shared_ptr input) { + expectInlineImage(*input); +} + +void +QPDFTokenizer::expectInlineImage(InputSource& input) +{ if (this->state == st_token_ready) { reset(); } else if (this->state != st_before_token) { - throw std::logic_error("QPDFTokenizer::expectInlineImage called" - " when tokenizer is in improper state"); + throw std::logic_error( + "QPDFTokenizer::expectInlineImage called when tokenizer is in improper state"); } findEI(input); this->before_token = false; @@ -777,14 +783,10 @@ QPDFTokenizer::expectInlineImage(std::shared_ptr input) } void -QPDFTokenizer::findEI(std::shared_ptr input) +QPDFTokenizer::findEI(InputSource& input) { - if (!input.get()) { - return; - } - - qpdf_offset_t last_offset = input->getLastOffset(); - qpdf_offset_t pos = input->tell(); + qpdf_offset_t last_offset = input.getLastOffset(); + qpdf_offset_t pos = input.tell(); // Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several // tokens or up to EOF. If we find any suspicious-looking or tokens, this is probably still part @@ -797,10 +799,10 @@ QPDFTokenizer::findEI(std::shared_ptr input) bool first_try = true; while (!okay) { QPDFWordTokenFinder f(input, "EI"); - if (!input->findFirst("EI", input->tell(), 0, f)) { + if (!input.findFirst("EI", input.tell(), 0, f)) { break; } - this->inline_image_bytes = QIntC::to_size(input->tell() - pos - 2); + inline_image_bytes = QIntC::to_size(input.tell() - pos - 2); QPDFTokenizer check; bool found_bad = false; @@ -858,8 +860,8 @@ QPDFTokenizer::findEI(std::shared_ptr input) QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try"); } - input->seek(pos, SEEK_SET); - input->setLastOffset(last_offset); + input.seek(pos, SEEK_SET); + input.setLastOffset(last_offset); } bool @@ -902,7 +904,7 @@ QPDFTokenizer::readToken( throw QPDFExc( qpdf_e_damaged_pdf, input.getName(), - context, + context.empty() ? "offset " + std::to_string(input.getLastOffset()) : context, input.getLastOffset(), token.getErrorMessage()); }