Commit 0b3debaf86eda5ecd1dd6447bcf2ac62abb6dd16

Authored by m-holger
Committed by GitHub
2 parents dc1ae845 1536a760

Merge pull request #1253 from m-holger/pl_t

Refactor Pl_QPDFTokenizer
include/qpdf/QPDFTokenizer.hh
... ... @@ -191,6 +191,8 @@ class QPDFTokenizer
191 191 // returns a tt_inline_image token.
192 192 QPDF_DLL
193 193 void expectInlineImage(std::shared_ptr<InputSource> input);
  194 + QPDF_DLL
  195 + void expectInlineImage(InputSource& input);
194 196  
195 197 private:
196 198 friend class QPDFParser;
... ... @@ -217,7 +219,7 @@ class QPDFTokenizer
217 219  
218 220 bool isSpace(char);
219 221 bool isDelimiter(char);
220   - void findEI(std::shared_ptr<InputSource> input);
  222 + void findEI(InputSource& input);
221 223  
222 224 enum state_e {
223 225 st_top,
... ...
libqpdf/ContentNormalizer.cc
1 1 #include <qpdf/ContentNormalizer.hh>
2 2  
  3 +#include <qpdf/QPDF_Name.hh>
3 4 #include <qpdf/QUtil.hh>
4 5  
5 6 ContentNormalizer::ContentNormalizer() :
... ... @@ -11,7 +12,6 @@ ContentNormalizer::ContentNormalizer() :
11 12 void
12 13 ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
13 14 {
14   - std::string value = token.getRawValue();
15 15 QPDFTokenizer::token_type_e token_type = token.getType();
16 16  
17 17 if (token_type == QPDFTokenizer::tt_bad) {
... ... @@ -24,40 +24,48 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
24 24 switch (token_type) {
25 25 case QPDFTokenizer::tt_space:
26 26 {
27   - size_t len = value.length();
28   - for (size_t i = 0; i < len; ++i) {
29   - char ch = value.at(i);
30   - if (ch == '\r') {
31   - if ((i + 1 < len) && (value.at(i + 1) == '\n')) {
32   - // ignore
33   - } else {
34   - write("\n");
35   - }
36   - } else {
37   - write(&ch, 1);
  27 + std::string const& value = token.getRawValue();
  28 + auto size = value.size();
  29 + size_t pos = 0;
  30 + auto r_pos = value.find('\r');
  31 + while (r_pos != std::string::npos) {
  32 + if (pos != r_pos) {
  33 + write(&value[pos], r_pos - pos);
38 34 }
  35 + if (++r_pos >= size) {
  36 + write("\n");
  37 + return;
  38 + }
  39 + if (value[r_pos] != '\n') {
  40 + write("\n");
  41 + }
  42 + pos = r_pos;
  43 + r_pos = value.find('\r', pos);
  44 + }
  45 + if (pos < size) {
  46 + write(&value[pos], size - pos);
39 47 }
40 48 }
41   - break;
  49 + return;
42 50  
43 51 case QPDFTokenizer::tt_string:
44 52 // Replacing string and name tokens in this way normalizes their representation as this will
45 53 // automatically handle quoting of unprintable characters, etc.
46   - writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_string, token.getValue()));
  54 + write(QPDFObjectHandle::newString(token.getValue()).unparse());
47 55 break;
48 56  
49 57 case QPDFTokenizer::tt_name:
50   - writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_name, token.getValue()));
  58 + write(QPDF_Name::normalizeName(token.getValue()));
51 59 break;
52 60  
53 61 default:
54 62 writeToken(token);
55   - break;
  63 + return;
56 64 }
57 65  
58   - value = token.getRawValue();
59   - if (((token_type == QPDFTokenizer::tt_string) || (token_type == QPDFTokenizer::tt_name)) &&
60   - ((value.find('\r') != std::string::npos) || (value.find('\n') != std::string::npos))) {
  66 + // tt_string or tt_name
  67 + std::string const& value = token.getRawValue();
  68 + if (value.find('\r') != std::string::npos || value.find('\n') != std::string::npos) {
61 69 write("\n");
62 70 }
63 71 }
... ...
libqpdf/Pl_QPDFTokenizer.cc
... ... @@ -36,20 +36,17 @@ void
36 36 Pl_QPDFTokenizer::finish()
37 37 {
38 38 m->buf.finish();
39   - auto input = std::shared_ptr<InputSource>(
40   - // line-break
41   - new BufferInputSource("tokenizer data", m->buf.getBuffer(), true));
42   -
  39 + auto input = BufferInputSource("tokenizer data", m->buf.getBuffer(), true);
  40 + std::string empty;
43 41 while (true) {
44   - QPDFTokenizer::Token token =
45   - m->tokenizer.readToken(input, "offset " + std::to_string(input->tell()), true);
  42 + auto token = m->tokenizer.readToken(input, empty, true);
46 43 m->filter->handleToken(token);
47 44 if (token.getType() == QPDFTokenizer::tt_eof) {
48 45 break;
49 46 } else if (token.isWord("ID")) {
50 47 // Read the space after the ID.
51 48 char ch = ' ';
52   - input->read(&ch, 1);
  49 + input.read(&ch, 1);
53 50 m->filter->handleToken(
54 51 // line-break
55 52 QPDFTokenizer::Token(QPDFTokenizer::tt_space, std::string(1, ch)));
... ...
libqpdf/QPDFObjectHandle.cc
... ... @@ -148,7 +148,7 @@ QPDFObjectHandle::TokenFilter::write(std::string const& str)
148 148 void
149 149 QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token)
150 150 {
151   - std::string value = token.getRawValue();
  151 + std::string const& value = token.getRawValue();
152 152 write(value.c_str(), value.length());
153 153 }
154 154  
... ...
libqpdf/QPDFTokenizer.cc
... ... @@ -27,7 +27,7 @@ namespace
27 27 class QPDFWordTokenFinder: public InputSource::Finder
28 28 {
29 29 public:
30   - QPDFWordTokenFinder(std::shared_ptr<InputSource> is, std::string const& str) :
  30 + QPDFWordTokenFinder(InputSource& is, std::string const& str) :
31 31 is(is),
32 32 str(str)
33 33 {
... ... @@ -36,7 +36,7 @@ namespace
36 36 bool check() override;
37 37  
38 38 private:
39   - std::shared_ptr<InputSource> is;
  39 + InputSource& is;
40 40 std::string str;
41 41 };
42 42 } // namespace
... ... @@ -48,21 +48,21 @@ QPDFWordTokenFinder::check()
48 48 // delimiter or EOF.
49 49 QPDFTokenizer tokenizer;
50 50 QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
51   - qpdf_offset_t pos = is->tell();
  51 + qpdf_offset_t pos = is.tell();
52 52 if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) {
53 53 QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
54 54 return false;
55 55 }
56   - qpdf_offset_t token_start = is->getLastOffset();
  56 + qpdf_offset_t token_start = is.getLastOffset();
57 57 char next;
58 58 bool next_okay = false;
59   - if (is->read(&next, 1) == 0) {
  59 + if (is.read(&next, 1) == 0) {
60 60 QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
61 61 next_okay = true;
62 62 } else {
63 63 next_okay = is_delimiter(next);
64 64 }
65   - is->seek(pos, SEEK_SET);
  65 + is.seek(pos, SEEK_SET);
66 66 if (!next_okay) {
67 67 return false;
68 68 }
... ... @@ -764,11 +764,17 @@ QPDFTokenizer::presentEOF()
764 764 void
765 765 QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
766 766 {
  767 + expectInlineImage(*input);
  768 +}
  769 +
  770 +void
  771 +QPDFTokenizer::expectInlineImage(InputSource& input)
  772 +{
767 773 if (this->state == st_token_ready) {
768 774 reset();
769 775 } else if (this->state != st_before_token) {
770   - throw std::logic_error("QPDFTokenizer::expectInlineImage called"
771   - " when tokenizer is in improper state");
  776 + throw std::logic_error(
  777 + "QPDFTokenizer::expectInlineImage called when tokenizer is in improper state");
772 778 }
773 779 findEI(input);
774 780 this->before_token = false;
... ... @@ -777,14 +783,10 @@ QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
777 783 }
778 784  
779 785 void
780   -QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
  786 +QPDFTokenizer::findEI(InputSource& input)
781 787 {
782   - if (!input.get()) {
783   - return;
784   - }
785   -
786   - qpdf_offset_t last_offset = input->getLastOffset();
787   - qpdf_offset_t pos = input->tell();
  788 + qpdf_offset_t last_offset = input.getLastOffset();
  789 + qpdf_offset_t pos = input.tell();
788 790  
789 791 // Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several
790 792 // tokens or up to EOF. If we find any suspicious-looking or tokens, this is probably still part
... ... @@ -797,10 +799,10 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
797 799 bool first_try = true;
798 800 while (!okay) {
799 801 QPDFWordTokenFinder f(input, "EI");
800   - if (!input->findFirst("EI", input->tell(), 0, f)) {
  802 + if (!input.findFirst("EI", input.tell(), 0, f)) {
801 803 break;
802 804 }
803   - this->inline_image_bytes = QIntC::to_size(input->tell() - pos - 2);
  805 + inline_image_bytes = QIntC::to_size(input.tell() - pos - 2);
804 806  
805 807 QPDFTokenizer check;
806 808 bool found_bad = false;
... ... @@ -858,8 +860,8 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
858 860 QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try");
859 861 }
860 862  
861   - input->seek(pos, SEEK_SET);
862   - input->setLastOffset(last_offset);
  863 + input.seek(pos, SEEK_SET);
  864 + input.setLastOffset(last_offset);
863 865 }
864 866  
865 867 bool
... ... @@ -902,7 +904,7 @@ QPDFTokenizer::readToken(
902 904 throw QPDFExc(
903 905 qpdf_e_damaged_pdf,
904 906 input.getName(),
905   - context,
  907 + context.empty() ? "offset " + std::to_string(input.getLastOffset()) : context,
906 908 input.getLastOffset(),
907 909 token.getErrorMessage());
908 910 }
... ...