Commit 0b3debaf86eda5ecd1dd6447bcf2ac62abb6dd16

Authored by m-holger
Committed by GitHub
2 parents dc1ae845 1536a760

Merge pull request #1253 from m-holger/pl_t

Refactor Pl_QPDFTokenizer
include/qpdf/QPDFTokenizer.hh
@@ -191,6 +191,8 @@ class QPDFTokenizer @@ -191,6 +191,8 @@ class QPDFTokenizer
191 // returns a tt_inline_image token. 191 // returns a tt_inline_image token.
192 QPDF_DLL 192 QPDF_DLL
193 void expectInlineImage(std::shared_ptr<InputSource> input); 193 void expectInlineImage(std::shared_ptr<InputSource> input);
  194 + QPDF_DLL
  195 + void expectInlineImage(InputSource& input);
194 196
195 private: 197 private:
196 friend class QPDFParser; 198 friend class QPDFParser;
@@ -217,7 +219,7 @@ class QPDFTokenizer @@ -217,7 +219,7 @@ class QPDFTokenizer
217 219
218 bool isSpace(char); 220 bool isSpace(char);
219 bool isDelimiter(char); 221 bool isDelimiter(char);
220 - void findEI(std::shared_ptr<InputSource> input); 222 + void findEI(InputSource& input);
221 223
222 enum state_e { 224 enum state_e {
223 st_top, 225 st_top,
libqpdf/ContentNormalizer.cc
1 #include <qpdf/ContentNormalizer.hh> 1 #include <qpdf/ContentNormalizer.hh>
2 2
  3 +#include <qpdf/QPDF_Name.hh>
3 #include <qpdf/QUtil.hh> 4 #include <qpdf/QUtil.hh>
4 5
5 ContentNormalizer::ContentNormalizer() : 6 ContentNormalizer::ContentNormalizer() :
@@ -11,7 +12,6 @@ ContentNormalizer::ContentNormalizer() : @@ -11,7 +12,6 @@ ContentNormalizer::ContentNormalizer() :
11 void 12 void
12 ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) 13 ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
13 { 14 {
14 - std::string value = token.getRawValue();  
15 QPDFTokenizer::token_type_e token_type = token.getType(); 15 QPDFTokenizer::token_type_e token_type = token.getType();
16 16
17 if (token_type == QPDFTokenizer::tt_bad) { 17 if (token_type == QPDFTokenizer::tt_bad) {
@@ -24,40 +24,48 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) @@ -24,40 +24,48 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
24 switch (token_type) { 24 switch (token_type) {
25 case QPDFTokenizer::tt_space: 25 case QPDFTokenizer::tt_space:
26 { 26 {
27 - size_t len = value.length();  
28 - for (size_t i = 0; i < len; ++i) {  
29 - char ch = value.at(i);  
30 - if (ch == '\r') {  
31 - if ((i + 1 < len) && (value.at(i + 1) == '\n')) {  
32 - // ignore  
33 - } else {  
34 - write("\n");  
35 - }  
36 - } else {  
37 - write(&ch, 1); 27 + std::string const& value = token.getRawValue();
  28 + auto size = value.size();
  29 + size_t pos = 0;
  30 + auto r_pos = value.find('\r');
  31 + while (r_pos != std::string::npos) {
  32 + if (pos != r_pos) {
  33 + write(&value[pos], r_pos - pos);
38 } 34 }
  35 + if (++r_pos >= size) {
  36 + write("\n");
  37 + return;
  38 + }
  39 + if (value[r_pos] != '\n') {
  40 + write("\n");
  41 + }
  42 + pos = r_pos;
  43 + r_pos = value.find('\r', pos);
  44 + }
  45 + if (pos < size) {
  46 + write(&value[pos], size - pos);
39 } 47 }
40 } 48 }
41 - break; 49 + return;
42 50
43 case QPDFTokenizer::tt_string: 51 case QPDFTokenizer::tt_string:
44 // Replacing string and name tokens in this way normalizes their representation as this will 52 // Replacing string and name tokens in this way normalizes their representation as this will
45 // automatically handle quoting of unprintable characters, etc. 53 // automatically handle quoting of unprintable characters, etc.
46 - writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_string, token.getValue())); 54 + write(QPDFObjectHandle::newString(token.getValue()).unparse());
47 break; 55 break;
48 56
49 case QPDFTokenizer::tt_name: 57 case QPDFTokenizer::tt_name:
50 - writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_name, token.getValue())); 58 + write(QPDF_Name::normalizeName(token.getValue()));
51 break; 59 break;
52 60
53 default: 61 default:
54 writeToken(token); 62 writeToken(token);
55 - break; 63 + return;
56 } 64 }
57 65
58 - value = token.getRawValue();  
59 - if (((token_type == QPDFTokenizer::tt_string) || (token_type == QPDFTokenizer::tt_name)) &&  
60 - ((value.find('\r') != std::string::npos) || (value.find('\n') != std::string::npos))) { 66 + // tt_string or tt_name
  67 + std::string const& value = token.getRawValue();
  68 + if (value.find('\r') != std::string::npos || value.find('\n') != std::string::npos) {
61 write("\n"); 69 write("\n");
62 } 70 }
63 } 71 }
libqpdf/Pl_QPDFTokenizer.cc
@@ -36,20 +36,17 @@ void @@ -36,20 +36,17 @@ void
36 Pl_QPDFTokenizer::finish() 36 Pl_QPDFTokenizer::finish()
37 { 37 {
38 m->buf.finish(); 38 m->buf.finish();
39 - auto input = std::shared_ptr<InputSource>(  
40 - // line-break  
41 - new BufferInputSource("tokenizer data", m->buf.getBuffer(), true));  
42 - 39 + auto input = BufferInputSource("tokenizer data", m->buf.getBuffer(), true);
  40 + std::string empty;
43 while (true) { 41 while (true) {
44 - QPDFTokenizer::Token token =  
45 - m->tokenizer.readToken(input, "offset " + std::to_string(input->tell()), true); 42 + auto token = m->tokenizer.readToken(input, empty, true);
46 m->filter->handleToken(token); 43 m->filter->handleToken(token);
47 if (token.getType() == QPDFTokenizer::tt_eof) { 44 if (token.getType() == QPDFTokenizer::tt_eof) {
48 break; 45 break;
49 } else if (token.isWord("ID")) { 46 } else if (token.isWord("ID")) {
50 // Read the space after the ID. 47 // Read the space after the ID.
51 char ch = ' '; 48 char ch = ' ';
52 - input->read(&ch, 1); 49 + input.read(&ch, 1);
53 m->filter->handleToken( 50 m->filter->handleToken(
54 // line-break 51 // line-break
55 QPDFTokenizer::Token(QPDFTokenizer::tt_space, std::string(1, ch))); 52 QPDFTokenizer::Token(QPDFTokenizer::tt_space, std::string(1, ch)));
libqpdf/QPDFObjectHandle.cc
@@ -148,7 +148,7 @@ QPDFObjectHandle::TokenFilter::write(std::string const& str) @@ -148,7 +148,7 @@ QPDFObjectHandle::TokenFilter::write(std::string const& str)
148 void 148 void
149 QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token) 149 QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token)
150 { 150 {
151 - std::string value = token.getRawValue(); 151 + std::string const& value = token.getRawValue();
152 write(value.c_str(), value.length()); 152 write(value.c_str(), value.length());
153 } 153 }
154 154
libqpdf/QPDFTokenizer.cc
@@ -27,7 +27,7 @@ namespace @@ -27,7 +27,7 @@ namespace
27 class QPDFWordTokenFinder: public InputSource::Finder 27 class QPDFWordTokenFinder: public InputSource::Finder
28 { 28 {
29 public: 29 public:
30 - QPDFWordTokenFinder(std::shared_ptr<InputSource> is, std::string const& str) : 30 + QPDFWordTokenFinder(InputSource& is, std::string const& str) :
31 is(is), 31 is(is),
32 str(str) 32 str(str)
33 { 33 {
@@ -36,7 +36,7 @@ namespace @@ -36,7 +36,7 @@ namespace
36 bool check() override; 36 bool check() override;
37 37
38 private: 38 private:
39 - std::shared_ptr<InputSource> is; 39 + InputSource& is;
40 std::string str; 40 std::string str;
41 }; 41 };
42 } // namespace 42 } // namespace
@@ -48,21 +48,21 @@ QPDFWordTokenFinder::check() @@ -48,21 +48,21 @@ QPDFWordTokenFinder::check()
48 // delimiter or EOF. 48 // delimiter or EOF.
49 QPDFTokenizer tokenizer; 49 QPDFTokenizer tokenizer;
50 QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true); 50 QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
51 - qpdf_offset_t pos = is->tell(); 51 + qpdf_offset_t pos = is.tell();
52 if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) { 52 if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) {
53 QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); 53 QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
54 return false; 54 return false;
55 } 55 }
56 - qpdf_offset_t token_start = is->getLastOffset(); 56 + qpdf_offset_t token_start = is.getLastOffset();
57 char next; 57 char next;
58 bool next_okay = false; 58 bool next_okay = false;
59 - if (is->read(&next, 1) == 0) { 59 + if (is.read(&next, 1) == 0) {
60 QTC::TC("qpdf", "QPDFTokenizer inline image at EOF"); 60 QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
61 next_okay = true; 61 next_okay = true;
62 } else { 62 } else {
63 next_okay = is_delimiter(next); 63 next_okay = is_delimiter(next);
64 } 64 }
65 - is->seek(pos, SEEK_SET); 65 + is.seek(pos, SEEK_SET);
66 if (!next_okay) { 66 if (!next_okay) {
67 return false; 67 return false;
68 } 68 }
@@ -764,11 +764,17 @@ QPDFTokenizer::presentEOF() @@ -764,11 +764,17 @@ QPDFTokenizer::presentEOF()
764 void 764 void
765 QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input) 765 QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
766 { 766 {
  767 + expectInlineImage(*input);
  768 +}
  769 +
  770 +void
  771 +QPDFTokenizer::expectInlineImage(InputSource& input)
  772 +{
767 if (this->state == st_token_ready) { 773 if (this->state == st_token_ready) {
768 reset(); 774 reset();
769 } else if (this->state != st_before_token) { 775 } else if (this->state != st_before_token) {
770 - throw std::logic_error("QPDFTokenizer::expectInlineImage called"  
771 - " when tokenizer is in improper state"); 776 + throw std::logic_error(
  777 + "QPDFTokenizer::expectInlineImage called when tokenizer is in improper state");
772 } 778 }
773 findEI(input); 779 findEI(input);
774 this->before_token = false; 780 this->before_token = false;
@@ -777,14 +783,10 @@ QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input) @@ -777,14 +783,10 @@ QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
777 } 783 }
778 784
779 void 785 void
780 -QPDFTokenizer::findEI(std::shared_ptr<InputSource> input) 786 +QPDFTokenizer::findEI(InputSource& input)
781 { 787 {
782 - if (!input.get()) {  
783 - return;  
784 - }  
785 -  
786 - qpdf_offset_t last_offset = input->getLastOffset();  
787 - qpdf_offset_t pos = input->tell(); 788 + qpdf_offset_t last_offset = input.getLastOffset();
  789 + qpdf_offset_t pos = input.tell();
788 790
789 // Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several 791 // Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several
790 // tokens or up to EOF. If we find any suspicious-looking or tokens, this is probably still part 792 // tokens or up to EOF. If we find any suspicious-looking or tokens, this is probably still part
@@ -797,10 +799,10 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input) @@ -797,10 +799,10 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
797 bool first_try = true; 799 bool first_try = true;
798 while (!okay) { 800 while (!okay) {
799 QPDFWordTokenFinder f(input, "EI"); 801 QPDFWordTokenFinder f(input, "EI");
800 - if (!input->findFirst("EI", input->tell(), 0, f)) { 802 + if (!input.findFirst("EI", input.tell(), 0, f)) {
801 break; 803 break;
802 } 804 }
803 - this->inline_image_bytes = QIntC::to_size(input->tell() - pos - 2); 805 + inline_image_bytes = QIntC::to_size(input.tell() - pos - 2);
804 806
805 QPDFTokenizer check; 807 QPDFTokenizer check;
806 bool found_bad = false; 808 bool found_bad = false;
@@ -858,8 +860,8 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input) @@ -858,8 +860,8 @@ QPDFTokenizer::findEI(std::shared_ptr<InputSource> input)
858 QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try"); 860 QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try");
859 } 861 }
860 862
861 - input->seek(pos, SEEK_SET);  
862 - input->setLastOffset(last_offset); 863 + input.seek(pos, SEEK_SET);
  864 + input.setLastOffset(last_offset);
863 } 865 }
864 866
865 bool 867 bool
@@ -902,7 +904,7 @@ QPDFTokenizer::readToken( @@ -902,7 +904,7 @@ QPDFTokenizer::readToken(
902 throw QPDFExc( 904 throw QPDFExc(
903 qpdf_e_damaged_pdf, 905 qpdf_e_damaged_pdf,
904 input.getName(), 906 input.getName(),
905 - context, 907 + context.empty() ? "offset " + std::to_string(input.getLastOffset()) : context,
906 input.getLastOffset(), 908 input.getLastOffset(),
907 token.getErrorMessage()); 909 token.getErrorMessage());
908 } 910 }