Commit ec9e310c9ea9cee8d9e16cad2a68f0ad096f3a4b

Authored by Jay Berkenbilt
1 parent 31372edc

Refactor QPDFTokenizer's inline image handling

Add a version of expectInlineImage that takes an input source and
searches for EI. This is in preparation for improving the way EI is
found. This commit just refactors the code without changing the
functionality and adds tests to make sure the old and new code behave
identically.
include/qpdf/Pl_QPDFTokenizer.hh
... ... @@ -27,6 +27,7 @@
27 27 #include <qpdf/QPDFTokenizer.hh>
28 28 #include <qpdf/PointerHolder.hh>
29 29 #include <qpdf/QPDFObjectHandle.hh>
  30 +#include <qpdf/Pl_Buffer.hh>
30 31  
31 32 // Tokenize the incoming text using QPDFTokenizer and pass the tokens
32 33 // in turn to a QPDFObjectHandle::TokenFilter object. All bytes of
... ... @@ -56,9 +57,6 @@ class Pl_QPDFTokenizer: public Pipeline
56 57 virtual void finish();
57 58  
58 59 private:
59   - void processChar(char ch);
60   - void checkUnread();
61   -
62 60 class Members
63 61 {
64 62 friend class Pl_QPDFTokenizer;
... ... @@ -73,9 +71,7 @@ class Pl_QPDFTokenizer: public Pipeline
73 71  
74 72 QPDFObjectHandle::TokenFilter* filter;
75 73 QPDFTokenizer tokenizer;
76   - bool last_char_was_cr;
77   - bool unread_char;
78   - char char_to_unread;
  74 + Pl_Buffer buf;
79 75 };
80 76 PointerHolder<Members> m;
81 77 };
... ...
include/qpdf/QPDFTokenizer.hh
... ... @@ -178,7 +178,15 @@ class QPDFTokenizer
178 178 // including the next EI token. After you call this method, the
179 179 // next call to readToken (or the token created next time getToken
180 180 // returns true) will either be tt_inline_image or tt_bad. This is
181   - // the only way readToken returns a tt_inline_image token.
  181 + // the only way readToken returns a tt_inline_image token. The
  182 + // version of this method that takes a PointerHolder<InputSource>
  183 + // does a better job of locating the end of the inline image and
  184 + // should be used whenever the input source is available. It
  185 + // preserves both tell() and getLastOffset(). The version without
  186 + // the input source will always end the inline image the first
  187 + // time it sees something that looks like an EI operator.
  188 + QPDF_DLL
  189 + void expectInlineImage(PointerHolder<InputSource> input);
182 190 QPDF_DLL
183 191 void expectInlineImage();
184 192  
... ... @@ -223,6 +231,7 @@ class QPDFTokenizer
223 231 std::string error_message;
224 232 bool unread_char;
225 233 char char_to_unread;
  234 + size_t inline_image_bytes;
226 235  
227 236 // State for strings
228 237 int string_depth;
... ...
libqpdf/Pl_QPDFTokenizer.cc
1 1 #include <qpdf/Pl_QPDFTokenizer.hh>
2 2 #include <qpdf/QTC.hh>
  3 +#include <qpdf/QUtil.hh>
  4 +#include <qpdf/BufferInputSource.hh>
3 5 #include <stdexcept>
4 6 #include <string.h>
5 7  
6 8 Pl_QPDFTokenizer::Members::Members() :
7 9 filter(0),
8   - last_char_was_cr(false),
9   - unread_char(false),
10   - char_to_unread('\0')
  10 + buf("tokenizer buffer")
11 11 {
12 12 }
13 13  
... ... @@ -33,61 +33,36 @@ Pl_QPDFTokenizer::~Pl_QPDFTokenizer()
33 33 }
34 34  
35 35 void
36   -Pl_QPDFTokenizer::processChar(char ch)
  36 +Pl_QPDFTokenizer::write(unsigned char* data, size_t len)
37 37 {
38   - this->m->tokenizer.presentCharacter(ch);
39   - QPDFTokenizer::Token token;
40   - if (this->m->tokenizer.getToken(
41   - token, this->m->unread_char, this->m->char_to_unread))
42   - {
43   - this->m->filter->handleToken(token);
44   - if ((token.getType() == QPDFTokenizer::tt_word) &&
45   - (token.getValue() == "ID"))
46   - {
47   - QTC::TC("qpdf", "Pl_QPDFTokenizer found ID");
48   - this->m->tokenizer.expectInlineImage();
49   - }
50   - }
51   -}
52   -
53   -
54   -void
55   -Pl_QPDFTokenizer::checkUnread()
56   -{
57   - if (this->m->unread_char)
58   - {
59   - processChar(this->m->char_to_unread);
60   - if (this->m->unread_char)
61   - {
62   - throw std::logic_error(
63   - "INTERNAL ERROR: unread_char still true after processing "
64   - "unread character");
65   - }
66   - }
67   -}
68   -
69   -void
70   -Pl_QPDFTokenizer::write(unsigned char* buf, size_t len)
71   -{
72   - checkUnread();
73   - for (size_t i = 0; i < len; ++i)
74   - {
75   - processChar(buf[i]);
76   - checkUnread();
77   - }
  38 + this->m->buf.write(data, len);
78 39 }
79 40  
80 41 void
81 42 Pl_QPDFTokenizer::finish()
82 43 {
83   - this->m->tokenizer.presentEOF();
84   - QPDFTokenizer::Token token;
85   - if (this->m->tokenizer.getToken(
86   - token, this->m->unread_char, this->m->char_to_unread))
  44 + this->m->buf.finish();
  45 + PointerHolder<InputSource> input =
  46 + new BufferInputSource("tokenizer data",
  47 + this->m->buf.getBuffer(), true);
  48 +
  49 + while (true)
87 50 {
  51 + QPDFTokenizer::Token token = this->m->tokenizer.readToken(
  52 + input, "offset " + QUtil::int_to_string(input->tell()),
  53 + true);
88 54 this->m->filter->handleToken(token);
  55 + if (token.getType() == QPDFTokenizer::tt_eof)
  56 + {
  57 + break;
  58 + }
  59 + else if ((token.getType() == QPDFTokenizer::tt_word) &&
  60 + (token.getValue() == "ID"))
  61 + {
  62 + QTC::TC("qpdf", "Pl_QPDFTokenizer found ID");
  63 + this->m->tokenizer.expectInlineImage(input);
  64 + }
89 65 }
90   -
91 66 this->m->filter->handleEOF();
92 67 QPDFObjectHandle::TokenFilter::PipelineAccessor::setPipeline(
93 68 m->filter, 0);
... ...
libqpdf/QPDFObjectHandle.cc
... ... @@ -1558,7 +1558,7 @@ QPDFObjectHandle::parseContentStream_data(
1558 1558 // terminated the token. Read until end of inline image.
1559 1559 char ch;
1560 1560 input->read(&ch, 1);
1561   - tokenizer.expectInlineImage();
  1561 + tokenizer.expectInlineImage(input);
1562 1562 QPDFTokenizer::Token t =
1563 1563 tokenizer.readToken(input, description, true);
1564 1564 if (t.getType() == QPDFTokenizer::tt_bad)
... ...
libqpdf/QPDFTokenizer.cc
... ... @@ -13,6 +13,79 @@
13 13 #include <string.h>
14 14 #include <cstdlib>
15 15  
  16 +static bool is_delimiter(char ch)
  17 +{
  18 + return (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0);
  19 +}
  20 +
  21 +class QPDFWordTokenFinder: public InputSource::Finder
  22 +{
  23 + public:
  24 + QPDFWordTokenFinder(PointerHolder<InputSource> is,
  25 + std::string const& str) :
  26 + is(is),
  27 + str(str)
  28 + {
  29 + }
  30 + virtual ~QPDFWordTokenFinder()
  31 + {
  32 + }
  33 + virtual bool check();
  34 +
  35 + private:
  36 + PointerHolder<InputSource> is;
  37 + std::string str;
  38 +};
  39 +
  40 +bool
  41 +QPDFWordTokenFinder::check()
  42 +{
  43 + // Find a word token matching the given string, preceded by a
  44 + // delimiter, and followed by a delimiter or EOF.
  45 + QPDFTokenizer tokenizer;
  46 + QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
  47 + qpdf_offset_t pos = is->tell();
  48 + if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str)))
  49 + {
  50 +/// QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
  51 + return false;
  52 + }
  53 + qpdf_offset_t token_start = is->getLastOffset();
  54 + char next;
  55 + bool next_okay = false;
  56 + if (is->read(&next, 1) == 0)
  57 + {
  58 + QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
  59 + next_okay = true;
  60 + }
  61 + else
  62 + {
  63 + next_okay = is_delimiter(next);
  64 + }
  65 + is->seek(pos, SEEK_SET);
  66 + if (! next_okay)
  67 + {
  68 +/// QTC::TC("qpdf", "QPDFTokenizer finder word not followed by delimiter");
  69 + return false;
  70 + }
  71 + if (token_start == 0)
  72 + {
  73 + // Can't actually happen...we never start the search at the
  74 + // beginning of the input.
  75 + return false;
  76 + }
  77 + is->seek(token_start - 1, SEEK_SET);
  78 + char prev;
  79 + bool prev_okay = ((is->read(&prev, 1) == 1) && is_delimiter(prev));
  80 + is->seek(pos, SEEK_SET);
  81 + if (! prev_okay)
  82 + {
  83 +/// QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter");
  84 + return false;
  85 + }
  86 + return true;
  87 +}
  88 +
16 89 QPDFTokenizer::Members::Members() :
17 90 pound_special_in_name(true),
18 91 allow_eof(false),
... ... @@ -31,6 +104,7 @@ QPDFTokenizer::Members::reset()
31 104 error_message = "";
32 105 unread_char = false;
33 106 char_to_unread = '\0';
  107 + inline_image_bytes = 0;
34 108 string_depth = 0;
35 109 string_ignoring_newline = false;
36 110 last_char_was_bs = false;
... ... @@ -91,7 +165,7 @@ QPDFTokenizer::isSpace(char ch)
91 165 bool
92 166 QPDFTokenizer::isDelimiter(char ch)
93 167 {
94   - return (strchr(" \t\n\v\f\r()<>[]{}/%", ch) != 0);
  168 + return is_delimiter(ch);
95 169 }
96 170  
97 171 void
... ... @@ -470,12 +544,21 @@ QPDFTokenizer::presentCharacter(char ch)
470 544 {
471 545 this->m->val += ch;
472 546 size_t len = this->m->val.length();
473   - if ((len >= 4) &&
474   - isDelimiter(this->m->val.at(len-4)) &&
475   - (this->m->val.at(len-3) == 'E') &&
476   - (this->m->val.at(len-2) == 'I') &&
477   - isDelimiter(this->m->val.at(len-1)))
  547 + if (len == this->m->inline_image_bytes)
  548 + {
  549 + QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
  550 + this->m->type = tt_inline_image;
  551 + this->m->inline_image_bytes = 0;
  552 + this->m->state = st_token_ready;
  553 + }
  554 + else if ((this->m->inline_image_bytes == 0) &&
  555 + (len >= 4) &&
  556 + isDelimiter(this->m->val.at(len-4)) &&
  557 + (this->m->val.at(len-3) == 'E') &&
  558 + (this->m->val.at(len-2) == 'I') &&
  559 + isDelimiter(this->m->val.at(len-1)))
478 560 {
  561 + QTC::TC("qpdf", "QPDFTokenizer found EI the old way");
479 562 this->m->val.erase(len - 1);
480 563 this->m->type = tt_inline_image;
481 564 this->m->unread_char = true;
... ... @@ -562,7 +645,7 @@ QPDFTokenizer::presentEOF()
562 645 (this->m->val.at(len-2) == 'E') &&
563 646 (this->m->val.at(len-1) == 'I'))
564 647 {
565   - QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
  648 + QTC::TC("qpdf", "QPDFTokenizer inline image at EOF the old way");
566 649 this->m->type = tt_inline_image;
567 650 this->m->state = st_token_ready;
568 651 }
... ... @@ -598,6 +681,26 @@ QPDFTokenizer::presentEOF()
598 681 void
599 682 QPDFTokenizer::expectInlineImage()
600 683 {
  684 + expectInlineImage(PointerHolder<InputSource>());
  685 +}
  686 +
  687 +void
  688 +QPDFTokenizer::expectInlineImage(PointerHolder<InputSource> input)
  689 +{
  690 + if (input.getPointer())
  691 + {
  692 + qpdf_offset_t last_offset = input->getLastOffset();
  693 + qpdf_offset_t pos = input->tell();
  694 +
  695 + QPDFWordTokenFinder f(input, "EI");
  696 + if (input->findFirst("EI", pos, 0, f))
  697 + {
  698 + this->m->inline_image_bytes = input->tell() - pos;
  699 + }
  700 +
  701 + input->seek(pos, SEEK_SET);
  702 + input->setLastOffset(last_offset);
  703 + }
601 704 if (this->m->state != st_top)
602 705 {
603 706 throw std::logic_error("QPDFTokenizer::expectInlineImage called"
... ...
qpdf/qpdf.testcov
... ... @@ -430,3 +430,6 @@ QPDFPageObjectHelper copy shared attribute 0
430 430 qpdf from_nr from repeat_nr 0
431 431 QPDF resolve duplicated page object 0
432 432 QPDF handle direct page object 0
  433 +QPDFTokenizer found EI the old way 0
  434 +QPDFTokenizer found EI by byte count 0
  435 +QPDFTokenizer inline image at EOF the old way 0
... ...
qpdf/qtest/qpdf.test
... ... @@ -694,7 +694,7 @@ $td->runtest("check pass1 file",
694 694 show_ntests();
695 695 # ----------
696 696 $td->notify("--- Tokenizer ---");
697   -$n_tests += 4;
  697 +$n_tests += 5;
698 698  
699 699 $td->runtest("tokenizer with no ignorable",
700 700 {$td->COMMAND => "test_tokenizer -no-ignorable tokens.pdf"},
... ... @@ -706,6 +706,11 @@ $td->runtest("tokenizer",
706 706 {$td->FILE => "tokens.out", $td->EXIT_STATUS => 0},
707 707 $td->NORMALIZE_NEWLINES);
708 708  
  709 +$td->runtest("tokenizer with old inline image code",
  710 + {$td->COMMAND => "test_tokenizer -old-ei tokens.pdf"},
  711 + {$td->FILE => "tokens.out", $td->EXIT_STATUS => 0},
  712 + $td->NORMALIZE_NEWLINES);
  713 +
709 714 $td->runtest("tokenizer with max_len",
710 715 {$td->COMMAND => "test_tokenizer -maxlen 50 tokens.pdf"},
711 716 {$td->FILE => "tokens-maxlen.out", $td->EXIT_STATUS => 0},
... ...
qpdf/test_tokenizer.cc
... ... @@ -16,7 +16,7 @@ static char const* whoami = 0;
16 16 void usage()
17 17 {
18 18 std::cerr << "Usage: " << whoami
19   - << " [-maxlen len | -no-ignorable] filename"
  19 + << " [-maxlen len | -no-ignorable | -old-ei] filename"
20 20 << std::endl;
21 21 exit(2);
22 22 }
... ... @@ -132,7 +132,7 @@ try_skipping(QPDFTokenizer& tokenizer, PointerHolder<InputSource> is,
132 132 static void
133 133 dump_tokens(PointerHolder<InputSource> is, std::string const& label,
134 134 size_t max_len, bool include_ignorable,
135   - bool skip_streams, bool skip_inline_images)
  135 + bool skip_streams, bool skip_inline_images, bool old_ei)
136 136 {
137 137 Finder f1(is, "endstream");
138 138 std::cout << "--- BEGIN " << label << " ---" << std::endl;
... ... @@ -183,7 +183,14 @@ dump_tokens(PointerHolder<InputSource> is, std::string const& label,
183 183 else if (skip_inline_images &&
184 184 (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "ID")))
185 185 {
186   - tokenizer.expectInlineImage();
  186 + if (old_ei)
  187 + {
  188 + tokenizer.expectInlineImage();
  189 + }
  190 + else
  191 + {
  192 + tokenizer.expectInlineImage(is);
  193 + }
187 194 inline_image_offset = is->tell();
188 195 }
189 196 else if (token.getType() == QPDFTokenizer::tt_eof)
... ... @@ -195,7 +202,7 @@ dump_tokens(PointerHolder<InputSource> is, std::string const& label,
195 202 }
196 203  
197 204 static void process(char const* filename, bool include_ignorable,
198   - size_t max_len)
  205 + size_t max_len, bool old_ei)
199 206 {
200 207 PointerHolder<InputSource> is;
201 208  
... ... @@ -203,7 +210,7 @@ static void process(char const* filename, bool include_ignorable,
203 210 FileInputSource* fis = new FileInputSource();
204 211 fis->setFilename(filename);
205 212 is = fis;
206   - dump_tokens(is, "FILE", max_len, include_ignorable, true, false);
  213 + dump_tokens(is, "FILE", max_len, include_ignorable, true, false, false);
207 214  
208 215 // Tokenize content streams, skipping inline images
209 216 QPDF qpdf;
... ... @@ -222,7 +229,7 @@ static void process(char const* filename, bool include_ignorable,
222 229 "content data", content_data.getPointer());
223 230 is = bis;
224 231 dump_tokens(is, "PAGE " + QUtil::int_to_string(pageno),
225   - max_len, include_ignorable, false, true);
  232 + max_len, include_ignorable, false, true, old_ei);
226 233 }
227 234  
228 235 // Tokenize object streams
... ... @@ -241,7 +248,7 @@ static void process(char const* filename, bool include_ignorable,
241 248 is = bis;
242 249 dump_tokens(is, "OBJECT STREAM " +
243 250 QUtil::int_to_string((*iter).getObjectID()),
244   - max_len, include_ignorable, false, false);
  251 + max_len, include_ignorable, false, false, false);
245 252 }
246 253 }
247 254 }
... ... @@ -266,6 +273,7 @@ int main(int argc, char* argv[])
266 273 char const* filename = 0;
267 274 size_t max_len = 0;
268 275 bool include_ignorable = true;
  276 + bool old_ei = false;
269 277 for (int i = 1; i < argc; ++i)
270 278 {
271 279 if (argv[i][0] == '-')
... ... @@ -282,6 +290,10 @@ int main(int argc, char* argv[])
282 290 {
283 291 include_ignorable = false;
284 292 }
  293 + else if (strcmp(argv[i], "-old-ei") == 0)
  294 + {
  295 + old_ei = true;
  296 + }
285 297 else
286 298 {
287 299 usage();
... ... @@ -303,7 +315,7 @@ int main(int argc, char* argv[])
303 315  
304 316 try
305 317 {
306   - process(filename, include_ignorable, max_len);
  318 + process(filename, include_ignorable, max_len, old_ei);
307 319 }
308 320 catch (std::exception& e)
309 321 {
... ...