Commit 6405d3928f78bc227587b87b8e2c2d46502796e0

Authored by Jay Berkenbilt
1 parent a8f22487

be less conservative when skipping over inline images in content normalization

git-svn-id: svn+q:///qpdf/trunk@1050 71b93d88-0707-0410-a8cf-f5a4172ac649
ChangeLog
  1 +2011-04-30 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * libqpdf/Pl_QPDFTokenizer.cc (processChar): When an inline image
  4 + is detected, suspend normalization only up to the end of the
  5 + inline image rather than for the remainder of the content stream.
  6 + (Fixes qpdf-Bugs 3152169.)
  7 +
1 8 2011-01-31 Jay Berkenbilt <ejb@ql.org>
2 9  
3 10 * libqpdf/QPDF.cc (readObjectAtOffset): use -1 rather than 0 when
... ...
  1 +Next
  2 +====
  3 +
  4 + * Look for %PDF header somewhere within the first 1024 bytes of the
  5 + file. Also accept headers of the form "%!PS-Adobe-N.n PDF-M.m".
  6 + See Implementation notes 13 and 14 in appendix H of the PDF 1.7
  7 + specification. This is bug 3267974.
  8 +
1 9 General
2 10 =======
3 11  
... ... @@ -174,6 +182,10 @@ Index: QPDFWriter.cc
174 182 providing some mechanism to recover earlier versions of a file
175 183 embedded prior to appended sections.
176 184  
  185 + * From a suggestion in bug 3152169, consider having an option to
  186 + re-encode inline images with an ASCII encoding.
  187 +
  188 +
177 189 Splitting by Pages
178 190 ==================
179 191  
... ...
libqpdf/Pl_QPDFTokenizer.cc
1 1 #include <qpdf/Pl_QPDFTokenizer.hh>
2 2 #include <qpdf/QPDF_String.hh>
3 3 #include <qpdf/QPDF_Name.hh>
  4 +#include <qpdf/QTC.hh>
4 5 #include <stdexcept>
5 6 #include <string.h>
6 7  
... ... @@ -11,8 +12,9 @@ Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) :
11 12 last_char_was_cr(false),
12 13 unread_char(false),
13 14 char_to_unread('\0'),
14   - pass_through(false)
  15 + in_inline_image(false)
15 16 {
  17 + memset(this->image_buf, 0, IMAGE_BUF_SIZE);
16 18 }
17 19  
18 20 Pl_QPDFTokenizer::~Pl_QPDFTokenizer()
... ... @@ -56,11 +58,34 @@ Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token)
56 58 void
57 59 Pl_QPDFTokenizer::processChar(char ch)
58 60 {
59   - if (this->pass_through)
  61 + if (this->in_inline_image)
60 62 {
61   - // We're not normalizing anymore -- just write this without
62   - // looking at it.
63   - writeNext(&ch, 1);
  63 + // Scan through the input looking for EI surrounded by
  64 + // whitespace. If that pattern appears in the inline image's
  65 + // representation, we're hosed, but this situation seems
  66 + // excessively unlikely, and this code path is only followed
  67 + // during content stream normalization, which is pretty much
  68 + // used for debugging and human inspection of PDF files.
  69 + memmove(this->image_buf,
  70 + this->image_buf + 1,
  71 + IMAGE_BUF_SIZE - 1);
  72 + this->image_buf[IMAGE_BUF_SIZE - 1] = ch;
  73 + if (strchr(" \t\n\v\f\r", this->image_buf[0]) &&
  74 + (this->image_buf[1] == 'E') &&
  75 + (this->image_buf[2] == 'I') &&
  76 + strchr(" \t\n\v\f\r", this->image_buf[3]))
  77 + {
  78 + // We've found an EI operator. We've already written the
  79 + // EI operator to output; terminate with a newline
  80 + // character and resume normal processing.
  81 + writeNext("\n", 1);
  82 + this->in_inline_image = false;
  83 + QTC::TC("qpdf", "Pl_QPDFTokenizer found EI");
  84 + }
  85 + else
  86 + {
  87 + writeNext(&ch, 1);
  88 + }
64 89 return;
65 90 }
66 91  
... ... @@ -75,18 +100,10 @@ Pl_QPDFTokenizer::processChar(char ch)
75 100 this->newline_after_next_token = false;
76 101 }
77 102 if ((token.getType() == QPDFTokenizer::tt_word) &&
78   - (token.getValue() == "BI"))
  103 + (token.getValue() == "ID"))
79 104 {
80   - // Uh oh.... we're not sophisticated enough to handle
81   - // inline images safely. We'd have to to set up all the
82   - // filters and pipe the image data through it until the
83   - // filtered output was the right size for an image of the
84   - // specified dimensions. Then we'd either have to write
85   - // out raw image data or continue to write filtered data,
86   - // resuming normalization when we get to the end.
87   - // Instead, for now, we'll just turn off normalization for
88   - // the remainder of this stream.
89   - this->pass_through = true;
  105 + // Suspend normal scanning until we find an EI token.
  106 + this->in_inline_image = true;
90 107 if (this->unread_char)
91 108 {
92 109 writeNext(&this->char_to_unread, 1);
... ... @@ -156,7 +173,7 @@ void
156 173 Pl_QPDFTokenizer::finish()
157 174 {
158 175 this->tokenizer.presentEOF();
159   - if (! this->pass_through)
  176 + if (! this->in_inline_image)
160 177 {
161 178 QPDFTokenizer::Token token;
162 179 if (tokenizer.getToken(token, this->unread_char, this->char_to_unread))
... ...
libqpdf/qpdf/Pl_QPDFTokenizer.hh
... ... @@ -33,7 +33,9 @@ class Pl_QPDFTokenizer: public Pipeline
33 33 bool last_char_was_cr;
34 34 bool unread_char;
35 35 char char_to_unread;
36   - bool pass_through;
  36 + bool in_inline_image;
  37 + static int const IMAGE_BUF_SIZE = 4; // must be >= 4
  38 + char image_buf[IMAGE_BUF_SIZE];
37 39 };
38 40  
39 41 #endif // __PL_QPDFTOKENIZER_HH__
... ...
qpdf/qpdf.testcov
... ... @@ -187,3 +187,4 @@ QPDF_Stream getRawStreamData 0
187 187 QPDF_Stream getStreamData 0
188 188 QPDF_Stream expand filter abbreviation 0
189 189 qpdf-c called qpdf_read_memory 0
  190 +Pl_QPDFTokenizer found EI 0
... ...
qpdf/qtest/qpdf.test
... ... @@ -1257,8 +1257,8 @@ my @flags = (["-qdf", # 1
1257 1257 "no arguments"],
1258 1258 );
1259 1259  
1260   -$n_tests += (@files * @flags * 2 * 3);
1261   -$n_compare_pdfs += (@files * @flags * 2);
  1260 +$n_tests += 1 + (@files * @flags * 2 * 3);
  1261 +$n_compare_pdfs += 1 + (@files * @flags * 2);
1262 1262 $n_acroread += (@files * @flags * 2);
1263 1263  
1264 1264 foreach my $file (@files)
... ... @@ -1311,6 +1311,14 @@ foreach my $file (@files)
1311 1311 }
1312 1312 }
1313 1313  
  1314 +# inline-images-cr.pdf is xbkm938-dies.pdf from PDF collection
  1315 +$td->runtest("convert inline-images-cr to qdf",
  1316 + {$td->COMMAND => "qpdf --static-id --no-original-object-ids" .
  1317 + " --qdf inline-images-cr.pdf a.pdf"},
  1318 + {$td->STRING => "", $td->EXIT_STATUS => 0});
  1319 +
  1320 +compare_pdfs("inline-images-cr.pdf", "a.pdf");
  1321 +
1314 1322 show_ntests();
1315 1323 # ----------
1316 1324 $td->notify("--- fix-qdf Tests ---");
... ...
qpdf/qtest/qpdf/inline-images-cr.pdf 0 → 100644
No preview for this file type