Commit 6405d3928f78bc227587b87b8e2c2d46502796e0

Authored by Jay Berkenbilt
1 parent a8f22487

be less conservative when skipping over inline images in content normalization

git-svn-id: svn+q:///qpdf/trunk@1050 71b93d88-0707-0410-a8cf-f5a4172ac649
ChangeLog
  1 +2011-04-30 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * libqpdf/Pl_QPDFTokenizer.cc (processChar): When an inline image
  4 + is detected, suspend normalization only up to the end of the
  5 + inline image rather than for the remainder of the content stream.
  6 + (Fixes qpdf-Bugs 3152169.)
  7 +
1 2011-01-31 Jay Berkenbilt <ejb@ql.org> 8 2011-01-31 Jay Berkenbilt <ejb@ql.org>
2 9
3 * libqpdf/QPDF.cc (readObjectAtOffset): use -1 rather than 0 when 10 * libqpdf/QPDF.cc (readObjectAtOffset): use -1 rather than 0 when
  1 +Next
  2 +====
  3 +
  4 + * Look for %PDF header somewhere within the first 1024 bytes of the
  4 + file. Also accept headers of the form "%!PS-Adobe-N.n PDF-M.m".
  6 + See Implementation notes 13 and 14 in appendix H of the PDF 1.7
  7 + specification. This is bug 3267974.
  8 +
1 General 9 General
2 ======= 10 =======
3 11
@@ -174,6 +182,10 @@ Index: QPDFWriter.cc @@ -174,6 +182,10 @@ Index: QPDFWriter.cc
174 providing some mechanism to recover earlier versions of a file 182 providing some mechanism to recover earlier versions of a file
175 embedded prior to appended sections. 183 embedded prior to appended sections.
176 184
  185 + * From a suggestion in bug 3152169, consider having an option to
  186 + re-encode inline images with an ASCII encoding.
  187 +
  188 +
177 Splitting by Pages 189 Splitting by Pages
178 ================== 190 ==================
179 191
libqpdf/Pl_QPDFTokenizer.cc
1 #include <qpdf/Pl_QPDFTokenizer.hh> 1 #include <qpdf/Pl_QPDFTokenizer.hh>
2 #include <qpdf/QPDF_String.hh> 2 #include <qpdf/QPDF_String.hh>
3 #include <qpdf/QPDF_Name.hh> 3 #include <qpdf/QPDF_Name.hh>
  4 +#include <qpdf/QTC.hh>
4 #include <stdexcept> 5 #include <stdexcept>
5 #include <string.h> 6 #include <string.h>
6 7
@@ -11,8 +12,9 @@ Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) : @@ -11,8 +12,9 @@ Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) :
11 last_char_was_cr(false), 12 last_char_was_cr(false),
12 unread_char(false), 13 unread_char(false),
13 char_to_unread('\0'), 14 char_to_unread('\0'),
14 - pass_through(false) 15 + in_inline_image(false)
15 { 16 {
  17 + memset(this->image_buf, 0, IMAGE_BUF_SIZE);
16 } 18 }
17 19
18 Pl_QPDFTokenizer::~Pl_QPDFTokenizer() 20 Pl_QPDFTokenizer::~Pl_QPDFTokenizer()
@@ -56,11 +58,34 @@ Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token) @@ -56,11 +58,34 @@ Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token)
56 void 58 void
57 Pl_QPDFTokenizer::processChar(char ch) 59 Pl_QPDFTokenizer::processChar(char ch)
58 { 60 {
59 - if (this->pass_through) 61 + if (this->in_inline_image)
60 { 62 {
61 - // We're not normalizing anymore -- just write this without  
62 - // looking at it.  
63 - writeNext(&ch, 1); 63 + // Scan through the input looking for EI surrounded by
  64 + // whitespace. If that pattern appears in the inline image's
  65 + // representation, we're hosed, but this situation seems
  66 + // excessively unlikely, and this code path is only followed
  67 + // during content stream normalization, which is pretty much
  68 + // used for debugging and human inspection of PDF files.
  69 + memmove(this->image_buf,
  70 + this->image_buf + 1,
  71 + IMAGE_BUF_SIZE - 1);
  72 + this->image_buf[IMAGE_BUF_SIZE - 1] = ch;
  73 + if (strchr(" \t\n\v\f\r", this->image_buf[0]) &&
  74 + (this->image_buf[1] == 'E') &&
  75 + (this->image_buf[2] == 'I') &&
  76 + strchr(" \t\n\v\f\r", this->image_buf[3]))
  77 + {
  78 + // We've found an EI operator. We've already written the
  79 + // EI operator to output; terminate with a newline
  80 + // character and resume normal processing.
  81 + writeNext("\n", 1);
  82 + this->in_inline_image = false;
  83 + QTC::TC("qpdf", "Pl_QPDFTokenizer found EI");
  84 + }
  85 + else
  86 + {
  87 + writeNext(&ch, 1);
  88 + }
64 return; 89 return;
65 } 90 }
66 91
@@ -75,18 +100,10 @@ Pl_QPDFTokenizer::processChar(char ch) @@ -75,18 +100,10 @@ Pl_QPDFTokenizer::processChar(char ch)
75 this->newline_after_next_token = false; 100 this->newline_after_next_token = false;
76 } 101 }
77 if ((token.getType() == QPDFTokenizer::tt_word) && 102 if ((token.getType() == QPDFTokenizer::tt_word) &&
78 - (token.getValue() == "BI")) 103 + (token.getValue() == "ID"))
79 { 104 {
80 - // Uh oh.... we're not sophisticated enough to handle  
81 - // inline images safely. We'd have to to set up all the  
82 - // filters and pipe the image data through it until the  
83 - // filtered output was the right size for an image of the  
84 - // specified dimensions. Then we'd either have to write  
85 - // out raw image data or continue to write filtered data,  
86 - // resuming normalization when we get to the end.  
87 - // Instead, for now, we'll just turn off normalization for  
88 - // the remainder of this stream.  
89 - this->pass_through = true; 105 + // Suspend normal scanning until we find an EI token.
  106 + this->in_inline_image = true;
90 if (this->unread_char) 107 if (this->unread_char)
91 { 108 {
92 writeNext(&this->char_to_unread, 1); 109 writeNext(&this->char_to_unread, 1);
@@ -156,7 +173,7 @@ void @@ -156,7 +173,7 @@ void
156 Pl_QPDFTokenizer::finish() 173 Pl_QPDFTokenizer::finish()
157 { 174 {
158 this->tokenizer.presentEOF(); 175 this->tokenizer.presentEOF();
159 - if (! this->pass_through) 176 + if (! this->in_inline_image)
160 { 177 {
161 QPDFTokenizer::Token token; 178 QPDFTokenizer::Token token;
162 if (tokenizer.getToken(token, this->unread_char, this->char_to_unread)) 179 if (tokenizer.getToken(token, this->unread_char, this->char_to_unread))
libqpdf/qpdf/Pl_QPDFTokenizer.hh
@@ -33,7 +33,9 @@ class Pl_QPDFTokenizer: public Pipeline @@ -33,7 +33,9 @@ class Pl_QPDFTokenizer: public Pipeline
33 bool last_char_was_cr; 33 bool last_char_was_cr;
34 bool unread_char; 34 bool unread_char;
35 char char_to_unread; 35 char char_to_unread;
36 - bool pass_through; 36 + bool in_inline_image;
  37 + static int const IMAGE_BUF_SIZE = 4; // must be >= 4
  38 + char image_buf[IMAGE_BUF_SIZE];
37 }; 39 };
38 40
39 #endif // __PL_QPDFTOKENIZER_HH__ 41 #endif // __PL_QPDFTOKENIZER_HH__
qpdf/qpdf.testcov
@@ -187,3 +187,4 @@ QPDF_Stream getRawStreamData 0 @@ -187,3 +187,4 @@ QPDF_Stream getRawStreamData 0
187 QPDF_Stream getStreamData 0 187 QPDF_Stream getStreamData 0
188 QPDF_Stream expand filter abbreviation 0 188 QPDF_Stream expand filter abbreviation 0
189 qpdf-c called qpdf_read_memory 0 189 qpdf-c called qpdf_read_memory 0
  190 +Pl_QPDFTokenizer found EI 0
qpdf/qtest/qpdf.test
@@ -1257,8 +1257,8 @@ my @flags = (["-qdf", # 1 @@ -1257,8 +1257,8 @@ my @flags = (["-qdf", # 1
1257 "no arguments"], 1257 "no arguments"],
1258 ); 1258 );
1259 1259
1260 -$n_tests += (@files * @flags * 2 * 3);  
1261 -$n_compare_pdfs += (@files * @flags * 2); 1260 +$n_tests += 1 + (@files * @flags * 2 * 3);
  1261 +$n_compare_pdfs += 1 + (@files * @flags * 2);
1262 $n_acroread += (@files * @flags * 2); 1262 $n_acroread += (@files * @flags * 2);
1263 1263
1264 foreach my $file (@files) 1264 foreach my $file (@files)
@@ -1311,6 +1311,14 @@ foreach my $file (@files) @@ -1311,6 +1311,14 @@ foreach my $file (@files)
1311 } 1311 }
1312 } 1312 }
1313 1313
  1314 +# inline-images-cr.pdf is xbkm938-dies.pdf from PDF collection
  1315 +$td->runtest("convert inline-images-cr to qdf",
  1316 + {$td->COMMAND => "qpdf --static-id --no-original-object-ids" .
  1317 + " --qdf inline-images-cr.pdf a.pdf"},
  1318 + {$td->STRING => "", $td->EXIT_STATUS => 0});
  1319 +
  1320 +compare_pdfs("inline-images-cr.pdf", "a.pdf");
  1321 +
1314 show_ntests(); 1322 show_ntests();
1315 # ---------- 1323 # ----------
1316 $td->notify("--- fix-qdf Tests ---"); 1324 $td->notify("--- fix-qdf Tests ---");
qpdf/qtest/qpdf/inline-images-cr.pdf 0 → 100644
No preview for this file type