Commit 6405d3928f78bc227587b87b8e2c2d46502796e0
1 parent
a8f22487
be less conservative when skipping over inline images in content normalization
git-svn-id: svn+q:///qpdf/trunk@1050 71b93d88-0707-0410-a8cf-f5a4172ac649
Showing
7 changed files
with
67 additions
and
20 deletions
ChangeLog
| 1 | +2011-04-30 Jay Berkenbilt <ejb@ql.org> | ||
| 2 | + | ||
| 3 | + * libqpdf/Pl_QPDFTokenizer.cc (processChar): When an inline image | ||
| 4 | + is detected, suspend normalization only up to the end of the | ||
| 5 | + inline image rather than for the remainder of the content stream. | ||
| 6 | + (Fixes qpdf-Bugs 3152169.) | ||
| 7 | + | ||
| 1 | 2011-01-31 Jay Berkenbilt <ejb@ql.org> | 8 | 2011-01-31 Jay Berkenbilt <ejb@ql.org> |
| 2 | 9 | ||
| 3 | * libqpdf/QPDF.cc (readObjectAtOffset): use -1 rather than 0 when | 10 | * libqpdf/QPDF.cc (readObjectAtOffset): use -1 rather than 0 when |
TODO
| 1 | +Next | ||
| 2 | +==== | ||
| 3 | + | ||
| 4 | + * Look for %PDF header somewhere within the first 1024 bytes of the | ||
| 5 | + file. Also accept headers of the form "%!PS-Adobe-N.n PDF-M.m". | ||
| 6 | + See Implementation notes 13 and 14 in appendix H of the PDF 1.7 | ||
| 7 | + specification. This is bug 3267974. | ||
| 8 | + | ||
| 1 | General | 9 | General |
| 2 | ======= | 10 | ======= |
| 3 | 11 | ||
| @@ -174,6 +182,10 @@ Index: QPDFWriter.cc | @@ -174,6 +182,10 @@ Index: QPDFWriter.cc | ||
| 174 | providing some mechanism to recover earlier versions of a file | 182 | providing some mechanism to recover earlier versions of a file |
| 175 | embedded prior to appended sections. | 183 | embedded prior to appended sections. |
| 176 | 184 | ||
| 185 | + * From a suggestion in bug 3152169, consider having an option to | ||
| 186 | + re-encode inline images with an ASCII encoding. | ||
| 187 | + | ||
| 188 | + | ||
| 177 | Splitting by Pages | 189 | Splitting by Pages |
| 178 | ================== | 190 | ================== |
| 179 | 191 |
libqpdf/Pl_QPDFTokenizer.cc
| 1 | #include <qpdf/Pl_QPDFTokenizer.hh> | 1 | #include <qpdf/Pl_QPDFTokenizer.hh> |
| 2 | #include <qpdf/QPDF_String.hh> | 2 | #include <qpdf/QPDF_String.hh> |
| 3 | #include <qpdf/QPDF_Name.hh> | 3 | #include <qpdf/QPDF_Name.hh> |
| 4 | +#include <qpdf/QTC.hh> | ||
| 4 | #include <stdexcept> | 5 | #include <stdexcept> |
| 5 | #include <string.h> | 6 | #include <string.h> |
| 6 | 7 | ||
| @@ -11,8 +12,9 @@ Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) : | @@ -11,8 +12,9 @@ Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) : | ||
| 11 | last_char_was_cr(false), | 12 | last_char_was_cr(false), |
| 12 | unread_char(false), | 13 | unread_char(false), |
| 13 | char_to_unread('\0'), | 14 | char_to_unread('\0'), |
| 14 | - pass_through(false) | 15 | + in_inline_image(false) |
| 15 | { | 16 | { |
| 17 | + memset(this->image_buf, 0, IMAGE_BUF_SIZE); | ||
| 16 | } | 18 | } |
| 17 | 19 | ||
| 18 | Pl_QPDFTokenizer::~Pl_QPDFTokenizer() | 20 | Pl_QPDFTokenizer::~Pl_QPDFTokenizer() |
| @@ -56,11 +58,34 @@ Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token) | @@ -56,11 +58,34 @@ Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token) | ||
| 56 | void | 58 | void |
| 57 | Pl_QPDFTokenizer::processChar(char ch) | 59 | Pl_QPDFTokenizer::processChar(char ch) |
| 58 | { | 60 | { |
| 59 | - if (this->pass_through) | 61 | + if (this->in_inline_image) |
| 60 | { | 62 | { |
| 61 | - // We're not normalizing anymore -- just write this without | ||
| 62 | - // looking at it. | ||
| 63 | - writeNext(&ch, 1); | 63 | + // Scan through the input looking for EI surrounded by |
| 64 | + // whitespace. If that pattern appears in the inline image's | ||
| 65 | + // representation, we're hosed, but this situation seems | ||
| 66 | + // excessively unlikely, and this code path is only followed | ||
| 67 | + // during content stream normalization, which is pretty much | ||
| 68 | + // used for debugging and human inspection of PDF files. | ||
| 69 | + memmove(this->image_buf, | ||
| 70 | + this->image_buf + 1, | ||
| 71 | + IMAGE_BUF_SIZE - 1); | ||
| 72 | + this->image_buf[IMAGE_BUF_SIZE - 1] = ch; | ||
| 73 | + if (strchr(" \t\n\v\f\r", this->image_buf[0]) && | ||
| 74 | + (this->image_buf[1] == 'E') && | ||
| 75 | + (this->image_buf[2] == 'I') && | ||
| 76 | + strchr(" \t\n\v\f\r", this->image_buf[3])) | ||
| 77 | + { | ||
| 78 | + // We've found an EI operator. We've already written the | ||
| 79 | + // EI operator to output; terminate with a newline | ||
| 80 | + // character and resume normal processing. | ||
| 81 | + writeNext("\n", 1); | ||
| 82 | + this->in_inline_image = false; | ||
| 83 | + QTC::TC("qpdf", "Pl_QPDFTokenizer found EI"); | ||
| 84 | + } | ||
| 85 | + else | ||
| 86 | + { | ||
| 87 | + writeNext(&ch, 1); | ||
| 88 | + } | ||
| 64 | return; | 89 | return; |
| 65 | } | 90 | } |
| 66 | 91 | ||
| @@ -75,18 +100,10 @@ Pl_QPDFTokenizer::processChar(char ch) | @@ -75,18 +100,10 @@ Pl_QPDFTokenizer::processChar(char ch) | ||
| 75 | this->newline_after_next_token = false; | 100 | this->newline_after_next_token = false; |
| 76 | } | 101 | } |
| 77 | if ((token.getType() == QPDFTokenizer::tt_word) && | 102 | if ((token.getType() == QPDFTokenizer::tt_word) && |
| 78 | - (token.getValue() == "BI")) | 103 | + (token.getValue() == "ID")) |
| 79 | { | 104 | { |
| 80 | - // Uh oh.... we're not sophisticated enough to handle | ||
| 81 | - // inline images safely. We'd have to to set up all the | ||
| 82 | - // filters and pipe the image data through it until the | ||
| 83 | - // filtered output was the right size for an image of the | ||
| 84 | - // specified dimensions. Then we'd either have to write | ||
| 85 | - // out raw image data or continue to write filtered data, | ||
| 86 | - // resuming normalization when we get to the end. | ||
| 87 | - // Instead, for now, we'll just turn off normalization for | ||
| 88 | - // the remainder of this stream. | ||
| 89 | - this->pass_through = true; | 105 | + // Suspend normal scanning until we find an EI token. |
| 106 | + this->in_inline_image = true; | ||
| 90 | if (this->unread_char) | 107 | if (this->unread_char) |
| 91 | { | 108 | { |
| 92 | writeNext(&this->char_to_unread, 1); | 109 | writeNext(&this->char_to_unread, 1); |
| @@ -156,7 +173,7 @@ void | @@ -156,7 +173,7 @@ void | ||
| 156 | Pl_QPDFTokenizer::finish() | 173 | Pl_QPDFTokenizer::finish() |
| 157 | { | 174 | { |
| 158 | this->tokenizer.presentEOF(); | 175 | this->tokenizer.presentEOF(); |
| 159 | - if (! this->pass_through) | 176 | + if (! this->in_inline_image) |
| 160 | { | 177 | { |
| 161 | QPDFTokenizer::Token token; | 178 | QPDFTokenizer::Token token; |
| 162 | if (tokenizer.getToken(token, this->unread_char, this->char_to_unread)) | 179 | if (tokenizer.getToken(token, this->unread_char, this->char_to_unread)) |
libqpdf/qpdf/Pl_QPDFTokenizer.hh
| @@ -33,7 +33,9 @@ class Pl_QPDFTokenizer: public Pipeline | @@ -33,7 +33,9 @@ class Pl_QPDFTokenizer: public Pipeline | ||
| 33 | bool last_char_was_cr; | 33 | bool last_char_was_cr; |
| 34 | bool unread_char; | 34 | bool unread_char; |
| 35 | char char_to_unread; | 35 | char char_to_unread; |
| 36 | - bool pass_through; | 36 | + bool in_inline_image; |
| 37 | + static int const IMAGE_BUF_SIZE = 4; // must be >= 4 | ||
| 38 | + char image_buf[IMAGE_BUF_SIZE]; | ||
| 37 | }; | 39 | }; |
| 38 | 40 | ||
| 39 | #endif // __PL_QPDFTOKENIZER_HH__ | 41 | #endif // __PL_QPDFTOKENIZER_HH__ |
qpdf/qpdf.testcov
| @@ -187,3 +187,4 @@ QPDF_Stream getRawStreamData 0 | @@ -187,3 +187,4 @@ QPDF_Stream getRawStreamData 0 | ||
| 187 | QPDF_Stream getStreamData 0 | 187 | QPDF_Stream getStreamData 0 |
| 188 | QPDF_Stream expand filter abbreviation 0 | 188 | QPDF_Stream expand filter abbreviation 0 |
| 189 | qpdf-c called qpdf_read_memory 0 | 189 | qpdf-c called qpdf_read_memory 0 |
| 190 | +Pl_QPDFTokenizer found EI 0 |
qpdf/qtest/qpdf.test
| @@ -1257,8 +1257,8 @@ my @flags = (["-qdf", # 1 | @@ -1257,8 +1257,8 @@ my @flags = (["-qdf", # 1 | ||
| 1257 | "no arguments"], | 1257 | "no arguments"], |
| 1258 | ); | 1258 | ); |
| 1259 | 1259 | ||
| 1260 | -$n_tests += (@files * @flags * 2 * 3); | ||
| 1261 | -$n_compare_pdfs += (@files * @flags * 2); | 1260 | +$n_tests += 1 + (@files * @flags * 2 * 3); |
| 1261 | +$n_compare_pdfs += 1 + (@files * @flags * 2); | ||
| 1262 | $n_acroread += (@files * @flags * 2); | 1262 | $n_acroread += (@files * @flags * 2); |
| 1263 | 1263 | ||
| 1264 | foreach my $file (@files) | 1264 | foreach my $file (@files) |
| @@ -1311,6 +1311,14 @@ foreach my $file (@files) | @@ -1311,6 +1311,14 @@ foreach my $file (@files) | ||
| 1311 | } | 1311 | } |
| 1312 | } | 1312 | } |
| 1313 | 1313 | ||
| 1314 | +# inline-images-cr.pdf is xbkm938-dies.pdf from PDF collection | ||
| 1315 | +$td->runtest("convert inline-images-cr to qdf", | ||
| 1316 | + {$td->COMMAND => "qpdf --static-id --no-original-object-ids" . | ||
| 1317 | + " --qdf inline-images-cr.pdf a.pdf"}, | ||
| 1318 | + {$td->STRING => "", $td->EXIT_STATUS => 0}); | ||
| 1319 | + | ||
| 1320 | +compare_pdfs("inline-images-cr.pdf", "a.pdf"); | ||
| 1321 | + | ||
| 1314 | show_ntests(); | 1322 | show_ntests(); |
| 1315 | # ---------- | 1323 | # ---------- |
| 1316 | $td->notify("--- fix-qdf Tests ---"); | 1324 | $td->notify("--- fix-qdf Tests ---"); |
qpdf/qtest/qpdf/inline-images-cr.pdf
0 → 100644
No preview for this file type