Commit 6405d3928f78bc227587b87b8e2c2d46502796e0
1 parent
a8f22487
be less conservative when skipping over inline images in content normalization
git-svn-id: svn+q:///qpdf/trunk@1050 71b93d88-0707-0410-a8cf-f5a4172ac649
Showing
7 changed files
with
67 additions
and
20 deletions
ChangeLog
| 1 | +2011-04-30 Jay Berkenbilt <ejb@ql.org> | |
| 2 | + | |
| 3 | + * libqpdf/Pl_QPDFTokenizer.cc (processChar): When an inline image | |
| 4 | + is detected, suspend normalization only up to the end of the | |
| 5 | + inline image rather than for the remainder of the content stream. | |
| 6 | + (Fixes qpdf-Bugs 3152169.) | |
| 7 | + | |
| 1 | 8 | 2011-01-31 Jay Berkenbilt <ejb@ql.org> |
| 2 | 9 | |
| 3 | 10 | * libqpdf/QPDF.cc (readObjectAtOffset): use -1 rather than 0 when | ... | ... |
TODO
| 1 | +Next | |
| 2 | +==== | |
| 3 | + | |
| 4 | + * Look for %PDF header somewhere within the first 1024 bytes of the | |
| 5 | + file. Also accept headers of the form "%!PS-Adobe-N.n PDF-M.m". | |
| 6 | + See Implementation notes 13 and 14 in appendix H of the PDF 1.7 | |
| 7 | + specification. This is bug 3267974. | |
| 8 | + | |
| 1 | 9 | General |
| 2 | 10 | ======= |
| 3 | 11 | |
| ... | ... | @@ -174,6 +182,10 @@ Index: QPDFWriter.cc |
| 174 | 182 | providing some mechanism to recover earlier versions of a file |
| 175 | 183 | embedded prior to appended sections. |
| 176 | 184 | |
| 185 | + * From a suggestion in bug 3152169, consider having an option to | |
| 186 | + re-encode inline images with an ASCII encoding. | |
| 187 | + | |
| 188 | + | |
| 177 | 189 | Splitting by Pages |
| 178 | 190 | ================== |
| 179 | 191 | ... | ... |
libqpdf/Pl_QPDFTokenizer.cc
| 1 | 1 | #include <qpdf/Pl_QPDFTokenizer.hh> |
| 2 | 2 | #include <qpdf/QPDF_String.hh> |
| 3 | 3 | #include <qpdf/QPDF_Name.hh> |
| 4 | +#include <qpdf/QTC.hh> | |
| 4 | 5 | #include <stdexcept> |
| 5 | 6 | #include <string.h> |
| 6 | 7 | |
| ... | ... | @@ -11,8 +12,9 @@ Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) : |
| 11 | 12 | last_char_was_cr(false), |
| 12 | 13 | unread_char(false), |
| 13 | 14 | char_to_unread('\0'), |
| 14 | - pass_through(false) | |
| 15 | + in_inline_image(false) | |
| 15 | 16 | { |
| 17 | + memset(this->image_buf, 0, IMAGE_BUF_SIZE); | |
| 16 | 18 | } |
| 17 | 19 | |
| 18 | 20 | Pl_QPDFTokenizer::~Pl_QPDFTokenizer() |
| ... | ... | @@ -56,11 +58,34 @@ Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token) |
| 56 | 58 | void |
| 57 | 59 | Pl_QPDFTokenizer::processChar(char ch) |
| 58 | 60 | { |
| 59 | - if (this->pass_through) | |
| 61 | + if (this->in_inline_image) | |
| 60 | 62 | { |
| 61 | - // We're not normalizing anymore -- just write this without | |
| 62 | - // looking at it. | |
| 63 | - writeNext(&ch, 1); | |
| 63 | + // Scan through the input looking for EI surrounded by | |
| 64 | + // whitespace. If that pattern appears in the inline image's | |
| 65 | + // representation, we're hosed, but this situation seems | |
| 66 | + // excessively unlikely, and this code path is only followed | |
| 67 | + // during content stream normalization, which is pretty much | |
| 68 | + // used for debugging and human inspection of PDF files. | |
| 69 | + memmove(this->image_buf, | |
| 70 | + this->image_buf + 1, | |
| 71 | + IMAGE_BUF_SIZE - 1); | |
| 72 | + this->image_buf[IMAGE_BUF_SIZE - 1] = ch; | |
| 73 | + if (strchr(" \t\n\v\f\r", this->image_buf[0]) && | |
| 74 | + (this->image_buf[1] == 'E') && | |
| 75 | + (this->image_buf[2] == 'I') && | |
| 76 | + strchr(" \t\n\v\f\r", this->image_buf[3])) | |
| 77 | + { | |
| 78 | + // We've found an EI operator. We've already written the | |
| 79 | + // EI operator to output; terminate with a newline | |
| 80 | + // character and resume normal processing. | |
| 81 | + writeNext("\n", 1); | |
| 82 | + this->in_inline_image = false; | |
| 83 | + QTC::TC("qpdf", "Pl_QPDFTokenizer found EI"); | |
| 84 | + } | |
| 85 | + else | |
| 86 | + { | |
| 87 | + writeNext(&ch, 1); | |
| 88 | + } | |
| 64 | 89 | return; |
| 65 | 90 | } |
| 66 | 91 | |
| ... | ... | @@ -75,18 +100,10 @@ Pl_QPDFTokenizer::processChar(char ch) |
| 75 | 100 | this->newline_after_next_token = false; |
| 76 | 101 | } |
| 77 | 102 | if ((token.getType() == QPDFTokenizer::tt_word) && |
| 78 | - (token.getValue() == "BI")) | |
| 103 | + (token.getValue() == "ID")) | |
| 79 | 104 | { |
| 80 | - // Uh oh.... we're not sophisticated enough to handle | |
| 81 | - // inline images safely. We'd have to set up all the | |
| 82 | - // filters and pipe the image data through it until the | |
| 83 | - // filtered output was the right size for an image of the | |
| 84 | - // specified dimensions. Then we'd either have to write | |
| 85 | - // out raw image data or continue to write filtered data, | |
| 86 | - // resuming normalization when we get to the end. | |
| 87 | - // Instead, for now, we'll just turn off normalization for | |
| 88 | - // the remainder of this stream. | |
| 89 | - this->pass_through = true; | |
| 105 | + // Suspend normal scanning until we find an EI token. | |
| 106 | + this->in_inline_image = true; | |
| 90 | 107 | if (this->unread_char) |
| 91 | 108 | { |
| 92 | 109 | writeNext(&this->char_to_unread, 1); |
| ... | ... | @@ -156,7 +173,7 @@ void |
| 156 | 173 | Pl_QPDFTokenizer::finish() |
| 157 | 174 | { |
| 158 | 175 | this->tokenizer.presentEOF(); |
| 159 | - if (! this->pass_through) | |
| 176 | + if (! this->in_inline_image) | |
| 160 | 177 | { |
| 161 | 178 | QPDFTokenizer::Token token; |
| 162 | 179 | if (tokenizer.getToken(token, this->unread_char, this->char_to_unread)) | ... | ... |
libqpdf/qpdf/Pl_QPDFTokenizer.hh
| ... | ... | @@ -33,7 +33,9 @@ class Pl_QPDFTokenizer: public Pipeline |
| 33 | 33 | bool last_char_was_cr; |
| 34 | 34 | bool unread_char; |
| 35 | 35 | char char_to_unread; |
| 36 | - bool pass_through; | |
| 36 | + bool in_inline_image; | |
| 37 | + static int const IMAGE_BUF_SIZE = 4; // must be >= 4 | |
| 38 | + char image_buf[IMAGE_BUF_SIZE]; | |
| 37 | 39 | }; |
| 38 | 40 | |
| 39 | 41 | #endif // __PL_QPDFTOKENIZER_HH__ | ... | ... |
qpdf/qpdf.testcov
qpdf/qtest/qpdf.test
| ... | ... | @@ -1257,8 +1257,8 @@ my @flags = (["-qdf", # 1 |
| 1257 | 1257 | "no arguments"], |
| 1258 | 1258 | ); |
| 1259 | 1259 | |
| 1260 | -$n_tests += (@files * @flags * 2 * 3); | |
| 1261 | -$n_compare_pdfs += (@files * @flags * 2); | |
| 1260 | +$n_tests += 1 + (@files * @flags * 2 * 3); | |
| 1261 | +$n_compare_pdfs += 1 + (@files * @flags * 2); | |
| 1262 | 1262 | $n_acroread += (@files * @flags * 2); |
| 1263 | 1263 | |
| 1264 | 1264 | foreach my $file (@files) |
| ... | ... | @@ -1311,6 +1311,14 @@ foreach my $file (@files) |
| 1311 | 1311 | } |
| 1312 | 1312 | } |
| 1313 | 1313 | |
| 1314 | +# inline-images-cr.pdf is xbkm938-dies.pdf from PDF collection | |
| 1315 | +$td->runtest("convert inline-images-cr to qdf", | |
| 1316 | + {$td->COMMAND => "qpdf --static-id --no-original-object-ids" . | |
| 1317 | + " --qdf inline-images-cr.pdf a.pdf"}, | |
| 1318 | + {$td->STRING => "", $td->EXIT_STATUS => 0}); | |
| 1319 | + | |
| 1320 | +compare_pdfs("inline-images-cr.pdf", "a.pdf"); | |
| 1321 | + | |
| 1314 | 1322 | show_ntests(); |
| 1315 | 1323 | # ---------- |
| 1316 | 1324 | $td->notify("--- fix-qdf Tests ---"); | ... | ... |
qpdf/qtest/qpdf/inline-images-cr.pdf
0 → 100644
No preview for this file type