Commit 5136238f2a973f693cea53c340dcff23a655531f
1 parent
30709935
Detect and report bad tokens in content normalization
Showing
9 changed files
with
343 additions
and
4 deletions
ChangeLog
| ... | ... | @@ -153,6 +153,25 @@ |
| 153 | 153 | * Provide heavily annotated examples/pdf-filter-tokens.cc example |
| 154 | 154 | that illustrates use of some simple token filters. |
| 155 | 155 | |
| 156 | + * When normalizing content streams, as in qdf mode, issue warnings | |
| 157 | + about bad tokens. Content streams are only normalized when this is | |
| 158 | + explicitly requested, so this has no impact on normal operation. | |
| 159 | + However, in qdf mode, if qpdf detects a bad token, it means that | |
| 160 | + either there's a bug in qpdf's lexer, that the file is damaged, or | |
| 161 | + that the page's contents are split in a weird way. In any of those | |
| 162 | + cases, qpdf could potentially damage the stream's contents by | |
| 163 | + replacing carriage returns with newlines or otherwise messing with | |
| 164 | + spaces. The most likely case of this would be an inline image's | |
| 165 | + compressed data being divided across two streams and having the | |
| 166 | + compressed data in the second stream contain a carriage return as | |
| 167 | + part of its binary data. If you are using qdf mode just to look at | |
| 168 | + PDF files in text editors, this usually doesn't matter. In cases | |
| 169 | + of contents split across multiple streams, coalescing streams | |
| 170 | + would eliminate the problem, so the warning mentions this. Prior | |
| 171 | + to this enhancement, the chances of qdf mode writing incorrect | |
| 172 | + data were already very low. This change should make it nearly | |
| 173 | + impossible for qdf mode to unknowingly write invalid data. | |
| 174 | + | |
| 156 | 175 | 2018-02-04 Jay Berkenbilt <ejb@ql.org> |
| 157 | 176 | |
| 158 | 177 | * Add QPDFWriter::setLinearizationPass1Filename method and | ... | ... |
libqpdf/ContentNormalizer.cc
| 1 | 1 | #include <qpdf/ContentNormalizer.hh> |
| 2 | 2 | #include <qpdf/QUtil.hh> |
| 3 | 3 | |
| 4 | -ContentNormalizer::ContentNormalizer() | |
| 4 | +ContentNormalizer::ContentNormalizer() : | |
| 5 | + any_bad_tokens(false), | |
| 6 | + last_token_was_bad(false) | |
| 5 | 7 | { |
| 6 | 8 | } |
| 7 | 9 | |
| ... | ... | @@ -15,6 +17,16 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) |
| 15 | 17 | std::string value = token.getRawValue(); |
| 16 | 18 | QPDFTokenizer::token_type_e token_type = token.getType(); |
| 17 | 19 | |
| 20 | + if (token_type == QPDFTokenizer::tt_bad) | |
| 21 | + { | |
| 22 | + this->any_bad_tokens = true; | |
| 23 | + this->last_token_was_bad = true; | |
| 24 | + } | |
| 25 | + else if (token_type != QPDFTokenizer::tt_eof) | |
| 26 | + { | |
| 27 | + this->last_token_was_bad = false; | |
| 28 | + } | |
| 29 | + | |
| 18 | 30 | switch (token_type) |
| 19 | 31 | { |
| 20 | 32 | case QPDFTokenizer::tt_space: |
| ... | ... | @@ -75,3 +87,15 @@ ContentNormalizer::handleEOF() |
| 75 | 87 | { |
| 76 | 88 | finish(); |
| 77 | 89 | } |
| 90 | + | |
| 91 | +bool | |
| 92 | +ContentNormalizer::anyBadTokens() const | |
| 93 | +{ | |
| 94 | + return this->any_bad_tokens; | |
| 95 | +} | |
| 96 | + | |
| 97 | +bool | |
| 98 | +ContentNormalizer::lastTokenWasBad()const | |
| 99 | +{ | |
| 100 | + return this->last_token_was_bad; | |
| 101 | +} | ... | ... |
libqpdf/QPDF_Stream.cc
| ... | ... | @@ -609,6 +609,33 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline, |
| 609 | 609 | } |
| 610 | 610 | } |
| 611 | 611 | |
| 612 | + if (filter && | |
| 613 | + (! suppress_warnings) && | |
| 614 | + normalizer.getPointer() && | |
| 615 | + normalizer->anyBadTokens()) | |
| 616 | + { | |
| 617 | + warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(), | |
| 618 | + "", this->offset, | |
| 619 | + "content normalization encountered bad tokens")); | |
| 620 | + if (normalizer->lastTokenWasBad()) | |
| 621 | + { | |
| 622 | + QTC::TC("qpdf", "QPDF_Stream bad token at end during normalize"); | |
| 623 | + warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(), | |
| 624 | + "", this->offset, | |
| 625 | + "normalized content ended with a bad token;" | |
| 626 | + " you may be able to resolve this by" | |
| 627 | + " coalescing content streams in combination" | |
| 628 | + " with normalizing content. From the command" | |
| 629 | + " line, specify --coalesce-contents")); | |
| 630 | + } | |
| 631 | + warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(), | |
| 632 | + "", this->offset, | |
| 633 | + "Resulting stream data may be corrupted but is" | |
| 634 | + " may still useful for manual inspection." | |
| 635 | + " For more information on this warning, search" | |
| 636 | + " for content normalization in the manual.")); | |
| 637 | + } | |
| 638 | + | |
| 612 | 639 | return filter; |
| 613 | 640 | } |
| 614 | 641 | ... | ... |
libqpdf/qpdf/ContentNormalizer.hh
| ... | ... | @@ -10,6 +10,13 @@ class ContentNormalizer: public QPDFObjectHandle::TokenFilter |
| 10 | 10 | virtual ~ContentNormalizer(); |
| 11 | 11 | virtual void handleToken(QPDFTokenizer::Token const&); |
| 12 | 12 | virtual void handleEOF(); |
| 13 | + | |
| 14 | + bool anyBadTokens() const; | |
| 15 | + bool lastTokenWasBad() const; | |
| 16 | + | |
| 17 | + private: | |
| 18 | + bool any_bad_tokens; | |
| 19 | + bool last_token_was_bad; | |
| 13 | 20 | }; |
| 14 | 21 | |
| 15 | 22 | #endif // __CONTENTNORMALIZER_HH__ | ... | ... |
qpdf/qpdf.testcov
qpdf/qtest/qpdf.test
| ... | ... | @@ -737,8 +737,16 @@ $td->runtest("stream with tiff predictor", |
| 737 | 737 | show_ntests(); |
| 738 | 738 | # ---------- |
| 739 | 739 | $td->notify("--- Coalesce contents ---"); |
| 740 | -$n_tests += 4; | |
| 740 | +$n_tests += 6; | |
| 741 | 741 | |
| 742 | +$td->runtest("qdf with normalize warnings", | |
| 743 | + {$td->COMMAND => | |
| 744 | + "qpdf --qdf --static-id coalesce.pdf a.pdf"}, | |
| 745 | + {$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3}, | |
| 746 | + $td->NORMALIZE_NEWLINES); | |
| 747 | +$td->runtest("check output", | |
| 748 | + {$td->FILE => "a.pdf"}, | |
| 749 | + {$td->FILE => "coalesce.qdf"}); | |
| 742 | 750 | $td->runtest("coalesce contents with qdf", |
| 743 | 751 | {$td->COMMAND => |
| 744 | 752 | "qpdf --qdf --static-id" . | ... | ... |
qpdf/qtest/qpdf/coalesce.qdf
0 → 100644
No preview for this file type
qpdf/qtest/qpdf/good14.out
| ... | ... | @@ -13,7 +13,9 @@ three lines |
| 13 | 13 | <8a8b> |
| 14 | 14 | (ab) |
| 15 | 15 | <8c><dd> ) > |
| 16 | -<610062> (MOO)-- stream 1 -- | |
| 16 | +<610062> (MOO)WARNING: good14.pdf (file position 628): content normalization encountered bad tokens | |
| 17 | +WARNING: good14.pdf (file position 628): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | |
| 18 | +-- stream 1 -- | |
| 17 | 19 | This stream does end with a newline. |
| 18 | 20 | // tests: |
| 19 | 21 | // bad tokens preserved |
| ... | ... | @@ -31,10 +33,18 @@ This stream does end with a newline. |
| 31 | 33 | |
| 32 | 34 | /good name |
| 33 | 35 | /bad#00name |
| 36 | +WARNING: good14.pdf (file position 860): content normalization encountered bad tokens | |
| 37 | +WARNING: good14.pdf (file position 860): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | |
| 34 | 38 | -- stream 2 -- |
| 35 | 39 | (This stream ends with a \001 bad token |
| 40 | +WARNING: good14.pdf (file position 1316): content normalization encountered bad tokens | |
| 41 | +WARNING: good14.pdf (file position 1316): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | |
| 42 | +WARNING: good14.pdf (file position 1316): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | |
| 36 | 43 | -- stream 3 -- |
| 37 | -<AB X-- stream 4 -- | |
| 44 | +<AB XWARNING: good14.pdf (file position 1406): content normalization encountered bad tokens | |
| 45 | +WARNING: good14.pdf (file position 1406): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | |
| 46 | +WARNING: good14.pdf (file position 1406): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | |
| 47 | +-- stream 4 -- | |
| 38 | 48 | (ends with a name) |
| 39 | 49 | /ThisMustBeLast-- stream 5 -- |
| 40 | 50 | % This stream has an inline image marker that is not terminated |
| ... | ... | @@ -44,4 +54,7 @@ BI |
| 44 | 54 | ID |
| 45 | 55 | <506f7 |
| 46 | 56 | 461746f> |
| 57 | +WARNING: good14.pdf (file position 1549): content normalization encountered bad tokens | |
| 58 | +WARNING: good14.pdf (file position 1549): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | |
| 59 | +WARNING: good14.pdf (file position 1549): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | |
| 47 | 60 | test 3 done | ... | ... |
qpdf/qtest/qpdf/normalize-warnings.out
0 → 100644
| 1 | +WARNING: coalesce.pdf (file position 671): content normalization encountered bad tokens | |
| 2 | +WARNING: coalesce.pdf (file position 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | |
| 3 | +WARNING: coalesce.pdf (file position 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | |
| 4 | +WARNING: coalesce.pdf (file position 823): content normalization encountered bad tokens | |
| 5 | +WARNING: coalesce.pdf (file position 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | |
| 6 | +WARNING: coalesce.pdf (file position 962): content normalization encountered bad tokens | |
| 7 | +WARNING: coalesce.pdf (file position 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | |
| 8 | +WARNING: coalesce.pdf (file position 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | |
| 9 | +qpdf: operation succeeded with warnings; resulting file may have some problems | ... | ... |