Commit 5136238f2a973f693cea53c340dcff23a655531f
1 parent
30709935
Detect and report bad tokens in content normalization
Showing
9 changed files
with
343 additions
and
4 deletions
ChangeLog
| @@ -153,6 +153,25 @@ | @@ -153,6 +153,25 @@ | ||
| 153 | * Provide heavily annotated examples/pdf-filter-tokens.cc example | 153 | * Provide heavily annotated examples/pdf-filter-tokens.cc example |
| 154 | that illustrates use of some simple token filters. | 154 | that illustrates use of some simple token filters. |
| 155 | 155 | ||
| 156 | + * When normalizing content streams, as in qdf mode, issue warning | ||
| 157 | + about bad tokens. Content streams are only normalized when this is | ||
| 158 | + explicitly requested, so this has no impact on normal operation. | ||
| 159 | + However, in qdf mode, if qpdf detects a bad token, it means that | ||
| 160 | + either there's a bug in qpdf's lexer, that the file is damaged, or | ||
| 161 | + that the page's contents are split in a weird way. In any of those | ||
| 162 | + cases, qpdf could potentially damage the stream's contents by | ||
| 163 | + replacing carriage returns with newlines or otherwise messing with | ||
| 164 | + spaces. The most likely case of this would be an inline image's | ||
| 165 | + compressed data being divided across two streams and having the | ||
| 166 | + compressed data in the second stream contain a carriage return as | ||
| 167 | + part of its binary data. If you are using qdf mode just to look at | ||
| 168 | + PDF files in text editors, this usually doesn't matter. In cases | ||
| 169 | + of contents split across multiple streams, coalescing streams | ||
| 170 | + would eliminate the problem, so the warning mentions this. Prior | ||
| 171 | + to this enhancement, the chances of qdf mode writing incorrect | ||
| 172 | + data were already very low. This change should make it nearly | ||
| 173 | + impossible for qdf mode to unknowingly write invalid data. | ||
| 174 | + | ||
| 156 | 2018-02-04 Jay Berkenbilt <ejb@ql.org> | 175 | 2018-02-04 Jay Berkenbilt <ejb@ql.org> |
| 157 | 176 | ||
| 158 | * Add QPDFWriter::setLinearizationPass1Filename method and | 177 | * Add QPDFWriter::setLinearizationPass1Filename method and |
libqpdf/ContentNormalizer.cc
| 1 | #include <qpdf/ContentNormalizer.hh> | 1 | #include <qpdf/ContentNormalizer.hh> |
| 2 | #include <qpdf/QUtil.hh> | 2 | #include <qpdf/QUtil.hh> |
| 3 | 3 | ||
| 4 | -ContentNormalizer::ContentNormalizer() | 4 | +ContentNormalizer::ContentNormalizer() : |
| 5 | + any_bad_tokens(false), | ||
| 6 | + last_token_was_bad(false) | ||
| 5 | { | 7 | { |
| 6 | } | 8 | } |
| 7 | 9 | ||
| @@ -15,6 +17,16 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) | @@ -15,6 +17,16 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) | ||
| 15 | std::string value = token.getRawValue(); | 17 | std::string value = token.getRawValue(); |
| 16 | QPDFTokenizer::token_type_e token_type = token.getType(); | 18 | QPDFTokenizer::token_type_e token_type = token.getType(); |
| 17 | 19 | ||
| 20 | + if (token_type == QPDFTokenizer::tt_bad) | ||
| 21 | + { | ||
| 22 | + this->any_bad_tokens = true; | ||
| 23 | + this->last_token_was_bad = true; | ||
| 24 | + } | ||
| 25 | + else if (token_type != QPDFTokenizer::tt_eof) | ||
| 26 | + { | ||
| 27 | + this->last_token_was_bad = false; | ||
| 28 | + } | ||
| 29 | + | ||
| 18 | switch (token_type) | 30 | switch (token_type) |
| 19 | { | 31 | { |
| 20 | case QPDFTokenizer::tt_space: | 32 | case QPDFTokenizer::tt_space: |
| @@ -75,3 +87,15 @@ ContentNormalizer::handleEOF() | @@ -75,3 +87,15 @@ ContentNormalizer::handleEOF() | ||
| 75 | { | 87 | { |
| 76 | finish(); | 88 | finish(); |
| 77 | } | 89 | } |
| 90 | + | ||
| 91 | +bool | ||
| 92 | +ContentNormalizer::anyBadTokens() const | ||
| 93 | +{ | ||
| 94 | + return this->any_bad_tokens; | ||
| 95 | +} | ||
| 96 | + | ||
| 97 | +bool | ||
| 98 | +ContentNormalizer::lastTokenWasBad()const | ||
| 99 | +{ | ||
| 100 | + return this->last_token_was_bad; | ||
| 101 | +} |
libqpdf/QPDF_Stream.cc
| @@ -609,6 +609,33 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline, | @@ -609,6 +609,33 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline, | ||
| 609 | } | 609 | } |
| 610 | } | 610 | } |
| 611 | 611 | ||
| 612 | + if (filter && | ||
| 613 | + (! suppress_warnings) && | ||
| 614 | + normalizer.getPointer() && | ||
| 615 | + normalizer->anyBadTokens()) | ||
| 616 | + { | ||
| 617 | + warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(), | ||
| 618 | + "", this->offset, | ||
| 619 | + "content normalization encountered bad tokens")); | ||
| 620 | + if (normalizer->lastTokenWasBad()) | ||
| 621 | + { | ||
| 622 | + QTC::TC("qpdf", "QPDF_Stream bad token at end during normalize"); | ||
| 623 | + warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(), | ||
| 624 | + "", this->offset, | ||
| 625 | + "normalized content ended with a bad token;" | ||
| 626 | + " you may be able to resolve this by" | ||
| 627 | + " coalescing content streams in combination" | ||
| 628 | + " with normalizing content. From the command" | ||
| 629 | + " line, specify --coalesce-contents")); | ||
| 630 | + } | ||
| 631 | + warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(), | ||
| 632 | + "", this->offset, | ||
| 633 | + "Resulting stream data may be corrupted but is" | ||
| 634 | + " may still useful for manual inspection." | ||
| 635 | + " For more information on this warning, search" | ||
| 636 | + " for content normalization in the manual.")); | ||
| 637 | + } | ||
| 638 | + | ||
| 612 | return filter; | 639 | return filter; |
| 613 | } | 640 | } |
| 614 | 641 |
libqpdf/qpdf/ContentNormalizer.hh
| @@ -10,6 +10,13 @@ class ContentNormalizer: public QPDFObjectHandle::TokenFilter | @@ -10,6 +10,13 @@ class ContentNormalizer: public QPDFObjectHandle::TokenFilter | ||
| 10 | virtual ~ContentNormalizer(); | 10 | virtual ~ContentNormalizer(); |
| 11 | virtual void handleToken(QPDFTokenizer::Token const&); | 11 | virtual void handleToken(QPDFTokenizer::Token const&); |
| 12 | virtual void handleEOF(); | 12 | virtual void handleEOF(); |
| 13 | + | ||
| 14 | + bool anyBadTokens() const; | ||
| 15 | + bool lastTokenWasBad() const; | ||
| 16 | + | ||
| 17 | + private: | ||
| 18 | + bool any_bad_tokens; | ||
| 19 | + bool last_token_was_bad; | ||
| 13 | }; | 20 | }; |
| 14 | 21 | ||
| 15 | #endif // __CONTENTNORMALIZER_HH__ | 22 | #endif // __CONTENTNORMALIZER_HH__ |
qpdf/qpdf.testcov
| @@ -306,3 +306,4 @@ Pl_QPDFTokenizer found ID 0 | @@ -306,3 +306,4 @@ Pl_QPDFTokenizer found ID 0 | ||
| 306 | QPDFObjectHandle non-stream in stream array 0 | 306 | QPDFObjectHandle non-stream in stream array 0 |
| 307 | QPDFObjectHandle coalesce called on stream 0 | 307 | QPDFObjectHandle coalesce called on stream 0 |
| 308 | QPDFObjectHandle coalesce provide stream data 0 | 308 | QPDFObjectHandle coalesce provide stream data 0 |
| 309 | +QPDF_Stream bad token at end during normalize 0 |
qpdf/qtest/qpdf.test
| @@ -737,8 +737,16 @@ $td->runtest("stream with tiff predictor", | @@ -737,8 +737,16 @@ $td->runtest("stream with tiff predictor", | ||
| 737 | show_ntests(); | 737 | show_ntests(); |
| 738 | # ---------- | 738 | # ---------- |
| 739 | $td->notify("--- Coalesce contents ---"); | 739 | $td->notify("--- Coalesce contents ---"); |
| 740 | -$n_tests += 4; | 740 | +$n_tests += 6; |
| 741 | 741 | ||
| 742 | +$td->runtest("qdf with normalize warnings", | ||
| 743 | + {$td->COMMAND => | ||
| 744 | + "qpdf --qdf --static-id coalesce.pdf a.pdf"}, | ||
| 745 | + {$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3}, | ||
| 746 | + $td->NORMALIZE_NEWLINES); | ||
| 747 | +$td->runtest("check output", | ||
| 748 | + {$td->FILE => "a.pdf"}, | ||
| 749 | + {$td->FILE => "coalesce.qdf"}); | ||
| 742 | $td->runtest("coalesce contents with qdf", | 750 | $td->runtest("coalesce contents with qdf", |
| 743 | {$td->COMMAND => | 751 | {$td->COMMAND => |
| 744 | "qpdf --qdf --static-id" . | 752 | "qpdf --qdf --static-id" . |
qpdf/qtest/qpdf/coalesce.qdf
0 → 100644
No preview for this file type
qpdf/qtest/qpdf/good14.out
| @@ -13,7 +13,9 @@ three lines | @@ -13,7 +13,9 @@ three lines | ||
| 13 | <8a8b> | 13 | <8a8b> |
| 14 | (ab) | 14 | (ab) |
| 15 | <8c><dd> ) > | 15 | <8c><dd> ) > |
| 16 | -<610062> (MOO)-- stream 1 -- | 16 | +<610062> (MOO)WARNING: good14.pdf (file position 628): content normalization encountered bad tokens |
| 17 | +WARNING: good14.pdf (file position 628): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | ||
| 18 | +-- stream 1 -- | ||
| 17 | This stream does end with a newline. | 19 | This stream does end with a newline. |
| 18 | // tests: | 20 | // tests: |
| 19 | // bad tokens preserved | 21 | // bad tokens preserved |
| @@ -31,10 +33,18 @@ This stream does end with a newline. | @@ -31,10 +33,18 @@ This stream does end with a newline. | ||
| 31 | 33 | ||
| 32 | /good name | 34 | /good name |
| 33 | /bad#00name | 35 | /bad#00name |
| 36 | +WARNING: good14.pdf (file position 860): content normalization encountered bad tokens | ||
| 37 | +WARNING: good14.pdf (file position 860): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | ||
| 34 | -- stream 2 -- | 38 | -- stream 2 -- |
| 35 | (This stream ends with a \001 bad token | 39 | (This stream ends with a \001 bad token |
| 40 | +WARNING: good14.pdf (file position 1316): content normalization encountered bad tokens | ||
| 41 | +WARNING: good14.pdf (file position 1316): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | ||
| 42 | +WARNING: good14.pdf (file position 1316): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | ||
| 36 | -- stream 3 -- | 43 | -- stream 3 -- |
| 37 | -<AB X-- stream 4 -- | 44 | +<AB XWARNING: good14.pdf (file position 1406): content normalization encountered bad tokens |
| 45 | +WARNING: good14.pdf (file position 1406): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | ||
| 46 | +WARNING: good14.pdf (file position 1406): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | ||
| 47 | +-- stream 4 -- | ||
| 38 | (ends with a name) | 48 | (ends with a name) |
| 39 | /ThisMustBeLast-- stream 5 -- | 49 | /ThisMustBeLast-- stream 5 -- |
| 40 | % This stream has an inline image marker that is not terminated | 50 | % This stream has an inline image marker that is not terminated |
| @@ -44,4 +54,7 @@ BI | @@ -44,4 +54,7 @@ BI | ||
| 44 | ID | 54 | ID |
| 45 | <506f7 | 55 | <506f7 |
| 46 | 461746f> | 56 | 461746f> |
| 57 | +WARNING: good14.pdf (file position 1549): content normalization encountered bad tokens | ||
| 58 | +WARNING: good14.pdf (file position 1549): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | ||
| 59 | +WARNING: good14.pdf (file position 1549): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | ||
| 47 | test 3 done | 60 | test 3 done |
qpdf/qtest/qpdf/normalize-warnings.out
0 → 100644
| 1 | +WARNING: coalesce.pdf (file position 671): content normalization encountered bad tokens | ||
| 2 | +WARNING: coalesce.pdf (file position 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | ||
| 3 | +WARNING: coalesce.pdf (file position 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | ||
| 4 | +WARNING: coalesce.pdf (file position 823): content normalization encountered bad tokens | ||
| 5 | +WARNING: coalesce.pdf (file position 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | ||
| 6 | +WARNING: coalesce.pdf (file position 962): content normalization encountered bad tokens | ||
| 7 | +WARNING: coalesce.pdf (file position 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | ||
| 8 | +WARNING: coalesce.pdf (file position 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | ||
| 9 | +qpdf: operation succeeded with warnings; resulting file may have some problems |