Commit 5136238f2a973f693cea53c340dcff23a655531f

Authored by Jay Berkenbilt
1 parent 30709935

Detect and report bad tokens in content normalization

ChangeLog
... ... @@ -153,6 +153,25 @@
153 153 * Provide heavily annotated examples/pdf-filter-tokens.cc example
154 154 that illustrates use of some simple token filters.
155 155  
  156 + * When normalizing content streams, as in qdf mode, issue a warning
  157 + about bad tokens. Content streams are only normalized when this is
  158 + explicitly requested, so this has no impact on normal operation.
  159 + However, in qdf mode, if qpdf detects a bad token, it means that
  160 + either there's a bug in qpdf's lexer, that the file is damaged, or
  161 + that the page's contents are split in a weird way. In any of those
  162 + cases, qpdf could potentially damage the stream's contents by
  163 + replacing carriage returns with newlines or otherwise messing with
  164 + spaces. The most likely case of this would be an inline image's
  165 + compressed data being divided across two streams and having the
  166 + compressed data in the second stream contain a carriage return as
  167 + part of its binary data. If you are using qdf mode just to look at
  168 + PDF files in text editors, this usually doesn't matter. In cases
  169 + of contents split across multiple streams, coalescing streams
  170 + would eliminate the problem, so the warning mentions this. Prior
  171 + to this enhancement, the chances of qdf mode writing incorrect
  172 + data were already very low. This change should make it nearly
  173 + impossible for qdf mode to unknowingly write invalid data.
  174 +
156 175 2018-02-04 Jay Berkenbilt <ejb@ql.org>
157 176  
158 177 * Add QPDFWriter::setLinearizationPass1Filename method and
... ...
libqpdf/ContentNormalizer.cc
1 1 #include <qpdf/ContentNormalizer.hh>
2 2 #include <qpdf/QUtil.hh>
3 3  
4   -ContentNormalizer::ContentNormalizer()
  4 +ContentNormalizer::ContentNormalizer() :
  5 + any_bad_tokens(false),
  6 + last_token_was_bad(false)
5 7 {
6 8 }
7 9  
... ... @@ -15,6 +17,16 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
15 17 std::string value = token.getRawValue();
16 18 QPDFTokenizer::token_type_e token_type = token.getType();
17 19  
  20 + if (token_type == QPDFTokenizer::tt_bad)
  21 + {
  22 + this->any_bad_tokens = true;
  23 + this->last_token_was_bad = true;
  24 + }
  25 + else if (token_type != QPDFTokenizer::tt_eof)
  26 + {
  27 + this->last_token_was_bad = false;
  28 + }
  29 +
18 30 switch (token_type)
19 31 {
20 32 case QPDFTokenizer::tt_space:
... ... @@ -75,3 +87,15 @@ ContentNormalizer::handleEOF()
75 87 {
76 88 finish();
77 89 }
  90 +
  91 +bool
  92 +ContentNormalizer::anyBadTokens() const
  93 +{
  94 + return this->any_bad_tokens;
  95 +}
  96 +
  97 +bool
  98 +ContentNormalizer::lastTokenWasBad()const
  99 +{
  100 + return this->last_token_was_bad;
  101 +}
... ...
libqpdf/QPDF_Stream.cc
... ... @@ -609,6 +609,33 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline,
609 609 }
610 610 }
611 611  
  612 + if (filter &&
  613 + (! suppress_warnings) &&
  614 + normalizer.getPointer() &&
  615 + normalizer->anyBadTokens())
  616 + {
  617 + warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
  618 + "", this->offset,
  619 + "content normalization encountered bad tokens"));
  620 + if (normalizer->lastTokenWasBad())
  621 + {
  622 + QTC::TC("qpdf", "QPDF_Stream bad token at end during normalize");
  623 + warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
  624 + "", this->offset,
  625 + "normalized content ended with a bad token;"
  626 + " you may be able to resolve this by"
  627 + " coalescing content streams in combination"
  628 + " with normalizing content. From the command"
  629 + " line, specify --coalesce-contents"));
  630 + }
  631 + warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
  632 + "", this->offset,
  633 + "Resulting stream data may be corrupted but is"
  634 + " may still useful for manual inspection."
  635 + " For more information on this warning, search"
  636 + " for content normalization in the manual."));
  637 + }
  638 +
612 639 return filter;
613 640 }
614 641  
... ...
libqpdf/qpdf/ContentNormalizer.hh
... ... @@ -10,6 +10,13 @@ class ContentNormalizer: public QPDFObjectHandle::TokenFilter
10 10 virtual ~ContentNormalizer();
11 11 virtual void handleToken(QPDFTokenizer::Token const&);
12 12 virtual void handleEOF();
  13 +
  14 + bool anyBadTokens() const;
  15 + bool lastTokenWasBad() const;
  16 +
  17 + private:
  18 + bool any_bad_tokens;
  19 + bool last_token_was_bad;
13 20 };
14 21  
15 22 #endif // __CONTENTNORMALIZER_HH__
... ...
qpdf/qpdf.testcov
... ... @@ -306,3 +306,4 @@ Pl_QPDFTokenizer found ID 0
306 306 QPDFObjectHandle non-stream in stream array 0
307 307 QPDFObjectHandle coalesce called on stream 0
308 308 QPDFObjectHandle coalesce provide stream data 0
  309 +QPDF_Stream bad token at end during normalize 0
... ...
qpdf/qtest/qpdf.test
... ... @@ -737,8 +737,16 @@ $td->runtest("stream with tiff predictor",
737 737 show_ntests();
738 738 # ----------
739 739 $td->notify("--- Coalesce contents ---");
740   -$n_tests += 4;
  740 +$n_tests += 6;
741 741  
  742 +$td->runtest("qdf with normalize warnings",
  743 + {$td->COMMAND =>
  744 + "qpdf --qdf --static-id coalesce.pdf a.pdf"},
  745 + {$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3},
  746 + $td->NORMALIZE_NEWLINES);
  747 +$td->runtest("check output",
  748 + {$td->FILE => "a.pdf"},
  749 + {$td->FILE => "coalesce.qdf"});
742 750 $td->runtest("coalesce contents with qdf",
743 751 {$td->COMMAND =>
744 752 "qpdf --qdf --static-id" .
... ...
qpdf/qtest/qpdf/coalesce.qdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/good14.out
... ... @@ -13,7 +13,9 @@ three lines
13 13 <8a8b>
14 14 (ab)
15 15 <8c><dd> ) >
16   -<610062> (MOO)-- stream 1 --
  16 +<610062> (MOO)WARNING: good14.pdf (file position 628): content normalization encountered bad tokens
  17 +WARNING: good14.pdf (file position 628): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
  18 +-- stream 1 --
17 19 This stream does end with a newline.
18 20 // tests:
19 21 // bad tokens preserved
... ... @@ -31,10 +33,18 @@ This stream does end with a newline.
31 33  
32 34 /good name
33 35 /bad#00name
  36 +WARNING: good14.pdf (file position 860): content normalization encountered bad tokens
  37 +WARNING: good14.pdf (file position 860): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
34 38 -- stream 2 --
35 39 (This stream ends with a \001 bad token
  40 +WARNING: good14.pdf (file position 1316): content normalization encountered bad tokens
  41 +WARNING: good14.pdf (file position 1316): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
  42 +WARNING: good14.pdf (file position 1316): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
36 43 -- stream 3 --
37   -<AB X-- stream 4 --
  44 +<AB XWARNING: good14.pdf (file position 1406): content normalization encountered bad tokens
  45 +WARNING: good14.pdf (file position 1406): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
  46 +WARNING: good14.pdf (file position 1406): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
  47 +-- stream 4 --
38 48 (ends with a name)
39 49 /ThisMustBeLast-- stream 5 --
40 50 % This stream has an inline image marker that is not terminated
... ... @@ -44,4 +54,7 @@ BI
44 54 ID
45 55 <506f7
46 56 461746f>
  57 +WARNING: good14.pdf (file position 1549): content normalization encountered bad tokens
  58 +WARNING: good14.pdf (file position 1549): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
  59 +WARNING: good14.pdf (file position 1549): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
47 60 test 3 done
... ...
qpdf/qtest/qpdf/normalize-warnings.out 0 → 100644
  1 +WARNING: coalesce.pdf (file position 671): content normalization encountered bad tokens
  2 +WARNING: coalesce.pdf (file position 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
  3 +WARNING: coalesce.pdf (file position 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
  4 +WARNING: coalesce.pdf (file position 823): content normalization encountered bad tokens
  5 +WARNING: coalesce.pdf (file position 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
  6 +WARNING: coalesce.pdf (file position 962): content normalization encountered bad tokens
  7 +WARNING: coalesce.pdf (file position 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
  8 +WARNING: coalesce.pdf (file position 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
  9 +qpdf: operation succeeded with warnings; resulting file may have some problems
... ...