Commit 5136238f2a973f693cea53c340dcff23a655531f

Authored by Jay Berkenbilt
1 parent 30709935

Detect and report bad tokens in content normalization

ChangeLog
@@ -153,6 +153,25 @@ @@ -153,6 +153,25 @@
153 * Provide heavily annotated examples/pdf-filter-tokens.cc example 153 * Provide heavily annotated examples/pdf-filter-tokens.cc example
154 that illustrates use of some simple token filters. 154 that illustrates use of some simple token filters.
155 155
  156 + * When normalizing content streams, as in qdf mode, issue warning
  157 + about bad tokens. Content streams are only normalized when this is
  158 + explicitly requested, so this has no impact on normal operation.
  159 + However, in qdf mode, if qpdf detects a bad token, it means that
  160 + either there's a bug in qpdf's lexer, that the file is damaged, or
  161 + that the page's contents are split in a weird way. In any of those
  162 + cases, qpdf could potentially damage the stream's contents by
  163 + replacing carriage returns with newlines or otherwise messing with
  164 + spaces. The most likely case of this would be an inline image's
  165 + compressed data being divided across two streams and having the
  166 + compressed data in the second stream contain a carriage return as
  167 + part of its binary data. If you are using qdf mode just to look at
  168 + PDF files in text editors, this usually doesn't matter. In cases
  169 + of contents split across multiple streams, coalescing streams
  170 + would eliminate the problem, so the warning mentions this. Prior
  171 + to this enhancement, the chances of qdf mode writing incorrect
  172 + data were already very low. This change should make it nearly
  173 + impossible for qdf mode to unknowingly write invalid data.
  174 +
156 2018-02-04 Jay Berkenbilt <ejb@ql.org> 175 2018-02-04 Jay Berkenbilt <ejb@ql.org>
157 176
158 * Add QPDFWriter::setLinearizationPass1Filename method and 177 * Add QPDFWriter::setLinearizationPass1Filename method and
libqpdf/ContentNormalizer.cc
1 #include <qpdf/ContentNormalizer.hh> 1 #include <qpdf/ContentNormalizer.hh>
2 #include <qpdf/QUtil.hh> 2 #include <qpdf/QUtil.hh>
3 3
4 -ContentNormalizer::ContentNormalizer() 4 +ContentNormalizer::ContentNormalizer() :
  5 + any_bad_tokens(false),
  6 + last_token_was_bad(false)
5 { 7 {
6 } 8 }
7 9
@@ -15,6 +17,16 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const&amp; token) @@ -15,6 +17,16 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const&amp; token)
15 std::string value = token.getRawValue(); 17 std::string value = token.getRawValue();
16 QPDFTokenizer::token_type_e token_type = token.getType(); 18 QPDFTokenizer::token_type_e token_type = token.getType();
17 19
  20 + if (token_type == QPDFTokenizer::tt_bad)
  21 + {
  22 + this->any_bad_tokens = true;
  23 + this->last_token_was_bad = true;
  24 + }
  25 + else if (token_type != QPDFTokenizer::tt_eof)
  26 + {
  27 + this->last_token_was_bad = false;
  28 + }
  29 +
18 switch (token_type) 30 switch (token_type)
19 { 31 {
20 case QPDFTokenizer::tt_space: 32 case QPDFTokenizer::tt_space:
@@ -75,3 +87,15 @@ ContentNormalizer::handleEOF() @@ -75,3 +87,15 @@ ContentNormalizer::handleEOF()
75 { 87 {
76 finish(); 88 finish();
77 } 89 }
  90 +
  91 +bool
  92 +ContentNormalizer::anyBadTokens() const
  93 +{
  94 + return this->any_bad_tokens;
  95 +}
  96 +
  97 +bool
  98 +ContentNormalizer::lastTokenWasBad()const
  99 +{
  100 + return this->last_token_was_bad;
  101 +}
libqpdf/QPDF_Stream.cc
@@ -609,6 +609,33 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline, @@ -609,6 +609,33 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline,
609 } 609 }
610 } 610 }
611 611
  612 + if (filter &&
  613 + (! suppress_warnings) &&
  614 + normalizer.getPointer() &&
  615 + normalizer->anyBadTokens())
  616 + {
  617 + warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
  618 + "", this->offset,
  619 + "content normalization encountered bad tokens"));
  620 + if (normalizer->lastTokenWasBad())
  621 + {
  622 + QTC::TC("qpdf", "QPDF_Stream bad token at end during normalize");
  623 + warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
  624 + "", this->offset,
  625 + "normalized content ended with a bad token;"
  626 + " you may be able to resolve this by"
  627 + " coalescing content streams in combination"
  628 + " with normalizing content. From the command"
  629 + " line, specify --coalesce-contents"));
  630 + }
  631 + warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
  632 + "", this->offset,
  633 + "Resulting stream data may be corrupted but is"
  634 + " may still useful for manual inspection."
  635 + " For more information on this warning, search"
  636 + " for content normalization in the manual."));
  637 + }
  638 +
612 return filter; 639 return filter;
613 } 640 }
614 641
libqpdf/qpdf/ContentNormalizer.hh
@@ -10,6 +10,13 @@ class ContentNormalizer: public QPDFObjectHandle::TokenFilter @@ -10,6 +10,13 @@ class ContentNormalizer: public QPDFObjectHandle::TokenFilter
10 virtual ~ContentNormalizer(); 10 virtual ~ContentNormalizer();
11 virtual void handleToken(QPDFTokenizer::Token const&); 11 virtual void handleToken(QPDFTokenizer::Token const&);
12 virtual void handleEOF(); 12 virtual void handleEOF();
  13 +
  14 + bool anyBadTokens() const;
  15 + bool lastTokenWasBad() const;
  16 +
  17 + private:
  18 + bool any_bad_tokens;
  19 + bool last_token_was_bad;
13 }; 20 };
14 21
15 #endif // __CONTENTNORMALIZER_HH__ 22 #endif // __CONTENTNORMALIZER_HH__
qpdf/qpdf.testcov
@@ -306,3 +306,4 @@ Pl_QPDFTokenizer found ID 0 @@ -306,3 +306,4 @@ Pl_QPDFTokenizer found ID 0
306 QPDFObjectHandle non-stream in stream array 0 306 QPDFObjectHandle non-stream in stream array 0
307 QPDFObjectHandle coalesce called on stream 0 307 QPDFObjectHandle coalesce called on stream 0
308 QPDFObjectHandle coalesce provide stream data 0 308 QPDFObjectHandle coalesce provide stream data 0
  309 +QPDF_Stream bad token at end during normalize 0
qpdf/qtest/qpdf.test
@@ -737,8 +737,16 @@ $td-&gt;runtest(&quot;stream with tiff predictor&quot;, @@ -737,8 +737,16 @@ $td-&gt;runtest(&quot;stream with tiff predictor&quot;,
737 show_ntests(); 737 show_ntests();
738 # ---------- 738 # ----------
739 $td->notify("--- Coalesce contents ---"); 739 $td->notify("--- Coalesce contents ---");
740 -$n_tests += 4; 740 +$n_tests += 6;
741 741
  742 +$td->runtest("qdf with normalize warnings",
  743 + {$td->COMMAND =>
  744 + "qpdf --qdf --static-id coalesce.pdf a.pdf"},
  745 + {$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3},
  746 + $td->NORMALIZE_NEWLINES);
  747 +$td->runtest("check output",
  748 + {$td->FILE => "a.pdf"},
  749 + {$td->FILE => "coalesce.qdf"});
742 $td->runtest("coalesce contents with qdf", 750 $td->runtest("coalesce contents with qdf",
743 {$td->COMMAND => 751 {$td->COMMAND =>
744 "qpdf --qdf --static-id" . 752 "qpdf --qdf --static-id" .
qpdf/qtest/qpdf/coalesce.qdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/good14.out
@@ -13,7 +13,9 @@ three lines @@ -13,7 +13,9 @@ three lines
13 <8a8b> 13 <8a8b>
14 (ab) 14 (ab)
15 <8c><dd> ) > 15 <8c><dd> ) >
16 -<610062> (MOO)-- stream 1 -- 16 +<610062> (MOO)WARNING: good14.pdf (file position 628): content normalization encountered bad tokens
  17 +WARNING: good14.pdf (file position 628): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
  18 +-- stream 1 --
17 This stream does end with a newline. 19 This stream does end with a newline.
18 // tests: 20 // tests:
19 // bad tokens preserved 21 // bad tokens preserved
@@ -31,10 +33,18 @@ This stream does end with a newline. @@ -31,10 +33,18 @@ This stream does end with a newline.
31 33
32 /good name 34 /good name
33 /bad#00name 35 /bad#00name
  36 +WARNING: good14.pdf (file position 860): content normalization encountered bad tokens
  37 +WARNING: good14.pdf (file position 860): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
34 -- stream 2 -- 38 -- stream 2 --
35 (This stream ends with a \001 bad token 39 (This stream ends with a \001 bad token
  40 +WARNING: good14.pdf (file position 1316): content normalization encountered bad tokens
  41 +WARNING: good14.pdf (file position 1316): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
  42 +WARNING: good14.pdf (file position 1316): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
36 -- stream 3 -- 43 -- stream 3 --
37 -<AB X-- stream 4 -- 44 +<AB XWARNING: good14.pdf (file position 1406): content normalization encountered bad tokens
  45 +WARNING: good14.pdf (file position 1406): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
  46 +WARNING: good14.pdf (file position 1406): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
  47 +-- stream 4 --
38 (ends with a name) 48 (ends with a name)
39 /ThisMustBeLast-- stream 5 -- 49 /ThisMustBeLast-- stream 5 --
40 % This stream has an inline image marker that is not terminated 50 % This stream has an inline image marker that is not terminated
@@ -44,4 +54,7 @@ BI @@ -44,4 +54,7 @@ BI
44 ID 54 ID
45 <506f7 55 <506f7
46 461746f> 56 461746f>
  57 +WARNING: good14.pdf (file position 1549): content normalization encountered bad tokens
  58 +WARNING: good14.pdf (file position 1549): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
  59 +WARNING: good14.pdf (file position 1549): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
47 test 3 done 60 test 3 done
qpdf/qtest/qpdf/normalize-warnings.out 0 → 100644
  1 +WARNING: coalesce.pdf (file position 671): content normalization encountered bad tokens
  2 +WARNING: coalesce.pdf (file position 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
  3 +WARNING: coalesce.pdf (file position 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
  4 +WARNING: coalesce.pdf (file position 823): content normalization encountered bad tokens
  5 +WARNING: coalesce.pdf (file position 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
  6 +WARNING: coalesce.pdf (file position 962): content normalization encountered bad tokens
  7 +WARNING: coalesce.pdf (file position 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
  8 +WARNING: coalesce.pdf (file position 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
  9 +qpdf: operation succeeded with warnings; resulting file may have some problems