Detect and report bad tokens in content normalization

Jay Berkenbilt
1 parent 30709935
Showing 9 changed files with 343 additions and 4 deletions
ChangeLog
libqpdf/ContentNormalizer.cc
libqpdf/QPDF_Stream.cc
libqpdf/qpdf/ContentNormalizer.hh
qpdf/qpdf.testcov
qpdf/qtest/qpdf.test
qpdf/qtest/qpdf/coalesce.qdf
qpdf/qtest/qpdf/good14.out
qpdf/qtest/qpdf/normalize-warnings.out
@@ -153,6 +153,25 @@
 	* Provide heavily annotated examples/pdf-filter-tokens.cc example
 	that illustrates use of some simple token filters.
+	* When normalizing content streams, as in qdf mode, issue warnings
+	about bad tokens. Content streams are only normalized when this is
+	explicitly requested, so this has no impact on normal operation.
+	However, in qdf mode, if qpdf detects a bad token, it means that
+	either there's a bug in qpdf's lexer, that the file is damaged, or
+	that the page's contents are split in a weird way. In any of those
+	cases, qpdf could potentially damage the stream's contents by
+	replacing carriage returns with newlines or otherwise messing with
+	spaces. The most likely case of this would be an inline image's
+	compressed data being divided across two streams and having the
+	compressed data in the second stream contain a carriage return as
+	part of its binary data. If you are using qdf mode just to look at
+	PDF files in text editors, this usually doesn't matter. In cases
+	of contents split across multiple streams, coalescing streams
+	would eliminate the problem, so the warning mentions this. Prior
+	to this enhancement, the chances of qdf mode writing incorrect
+	data were already very low. This change should make it nearly
+	impossible for qdf mode to unknowingly write invalid data.
+
 2018-02-04  Jay Berkenbilt  <ejb@ql.org>
 	* Add QPDFWriter::setLinearizationPass1Filename method and
 #include <qpdf/ContentNormalizer.hh>
 #include <qpdf/QUtil.hh>
-ContentNormalizer::ContentNormalizer()
+ContentNormalizer::ContentNormalizer() :
+    any_bad_tokens(false),
+    last_token_was_bad(false)
 {
 }
@@ -15,6 +17,16 @@ ContentNormalizer::handleToken(QPDFTokenizer::Token const& token)
     std::string value = token.getRawValue();
     QPDFTokenizer::token_type_e token_type = token.getType();
+    if (token_type == QPDFTokenizer::tt_bad)
+    {
+        this->any_bad_tokens = true;
+        this->last_token_was_bad = true;
+    }
+    else if (token_type != QPDFTokenizer::tt_eof)
+    {
+        this->last_token_was_bad = false;
+    }
+
     switch (token_type)
     {
       case QPDFTokenizer::tt_space:
@@ -75,3 +87,15 @@ ContentNormalizer::handleEOF()
 {
     finish();
 }
+
+bool
+ContentNormalizer::anyBadTokens() const
+{
+    return this->any_bad_tokens;
+}
+
+bool
+ContentNormalizer::lastTokenWasBad()const
+{
+    return this->last_token_was_bad;
+}
@@ -609,6 +609,33 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline,
         }
     }
+    if (filter &&
+        (! suppress_warnings) &&
+        normalizer.getPointer() &&
+        normalizer->anyBadTokens())
+    {
+        warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
+                     "", this->offset,
+                     "content normalization encountered bad tokens"));
+        if (normalizer->lastTokenWasBad())
+        {
+            QTC::TC("qpdf", "QPDF_Stream bad token at end during normalize");
+            warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
+                         "", this->offset,
+                         "normalized content ended with a bad token;"
+                         " you may be able to resolve this by"
+                         " coalescing content streams in combination"
+                         " with normalizing content. From the command"
+                         " line, specify --coalesce-contents"));
+        }
+        warn(QPDFExc(qpdf_e_damaged_pdf, qpdf->getFilename(),
+                     "", this->offset,
+                     "Resulting stream data may be corrupted but is"
+                     " may still useful for manual inspection."
+                     " For more information on this warning, search"
+                     " for content normalization in the manual."));
+    }
+
     return filter;
 }
@@ -10,6 +10,13 @@ class ContentNormalizer: public QPDFObjectHandle::TokenFilter
     virtual ~ContentNormalizer();
     virtual void handleToken(QPDFTokenizer::Token const&);
     virtual void handleEOF();
+
+    bool anyBadTokens() const;
+    bool lastTokenWasBad() const;
+
+  private:
+    bool any_bad_tokens;
+    bool last_token_was_bad;
 };
 #endif // __CONTENTNORMALIZER_HH__
@@ -306,3 +306,4 @@ Pl_QPDFTokenizer found ID 0
 QPDFObjectHandle non-stream in stream array 0
 QPDFObjectHandle coalesce called on stream 0
 QPDFObjectHandle coalesce provide stream data 0
+QPDF_Stream bad token at end during normalize 0
@@ -737,8 +737,16 @@ $td->runtest("stream with tiff predictor",
 show_ntests();
 # ----------
 $td->notify("--- Coalesce contents ---");
-$n_tests += 4;
+$n_tests += 6;
+$td->runtest("qdf with normalize warnings",
+             {$td->COMMAND =>
+                  "qpdf --qdf --static-id coalesce.pdf a.pdf"},
+             {$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3},
+             $td->NORMALIZE_NEWLINES);
+$td->runtest("check output",
+             {$td->FILE => "a.pdf"},
+             {$td->FILE => "coalesce.qdf"});
 $td->runtest("coalesce contents with qdf",
              {$td->COMMAND =>
                   "qpdf --qdf --static-id" .
@@ -13,7 +13,9 @@ three lines
 <8a8b>
 (ab)
 <8c><dd> ) >
-<610062> (MOO)-- stream 1 --
+<610062> (MOO)WARNING: good14.pdf (file position 628): content normalization encountered bad tokens
+WARNING: good14.pdf (file position 628): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+-- stream 1 --
 This stream does end with a newline.
 // tests:
 //   bad tokens preserved
@@ -31,10 +33,18 @@ This stream does end with a newline.
 /good name
 /bad#00name
+WARNING: good14.pdf (file position 860): content normalization encountered bad tokens
+WARNING: good14.pdf (file position 860): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
 -- stream 2 --
 (This stream ends with a \001 bad token
+WARNING: good14.pdf (file position 1316): content normalization encountered bad tokens
+WARNING: good14.pdf (file position 1316): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: good14.pdf (file position 1316): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
 -- stream 3 --
-<AB X-- stream 4 --
+<AB XWARNING: good14.pdf (file position 1406): content normalization encountered bad tokens
+WARNING: good14.pdf (file position 1406): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: good14.pdf (file position 1406): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+-- stream 4 --
 (ends with a name)
 /ThisMustBeLast-- stream 5 --
 % This stream has an inline image marker that is not terminated
@@ -44,4 +54,7 @@ BI
 ID
 <506f7
 461746f>
+WARNING: good14.pdf (file position 1549): content normalization encountered bad tokens
+WARNING: good14.pdf (file position 1549): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: good14.pdf (file position 1549): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
 test 3 done
+WARNING: coalesce.pdf (file position 671): content normalization encountered bad tokens
+WARNING: coalesce.pdf (file position 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: coalesce.pdf (file position 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: coalesce.pdf (file position 823): content normalization encountered bad tokens
+WARNING: coalesce.pdf (file position 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+WARNING: coalesce.pdf (file position 962): content normalization encountered bad tokens
+WARNING: coalesce.pdf (file position 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
+WARNING: coalesce.pdf (file position 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
+qpdf: operation succeeded with warnings; resulting file may have some problems