Improve locating inline image's EI

We've actually seen a PDF file in the wild that contained EI surrounded by delimiters inside the image data, which confused qpdf's naive code. This significantly improves EI detection.

Improve locating inline image's EI
We've actually seen a PDF file in the wild that contained EI surrounded by delimiters inside the image data, which confused qpdf's naive code. This significantly improves EI detection.
Jay Berkenbilt
1 parent ec9e310c
Showing 7 changed files with 155 additions and 15 deletions
ChangeLog
include/qpdf/QPDFTokenizer.hh
libqpdf/QPDFTokenizer.cc
qpdf/qpdf.testcov
qpdf/qtest/qpdf.test
qpdf/qtest/qpdf/large-inline-image.pdf
qpdf/qtest/qpdf/large-inline-image.qdf
+2019-01-30  Jay Berkenbilt  <ejb@ql.org>
+
+	* Improve locating of an inline image's EI operator to correctly
+	handle the case of EI appearing inside the image data.
+
+	* Very low-level QPDFTokenizer API now includes an
+	expectInlineImage method that takes an input stream, enabling it
+	to locate an inline image's EI operator better. This is called
+	automatically everywhere within the qpdf library. Most user code
+	will never have to use the low-level tokenizer API. If you use
+	Pl_QPDFTokenizer, this will be done automatically for you.
+
 2019-01-29  Jay Berkenbilt  <ejb@ql.org>
 	* Bug fix: when returning an inline image token, the tokenizer no
@@ -198,6 +198,7 @@ class QPDFTokenizer
     void resolveLiteral();
     bool isSpace(char);
     bool isDelimiter(char);
+    void findEI(PointerHolder<InputSource> input);
     enum state_e {
         st_top, st_in_space, st_in_comment, st_in_string, st_lt, st_gt,
@@ -47,7 +47,7 @@ QPDFWordTokenFinder::check()
     qpdf_offset_t pos = is->tell();
     if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str)))
     {
-///        QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
+        QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
         return false;
     }
     qpdf_offset_t token_start = is->getLastOffset();
@@ -65,7 +65,6 @@ QPDFWordTokenFinder::check()
     is->seek(pos, SEEK_SET);
     if (! next_okay)
     {
-///        QTC::TC("qpdf", "QPDFTokenizer finder word not followed by delimiter");
         return false;
     }
     if (token_start == 0)
@@ -80,7 +79,7 @@ QPDFWordTokenFinder::check()
     is->seek(pos, SEEK_SET);
     if (! prev_okay)
     {
-///        QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter");
+        QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter");
         return false;
     }
     return true;
@@ -687,26 +686,131 @@ QPDFTokenizer::expectInlineImage()
 void
 QPDFTokenizer::expectInlineImage(PointerHolder<InputSource> input)
 {
-    if (input.getPointer())
+    if (this->m->state != st_top)
+    {
+        throw std::logic_error("QPDFTokenizer::expectInlineImage called"
+                               " when tokenizer is in improper state");
+    }
+    findEI(input);
+    this->m->state = st_inline_image;
+}
+
+void
+QPDFTokenizer::findEI(PointerHolder<InputSource> input)
+{
+    if (! input.getPointer())
     {
-        qpdf_offset_t last_offset = input->getLastOffset();
-        qpdf_offset_t pos = input->tell();
+        return;
+    }
+
+    qpdf_offset_t last_offset = input->getLastOffset();
+    qpdf_offset_t pos = input->tell();
+    // Use QPDFWordTokenFinder to find EI surrounded by delimiters.
+    // Then read the next several tokens or up to EOF. If we find any
+    // suspicious-looking or bad tokens, this is probably still part of
+    // the image data, so keep looking for EI. Stop at the first EI
+    // that passes. If we get to the end without finding one, return
+    // the last EI we found. Store the number of bytes expected in the
+    // inline image including the EI and use that to break out of
+    // inline image, falling back to the old method if needed.
+
+    bool okay = false;
+    bool first_try = true;
+    while (! okay)
+    {
         QPDFWordTokenFinder f(input, "EI");
-        if (input->findFirst("EI", pos, 0, f))
+        if (! input->findFirst("EI", input->tell(), 0, f))
         {
-            this->m->inline_image_bytes = input->tell() - pos;
+            break;
+        }
+        this->m->inline_image_bytes = input->tell() - pos;
+
+        QPDFTokenizer check;
+        bool found_bad = false;
+        // Look at the next 10 tokens or up to EOF. The next inline
+        // image's image data would look like bad tokens, but there
+        // will always be at least 10 tokens between one inline
+        // image's EI and the next valid one's ID since width, height,
+        // bits per pixel, and color space are all required as well as
+        // a BI and ID. If we get 10 good tokens in a row or hit EOF,
+        // we can be pretty sure we've found the actual EI.
+        for (int i = 0; i < 10; ++i)
+        {
+            QPDFTokenizer::Token t =
+                check.readToken(input, "checker", true);
+            token_type_e type = t.getType();
+            if (type == tt_eof)
+            {
+                okay = true;
+            }
+            else if (type == tt_bad)
+            {
+                found_bad = true;
+            }
+            else if (type == tt_word)
+            {
+                // The qpdf tokenizer lumps alphabetic and otherwise
+                // uncategorized characters into "words". We recognize
+                // strings of alphabetic characters as potential valid
+                // operators for purposes of telling whether we're in
+                // valid content or not. It's not perfect, but it
+                // should work more reliably than what we used to do,
+                // which was already good enough for the vast majority
+                // of files.
+                bool found_alpha = false;
+                bool found_non_printable = false;
+                bool found_other = false;
+                std::string value = t.getValue();
+                for (std::string::iterator iter = value.begin();
+                     iter != value.end(); ++iter)
+                {
+                    char ch = *iter;
+                    if (((ch >= 'a') && (ch <= 'z')) ||
+                        ((ch >= 'A') && (ch <= 'Z')) ||
+                        (ch == '*'))
+                    {
+                        // Treat '*' as alpha since there are valid
+                        // PDF operators that contain * along with
+                        // alphabetic characters.
+                        found_alpha = true;
+                    }
+                    else if (((ch < 32) && (! isSpace(ch))) || (ch > 127))
+                    {
+                        found_non_printable = true;
+                        break;
+                    }
+                    else
+                    {
+                        found_other = true;
+                    }
+                }
+                if (found_non_printable || (found_alpha && found_other))
+                {
+                    found_bad = true;
+                }
+            }
+            if (okay || found_bad)
+            {
+                break;
+            }
+        }
+        if (! found_bad)
+        {
+            okay = true;
+        }
+        if (! okay)
+        {
+            first_try = false;
         }
-
-        input->seek(pos, SEEK_SET);
-        input->setLastOffset(last_offset);
     }
-    if (this->m->state != st_top)
+    if (okay && (! first_try))
     {
-        throw std::logic_error("QPDFTokenizer::expectInlineImage called"
-                               " when tokenizer is in improper state");
+        QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try");
     }
-    this->m->state = st_inline_image;
+
+    input->seek(pos, SEEK_SET);
+    input->setLastOffset(last_offset);
 }
 bool
@@ -430,6 +430,9 @@ QPDFPageObjectHelper copy shared attribute 0
 qpdf from_nr from repeat_nr 0
 QPDF resolve duplicated page object 0
 QPDF handle direct page object 0
+QPDFTokenizer finder found wrong word 0
+QPDFTokenizer finder word not preceded by delimiter 0
 QPDFTokenizer found EI the old way 0
 QPDFTokenizer found EI by byte count 0
 QPDFTokenizer inline image at EOF the old way 0
+QPDFTokenizer found EI after more than one try 0
@@ -693,6 +693,26 @@ $td->runtest("check pass1 file",
 show_ntests();
 # ----------
+$td->notify("--- Inline Images ---");
+$n_tests += 2;
+
+# The file large-inline-image.pdf is a hand-crafted file with several
+# inline images of various sizes including one that is two megabytes,
+# encoded in base85, and has a base85-encoding that contains EI
+# surrounded by delimiters several times. This exercises the EI
+# detection code added in qpdf 8.4.
+
+$td->runtest("complex inline image parsing",
+             {$td->COMMAND =>
+                  "qpdf --qdf --static-id large-inline-image.pdf a.pdf"},
+             {$td->STRING => "", $td->EXIT_STATUS => 0},
+             $td->NORMALIZE_NEWLINES);
+$td->runtest("check output",
+	     {$td->FILE => "a.pdf"},
+	     {$td->FILE => "large-inline-image.qdf"});
+
+show_ntests();
+# ----------
 $td->notify("--- Tokenizer ---");
 $n_tests += 5;