Commit 2b6c79bcaeee0548f3d7291876eb3821e14b8227

Authored by Jay Berkenbilt
1 parent ec9e310c

Improve locating inline image's EI

We've actually seen a PDF file in the wild that contained EI
surrounded by delimiters inside the image data, which confused qpdf's
naive code. This significantly improves EI detection.
ChangeLog
  1 +2019-01-30 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Improve locating of an inline image's EI operator to correctly
  4 + handle the case of EI appearing inside the image data.
  5 +
  6 + * Very low-level QPDFTokenizer API now includes an
  7 + expectInlineImage method that takes an input stream, enabling it
  8 + to locate an inline image's EI operator better. This is called
  9 + automatically everywhere within the qpdf library. Most user code
  10 + will never have to use the low-level tokenizer API. If you use
  11 + Pl_QPDFTokenizer, this will be done automatically for you.
  12 +
1 13 2019-01-29 Jay Berkenbilt <ejb@ql.org>
2 14  
3 15 * Bug fix: when returning an inline image token, the tokenizer no
... ...
include/qpdf/QPDFTokenizer.hh
... ... @@ -198,6 +198,7 @@ class QPDFTokenizer
198 198 void resolveLiteral();
199 199 bool isSpace(char);
200 200 bool isDelimiter(char);
  201 + void findEI(PointerHolder<InputSource> input);
201 202  
202 203 enum state_e {
203 204 st_top, st_in_space, st_in_comment, st_in_string, st_lt, st_gt,
... ...
libqpdf/QPDFTokenizer.cc
... ... @@ -47,7 +47,7 @@ QPDFWordTokenFinder::check()
47 47 qpdf_offset_t pos = is->tell();
48 48 if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str)))
49 49 {
50   -/// QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
  50 + QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
51 51 return false;
52 52 }
53 53 qpdf_offset_t token_start = is->getLastOffset();
... ... @@ -65,7 +65,6 @@ QPDFWordTokenFinder::check()
65 65 is->seek(pos, SEEK_SET);
66 66 if (! next_okay)
67 67 {
68   -/// QTC::TC("qpdf", "QPDFTokenizer finder word not followed by delimiter");
69 68 return false;
70 69 }
71 70 if (token_start == 0)
... ... @@ -80,7 +79,7 @@ QPDFWordTokenFinder::check()
80 79 is->seek(pos, SEEK_SET);
81 80 if (! prev_okay)
82 81 {
83   -/// QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter");
  82 + QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter");
84 83 return false;
85 84 }
86 85 return true;
... ... @@ -687,26 +686,131 @@ QPDFTokenizer::expectInlineImage()
687 686 void
688 687 QPDFTokenizer::expectInlineImage(PointerHolder<InputSource> input)
689 688 {
690   - if (input.getPointer())
  689 + if (this->m->state != st_top)
  690 + {
  691 + throw std::logic_error("QPDFTokenizer::expectInlineImage called"
  692 + " when tokenizer is in improper state");
  693 + }
  694 + findEI(input);
  695 + this->m->state = st_inline_image;
  696 +}
  697 +
  698 +void
  699 +QPDFTokenizer::findEI(PointerHolder<InputSource> input)
  700 +{
  701 + if (! input.getPointer())
691 702 {
692   - qpdf_offset_t last_offset = input->getLastOffset();
693   - qpdf_offset_t pos = input->tell();
  703 + return;
  704 + }
  705 +
  706 + qpdf_offset_t last_offset = input->getLastOffset();
  707 + qpdf_offset_t pos = input->tell();
694 708  
  709 + // Use QPDFWordTokenFinder to find EI surrounded by delimiters.
  710 + // Then read the next several tokens or up to EOF. If we find any
  711 + // suspicious-looking tokens, this is probably still part of
  712 + // the image data, so keep looking for EI. Stop at the first EI
  713 + // that passes. If we get to the end without finding one, return
  714 + // the last EI we found. Store the number of bytes expected in the
  715 + // inline image including the EI and use that to break out of
  716 + // inline image, falling back to the old method if needed.
  717 +
  718 + bool okay = false;
  719 + bool first_try = true;
  720 + while (! okay)
  721 + {
695 722 QPDFWordTokenFinder f(input, "EI");
696   - if (input->findFirst("EI", pos, 0, f))
  723 + if (! input->findFirst("EI", input->tell(), 0, f))
697 724 {
698   - this->m->inline_image_bytes = input->tell() - pos;
  725 + break;
  726 + }
  727 + this->m->inline_image_bytes = input->tell() - pos;
  728 +
  729 + QPDFTokenizer check;
  730 + bool found_bad = false;
  731 + // Look at the next 10 tokens or up to EOF. The next inline
  732 + // image's image data would look like bad tokens, but there
  733 + // will always be at least 10 tokens between one inline
  734 + // image's EI and the next valid one's ID since width, height,
  735 + // bits per pixel, and color space are all required as well as
  736 + // a BI and ID. If we get 10 good tokens in a row or hit EOF,
  737 + // we can be pretty sure we've found the actual EI.
  738 + for (int i = 0; i < 10; ++i)
  739 + {
  740 + QPDFTokenizer::Token t =
  741 + check.readToken(input, "checker", true);
  742 + token_type_e type = t.getType();
  743 + if (type == tt_eof)
  744 + {
  745 + okay = true;
  746 + }
  747 + else if (type == tt_bad)
  748 + {
  749 + found_bad = true;
  750 + }
  751 + else if (type == tt_word)
  752 + {
  753 + // The qpdf tokenizer lumps alphabetic and otherwise
  754 + // uncategorized characters into "words". We recognize
  755 + // strings of alphabetic characters as potential valid
  756 + // operators for purposes of telling whether we're in
  757 + // valid content or not. It's not perfect, but it
  758 + // should work more reliably than what we used to do,
  759 + // which was already good enough for the vast majority
  760 + // of files.
  761 + bool found_alpha = false;
  762 + bool found_non_printable = false;
  763 + bool found_other = false;
  764 + std::string value = t.getValue();
  765 + for (std::string::iterator iter = value.begin();
  766 + iter != value.end(); ++iter)
  767 + {
  768 + char ch = *iter;
  769 + if (((ch >= 'a') && (ch <= 'z')) ||
  770 + ((ch >= 'A') && (ch <= 'Z')) ||
  771 + (ch == '*'))
  772 + {
  773 + // Treat '*' as alpha since there are valid
  774 + // PDF operators that contain * along with
  775 + // alphabetic characters.
  776 + found_alpha = true;
  777 + }
  778 + else if (((ch < 32) && (! isSpace(ch))) || (ch > 127))
  779 + {
  780 + found_non_printable = true;
  781 + break;
  782 + }
  783 + else
  784 + {
  785 + found_other = true;
  786 + }
  787 + }
  788 + if (found_non_printable || (found_alpha && found_other))
  789 + {
  790 + found_bad = true;
  791 + }
  792 + }
  793 + if (okay || found_bad)
  794 + {
  795 + break;
  796 + }
  797 + }
  798 + if (! found_bad)
  799 + {
  800 + okay = true;
  801 + }
  802 + if (! okay)
  803 + {
  804 + first_try = false;
699 805 }
700   -
701   - input->seek(pos, SEEK_SET);
702   - input->setLastOffset(last_offset);
703 806 }
704   - if (this->m->state != st_top)
  807 + if (okay && (! first_try))
705 808 {
706   - throw std::logic_error("QPDFTokenizer::expectInlineImage called"
707   - " when tokenizer is in improper state");
  809 + QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try");
708 810 }
709   - this->m->state = st_inline_image;
  811 +
  812 + input->seek(pos, SEEK_SET);
  813 + input->setLastOffset(last_offset);
710 814 }
711 815  
712 816 bool
... ...
qpdf/qpdf.testcov
... ... @@ -430,6 +430,9 @@ QPDFPageObjectHelper copy shared attribute 0
430 430 qpdf from_nr from repeat_nr 0
431 431 QPDF resolve duplicated page object 0
432 432 QPDF handle direct page object 0
  433 +QPDFTokenizer finder found wrong word 0
  434 +QPDFTokenizer finder word not preceded by delimiter 0
433 435 QPDFTokenizer found EI the old way 0
434 436 QPDFTokenizer found EI by byte count 0
435 437 QPDFTokenizer inline image at EOF the old way 0
  438 +QPDFTokenizer found EI after more than one try 0
... ...
qpdf/qtest/qpdf.test
... ... @@ -693,6 +693,26 @@ $td->runtest("check pass1 file",
693 693  
694 694 show_ntests();
695 695 # ----------
  696 +$td->notify("--- Inline Images ---");
  697 +$n_tests += 2;
  698 +
  699 +# The file large-inline-image.pdf is a hand-crafted file with several
  700 +# inline images of various sizes including one that is two megabytes,
  701 +# encoded in base85, and has a base85-encoding that contains EI
  702 +# surrounded by delimiters several times. This exercises the EI
  703 +# detection code added in qpdf 8.4.
  704 +
  705 +$td->runtest("complex inline image parsing",
  706 + {$td->COMMAND =>
  707 + "qpdf --qdf --static-id large-inline-image.pdf a.pdf"},
  708 + {$td->STRING => "", $td->EXIT_STATUS => 0},
  709 + $td->NORMALIZE_NEWLINES);
  710 +$td->runtest("check output",
  711 + {$td->FILE => "a.pdf"},
  712 + {$td->FILE => "large-inline-image.qdf"});
  713 +
  714 +show_ntests();
  715 +# ----------
696 716 $td->notify("--- Tokenizer ---");
697 717 $n_tests += 5;
698 718  
... ...
qpdf/qtest/qpdf/large-inline-image.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/large-inline-image.qdf 0 → 100644
No preview for this file type