Commit 2b6c79bcaeee0548f3d7291876eb3821e14b8227
1 parent
ec9e310c
Improve locating inline image's EI
We've actually seen a PDF file in the wild that contained EI surrounded by delimiters inside the image data, which confused qpdf's naive code. This significantly improves EI detection.
Showing
7 changed files
with
155 additions
and
15 deletions
ChangeLog
| 1 | +2019-01-30 Jay Berkenbilt <ejb@ql.org> | ||
| 2 | + | ||
| 3 | + * Improve locating of an inline image's EI operator to correctly | ||
| 4 | + handle the case of EI appearing inside the image data. | ||
| 5 | + | ||
| 6 | + * Very low-level QPDFTokenizer API now includes an | ||
| 7 | + expectInlineImage method that takes an input stream, enabling it | ||
| 8 | + to locate an inline image's EI operator better. This is called | ||
| 9 | + automatically everywhere within the qpdf library. Most user code | ||
| 10 | + will never have to use the low-level tokenizer API. If you use | ||
| 11 | + Pl_QPDFTokenizer, this will be done automatically for you. | ||
| 12 | + | ||
| 1 | 2019-01-29 Jay Berkenbilt <ejb@ql.org> | 13 | 2019-01-29 Jay Berkenbilt <ejb@ql.org> |
| 2 | 14 | ||
| 3 | * Bug fix: when returning an inline image token, the tokenizer no | 15 | * Bug fix: when returning an inline image token, the tokenizer no |
include/qpdf/QPDFTokenizer.hh
| @@ -198,6 +198,7 @@ class QPDFTokenizer | @@ -198,6 +198,7 @@ class QPDFTokenizer | ||
| 198 | void resolveLiteral(); | 198 | void resolveLiteral(); |
| 199 | bool isSpace(char); | 199 | bool isSpace(char); |
| 200 | bool isDelimiter(char); | 200 | bool isDelimiter(char); |
| 201 | + void findEI(PointerHolder<InputSource> input); | ||
| 201 | 202 | ||
| 202 | enum state_e { | 203 | enum state_e { |
| 203 | st_top, st_in_space, st_in_comment, st_in_string, st_lt, st_gt, | 204 | st_top, st_in_space, st_in_comment, st_in_string, st_lt, st_gt, |
libqpdf/QPDFTokenizer.cc
| @@ -47,7 +47,7 @@ QPDFWordTokenFinder::check() | @@ -47,7 +47,7 @@ QPDFWordTokenFinder::check() | ||
| 47 | qpdf_offset_t pos = is->tell(); | 47 | qpdf_offset_t pos = is->tell(); |
| 48 | if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) | 48 | if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) |
| 49 | { | 49 | { |
| 50 | -/// QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); | 50 | + QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); |
| 51 | return false; | 51 | return false; |
| 52 | } | 52 | } |
| 53 | qpdf_offset_t token_start = is->getLastOffset(); | 53 | qpdf_offset_t token_start = is->getLastOffset(); |
| @@ -65,7 +65,6 @@ QPDFWordTokenFinder::check() | @@ -65,7 +65,6 @@ QPDFWordTokenFinder::check() | ||
| 65 | is->seek(pos, SEEK_SET); | 65 | is->seek(pos, SEEK_SET); |
| 66 | if (! next_okay) | 66 | if (! next_okay) |
| 67 | { | 67 | { |
| 68 | -/// QTC::TC("qpdf", "QPDFTokenizer finder word not followed by delimiter"); | ||
| 69 | return false; | 68 | return false; |
| 70 | } | 69 | } |
| 71 | if (token_start == 0) | 70 | if (token_start == 0) |
| @@ -80,7 +79,7 @@ QPDFWordTokenFinder::check() | @@ -80,7 +79,7 @@ QPDFWordTokenFinder::check() | ||
| 80 | is->seek(pos, SEEK_SET); | 79 | is->seek(pos, SEEK_SET); |
| 81 | if (! prev_okay) | 80 | if (! prev_okay) |
| 82 | { | 81 | { |
| 83 | -/// QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter"); | 82 | + QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter"); |
| 84 | return false; | 83 | return false; |
| 85 | } | 84 | } |
| 86 | return true; | 85 | return true; |
| @@ -687,26 +686,131 @@ QPDFTokenizer::expectInlineImage() | @@ -687,26 +686,131 @@ QPDFTokenizer::expectInlineImage() | ||
| 687 | void | 686 | void |
| 688 | QPDFTokenizer::expectInlineImage(PointerHolder<InputSource> input) | 687 | QPDFTokenizer::expectInlineImage(PointerHolder<InputSource> input) |
| 689 | { | 688 | { |
| 690 | - if (input.getPointer()) | 689 | + if (this->m->state != st_top) |
| 690 | + { | ||
| 691 | + throw std::logic_error("QPDFTokenizer::expectInlineImage called" | ||
| 692 | + " when tokenizer is in improper state"); | ||
| 693 | + } | ||
| 694 | + findEI(input); | ||
| 695 | + this->m->state = st_inline_image; | ||
| 696 | +} | ||
| 697 | + | ||
| 698 | +void | ||
| 699 | +QPDFTokenizer::findEI(PointerHolder<InputSource> input) | ||
| 700 | +{ | ||
| 701 | + if (! input.getPointer()) | ||
| 691 | { | 702 | { |
| 692 | - qpdf_offset_t last_offset = input->getLastOffset(); | ||
| 693 | - qpdf_offset_t pos = input->tell(); | 703 | + return; |
| 704 | + } | ||
| 705 | + | ||
| 706 | + qpdf_offset_t last_offset = input->getLastOffset(); | ||
| 707 | + qpdf_offset_t pos = input->tell(); | ||
| 694 | 708 | ||
| 709 | + // Use QPDFWordTokenFinder to find EI surrounded by delimiters. | ||
| 710 | + // Then read the next several tokens or up to EOF. If we find any | ||
| 711 | + // suspicious-looking or bad tokens, this is probably still part of | ||
| 712 | + // the image data, so keep looking for EI. Stop at the first EI | ||
| 713 | + // that passes. If we get to the end without finding one, return | ||
| 714 | + // the last EI we found. Store the number of bytes expected in the | ||
| 715 | + // inline image including the EI and use that to break out of | ||
| 716 | + // inline image, falling back to the old method if needed. | ||
| 717 | + | ||
| 718 | + bool okay = false; | ||
| 719 | + bool first_try = true; | ||
| 720 | + while (! okay) | ||
| 721 | + { | ||
| 695 | QPDFWordTokenFinder f(input, "EI"); | 722 | QPDFWordTokenFinder f(input, "EI"); |
| 696 | - if (input->findFirst("EI", pos, 0, f)) | 723 | + if (! input->findFirst("EI", input->tell(), 0, f)) |
| 697 | { | 724 | { |
| 698 | - this->m->inline_image_bytes = input->tell() - pos; | 725 | + break; |
| 726 | + } | ||
| 727 | + this->m->inline_image_bytes = input->tell() - pos; | ||
| 728 | + | ||
| 729 | + QPDFTokenizer check; | ||
| 730 | + bool found_bad = false; | ||
| 731 | + // Look at the next 10 tokens or up to EOF. The next inline | ||
| 732 | + // image's image data would look like bad tokens, but there | ||
| 733 | + // will always be at least 10 tokens between one inline | ||
| 734 | + // image's EI and the next valid one's ID since width, height, | ||
| 735 | + // bits per component, and color space are all required as well as | ||
| 736 | + // a BI and ID. If we get 10 good tokens in a row or hit EOF, | ||
| 737 | + // we can be pretty sure we've found the actual EI. | ||
| 738 | + for (int i = 0; i < 10; ++i) | ||
| 739 | + { | ||
| 740 | + QPDFTokenizer::Token t = | ||
| 741 | + check.readToken(input, "checker", true); | ||
| 742 | + token_type_e type = t.getType(); | ||
| 743 | + if (type == tt_eof) | ||
| 744 | + { | ||
| 745 | + okay = true; | ||
| 746 | + } | ||
| 747 | + else if (type == tt_bad) | ||
| 748 | + { | ||
| 749 | + found_bad = true; | ||
| 750 | + } | ||
| 751 | + else if (type == tt_word) | ||
| 752 | + { | ||
| 753 | + // The qpdf tokenizer lumps alphabetic and otherwise | ||
| 754 | + // uncategorized characters into "words". We recognize | ||
| 755 | + // strings of alphabetic characters as potential valid | ||
| 756 | + // operators for purposes of telling whether we're in | ||
| 757 | + // valid content or not. It's not perfect, but it | ||
| 758 | + // should work more reliably than what we used to do, | ||
| 759 | + // which was already good enough for the vast majority | ||
| 760 | + // of files. | ||
| 761 | + bool found_alpha = false; | ||
| 762 | + bool found_non_printable = false; | ||
| 763 | + bool found_other = false; | ||
| 764 | + std::string value = t.getValue(); | ||
| 765 | + for (std::string::iterator iter = value.begin(); | ||
| 766 | + iter != value.end(); ++iter) | ||
| 767 | + { | ||
| 768 | + char ch = *iter; | ||
| 769 | + if (((ch >= 'a') && (ch <= 'z')) || | ||
| 770 | + ((ch >= 'A') && (ch <= 'Z')) || | ||
| 771 | + (ch == '*')) | ||
| 772 | + { | ||
| 773 | + // Treat '*' as alpha since there are valid | ||
| 774 | + // PDF operators that contain * along with | ||
| 775 | + // alphabetic characters. | ||
| 776 | + found_alpha = true; | ||
| 777 | + } | ||
| 778 | + else if (((ch < 32) && (! isSpace(ch))) || (ch > 127)) | ||
| 779 | + { | ||
| 780 | + found_non_printable = true; | ||
| 781 | + break; | ||
| 782 | + } | ||
| 783 | + else | ||
| 784 | + { | ||
| 785 | + found_other = true; | ||
| 786 | + } | ||
| 787 | + } | ||
| 788 | + if (found_non_printable || (found_alpha && found_other)) | ||
| 789 | + { | ||
| 790 | + found_bad = true; | ||
| 791 | + } | ||
| 792 | + } | ||
| 793 | + if (okay || found_bad) | ||
| 794 | + { | ||
| 795 | + break; | ||
| 796 | + } | ||
| 797 | + } | ||
| 798 | + if (! found_bad) | ||
| 799 | + { | ||
| 800 | + okay = true; | ||
| 801 | + } | ||
| 802 | + if (! okay) | ||
| 803 | + { | ||
| 804 | + first_try = false; | ||
| 699 | } | 805 | } |
| 700 | - | ||
| 701 | - input->seek(pos, SEEK_SET); | ||
| 702 | - input->setLastOffset(last_offset); | ||
| 703 | } | 806 | } |
| 704 | - if (this->m->state != st_top) | 807 | + if (okay && (! first_try)) |
| 705 | { | 808 | { |
| 706 | - throw std::logic_error("QPDFTokenizer::expectInlineImage called" | ||
| 707 | - " when tokenizer is in improper state"); | 809 | + QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try"); |
| 708 | } | 810 | } |
| 709 | - this->m->state = st_inline_image; | 811 | + |
| 812 | + input->seek(pos, SEEK_SET); | ||
| 813 | + input->setLastOffset(last_offset); | ||
| 710 | } | 814 | } |
| 711 | 815 | ||
| 712 | bool | 816 | bool |
qpdf/qpdf.testcov
| @@ -430,6 +430,9 @@ QPDFPageObjectHelper copy shared attribute 0 | @@ -430,6 +430,9 @@ QPDFPageObjectHelper copy shared attribute 0 | ||
| 430 | qpdf from_nr from repeat_nr 0 | 430 | qpdf from_nr from repeat_nr 0 |
| 431 | QPDF resolve duplicated page object 0 | 431 | QPDF resolve duplicated page object 0 |
| 432 | QPDF handle direct page object 0 | 432 | QPDF handle direct page object 0 |
| 433 | +QPDFTokenizer finder found wrong word 0 | ||
| 434 | +QPDFTokenizer finder word not preceded by delimiter 0 | ||
| 433 | QPDFTokenizer found EI the old way 0 | 435 | QPDFTokenizer found EI the old way 0 |
| 434 | QPDFTokenizer found EI by byte count 0 | 436 | QPDFTokenizer found EI by byte count 0 |
| 435 | QPDFTokenizer inline image at EOF the old way 0 | 437 | QPDFTokenizer inline image at EOF the old way 0 |
| 438 | +QPDFTokenizer found EI after more than one try 0 |
qpdf/qtest/qpdf.test
| @@ -693,6 +693,26 @@ $td->runtest("check pass1 file", | @@ -693,6 +693,26 @@ $td->runtest("check pass1 file", | ||
| 693 | 693 | ||
| 694 | show_ntests(); | 694 | show_ntests(); |
| 695 | # ---------- | 695 | # ---------- |
| 696 | +$td->notify("--- Inline Images ---"); | ||
| 697 | +$n_tests += 2; | ||
| 698 | + | ||
| 699 | +# The file large-inline-image.pdf is a hand-crafted file with several | ||
| 700 | +# inline images of various sizes including one that is two megabytes, | ||
| 701 | +# encoded in base85, and has a base85-encoding that contains EI | ||
| 702 | +# surrounded by delimiters several times. This exercises the EI | ||
| 703 | +# detection code added in qpdf 8.4. | ||
| 704 | + | ||
| 705 | +$td->runtest("complex inline image parsing", | ||
| 706 | + {$td->COMMAND => | ||
| 707 | + "qpdf --qdf --static-id large-inline-image.pdf a.pdf"}, | ||
| 708 | + {$td->STRING => "", $td->EXIT_STATUS => 0}, | ||
| 709 | + $td->NORMALIZE_NEWLINES); | ||
| 710 | +$td->runtest("check output", | ||
| 711 | + {$td->FILE => "a.pdf"}, | ||
| 712 | + {$td->FILE => "large-inline-image.qdf"}); | ||
| 713 | + | ||
| 714 | +show_ntests(); | ||
| 715 | +# ---------- | ||
| 696 | $td->notify("--- Tokenizer ---"); | 716 | $td->notify("--- Tokenizer ---"); |
| 697 | $n_tests += 5; | 717 | $n_tests += 5; |
| 698 | 718 |
qpdf/qtest/qpdf/large-inline-image.pdf
0 → 100644
No preview for this file type
qpdf/qtest/qpdf/large-inline-image.qdf
0 → 100644
No preview for this file type