Commit 2b6c79bcaeee0548f3d7291876eb3821e14b8227
1 parent
ec9e310c
Improve locating inline image's EI
We've actually seen a PDF file in the wild that contained EI surrounded by delimiters inside the image data, which confused qpdf's naive code. This significantly improves EI detection.
Showing
7 changed files
with
155 additions
and
15 deletions
ChangeLog
| 1 | +2019-01-30 Jay Berkenbilt <ejb@ql.org> | |
| 2 | + | |
| 3 | + * Improve locating of an inline image's EI operator to correctly | |
| 4 | + handle the case of EI appearing inside the image data. | |
| 5 | + | |
| 6 | + * Very low-level QPDFTokenizer API now includes an | |
| 7 | + expectInlineImage method that takes an input stream, enabling it | |
| 8 | + to locate an inline image's EI operator better. This is called | |
| 9 | + automatically everywhere within the qpdf library. Most user code | |
| 10 | + will never have to use the low-level tokenizer API. If you use | |
| 11 | + Pl_QPDFTokenizer, this will be done automatically for you. | |
| 12 | + | |
| 1 | 13 | 2019-01-29 Jay Berkenbilt <ejb@ql.org> |
| 2 | 14 | |
| 3 | 15 | * Bug fix: when returning an inline image token, the tokenizer no | ... | ... |
include/qpdf/QPDFTokenizer.hh
| ... | ... | @@ -198,6 +198,7 @@ class QPDFTokenizer |
| 198 | 198 | void resolveLiteral(); |
| 199 | 199 | bool isSpace(char); |
| 200 | 200 | bool isDelimiter(char); |
| 201 | + void findEI(PointerHolder<InputSource> input); | |
| 201 | 202 | |
| 202 | 203 | enum state_e { |
| 203 | 204 | st_top, st_in_space, st_in_comment, st_in_string, st_lt, st_gt, | ... | ... |
libqpdf/QPDFTokenizer.cc
| ... | ... | @@ -47,7 +47,7 @@ QPDFWordTokenFinder::check() |
| 47 | 47 | qpdf_offset_t pos = is->tell(); |
| 48 | 48 | if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) |
| 49 | 49 | { |
| 50 | -/// QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); | |
| 50 | + QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); | |
| 51 | 51 | return false; |
| 52 | 52 | } |
| 53 | 53 | qpdf_offset_t token_start = is->getLastOffset(); |
| ... | ... | @@ -65,7 +65,6 @@ QPDFWordTokenFinder::check() |
| 65 | 65 | is->seek(pos, SEEK_SET); |
| 66 | 66 | if (! next_okay) |
| 67 | 67 | { |
| 68 | -/// QTC::TC("qpdf", "QPDFTokenizer finder word not followed by delimiter"); | |
| 69 | 68 | return false; |
| 70 | 69 | } |
| 71 | 70 | if (token_start == 0) |
| ... | ... | @@ -80,7 +79,7 @@ QPDFWordTokenFinder::check() |
| 80 | 79 | is->seek(pos, SEEK_SET); |
| 81 | 80 | if (! prev_okay) |
| 82 | 81 | { |
| 83 | -/// QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter"); | |
| 82 | + QTC::TC("qpdf", "QPDFTokenizer finder word not preceded by delimiter"); | |
| 84 | 83 | return false; |
| 85 | 84 | } |
| 86 | 85 | return true; |
| ... | ... | @@ -687,26 +686,131 @@ QPDFTokenizer::expectInlineImage() |
| 687 | 686 | void |
| 688 | 687 | QPDFTokenizer::expectInlineImage(PointerHolder<InputSource> input) |
| 689 | 688 | { |
| 690 | - if (input.getPointer()) | |
| 689 | + if (this->m->state != st_top) | |
| 690 | + { | |
| 691 | + throw std::logic_error("QPDFTokenizer::expectInlineImage called" | |
| 692 | + " when tokenizer is in improper state"); | |
| 693 | + } | |
| 694 | + findEI(input); | |
| 695 | + this->m->state = st_inline_image; | |
| 696 | +} | |
| 697 | + | |
| 698 | +void | |
| 699 | +QPDFTokenizer::findEI(PointerHolder<InputSource> input) | |
| 700 | +{ | |
| 701 | + if (! input.getPointer()) | |
| 691 | 702 | { |
| 692 | - qpdf_offset_t last_offset = input->getLastOffset(); | |
| 693 | - qpdf_offset_t pos = input->tell(); | |
| 703 | + return; | |
| 704 | + } | |
| 705 | + | |
| 706 | + qpdf_offset_t last_offset = input->getLastOffset(); | |
| 707 | + qpdf_offset_t pos = input->tell(); | |
| 694 | 708 | |
| 709 | + // Use QPDFWordTokenFinder to find EI surrounded by delimiters. | |
| 710 | + // Then read the next several tokens or up to EOF. If we find any | |
| 711 | + // suspicious-looking or bad tokens, this is probably still part of | |
| 712 | + // the image data, so keep looking for EI. Stop at the first EI | |
| 713 | + // that passes. If we get to the end without finding one, return | |
| 714 | + // the last EI we found. Store the number of bytes expected in the | |
| 715 | + // inline image including the EI and use that to break out of | |
| 716 | + // inline image, falling back to the old method if needed. | |
| 717 | + | |
| 718 | + bool okay = false; | |
| 719 | + bool first_try = true; | |
| 720 | + while (! okay) | |
| 721 | + { | |
| 695 | 722 | QPDFWordTokenFinder f(input, "EI"); |
| 696 | - if (input->findFirst("EI", pos, 0, f)) | |
| 723 | + if (! input->findFirst("EI", input->tell(), 0, f)) | |
| 697 | 724 | { |
| 698 | - this->m->inline_image_bytes = input->tell() - pos; | |
| 725 | + break; | |
| 726 | + } | |
| 727 | + this->m->inline_image_bytes = input->tell() - pos; | |
| 728 | + | |
| 729 | + QPDFTokenizer check; | |
| 730 | + bool found_bad = false; | |
| 731 | + // Look at the next 10 tokens or up to EOF. The next inline | |
| 732 | + // image's image data would look like bad tokens, but there | |
| 733 | + // will always be at least 10 tokens between one inline | |
| 734 | + // image's EI and the next valid one's ID since width, height, | |
| 735 | + // bits per component, and color space are all required as well as | |
| 736 | + // a BI and ID. If we get 10 good tokens in a row or hit EOF, | |
| 737 | + // we can be pretty sure we've found the actual EI. | |
| 738 | + for (int i = 0; i < 10; ++i) | |
| 739 | + { | |
| 740 | + QPDFTokenizer::Token t = | |
| 741 | + check.readToken(input, "checker", true); | |
| 742 | + token_type_e type = t.getType(); | |
| 743 | + if (type == tt_eof) | |
| 744 | + { | |
| 745 | + okay = true; | |
| 746 | + } | |
| 747 | + else if (type == tt_bad) | |
| 748 | + { | |
| 749 | + found_bad = true; | |
| 750 | + } | |
| 751 | + else if (type == tt_word) | |
| 752 | + { | |
| 753 | + // The qpdf tokenizer lumps alphabetic and otherwise | |
| 754 | + // uncategorized characters into "words". We recognize | |
| 755 | + // strings of alphabetic characters as potential valid | |
| 756 | + // operators for purposes of telling whether we're in | |
| 757 | + // valid content or not. It's not perfect, but it | |
| 758 | + // should work more reliably than what we used to do, | |
| 759 | + // which was already good enough for the vast majority | |
| 760 | + // of files. | |
| 761 | + bool found_alpha = false; | |
| 762 | + bool found_non_printable = false; | |
| 763 | + bool found_other = false; | |
| 764 | + std::string value = t.getValue(); | |
| 765 | + for (std::string::iterator iter = value.begin(); | |
| 766 | + iter != value.end(); ++iter) | |
| 767 | + { | |
| 768 | + char ch = *iter; | |
| 769 | + if (((ch >= 'a') && (ch <= 'z')) || | |
| 770 | + ((ch >= 'A') && (ch <= 'Z')) || | |
| 771 | + (ch == '*')) | |
| 772 | + { | |
| 773 | + // Treat '*' as alpha since there are valid | |
| 774 | + // PDF operators that contain * along with | |
| 775 | + // alphabetic characters. | |
| 776 | + found_alpha = true; | |
| 777 | + } | |
| 778 | + else if (((ch < 32) && (! isSpace(ch))) || (ch > 127)) | |
| 779 | + { | |
| 780 | + found_non_printable = true; | |
| 781 | + break; | |
| 782 | + } | |
| 783 | + else | |
| 784 | + { | |
| 785 | + found_other = true; | |
| 786 | + } | |
| 787 | + } | |
| 788 | + if (found_non_printable || (found_alpha && found_other)) | |
| 789 | + { | |
| 790 | + found_bad = true; | |
| 791 | + } | |
| 792 | + } | |
| 793 | + if (okay || found_bad) | |
| 794 | + { | |
| 795 | + break; | |
| 796 | + } | |
| 797 | + } | |
| 798 | + if (! found_bad) | |
| 799 | + { | |
| 800 | + okay = true; | |
| 801 | + } | |
| 802 | + if (! okay) | |
| 803 | + { | |
| 804 | + first_try = false; | |
| 699 | 805 | } |
| 700 | - | |
| 701 | - input->seek(pos, SEEK_SET); | |
| 702 | - input->setLastOffset(last_offset); | |
| 703 | 806 | } |
| 704 | - if (this->m->state != st_top) | |
| 807 | + if (okay && (! first_try)) | |
| 705 | 808 | { |
| 706 | - throw std::logic_error("QPDFTokenizer::expectInlineImage called" | |
| 707 | - " when tokenizer is in improper state"); | |
| 809 | + QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try"); | |
| 708 | 810 | } |
| 709 | - this->m->state = st_inline_image; | |
| 811 | + | |
| 812 | + input->seek(pos, SEEK_SET); | |
| 813 | + input->setLastOffset(last_offset); | |
| 710 | 814 | } |
| 711 | 815 | |
| 712 | 816 | bool | ... | ... |
qpdf/qpdf.testcov
| ... | ... | @@ -430,6 +430,9 @@ QPDFPageObjectHelper copy shared attribute 0 |
| 430 | 430 | qpdf from_nr from repeat_nr 0 |
| 431 | 431 | QPDF resolve duplicated page object 0 |
| 432 | 432 | QPDF handle direct page object 0 |
| 433 | +QPDFTokenizer finder found wrong word 0 | |
| 434 | +QPDFTokenizer finder word not preceded by delimiter 0 | |
| 433 | 435 | QPDFTokenizer found EI the old way 0 |
| 434 | 436 | QPDFTokenizer found EI by byte count 0 |
| 435 | 437 | QPDFTokenizer inline image at EOF the old way 0 |
| 438 | +QPDFTokenizer found EI after more than one try 0 | ... | ... |
qpdf/qtest/qpdf.test
| ... | ... | @@ -693,6 +693,26 @@ $td->runtest("check pass1 file", |
| 693 | 693 | |
| 694 | 694 | show_ntests(); |
| 695 | 695 | # ---------- |
| 696 | +$td->notify("--- Inline Images ---"); | |
| 697 | +$n_tests += 2; | |
| 698 | + | |
| 699 | +# The file large-inline-image.pdf is a hand-crafted file with several | |
| 700 | +# inline images of various sizes including one that is two megabytes, | |
| 701 | +# encoded in base85, and has a base85-encoding that contains EI | |
| 702 | +# surrounded by delimiters several times. This exercises the EI | |
| 703 | +# detection code added in qpdf 8.4. | |
| 704 | + | |
| 705 | +$td->runtest("complex inline image parsing", | |
| 706 | + {$td->COMMAND => | |
| 707 | + "qpdf --qdf --static-id large-inline-image.pdf a.pdf"}, | |
| 708 | + {$td->STRING => "", $td->EXIT_STATUS => 0}, | |
| 709 | + $td->NORMALIZE_NEWLINES); | |
| 710 | +$td->runtest("check output", | |
| 711 | + {$td->FILE => "a.pdf"}, | |
| 712 | + {$td->FILE => "large-inline-image.qdf"}); | |
| 713 | + | |
| 714 | +show_ntests(); | |
| 715 | +# ---------- | |
| 696 | 716 | $td->notify("--- Tokenizer ---"); |
| 697 | 717 | $n_tests += 5; |
| 698 | 718 | ... | ... |
qpdf/qtest/qpdf/large-inline-image.pdf
0 → 100644
No preview for this file type
qpdf/qtest/qpdf/large-inline-image.qdf
0 → 100644
No preview for this file type