Commit fcd611b61eb6cc352b4e072fc791681ad927aee2
1 parent
05ff619b
Refactor parseContentStream
Showing 5 changed files with 155 additions and 74 deletions
ChangeLog
| ... | ... | @@ -45,6 +45,18 @@ |
| 45 | 45 | characters may surround the EI operator that marks the end of an |
| 46 | 46 | inline image. |
| 47 | 47 | |
| 48 | + * New method QPDFObjectHandle::parsePageContents() to improve upon | |
| 49 | + QPDFObjectHandle::parseContentStream(). The parseContentStream | |
| 50 | + method used to operate on a single content stream, but was fixed | |
| 51 | + to properly handle pages with contents split across multiple | |
| 52 | + streams in an earlier release. The new method parsePageContents() | |
| 53 | + can be called on the page object rather than the value of the | |
| 54 | + page dictionary's /Contents key. This removes a few lines of | |
| 55 | + boilerplate code from any caller of parseContentStream, and | |
| 56 | + it also enables more helpful error messages when problems are | |
| 57 | + encountered, because the messages can include information | |
| 58 | + about which page the streams come from. | |
| 59 | + | |
| 48 | 60 | 2018-02-04 Jay Berkenbilt <ejb@ql.org> |
| 49 | 61 | |
| 50 | 62 | * Add QPDFWriter::setLinearizationPass1Filename method and | ... | ... |
include/qpdf/QPDFObjectHandle.hh
| ... | ... | @@ -88,7 +88,7 @@ class QPDFObjectHandle |
| 88 | 88 | virtual void decryptString(std::string& val) = 0; |
| 89 | 89 | }; |
| 90 | 90 | |
| 91 | - // This class is used by parseContentStream. Callers must | |
| 91 | + // This class is used by parsePageContents (and by the older parseContentStream). Callers must | |
| 92 | 92 | // instantiate a subclass of this with handlers defined to accept |
| 93 | 93 | // QPDFObjectHandles that are parsed from the stream. |
| 94 | 94 | class ParserCallbacks |
| ... | ... | @@ -103,8 +103,8 @@ class QPDFObjectHandle |
| 103 | 103 | |
| 104 | 104 | protected: |
| 105 | 105 | // Implementors may call this method during parsing to |
| 106 | - // terminate parsing early. This method throws an exception | |
| 107 | - // that is caught by parseContentStream, so its effect is | |
| 106 | + // terminate parsing early. This method throws an exception | |
| 107 | + // that is caught by parsePageContents (or parseContentStream), so its effect is | |
| 108 | 108 | // immediate. |
| 109 | 109 | QPDF_DLL |
| 110 | 110 | void terminateParsing(); |
| ... | ... | @@ -187,6 +187,24 @@ class QPDFObjectHandle |
| 187 | 187 | QPDF* context); |
| 188 | 188 | |
| 189 | 189 | // Helpers for parsing content streams |
| 190 | + | |
| 191 | + // Parse a page's contents through ParserCallbacks, described | |
| 192 | + // above. This method works whether the contents are a single | |
| 193 | + // stream or an array of streams. Call on a page object. | |
| 194 | + QPDF_DLL | |
| 195 | + void parsePageContents(ParserCallbacks* callbacks); | |
| 196 | + | |
| 197 | + // Pipe a page's contents through the given pipeline. This method | |
| 198 | + // works whether the contents are a single stream or an array of | |
| 199 | + // streams. Call on a page object. | |
| 200 | + QPDF_DLL | |
| 201 | + void pipePageContents(Pipeline* p); | |
| 202 | + | |
| 203 | + // Older method: stream_or_array should be the value of /Contents | |
| 204 | + // from a page object. It's more convenient to just call | |
| 205 | + // parsePageContents on the page object, and error messages will | |
| 206 | + // also be more useful because the page object information will be | |
| 207 | + // known. | |
| 190 | 208 | QPDF_DLL |
| 191 | 209 | static void parseContentStream(QPDFObjectHandle stream_or_array, |
| 192 | 210 | ParserCallbacks* callbacks); |
| ... | ... | @@ -697,12 +715,17 @@ class QPDFObjectHandle |
| 697 | 715 | QPDFTokenizer& tokenizer, bool& empty, |
| 698 | 716 | StringDecrypter* decrypter, QPDF* context, |
| 699 | 717 | bool content_stream); |
| 700 | - static void parseContentStream_internal( | |
| 701 | - PointerHolder<Buffer> stream_data, | |
| 718 | + void parseContentStream_internal( | |
| 702 | 719 | std::string const& description, |
| 703 | 720 | ParserCallbacks* callbacks); |
| 704 | - | |
| 705 | - // Other methods | |
| 721 | + static void parseContentStream_data( | |
| 722 | + PointerHolder<Buffer>, | |
| 723 | + std::string const& description, | |
| 724 | + ParserCallbacks* callbacks); | |
| 725 | + std::vector<QPDFObjectHandle> arrayOrStreamToStreamArray( | |
| 726 | + std::string const& description, std::string& all_description); | |
| 727 | + void pipeContentStreams(Pipeline* p, std::string const& description, | |
| 728 | + std::string& all_description); | |
| 706 | 729 | static void warn(QPDF*, QPDFExc const&); |
| 707 | 730 | |
| 708 | 731 | bool initialized; | ... | ... |
libqpdf/QPDFObjectHandle.cc
| ... | ... | @@ -628,44 +628,78 @@ QPDFObjectHandle::getPageImages() |
| 628 | 628 | } |
| 629 | 629 | |
| 630 | 630 | std::vector<QPDFObjectHandle> |
| 631 | -QPDFObjectHandle::getPageContents() | |
| 631 | +QPDFObjectHandle::arrayOrStreamToStreamArray( | |
| 632 | + std::string const& description, std::string& all_description) | |
| 632 | 633 | { |
| 633 | - assertPageObject(); | |
| 634 | - | |
| 634 | + all_description = description; | |
| 635 | 635 | std::vector<QPDFObjectHandle> result; |
| 636 | - QPDFObjectHandle contents = this->getKey("/Contents"); | |
| 637 | - if (contents.isArray()) | |
| 636 | + if (isArray()) | |
| 638 | 637 | { |
| 639 | - int n_items = contents.getArrayNItems(); | |
| 638 | + int n_items = getArrayNItems(); | |
| 640 | 639 | for (int i = 0; i < n_items; ++i) |
| 641 | 640 | { |
| 642 | - QPDFObjectHandle item = contents.getArrayItem(i); | |
| 641 | + QPDFObjectHandle item = getArrayItem(i); | |
| 643 | 642 | if (item.isStream()) |
| 643 | + { | |
| 644 | + result.push_back(item); | |
| 645 | + } | |
| 646 | + else | |
| 644 | 647 | { |
| 645 | - result.push_back(item); | |
| 646 | - } | |
| 647 | - else | |
| 648 | - { | |
| 649 | - throw std::runtime_error( | |
| 650 | - "unknown item type while inspecting " | |
| 651 | - "element of /Contents array in page " | |
| 652 | - "dictionary"); | |
| 648 | + QTC::TC("qpdf", "QPDFObjectHandle non-stream in stream array"); | |
| 649 | + warn(item.getOwningQPDF(), | |
| 650 | + QPDFExc(qpdf_e_damaged_pdf, description, | |
| 651 | + "item index " + QUtil::int_to_string(i) + | |
| 652 | + " (from 0)", 0, | |
| 653 | + "ignoring non-stream in an array of streams")); | |
| 653 | 654 | } |
| 654 | 655 | } |
| 655 | 656 | } |
| 656 | - else if (contents.isStream()) | |
| 657 | + else if (isStream()) | |
| 658 | + { | |
| 659 | + result.push_back(*this); | |
| 660 | + } | |
| 661 | + else if (! isNull()) | |
| 657 | 662 | { |
| 658 | - result.push_back(contents); | |
| 663 | + warn(getOwningQPDF(), | |
| 664 | + QPDFExc(qpdf_e_damaged_pdf, "", description, 0, | |
| 665 | + " object is supposed to be a stream or an" | |
| 666 | + " array of streams but is neither")); | |
| 659 | 667 | } |
| 660 | - else if (! contents.isNull()) | |
| 668 | + | |
| 669 | + bool first = true; | |
| 670 | + for (std::vector<QPDFObjectHandle>::iterator iter = result.begin(); | |
| 671 | + iter != result.end(); ++iter) | |
| 661 | 672 | { |
| 662 | - throw std::runtime_error("unknown object type inspecting /Contents " | |
| 663 | - "key in page dictionary"); | |
| 673 | + QPDFObjectHandle item = *iter; | |
| 674 | + std::string og = | |
| 675 | + QUtil::int_to_string(item.getObjectID()) + " " + | |
| 676 | + QUtil::int_to_string(item.getGeneration()); | |
| 677 | + if (first) | |
| 678 | + { | |
| 679 | + first = false; | |
| 680 | + } | |
| 681 | + else | |
| 682 | + { | |
| 683 | + all_description += ","; | |
| 684 | + } | |
| 685 | + all_description += " stream " + og; | |
| 664 | 686 | } |
| 665 | 687 | |
| 666 | 688 | return result; |
| 667 | 689 | } |
| 668 | 690 | |
| 691 | +std::vector<QPDFObjectHandle> | |
| 692 | +QPDFObjectHandle::getPageContents() | |
| 693 | +{ | |
| 694 | + assertPageObject(); | |
| 695 | + std::string description = "page object " + | |
| 696 | + QUtil::int_to_string(this->objid) + " " + | |
| 697 | + QUtil::int_to_string(this->generation); | |
| 698 | + std::string all_description; | |
| 699 | + return this->getKey("/Contents").arrayOrStreamToStreamArray( | |
| 700 | + description, all_description); | |
| 701 | +} | |
| 702 | + | |
| 669 | 703 | void |
| 670 | 704 | QPDFObjectHandle::addPageContents(QPDFObjectHandle new_contents, bool first) |
| 671 | 705 | { |
| ... | ... | @@ -806,61 +840,72 @@ QPDFObjectHandle::parse(std::string const& object_str, |
| 806 | 840 | } |
| 807 | 841 | |
| 808 | 842 | void |
| 809 | -QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array, | |
| 810 | - ParserCallbacks* callbacks) | |
| 843 | +QPDFObjectHandle::pipePageContents(Pipeline* p) | |
| 811 | 844 | { |
| 812 | - std::vector<QPDFObjectHandle> streams; | |
| 813 | - if (stream_or_array.isArray()) | |
| 814 | - { | |
| 815 | - streams = stream_or_array.getArrayAsVector(); | |
| 816 | - } | |
| 817 | - else | |
| 818 | - { | |
| 819 | - streams.push_back(stream_or_array); | |
| 820 | - } | |
| 821 | - Pl_Buffer buf("concatenated stream data buffer"); | |
| 822 | - std::string all_description = "content stream objects"; | |
| 823 | - bool first = true; | |
| 845 | + std::string description = "page object " + | |
| 846 | + QUtil::int_to_string(this->objid) + " " + | |
| 847 | + QUtil::int_to_string(this->generation); | |
| 848 | + std::string all_description; | |
| 849 | + this->getKey("/Contents").pipeContentStreams( | |
| 850 | + p, description, all_description); | |
| 851 | +} | |
| 852 | + | |
| 853 | +void | |
| 854 | +QPDFObjectHandle::pipeContentStreams( | |
| 855 | + Pipeline* p, std::string const& description, std::string& all_description) | |
| 856 | +{ | |
| 857 | + std::vector<QPDFObjectHandle> streams = | |
| 858 | + arrayOrStreamToStreamArray( | |
| 859 | + description, all_description); | |
| 824 | 860 | for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin(); |
| 825 | 861 | iter != streams.end(); ++iter) |
| 826 | 862 | { |
| 827 | 863 | QPDFObjectHandle stream = *iter; |
| 828 | - if (! stream.isStream()) | |
| 864 | + std::string og = | |
| 865 | + QUtil::int_to_string(stream.getObjectID()) + " " + | |
| 866 | + QUtil::int_to_string(stream.getGeneration()); | |
| 867 | + std::string description = "content stream object " + og; | |
| 868 | + if (! stream.pipeStreamData(p, 0, qpdf_dl_specialized)) | |
| 829 | 869 | { |
| 830 | - QTC::TC("qpdf", "QPDFObjectHandle non-stream in parsecontent"); | |
| 870 | + QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent"); | |
| 831 | 871 | warn(stream.getOwningQPDF(), |
| 832 | 872 | QPDFExc(qpdf_e_damaged_pdf, "content stream", |
| 833 | - "", 0, | |
| 834 | - "ignoring non-stream while parsing content streams")); | |
| 835 | - } | |
| 836 | - else | |
| 837 | - { | |
| 838 | - std::string og = QUtil::int_to_string(stream.getObjectID()) + " " + | |
| 839 | - QUtil::int_to_string(stream.getGeneration()); | |
| 840 | - std::string description = "content stream object " + og; | |
| 841 | - if (first) | |
| 842 | - { | |
| 843 | - first = false; | |
| 844 | - } | |
| 845 | - else | |
| 846 | - { | |
| 847 | - all_description += ","; | |
| 848 | - } | |
| 849 | - all_description += " " + og; | |
| 850 | - if (! stream.pipeStreamData(&buf, 0, qpdf_dl_specialized)) | |
| 851 | - { | |
| 852 | - QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent"); | |
| 853 | - warn(stream.getOwningQPDF(), | |
| 854 | - QPDFExc(qpdf_e_damaged_pdf, "content stream", | |
| 855 | - description, 0, | |
| 856 | - "errors while decoding content stream")); | |
| 857 | - } | |
| 873 | + description, 0, | |
| 874 | + "errors while decoding content stream")); | |
| 858 | 875 | } |
| 859 | 876 | } |
| 877 | +} | |
| 878 | + | |
| 879 | +void | |
| 880 | +QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks) | |
| 881 | +{ | |
| 882 | + std::string description = "page object " + | |
| 883 | + QUtil::int_to_string(this->objid) + " " + | |
| 884 | + QUtil::int_to_string(this->generation); | |
| 885 | + this->getKey("/Contents").parseContentStream_internal( | |
| 886 | + description, callbacks); | |
| 887 | +} | |
| 888 | + | |
| 889 | +void | |
| 890 | +QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array, | |
| 891 | + ParserCallbacks* callbacks) | |
| 892 | +{ | |
| 893 | + stream_or_array.parseContentStream_internal( | |
| 894 | + "content stream objects", callbacks); | |
| 895 | +} | |
| 896 | + | |
| 897 | +void | |
| 898 | +QPDFObjectHandle::parseContentStream_internal( | |
| 899 | + std::string const& description, | |
| 900 | + ParserCallbacks* callbacks) | |
| 901 | +{ | |
| 902 | + Pl_Buffer buf("concatenated stream data buffer"); | |
| 903 | + std::string all_description; | |
| 904 | + pipeContentStreams(&buf, description, all_description); | |
| 860 | 905 | PointerHolder<Buffer> stream_data = buf.getBuffer(); |
| 861 | 906 | try |
| 862 | 907 | { |
| 863 | - parseContentStream_internal(stream_data, all_description, callbacks); | |
| 908 | + parseContentStream_data(stream_data, all_description, callbacks); | |
| 864 | 909 | } |
| 865 | 910 | catch (TerminateParsing&) |
| 866 | 911 | { |
| ... | ... | @@ -870,9 +915,10 @@ QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array, |
| 870 | 915 | } |
| 871 | 916 | |
| 872 | 917 | void |
| 873 | -QPDFObjectHandle::parseContentStream_internal(PointerHolder<Buffer> stream_data, | |
| 874 | - std::string const& description, | |
| 875 | - ParserCallbacks* callbacks) | |
| 918 | +QPDFObjectHandle::parseContentStream_data( | |
| 919 | + PointerHolder<Buffer> stream_data, | |
| 920 | + std::string const& description, | |
| 921 | + ParserCallbacks* callbacks) | |
| 876 | 922 | { |
| 877 | 923 | size_t length = stream_data->getSize(); |
| 878 | 924 | PointerHolder<InputSource> input = | ... | ... |
qpdf/qpdf.testcov
| ... | ... | @@ -277,7 +277,6 @@ QPDFObjectHandle found fake 1 |
| 277 | 277 | QPDFObjectHandle no val for last key 0 |
| 278 | 278 | QPDF resolve failure to null 0 |
| 279 | 279 | QPDFWriter preserve unreferenced standard 0 |
| 280 | -QPDFObjectHandle non-stream in parsecontent 0 | |
| 281 | 280 | QPDFObjectHandle errors in parsecontent 0 |
| 282 | 281 | QPDF stream with non-space 0 |
| 283 | 282 | qpdf same file error 0 |
| ... | ... | @@ -304,3 +303,4 @@ QPDF_Stream TIFF predictor 0 |
| 304 | 303 | QPDFTokenizer EOF when not allowed 0 |
| 305 | 304 | QPDFTokenizer inline image at EOF 0 |
| 306 | 305 | Pl_QPDFTokenizer found ID 0 |
| 306 | +QPDFObjectHandle non-stream in stream array 0 | ... | ... |
qpdf/qtest/qpdf/split-content-stream-errors.out
| ... | ... | @@ -4,6 +4,6 @@ File is not encrypted |
| 4 | 4 | File is not linearized |
| 5 | 5 | WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received |
| 6 | 6 | WARNING: split-content-stream-errors.pdf (file position 557): stream will be re-processed without filtering to avoid data loss |
| 7 | -WARNING: content stream: ignoring non-stream while parsing content streams | |
| 7 | +WARNING: content stream objects (item index 0 (from 0)): ignoring non-stream in an array of streams | |
| 8 | 8 | WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received |
| 9 | 9 | WARNING: content stream (content stream object 6 0): errors while decoding content stream | ... | ... |