Commit fcd611b61eb6cc352b4e072fc791681ad927aee2
1 parent
05ff619b
Refactor parseContentStream
Showing
5 changed files
with
155 additions
and
74 deletions
ChangeLog
| @@ -45,6 +45,18 @@ | @@ -45,6 +45,18 @@ | ||
| 45 | characters may surround the EI operator that marks the end of an | 45 | characters may surround the EI operator that marks the end of an |
| 46 | inline image. | 46 | inline image. |
| 47 | 47 | ||
| 48 | + * New method QPDFObjectHandle::parsePageContents() to improve upon | ||
| 49 | + QPDFObjectHandle::parseContentStream(). The parseContentStream | ||
| 50 | + method used to operate on a single content stream, but was fixed | ||
| 51 | + to properly handle pages with contents split across multiple | ||
| 52 | + streams in an earlier release. The new method parsePageContents() | ||
| 53 | + can be called on the page object rather than the value of the | ||
| 54 | + page dictionary's /Contents key. This removes a few lines of | ||
| 55 | + boilerplate code from any code that uses parseContentStream, and | ||
| 56 | + it also enables creation of more helpful error messages if | ||
| 57 | + problems are encountered as the error messages can include | ||
| 58 | + information about which page the streams come from. | ||
| 59 | + | ||
| 48 | 2018-02-04 Jay Berkenbilt <ejb@ql.org> | 60 | 2018-02-04 Jay Berkenbilt <ejb@ql.org> |
| 49 | 61 | ||
| 50 | * Add QPDFWriter::setLinearizationPass1Filename method and | 62 | * Add QPDFWriter::setLinearizationPass1Filename method and |
include/qpdf/QPDFObjectHandle.hh
| @@ -88,7 +88,7 @@ class QPDFObjectHandle | @@ -88,7 +88,7 @@ class QPDFObjectHandle | ||
| 88 | virtual void decryptString(std::string& val) = 0; | 88 | virtual void decryptString(std::string& val) = 0; |
| 89 | }; | 89 | }; |
| 90 | 90 | ||
| 91 | - // This class is used by parseContentStream. Callers must | 91 | + // This class is used by parsePageContents. Callers must |
| 92 | // instantiate a subclass of this with handlers defined to accept | 92 | // instantiate a subclass of this with handlers defined to accept |
| 93 | // QPDFObjectHandles that are parsed from the stream. | 93 | // QPDFObjectHandles that are parsed from the stream. |
| 94 | class ParserCallbacks | 94 | class ParserCallbacks |
| @@ -103,8 +103,8 @@ class QPDFObjectHandle | @@ -103,8 +103,8 @@ class QPDFObjectHandle | ||
| 103 | 103 | ||
| 104 | protected: | 104 | protected: |
| 105 | // Implementors may call this method during parsing to | 105 | // Implementors may call this method during parsing to |
| 106 | - // terminate parsing early. This method throws an exception | ||
| 107 | - // that is caught by parseContentStream, so its effect is | 106 | + // terminate parsing early. This method throws an exception |
| 107 | + // that is caught by parsePageContents, so its effect is | ||
| 108 | // immediate. | 108 | // immediate. |
| 109 | QPDF_DLL | 109 | QPDF_DLL |
| 110 | void terminateParsing(); | 110 | void terminateParsing(); |
| @@ -187,6 +187,24 @@ class QPDFObjectHandle | @@ -187,6 +187,24 @@ class QPDFObjectHandle | ||
| 187 | QPDF* context); | 187 | QPDF* context); |
| 188 | 188 | ||
| 189 | // Helpers for parsing content streams | 189 | // Helpers for parsing content streams |
| 190 | + | ||
| 191 | + // Parse a page's contents through ParserCallbacks, described | ||
| 192 | + // above. This method works whether the contents are a single | ||
| 193 | + // stream or an array of streams. Call on a page object. | ||
| 194 | + QPDF_DLL | ||
| 195 | + void parsePageContents(ParserCallbacks* callbacks); | ||
| 196 | + | ||
| 197 | + // Pipe a page's contents through the given pipeline. This method | ||
| 198 | + // works whether the contents are a single stream or an array of | ||
| 199 | + // streams. Call on a page object. | ||
| 200 | + QPDF_DLL | ||
| 201 | + void pipePageContents(Pipeline* p); | ||
| 202 | + | ||
| 203 | + // Older method: stream_or_array should be the value of /Contents | ||
| 204 | + // from a page object. It's more convenient to just call | ||
| 205 | + // parsePageContents on the page object, and error messages will | ||
| 206 | + // also be more useful because the page object information will be | ||
| 207 | + // known. | ||
| 190 | QPDF_DLL | 208 | QPDF_DLL |
| 191 | static void parseContentStream(QPDFObjectHandle stream_or_array, | 209 | static void parseContentStream(QPDFObjectHandle stream_or_array, |
| 192 | ParserCallbacks* callbacks); | 210 | ParserCallbacks* callbacks); |
| @@ -697,12 +715,17 @@ class QPDFObjectHandle | @@ -697,12 +715,17 @@ class QPDFObjectHandle | ||
| 697 | QPDFTokenizer& tokenizer, bool& empty, | 715 | QPDFTokenizer& tokenizer, bool& empty, |
| 698 | StringDecrypter* decrypter, QPDF* context, | 716 | StringDecrypter* decrypter, QPDF* context, |
| 699 | bool content_stream); | 717 | bool content_stream); |
| 700 | - static void parseContentStream_internal( | ||
| 701 | - PointerHolder<Buffer> stream_data, | 718 | + void parseContentStream_internal( |
| 702 | std::string const& description, | 719 | std::string const& description, |
| 703 | ParserCallbacks* callbacks); | 720 | ParserCallbacks* callbacks); |
| 704 | - | ||
| 705 | - // Other methods | 721 | + static void parseContentStream_data( |
| 722 | + PointerHolder<Buffer>, | ||
| 723 | + std::string const& description, | ||
| 724 | + ParserCallbacks* callbacks); | ||
| 725 | + std::vector<QPDFObjectHandle> arrayOrStreamToStreamArray( | ||
| 726 | + std::string const& description, std::string& all_description); | ||
| 727 | + void pipeContentStreams(Pipeline* p, std::string const& description, | ||
| 728 | + std::string& all_description); | ||
| 706 | static void warn(QPDF*, QPDFExc const&); | 729 | static void warn(QPDF*, QPDFExc const&); |
| 707 | 730 | ||
| 708 | bool initialized; | 731 | bool initialized; |
libqpdf/QPDFObjectHandle.cc
| @@ -628,44 +628,78 @@ QPDFObjectHandle::getPageImages() | @@ -628,44 +628,78 @@ QPDFObjectHandle::getPageImages() | ||
| 628 | } | 628 | } |
| 629 | 629 | ||
| 630 | std::vector<QPDFObjectHandle> | 630 | std::vector<QPDFObjectHandle> |
| 631 | -QPDFObjectHandle::getPageContents() | 631 | +QPDFObjectHandle::arrayOrStreamToStreamArray( |
| 632 | + std::string const& description, std::string& all_description) | ||
| 632 | { | 633 | { |
| 633 | - assertPageObject(); | ||
| 634 | - | 634 | + all_description = description; |
| 635 | std::vector<QPDFObjectHandle> result; | 635 | std::vector<QPDFObjectHandle> result; |
| 636 | - QPDFObjectHandle contents = this->getKey("/Contents"); | ||
| 637 | - if (contents.isArray()) | 636 | + if (isArray()) |
| 638 | { | 637 | { |
| 639 | - int n_items = contents.getArrayNItems(); | 638 | + int n_items = getArrayNItems(); |
| 640 | for (int i = 0; i < n_items; ++i) | 639 | for (int i = 0; i < n_items; ++i) |
| 641 | { | 640 | { |
| 642 | - QPDFObjectHandle item = contents.getArrayItem(i); | 641 | + QPDFObjectHandle item = getArrayItem(i); |
| 643 | if (item.isStream()) | 642 | if (item.isStream()) |
| 643 | + { | ||
| 644 | + result.push_back(item); | ||
| 645 | + } | ||
| 646 | + else | ||
| 644 | { | 647 | { |
| 645 | - result.push_back(item); | ||
| 646 | - } | ||
| 647 | - else | ||
| 648 | - { | ||
| 649 | - throw std::runtime_error( | ||
| 650 | - "unknown item type while inspecting " | ||
| 651 | - "element of /Contents array in page " | ||
| 652 | - "dictionary"); | 648 | + QTC::TC("qpdf", "QPDFObjectHandle non-stream in stream array"); |
| 649 | + warn(item.getOwningQPDF(), | ||
| 650 | + QPDFExc(qpdf_e_damaged_pdf, description, | ||
| 651 | + "item index " + QUtil::int_to_string(i) + | ||
| 652 | + " (from 0)", 0, | ||
| 653 | + "ignoring non-stream in an array of streams")); | ||
| 653 | } | 654 | } |
| 654 | } | 655 | } |
| 655 | } | 656 | } |
| 656 | - else if (contents.isStream()) | 657 | + else if (isStream()) |
| 658 | + { | ||
| 659 | + result.push_back(*this); | ||
| 660 | + } | ||
| 661 | + else if (! isNull()) | ||
| 657 | { | 662 | { |
| 658 | - result.push_back(contents); | 663 | + warn(getOwningQPDF(), |
| 664 | + QPDFExc(qpdf_e_damaged_pdf, "", description, 0, | ||
| 665 | + " object is supposed to be a stream or an" | ||
| 666 | + " array of streams but is neither")); | ||
| 659 | } | 667 | } |
| 660 | - else if (! contents.isNull()) | 668 | + |
| 669 | + bool first = true; | ||
| 670 | + for (std::vector<QPDFObjectHandle>::iterator iter = result.begin(); | ||
| 671 | + iter != result.end(); ++iter) | ||
| 661 | { | 672 | { |
| 662 | - throw std::runtime_error("unknown object type inspecting /Contents " | ||
| 663 | - "key in page dictionary"); | 673 | + QPDFObjectHandle item = *iter; |
| 674 | + std::string og = | ||
| 675 | + QUtil::int_to_string(item.getObjectID()) + " " + | ||
| 676 | + QUtil::int_to_string(item.getGeneration()); | ||
| 677 | + if (first) | ||
| 678 | + { | ||
| 679 | + first = false; | ||
| 680 | + } | ||
| 681 | + else | ||
| 682 | + { | ||
| 683 | + all_description += ","; | ||
| 684 | + } | ||
| 685 | + all_description += " stream " + og; | ||
| 664 | } | 686 | } |
| 665 | 687 | ||
| 666 | return result; | 688 | return result; |
| 667 | } | 689 | } |
| 668 | 690 | ||
| 691 | +std::vector<QPDFObjectHandle> | ||
| 692 | +QPDFObjectHandle::getPageContents() | ||
| 693 | +{ | ||
| 694 | + assertPageObject(); | ||
| 695 | + std::string description = "page object " + | ||
| 696 | + QUtil::int_to_string(this->objid) + " " + | ||
| 697 | + QUtil::int_to_string(this->generation); | ||
| 698 | + std::string all_description; | ||
| 699 | + return this->getKey("/Contents").arrayOrStreamToStreamArray( | ||
| 700 | + description, all_description); | ||
| 701 | +} | ||
| 702 | + | ||
| 669 | void | 703 | void |
| 670 | QPDFObjectHandle::addPageContents(QPDFObjectHandle new_contents, bool first) | 704 | QPDFObjectHandle::addPageContents(QPDFObjectHandle new_contents, bool first) |
| 671 | { | 705 | { |
| @@ -806,61 +840,72 @@ QPDFObjectHandle::parse(std::string const& object_str, | @@ -806,61 +840,72 @@ QPDFObjectHandle::parse(std::string const& object_str, | ||
| 806 | } | 840 | } |
| 807 | 841 | ||
| 808 | void | 842 | void |
| 809 | -QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array, | ||
| 810 | - ParserCallbacks* callbacks) | 843 | +QPDFObjectHandle::pipePageContents(Pipeline* p) |
| 811 | { | 844 | { |
| 812 | - std::vector<QPDFObjectHandle> streams; | ||
| 813 | - if (stream_or_array.isArray()) | ||
| 814 | - { | ||
| 815 | - streams = stream_or_array.getArrayAsVector(); | ||
| 816 | - } | ||
| 817 | - else | ||
| 818 | - { | ||
| 819 | - streams.push_back(stream_or_array); | ||
| 820 | - } | ||
| 821 | - Pl_Buffer buf("concatenated stream data buffer"); | ||
| 822 | - std::string all_description = "content stream objects"; | ||
| 823 | - bool first = true; | 845 | + std::string description = "page object " + |
| 846 | + QUtil::int_to_string(this->objid) + " " + | ||
| 847 | + QUtil::int_to_string(this->generation); | ||
| 848 | + std::string all_description; | ||
| 849 | + this->getKey("/Contents").pipeContentStreams( | ||
| 850 | + p, description, all_description); | ||
| 851 | +} | ||
| 852 | + | ||
| 853 | +void | ||
| 854 | +QPDFObjectHandle::pipeContentStreams( | ||
| 855 | + Pipeline* p, std::string const& description, std::string& all_description) | ||
| 856 | +{ | ||
| 857 | + std::vector<QPDFObjectHandle> streams = | ||
| 858 | + arrayOrStreamToStreamArray( | ||
| 859 | + description, all_description); | ||
| 824 | for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin(); | 860 | for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin(); |
| 825 | iter != streams.end(); ++iter) | 861 | iter != streams.end(); ++iter) |
| 826 | { | 862 | { |
| 827 | QPDFObjectHandle stream = *iter; | 863 | QPDFObjectHandle stream = *iter; |
| 828 | - if (! stream.isStream()) | 864 | + std::string og = |
| 865 | + QUtil::int_to_string(stream.getObjectID()) + " " + | ||
| 866 | + QUtil::int_to_string(stream.getGeneration()); | ||
| 867 | + std::string description = "content stream object " + og; | ||
| 868 | + if (! stream.pipeStreamData(p, 0, qpdf_dl_specialized)) | ||
| 829 | { | 869 | { |
| 830 | - QTC::TC("qpdf", "QPDFObjectHandle non-stream in parsecontent"); | 870 | + QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent"); |
| 831 | warn(stream.getOwningQPDF(), | 871 | warn(stream.getOwningQPDF(), |
| 832 | QPDFExc(qpdf_e_damaged_pdf, "content stream", | 872 | QPDFExc(qpdf_e_damaged_pdf, "content stream", |
| 833 | - "", 0, | ||
| 834 | - "ignoring non-stream while parsing content streams")); | ||
| 835 | - } | ||
| 836 | - else | ||
| 837 | - { | ||
| 838 | - std::string og = QUtil::int_to_string(stream.getObjectID()) + " " + | ||
| 839 | - QUtil::int_to_string(stream.getGeneration()); | ||
| 840 | - std::string description = "content stream object " + og; | ||
| 841 | - if (first) | ||
| 842 | - { | ||
| 843 | - first = false; | ||
| 844 | - } | ||
| 845 | - else | ||
| 846 | - { | ||
| 847 | - all_description += ","; | ||
| 848 | - } | ||
| 849 | - all_description += " " + og; | ||
| 850 | - if (! stream.pipeStreamData(&buf, 0, qpdf_dl_specialized)) | ||
| 851 | - { | ||
| 852 | - QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent"); | ||
| 853 | - warn(stream.getOwningQPDF(), | ||
| 854 | - QPDFExc(qpdf_e_damaged_pdf, "content stream", | ||
| 855 | - description, 0, | ||
| 856 | - "errors while decoding content stream")); | ||
| 857 | - } | 873 | + description, 0, |
| 874 | + "errors while decoding content stream")); | ||
| 858 | } | 875 | } |
| 859 | } | 876 | } |
| 877 | +} | ||
| 878 | + | ||
| 879 | +void | ||
| 880 | +QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks) | ||
| 881 | +{ | ||
| 882 | + std::string description = "page object " + | ||
| 883 | + QUtil::int_to_string(this->objid) + " " + | ||
| 884 | + QUtil::int_to_string(this->generation); | ||
| 885 | + this->getKey("/Contents").parseContentStream_internal( | ||
| 886 | + description, callbacks); | ||
| 887 | +} | ||
| 888 | + | ||
| 889 | +void | ||
| 890 | +QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array, | ||
| 891 | + ParserCallbacks* callbacks) | ||
| 892 | +{ | ||
| 893 | + stream_or_array.parseContentStream_internal( | ||
| 894 | + "content stream objects", callbacks); | ||
| 895 | +} | ||
| 896 | + | ||
| 897 | +void | ||
| 898 | +QPDFObjectHandle::parseContentStream_internal( | ||
| 899 | + std::string const& description, | ||
| 900 | + ParserCallbacks* callbacks) | ||
| 901 | +{ | ||
| 902 | + Pl_Buffer buf("concatenated stream data buffer"); | ||
| 903 | + std::string all_description; | ||
| 904 | + pipeContentStreams(&buf, description, all_description); | ||
| 860 | PointerHolder<Buffer> stream_data = buf.getBuffer(); | 905 | PointerHolder<Buffer> stream_data = buf.getBuffer(); |
| 861 | try | 906 | try |
| 862 | { | 907 | { |
| 863 | - parseContentStream_internal(stream_data, all_description, callbacks); | 908 | + parseContentStream_data(stream_data, all_description, callbacks); |
| 864 | } | 909 | } |
| 865 | catch (TerminateParsing&) | 910 | catch (TerminateParsing&) |
| 866 | { | 911 | { |
| @@ -870,9 +915,10 @@ QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array, | @@ -870,9 +915,10 @@ QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array, | ||
| 870 | } | 915 | } |
| 871 | 916 | ||
| 872 | void | 917 | void |
| 873 | -QPDFObjectHandle::parseContentStream_internal(PointerHolder<Buffer> stream_data, | ||
| 874 | - std::string const& description, | ||
| 875 | - ParserCallbacks* callbacks) | 918 | +QPDFObjectHandle::parseContentStream_data( |
| 919 | + PointerHolder<Buffer> stream_data, | ||
| 920 | + std::string const& description, | ||
| 921 | + ParserCallbacks* callbacks) | ||
| 876 | { | 922 | { |
| 877 | size_t length = stream_data->getSize(); | 923 | size_t length = stream_data->getSize(); |
| 878 | PointerHolder<InputSource> input = | 924 | PointerHolder<InputSource> input = |
qpdf/qpdf.testcov
| @@ -277,7 +277,6 @@ QPDFObjectHandle found fake 1 | @@ -277,7 +277,6 @@ QPDFObjectHandle found fake 1 | ||
| 277 | QPDFObjectHandle no val for last key 0 | 277 | QPDFObjectHandle no val for last key 0 |
| 278 | QPDF resolve failure to null 0 | 278 | QPDF resolve failure to null 0 |
| 279 | QPDFWriter preserve unreferenced standard 0 | 279 | QPDFWriter preserve unreferenced standard 0 |
| 280 | -QPDFObjectHandle non-stream in parsecontent 0 | ||
| 281 | QPDFObjectHandle errors in parsecontent 0 | 280 | QPDFObjectHandle errors in parsecontent 0 |
| 282 | QPDF stream with non-space 0 | 281 | QPDF stream with non-space 0 |
| 283 | qpdf same file error 0 | 282 | qpdf same file error 0 |
| @@ -304,3 +303,4 @@ QPDF_Stream TIFF predictor 0 | @@ -304,3 +303,4 @@ QPDF_Stream TIFF predictor 0 | ||
| 304 | QPDFTokenizer EOF when not allowed 0 | 303 | QPDFTokenizer EOF when not allowed 0 |
| 305 | QPDFTokenizer inline image at EOF 0 | 304 | QPDFTokenizer inline image at EOF 0 |
| 306 | Pl_QPDFTokenizer found ID 0 | 305 | Pl_QPDFTokenizer found ID 0 |
| 306 | +QPDFObjectHandle non-stream in stream array 0 |
qpdf/qtest/qpdf/split-content-stream-errors.out
| @@ -4,6 +4,6 @@ File is not encrypted | @@ -4,6 +4,6 @@ File is not encrypted | ||
| 4 | File is not linearized | 4 | File is not linearized |
| 5 | WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received | 5 | WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received |
| 6 | WARNING: split-content-stream-errors.pdf (file position 557): stream will be re-processed without filtering to avoid data loss | 6 | WARNING: split-content-stream-errors.pdf (file position 557): stream will be re-processed without filtering to avoid data loss |
| 7 | -WARNING: content stream: ignoring non-stream while parsing content streams | 7 | +WARNING: content stream objects (item index 0 (from 0)): ignoring non-stream in an array of streams |
| 8 | WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received | 8 | WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received |
| 9 | WARNING: content stream (content stream object 6 0): errors while decoding content stream | 9 | WARNING: content stream (content stream object 6 0): errors while decoding content stream |