Commit fcd611b61eb6cc352b4e072fc791681ad927aee2

Authored by Jay Berkenbilt
1 parent 05ff619b

Refactor parseContentStream

ChangeLog
@@ -45,6 +45,18 @@ @@ -45,6 +45,18 @@
45 characters may surround the EI operator that marks the end of an 45 characters may surround the EI operator that marks the end of an
46 inline image. 46 inline image.
47 47
  48 + * New method QPDFObjectHandle::parsePageContents() to improve upon
  49 + QPDFObjectHandle::parseContentStream(). The parseContentStream
  50 + method used to operate on a single content stream, but was fixed
  51 + to properly handle pages with contents split across multiple
  52 + streams in an earlier release. The new method parsePageContents()
  53 + can be called on the page object rather than the value of the
  54 + page dictionary's /Contents key. This removes a few lines of
  55 + boiler-plate code from any code that uses parseContentStream, and
  56 + it also enables creation of more helpful error messages if
  57 + problems are encountered as the error messages can include
  58 + information about which page the streams come from.
  59 +
48 2018-02-04 Jay Berkenbilt <ejb@ql.org> 60 2018-02-04 Jay Berkenbilt <ejb@ql.org>
49 61
50 * Add QPDFWriter::setLinearizationPass1Filename method and 62 * Add QPDFWriter::setLinearizationPass1Filename method and
include/qpdf/QPDFObjectHandle.hh
@@ -88,7 +88,7 @@ class QPDFObjectHandle @@ -88,7 +88,7 @@ class QPDFObjectHandle
88 virtual void decryptString(std::string& val) = 0; 88 virtual void decryptString(std::string& val) = 0;
89 }; 89 };
90 90
91 - // This class is used by parseContentStream. Callers must 91 + // This class is used by parsePageContents. Callers must
92 // instantiate a subclass of this with handlers defined to accept 92 // instantiate a subclass of this with handlers defined to accept
93 // QPDFObjectHandles that are parsed from the stream. 93 // QPDFObjectHandles that are parsed from the stream.
94 class ParserCallbacks 94 class ParserCallbacks
@@ -103,8 +103,8 @@ class QPDFObjectHandle @@ -103,8 +103,8 @@ class QPDFObjectHandle
103 103
104 protected: 104 protected:
105 // Implementors may call this method during parsing to 105 // Implementors may call this method during parsing to
106 - // terminate parsing early. This method throws an exception  
107 - // that is caught by parseContentStream, so its effect is 106 + // terminate parsing early. This method throws an exception
  107 + // that is caught by parsePageContents, so its effect is
108 // immediate. 108 // immediate.
109 QPDF_DLL 109 QPDF_DLL
110 void terminateParsing(); 110 void terminateParsing();
@@ -187,6 +187,24 @@ class QPDFObjectHandle @@ -187,6 +187,24 @@ class QPDFObjectHandle
187 QPDF* context); 187 QPDF* context);
188 188
189 // Helpers for parsing content streams 189 // Helpers for parsing content streams
  190 +
  191 + // Parse a page's contents through ParserCallbacks, described
  192 + // above. This method works whether the contents are a single
  193 + // stream or an array of streams. Call on a page object.
  194 + QPDF_DLL
  195 + void parsePageContents(ParserCallbacks* callbacks);
  196 +
  197 + // Pipe a page's contents through the given pipeline. This method
  198 + // works whether the contents are a single stream or an array of
  199 + // streams. Call on a page object.
  200 + QPDF_DLL
  201 + void pipePageContents(Pipeline* p);
  202 +
  203 + // Older method: stream_or_array should be the value of /Contents
  204 + // from a page object. It's more convenient to just call
  205 + // parsePageContents on the page object, and error messages will
  206 + // also be more useful because the page object information will be
  207 + // known.
190 QPDF_DLL 208 QPDF_DLL
191 static void parseContentStream(QPDFObjectHandle stream_or_array, 209 static void parseContentStream(QPDFObjectHandle stream_or_array,
192 ParserCallbacks* callbacks); 210 ParserCallbacks* callbacks);
@@ -697,12 +715,17 @@ class QPDFObjectHandle @@ -697,12 +715,17 @@ class QPDFObjectHandle
697 QPDFTokenizer& tokenizer, bool& empty, 715 QPDFTokenizer& tokenizer, bool& empty,
698 StringDecrypter* decrypter, QPDF* context, 716 StringDecrypter* decrypter, QPDF* context,
699 bool content_stream); 717 bool content_stream);
700 - static void parseContentStream_internal(  
701 - PointerHolder<Buffer> stream_data, 718 + void parseContentStream_internal(
702 std::string const& description, 719 std::string const& description,
703 ParserCallbacks* callbacks); 720 ParserCallbacks* callbacks);
704 -  
705 - // Other methods 721 + static void parseContentStream_data(
  722 + PointerHolder<Buffer>,
  723 + std::string const& description,
  724 + ParserCallbacks* callbacks);
  725 + std::vector<QPDFObjectHandle> arrayOrStreamToStreamArray(
  726 + std::string const& description, std::string& all_description);
  727 + void pipeContentStreams(Pipeline* p, std::string const& description,
  728 + std::string& all_description);
706 static void warn(QPDF*, QPDFExc const&); 729 static void warn(QPDF*, QPDFExc const&);
707 730
708 bool initialized; 731 bool initialized;
libqpdf/QPDFObjectHandle.cc
@@ -628,44 +628,78 @@ QPDFObjectHandle::getPageImages() @@ -628,44 +628,78 @@ QPDFObjectHandle::getPageImages()
628 } 628 }
629 629
630 std::vector<QPDFObjectHandle> 630 std::vector<QPDFObjectHandle>
631 -QPDFObjectHandle::getPageContents() 631 +QPDFObjectHandle::arrayOrStreamToStreamArray(
  632 + std::string const& description, std::string& all_description)
632 { 633 {
633 - assertPageObject();  
634 - 634 + all_description = description;
635 std::vector<QPDFObjectHandle> result; 635 std::vector<QPDFObjectHandle> result;
636 - QPDFObjectHandle contents = this->getKey("/Contents");  
637 - if (contents.isArray()) 636 + if (isArray())
638 { 637 {
639 - int n_items = contents.getArrayNItems(); 638 + int n_items = getArrayNItems();
640 for (int i = 0; i < n_items; ++i) 639 for (int i = 0; i < n_items; ++i)
641 { 640 {
642 - QPDFObjectHandle item = contents.getArrayItem(i); 641 + QPDFObjectHandle item = getArrayItem(i);
643 if (item.isStream()) 642 if (item.isStream())
  643 + {
  644 + result.push_back(item);
  645 + }
  646 + else
644 { 647 {
645 - result.push_back(item);  
646 - }  
647 - else  
648 - {  
649 - throw std::runtime_error(  
650 - "unknown item type while inspecting "  
651 - "element of /Contents array in page "  
652 - "dictionary"); 648 + QTC::TC("qpdf", "QPDFObjectHandle non-stream in stream array");
  649 + warn(item.getOwningQPDF(),
  650 + QPDFExc(qpdf_e_damaged_pdf, description,
  651 + "item index " + QUtil::int_to_string(i) +
  652 + " (from 0)", 0,
  653 + "ignoring non-stream in an array of streams"));
653 } 654 }
654 } 655 }
655 } 656 }
656 - else if (contents.isStream()) 657 + else if (isStream())
  658 + {
  659 + result.push_back(*this);
  660 + }
  661 + else if (! isNull())
657 { 662 {
658 - result.push_back(contents); 663 + warn(getOwningQPDF(),
  664 + QPDFExc(qpdf_e_damaged_pdf, "", description, 0,
  665 + " object is supposed to be a stream or an"
  666 + " array of streams but is neither"));
659 } 667 }
660 - else if (! contents.isNull()) 668 +
  669 + bool first = true;
  670 + for (std::vector<QPDFObjectHandle>::iterator iter = result.begin();
  671 + iter != result.end(); ++iter)
661 { 672 {
662 - throw std::runtime_error("unknown object type inspecting /Contents "  
663 - "key in page dictionary"); 673 + QPDFObjectHandle item = *iter;
  674 + std::string og =
  675 + QUtil::int_to_string(item.getObjectID()) + " " +
  676 + QUtil::int_to_string(item.getGeneration());
  677 + if (first)
  678 + {
  679 + first = false;
  680 + }
  681 + else
  682 + {
  683 + all_description += ",";
  684 + }
  685 + all_description += " stream " + og;
664 } 686 }
665 687
666 return result; 688 return result;
667 } 689 }
668 690
  691 +std::vector<QPDFObjectHandle>
  692 +QPDFObjectHandle::getPageContents()
  693 +{
  694 + assertPageObject();
  695 + std::string description = "page object " +
  696 + QUtil::int_to_string(this->objid) + " " +
  697 + QUtil::int_to_string(this->generation);
  698 + std::string all_description;
  699 + return this->getKey("/Contents").arrayOrStreamToStreamArray(
  700 + description, all_description);
  701 +}
  702 +
669 void 703 void
670 QPDFObjectHandle::addPageContents(QPDFObjectHandle new_contents, bool first) 704 QPDFObjectHandle::addPageContents(QPDFObjectHandle new_contents, bool first)
671 { 705 {
@@ -806,61 +840,72 @@ QPDFObjectHandle::parse(std::string const& object_str, @@ -806,61 +840,72 @@ QPDFObjectHandle::parse(std::string const& object_str,
806 } 840 }
807 841
808 void 842 void
809 -QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,  
810 - ParserCallbacks* callbacks) 843 +QPDFObjectHandle::pipePageContents(Pipeline* p)
811 { 844 {
812 - std::vector<QPDFObjectHandle> streams;  
813 - if (stream_or_array.isArray())  
814 - {  
815 - streams = stream_or_array.getArrayAsVector();  
816 - }  
817 - else  
818 - {  
819 - streams.push_back(stream_or_array);  
820 - }  
821 - Pl_Buffer buf("concatenated stream data buffer");  
822 - std::string all_description = "content stream objects";  
823 - bool first = true; 845 + std::string description = "page object " +
  846 + QUtil::int_to_string(this->objid) + " " +
  847 + QUtil::int_to_string(this->generation);
  848 + std::string all_description;
  849 + this->getKey("/Contents").pipeContentStreams(
  850 + p, description, all_description);
  851 +}
  852 +
  853 +void
  854 +QPDFObjectHandle::pipeContentStreams(
  855 + Pipeline* p, std::string const& description, std::string& all_description)
  856 +{
  857 + std::vector<QPDFObjectHandle> streams =
  858 + arrayOrStreamToStreamArray(
  859 + description, all_description);
824 for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin(); 860 for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin();
825 iter != streams.end(); ++iter) 861 iter != streams.end(); ++iter)
826 { 862 {
827 QPDFObjectHandle stream = *iter; 863 QPDFObjectHandle stream = *iter;
828 - if (! stream.isStream()) 864 + std::string og =
  865 + QUtil::int_to_string(stream.getObjectID()) + " " +
  866 + QUtil::int_to_string(stream.getGeneration());
  867 + std::string description = "content stream object " + og;
  868 + if (! stream.pipeStreamData(p, 0, qpdf_dl_specialized))
829 { 869 {
830 - QTC::TC("qpdf", "QPDFObjectHandle non-stream in parsecontent"); 870 + QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent");
831 warn(stream.getOwningQPDF(), 871 warn(stream.getOwningQPDF(),
832 QPDFExc(qpdf_e_damaged_pdf, "content stream", 872 QPDFExc(qpdf_e_damaged_pdf, "content stream",
833 - "", 0,  
834 - "ignoring non-stream while parsing content streams"));  
835 - }  
836 - else  
837 - {  
838 - std::string og = QUtil::int_to_string(stream.getObjectID()) + " " +  
839 - QUtil::int_to_string(stream.getGeneration());  
840 - std::string description = "content stream object " + og;  
841 - if (first)  
842 - {  
843 - first = false;  
844 - }  
845 - else  
846 - {  
847 - all_description += ",";  
848 - }  
849 - all_description += " " + og;  
850 - if (! stream.pipeStreamData(&buf, 0, qpdf_dl_specialized))  
851 - {  
852 - QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent");  
853 - warn(stream.getOwningQPDF(),  
854 - QPDFExc(qpdf_e_damaged_pdf, "content stream",  
855 - description, 0,  
856 - "errors while decoding content stream"));  
857 - } 873 + description, 0,
  874 + "errors while decoding content stream"));
858 } 875 }
859 } 876 }
  877 +}
  878 +
  879 +void
  880 +QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks)
  881 +{
  882 + std::string description = "page object " +
  883 + QUtil::int_to_string(this->objid) + " " +
  884 + QUtil::int_to_string(this->generation);
  885 + this->getKey("/Contents").parseContentStream_internal(
  886 + description, callbacks);
  887 +}
  888 +
  889 +void
  890 +QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
  891 + ParserCallbacks* callbacks)
  892 +{
  893 + stream_or_array.parseContentStream_internal(
  894 + "content stream objects", callbacks);
  895 +}
  896 +
  897 +void
  898 +QPDFObjectHandle::parseContentStream_internal(
  899 + std::string const& description,
  900 + ParserCallbacks* callbacks)
  901 +{
  902 + Pl_Buffer buf("concatenated stream data buffer");
  903 + std::string all_description;
  904 + pipeContentStreams(&buf, description, all_description);
860 PointerHolder<Buffer> stream_data = buf.getBuffer(); 905 PointerHolder<Buffer> stream_data = buf.getBuffer();
861 try 906 try
862 { 907 {
863 - parseContentStream_internal(stream_data, all_description, callbacks); 908 + parseContentStream_data(stream_data, all_description, callbacks);
864 } 909 }
865 catch (TerminateParsing&) 910 catch (TerminateParsing&)
866 { 911 {
@@ -870,9 +915,10 @@ QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array, @@ -870,9 +915,10 @@ QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
870 } 915 }
871 916
872 void 917 void
873 -QPDFObjectHandle::parseContentStream_internal(PointerHolder<Buffer> stream_data,  
874 - std::string const& description,  
875 - ParserCallbacks* callbacks) 918 +QPDFObjectHandle::parseContentStream_data(
  919 + PointerHolder<Buffer> stream_data,
  920 + std::string const& description,
  921 + ParserCallbacks* callbacks)
876 { 922 {
877 size_t length = stream_data->getSize(); 923 size_t length = stream_data->getSize();
878 PointerHolder<InputSource> input = 924 PointerHolder<InputSource> input =
qpdf/qpdf.testcov
@@ -277,7 +277,6 @@ QPDFObjectHandle found fake 1 @@ -277,7 +277,6 @@ QPDFObjectHandle found fake 1
277 QPDFObjectHandle no val for last key 0 277 QPDFObjectHandle no val for last key 0
278 QPDF resolve failure to null 0 278 QPDF resolve failure to null 0
279 QPDFWriter preserve unreferenced standard 0 279 QPDFWriter preserve unreferenced standard 0
280 -QPDFObjectHandle non-stream in parsecontent 0  
281 QPDFObjectHandle errors in parsecontent 0 280 QPDFObjectHandle errors in parsecontent 0
282 QPDF stream with non-space 0 281 QPDF stream with non-space 0
283 qpdf same file error 0 282 qpdf same file error 0
@@ -304,3 +303,4 @@ QPDF_Stream TIFF predictor 0 @@ -304,3 +303,4 @@ QPDF_Stream TIFF predictor 0
304 QPDFTokenizer EOF when not allowed 0 303 QPDFTokenizer EOF when not allowed 0
305 QPDFTokenizer inline image at EOF 0 304 QPDFTokenizer inline image at EOF 0
306 Pl_QPDFTokenizer found ID 0 305 Pl_QPDFTokenizer found ID 0
  306 +QPDFObjectHandle non-stream in stream array 0
qpdf/qtest/qpdf/split-content-stream-errors.out
@@ -4,6 +4,6 @@ File is not encrypted @@ -4,6 +4,6 @@ File is not encrypted
4 File is not linearized 4 File is not linearized
5 WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received 5 WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received
6 WARNING: split-content-stream-errors.pdf (file position 557): stream will be re-processed without filtering to avoid data loss 6 WARNING: split-content-stream-errors.pdf (file position 557): stream will be re-processed without filtering to avoid data loss
7 -WARNING: content stream: ignoring non-stream while parsing content streams 7 +WARNING: content stream objects (item index 0 (from 0)): ignoring non-stream in an array of streams
8 WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received 8 WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received
9 WARNING: content stream (content stream object 6 0): errors while decoding content stream 9 WARNING: content stream (content stream object 6 0): errors while decoding content stream