Commit fcd611b61eb6cc352b4e072fc791681ad927aee2

Authored by Jay Berkenbilt
1 parent 05ff619b

Refactor parseContentStream

ChangeLog
... ... @@ -45,6 +45,18 @@
45 45 characters may surround the EI operator that marks the end of an
46 46 inline image.
47 47  
  48 + * New method QPDFObjectHandle::parsePageContents() to improve upon
  49 + QPDFObjectHandle::parseContentStream(). The parseContentStream
  50 + method used to operate on a single content stream, but was fixed
  51 + to properly handle pages with contents split across multiple
  52 + streams in an earlier release. The new method parsePageContents()
  53 + can be called on the page object rather than the value of the
  54 + page dictionary's /Contents key. This removes a few lines of
  55 + boiler-plate code from any code that uses parseContentStream, and
  56 + it also enables creation of more helpful error messages if
  57 + problems are encountered as the error messages can include
  58 + information about which page the streams come from.
  59 +
48 60 2018-02-04 Jay Berkenbilt <ejb@ql.org>
49 61  
50 62 * Add QPDFWriter::setLinearizationPass1Filename method and
... ...
include/qpdf/QPDFObjectHandle.hh
... ... @@ -88,7 +88,7 @@ class QPDFObjectHandle
88 88 virtual void decryptString(std::string& val) = 0;
89 89 };
90 90  
91   - // This class is used by parseContentStream. Callers must
  91 + // This class is used by parsePageContents. Callers must
92 92 // instantiate a subclass of this with handlers defined to accept
93 93 // QPDFObjectHandles that are parsed from the stream.
94 94 class ParserCallbacks
... ... @@ -103,8 +103,8 @@ class QPDFObjectHandle
103 103  
104 104 protected:
105 105 // Implementors may call this method during parsing to
106   - // terminate parsing early. This method throws an exception
107   - // that is caught by parseContentStream, so its effect is
  106 + // terminate parsing early. This method throws an exception
  107 + // that is caught by parsePageContents, so its effect is
108 108 // immediate.
109 109 QPDF_DLL
110 110 void terminateParsing();
... ... @@ -187,6 +187,24 @@ class QPDFObjectHandle
187 187 QPDF* context);
188 188  
189 189 // Helpers for parsing content streams
  190 +
  191 + // Parse a page's contents through ParserCallbacks, described
  192 + // above. This method works whether the contents are a single
  193 + // stream or an array of streams. Call on a page object.
  194 + QPDF_DLL
  195 + void parsePageContents(ParserCallbacks* callbacks);
  196 +
  197 + // Pipe a page's contents through the given pipeline. This method
  198 + // works whether the contents are a single stream or an array of
  199 + // streams. Call on a page object.
  200 + QPDF_DLL
  201 + void pipePageContents(Pipeline* p);
  202 +
  203 + // Older method: stream_or_array should be the value of /Contents
  204 + // from a page object. It's more convenient to just call
  205 + // parsePageContents on the page object, and error messages will
  206 + // also be more useful because the page object information will be
  207 + // known.
190 208 QPDF_DLL
191 209 static void parseContentStream(QPDFObjectHandle stream_or_array,
192 210 ParserCallbacks* callbacks);
... ... @@ -697,12 +715,17 @@ class QPDFObjectHandle
697 715 QPDFTokenizer& tokenizer, bool& empty,
698 716 StringDecrypter* decrypter, QPDF* context,
699 717 bool content_stream);
700   - static void parseContentStream_internal(
701   - PointerHolder<Buffer> stream_data,
  718 + void parseContentStream_internal(
702 719 std::string const& description,
703 720 ParserCallbacks* callbacks);
704   -
705   - // Other methods
  721 + static void parseContentStream_data(
  722 + PointerHolder<Buffer>,
  723 + std::string const& description,
  724 + ParserCallbacks* callbacks);
  725 + std::vector<QPDFObjectHandle> arrayOrStreamToStreamArray(
  726 + std::string const& description, std::string& all_description);
  727 + void pipeContentStreams(Pipeline* p, std::string const& description,
  728 + std::string& all_description);
706 729 static void warn(QPDF*, QPDFExc const&);
707 730  
708 731 bool initialized;
... ...
libqpdf/QPDFObjectHandle.cc
... ... @@ -628,44 +628,78 @@ QPDFObjectHandle::getPageImages()
628 628 }
629 629  
630 630 std::vector<QPDFObjectHandle>
631   -QPDFObjectHandle::getPageContents()
  631 +QPDFObjectHandle::arrayOrStreamToStreamArray(
  632 + std::string const& description, std::string& all_description)
632 633 {
633   - assertPageObject();
634   -
  634 + all_description = description;
635 635 std::vector<QPDFObjectHandle> result;
636   - QPDFObjectHandle contents = this->getKey("/Contents");
637   - if (contents.isArray())
  636 + if (isArray())
638 637 {
639   - int n_items = contents.getArrayNItems();
  638 + int n_items = getArrayNItems();
640 639 for (int i = 0; i < n_items; ++i)
641 640 {
642   - QPDFObjectHandle item = contents.getArrayItem(i);
  641 + QPDFObjectHandle item = getArrayItem(i);
643 642 if (item.isStream())
  643 + {
  644 + result.push_back(item);
  645 + }
  646 + else
644 647 {
645   - result.push_back(item);
646   - }
647   - else
648   - {
649   - throw std::runtime_error(
650   - "unknown item type while inspecting "
651   - "element of /Contents array in page "
652   - "dictionary");
  648 + QTC::TC("qpdf", "QPDFObjectHandle non-stream in stream array");
  649 + warn(item.getOwningQPDF(),
  650 + QPDFExc(qpdf_e_damaged_pdf, description,
  651 + "item index " + QUtil::int_to_string(i) +
  652 + " (from 0)", 0,
  653 + "ignoring non-stream in an array of streams"));
653 654 }
654 655 }
655 656 }
656   - else if (contents.isStream())
  657 + else if (isStream())
  658 + {
  659 + result.push_back(*this);
  660 + }
  661 + else if (! isNull())
657 662 {
658   - result.push_back(contents);
  663 + warn(getOwningQPDF(),
  664 + QPDFExc(qpdf_e_damaged_pdf, "", description, 0,
  665 + " object is supposed to be a stream or an"
  666 + " array of streams but is neither"));
659 667 }
660   - else if (! contents.isNull())
  668 +
  669 + bool first = true;
  670 + for (std::vector<QPDFObjectHandle>::iterator iter = result.begin();
  671 + iter != result.end(); ++iter)
661 672 {
662   - throw std::runtime_error("unknown object type inspecting /Contents "
663   - "key in page dictionary");
  673 + QPDFObjectHandle item = *iter;
  674 + std::string og =
  675 + QUtil::int_to_string(item.getObjectID()) + " " +
  676 + QUtil::int_to_string(item.getGeneration());
  677 + if (first)
  678 + {
  679 + first = false;
  680 + }
  681 + else
  682 + {
  683 + all_description += ",";
  684 + }
  685 + all_description += " stream " + og;
664 686 }
665 687  
666 688 return result;
667 689 }
668 690  
  691 +std::vector<QPDFObjectHandle>
  692 +QPDFObjectHandle::getPageContents()
  693 +{
  694 + assertPageObject();
  695 + std::string description = "page object " +
  696 + QUtil::int_to_string(this->objid) + " " +
  697 + QUtil::int_to_string(this->generation);
  698 + std::string all_description;
  699 + return this->getKey("/Contents").arrayOrStreamToStreamArray(
  700 + description, all_description);
  701 +}
  702 +
669 703 void
670 704 QPDFObjectHandle::addPageContents(QPDFObjectHandle new_contents, bool first)
671 705 {
... ... @@ -806,61 +840,72 @@ QPDFObjectHandle::parse(std::string const& object_str,
806 840 }
807 841  
808 842 void
809   -QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
810   - ParserCallbacks* callbacks)
  843 +QPDFObjectHandle::pipePageContents(Pipeline* p)
811 844 {
812   - std::vector<QPDFObjectHandle> streams;
813   - if (stream_or_array.isArray())
814   - {
815   - streams = stream_or_array.getArrayAsVector();
816   - }
817   - else
818   - {
819   - streams.push_back(stream_or_array);
820   - }
821   - Pl_Buffer buf("concatenated stream data buffer");
822   - std::string all_description = "content stream objects";
823   - bool first = true;
  845 + std::string description = "page object " +
  846 + QUtil::int_to_string(this->objid) + " " +
  847 + QUtil::int_to_string(this->generation);
  848 + std::string all_description;
  849 + this->getKey("/Contents").pipeContentStreams(
  850 + p, description, all_description);
  851 +}
  852 +
  853 +void
  854 +QPDFObjectHandle::pipeContentStreams(
  855 + Pipeline* p, std::string const& description, std::string& all_description)
  856 +{
  857 + std::vector<QPDFObjectHandle> streams =
  858 + arrayOrStreamToStreamArray(
  859 + description, all_description);
824 860 for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin();
825 861 iter != streams.end(); ++iter)
826 862 {
827 863 QPDFObjectHandle stream = *iter;
828   - if (! stream.isStream())
  864 + std::string og =
  865 + QUtil::int_to_string(stream.getObjectID()) + " " +
  866 + QUtil::int_to_string(stream.getGeneration());
  867 + std::string description = "content stream object " + og;
  868 + if (! stream.pipeStreamData(p, 0, qpdf_dl_specialized))
829 869 {
830   - QTC::TC("qpdf", "QPDFObjectHandle non-stream in parsecontent");
  870 + QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent");
831 871 warn(stream.getOwningQPDF(),
832 872 QPDFExc(qpdf_e_damaged_pdf, "content stream",
833   - "", 0,
834   - "ignoring non-stream while parsing content streams"));
835   - }
836   - else
837   - {
838   - std::string og = QUtil::int_to_string(stream.getObjectID()) + " " +
839   - QUtil::int_to_string(stream.getGeneration());
840   - std::string description = "content stream object " + og;
841   - if (first)
842   - {
843   - first = false;
844   - }
845   - else
846   - {
847   - all_description += ",";
848   - }
849   - all_description += " " + og;
850   - if (! stream.pipeStreamData(&buf, 0, qpdf_dl_specialized))
851   - {
852   - QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent");
853   - warn(stream.getOwningQPDF(),
854   - QPDFExc(qpdf_e_damaged_pdf, "content stream",
855   - description, 0,
856   - "errors while decoding content stream"));
857   - }
  873 + description, 0,
  874 + "errors while decoding content stream"));
858 875 }
859 876 }
  877 +}
  878 +
  879 +void
  880 +QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks)
  881 +{
  882 + std::string description = "page object " +
  883 + QUtil::int_to_string(this->objid) + " " +
  884 + QUtil::int_to_string(this->generation);
  885 + this->getKey("/Contents").parseContentStream_internal(
  886 + description, callbacks);
  887 +}
  888 +
  889 +void
  890 +QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
  891 + ParserCallbacks* callbacks)
  892 +{
  893 + stream_or_array.parseContentStream_internal(
  894 + "content stream objects", callbacks);
  895 +}
  896 +
  897 +void
  898 +QPDFObjectHandle::parseContentStream_internal(
  899 + std::string const& description,
  900 + ParserCallbacks* callbacks)
  901 +{
  902 + Pl_Buffer buf("concatenated stream data buffer");
  903 + std::string all_description;
  904 + pipeContentStreams(&buf, description, all_description);
860 905 PointerHolder<Buffer> stream_data = buf.getBuffer();
861 906 try
862 907 {
863   - parseContentStream_internal(stream_data, all_description, callbacks);
  908 + parseContentStream_data(stream_data, all_description, callbacks);
864 909 }
865 910 catch (TerminateParsing&)
866 911 {
... ... @@ -870,9 +915,10 @@ QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
870 915 }
871 916  
872 917 void
873   -QPDFObjectHandle::parseContentStream_internal(PointerHolder<Buffer> stream_data,
874   - std::string const& description,
875   - ParserCallbacks* callbacks)
  918 +QPDFObjectHandle::parseContentStream_data(
  919 + PointerHolder<Buffer> stream_data,
  920 + std::string const& description,
  921 + ParserCallbacks* callbacks)
876 922 {
877 923 size_t length = stream_data->getSize();
878 924 PointerHolder<InputSource> input =
... ...
qpdf/qpdf.testcov
... ... @@ -277,7 +277,6 @@ QPDFObjectHandle found fake 1
277 277 QPDFObjectHandle no val for last key 0
278 278 QPDF resolve failure to null 0
279 279 QPDFWriter preserve unreferenced standard 0
280   -QPDFObjectHandle non-stream in parsecontent 0
281 280 QPDFObjectHandle errors in parsecontent 0
282 281 QPDF stream with non-space 0
283 282 qpdf same file error 0
... ... @@ -304,3 +303,4 @@ QPDF_Stream TIFF predictor 0
304 303 QPDFTokenizer EOF when not allowed 0
305 304 QPDFTokenizer inline image at EOF 0
306 305 Pl_QPDFTokenizer found ID 0
  306 +QPDFObjectHandle non-stream in stream array 0
... ...
qpdf/qtest/qpdf/split-content-stream-errors.out
... ... @@ -4,6 +4,6 @@ File is not encrypted
4 4 File is not linearized
5 5 WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received
6 6 WARNING: split-content-stream-errors.pdf (file position 557): stream will be re-processed without filtering to avoid data loss
7   -WARNING: content stream: ignoring non-stream while parsing content streams
  7 +WARNING: content stream objects (item index 0 (from 0)): ignoring non-stream in an array of streams
8 8 WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received
9 9 WARNING: content stream (content stream object 6 0): errors while decoding content stream
... ...