Commit b389268f16fcd571bbc57ef848caba25490a1b86

Authored by Jay Berkenbilt
1 parent a1368242

Better handle split content streams (fixes #73)

When parsing content streams, allow content to be split arbitrarily
across stream boundaries.
ChangeLog
  1 +2017-07-29 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Fix content stream parsing to handle cases of structures within
  4 + the stream split across stream boundaries. Fixes #73.
  5 +
1 2017-07-28 Jay Berkenbilt <ejb@ql.org> 6 2017-07-28 Jay Berkenbilt <ejb@ql.org>
2 7
3 * Add --preserve-unreferenced command-line option and 8 * Add --preserve-unreferenced command-line option and
include/qpdf/QPDFObjectHandle.hh
@@ -623,7 +623,9 @@ class QPDFObjectHandle @@ -623,7 +623,9 @@ class QPDFObjectHandle
623 bool in_array, bool in_dictionary, 623 bool in_array, bool in_dictionary,
624 bool content_stream); 624 bool content_stream);
625 static void parseContentStream_internal( 625 static void parseContentStream_internal(
626 - QPDFObjectHandle stream, ParserCallbacks* callbacks); 626 + PointerHolder<Buffer> stream_data,
  627 + std::string const& description,
  628 + ParserCallbacks* callbacks);
627 629
628 // Other methods 630 // Other methods
629 static void warn(QPDF*, QPDFExc const&); 631 static void warn(QPDF*, QPDFExc const&);
libqpdf/QPDFObjectHandle.cc
@@ -13,6 +13,7 @@ @@ -13,6 +13,7 @@
13 #include <qpdf/QPDF_Dictionary.hh> 13 #include <qpdf/QPDF_Dictionary.hh>
14 #include <qpdf/QPDF_Stream.hh> 14 #include <qpdf/QPDF_Stream.hh>
15 #include <qpdf/QPDF_Reserved.hh> 15 #include <qpdf/QPDF_Reserved.hh>
  16 +#include <qpdf/Pl_Buffer.hh>
16 #include <qpdf/BufferInputSource.hh> 17 #include <qpdf/BufferInputSource.hh>
17 #include <qpdf/QPDFExc.hh> 18 #include <qpdf/QPDFExc.hh>
18 19
@@ -739,37 +740,63 @@ QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array, @@ -739,37 +740,63 @@ QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
739 { 740 {
740 streams.push_back(stream_or_array); 741 streams.push_back(stream_or_array);
741 } 742 }
  743 + Pl_Buffer buf("concatenated stream data buffer");
  744 + std::string all_description = "content stream objects";
  745 + bool first = true;
742 for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin(); 746 for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin();
743 iter != streams.end(); ++iter) 747 iter != streams.end(); ++iter)
744 { 748 {
745 QPDFObjectHandle stream = *iter; 749 QPDFObjectHandle stream = *iter;
746 if (! stream.isStream()) 750 if (! stream.isStream())
747 { 751 {
748 - throw std::logic_error(  
749 - "QPDFObjectHandle: parseContentStream called on non-stream"); 752 + QTC::TC("qpdf", "QPDFObjectHandle non-stream in parsecontent");
  753 + warn(stream.getOwningQPDF(),
  754 + QPDFExc(qpdf_e_damaged_pdf, "content stream",
  755 + "", 0,
  756 + "ignoring non-stream while parsing content streams"));
750 } 757 }
751 - try  
752 - {  
753 - parseContentStream_internal(stream, callbacks);  
754 - }  
755 - catch (TerminateParsing&) 758 + else
756 { 759 {
757 - return; 760 + std::string og = QUtil::int_to_string(stream.getObjectID()) + " " +
  761 + QUtil::int_to_string(stream.getGeneration());
  762 + std::string description = "content stream object " + og;
  763 + if (first)
  764 + {
  765 + first = false;
  766 + }
  767 + else
  768 + {
  769 + all_description += ",";
  770 + }
  771 + all_description += " " + og;
  772 + if (! stream.pipeStreamData(&buf, true, false, false, false))
  773 + {
  774 + QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent");
  775 + warn(stream.getOwningQPDF(),
  776 + QPDFExc(qpdf_e_damaged_pdf, "content stream",
  777 + description, 0,
  778 + "errors while decoding content stream"));
  779 + }
758 } 780 }
759 } 781 }
  782 + PointerHolder<Buffer> stream_data = buf.getBuffer();
  783 + try
  784 + {
  785 + parseContentStream_internal(stream_data, all_description, callbacks);
  786 + }
  787 + catch (TerminateParsing&)
  788 + {
  789 + return;
  790 + }
760 callbacks->handleEOF(); 791 callbacks->handleEOF();
761 } 792 }
762 793
763 void 794 void
764 -QPDFObjectHandle::parseContentStream_internal(QPDFObjectHandle stream, 795 +QPDFObjectHandle::parseContentStream_internal(PointerHolder<Buffer> stream_data,
  796 + std::string const& description,
765 ParserCallbacks* callbacks) 797 ParserCallbacks* callbacks)
766 { 798 {
767 - stream.assertStream();  
768 - PointerHolder<Buffer> stream_data = stream.getStreamData();  
769 size_t length = stream_data->getSize(); 799 size_t length = stream_data->getSize();
770 - std::string description = "content stream object " +  
771 - QUtil::int_to_string(stream.getObjectID()) + " " +  
772 - QUtil::int_to_string(stream.getGeneration());  
773 PointerHolder<InputSource> input = 800 PointerHolder<InputSource> input =
774 new BufferInputSource(description, stream_data.getPointer()); 801 new BufferInputSource(description, stream_data.getPointer());
775 QPDFTokenizer tokenizer; 802 QPDFTokenizer tokenizer;
qpdf/qpdf.testcov
@@ -281,3 +281,5 @@ QPDFObjectHandle no val for last key 0 @@ -281,3 +281,5 @@ QPDFObjectHandle no val for last key 0
281 QPDF resolve failure to null 0 281 QPDF resolve failure to null 0
282 QPDFWriter precheck stream 0 282 QPDFWriter precheck stream 0
283 QPDFWriter preserve unreferenced standard 0 283 QPDFWriter preserve unreferenced standard 0
  284 +QPDFObjectHandle non-stream in parsecontent 0
  285 +QPDFObjectHandle errors in parsecontent 0
qpdf/qtest/qpdf.test
@@ -206,7 +206,7 @@ $td->runtest("remove page we don't have", @@ -206,7 +206,7 @@ $td->runtest("remove page we don't have",
206 show_ntests(); 206 show_ntests();
207 # ---------- 207 # ----------
208 $td->notify("--- Miscellaneous Tests ---"); 208 $td->notify("--- Miscellaneous Tests ---");
209 -$n_tests += 86; 209 +$n_tests += 88;
210 210
211 $td->runtest("qpdf version", 211 $td->runtest("qpdf version",
212 {$td->COMMAND => "qpdf --version"}, 212 {$td->COMMAND => "qpdf --version"},
@@ -604,6 +604,20 @@ $td->runtest("no trailing space in xref table", @@ -604,6 +604,20 @@ $td->runtest("no trailing space in xref table",
604 {$td->FILE => "no-space-in-xref.out", $td->EXIT_STATUS => 0}, 604 {$td->FILE => "no-space-in-xref.out", $td->EXIT_STATUS => 0},
605 $td->NORMALIZE_NEWLINES); 605 $td->NORMALIZE_NEWLINES);
606 606
  607 +# An array is split across multiple content streams starting object
  608 +# 42. This was reported in github issue 73. The file is modified from
  609 +# that example.
  610 +$td->runtest("parse split content stream",
  611 + {$td->COMMAND => "qpdf --check split-content-stream.pdf"},
  612 + {$td->FILE => "split-content-stream.out", $td->EXIT_STATUS => 0},
  613 + $td->NORMALIZE_NEWLINES);
  614 +$td->runtest("split content stream errors",
  615 + {$td->COMMAND => "qpdf --check split-content-stream-errors.pdf"},
  616 + {$td->FILE => "split-content-stream-errors.out",
  617 + $td->EXIT_STATUS => 3},
  618 + $td->NORMALIZE_NEWLINES);
  619 +
  620 +
607 show_ntests(); 621 show_ntests();
608 # ---------- 622 # ----------
609 $td->notify("--- Numeric range parsing tests ---"); 623 $td->notify("--- Numeric range parsing tests ---");
qpdf/qtest/qpdf/content-stream-errors.out
@@ -2,6 +2,6 @@ checking content-stream-errors.pdf @@ -2,6 +2,6 @@ checking content-stream-errors.pdf
2 PDF Version: 1.3 2 PDF Version: 1.3
3 File is not encrypted 3 File is not encrypted
4 File is not linearized 4 File is not linearized
5 -page 1: content stream object 7 0 (content, file position 52): parse error while reading object  
6 -page 3: content stream object 15 0 (stream data, file position 117): EOF found while reading inline image  
7 -page 4: content stream object 19 0 (content, file position 53): parse error while reading object 5 +page 1: content stream objects 7 0 (content, file position 52): parse error while reading object
  6 +page 3: content stream objects 15 0 (stream data, file position 117): EOF found while reading inline image
  7 +page 4: content stream objects 19 0 (content, file position 53): parse error while reading object
qpdf/qtest/qpdf/eof-in-inline-image.out
@@ -22,4 +22,4 @@ name: /Fl @@ -22,4 +22,4 @@ name: /Fl
22 name: /DP 22 name: /DP
23 dictionary: << /Columns 1 /Predictor 15 >> 23 dictionary: << /Columns 1 /Predictor 15 >>
24 operator: ID 24 operator: ID
25 -content stream object 4 0 (stream data, file position 139): EOF found while reading inline image 25 +content stream objects 4 0 (stream data, file position 139): EOF found while reading inline image
qpdf/qtest/qpdf/split-content-stream-errors.out 0 → 100644
  1 +WARNING: split-content-stream-errors.pdf: file is damaged
  2 +WARNING: split-content-stream-errors.pdf (file position 802): xref not found
  3 +WARNING: split-content-stream-errors.pdf: Attempting to reconstruct cross-reference table
  4 +checking split-content-stream-errors.pdf
  5 +PDF Version: 1.3
  6 +File is not encrypted
  7 +File is not linearized
  8 +WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received
  9 +WARNING: content stream: ignoring non-stream while parsing content streams
  10 +WARNING: split-content-stream-errors.pdf (file position 557): error decoding stream data for object 6 0: LZWDecoder: bad code received
  11 +WARNING: content stream (content stream object 6 0): errors while decoding content stream
qpdf/qtest/qpdf/split-content-stream-errors.pdf 0 → 100644
  1 +%PDF-1.3
  2 +%¿÷¢þ
  3 +%QDF-1.0
  4 +
  5 +1 0 obj
  6 +<<
  7 + /Pages 2 0 R
  8 + /Type /Catalog
  9 +>>
  10 +endobj
  11 +
  12 +2 0 obj
  13 +<<
  14 + /Count 1
  15 + /Kids [
  16 + 3 0 R
  17 + ]
  18 + /Type /Pages
  19 +>>
  20 +endobj
  21 +
  22 +%% Page 1
  23 +3 0 obj
  24 +<<
  25 + /Contents [
  26 + 4 0 R
  27 + 6 0 R
  28 + ]
  29 + /MediaBox [
  30 + 0
  31 + 0
  32 + 612
  33 + 792
  34 + ]
  35 + /Parent 2 0 R
  36 + /Resources <<
  37 + /Font <<
  38 + /F1 8 0 R
  39 + >>
  40 + /ProcSet 9 0 R
  41 + >>
  42 + /Type /Page
  43 +>>
  44 +endobj
  45 +
  46 +%% Contents for page 1
  47 +4 0 obj
  48 +<<
  49 + /Length 5 0 R
  50 + /Oops (Not a stream)
  51 +>>
  52 +endobj
  53 +
  54 +5 0 obj
  55 +44
  56 +endobj
  57 +
  58 +%% Contents for page 1
  59 +6 0 obj
  60 +<<
  61 + /Length 7 0 R
  62 + /Filter /LZWDecode
  63 +>>
  64 +stream
  65 +BT
  66 + /F1 24 Tf
  67 + 72 720 Td
  68 + (Encoding errors) Tj
  69 +ET
  70 +endstream
  71 +endobj
  72 +
  73 +7 0 obj
  74 +53
  75 +endobj
  76 +
  77 +8 0 obj
  78 +<<
  79 + /BaseFont /Helvetica
  80 + /Encoding /WinAnsiEncoding
  81 + /Name /F1
  82 + /Subtype /Type1
  83 + /Type /Font
  84 +>>
  85 +endobj
  86 +
  87 +9 0 obj
  88 +[
  89 + /PDF
  90 + /Text
  91 +]
  92 +endobj
  93 +
  94 +xref
  95 +0 10
  96 +0000000000 65535 f
  97 +0000000025 00000 n
  98 +0000000079 00000 n
  99 +0000000161 00000 n
  100 +0000000396 00000 n
  101 +0000000457 00000 n
  102 +0000000499 00000 n
  103 +0000000630 00000 n
  104 +0000000649 00000 n
  105 +0000000767 00000 n
  106 +trailer <<
  107 + /Root 1 0 R
  108 + /Size 10
  109 + /ID [<cbdd966f9b7b2bb31ad606c532d7cce5><e5f7cff7a542641606230aadd53106a4>]
  110 +>>
  111 +startxref
  112 +802
  113 +%%EOF
qpdf/qtest/qpdf/split-content-stream.out 0 → 100644
  1 +checking split-content-stream.pdf
  2 +PDF Version: 1.4
  3 +File is not encrypted
  4 +File is not linearized
  5 +No syntax or stream encoding errors found; the file may still contain
  6 +errors that qpdf cannot detect
qpdf/qtest/qpdf/split-content-stream.pdf 0 → 100644
No preview for this file type