Commit 7f8892525f897b17049f9e59bc4ce8ac28c9e082

Authored by Jay Berkenbilt
1 parent 428d96df

Add precheck streams capability

When requested, QPDFWriter will do more aggressive prechecking of streams
to make sure it can actually succeed in decoding them before
attempting to do so. This will allow preservation of raw data even
when the raw data is corrupted relative to the specified filters.
ChangeLog
1 2017-07-27 Jay Berkenbilt <ejb@ql.org> 1 2017-07-27 Jay Berkenbilt <ejb@ql.org>
2 2
  3 + * Add --precheck-streams command-line option and setStreamPrecheck
  4 + option to QPDFWriter to tell QPDFWriter to attempt decoding a
  5 + stream fully before deciding whether to filter it or not.
  6 +
3 * Recover gracefully from streams that aren't filterable because 7 * Recover gracefully from streams that aren't filterable because
4 the filter parameters are invalid in the stream dictionary or the 8 the filter parameters are invalid in the stream dictionary or the
5 dictionary itself is invalid. 9 dictionary itself is invalid.
include/qpdf/QPDF.hh
@@ -540,13 +540,14 @@ class QPDF @@ -540,13 +540,14 @@ class QPDF
540 { 540 {
541 friend class QPDF_Stream; 541 friend class QPDF_Stream;
542 private: 542 private:
543 - static void pipeStreamData(QPDF* qpdf, int objid, int generation, 543 + static bool pipeStreamData(QPDF* qpdf, int objid, int generation,
544 qpdf_offset_t offset, size_t length, 544 qpdf_offset_t offset, size_t length,
545 QPDFObjectHandle dict, 545 QPDFObjectHandle dict,
546 - Pipeline* pipeline) 546 + Pipeline* pipeline, bool suppress_warnings)
547 { 547 {
548 - qpdf->pipeStreamData(  
549 - objid, generation, offset, length, dict, pipeline); 548 + return qpdf->pipeStreamData(
  549 + objid, generation, offset, length, dict, pipeline,
  550 + suppress_warnings);
550 } 551 }
551 }; 552 };
552 friend class Pipe; 553 friend class Pipe;
@@ -666,10 +667,11 @@ class QPDF @@ -666,10 +667,11 @@ class QPDF
666 void findAttachmentStreams(); 667 void findAttachmentStreams();
667 668
668 // Calls finish() on the pipeline when done but does not delete it 669 // Calls finish() on the pipeline when done but does not delete it
669 - void pipeStreamData(int objid, int generation, 670 + bool pipeStreamData(int objid, int generation,
670 qpdf_offset_t offset, size_t length, 671 qpdf_offset_t offset, size_t length,
671 QPDFObjectHandle dict, 672 QPDFObjectHandle dict,
672 - Pipeline* pipeline); 673 + Pipeline* pipeline,
  674 + bool suppress_warnings);
673 675
674 // For QPDFWriter: 676 // For QPDFWriter:
675 677
include/qpdf/QPDFObjectHandle.hh
@@ -394,7 +394,8 @@ class QPDFObjectHandle @@ -394,7 +394,8 @@ class QPDFObjectHandle
394 // replaced if writing a new stream object. 394 // replaced if writing a new stream object.
395 QPDF_DLL 395 QPDF_DLL
396 bool pipeStreamData(Pipeline*, bool filter, 396 bool pipeStreamData(Pipeline*, bool filter,
397 - bool normalize, bool compress); 397 + bool normalize, bool compress,
  398 + bool suppress_warnings = false);
398 399
399 // Replace a stream's dictionary. The new dictionary must be 400 // Replace a stream's dictionary. The new dictionary must be
400 // consistent with the stream's data. This is most appropriately 401 // consistent with the stream's data. This is most appropriately
include/qpdf/QPDFWriter.hh
@@ -144,6 +144,17 @@ class QPDFWriter @@ -144,6 +144,17 @@ class QPDFWriter
144 QPDF_DLL 144 QPDF_DLL
145 void setQDFMode(bool); 145 void setQDFMode(bool);
146 146
  147 + // Enable stream precheck mode. In this mode, all filterable
  148 + // streams are checked by actually attempting to decode them
  149 + // before filtering. This may add significant time to the process
  150 + // of writing the data because all streams from the input must be
  151 + // read twice, but it enables the raw stream data to be preserved
  152 + // even in cases where qpdf would run into errors decoding the
  153 + // stream after it determines that it should be able to do it.
  154 + // Examples would include compressed data with errors in it.
  155 + QPDF_DLL
  156 + void setPrecheckStreams(bool);
  157 +
147 // Set the minimum PDF version. If the PDF version of the input 158 // Set the minimum PDF version. If the PDF version of the input
148 // file (or previously set minimum version) is less than the 159 // file (or previously set minimum version) is less than the
149 // version passed to this method, the PDF version of the output 160 // version passed to this method, the PDF version of the output
@@ -415,6 +426,7 @@ class QPDFWriter @@ -415,6 +426,7 @@ class QPDFWriter
415 bool stream_data_mode_set; 426 bool stream_data_mode_set;
416 qpdf_stream_data_e stream_data_mode; 427 qpdf_stream_data_e stream_data_mode;
417 bool qdf_mode; 428 bool qdf_mode;
  429 + bool precheck_streams;
418 bool static_id; 430 bool static_id;
419 bool suppress_original_object_ids; 431 bool suppress_original_object_ids;
420 bool direct_stream_lengths; 432 bool direct_stream_lengths;
libqpdf/QPDF.cc
@@ -2134,12 +2134,14 @@ QPDF::getCompressibleObjGens() @@ -2134,12 +2134,14 @@ QPDF::getCompressibleObjGens()
2134 return result; 2134 return result;
2135 } 2135 }
2136 2136
2137 -void 2137 +bool
2138 QPDF::pipeStreamData(int objid, int generation, 2138 QPDF::pipeStreamData(int objid, int generation,
2139 qpdf_offset_t offset, size_t length, 2139 qpdf_offset_t offset, size_t length,
2140 QPDFObjectHandle stream_dict, 2140 QPDFObjectHandle stream_dict,
2141 - Pipeline* pipeline) 2141 + Pipeline* pipeline,
  2142 + bool suppress_warnings)
2142 { 2143 {
  2144 + bool success = false;
2143 std::vector<PointerHolder<Pipeline> > to_delete; 2145 std::vector<PointerHolder<Pipeline> > to_delete;
2144 if (this->encrypted) 2146 if (this->encrypted)
2145 { 2147 {
@@ -2165,21 +2167,29 @@ QPDF::pipeStreamData(int objid, int generation, @@ -2165,21 +2167,29 @@ QPDF::pipeStreamData(int objid, int generation,
2165 length -= len; 2167 length -= len;
2166 pipeline->write(QUtil::unsigned_char_pointer(buf), len); 2168 pipeline->write(QUtil::unsigned_char_pointer(buf), len);
2167 } 2169 }
  2170 + success = true;
2168 } 2171 }
2169 catch (QPDFExc& e) 2172 catch (QPDFExc& e)
2170 { 2173 {
2171 - warn(e); 2174 + if (! suppress_warnings)
  2175 + {
  2176 + warn(e);
  2177 + }
2172 } 2178 }
2173 catch (std::runtime_error& e) 2179 catch (std::runtime_error& e)
2174 { 2180 {
2175 - QTC::TC("qpdf", "QPDF decoding error warning");  
2176 - warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(),  
2177 - "", this->file->getLastOffset(),  
2178 - "error decoding stream data for object " +  
2179 - QUtil::int_to_string(objid) + " " +  
2180 - QUtil::int_to_string(generation) + ": " + e.what())); 2181 + if (! suppress_warnings)
  2182 + {
  2183 + QTC::TC("qpdf", "QPDF decoding error warning");
  2184 + warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(),
  2185 + "", this->file->getLastOffset(),
  2186 + "error decoding stream data for object " +
  2187 + QUtil::int_to_string(objid) + " " +
  2188 + QUtil::int_to_string(generation) + ": " + e.what()));
  2189 + }
2181 } 2190 }
2182 pipeline->finish(); 2191 pipeline->finish();
  2192 + return success;
2183 } 2193 }
2184 2194
2185 void 2195 void
libqpdf/QPDFObjectHandle.cc
@@ -496,11 +496,12 @@ QPDFObjectHandle::getRawStreamData() @@ -496,11 +496,12 @@ QPDFObjectHandle::getRawStreamData()
496 496
497 bool 497 bool
498 QPDFObjectHandle::pipeStreamData(Pipeline* p, bool filter, 498 QPDFObjectHandle::pipeStreamData(Pipeline* p, bool filter,
499 - bool normalize, bool compress) 499 + bool normalize, bool compress,
  500 + bool suppress_warnings)
500 { 501 {
501 assertStream(); 502 assertStream();
502 return dynamic_cast<QPDF_Stream*>(obj.getPointer())->pipeStreamData( 503 return dynamic_cast<QPDF_Stream*>(obj.getPointer())->pipeStreamData(
503 - p, filter, normalize, compress); 504 + p, filter, normalize, compress, suppress_warnings);
504 } 505 }
505 506
506 void 507 void
libqpdf/QPDFWriter.cc
@@ -57,6 +57,7 @@ QPDFWriter::init() @@ -57,6 +57,7 @@ QPDFWriter::init()
57 stream_data_mode_set = false; 57 stream_data_mode_set = false;
58 stream_data_mode = qpdf_s_compress; 58 stream_data_mode = qpdf_s_compress;
59 qdf_mode = false; 59 qdf_mode = false;
  60 + precheck_streams = false;
60 static_id = false; 61 static_id = false;
61 suppress_original_object_ids = false; 62 suppress_original_object_ids = false;
62 direct_stream_lengths = true; 63 direct_stream_lengths = true;
@@ -177,6 +178,12 @@ QPDFWriter::setQDFMode(bool val) @@ -177,6 +178,12 @@ QPDFWriter::setQDFMode(bool val)
177 } 178 }
178 179
179 void 180 void
  181 +QPDFWriter::setPrecheckStreams(bool val)
  182 +{
  183 + this->precheck_streams = val;
  184 +}
  185 +
  186 +void
180 QPDFWriter::setMinimumPDFVersion(std::string const& version) 187 QPDFWriter::setMinimumPDFVersion(std::string const& version)
181 { 188 {
182 setMinimumPDFVersion(version, 0); 189 setMinimumPDFVersion(version, 0);
@@ -1522,6 +1529,21 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level, @@ -1522,6 +1529,21 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level,
1522 1529
1523 flags |= f_stream; 1530 flags |= f_stream;
1524 1531
  1532 + if (filter && this->precheck_streams)
  1533 + {
  1534 + try
  1535 + {
  1536 + QTC::TC("qpdf", "QPDFWriter precheck stream");
  1537 + Pl_Discard discard;
  1538 + filter = object.pipeStreamData(
  1539 + &discard, true, false, false, true);
  1540 + }
  1541 + catch (std::exception)
  1542 + {
  1543 + filter = false;
  1544 + }
  1545 + }
  1546 +
1525 pushPipeline(new Pl_Buffer("stream data")); 1547 pushPipeline(new Pl_Buffer("stream data"));
1526 activatePipelineStack(); 1548 activatePipelineStack();
1527 bool filtered = 1549 bool filtered =
libqpdf/QPDF_Stream.cc
@@ -85,7 +85,7 @@ PointerHolder<Buffer> @@ -85,7 +85,7 @@ PointerHolder<Buffer>
85 QPDF_Stream::getStreamData() 85 QPDF_Stream::getStreamData()
86 { 86 {
87 Pl_Buffer buf("stream data buffer"); 87 Pl_Buffer buf("stream data buffer");
88 - if (! pipeStreamData(&buf, true, false, false)) 88 + if (! pipeStreamData(&buf, true, false, false, false))
89 { 89 {
90 throw std::logic_error("getStreamData called on unfilterable stream"); 90 throw std::logic_error("getStreamData called on unfilterable stream");
91 } 91 }
@@ -97,7 +97,7 @@ PointerHolder<Buffer> @@ -97,7 +97,7 @@ PointerHolder<Buffer>
97 QPDF_Stream::getRawStreamData() 97 QPDF_Stream::getRawStreamData()
98 { 98 {
99 Pl_Buffer buf("stream data buffer"); 99 Pl_Buffer buf("stream data buffer");
100 - pipeStreamData(&buf, false, false, false); 100 + pipeStreamData(&buf, false, false, false, false);
101 QTC::TC("qpdf", "QPDF_Stream getRawStreamData"); 101 QTC::TC("qpdf", "QPDF_Stream getRawStreamData");
102 return buf.getBuffer(); 102 return buf.getBuffer();
103 } 103 }
@@ -351,7 +351,8 @@ QPDF_Stream::filterable(std::vector<std::string>& filters, @@ -351,7 +351,8 @@ QPDF_Stream::filterable(std::vector<std::string>& filters,
351 351
352 bool 352 bool
353 QPDF_Stream::pipeStreamData(Pipeline* pipeline, bool filter, 353 QPDF_Stream::pipeStreamData(Pipeline* pipeline, bool filter,
354 - bool normalize, bool compress) 354 + bool normalize, bool compress,
  355 + bool suppress_warnings)
355 { 356 {
356 std::vector<std::string> filters; 357 std::vector<std::string> filters;
357 int predictor = 1; 358 int predictor = 1;
@@ -487,9 +488,13 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline, bool filter, @@ -487,9 +488,13 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline, bool filter,
487 else 488 else
488 { 489 {
489 QTC::TC("qpdf", "QPDF_Stream pipe original stream data"); 490 QTC::TC("qpdf", "QPDF_Stream pipe original stream data");
490 - QPDF::Pipe::pipeStreamData(this->qpdf, this->objid, this->generation,  
491 - this->offset, this->length,  
492 - this->stream_dict, pipeline); 491 + if (! QPDF::Pipe::pipeStreamData(this->qpdf, this->objid, this->generation,
  492 + this->offset, this->length,
  493 + this->stream_dict, pipeline,
  494 + suppress_warnings))
  495 + {
  496 + filter = false;
  497 + }
493 } 498 }
494 499
495 return filter; 500 return filter;
libqpdf/qpdf/QPDF_Stream.hh
@@ -23,7 +23,8 @@ class QPDF_Stream: public QPDFObject @@ -23,7 +23,8 @@ class QPDF_Stream: public QPDFObject
23 23
24 // See comments in QPDFObjectHandle.hh for these methods. 24 // See comments in QPDFObjectHandle.hh for these methods.
25 bool pipeStreamData(Pipeline*, bool filter, 25 bool pipeStreamData(Pipeline*, bool filter,
26 - bool normalize, bool compress); 26 + bool normalize, bool compress,
  27 + bool suppress_warnings);
27 PointerHolder<Buffer> getStreamData(); 28 PointerHolder<Buffer> getStreamData();
28 PointerHolder<Buffer> getRawStreamData(); 29 PointerHolder<Buffer> getRawStreamData();
29 void replaceStreamData(PointerHolder<Buffer> data, 30 void replaceStreamData(PointerHolder<Buffer> data,
manual/qpdf-manual.xml
@@ -822,6 +822,23 @@ outfile.pdf</option> @@ -822,6 +822,23 @@ outfile.pdf</option>
822 </listitem> 822 </listitem>
823 </varlistentry> 823 </varlistentry>
824 <varlistentry> 824 <varlistentry>
  825 + <term><option>--precheck-streams</option></term>
  826 + <listitem>
  827 + <para>
  828 + Tells qpdf to precheck each stream for the ability to decode
  829 + it. Ordinarily qpdf tries to decode streams that it thinks it
  830 + can decode based on the filters, and if there ends up being an
  831 + error when actually trying to do the decode, the stream data
  832 + is truncated. This flag causes qpdf to actually read the
  833 + stream fully before deciding whether to filter the stream.
  834 + This option will slow qpdf down since it will have to read the
  835 + stream twice, but it allows raw stream data to be preserved in
  836 + cases where the decoding of the stream would fail for some
  837 + reason. This may be useful in working with some damaged files.
  838 + </para>
  839 + </listitem>
  840 + </varlistentry>
  841 + <varlistentry>
825 <term><option>--qdf</option></term> 842 <term><option>--qdf</option></term>
826 <listitem> 843 <listitem>
827 <para> 844 <para>
qpdf/qpdf.cc
@@ -202,6 +202,7 @@ familiar with the PDF file format or who are PDF developers.\n\ @@ -202,6 +202,7 @@ familiar with the PDF file format or who are PDF developers.\n\
202 --suppress-recovery prevents qpdf from attempting to recover damaged files\n\ 202 --suppress-recovery prevents qpdf from attempting to recover damaged files\n\
203 --object-streams=mode controls handing of object streams\n\ 203 --object-streams=mode controls handing of object streams\n\
204 --ignore-xref-streams tells qpdf to ignore any cross-reference streams\n\ 204 --ignore-xref-streams tells qpdf to ignore any cross-reference streams\n\
  205 +--precheck-streams precheck ability to decode streams\n\
205 --qdf turns on \"QDF mode\" (below)\n\ 206 --qdf turns on \"QDF mode\" (below)\n\
206 --min-version=version sets the minimum PDF version of the output file\n\ 207 --min-version=version sets the minimum PDF version of the output file\n\
207 --force-version=version forces this to be the PDF version of the output file\n\ 208 --force-version=version forces this to be the PDF version of the output file\n\
@@ -1028,6 +1029,7 @@ int main(int argc, char* argv[]) @@ -1028,6 +1029,7 @@ int main(int argc, char* argv[])
1028 qpdf_object_stream_e object_stream_mode = qpdf_o_preserve; 1029 qpdf_object_stream_e object_stream_mode = qpdf_o_preserve;
1029 bool ignore_xref_streams = false; 1030 bool ignore_xref_streams = false;
1030 bool qdf_mode = false; 1031 bool qdf_mode = false;
  1032 + bool precheck_streams = false;
1031 std::string min_version; 1033 std::string min_version;
1032 std::string force_version; 1034 std::string force_version;
1033 1035
@@ -1213,6 +1215,10 @@ int main(int argc, char* argv[]) @@ -1213,6 +1215,10 @@ int main(int argc, char* argv[])
1213 { 1215 {
1214 qdf_mode = true; 1216 qdf_mode = true;
1215 } 1217 }
  1218 + else if (strcmp(arg, "precheck-streams") == 0)
  1219 + {
  1220 + precheck_streams = true;
  1221 + }
1216 else if (strcmp(arg, "min-version") == 0) 1222 else if (strcmp(arg, "min-version") == 0)
1217 { 1223 {
1218 if (parameter == 0) 1224 if (parameter == 0)
@@ -1704,6 +1710,10 @@ int main(int argc, char* argv[]) @@ -1704,6 +1710,10 @@ int main(int argc, char* argv[])
1704 { 1710 {
1705 w.setQDFMode(true); 1711 w.setQDFMode(true);
1706 } 1712 }
  1713 + if (precheck_streams)
  1714 + {
  1715 + w.setPrecheckStreams(true);
  1716 + }
1707 if (normalize_set) 1717 if (normalize_set)
1708 { 1718 {
1709 w.setContentNormalization(normalize); 1719 w.setContentNormalization(normalize);
qpdf/qpdf.testcov
@@ -279,3 +279,4 @@ QPDFObjectHandle treat word as string 0 @@ -279,3 +279,4 @@ QPDFObjectHandle treat word as string 0
279 QPDFObjectHandle found fake 1 279 QPDFObjectHandle found fake 1
280 QPDFObjectHandle no val for last key 0 280 QPDFObjectHandle no val for last key 0
281 QPDF resolve failure to null 0 281 QPDF resolve failure to null 0
  282 +QPDFWriter precheck stream 0
qpdf/qtest/qpdf.test
@@ -723,6 +723,26 @@ $td->runtest("check output", @@ -723,6 +723,26 @@ $td->runtest("check output",
723 {$td->FILE => "from-scratch-0.pdf"}); 723 {$td->FILE => "from-scratch-0.pdf"});
724 show_ntests(); 724 show_ntests();
725 # ---------- 725 # ----------
  726 +$td->notify("--- Precheck streams ---");
  727 +$n_tests += 4;
  728 +
  729 +$td->runtest("bad stream without precheck",
  730 + {$td->COMMAND => "qpdf --static-id bad-data.pdf a.pdf"},
  731 + {$td->FILE => "bad-data.out", $td->EXIT_STATUS => 3},
  732 + $td->NORMALIZE_NEWLINES);
  733 +$td->runtest("check output",
  734 + {$td->FILE => "a.pdf"},
  735 + {$td->FILE => "bad-data-out.pdf"});
  736 +$td->runtest("bad stream with precheck",
  737 + {$td->COMMAND =>
  738 + "qpdf --static-id --precheck-streams bad-data.pdf a.pdf"},
  739 + {$td->STRING => "", $td->EXIT_STATUS => 0},
  740 + $td->NORMALIZE_NEWLINES);
  741 +$td->runtest("check output",
  742 + {$td->FILE => "a.pdf"},
  743 + {$td->FILE => "bad-data-precheck.pdf"});
  744 +show_ntests();
  745 +# ----------
726 $td->notify("--- Copy Foreign Objects ---"); 746 $td->notify("--- Copy Foreign Objects ---");
727 $n_tests += 7; 747 $n_tests += 7;
728 748
qpdf/qtest/qpdf/bad-data-out.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/bad-data-precheck.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/bad-data.out 0 → 100644
  1 +WARNING: bad-data.pdf (file position 319): error decoding stream data for object 4 0: LZWDecoder: bad code received
  2 +qpdf: operation succeeded with warnings; resulting file may have some problems
qpdf/qtest/qpdf/bad-data.pdf 0 → 100644
No preview for this file type