Commit 7f8892525f897b17049f9e59bc4ce8ac28c9e082

Authored by Jay Berkenbilt
1 parent 428d96df

Add precheck streams capability

When requested, QPDFWriter will do more aggressive prechecking of streams
to make sure it can actually succeed in decoding them before
attempting to do so. This will allow preservation of raw data even
when the raw data is corrupted relative to the specified filters.
ChangeLog
1 1 2017-07-27 Jay Berkenbilt <ejb@ql.org>
2 2  
  3 + * Add --precheck-streams command-line option and setPrecheckStreams
  4 + option to QPDFWriter to tell QPDFWriter to attempt decoding a
  5 + stream fully before deciding whether to filter it or not.
  6 +
3 7 * Recover gracefully from streams that aren't filterable because
4 8 the filter parameters are invalid in the stream dictionary or the
5 9 dictionary itself is invalid.
... ...
include/qpdf/QPDF.hh
... ... @@ -540,13 +540,14 @@ class QPDF
540 540 {
541 541 friend class QPDF_Stream;
542 542 private:
543   - static void pipeStreamData(QPDF* qpdf, int objid, int generation,
  543 + static bool pipeStreamData(QPDF* qpdf, int objid, int generation,
544 544 qpdf_offset_t offset, size_t length,
545 545 QPDFObjectHandle dict,
546   - Pipeline* pipeline)
  546 + Pipeline* pipeline, bool suppress_warnings)
547 547 {
548   - qpdf->pipeStreamData(
549   - objid, generation, offset, length, dict, pipeline);
  548 + return qpdf->pipeStreamData(
  549 + objid, generation, offset, length, dict, pipeline,
  550 + suppress_warnings);
550 551 }
551 552 };
552 553 friend class Pipe;
... ... @@ -666,10 +667,11 @@ class QPDF
666 667 void findAttachmentStreams();
667 668  
668 669 // Calls finish() on the pipeline when done but does not delete it
669   - void pipeStreamData(int objid, int generation,
  670 + bool pipeStreamData(int objid, int generation,
670 671 qpdf_offset_t offset, size_t length,
671 672 QPDFObjectHandle dict,
672   - Pipeline* pipeline);
  673 + Pipeline* pipeline,
  674 + bool suppress_warnings);
673 675  
674 676 // For QPDFWriter:
675 677  
... ...
include/qpdf/QPDFObjectHandle.hh
... ... @@ -394,7 +394,8 @@ class QPDFObjectHandle
394 394 // replaced if writing a new stream object.
395 395 QPDF_DLL
396 396 bool pipeStreamData(Pipeline*, bool filter,
397   - bool normalize, bool compress);
  397 + bool normalize, bool compress,
  398 + bool suppress_warnings = false);
398 399  
399 400 // Replace a stream's dictionary. The new dictionary must be
400 401 // consistent with the stream's data. This is most appropriately
... ...
include/qpdf/QPDFWriter.hh
... ... @@ -144,6 +144,17 @@ class QPDFWriter
144 144 QPDF_DLL
145 145 void setQDFMode(bool);
146 146  
  147 + // Enable stream precheck mode. In this mode, all filterable
  148 + // streams are checked by actually attempting to decode them
  149 + // before filtering. This may add significant time to the process
  150 + // of writing the data because all streams from the input must be
  151 + // read twice, but it enables the raw stream data to be preserved
  152 + // even in cases where qpdf would run into errors decoding the
  153 + // stream after it determines that it should be able to do it.
  154 + // Examples would include compressed data with errors in it.
  155 + QPDF_DLL
  156 + void setPrecheckStreams(bool);
  157 +
147 158 // Set the minimum PDF version. If the PDF version of the input
148 159 // file (or previously set minimum version) is less than the
149 160 // version passed to this method, the PDF version of the output
... ... @@ -415,6 +426,7 @@ class QPDFWriter
415 426 bool stream_data_mode_set;
416 427 qpdf_stream_data_e stream_data_mode;
417 428 bool qdf_mode;
  429 + bool precheck_streams;
418 430 bool static_id;
419 431 bool suppress_original_object_ids;
420 432 bool direct_stream_lengths;
... ...
libqpdf/QPDF.cc
... ... @@ -2134,12 +2134,14 @@ QPDF::getCompressibleObjGens()
2134 2134 return result;
2135 2135 }
2136 2136  
2137   -void
  2137 +bool
2138 2138 QPDF::pipeStreamData(int objid, int generation,
2139 2139 qpdf_offset_t offset, size_t length,
2140 2140 QPDFObjectHandle stream_dict,
2141   - Pipeline* pipeline)
  2141 + Pipeline* pipeline,
  2142 + bool suppress_warnings)
2142 2143 {
  2144 + bool success = false;
2143 2145 std::vector<PointerHolder<Pipeline> > to_delete;
2144 2146 if (this->encrypted)
2145 2147 {
... ... @@ -2165,21 +2167,29 @@ QPDF::pipeStreamData(int objid, int generation,
2165 2167 length -= len;
2166 2168 pipeline->write(QUtil::unsigned_char_pointer(buf), len);
2167 2169 }
  2170 + success = true;
2168 2171 }
2169 2172 catch (QPDFExc& e)
2170 2173 {
2171   - warn(e);
  2174 + if (! suppress_warnings)
  2175 + {
  2176 + warn(e);
  2177 + }
2172 2178 }
2173 2179 catch (std::runtime_error& e)
2174 2180 {
2175   - QTC::TC("qpdf", "QPDF decoding error warning");
2176   - warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(),
2177   - "", this->file->getLastOffset(),
2178   - "error decoding stream data for object " +
2179   - QUtil::int_to_string(objid) + " " +
2180   - QUtil::int_to_string(generation) + ": " + e.what()));
  2181 + if (! suppress_warnings)
  2182 + {
  2183 + QTC::TC("qpdf", "QPDF decoding error warning");
  2184 + warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(),
  2185 + "", this->file->getLastOffset(),
  2186 + "error decoding stream data for object " +
  2187 + QUtil::int_to_string(objid) + " " +
  2188 + QUtil::int_to_string(generation) + ": " + e.what()));
  2189 + }
2181 2190 }
2182 2191 pipeline->finish();
  2192 + return success;
2183 2193 }
2184 2194  
2185 2195 void
... ...
libqpdf/QPDFObjectHandle.cc
... ... @@ -496,11 +496,12 @@ QPDFObjectHandle::getRawStreamData()
496 496  
497 497 bool
498 498 QPDFObjectHandle::pipeStreamData(Pipeline* p, bool filter,
499   - bool normalize, bool compress)
  499 + bool normalize, bool compress,
  500 + bool suppress_warnings)
500 501 {
501 502 assertStream();
502 503 return dynamic_cast<QPDF_Stream*>(obj.getPointer())->pipeStreamData(
503   - p, filter, normalize, compress);
  504 + p, filter, normalize, compress, suppress_warnings);
504 505 }
505 506  
506 507 void
... ...
libqpdf/QPDFWriter.cc
... ... @@ -57,6 +57,7 @@ QPDFWriter::init()
57 57 stream_data_mode_set = false;
58 58 stream_data_mode = qpdf_s_compress;
59 59 qdf_mode = false;
  60 + precheck_streams = false;
60 61 static_id = false;
61 62 suppress_original_object_ids = false;
62 63 direct_stream_lengths = true;
... ... @@ -177,6 +178,12 @@ QPDFWriter::setQDFMode(bool val)
177 178 }
178 179  
179 180 void
  181 +QPDFWriter::setPrecheckStreams(bool val)
  182 +{
  183 + this->precheck_streams = val;
  184 +}
  185 +
  186 +void
180 187 QPDFWriter::setMinimumPDFVersion(std::string const& version)
181 188 {
182 189 setMinimumPDFVersion(version, 0);
... ... @@ -1522,6 +1529,21 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level,
1522 1529  
1523 1530 flags |= f_stream;
1524 1531  
  1532 + if (filter && this->precheck_streams)
  1533 + {
  1534 + try
  1535 + {
  1536 + QTC::TC("qpdf", "QPDFWriter precheck stream");
  1537 + Pl_Discard discard;
  1538 + filter = object.pipeStreamData(
  1539 + &discard, true, false, false, true);
  1540 + }
  1541 + catch (std::exception)
  1542 + {
  1543 + filter = false;
  1544 + }
  1545 + }
  1546 +
1525 1547 pushPipeline(new Pl_Buffer("stream data"));
1526 1548 activatePipelineStack();
1527 1549 bool filtered =
... ...
libqpdf/QPDF_Stream.cc
... ... @@ -85,7 +85,7 @@ PointerHolder<Buffer>
85 85 QPDF_Stream::getStreamData()
86 86 {
87 87 Pl_Buffer buf("stream data buffer");
88   - if (! pipeStreamData(&buf, true, false, false))
  88 + if (! pipeStreamData(&buf, true, false, false, false))
89 89 {
90 90 throw std::logic_error("getStreamData called on unfilterable stream");
91 91 }
... ... @@ -97,7 +97,7 @@ PointerHolder<Buffer>
97 97 QPDF_Stream::getRawStreamData()
98 98 {
99 99 Pl_Buffer buf("stream data buffer");
100   - pipeStreamData(&buf, false, false, false);
  100 + pipeStreamData(&buf, false, false, false, false);
101 101 QTC::TC("qpdf", "QPDF_Stream getRawStreamData");
102 102 return buf.getBuffer();
103 103 }
... ... @@ -351,7 +351,8 @@ QPDF_Stream::filterable(std::vector<std::string>& filters,
351 351  
352 352 bool
353 353 QPDF_Stream::pipeStreamData(Pipeline* pipeline, bool filter,
354   - bool normalize, bool compress)
  354 + bool normalize, bool compress,
  355 + bool suppress_warnings)
355 356 {
356 357 std::vector<std::string> filters;
357 358 int predictor = 1;
... ... @@ -487,9 +488,13 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline, bool filter,
487 488 else
488 489 {
489 490 QTC::TC("qpdf", "QPDF_Stream pipe original stream data");
490   - QPDF::Pipe::pipeStreamData(this->qpdf, this->objid, this->generation,
491   - this->offset, this->length,
492   - this->stream_dict, pipeline);
  491 + if (! QPDF::Pipe::pipeStreamData(this->qpdf, this->objid, this->generation,
  492 + this->offset, this->length,
  493 + this->stream_dict, pipeline,
  494 + suppress_warnings))
  495 + {
  496 + filter = false;
  497 + }
493 498 }
494 499  
495 500 return filter;
... ...
libqpdf/qpdf/QPDF_Stream.hh
... ... @@ -23,7 +23,8 @@ class QPDF_Stream: public QPDFObject
23 23  
24 24 // See comments in QPDFObjectHandle.hh for these methods.
25 25 bool pipeStreamData(Pipeline*, bool filter,
26   - bool normalize, bool compress);
  26 + bool normalize, bool compress,
  27 + bool suppress_warnings);
27 28 PointerHolder<Buffer> getStreamData();
28 29 PointerHolder<Buffer> getRawStreamData();
29 30 void replaceStreamData(PointerHolder<Buffer> data,
... ...
manual/qpdf-manual.xml
... ... @@ -822,6 +822,23 @@ outfile.pdf</option>
822 822 </listitem>
823 823 </varlistentry>
824 824 <varlistentry>
  825 + <term><option>--precheck-streams</option></term>
  826 + <listitem>
  827 + <para>
  828 + Tells qpdf to precheck each stream for the ability to decode
  829 + it. Ordinarily qpdf tries to decode streams that it thinks it
  830 + can decode based on the filters, and if there ends up being an
  831 + error when actually trying to do the decode, the stream data
  832 + is truncated. This flag causes qpdf to actually read the
  833 + stream fully before deciding whether to filter the stream.
  834 + This option will slow qpdf down since it will have to read the
  835 + stream twice, but it allows raw stream data to be preserved in
  836 + cases where the decoding of the stream would fail for some
  837 + reason. This may be useful in working with some damaged files.
  838 + </para>
  839 + </listitem>
  840 + </varlistentry>
  841 + <varlistentry>
825 842 <term><option>--qdf</option></term>
826 843 <listitem>
827 844 <para>
... ...
qpdf/qpdf.cc
... ... @@ -202,6 +202,7 @@ familiar with the PDF file format or who are PDF developers.\n\
202 202 --suppress-recovery prevents qpdf from attempting to recover damaged files\n\
203 203 --object-streams=mode controls handing of object streams\n\
204 204 --ignore-xref-streams tells qpdf to ignore any cross-reference streams\n\
  205 +--precheck-streams precheck ability to decode streams\n\
205 206 --qdf turns on \"QDF mode\" (below)\n\
206 207 --min-version=version sets the minimum PDF version of the output file\n\
207 208 --force-version=version forces this to be the PDF version of the output file\n\
... ... @@ -1028,6 +1029,7 @@ int main(int argc, char* argv[])
1028 1029 qpdf_object_stream_e object_stream_mode = qpdf_o_preserve;
1029 1030 bool ignore_xref_streams = false;
1030 1031 bool qdf_mode = false;
  1032 + bool precheck_streams = false;
1031 1033 std::string min_version;
1032 1034 std::string force_version;
1033 1035  
... ... @@ -1213,6 +1215,10 @@ int main(int argc, char* argv[])
1213 1215 {
1214 1216 qdf_mode = true;
1215 1217 }
  1218 + else if (strcmp(arg, "precheck-streams") == 0)
  1219 + {
  1220 + precheck_streams = true;
  1221 + }
1216 1222 else if (strcmp(arg, "min-version") == 0)
1217 1223 {
1218 1224 if (parameter == 0)
... ... @@ -1704,6 +1710,10 @@ int main(int argc, char* argv[])
1704 1710 {
1705 1711 w.setQDFMode(true);
1706 1712 }
  1713 + if (precheck_streams)
  1714 + {
  1715 + w.setPrecheckStreams(true);
  1716 + }
1707 1717 if (normalize_set)
1708 1718 {
1709 1719 w.setContentNormalization(normalize);
... ...
qpdf/qpdf.testcov
... ... @@ -279,3 +279,4 @@ QPDFObjectHandle treat word as string 0
279 279 QPDFObjectHandle found fake 1
280 280 QPDFObjectHandle no val for last key 0
281 281 QPDF resolve failure to null 0
  282 +QPDFWriter precheck stream 0
... ...
qpdf/qtest/qpdf.test
... ... @@ -723,6 +723,26 @@ $td->runtest("check output",
723 723 {$td->FILE => "from-scratch-0.pdf"});
724 724 show_ntests();
725 725 # ----------
  726 +$td->notify("--- Precheck streams ---");
  727 +$n_tests += 4;
  728 +
  729 +$td->runtest("bad stream without precheck",
  730 + {$td->COMMAND => "qpdf --static-id bad-data.pdf a.pdf"},
  731 + {$td->FILE => "bad-data.out", $td->EXIT_STATUS => 3},
  732 + $td->NORMALIZE_NEWLINES);
  733 +$td->runtest("check output",
  734 + {$td->FILE => "a.pdf"},
  735 + {$td->FILE => "bad-data-out.pdf"});
  736 +$td->runtest("bad stream with precheck",
  737 + {$td->COMMAND =>
  738 + "qpdf --static-id --precheck-streams bad-data.pdf a.pdf"},
  739 + {$td->STRING => "", $td->EXIT_STATUS => 0},
  740 + $td->NORMALIZE_NEWLINES);
  741 +$td->runtest("check output",
  742 + {$td->FILE => "a.pdf"},
  743 + {$td->FILE => "bad-data-precheck.pdf"});
  744 +show_ntests();
  745 +# ----------
726 746 $td->notify("--- Copy Foreign Objects ---");
727 747 $n_tests += 7;
728 748  
... ...
qpdf/qtest/qpdf/bad-data-out.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/bad-data-precheck.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/bad-data.out 0 → 100644
  1 +WARNING: bad-data.pdf (file position 319): error decoding stream data for object 4 0: LZWDecoder: bad code received
  2 +qpdf: operation succeeded with warnings; resulting file may have some problems
... ...
qpdf/qtest/qpdf/bad-data.pdf 0 → 100644
No preview for this file type