Commit b3c4b4dbab135643c06a074b6b13993ae1b9a741

Authored by m-holger
1 parent ad3bac2c

Refine xref table reconstruction

During xref table reconstruction, ignore uncompressed object entries found
in xref streams. During reconstruction the xref table gets populated with
entries for the objects actually found in the file, so the entries for
uncompressed objects in xref streams are redundant and potentially incorrect.
include/qpdf/QPDF.hh
@@ -765,15 +765,15 @@ class QPDF @@ -765,15 +765,15 @@ class QPDF
765 void parse(char const* password); 765 void parse(char const* password);
766 void inParse(bool); 766 void inParse(bool);
767 void setTrailer(QPDFObjectHandle obj); 767 void setTrailer(QPDFObjectHandle obj);
768 - void read_xref(qpdf_offset_t offset); 768 + void read_xref(qpdf_offset_t offset, bool in_stream_recovery = false);
769 bool resolveXRefTable(); 769 bool resolveXRefTable();
770 void reconstruct_xref(QPDFExc& e, bool found_startxref = true); 770 void reconstruct_xref(QPDFExc& e, bool found_startxref = true);
771 bool parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes); 771 bool parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes);
772 bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type); 772 bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
773 bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type); 773 bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
774 qpdf_offset_t read_xrefTable(qpdf_offset_t offset); 774 qpdf_offset_t read_xrefTable(qpdf_offset_t offset);
775 - qpdf_offset_t read_xrefStream(qpdf_offset_t offset);  
776 - qpdf_offset_t processXRefStream(qpdf_offset_t offset, QPDFObjectHandle& xref_stream); 775 + qpdf_offset_t read_xrefStream(qpdf_offset_t offset, bool in_stream_recovery=false);
  776 + qpdf_offset_t processXRefStream(qpdf_offset_t offset, QPDFObjectHandle& xref_stream, bool in_stream_recovery=false);
777 std::pair<int, std::array<int, 3>> 777 std::pair<int, std::array<int, 3>>
778 processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged); 778 processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged);
779 int processXRefSize( 779 int processXRefSize(
libqpdf/QPDF_objects.cc
@@ -325,7 +325,7 @@ QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref) @@ -325,7 +325,7 @@ QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref)
325 } 325 }
326 if (max_offset > 0) { 326 if (max_offset > 0) {
327 try { 327 try {
328 - read_xref(max_offset); 328 + read_xref(max_offset, true);
329 } catch (std::exception&) { 329 } catch (std::exception&) {
330 warn(damagedPDF( 330 warn(damagedPDF(
331 "", -1, "error decoding candidate xref stream while recovering damaged file")); 331 "", -1, "error decoding candidate xref stream while recovering damaged file"));
@@ -388,7 +388,7 @@ QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref) @@ -388,7 +388,7 @@ QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref)
388 } 388 }
389 389
390 void 390 void
391 -QPDF::read_xref(qpdf_offset_t xref_offset) 391 +QPDF::read_xref(qpdf_offset_t xref_offset, bool in_stream_recovery)
392 { 392 {
393 std::map<int, int> free_table; 393 std::map<int, int> free_table;
394 std::set<qpdf_offset_t> visited; 394 std::set<qpdf_offset_t> visited;
@@ -440,7 +440,7 @@ QPDF::read_xref(qpdf_offset_t xref_offset) @@ -440,7 +440,7 @@ QPDF::read_xref(qpdf_offset_t xref_offset)
440 } 440 }
441 xref_offset = read_xrefTable(xref_offset + skip); 441 xref_offset = read_xrefTable(xref_offset + skip);
442 } else { 442 } else {
443 - xref_offset = read_xrefStream(xref_offset); 443 + xref_offset = read_xrefStream(xref_offset, in_stream_recovery);
444 } 444 }
445 if (visited.count(xref_offset) != 0) { 445 if (visited.count(xref_offset) != 0) {
446 QTC::TC("qpdf", "QPDF xref loop"); 446 QTC::TC("qpdf", "QPDF xref loop");
@@ -759,7 +759,7 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) @@ -759,7 +759,7 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset)
759 759
760 // Read a single cross-reference stream. 760 // Read a single cross-reference stream.
761 qpdf_offset_t 761 qpdf_offset_t
762 -QPDF::read_xrefStream(qpdf_offset_t xref_offset) 762 +QPDF::read_xrefStream(qpdf_offset_t xref_offset, bool in_stream_recovery)
763 { 763 {
764 if (!m->ignore_xref_streams) { 764 if (!m->ignore_xref_streams) {
765 QPDFObjGen x_og; 765 QPDFObjGen x_og;
@@ -772,7 +772,7 @@ QPDF::read_xrefStream(qpdf_offset_t xref_offset) @@ -772,7 +772,7 @@ QPDF::read_xrefStream(qpdf_offset_t xref_offset)
772 } 772 }
773 if (xref_obj.isStreamOfType("/XRef")) { 773 if (xref_obj.isStreamOfType("/XRef")) {
774 QTC::TC("qpdf", "QPDF found xref stream"); 774 QTC::TC("qpdf", "QPDF found xref stream");
775 - return processXRefStream(xref_offset, xref_obj); 775 + return processXRefStream(xref_offset, xref_obj, in_stream_recovery);
776 } 776 }
777 } 777 }
778 778
@@ -905,7 +905,8 @@ QPDF::processXRefIndex( @@ -905,7 +905,8 @@ QPDF::processXRefIndex(
905 } 905 }
906 906
907 qpdf_offset_t 907 qpdf_offset_t
908 -QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) 908 +QPDF::processXRefStream(
  909 + qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj, bool in_stream_recovery)
909 { 910 {
910 auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc { 911 auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc {
911 return damagedPDF("xref stream", xref_offset, msg.data()); 912 return damagedPDF("xref stream", xref_offset, msg.data());
@@ -971,7 +972,13 @@ QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj) @@ -971,7 +972,13 @@ QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)
971 // objects. 972 // objects.
972 insertFreeXrefEntry(QPDFObjGen(obj, 0)); 973 insertFreeXrefEntry(QPDFObjGen(obj, 0));
973 } else { 974 } else {
974 - insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2])); 975 + auto typ = toI(fields[0]);
  976 + if (!in_stream_recovery || typ == 2) {
  977 + // If we are in xref stream recovery all actual uncompressed objects have
  978 + // already been inserted into the xref table. Avoid adding junk data into the
  979 + // xref table.
  980 + insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2]));
  981 + }
975 } 982 }
976 ++obj; 983 ++obj;
977 } 984 }