Commit b3c4b4dbab135643c06a074b6b13993ae1b9a741

Authored by m-holger
1 parent ad3bac2c

Refine xref table reconstruction

During xref table reconstruction, ignore uncompressed object entries found
in xref streams. The xref table gets populated with entries for the
objects actually found in the file. The entries for uncompressed objects in
xref streams are redundant and potentially incorrect.
include/qpdf/QPDF.hh
... ... @@ -765,15 +765,15 @@ class QPDF
765 765 void parse(char const* password);
766 766 void inParse(bool);
767 767 void setTrailer(QPDFObjectHandle obj);
768   - void read_xref(qpdf_offset_t offset);
  768 + void read_xref(qpdf_offset_t offset, bool in_stream_recovery = false);
769 769 bool resolveXRefTable();
770 770 void reconstruct_xref(QPDFExc& e, bool found_startxref = true);
771 771 bool parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes);
772 772 bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
773 773 bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
774 774 qpdf_offset_t read_xrefTable(qpdf_offset_t offset);
775   - qpdf_offset_t read_xrefStream(qpdf_offset_t offset);
776   - qpdf_offset_t processXRefStream(qpdf_offset_t offset, QPDFObjectHandle& xref_stream);
  775 + qpdf_offset_t read_xrefStream(qpdf_offset_t offset, bool in_stream_recovery=false);
  776 + qpdf_offset_t processXRefStream(qpdf_offset_t offset, QPDFObjectHandle& xref_stream, bool in_stream_recovery=false);
777 777 std::pair<int, std::array<int, 3>>
778 778 processXRefW(QPDFObjectHandle& dict, std::function<QPDFExc(std::string_view)> damaged);
779 779 int processXRefSize(
... ...
libqpdf/QPDF_objects.cc
... ... @@ -325,7 +325,7 @@ QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref)
325 325 }
326 326 if (max_offset > 0) {
327 327 try {
328   - read_xref(max_offset);
  328 + read_xref(max_offset, true);
329 329 } catch (std::exception&) {
330 330 warn(damagedPDF(
331 331 "", -1, "error decoding candidate xref stream while recovering damaged file"));
... ... @@ -388,7 +388,7 @@ QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref)
388 388 }
389 389  
390 390 void
391   -QPDF::read_xref(qpdf_offset_t xref_offset)
  391 +QPDF::read_xref(qpdf_offset_t xref_offset, bool in_stream_recovery)
392 392 {
393 393 std::map<int, int> free_table;
394 394 std::set<qpdf_offset_t> visited;
... ... @@ -440,7 +440,7 @@ QPDF::read_xref(qpdf_offset_t xref_offset)
440 440 }
441 441 xref_offset = read_xrefTable(xref_offset + skip);
442 442 } else {
443   - xref_offset = read_xrefStream(xref_offset);
  443 + xref_offset = read_xrefStream(xref_offset, in_stream_recovery);
444 444 }
445 445 if (visited.count(xref_offset) != 0) {
446 446 QTC::TC("qpdf", "QPDF xref loop");
... ... @@ -759,7 +759,7 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset)
759 759  
760 760 // Read a single cross-reference stream.
761 761 qpdf_offset_t
762   -QPDF::read_xrefStream(qpdf_offset_t xref_offset)
  762 +QPDF::read_xrefStream(qpdf_offset_t xref_offset, bool in_stream_recovery)
763 763 {
764 764 if (!m->ignore_xref_streams) {
765 765 QPDFObjGen x_og;
... ... @@ -772,7 +772,7 @@ QPDF::read_xrefStream(qpdf_offset_t xref_offset)
772 772 }
773 773 if (xref_obj.isStreamOfType("/XRef")) {
774 774 QTC::TC("qpdf", "QPDF found xref stream");
775   - return processXRefStream(xref_offset, xref_obj);
  775 + return processXRefStream(xref_offset, xref_obj, in_stream_recovery);
776 776 }
777 777 }
778 778  
... ... @@ -905,7 +905,8 @@ QPDF::processXRefIndex(
905 905 }
906 906  
907 907 qpdf_offset_t
908   -QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)
  908 +QPDF::processXRefStream(
  909 + qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj, bool in_stream_recovery)
909 910 {
910 911 auto damaged = [this, xref_offset](std::string_view msg) -> QPDFExc {
911 912 return damagedPDF("xref stream", xref_offset, msg.data());
... ... @@ -971,7 +972,13 @@ QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)
971 972 // objects.
972 973 insertFreeXrefEntry(QPDFObjGen(obj, 0));
973 974 } else {
974   - insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2]));
  975 + auto typ = toI(fields[0]);
  976 + if (!in_stream_recovery || typ == 2) {
  977 + // If we are in xref stream recovery all actual uncompressed objects have
  978 + // already been inserted into the xref table. Avoid adding junk data into the
  979 + // xref table.
  980 + insertXrefEntry(obj, toI(fields[0]), fields[1], toI(fields[2]));
  981 + }
975 982 }
976 983 ++obj;
977 984 }
... ...