Commit 7927241dcd0e7b5350b9b3e56eabbf5ebf34c378

Authored by m-holger
1 parent 09d970e2

Refine recovery from missing startxref (fixes #1335)

If startxref cannot be found in the last 1024 bytes of the file, try finding
it in the whole file and check whether it is valid.
include/qpdf/QPDF.hh
@@ -762,7 +762,7 @@ class QPDF @@ -762,7 +762,7 @@ class QPDF
762 void setTrailer(QPDFObjectHandle obj); 762 void setTrailer(QPDFObjectHandle obj);
763 void read_xref(qpdf_offset_t offset); 763 void read_xref(qpdf_offset_t offset);
764 bool resolveXRefTable(); 764 bool resolveXRefTable();
765 - void reconstruct_xref(QPDFExc& e); 765 + void reconstruct_xref(QPDFExc& e, bool found_startxref = true);
766 bool parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes); 766 bool parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes);
767 bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type); 767 bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
768 bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type); 768 bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
libqpdf/QPDF.cc
@@ -470,7 +470,7 @@ QPDF::parse(char const* password) @@ -470,7 +470,7 @@ QPDF::parse(char const* password)
470 } 470 }
471 } catch (QPDFExc& e) { 471 } catch (QPDFExc& e) {
472 if (m->attempt_recovery) { 472 if (m->attempt_recovery) {
473 - reconstruct_xref(e); 473 + reconstruct_xref(e, xref_offset > 0);
474 QTC::TC("qpdf", "QPDF reconstructed xref table"); 474 QTC::TC("qpdf", "QPDF reconstructed xref table");
475 } else { 475 } else {
476 throw; 476 throw;
@@ -530,7 +530,7 @@ QPDF::setTrailer(QPDFObjectHandle obj) @@ -530,7 +530,7 @@ QPDF::setTrailer(QPDFObjectHandle obj)
530 } 530 }
531 531
532 void 532 void
533 -QPDF::reconstruct_xref(QPDFExc& e) 533 +QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref)
534 { 534 {
535 if (m->reconstructed_xref) { 535 if (m->reconstructed_xref) {
536 // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because 536 // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because
@@ -568,6 +568,7 @@ QPDF::reconstruct_xref(QPDFExc& e) @@ -568,6 +568,7 @@ QPDF::reconstruct_xref(QPDFExc& e)
568 568
569 std::vector<std::tuple<int, int, qpdf_offset_t>> found_objects; 569 std::vector<std::tuple<int, int, qpdf_offset_t>> found_objects;
570 std::vector<qpdf_offset_t> trailers; 570 std::vector<qpdf_offset_t> trailers;
  571 + std::vector<qpdf_offset_t> startxrefs;
571 572
572 m->file->seek(0, SEEK_END); 573 m->file->seek(0, SEEK_END);
573 qpdf_offset_t eof = m->file->tell(); 574 qpdf_offset_t eof = m->file->tell();
@@ -593,11 +594,34 @@ QPDF::reconstruct_xref(QPDFExc& e) @@ -593,11 +594,34 @@ QPDF::reconstruct_xref(QPDFExc& e)
593 m->file->seek(pos, SEEK_SET); 594 m->file->seek(pos, SEEK_SET);
594 } else if (!m->trailer && t1.isWord("trailer")) { 595 } else if (!m->trailer && t1.isWord("trailer")) {
595 trailers.emplace_back(m->file->tell()); 596 trailers.emplace_back(m->file->tell());
  597 + } else if (!found_startxref && t1.isWord("startxref")) {
  598 + startxrefs.emplace_back(m->file->tell());
596 } 599 }
597 check_warnings(); 600 check_warnings();
598 m->file->findAndSkipNextEOL(); 601 m->file->findAndSkipNextEOL();
599 } 602 }
600 603
  604 + if (!found_startxref && !startxrefs.empty() && !found_objects.empty() &&
  605 + startxrefs.back() > std::get<2>(found_objects.back())) {
  606 + try {
  607 + m->file->seek(startxrefs.back(), SEEK_SET);
  608 + if (auto offset = QUtil::string_to_ll(readToken(*m->file).getValue().data())) {
  609 + read_xref(offset);
  610 + if (getRoot().getKey("/Pages").isDictionary()) {
  611 + QTC::TC("qpdf", "QPDF startxref more than 1024 before end");
  612 + warn(
  613 + damagedPDF("", 0, "startxref was more than 1024 bytes before end of file"));
  614 + initializeEncryption();
  615 + m->parsed = true;
  616 + m->reconstructed_xref = false;
  617 + return;
  618 + }
  619 + }
  620 + } catch (...) {
  621 + // ok, bad luck. Do recovery.
  622 + }
  623 + }
  624 +
601 auto rend = found_objects.rend(); 625 auto rend = found_objects.rend();
602 for (auto it = found_objects.rbegin(); it != rend; it++) { 626 for (auto it = found_objects.rbegin(); it != rend; it++) {
603 auto [obj, gen, token_start] = *it; 627 auto [obj, gen, token_start] = *it;
qpdf/qpdf.testcov
@@ -53,6 +53,7 @@ QPDF xref gen > 0 1 @@ -53,6 +53,7 @@ QPDF xref gen > 0 1
53 QPDF xref size mismatch 0 53 QPDF xref size mismatch 0
54 QPDF not a pdf file 0 54 QPDF not a pdf file 0
55 QPDF can't find startxref 0 55 QPDF can't find startxref 0
  56 +QPDF startxref more than 1024 before end 0
56 QPDF invalid xref 0 57 QPDF invalid xref 0
57 QPDF invalid xref entry 0 58 QPDF invalid xref entry 0
58 QPDF missing trailer 0 59 QPDF missing trailer 0
qpdf/qtest/qpdf/recover-xref-stream.out
1 WARNING: recover-xref-stream.pdf: file is damaged 1 WARNING: recover-xref-stream.pdf: file is damaged
2 WARNING: recover-xref-stream.pdf: can't find startxref 2 WARNING: recover-xref-stream.pdf: can't find startxref
3 WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table 3 WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table
4 -WARNING: recover-xref-stream.pdf: reported number of objects (14) is not one plus the highest object number (15) 4 +WARNING: recover-xref-stream.pdf: startxref was more than 1024 bytes before end of file
5 qpdf: operation succeeded with warnings; resulting file may have some problems 5 qpdf: operation succeeded with warnings; resulting file may have some problems