Commit 9740930b2de30249cb94dd1cc1044ecc2e88095c

Authored by m-holger
Committed by GitHub
2 parents ab48d664 7927241d

Merge pull request #1392 from m-holger/i1335

Refine recovery from missing startxref (fixes #1335)
include/qpdf/QPDF.hh
@@ -762,7 +762,7 @@ class QPDF @@ -762,7 +762,7 @@ class QPDF
762 void setTrailer(QPDFObjectHandle obj); 762 void setTrailer(QPDFObjectHandle obj);
763 void read_xref(qpdf_offset_t offset); 763 void read_xref(qpdf_offset_t offset);
764 bool resolveXRefTable(); 764 bool resolveXRefTable();
765 - void reconstruct_xref(QPDFExc& e); 765 + void reconstruct_xref(QPDFExc& e, bool found_startxref = true);
766 bool parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes); 766 bool parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes);
767 bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type); 767 bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
768 bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type); 768 bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
libqpdf/QPDF.cc
@@ -471,7 +471,7 @@ QPDF::parse(char const* password) @@ -471,7 +471,7 @@ QPDF::parse(char const* password)
471 } 471 }
472 } catch (QPDFExc& e) { 472 } catch (QPDFExc& e) {
473 if (m->attempt_recovery) { 473 if (m->attempt_recovery) {
474 - reconstruct_xref(e); 474 + reconstruct_xref(e, xref_offset > 0);
475 QTC::TC("qpdf", "QPDF reconstructed xref table"); 475 QTC::TC("qpdf", "QPDF reconstructed xref table");
476 } else { 476 } else {
477 throw; 477 throw;
@@ -531,7 +531,7 @@ QPDF::setTrailer(QPDFObjectHandle obj) @@ -531,7 +531,7 @@ QPDF::setTrailer(QPDFObjectHandle obj)
531 } 531 }
532 532
533 void 533 void
534 -QPDF::reconstruct_xref(QPDFExc& e) 534 +QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref)
535 { 535 {
536 if (m->reconstructed_xref) { 536 if (m->reconstructed_xref) {
537 // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because 537 // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because
@@ -569,6 +569,7 @@ QPDF::reconstruct_xref(QPDFExc& e) @@ -569,6 +569,7 @@ QPDF::reconstruct_xref(QPDFExc& e)
569 569
570 std::vector<std::tuple<int, int, qpdf_offset_t>> found_objects; 570 std::vector<std::tuple<int, int, qpdf_offset_t>> found_objects;
571 std::vector<qpdf_offset_t> trailers; 571 std::vector<qpdf_offset_t> trailers;
  572 + std::vector<qpdf_offset_t> startxrefs;
572 573
573 m->file->seek(0, SEEK_END); 574 m->file->seek(0, SEEK_END);
574 qpdf_offset_t eof = m->file->tell(); 575 qpdf_offset_t eof = m->file->tell();
@@ -594,11 +595,34 @@ QPDF::reconstruct_xref(QPDFExc& e) @@ -594,11 +595,34 @@ QPDF::reconstruct_xref(QPDFExc& e)
594 m->file->seek(pos, SEEK_SET); 595 m->file->seek(pos, SEEK_SET);
595 } else if (!m->trailer && t1.isWord("trailer")) { 596 } else if (!m->trailer && t1.isWord("trailer")) {
596 trailers.emplace_back(m->file->tell()); 597 trailers.emplace_back(m->file->tell());
  598 + } else if (!found_startxref && t1.isWord("startxref")) {
  599 + startxrefs.emplace_back(m->file->tell());
597 } 600 }
598 check_warnings(); 601 check_warnings();
599 m->file->findAndSkipNextEOL(); 602 m->file->findAndSkipNextEOL();
600 } 603 }
601 604
  605 + if (!found_startxref && !startxrefs.empty() && !found_objects.empty() &&
  606 + startxrefs.back() > std::get<2>(found_objects.back())) {
  607 + try {
  608 + m->file->seek(startxrefs.back(), SEEK_SET);
  609 + if (auto offset = QUtil::string_to_ll(readToken(*m->file).getValue().data())) {
  610 + read_xref(offset);
  611 + if (getRoot().getKey("/Pages").isDictionary()) {
  612 + QTC::TC("qpdf", "QPDF startxref more than 1024 before end");
  613 + warn(
  614 + damagedPDF("", 0, "startxref was more than 1024 bytes before end of file"));
  615 + initializeEncryption();
  616 + m->parsed = true;
  617 + m->reconstructed_xref = false;
  618 + return;
  619 + }
  620 + }
  621 + } catch (...) {
  622 + // ok, bad luck. Do recovery.
  623 + }
  624 + }
  625 +
602 auto rend = found_objects.rend(); 626 auto rend = found_objects.rend();
603 for (auto it = found_objects.rbegin(); it != rend; it++) { 627 for (auto it = found_objects.rbegin(); it != rend; it++) {
604 auto [obj, gen, token_start] = *it; 628 auto [obj, gen, token_start] = *it;
qpdf/qpdf.testcov
@@ -53,6 +53,7 @@ QPDF xref gen > 0 1 @@ -53,6 +53,7 @@ QPDF xref gen > 0 1
53 QPDF xref size mismatch 0 53 QPDF xref size mismatch 0
54 QPDF not a pdf file 0 54 QPDF not a pdf file 0
55 QPDF can't find startxref 0 55 QPDF can't find startxref 0
  56 +QPDF startxref more than 1024 before end 0
56 QPDF invalid xref 0 57 QPDF invalid xref 0
57 QPDF invalid xref entry 0 58 QPDF invalid xref entry 0
58 QPDF missing trailer 0 59 QPDF missing trailer 0
qpdf/qtest/qpdf/recover-xref-stream.out
1 WARNING: recover-xref-stream.pdf: file is damaged 1 WARNING: recover-xref-stream.pdf: file is damaged
2 WARNING: recover-xref-stream.pdf: can't find startxref 2 WARNING: recover-xref-stream.pdf: can't find startxref
3 WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table 3 WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table
4 -WARNING: recover-xref-stream.pdf: reported number of objects (14) is not one plus the highest object number (15) 4 +WARNING: recover-xref-stream.pdf: startxref was more than 1024 bytes before end of file
5 qpdf: operation succeeded with warnings; resulting file may have some problems 5 qpdf: operation succeeded with warnings; resulting file may have some problems