Commit 7927241dcd0e7b5350b9b3e56eabbf5ebf34c378

Authored by m-holger
1 parent 09d970e2

Refine recovery from missing startxref (fixes #1335)

If startxref cannot be found in the last 1024 bytes of the file, try
finding it in the whole file and check whether it is valid.
include/qpdf/QPDF.hh
... ... @@ -762,7 +762,7 @@ class QPDF
762 762 void setTrailer(QPDFObjectHandle obj);
763 763 void read_xref(qpdf_offset_t offset);
764 764 bool resolveXRefTable();
765   - void reconstruct_xref(QPDFExc& e);
  765 + void reconstruct_xref(QPDFExc& e, bool found_startxref = true);
766 766 bool parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes);
767 767 bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
768 768 bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
... ...
libqpdf/QPDF.cc
... ... @@ -470,7 +470,7 @@ QPDF::parse(char const* password)
470 470 }
471 471 } catch (QPDFExc& e) {
472 472 if (m->attempt_recovery) {
473   - reconstruct_xref(e);
  473 + reconstruct_xref(e, xref_offset > 0);
474 474 QTC::TC("qpdf", "QPDF reconstructed xref table");
475 475 } else {
476 476 throw;
... ... @@ -530,7 +530,7 @@ QPDF::setTrailer(QPDFObjectHandle obj)
530 530 }
531 531  
532 532 void
533   -QPDF::reconstruct_xref(QPDFExc& e)
  533 +QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref)
534 534 {
535 535 if (m->reconstructed_xref) {
536 536 // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because
... ... @@ -568,6 +568,7 @@ QPDF::reconstruct_xref(QPDFExc& e)
568 568  
569 569 std::vector<std::tuple<int, int, qpdf_offset_t>> found_objects;
570 570 std::vector<qpdf_offset_t> trailers;
  571 + std::vector<qpdf_offset_t> startxrefs;
571 572  
572 573 m->file->seek(0, SEEK_END);
573 574 qpdf_offset_t eof = m->file->tell();
... ... @@ -593,11 +594,34 @@ QPDF::reconstruct_xref(QPDFExc& e)
593 594 m->file->seek(pos, SEEK_SET);
594 595 } else if (!m->trailer && t1.isWord("trailer")) {
595 596 trailers.emplace_back(m->file->tell());
  597 + } else if (!found_startxref && t1.isWord("startxref")) {
  598 + startxrefs.emplace_back(m->file->tell());
596 599 }
597 600 check_warnings();
598 601 m->file->findAndSkipNextEOL();
599 602 }
600 603  
  604 + if (!found_startxref && !startxrefs.empty() && !found_objects.empty() &&
  605 + startxrefs.back() > std::get<2>(found_objects.back())) {
  606 + try {
  607 + m->file->seek(startxrefs.back(), SEEK_SET);
  608 + if (auto offset = QUtil::string_to_ll(readToken(*m->file).getValue().data())) {
  609 + read_xref(offset);
  610 + if (getRoot().getKey("/Pages").isDictionary()) {
  611 + QTC::TC("qpdf", "QPDF startxref more than 1024 before end");
  612 + warn(
  613 + damagedPDF("", 0, "startxref was more than 1024 bytes before end of file"));
  614 + initializeEncryption();
  615 + m->parsed = true;
  616 + m->reconstructed_xref = false;
  617 + return;
  618 + }
  619 + }
  620 + } catch (...) {
  621 + // ok, bad luck. Do recovery.
  622 + }
  623 + }
  624 +
601 625 auto rend = found_objects.rend();
602 626 for (auto it = found_objects.rbegin(); it != rend; it++) {
603 627 auto [obj, gen, token_start] = *it;
... ...
qpdf/qpdf.testcov
... ... @@ -53,6 +53,7 @@ QPDF xref gen > 0 1
53 53 QPDF xref size mismatch 0
54 54 QPDF not a pdf file 0
55 55 QPDF can't find startxref 0
  56 +QPDF startxref more than 1024 before end 0
56 57 QPDF invalid xref 0
57 58 QPDF invalid xref entry 0
58 59 QPDF missing trailer 0
... ...
qpdf/qtest/qpdf/recover-xref-stream.out
1 1 WARNING: recover-xref-stream.pdf: file is damaged
2 2 WARNING: recover-xref-stream.pdf: can't find startxref
3 3 WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table
4   -WARNING: recover-xref-stream.pdf: reported number of objects (14) is not one plus the highest object number (15)
  4 +WARNING: recover-xref-stream.pdf: startxref was more than 1024 bytes before end of file
5 5 qpdf: operation succeeded with warnings; resulting file may have some problems
... ...