From 7927241dcd0e7b5350b9b3e56eabbf5ebf34c378 Mon Sep 17 00:00:00 2001 From: m-holger Date: Mon, 10 Mar 2025 17:11:57 +0000 Subject: [PATCH] Refine recovery from missing startxref (fixes #1335) --- include/qpdf/QPDF.hh | 2 +- libqpdf/QPDF.cc | 28 ++++++++++++++++++++++++++-- qpdf/qpdf.testcov | 1 + qpdf/qtest/qpdf/recover-xref-stream.out | 2 +- 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh index 6163150..150f7e6 100644 --- a/include/qpdf/QPDF.hh +++ b/include/qpdf/QPDF.hh @@ -762,7 +762,7 @@ class QPDF void setTrailer(QPDFObjectHandle obj); void read_xref(qpdf_offset_t offset); bool resolveXRefTable(); - void reconstruct_xref(QPDFExc& e); + void reconstruct_xref(QPDFExc& e, bool found_startxref = true); bool parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes); bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type); bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type); diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc index 10cd7d7..19ffd76 100644 --- a/libqpdf/QPDF.cc +++ b/libqpdf/QPDF.cc @@ -470,7 +470,7 @@ QPDF::parse(char const* password) } } catch (QPDFExc& e) { if (m->attempt_recovery) { - reconstruct_xref(e); + reconstruct_xref(e, xref_offset > 0); QTC::TC("qpdf", "QPDF reconstructed xref table"); } else { throw; @@ -530,7 +530,7 @@ QPDF::setTrailer(QPDFObjectHandle obj) } void -QPDF::reconstruct_xref(QPDFExc& e) +QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref) { if (m->reconstructed_xref) { // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because @@ -568,6 +568,7 @@ QPDF::reconstruct_xref(QPDFExc& e) std::vector> found_objects; std::vector trailers; + std::vector startxrefs; m->file->seek(0, SEEK_END); qpdf_offset_t eof = m->file->tell(); @@ -593,11 +594,34 @@ QPDF::reconstruct_xref(QPDFExc& e) m->file->seek(pos, SEEK_SET); } else if (!m->trailer && t1.isWord("trailer")) { trailers.emplace_back(m->file->tell()); + } else if (!found_startxref && t1.isWord("startxref")) { + startxrefs.emplace_back(m->file->tell()); } check_warnings(); m->file->findAndSkipNextEOL(); } + if (!found_startxref && !startxrefs.empty() && !found_objects.empty() && + startxrefs.back() > std::get<2>(found_objects.back())) { + try { + m->file->seek(startxrefs.back(), SEEK_SET); + if (auto offset = QUtil::string_to_ll(readToken(*m->file).getValue().data())) { + read_xref(offset); + if (getRoot().getKey("/Pages").isDictionary()) { + QTC::TC("qpdf", "QPDF startxref more than 1024 before end"); + warn( + damagedPDF("", 0, "startxref was more than 1024 bytes before end of file")); + initializeEncryption(); + m->parsed = true; + m->reconstructed_xref = false; + return; + } + } + } catch (...) { + // ok, bad luck. Do recovery. + } + } + auto rend = found_objects.rend(); for (auto it = found_objects.rbegin(); it != rend; it++) { auto [obj, gen, token_start] = *it; diff --git a/qpdf/qpdf.testcov b/qpdf/qpdf.testcov index 4743a56..b4d2eab 100644 --- a/qpdf/qpdf.testcov +++ b/qpdf/qpdf.testcov @@ -53,6 +53,7 @@ QPDF xref gen > 0 1 QPDF xref size mismatch 0 QPDF not a pdf file 0 QPDF can't find startxref 0 +QPDF startxref more than 1024 before end 0 QPDF invalid xref 0 QPDF invalid xref entry 0 QPDF missing trailer 0 diff --git a/qpdf/qtest/qpdf/recover-xref-stream.out b/qpdf/qtest/qpdf/recover-xref-stream.out index ba0e1aa..fd15969 100644 --- a/qpdf/qtest/qpdf/recover-xref-stream.out +++ b/qpdf/qtest/qpdf/recover-xref-stream.out @@ -1,5 +1,5 @@ WARNING: recover-xref-stream.pdf: file is damaged WARNING: recover-xref-stream.pdf: can't find startxref WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table -WARNING: recover-xref-stream.pdf: reported number of objects (14) is not one plus the highest object number (15) +WARNING: recover-xref-stream.pdf: startxref was more than 1024 bytes before end of file qpdf: operation succeeded with warnings; resulting file may have some problems -- libgit2 0.21.4