Commit 9740930b2de30249cb94dd1cc1044ecc2e88095c

Authored by m-holger
Committed by GitHub
2 parents ab48d664 7927241d

Merge pull request #1392 from m-holger/i1335

Refine recovery from missing startxref (fixes #1335)
include/qpdf/QPDF.hh
... ... @@ -762,7 +762,7 @@ class QPDF
762 762 void setTrailer(QPDFObjectHandle obj);
763 763 void read_xref(qpdf_offset_t offset);
764 764 bool resolveXRefTable();
765   - void reconstruct_xref(QPDFExc& e);
  765 + void reconstruct_xref(QPDFExc& e, bool found_startxref = true);
766 766 bool parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes);
767 767 bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
768 768 bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
... ...
libqpdf/QPDF.cc
... ... @@ -471,7 +471,7 @@ QPDF::parse(char const* password)
471 471 }
472 472 } catch (QPDFExc& e) {
473 473 if (m->attempt_recovery) {
474   - reconstruct_xref(e);
  474 + reconstruct_xref(e, xref_offset > 0);
475 475 QTC::TC("qpdf", "QPDF reconstructed xref table");
476 476 } else {
477 477 throw;
... ... @@ -531,7 +531,7 @@ QPDF::setTrailer(QPDFObjectHandle obj)
531 531 }
532 532  
533 533 void
534   -QPDF::reconstruct_xref(QPDFExc& e)
  534 +QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref)
535 535 {
536 536 if (m->reconstructed_xref) {
537 537 // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because
... ... @@ -569,6 +569,7 @@ QPDF::reconstruct_xref(QPDFExc& e)
569 569  
570 570 std::vector<std::tuple<int, int, qpdf_offset_t>> found_objects;
571 571 std::vector<qpdf_offset_t> trailers;
  572 + std::vector<qpdf_offset_t> startxrefs;
572 573  
573 574 m->file->seek(0, SEEK_END);
574 575 qpdf_offset_t eof = m->file->tell();
... ... @@ -594,11 +595,34 @@ QPDF::reconstruct_xref(QPDFExc& e)
594 595 m->file->seek(pos, SEEK_SET);
595 596 } else if (!m->trailer && t1.isWord("trailer")) {
596 597 trailers.emplace_back(m->file->tell());
  598 + } else if (!found_startxref && t1.isWord("startxref")) {
  599 + startxrefs.emplace_back(m->file->tell());
597 600 }
598 601 check_warnings();
599 602 m->file->findAndSkipNextEOL();
600 603 }
601 604  
  605 + if (!found_startxref && !startxrefs.empty() && !found_objects.empty() &&
  606 + startxrefs.back() > std::get<2>(found_objects.back())) {
  607 + try {
  608 + m->file->seek(startxrefs.back(), SEEK_SET);
  609 + if (auto offset = QUtil::string_to_ll(readToken(*m->file).getValue().data())) {
  610 + read_xref(offset);
  611 + if (getRoot().getKey("/Pages").isDictionary()) {
  612 + QTC::TC("qpdf", "QPDF startxref more than 1024 before end");
  613 + warn(
  614 + damagedPDF("", 0, "startxref was more than 1024 bytes before end of file"));
  615 + initializeEncryption();
  616 + m->parsed = true;
  617 + m->reconstructed_xref = false;
  618 + return;
  619 + }
  620 + }
  621 + } catch (...) {
  622 + // ok, bad luck. Do recovery.
  623 + }
  624 + }
  625 +
602 626 auto rend = found_objects.rend();
603 627 for (auto it = found_objects.rbegin(); it != rend; it++) {
604 628 auto [obj, gen, token_start] = *it;
... ...
qpdf/qpdf.testcov
... ... @@ -53,6 +53,7 @@ QPDF xref gen > 0 1
53 53 QPDF xref size mismatch 0
54 54 QPDF not a pdf file 0
55 55 QPDF can't find startxref 0
  56 +QPDF startxref more than 1024 before end 0
56 57 QPDF invalid xref 0
57 58 QPDF invalid xref entry 0
58 59 QPDF missing trailer 0
... ...
qpdf/qtest/qpdf/recover-xref-stream.out
1 1 WARNING: recover-xref-stream.pdf: file is damaged
2 2 WARNING: recover-xref-stream.pdf: can't find startxref
3 3 WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table
4   -WARNING: recover-xref-stream.pdf: reported number of objects (14) is not one plus the highest object number (15)
  4 +WARNING: recover-xref-stream.pdf: startxref was more than 1024 bytes before end of file
5 5 qpdf: operation succeeded with warnings; resulting file may have some problems
... ...