From ca3ea2e3054189ca9b6d55f7a74588a3a1cb74b2 Mon Sep 17 00:00:00 2001 From: m-holger Date: Sat, 1 Feb 2025 17:23:02 +0000 Subject: [PATCH] Refine xref reconstruction (fixes #1335) --- libqpdf/QPDF.cc | 30 +++++++++++++++++++++++++++--- qpdf/qtest/error-condition.test | 2 +- qpdf/qtest/qpdf/bad16-recover.out | 8 +++++++- qpdf/qtest/qpdf/bad7-recover.out | 8 +++++++- qpdf/qtest/qpdf/issue-100.out | 8 ++++++++ qpdf/qtest/qpdf/issue-101.out | 2 ++ qpdf/qtest/qpdf/issue-148.out | 3 ++- qpdf/qtest/qpdf/issue-202.out | 3 ++- qpdf/qtest/specific-bugs.test | 2 +- 9 files changed, 57 insertions(+), 9 deletions(-) diff --git a/libqpdf/QPDF.cc b/libqpdf/QPDF.cc index c52831f..85fe94d 100644 --- a/libqpdf/QPDF.cc +++ b/libqpdf/QPDF.cc @@ -608,6 +608,7 @@ QPDF::reconstruct_xref(QPDFExc& e) if (!m->trailer) { qpdf_offset_t max_offset{0}; + size_t max_size{0}; // If there are any xref streams, take the last one to appear. for (auto const& iter: m->xref_table) { auto entry = iter.second; @@ -623,7 +624,8 @@ QPDF::reconstruct_xref(QPDFExc& e) continue; } auto offset = entry.getOffset(); - if (offset > max_offset) { + auto size = oh.getDict().getKey("/Size").getUIntValueAsUInt(); + if (size > max_size || (size == max_size && offset > max_offset)) { max_offset = offset; setTrailer(oh.getDict()); } @@ -633,13 +635,35 @@ QPDF::reconstruct_xref(QPDFExc& e) try { read_xref(max_offset); } catch (std::exception&) { - throw damagedPDF( - "", 0, "error decoding candidate xref stream while recovering damaged file"); + warn(damagedPDF( + "", 0, "error decoding candidate xref stream while recovering damaged file")); } QTC::TC("qpdf", "QPDF recover xref stream"); } } + if (!m->trailer || (!m->parsed && !m->trailer.getKey("/Root").isDictionary())) { + // Try to find a Root dictionary. As a quick fix try the one with the highest object id. + QPDFObjectHandle root; + for (auto const& iter: m->obj_cache) { + try { + if (QPDFObjectHandle(iter.second.object).isDictionaryOfType("/Catalog")) { + root = iter.second.object; + } + } catch (std::exception&) { + continue; + } + } + if (root) { + if (!m->trailer) { + warn(damagedPDF( + "", 0, "unable to find trailer dictionary while recovering damaged file")); + m->trailer = QPDFObjectHandle::newDictionary(); + } + m->trailer.replaceKey("/Root", root); + } + } + if (!m->trailer) { // We could check the last encountered object to see if it was an xref stream. If so, we // could try to get the trailer from there. This may make it possible to recover files with diff --git a/qpdf/qtest/error-condition.test b/qpdf/qtest/error-condition.test index 52d0eb7..8c2e523 100644 --- a/qpdf/qtest/error-condition.test +++ b/qpdf/qtest/error-condition.test @@ -127,7 +127,7 @@ $n_tests += @badfiles + 11; # though in some cases it may. Acrobat Reader would not be able to # recover any of these files any better. my %recover_failures = (); -for (1, 7, 16) +for (1) { $recover_failures{$_} = 1; } diff --git a/qpdf/qtest/qpdf/bad16-recover.out b/qpdf/qtest/qpdf/bad16-recover.out index 0bedd64..ad88d10 100644 --- a/qpdf/qtest/qpdf/bad16-recover.out +++ b/qpdf/qtest/qpdf/bad16-recover.out @@ -11,4 +11,10 @@ WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string WARNING: bad16.pdf (trailer, offset 779): parse error while reading object WARNING: bad16.pdf (trailer, offset 779): unexpected EOF -bad16.pdf: unable to find trailer dictionary while recovering damaged file +WARNING: bad16.pdf: unable to find trailer dictionary while recovering damaged file +/QTest is implicit +/QTest is direct and has type null (2) +/QTest is null +unparse: null +unparseResolved: null +test 1 done diff --git a/qpdf/qtest/qpdf/bad7-recover.out b/qpdf/qtest/qpdf/bad7-recover.out index 0e5d4a6..ed5ce18 100644 --- a/qpdf/qtest/qpdf/bad7-recover.out +++ b/qpdf/qtest/qpdf/bad7-recover.out @@ -3,4 +3,10 @@ WARNING: bad7.pdf (offset 698): expected trailer dictionary WARNING: bad7.pdf: Attempting to reconstruct cross-reference table WARNING: bad7.pdf (object 2 0, offset 128): expected endobj WARNING: bad7.pdf (object 4 0, offset 389): expected endobj -bad7.pdf: unable to find trailer dictionary while recovering damaged file +WARNING: bad7.pdf: unable to find trailer dictionary while recovering damaged file +/QTest is implicit +/QTest is direct and has type null (2) +/QTest is null +unparse: null +unparseResolved: null +test 1 done diff --git a/qpdf/qtest/qpdf/issue-100.out b/qpdf/qtest/qpdf/issue-100.out index 7373ab6..f667fae 100644 --- a/qpdf/qtest/qpdf/issue-100.out +++ b/qpdf/qtest/qpdf/issue-100.out @@ -2,4 +2,12 @@ WARNING: issue-100.pdf: file is damaged WARNING: issue-100.pdf (offset 736): xref not found WARNING: issue-100.pdf: Attempting to reconstruct cross-reference table WARNING: issue-100.pdf (trailer, offset 488): stream keyword found in trailer +WARNING: issue-100.pdf (object 5 0, offset 268): unknown token while reading object; treating as string +WARNING: issue-100.pdf (object 5 0, offset 286): unknown token while reading object; treating as string +WARNING: issue-100.pdf (object 5 0, offset 289): unknown token while reading object; treating as string +WARNING: issue-100.pdf (object 5 0, offset 294): unknown token while reading object; treating as string +WARNING: issue-100.pdf (object 5 0, offset 297): unknown token while reading object; treating as string +WARNING: issue-100.pdf (object 5 0, offset 304): unknown token while reading object; treating as string +WARNING: issue-100.pdf (object 5 0, offset 304): too many errors; giving up on reading object +WARNING: issue-100.pdf (object 5 0, offset 308): expected endobj qpdf: issue-100.pdf: unable to find /Root dictionary diff --git a/qpdf/qtest/qpdf/issue-101.out b/qpdf/qtest/qpdf/issue-101.out index cffe8da..99414d0 100644 --- a/qpdf/qtest/qpdf/issue-101.out +++ b/qpdf/qtest/qpdf/issue-101.out @@ -2,4 +2,6 @@ WARNING: issue-101.pdf: file is damaged WARNING: issue-101.pdf (offset 3526): xref not found WARNING: issue-101.pdf: Attempting to reconstruct cross-reference table WARNING: issue-101.pdf (trailer, offset 1508): stream keyword found in trailer +WARNING: issue-101.pdf (object 5 0, offset 1242): dictionary ended prematurely; using null as value for last key +WARNING: issue-101.pdf (object 5 0, offset 1242): expected dictionary key but found non-name object; inserting key /QPDFFake1 qpdf: issue-101.pdf: unable to find /Root dictionary diff --git a/qpdf/qtest/qpdf/issue-148.out b/qpdf/qtest/qpdf/issue-148.out index dbc424f..8245fbf 100644 --- a/qpdf/qtest/qpdf/issue-148.out +++ b/qpdf/qtest/qpdf/issue-148.out @@ -12,4 +12,5 @@ WARNING: issue-148.pdf (xref stream: object 8 0, offset 73): attempting to recov WARNING: issue-148.pdf (xref stream: object 8 0, offset 73): recovered stream length: 2 WARNING: issue-148.pdf (xref stream: object 8 0, offset 85): expected endobj WARNING: issue-148.pdf (offset 73): error decoding stream data for object 8 0: stream inflate: inflate: data: incorrect header check -qpdf: issue-148.pdf: error decoding candidate xref stream while recovering damaged file +WARNING: issue-148.pdf: error decoding candidate xref stream while recovering damaged file +qpdf: issue-148.pdf: unable to find /Root dictionary diff --git a/qpdf/qtest/qpdf/issue-202.out b/qpdf/qtest/qpdf/issue-202.out index 441b708..913c379 100644 --- a/qpdf/qtest/qpdf/issue-202.out +++ b/qpdf/qtest/qpdf/issue-202.out @@ -5,4 +5,5 @@ WARNING: issue-202.pdf: Attempting to reconstruct cross-reference table WARNING: issue-202.pdf (trailer, offset 55770): ignoring excessively deeply nested data structure WARNING: issue-202.pdf (object 222 0, offset 50101): dictionary has duplicated key /Creator; last occurrence overrides earlier ones WARNING: issue-202.pdf (object 222 0, offset 50101): dictionary has duplicated key /Producer; last occurrence overrides earlier ones -qpdf: issue-202.pdf: unable to find trailer dictionary while recovering damaged file +WARNING: issue-202.pdf: unable to find trailer dictionary while recovering damaged file +qpdf: operation succeeded with warnings; resulting file may have some problems diff --git a/qpdf/qtest/specific-bugs.test b/qpdf/qtest/specific-bugs.test index 15c9e01..1613224 100644 --- a/qpdf/qtest/specific-bugs.test +++ b/qpdf/qtest/specific-bugs.test @@ -34,7 +34,7 @@ my @bug_tests = ( ["148", "free memory on bad flate", 2], ["149", "xref prev pointer loop", 3], ["150", "integer overflow", 2], - ["202", "even more deeply nested dictionary", 2], + ["202", "even more deeply nested dictionary", 3], ["263", "empty xref stream", 2], ["335a", "ozz-fuzz-12152", 2], ["335b", "ozz-fuzz-14845", 2], -- libgit2 0.21.4