Commit 2994f9cf4cc45e33406de34d4bce45ca491df98e

Authored by Jay Berkenbilt
1 parent 8a24287c

Attempt to find xref streams during recovery (fixes #1103)

ChangeLog
  1 +2024-01-06 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * When recovering a file's xref table, attempt to find xref
  4 + streams if a traditional trailer dictionary is not found. Fixes
  5 + #1103.
  6 +
1 7 2024-01-05 Jay Berkenbilt <ejb@ql.org>
2 8  
3 9 * Add --set-page-labels command-line argument and supporting API.
... ...
libqpdf/QPDF.cc
... ... @@ -580,6 +580,38 @@ QPDF::reconstruct_xref(QPDFExc& e)
580 580 m->deleted_objects.clear();
581 581  
582 582 if (!m->trailer.isInitialized()) {
  583 + qpdf_offset_t max_offset{0};
  584 + // If there are any xref streams, take the last one to appear.
  585 + for (auto const& iter: m->xref_table) {
  586 + auto entry = iter.second;
  587 + if (entry.getType() != 1) {
  588 + continue;
  589 + }
  590 + auto oh = getObjectByObjGen(iter.first);
  591 + try {
  592 + if (!oh.isStreamOfType("/XRef")) {
  593 + continue;
  594 + }
  595 + } catch (std::exception&) {
  596 + continue;
  597 + }
  598 + auto offset = entry.getOffset();
  599 + if (offset > max_offset) {
  600 + max_offset = offset;
  601 + setTrailer(oh.getDict());
  602 + }
  603 + }
  604 + if (max_offset > 0) {
  605 + try {
  606 + read_xref(max_offset);
  607 + } catch (std::exception&) {
  608 + throw damagedPDF("", 0, "error decoding candidate xref stream while recovering damaged file");
  609 + }
  610 + QTC::TC("qpdf", "QPDF recover xref stream");
  611 + }
  612 + }
  613 +
  614 + if (!m->trailer.isInitialized()) {
583 615 // We could check the last encountered object to see if it was an xref stream. If so, we
584 616 // could try to get the trailer from there. This may make it possible to recover files with
585 617 // bad startxref pointers even when they have object streams.
... ...
manual/release-notes.rst
... ... @@ -67,6 +67,11 @@ Planned changes for future 12.x (subject to change):
67 67  
68 68 - ``QPDFPageLabelDocumentHelper::pageLabelDict``
69 69  
  70 + - Improve file recovery logic to better handle files with
  71 + cross-reference streams. This should enable qpdf to recover some
  72 + files that it would previously have reported "unable to find
  73 + trailer dictionary."
  74 +
70 75 11.7.0: December 24, 2023
71 76 - Bug fixes:
72 77  
... ...
qpdf/qpdf.testcov
... ... @@ -689,3 +689,4 @@ QPDFPageObjectHelper used fallback without copying 0
689 689 QPDF skipping cache for known unchecked object 0
690 690 QPDF fix dangling triggered xref reconstruction 0
691 691 QPDFPageDocumentHelper flatten resources missing or invalid 0
  692 +QPDF recover xref stream 0
... ...
qpdf/qtest/object-stream.test
... ... @@ -16,7 +16,7 @@ cleanup();
16 16  
17 17 my $td = new TestDriver('object-stream');
18 18  
19   -my $n_tests = 3 + (36 * 4) + (12 * 2);
  19 +my $n_tests = 5 + (36 * 4) + (12 * 2);
20 20 my $n_compare_pdfs = 36;
21 21  
22 22 for (my $n = 16; $n <= 19; ++$n)
... ... @@ -87,5 +87,15 @@ $td->runtest("check file",
87 87 {$td->FILE => "gen1.qdf"});
88 88  
89 89  
  90 +# Recover a file with xref streams
  91 +$td->runtest("recover file with xref stream",
  92 + {$td->COMMAND => "qpdf --static-id --compress-streams=n" .
  93 + " recover-xref-stream.pdf a.pdf"},
  94 + {$td->FILE => "recover-xref-stream.out", $td->EXIT_STATUS => 3},
  95 + $td->NORMALIZE_NEWLINES);
  96 +$td->runtest("check file",
  97 + {$td->FILE => "a.pdf"},
  98 + {$td->FILE => "recover-xref-stream-recovered.pdf"});
  99 +
90 100 cleanup();
91 101 $td->report(calc_ntests($n_tests, $n_compare_pdfs));
... ...
qpdf/qtest/qpdf/bad7-recover.out
1 1 WARNING: bad7.pdf: file is damaged
2 2 WARNING: bad7.pdf (offset 698): expected trailer dictionary
3 3 WARNING: bad7.pdf: Attempting to reconstruct cross-reference table
  4 +WARNING: bad7.pdf (object 2 0, offset 128): expected endobj
  5 +WARNING: bad7.pdf (object 4 0, offset 389): expected endobj
4 6 bad7.pdf: unable to find trailer dictionary while recovering damaged file
... ...
qpdf/qtest/qpdf/issue-146.out
... ... @@ -2,4 +2,7 @@ WARNING: issue-146.pdf: file is damaged
2 2 WARNING: issue-146.pdf: can't find startxref
3 3 WARNING: issue-146.pdf: Attempting to reconstruct cross-reference table
4 4 WARNING: issue-146.pdf (trailer, offset 695): ignoring excessively deeply nested data structure
  5 +WARNING: issue-146.pdf (object 1 0, offset 92): expected endobj
  6 +WARNING: issue-146.pdf (object 7 0, offset 146): unknown token while reading object; treating as string
  7 +WARNING: issue-146.pdf (object 7 0, offset 168): expected endobj
5 8 qpdf: issue-146.pdf: unable to find trailer dictionary while recovering damaged file
... ...
qpdf/qtest/qpdf/issue-148.out
... ... @@ -7,4 +7,9 @@ WARNING: issue-148.pdf (offset 73): error decoding stream data for object 8 0: s
7 7 WARNING: issue-148.pdf: file is damaged
8 8 WARNING: issue-148.pdf (offset 73): getStreamData called on unfilterable stream
9 9 WARNING: issue-148.pdf: Attempting to reconstruct cross-reference table
10   -qpdf: issue-148.pdf: unable to find trailer dictionary while recovering damaged file
  10 +WARNING: issue-148.pdf (xref stream: object 8 0, offset 26): stream dictionary lacks /Length key
  11 +WARNING: issue-148.pdf (xref stream: object 8 0, offset 73): attempting to recover stream length
  12 +WARNING: issue-148.pdf (xref stream: object 8 0, offset 73): recovered stream length: 2
  13 +WARNING: issue-148.pdf (xref stream: object 8 0, offset 85): expected endobj
  14 +WARNING: issue-148.pdf (offset 73): error decoding stream data for object 8 0: stream inflate: inflate: data: incorrect header check
  15 +qpdf: issue-148.pdf: error decoding candidate xref stream while recovering damaged file
... ...
qpdf/qtest/qpdf/issue-150.out
... ... @@ -2,4 +2,5 @@ WARNING: issue-150.pdf: can't find PDF header
2 2 WARNING: issue-150.pdf: file is damaged
3 3 WARNING: issue-150.pdf: error reading xref: overflow/underflow converting 9900000000000000000 to 64-bit integer
4 4 WARNING: issue-150.pdf: Attempting to reconstruct cross-reference table
  5 +WARNING: issue-150.pdf (object 8 0): object has offset 0
5 6 qpdf: issue-150.pdf: unable to find trailer dictionary while recovering damaged file
... ...
qpdf/qtest/qpdf/issue-202.out
... ... @@ -3,4 +3,6 @@ WARNING: issue-202.pdf: file is damaged
3 3 WARNING: issue-202.pdf (offset 54769): expected trailer dictionary
4 4 WARNING: issue-202.pdf: Attempting to reconstruct cross-reference table
5 5 WARNING: issue-202.pdf (trailer, offset 55770): ignoring excessively deeply nested data structure
  6 +WARNING: issue-202.pdf (object 222 0, offset 50101): dictionary has duplicated key /Creator; last occurrence overrides earlier ones
  7 +WARNING: issue-202.pdf (object 222 0, offset 50101): dictionary has duplicated key /Producer; last occurrence overrides earlier ones
6 8 qpdf: issue-202.pdf: unable to find trailer dictionary while recovering damaged file
... ...
qpdf/qtest/qpdf/recover-xref-stream-recovered.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/recover-xref-stream.out 0 → 100644
  1 +WARNING: recover-xref-stream.pdf: file is damaged
  2 +WARNING: recover-xref-stream.pdf: can't find startxref
  3 +WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table
  4 +WARNING: recover-xref-stream.pdf: reported number of objects (14) is not one plus the highest object number (15)
  5 +qpdf: operation succeeded with warnings; resulting file may have some problems
... ...
qpdf/qtest/qpdf/recover-xref-stream.pdf 0 → 100644
No preview for this file type