Commit 2994f9cf4cc45e33406de34d4bce45ca491df98e

Authored by Jay Berkenbilt
1 parent 8a24287c

Attempt to find xref streams during recovery (fixes #1103)

ChangeLog
  1 +2024-01-06 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * When recovering a file's xref table, attempt to find xref
  4 + streams if a traditional trailer dictionary is not found. Fixes
  5 + #1103.
  6 +
1 2024-01-05 Jay Berkenbilt <ejb@ql.org> 7 2024-01-05 Jay Berkenbilt <ejb@ql.org>
2 8
3 * Add --set-page-labels command-line argument and supporting API. 9 * Add --set-page-labels command-line argument and supporting API.
libqpdf/QPDF.cc
@@ -580,6 +580,38 @@ QPDF::reconstruct_xref(QPDFExc& e) @@ -580,6 +580,38 @@ QPDF::reconstruct_xref(QPDFExc& e)
580 m->deleted_objects.clear(); 580 m->deleted_objects.clear();
581 581
582 if (!m->trailer.isInitialized()) { 582 if (!m->trailer.isInitialized()) {
  583 + qpdf_offset_t max_offset{0};
  584 + // If there are any xref streams, take the last one to appear.
  585 + for (auto const& iter: m->xref_table) {
  586 + auto entry = iter.second;
  587 + if (entry.getType() != 1) {
  588 + continue;
  589 + }
  590 + auto oh = getObjectByObjGen(iter.first);
  591 + try {
  592 + if (!oh.isStreamOfType("/XRef")) {
  593 + continue;
  594 + }
  595 + } catch (std::exception&) {
  596 + continue;
  597 + }
  598 + auto offset = entry.getOffset();
  599 + if (offset > max_offset) {
  600 + max_offset = offset;
  601 + setTrailer(oh.getDict());
  602 + }
  603 + }
  604 + if (max_offset > 0) {
  605 + try {
  606 + read_xref(max_offset);
  607 + } catch (std::exception&) {
  608 + throw damagedPDF("", 0, "error decoding candidate xref stream while recovering damaged file");
  609 + }
  610 + QTC::TC("qpdf", "QPDF recover xref stream");
  611 + }
  612 + }
  613 +
  614 + if (!m->trailer.isInitialized()) {
583 // We could check the last encountered object to see if it was an xref stream. If so, we 615 // We could check the last encountered object to see if it was an xref stream. If so, we
584 // could try to get the trailer from there. This may make it possible to recover files with 616 // could try to get the trailer from there. This may make it possible to recover files with
585 // bad startxref pointers even when they have object streams. 617 // bad startxref pointers even when they have object streams.
manual/release-notes.rst
@@ -67,6 +67,11 @@ Planned changes for future 12.x (subject to change): @@ -67,6 +67,11 @@ Planned changes for future 12.x (subject to change):
67 67
68 - ``QPDFPageLabelDocumentHelper::pageLabelDict`` 68 - ``QPDFPageLabelDocumentHelper::pageLabelDict``
69 69
  70 + - Improve file recovery logic to better handle files with
  71 + cross-reference streams. This should enable qpdf to recover some
  72 + files that it would previously have reported "unable to find
  73 + trailer dictionary."
  74 +
70 11.7.0: December 24, 2023 75 11.7.0: December 24, 2023
71 - Bug fixes: 76 - Bug fixes:
72 77
qpdf/qpdf.testcov
@@ -689,3 +689,4 @@ QPDFPageObjectHelper used fallback without copying 0 @@ -689,3 +689,4 @@ QPDFPageObjectHelper used fallback without copying 0
689 QPDF skipping cache for known unchecked object 0 689 QPDF skipping cache for known unchecked object 0
690 QPDF fix dangling triggered xref reconstruction 0 690 QPDF fix dangling triggered xref reconstruction 0
691 QPDFPageDocumentHelper flatten resources missing or invalid 0 691 QPDFPageDocumentHelper flatten resources missing or invalid 0
  692 +QPDF recover xref stream 0
qpdf/qtest/object-stream.test
@@ -16,7 +16,7 @@ cleanup(); @@ -16,7 +16,7 @@ cleanup();
16 16
17 my $td = new TestDriver('object-stream'); 17 my $td = new TestDriver('object-stream');
18 18
19 -my $n_tests = 3 + (36 * 4) + (12 * 2); 19 +my $n_tests = 5 + (36 * 4) + (12 * 2);
20 my $n_compare_pdfs = 36; 20 my $n_compare_pdfs = 36;
21 21
22 for (my $n = 16; $n <= 19; ++$n) 22 for (my $n = 16; $n <= 19; ++$n)
@@ -87,5 +87,15 @@ $td->runtest("check file", @@ -87,5 +87,15 @@ $td->runtest("check file",
87 {$td->FILE => "gen1.qdf"}); 87 {$td->FILE => "gen1.qdf"});
88 88
89 89
  90 +# Recover a file with xref streams
  91 +$td->runtest("recover file with xref stream",
  92 + {$td->COMMAND => "qpdf --static-id --compress-streams=n" .
  93 + " recover-xref-stream.pdf a.pdf"},
  94 + {$td->FILE => "recover-xref-stream.out", $td->EXIT_STATUS => 3},
  95 + $td->NORMALIZE_NEWLINES);
  96 +$td->runtest("check file",
  97 + {$td->FILE => "a.pdf"},
  98 + {$td->FILE => "recover-xref-stream-recovered.pdf"});
  99 +
90 cleanup(); 100 cleanup();
91 $td->report(calc_ntests($n_tests, $n_compare_pdfs)); 101 $td->report(calc_ntests($n_tests, $n_compare_pdfs));
qpdf/qtest/qpdf/bad7-recover.out
1 WARNING: bad7.pdf: file is damaged 1 WARNING: bad7.pdf: file is damaged
2 WARNING: bad7.pdf (offset 698): expected trailer dictionary 2 WARNING: bad7.pdf (offset 698): expected trailer dictionary
3 WARNING: bad7.pdf: Attempting to reconstruct cross-reference table 3 WARNING: bad7.pdf: Attempting to reconstruct cross-reference table
  4 +WARNING: bad7.pdf (object 2 0, offset 128): expected endobj
  5 +WARNING: bad7.pdf (object 4 0, offset 389): expected endobj
4 bad7.pdf: unable to find trailer dictionary while recovering damaged file 6 bad7.pdf: unable to find trailer dictionary while recovering damaged file
qpdf/qtest/qpdf/issue-146.out
@@ -2,4 +2,7 @@ WARNING: issue-146.pdf: file is damaged @@ -2,4 +2,7 @@ WARNING: issue-146.pdf: file is damaged
2 WARNING: issue-146.pdf: can't find startxref 2 WARNING: issue-146.pdf: can't find startxref
3 WARNING: issue-146.pdf: Attempting to reconstruct cross-reference table 3 WARNING: issue-146.pdf: Attempting to reconstruct cross-reference table
4 WARNING: issue-146.pdf (trailer, offset 695): ignoring excessively deeply nested data structure 4 WARNING: issue-146.pdf (trailer, offset 695): ignoring excessively deeply nested data structure
  5 +WARNING: issue-146.pdf (object 1 0, offset 92): expected endobj
  6 +WARNING: issue-146.pdf (object 7 0, offset 146): unknown token while reading object; treating as string
  7 +WARNING: issue-146.pdf (object 7 0, offset 168): expected endobj
5 qpdf: issue-146.pdf: unable to find trailer dictionary while recovering damaged file 8 qpdf: issue-146.pdf: unable to find trailer dictionary while recovering damaged file
qpdf/qtest/qpdf/issue-148.out
@@ -7,4 +7,9 @@ WARNING: issue-148.pdf (offset 73): error decoding stream data for object 8 0: s @@ -7,4 +7,9 @@ WARNING: issue-148.pdf (offset 73): error decoding stream data for object 8 0: s
7 WARNING: issue-148.pdf: file is damaged 7 WARNING: issue-148.pdf: file is damaged
8 WARNING: issue-148.pdf (offset 73): getStreamData called on unfilterable stream 8 WARNING: issue-148.pdf (offset 73): getStreamData called on unfilterable stream
9 WARNING: issue-148.pdf: Attempting to reconstruct cross-reference table 9 WARNING: issue-148.pdf: Attempting to reconstruct cross-reference table
10 -qpdf: issue-148.pdf: unable to find trailer dictionary while recovering damaged file 10 +WARNING: issue-148.pdf (xref stream: object 8 0, offset 26): stream dictionary lacks /Length key
  11 +WARNING: issue-148.pdf (xref stream: object 8 0, offset 73): attempting to recover stream length
  12 +WARNING: issue-148.pdf (xref stream: object 8 0, offset 73): recovered stream length: 2
  13 +WARNING: issue-148.pdf (xref stream: object 8 0, offset 85): expected endobj
  14 +WARNING: issue-148.pdf (offset 73): error decoding stream data for object 8 0: stream inflate: inflate: data: incorrect header check
  15 +qpdf: issue-148.pdf: error decoding candidate xref stream while recovering damaged file
qpdf/qtest/qpdf/issue-150.out
@@ -2,4 +2,5 @@ WARNING: issue-150.pdf: can't find PDF header @@ -2,4 +2,5 @@ WARNING: issue-150.pdf: can't find PDF header
2 WARNING: issue-150.pdf: file is damaged 2 WARNING: issue-150.pdf: file is damaged
3 WARNING: issue-150.pdf: error reading xref: overflow/underflow converting 9900000000000000000 to 64-bit integer 3 WARNING: issue-150.pdf: error reading xref: overflow/underflow converting 9900000000000000000 to 64-bit integer
4 WARNING: issue-150.pdf: Attempting to reconstruct cross-reference table 4 WARNING: issue-150.pdf: Attempting to reconstruct cross-reference table
  5 +WARNING: issue-150.pdf (object 8 0): object has offset 0
5 qpdf: issue-150.pdf: unable to find trailer dictionary while recovering damaged file 6 qpdf: issue-150.pdf: unable to find trailer dictionary while recovering damaged file
qpdf/qtest/qpdf/issue-202.out
@@ -3,4 +3,6 @@ WARNING: issue-202.pdf: file is damaged @@ -3,4 +3,6 @@ WARNING: issue-202.pdf: file is damaged
3 WARNING: issue-202.pdf (offset 54769): expected trailer dictionary 3 WARNING: issue-202.pdf (offset 54769): expected trailer dictionary
4 WARNING: issue-202.pdf: Attempting to reconstruct cross-reference table 4 WARNING: issue-202.pdf: Attempting to reconstruct cross-reference table
5 WARNING: issue-202.pdf (trailer, offset 55770): ignoring excessively deeply nested data structure 5 WARNING: issue-202.pdf (trailer, offset 55770): ignoring excessively deeply nested data structure
  6 +WARNING: issue-202.pdf (object 222 0, offset 50101): dictionary has duplicated key /Creator; last occurrence overrides earlier ones
  7 +WARNING: issue-202.pdf (object 222 0, offset 50101): dictionary has duplicated key /Producer; last occurrence overrides earlier ones
6 qpdf: issue-202.pdf: unable to find trailer dictionary while recovering damaged file 8 qpdf: issue-202.pdf: unable to find trailer dictionary while recovering damaged file
qpdf/qtest/qpdf/recover-xref-stream-recovered.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/recover-xref-stream.out 0 → 100644
  1 +WARNING: recover-xref-stream.pdf: file is damaged
  2 +WARNING: recover-xref-stream.pdf: can't find startxref
  3 +WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table
  4 +WARNING: recover-xref-stream.pdf: reported number of objects (14) is not one plus the highest object number (15)
  5 +qpdf: operation succeeded with warnings; resulting file may have some problems
qpdf/qtest/qpdf/recover-xref-stream.pdf 0 → 100644
No preview for this file type