Commit 2994f9cf4cc45e33406de34d4bce45ca491df98e
1 parent
8a24287c
Attempt to find xref streams during recovery (fixes #1103)
Showing 13 changed files with 74 additions and 2 deletions
ChangeLog
| 1 | +2024-01-06 Jay Berkenbilt <ejb@ql.org> | ||
| 2 | + | ||
| 3 | + * When recovering a file's xref table, attempt to find xref | ||
| 4 | + streams if a traditional trailer dictionary is not found. Fixes | ||
| 5 | + #1103. | ||
| 6 | + | ||
| 1 | 2024-01-05 Jay Berkenbilt <ejb@ql.org> | 7 | 2024-01-05 Jay Berkenbilt <ejb@ql.org> |
| 2 | 8 | ||
| 3 | * Add --set-page-labels command-line argument and supporting API. | 9 | * Add --set-page-labels command-line argument and supporting API. |
libqpdf/QPDF.cc
| @@ -580,6 +580,38 @@ QPDF::reconstruct_xref(QPDFExc& e) | @@ -580,6 +580,38 @@ QPDF::reconstruct_xref(QPDFExc& e) | ||
| 580 | m->deleted_objects.clear(); | 580 | m->deleted_objects.clear(); |
| 581 | 581 | ||
| 582 | if (!m->trailer.isInitialized()) { | 582 | if (!m->trailer.isInitialized()) { |
| 583 | + qpdf_offset_t max_offset{0}; | ||
| 584 | + // If there are any xref streams, take the last one to appear. | ||
| 585 | + for (auto const& iter: m->xref_table) { | ||
| 586 | + auto entry = iter.second; | ||
| 587 | + if (entry.getType() != 1) { | ||
| 588 | + continue; | ||
| 589 | + } | ||
| 590 | + auto oh = getObjectByObjGen(iter.first); | ||
| 591 | + try { | ||
| 592 | + if (!oh.isStreamOfType("/XRef")) { | ||
| 593 | + continue; | ||
| 594 | + } | ||
| 595 | + } catch (std::exception&) { | ||
| 596 | + continue; | ||
| 597 | + } | ||
| 598 | + auto offset = entry.getOffset(); | ||
| 599 | + if (offset > max_offset) { | ||
| 600 | + max_offset = offset; | ||
| 601 | + setTrailer(oh.getDict()); | ||
| 602 | + } | ||
| 603 | + } | ||
| 604 | + if (max_offset > 0) { | ||
| 605 | + try { | ||
| 606 | + read_xref(max_offset); | ||
| 607 | + } catch (std::exception&) { | ||
| 608 | + throw damagedPDF("", 0, "error decoding candidate xref stream while recovering damaged file"); | ||
| 609 | + } | ||
| 610 | + QTC::TC("qpdf", "QPDF recover xref stream"); | ||
| 611 | + } | ||
| 612 | + } | ||
| 613 | + | ||
| 614 | + if (!m->trailer.isInitialized()) { | ||
| 583 | // We could check the last encountered object to see if it was an xref stream. If so, we | 615 | // We could check the last encountered object to see if it was an xref stream. If so, we |
| 584 | // could try to get the trailer from there. This may make it possible to recover files with | 616 | // could try to get the trailer from there. This may make it possible to recover files with |
| 585 | // bad startxref pointers even when they have object streams. | 617 | // bad startxref pointers even when they have object streams. |
manual/release-notes.rst
| @@ -67,6 +67,11 @@ Planned changes for future 12.x (subject to change): | @@ -67,6 +67,11 @@ Planned changes for future 12.x (subject to change): | ||
| 67 | 67 | ||
| 68 | - ``QPDFPageLabelDocumentHelper::pageLabelDict`` | 68 | - ``QPDFPageLabelDocumentHelper::pageLabelDict`` |
| 69 | 69 | ||
| 70 | + - Improve file recovery logic to better handle files with | ||
| 71 | + cross-reference streams. This should enable qpdf to recover some | ||
| 72 | + files that it would previously have reported "unable to find | ||
| 73 | + trailer dictionary." | ||
| 74 | + | ||
| 70 | 11.7.0: December 24, 2023 | 75 | 11.7.0: December 24, 2023 |
| 71 | - Bug fixes: | 76 | - Bug fixes: |
| 72 | 77 |
qpdf/qpdf.testcov
| @@ -689,3 +689,4 @@ QPDFPageObjectHelper used fallback without copying 0 | @@ -689,3 +689,4 @@ QPDFPageObjectHelper used fallback without copying 0 | ||
| 689 | QPDF skipping cache for known unchecked object 0 | 689 | QPDF skipping cache for known unchecked object 0 |
| 690 | QPDF fix dangling triggered xref reconstruction 0 | 690 | QPDF fix dangling triggered xref reconstruction 0 |
| 691 | QPDFPageDocumentHelper flatten resources missing or invalid 0 | 691 | QPDFPageDocumentHelper flatten resources missing or invalid 0 |
| 692 | +QPDF recover xref stream 0 |
qpdf/qtest/object-stream.test
| @@ -16,7 +16,7 @@ cleanup(); | @@ -16,7 +16,7 @@ cleanup(); | ||
| 16 | 16 | ||
| 17 | my $td = new TestDriver('object-stream'); | 17 | my $td = new TestDriver('object-stream'); |
| 18 | 18 | ||
| 19 | -my $n_tests = 3 + (36 * 4) + (12 * 2); | 19 | +my $n_tests = 5 + (36 * 4) + (12 * 2); |
| 20 | my $n_compare_pdfs = 36; | 20 | my $n_compare_pdfs = 36; |
| 21 | 21 | ||
| 22 | for (my $n = 16; $n <= 19; ++$n) | 22 | for (my $n = 16; $n <= 19; ++$n) |
| @@ -87,5 +87,15 @@ $td->runtest("check file", | @@ -87,5 +87,15 @@ $td->runtest("check file", | ||
| 87 | {$td->FILE => "gen1.qdf"}); | 87 | {$td->FILE => "gen1.qdf"}); |
| 88 | 88 | ||
| 89 | 89 | ||
| 90 | +# Recover a file with xref streams | ||
| 91 | +$td->runtest("recover file with xref stream", | ||
| 92 | + {$td->COMMAND => "qpdf --static-id --compress-streams=n" . | ||
| 93 | + " recover-xref-stream.pdf a.pdf"}, | ||
| 94 | + {$td->FILE => "recover-xref-stream.out", $td->EXIT_STATUS => 3}, | ||
| 95 | + $td->NORMALIZE_NEWLINES); | ||
| 96 | +$td->runtest("check file", | ||
| 97 | + {$td->FILE => "a.pdf"}, | ||
| 98 | + {$td->FILE => "recover-xref-stream-recovered.pdf"}); | ||
| 99 | + | ||
| 90 | cleanup(); | 100 | cleanup(); |
| 91 | $td->report(calc_ntests($n_tests, $n_compare_pdfs)); | 101 | $td->report(calc_ntests($n_tests, $n_compare_pdfs)); |
qpdf/qtest/qpdf/bad7-recover.out
| 1 | WARNING: bad7.pdf: file is damaged | 1 | WARNING: bad7.pdf: file is damaged |
| 2 | WARNING: bad7.pdf (offset 698): expected trailer dictionary | 2 | WARNING: bad7.pdf (offset 698): expected trailer dictionary |
| 3 | WARNING: bad7.pdf: Attempting to reconstruct cross-reference table | 3 | WARNING: bad7.pdf: Attempting to reconstruct cross-reference table |
| 4 | +WARNING: bad7.pdf (object 2 0, offset 128): expected endobj | ||
| 5 | +WARNING: bad7.pdf (object 4 0, offset 389): expected endobj | ||
| 4 | bad7.pdf: unable to find trailer dictionary while recovering damaged file | 6 | bad7.pdf: unable to find trailer dictionary while recovering damaged file |
qpdf/qtest/qpdf/issue-146.out
| @@ -2,4 +2,7 @@ WARNING: issue-146.pdf: file is damaged | @@ -2,4 +2,7 @@ WARNING: issue-146.pdf: file is damaged | ||
| 2 | WARNING: issue-146.pdf: can't find startxref | 2 | WARNING: issue-146.pdf: can't find startxref |
| 3 | WARNING: issue-146.pdf: Attempting to reconstruct cross-reference table | 3 | WARNING: issue-146.pdf: Attempting to reconstruct cross-reference table |
| 4 | WARNING: issue-146.pdf (trailer, offset 695): ignoring excessively deeply nested data structure | 4 | WARNING: issue-146.pdf (trailer, offset 695): ignoring excessively deeply nested data structure |
| 5 | +WARNING: issue-146.pdf (object 1 0, offset 92): expected endobj | ||
| 6 | +WARNING: issue-146.pdf (object 7 0, offset 146): unknown token while reading object; treating as string | ||
| 7 | +WARNING: issue-146.pdf (object 7 0, offset 168): expected endobj | ||
| 5 | qpdf: issue-146.pdf: unable to find trailer dictionary while recovering damaged file | 8 | qpdf: issue-146.pdf: unable to find trailer dictionary while recovering damaged file |
qpdf/qtest/qpdf/issue-148.out
| @@ -7,4 +7,9 @@ WARNING: issue-148.pdf (offset 73): error decoding stream data for object 8 0: s | @@ -7,4 +7,9 @@ WARNING: issue-148.pdf (offset 73): error decoding stream data for object 8 0: s | ||
| 7 | WARNING: issue-148.pdf: file is damaged | 7 | WARNING: issue-148.pdf: file is damaged |
| 8 | WARNING: issue-148.pdf (offset 73): getStreamData called on unfilterable stream | 8 | WARNING: issue-148.pdf (offset 73): getStreamData called on unfilterable stream |
| 9 | WARNING: issue-148.pdf: Attempting to reconstruct cross-reference table | 9 | WARNING: issue-148.pdf: Attempting to reconstruct cross-reference table |
| 10 | -qpdf: issue-148.pdf: unable to find trailer dictionary while recovering damaged file | 10 | +WARNING: issue-148.pdf (xref stream: object 8 0, offset 26): stream dictionary lacks /Length key |
| 11 | +WARNING: issue-148.pdf (xref stream: object 8 0, offset 73): attempting to recover stream length | ||
| 12 | +WARNING: issue-148.pdf (xref stream: object 8 0, offset 73): recovered stream length: 2 | ||
| 13 | +WARNING: issue-148.pdf (xref stream: object 8 0, offset 85): expected endobj | ||
| 14 | +WARNING: issue-148.pdf (offset 73): error decoding stream data for object 8 0: stream inflate: inflate: data: incorrect header check | ||
| 15 | +qpdf: issue-148.pdf: error decoding candidate xref stream while recovering damaged file |
qpdf/qtest/qpdf/issue-150.out
| @@ -2,4 +2,5 @@ WARNING: issue-150.pdf: can't find PDF header | @@ -2,4 +2,5 @@ WARNING: issue-150.pdf: can't find PDF header | ||
| 2 | WARNING: issue-150.pdf: file is damaged | 2 | WARNING: issue-150.pdf: file is damaged |
| 3 | WARNING: issue-150.pdf: error reading xref: overflow/underflow converting 9900000000000000000 to 64-bit integer | 3 | WARNING: issue-150.pdf: error reading xref: overflow/underflow converting 9900000000000000000 to 64-bit integer |
| 4 | WARNING: issue-150.pdf: Attempting to reconstruct cross-reference table | 4 | WARNING: issue-150.pdf: Attempting to reconstruct cross-reference table |
| 5 | +WARNING: issue-150.pdf (object 8 0): object has offset 0 | ||
| 5 | qpdf: issue-150.pdf: unable to find trailer dictionary while recovering damaged file | 6 | qpdf: issue-150.pdf: unable to find trailer dictionary while recovering damaged file |
qpdf/qtest/qpdf/issue-202.out
| @@ -3,4 +3,6 @@ WARNING: issue-202.pdf: file is damaged | @@ -3,4 +3,6 @@ WARNING: issue-202.pdf: file is damaged | ||
| 3 | WARNING: issue-202.pdf (offset 54769): expected trailer dictionary | 3 | WARNING: issue-202.pdf (offset 54769): expected trailer dictionary |
| 4 | WARNING: issue-202.pdf: Attempting to reconstruct cross-reference table | 4 | WARNING: issue-202.pdf: Attempting to reconstruct cross-reference table |
| 5 | WARNING: issue-202.pdf (trailer, offset 55770): ignoring excessively deeply nested data structure | 5 | WARNING: issue-202.pdf (trailer, offset 55770): ignoring excessively deeply nested data structure |
| 6 | +WARNING: issue-202.pdf (object 222 0, offset 50101): dictionary has duplicated key /Creator; last occurrence overrides earlier ones | ||
| 7 | +WARNING: issue-202.pdf (object 222 0, offset 50101): dictionary has duplicated key /Producer; last occurrence overrides earlier ones | ||
| 6 | qpdf: issue-202.pdf: unable to find trailer dictionary while recovering damaged file | 8 | qpdf: issue-202.pdf: unable to find trailer dictionary while recovering damaged file |
qpdf/qtest/qpdf/recover-xref-stream-recovered.pdf
0 → 100644
No preview for this file type
qpdf/qtest/qpdf/recover-xref-stream.out
0 → 100644
| 1 | +WARNING: recover-xref-stream.pdf: file is damaged | ||
| 2 | +WARNING: recover-xref-stream.pdf: can't find startxref | ||
| 3 | +WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table | ||
| 4 | +WARNING: recover-xref-stream.pdf: reported number of objects (14) is not one plus the highest object number (15) | ||
| 5 | +qpdf: operation succeeded with warnings; resulting file may have some problems |
qpdf/qtest/qpdf/recover-xref-stream.pdf
0 → 100644
No preview for this file type