Commit ca3ea2e3054189ca9b6d55f7a74588a3a1cb74b2
1 parent
aa583f29
Refine xref reconstruction (fixes #1335)
When recovering XRef streams, start with the stream with the largest /Size rather than the one with the largest offset. Also, if reconstruction fails to find a trailer with a valid /Root entry, search for a root object.
Showing
9 changed files
with
57 additions
and
9 deletions
libqpdf/QPDF.cc
| ... | ... | @@ -608,6 +608,7 @@ QPDF::reconstruct_xref(QPDFExc& e) |
| 608 | 608 | |
| 609 | 609 | if (!m->trailer) { |
| 610 | 610 | qpdf_offset_t max_offset{0}; |
| 611 | + size_t max_size{0}; | |
| 611 | 612 | // If there are any xref streams, take the last one to appear. |
| 612 | 613 | for (auto const& iter: m->xref_table) { |
| 613 | 614 | auto entry = iter.second; |
| ... | ... | @@ -623,7 +624,8 @@ QPDF::reconstruct_xref(QPDFExc& e) |
| 623 | 624 | continue; |
| 624 | 625 | } |
| 625 | 626 | auto offset = entry.getOffset(); |
| 626 | - if (offset > max_offset) { | |
| 627 | + auto size = oh.getDict().getKey("/Size").getUIntValueAsUInt(); | |
| 628 | + if (size > max_size || (size == max_size && offset > max_offset)) { | |
| 627 | 629 | max_offset = offset; |
| 628 | 630 | setTrailer(oh.getDict()); |
| 629 | 631 | } |
| ... | ... | @@ -633,13 +635,35 @@ QPDF::reconstruct_xref(QPDFExc& e) |
| 633 | 635 | try { |
| 634 | 636 | read_xref(max_offset); |
| 635 | 637 | } catch (std::exception&) { |
| 636 | - throw damagedPDF( | |
| 637 | - "", 0, "error decoding candidate xref stream while recovering damaged file"); | |
| 638 | + warn(damagedPDF( | |
| 639 | + "", 0, "error decoding candidate xref stream while recovering damaged file")); | |
| 638 | 640 | } |
| 639 | 641 | QTC::TC("qpdf", "QPDF recover xref stream"); |
| 640 | 642 | } |
| 641 | 643 | } |
| 642 | 644 | |
| 645 | + if (!m->trailer || (!m->parsed && !m->trailer.getKey("/Root").isDictionary())) { | |
| 646 | + // Try to find a Root dictionary. As a quick fix try the one with the highest object id. | |
| 647 | + QPDFObjectHandle root; | |
| 648 | + for (auto const& iter: m->obj_cache) { | |
| 649 | + try { | |
| 650 | + if (QPDFObjectHandle(iter.second.object).isDictionaryOfType("/Catalog")) { | |
| 651 | + root = iter.second.object; | |
| 652 | + } | |
| 653 | + } catch (std::exception&) { | |
| 654 | + continue; | |
| 655 | + } | |
| 656 | + } | |
| 657 | + if (root) { | |
| 658 | + if (!m->trailer) { | |
| 659 | + warn(damagedPDF( | |
| 660 | + "", 0, "unable to find trailer dictionary while recovering damaged file")); | |
| 661 | + m->trailer = QPDFObjectHandle::newDictionary(); | |
| 662 | + } | |
| 663 | + m->trailer.replaceKey("/Root", root); | |
| 664 | + } | |
| 665 | + } | |
| 666 | + | |
| 643 | 667 | if (!m->trailer) { |
| 644 | 668 | // We could check the last encountered object to see if it was an xref stream. If so, we |
| 645 | 669 | // could try to get the trailer from there. This may make it possible to recover files with | ... | ... |
qpdf/qtest/error-condition.test
| ... | ... | @@ -127,7 +127,7 @@ $n_tests += @badfiles + 11; |
| 127 | 127 | # though in some cases it may. Acrobat Reader would not be able to |
| 128 | 128 | # recover any of these files any better. |
| 129 | 129 | my %recover_failures = (); |
| 130 | -for (1, 7, 16) | |
| 130 | +for (1) | |
| 131 | 131 | { |
| 132 | 132 | $recover_failures{$_} = 1; |
| 133 | 133 | } | ... | ... |
qpdf/qtest/qpdf/bad16-recover.out
| ... | ... | @@ -11,4 +11,10 @@ WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token |
| 11 | 11 | WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string |
| 12 | 12 | WARNING: bad16.pdf (trailer, offset 779): parse error while reading object |
| 13 | 13 | WARNING: bad16.pdf (trailer, offset 779): unexpected EOF |
| 14 | -bad16.pdf: unable to find trailer dictionary while recovering damaged file | |
| 14 | +WARNING: bad16.pdf: unable to find trailer dictionary while recovering damaged file | |
| 15 | +/QTest is implicit | |
| 16 | +/QTest is direct and has type null (2) | |
| 17 | +/QTest is null | |
| 18 | +unparse: null | |
| 19 | +unparseResolved: null | |
| 20 | +test 1 done | ... | ... |
qpdf/qtest/qpdf/bad7-recover.out
| ... | ... | @@ -3,4 +3,10 @@ WARNING: bad7.pdf (offset 698): expected trailer dictionary |
| 3 | 3 | WARNING: bad7.pdf: Attempting to reconstruct cross-reference table |
| 4 | 4 | WARNING: bad7.pdf (object 2 0, offset 128): expected endobj |
| 5 | 5 | WARNING: bad7.pdf (object 4 0, offset 389): expected endobj |
| 6 | -bad7.pdf: unable to find trailer dictionary while recovering damaged file | |
| 6 | +WARNING: bad7.pdf: unable to find trailer dictionary while recovering damaged file | |
| 7 | +/QTest is implicit | |
| 8 | +/QTest is direct and has type null (2) | |
| 9 | +/QTest is null | |
| 10 | +unparse: null | |
| 11 | +unparseResolved: null | |
| 12 | +test 1 done | ... | ... |
qpdf/qtest/qpdf/issue-100.out
| ... | ... | @@ -2,4 +2,12 @@ WARNING: issue-100.pdf: file is damaged |
| 2 | 2 | WARNING: issue-100.pdf (offset 736): xref not found |
| 3 | 3 | WARNING: issue-100.pdf: Attempting to reconstruct cross-reference table |
| 4 | 4 | WARNING: issue-100.pdf (trailer, offset 488): stream keyword found in trailer |
| 5 | +WARNING: issue-100.pdf (object 5 0, offset 268): unknown token while reading object; treating as string | |
| 6 | +WARNING: issue-100.pdf (object 5 0, offset 286): unknown token while reading object; treating as string | |
| 7 | +WARNING: issue-100.pdf (object 5 0, offset 289): unknown token while reading object; treating as string | |
| 8 | +WARNING: issue-100.pdf (object 5 0, offset 294): unknown token while reading object; treating as string | |
| 9 | +WARNING: issue-100.pdf (object 5 0, offset 297): unknown token while reading object; treating as string | |
| 10 | +WARNING: issue-100.pdf (object 5 0, offset 304): unknown token while reading object; treating as string | |
| 11 | +WARNING: issue-100.pdf (object 5 0, offset 304): too many errors; giving up on reading object | |
| 12 | +WARNING: issue-100.pdf (object 5 0, offset 308): expected endobj | |
| 5 | 13 | qpdf: issue-100.pdf: unable to find /Root dictionary | ... | ... |
qpdf/qtest/qpdf/issue-101.out
| ... | ... | @@ -2,4 +2,6 @@ WARNING: issue-101.pdf: file is damaged |
| 2 | 2 | WARNING: issue-101.pdf (offset 3526): xref not found |
| 3 | 3 | WARNING: issue-101.pdf: Attempting to reconstruct cross-reference table |
| 4 | 4 | WARNING: issue-101.pdf (trailer, offset 1508): stream keyword found in trailer |
| 5 | +WARNING: issue-101.pdf (object 5 0, offset 1242): dictionary ended prematurely; using null as value for last key | |
| 6 | +WARNING: issue-101.pdf (object 5 0, offset 1242): expected dictionary key but found non-name object; inserting key /QPDFFake1 | |
| 5 | 7 | qpdf: issue-101.pdf: unable to find /Root dictionary | ... | ... |
qpdf/qtest/qpdf/issue-148.out
| ... | ... | @@ -12,4 +12,5 @@ WARNING: issue-148.pdf (xref stream: object 8 0, offset 73): attempting to recov |
| 12 | 12 | WARNING: issue-148.pdf (xref stream: object 8 0, offset 73): recovered stream length: 2 |
| 13 | 13 | WARNING: issue-148.pdf (xref stream: object 8 0, offset 85): expected endobj |
| 14 | 14 | WARNING: issue-148.pdf (offset 73): error decoding stream data for object 8 0: stream inflate: inflate: data: incorrect header check |
| 15 | -qpdf: issue-148.pdf: error decoding candidate xref stream while recovering damaged file | |
| 15 | +WARNING: issue-148.pdf: error decoding candidate xref stream while recovering damaged file | |
| 16 | +qpdf: issue-148.pdf: unable to find /Root dictionary | ... | ... |
qpdf/qtest/qpdf/issue-202.out
| ... | ... | @@ -5,4 +5,5 @@ WARNING: issue-202.pdf: Attempting to reconstruct cross-reference table |
| 5 | 5 | WARNING: issue-202.pdf (trailer, offset 55770): ignoring excessively deeply nested data structure |
| 6 | 6 | WARNING: issue-202.pdf (object 222 0, offset 50101): dictionary has duplicated key /Creator; last occurrence overrides earlier ones |
| 7 | 7 | WARNING: issue-202.pdf (object 222 0, offset 50101): dictionary has duplicated key /Producer; last occurrence overrides earlier ones |
| 8 | -qpdf: issue-202.pdf: unable to find trailer dictionary while recovering damaged file | |
| 8 | +WARNING: issue-202.pdf: unable to find trailer dictionary while recovering damaged file | |
| 9 | +qpdf: operation succeeded with warnings; resulting file may have some problems | ... | ... |
qpdf/qtest/specific-bugs.test
| ... | ... | @@ -34,7 +34,7 @@ my @bug_tests = ( |
| 34 | 34 | ["148", "free memory on bad flate", 2], |
| 35 | 35 | ["149", "xref prev pointer loop", 3], |
| 36 | 36 | ["150", "integer overflow", 2], |
| 37 | - ["202", "even more deeply nested dictionary", 2], | |
| 37 | + ["202", "even more deeply nested dictionary", 3], | |
| 38 | 38 | ["263", "empty xref stream", 2], |
| 39 | 39 | ["335a", "ozz-fuzz-12152", 2], |
| 40 | 40 | ["335b", "ozz-fuzz-14845", 2], | ... | ... |