Commit ca3ea2e3054189ca9b6d55f7a74588a3a1cb74b2

Authored by m-holger
1 parent aa583f29

Refine xref reconstruction (fixes #1335)

When recovering XRef streams, start with the stream with the largest
/Size rather than the largest offset.

Also, if reconstruction fails to find a trailer with a valid /Root entry,
search for a root object.
libqpdf/QPDF.cc
@@ -608,6 +608,7 @@ QPDF::reconstruct_xref(QPDFExc& e) @@ -608,6 +608,7 @@ QPDF::reconstruct_xref(QPDFExc& e)
608 608
609 if (!m->trailer) { 609 if (!m->trailer) {
610 qpdf_offset_t max_offset{0}; 610 qpdf_offset_t max_offset{0};
  611 + size_t max_size{0};
611 // If there are any xref streams, take the last one to appear. 612 // If there are any xref streams, take the last one to appear.
612 for (auto const& iter: m->xref_table) { 613 for (auto const& iter: m->xref_table) {
613 auto entry = iter.second; 614 auto entry = iter.second;
@@ -623,7 +624,8 @@ QPDF::reconstruct_xref(QPDFExc& e) @@ -623,7 +624,8 @@ QPDF::reconstruct_xref(QPDFExc& e)
623 continue; 624 continue;
624 } 625 }
625 auto offset = entry.getOffset(); 626 auto offset = entry.getOffset();
626 - if (offset > max_offset) { 627 + auto size = oh.getDict().getKey("/Size").getUIntValueAsUInt();
  628 + if (size > max_size || (size == max_size && offset > max_offset)) {
627 max_offset = offset; 629 max_offset = offset;
628 setTrailer(oh.getDict()); 630 setTrailer(oh.getDict());
629 } 631 }
@@ -633,13 +635,35 @@ QPDF::reconstruct_xref(QPDFExc& e) @@ -633,13 +635,35 @@ QPDF::reconstruct_xref(QPDFExc& e)
633 try { 635 try {
634 read_xref(max_offset); 636 read_xref(max_offset);
635 } catch (std::exception&) { 637 } catch (std::exception&) {
636 - throw damagedPDF(  
637 - "", 0, "error decoding candidate xref stream while recovering damaged file"); 638 + warn(damagedPDF(
  639 + "", 0, "error decoding candidate xref stream while recovering damaged file"));
638 } 640 }
639 QTC::TC("qpdf", "QPDF recover xref stream"); 641 QTC::TC("qpdf", "QPDF recover xref stream");
640 } 642 }
641 } 643 }
642 644
  645 + if (!m->trailer || (!m->parsed && !m->trailer.getKey("/Root").isDictionary())) {
  646 + // Try to find a Root dictionary. As a quick fix try the one with the highest object id.
  647 + QPDFObjectHandle root;
  648 + for (auto const& iter: m->obj_cache) {
  649 + try {
  650 + if (QPDFObjectHandle(iter.second.object).isDictionaryOfType("/Catalog")) {
  651 + root = iter.second.object;
  652 + }
  653 + } catch (std::exception&) {
  654 + continue;
  655 + }
  656 + }
  657 + if (root) {
  658 + if (!m->trailer) {
  659 + warn(damagedPDF(
  660 + "", 0, "unable to find trailer dictionary while recovering damaged file"));
  661 + m->trailer = QPDFObjectHandle::newDictionary();
  662 + }
  663 + m->trailer.replaceKey("/Root", root);
  664 + }
  665 + }
  666 +
643 if (!m->trailer) { 667 if (!m->trailer) {
644 // We could check the last encountered object to see if it was an xref stream. If so, we 668 // We could check the last encountered object to see if it was an xref stream. If so, we
645 // could try to get the trailer from there. This may make it possible to recover files with 669 // could try to get the trailer from there. This may make it possible to recover files with
qpdf/qtest/error-condition.test
@@ -127,7 +127,7 @@ $n_tests += @badfiles + 11; @@ -127,7 +127,7 @@ $n_tests += @badfiles + 11;
127 # though in some cases it may. Acrobat Reader would not be able to 127 # though in some cases it may. Acrobat Reader would not be able to
128 # recover any of these files any better. 128 # recover any of these files any better.
129 my %recover_failures = (); 129 my %recover_failures = ();
130 -for (1, 7, 16) 130 +for (1)
131 { 131 {
132 $recover_failures{$_} = 1; 132 $recover_failures{$_} = 1;
133 } 133 }
qpdf/qtest/qpdf/bad16-recover.out
@@ -11,4 +11,10 @@ WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token @@ -11,4 +11,10 @@ WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token
11 WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string 11 WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string
12 WARNING: bad16.pdf (trailer, offset 779): parse error while reading object 12 WARNING: bad16.pdf (trailer, offset 779): parse error while reading object
13 WARNING: bad16.pdf (trailer, offset 779): unexpected EOF 13 WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
14 -bad16.pdf: unable to find trailer dictionary while recovering damaged file 14 +WARNING: bad16.pdf: unable to find trailer dictionary while recovering damaged file
  15 +/QTest is implicit
  16 +/QTest is direct and has type null (2)
  17 +/QTest is null
  18 +unparse: null
  19 +unparseResolved: null
  20 +test 1 done
qpdf/qtest/qpdf/bad7-recover.out
@@ -3,4 +3,10 @@ WARNING: bad7.pdf (offset 698): expected trailer dictionary @@ -3,4 +3,10 @@ WARNING: bad7.pdf (offset 698): expected trailer dictionary
3 WARNING: bad7.pdf: Attempting to reconstruct cross-reference table 3 WARNING: bad7.pdf: Attempting to reconstruct cross-reference table
4 WARNING: bad7.pdf (object 2 0, offset 128): expected endobj 4 WARNING: bad7.pdf (object 2 0, offset 128): expected endobj
5 WARNING: bad7.pdf (object 4 0, offset 389): expected endobj 5 WARNING: bad7.pdf (object 4 0, offset 389): expected endobj
6 -bad7.pdf: unable to find trailer dictionary while recovering damaged file 6 +WARNING: bad7.pdf: unable to find trailer dictionary while recovering damaged file
  7 +/QTest is implicit
  8 +/QTest is direct and has type null (2)
  9 +/QTest is null
  10 +unparse: null
  11 +unparseResolved: null
  12 +test 1 done
qpdf/qtest/qpdf/issue-100.out
@@ -2,4 +2,12 @@ WARNING: issue-100.pdf: file is damaged @@ -2,4 +2,12 @@ WARNING: issue-100.pdf: file is damaged
2 WARNING: issue-100.pdf (offset 736): xref not found 2 WARNING: issue-100.pdf (offset 736): xref not found
3 WARNING: issue-100.pdf: Attempting to reconstruct cross-reference table 3 WARNING: issue-100.pdf: Attempting to reconstruct cross-reference table
4 WARNING: issue-100.pdf (trailer, offset 488): stream keyword found in trailer 4 WARNING: issue-100.pdf (trailer, offset 488): stream keyword found in trailer
  5 +WARNING: issue-100.pdf (object 5 0, offset 268): unknown token while reading object; treating as string
  6 +WARNING: issue-100.pdf (object 5 0, offset 286): unknown token while reading object; treating as string
  7 +WARNING: issue-100.pdf (object 5 0, offset 289): unknown token while reading object; treating as string
  8 +WARNING: issue-100.pdf (object 5 0, offset 294): unknown token while reading object; treating as string
  9 +WARNING: issue-100.pdf (object 5 0, offset 297): unknown token while reading object; treating as string
  10 +WARNING: issue-100.pdf (object 5 0, offset 304): unknown token while reading object; treating as string
  11 +WARNING: issue-100.pdf (object 5 0, offset 304): too many errors; giving up on reading object
  12 +WARNING: issue-100.pdf (object 5 0, offset 308): expected endobj
5 qpdf: issue-100.pdf: unable to find /Root dictionary 13 qpdf: issue-100.pdf: unable to find /Root dictionary
qpdf/qtest/qpdf/issue-101.out
@@ -2,4 +2,6 @@ WARNING: issue-101.pdf: file is damaged @@ -2,4 +2,6 @@ WARNING: issue-101.pdf: file is damaged
2 WARNING: issue-101.pdf (offset 3526): xref not found 2 WARNING: issue-101.pdf (offset 3526): xref not found
3 WARNING: issue-101.pdf: Attempting to reconstruct cross-reference table 3 WARNING: issue-101.pdf: Attempting to reconstruct cross-reference table
4 WARNING: issue-101.pdf (trailer, offset 1508): stream keyword found in trailer 4 WARNING: issue-101.pdf (trailer, offset 1508): stream keyword found in trailer
  5 +WARNING: issue-101.pdf (object 5 0, offset 1242): dictionary ended prematurely; using null as value for last key
  6 +WARNING: issue-101.pdf (object 5 0, offset 1242): expected dictionary key but found non-name object; inserting key /QPDFFake1
5 qpdf: issue-101.pdf: unable to find /Root dictionary 7 qpdf: issue-101.pdf: unable to find /Root dictionary
qpdf/qtest/qpdf/issue-148.out
@@ -12,4 +12,5 @@ WARNING: issue-148.pdf (xref stream: object 8 0, offset 73): attempting to recov @@ -12,4 +12,5 @@ WARNING: issue-148.pdf (xref stream: object 8 0, offset 73): attempting to recov
12 WARNING: issue-148.pdf (xref stream: object 8 0, offset 73): recovered stream length: 2 12 WARNING: issue-148.pdf (xref stream: object 8 0, offset 73): recovered stream length: 2
13 WARNING: issue-148.pdf (xref stream: object 8 0, offset 85): expected endobj 13 WARNING: issue-148.pdf (xref stream: object 8 0, offset 85): expected endobj
14 WARNING: issue-148.pdf (offset 73): error decoding stream data for object 8 0: stream inflate: inflate: data: incorrect header check 14 WARNING: issue-148.pdf (offset 73): error decoding stream data for object 8 0: stream inflate: inflate: data: incorrect header check
15 -qpdf: issue-148.pdf: error decoding candidate xref stream while recovering damaged file 15 +WARNING: issue-148.pdf: error decoding candidate xref stream while recovering damaged file
  16 +qpdf: issue-148.pdf: unable to find /Root dictionary
qpdf/qtest/qpdf/issue-202.out
@@ -5,4 +5,5 @@ WARNING: issue-202.pdf: Attempting to reconstruct cross-reference table @@ -5,4 +5,5 @@ WARNING: issue-202.pdf: Attempting to reconstruct cross-reference table
5 WARNING: issue-202.pdf (trailer, offset 55770): ignoring excessively deeply nested data structure 5 WARNING: issue-202.pdf (trailer, offset 55770): ignoring excessively deeply nested data structure
6 WARNING: issue-202.pdf (object 222 0, offset 50101): dictionary has duplicated key /Creator; last occurrence overrides earlier ones 6 WARNING: issue-202.pdf (object 222 0, offset 50101): dictionary has duplicated key /Creator; last occurrence overrides earlier ones
7 WARNING: issue-202.pdf (object 222 0, offset 50101): dictionary has duplicated key /Producer; last occurrence overrides earlier ones 7 WARNING: issue-202.pdf (object 222 0, offset 50101): dictionary has duplicated key /Producer; last occurrence overrides earlier ones
8 -qpdf: issue-202.pdf: unable to find trailer dictionary while recovering damaged file 8 +WARNING: issue-202.pdf: unable to find trailer dictionary while recovering damaged file
  9 +qpdf: operation succeeded with warnings; resulting file may have some problems
qpdf/qtest/specific-bugs.test
@@ -34,7 +34,7 @@ my @bug_tests = ( @@ -34,7 +34,7 @@ my @bug_tests = (
34 ["148", "free memory on bad flate", 2], 34 ["148", "free memory on bad flate", 2],
35 ["149", "xref prev pointer loop", 3], 35 ["149", "xref prev pointer loop", 3],
36 ["150", "integer overflow", 2], 36 ["150", "integer overflow", 2],
37 - ["202", "even more deeply nested dictionary", 2], 37 + ["202", "even more deeply nested dictionary", 3],
38 ["263", "empty xref stream", 2], 38 ["263", "empty xref stream", 2],
39 ["335a", "ozz-fuzz-12152", 2], 39 ["335a", "ozz-fuzz-12152", 2],
40 ["335b", "ozz-fuzz-14845", 2], 40 ["335b", "ozz-fuzz-14845", 2],