Commit 3fbff845949c56180dfdafc0812da95a168f224b

Authored by m-holger
1 parent 1e072e22

Move QPDF::reconstruct_xref to QPDF::Xref_table

Also, when recovering trailer from xref streams, pick the last valid
trailer encountered rather than the first.
include/qpdf/QPDF.hh
@@ -762,7 +762,6 @@ class QPDF @@ -762,7 +762,6 @@ class QPDF
762 void setTrailer(QPDFObjectHandle obj); 762 void setTrailer(QPDFObjectHandle obj);
763 void read_xref(qpdf_offset_t offset); 763 void read_xref(qpdf_offset_t offset);
764 bool resolveXRefTable(); 764 bool resolveXRefTable();
765 - void reconstruct_xref(QPDFExc& e);  
766 bool parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes); 765 bool parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes);
767 bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type); 766 bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
768 bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type); 767 bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
libqpdf/QPDF.cc
@@ -476,7 +476,7 @@ QPDF::parse(char const* password) @@ -476,7 +476,7 @@ QPDF::parse(char const* password)
476 } 476 }
477 } catch (QPDFExc& e) { 477 } catch (QPDFExc& e) {
478 if (m->attempt_recovery) { 478 if (m->attempt_recovery) {
479 - reconstruct_xref(e); 479 + m->xref_table.reconstruct(e);
480 QTC::TC("qpdf", "QPDF reconstructed xref table"); 480 QTC::TC("qpdf", "QPDF reconstructed xref table");
481 } else { 481 } else {
482 throw; 482 throw;
@@ -535,40 +535,42 @@ QPDF::setTrailer(QPDFObjectHandle obj) @@ -535,40 +535,42 @@ QPDF::setTrailer(QPDFObjectHandle obj)
535 } 535 }
536 536
537 void 537 void
538 -QPDF::reconstruct_xref(QPDFExc& e) 538 +QPDF::Xref_table::reconstruct(QPDFExc& e)
539 { 539 {
540 - if (m->xref_table.reconstructed) { 540 + if (reconstructed) {
541 // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because 541 // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because
542 // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now. 542 // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now.
543 throw e; 543 throw e;
544 } 544 }
545 545
  546 + auto* m = qpdf.m.get();
  547 +
546 // If recovery generates more than 1000 warnings, the file is so severely damaged that there 548 // If recovery generates more than 1000 warnings, the file is so severely damaged that there
547 // probably is no point trying to continue. 549 // probably is no point trying to continue.
548 const auto max_warnings = m->warnings.size() + 1000U; 550 const auto max_warnings = m->warnings.size() + 1000U;
549 auto check_warnings = [this, max_warnings]() { 551 auto check_warnings = [this, max_warnings]() {
550 - if (m->warnings.size() > max_warnings) {  
551 - throw damagedPDF("", 0, "too many errors while reconstructing cross-reference table"); 552 + if (qpdf.m->warnings.size() > max_warnings) {
  553 + throw damaged_pdf("too many errors while reconstructing cross-reference table");
552 } 554 }
553 }; 555 };
554 556
555 - m->xref_table.reconstructed = true; 557 + reconstructed = true;
556 // We may find more objects, which may contain dangling references. 558 // We may find more objects, which may contain dangling references.
557 m->fixed_dangling_refs = false; 559 m->fixed_dangling_refs = false;
558 560
559 - warn(damagedPDF("", 0, "file is damaged"));  
560 - warn(e);  
561 - warn(damagedPDF("", 0, "Attempting to reconstruct cross-reference table")); 561 + warn_damaged("file is damaged");
  562 + qpdf.warn(e);
  563 + warn_damaged("Attempting to reconstruct cross-reference table");
562 564
563 // Delete all references to type 1 (uncompressed) objects 565 // Delete all references to type 1 (uncompressed) objects
564 std::set<QPDFObjGen> to_delete; 566 std::set<QPDFObjGen> to_delete;
565 - for (auto const& iter: m->xref_table) { 567 + for (auto const& iter: *this) {
566 if (iter.second.getType() == 1) { 568 if (iter.second.getType() == 1) {
567 to_delete.insert(iter.first); 569 to_delete.insert(iter.first);
568 } 570 }
569 } 571 }
570 for (auto const& iter: to_delete) { 572 for (auto const& iter: to_delete) {
571 - m->xref_table.erase(iter); 573 + erase(iter);
572 } 574 }
573 575
574 m->file->seek(0, SEEK_END); 576 m->file->seek(0, SEEK_END);
@@ -577,46 +579,45 @@ QPDF::reconstruct_xref(QPDFExc& e) @@ -577,46 +579,45 @@ QPDF::reconstruct_xref(QPDFExc& e)
577 // Don't allow very long tokens here during recovery. All the interesting tokens are covered. 579 // Don't allow very long tokens here during recovery. All the interesting tokens are covered.
578 static size_t const MAX_LEN = 10; 580 static size_t const MAX_LEN = 10;
579 while (m->file->tell() < eof) { 581 while (m->file->tell() < eof) {
580 - QPDFTokenizer::Token t1 = readToken(*m->file, MAX_LEN); 582 + QPDFTokenizer::Token t1 = qpdf.readToken(*m->file, MAX_LEN);
581 qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length()); 583 qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length());
582 if (t1.isInteger()) { 584 if (t1.isInteger()) {
583 auto pos = m->file->tell(); 585 auto pos = m->file->tell();
584 - QPDFTokenizer::Token t2 = readToken(*m->file, MAX_LEN);  
585 - if ((t2.isInteger()) && (readToken(*m->file, MAX_LEN).isWord("obj"))) { 586 + QPDFTokenizer::Token t2 = qpdf.readToken(*m->file, MAX_LEN);
  587 + if (t2.isInteger() && qpdf.readToken(*m->file, MAX_LEN).isWord("obj")) {
586 int obj = QUtil::string_to_int(t1.getValue().c_str()); 588 int obj = QUtil::string_to_int(t1.getValue().c_str());
587 int gen = QUtil::string_to_int(t2.getValue().c_str()); 589 int gen = QUtil::string_to_int(t2.getValue().c_str());
588 - if (obj <= m->xref_table.max_id) {  
589 - m->xref_table.insert_reconstructed(obj, token_start, gen); 590 + if (obj <= max_id) {
  591 + insert_reconstructed(obj, token_start, gen);
590 } else { 592 } else {
591 - warn(damagedPDF(  
592 - "", 0, "ignoring object with impossibly large id " + std::to_string(obj))); 593 + warn_damaged("ignoring object with impossibly large id " + std::to_string(obj));
593 } 594 }
594 } 595 }
595 m->file->seek(pos, SEEK_SET); 596 m->file->seek(pos, SEEK_SET);
596 - } else if (!m->xref_table.trailer && t1.isWord("trailer")) { 597 + } else if (!trailer && t1.isWord("trailer")) {
597 auto pos = m->file->tell(); 598 auto pos = m->file->tell();
598 - QPDFObjectHandle t = readTrailer(); 599 + QPDFObjectHandle t = qpdf.readTrailer();
599 if (!t.isDictionary()) { 600 if (!t.isDictionary()) {
600 // Oh well. It was worth a try. 601 // Oh well. It was worth a try.
601 } else { 602 } else {
602 - setTrailer(t); 603 + qpdf.setTrailer(t);
603 } 604 }
604 m->file->seek(pos, SEEK_SET); 605 m->file->seek(pos, SEEK_SET);
605 } 606 }
606 check_warnings(); 607 check_warnings();
607 m->file->findAndSkipNextEOL(); 608 m->file->findAndSkipNextEOL();
608 } 609 }
609 - m->xref_table.deleted_objects.clear(); 610 + deleted_objects.clear();
610 611
611 - if (!m->xref_table.trailer) { 612 + if (!trailer) {
612 qpdf_offset_t max_offset{0}; 613 qpdf_offset_t max_offset{0};
613 // If there are any xref streams, take the last one to appear. 614 // If there are any xref streams, take the last one to appear.
614 - for (auto const& iter: m->xref_table) { 615 + for (auto const& iter: *this) {
615 auto entry = iter.second; 616 auto entry = iter.second;
616 if (entry.getType() != 1) { 617 if (entry.getType() != 1) {
617 continue; 618 continue;
618 } 619 }
619 - auto oh = getObjectByObjGen(iter.first); 620 + auto oh = qpdf.getObjectByObjGen(iter.first);
620 try { 621 try {
621 if (!oh.isStreamOfType("/XRef")) { 622 if (!oh.isStreamOfType("/XRef")) {
622 continue; 623 continue;
@@ -627,41 +628,41 @@ QPDF::reconstruct_xref(QPDFExc& e) @@ -627,41 +628,41 @@ QPDF::reconstruct_xref(QPDFExc& e)
627 auto offset = entry.getOffset(); 628 auto offset = entry.getOffset();
628 if (offset > max_offset) { 629 if (offset > max_offset) {
629 max_offset = offset; 630 max_offset = offset;
630 - setTrailer(oh.getDict()); 631 + trailer = oh.getDict();
631 } 632 }
632 check_warnings(); 633 check_warnings();
633 } 634 }
634 if (max_offset > 0) { 635 if (max_offset > 0) {
635 try { 636 try {
636 - read_xref(max_offset); 637 + qpdf.read_xref(max_offset);
637 } catch (std::exception&) { 638 } catch (std::exception&) {
638 - throw damagedPDF(  
639 - "", 0, "error decoding candidate xref stream while recovering damaged file"); 639 + throw damaged_pdf(
  640 + "error decoding candidate xref stream while recovering damaged file");
640 } 641 }
641 QTC::TC("qpdf", "QPDF recover xref stream"); 642 QTC::TC("qpdf", "QPDF recover xref stream");
642 } 643 }
643 } 644 }
644 645
645 - if (!m->xref_table.trailer) { 646 + if (!trailer) {
646 // We could check the last encountered object to see if it was an xref stream. If so, we 647 // We could check the last encountered object to see if it was an xref stream. If so, we
647 // could try to get the trailer from there. This may make it possible to recover files with 648 // could try to get the trailer from there. This may make it possible to recover files with
648 // bad startxref pointers even when they have object streams. 649 // bad startxref pointers even when they have object streams.
649 650
650 - throw damagedPDF("", 0, "unable to find trailer dictionary while recovering damaged file"); 651 + throw damaged_pdf("unable to find trailer dictionary while recovering damaged file");
651 } 652 }
652 - if (m->xref_table.empty()) { 653 + if (empty()) {
653 // We cannot check for an empty xref table in parse because empty tables are valid when 654 // We cannot check for an empty xref table in parse because empty tables are valid when
654 // creating QPDF objects from JSON. 655 // creating QPDF objects from JSON.
655 - throw damagedPDF("", 0, "unable to find objects while recovering damaged file"); 656 + throw damaged_pdf("unable to find objects while recovering damaged file");
656 } 657 }
657 check_warnings(); 658 check_warnings();
658 - if (!m->xref_table.parsed) {  
659 - m->xref_table.parsed = true;  
660 - getAllPages(); 659 + if (!parsed) {
  660 + parsed = true;
  661 + qpdf.getAllPages();
661 check_warnings(); 662 check_warnings();
662 if (m->all_pages.empty()) { 663 if (m->all_pages.empty()) {
663 - m->xref_table.parsed = false;  
664 - throw damagedPDF("", 0, "unable to find any pages while recovering damaged file"); 664 + parsed = false;
  665 + throw damaged_pdf("unable to find any pages while recovering damaged file");
665 } 666 }
666 } 667 }
667 // We could iterate through the objects looking for streams and try to find objects inside of 668 // We could iterate through the objects looking for streams and try to find objects inside of
@@ -1766,7 +1767,7 @@ QPDF::readObjectAtOffset( @@ -1766,7 +1767,7 @@ QPDF::readObjectAtOffset(
1766 } catch (QPDFExc& e) { 1767 } catch (QPDFExc& e) {
1767 if (try_recovery) { 1768 if (try_recovery) {
1768 // Try again after reconstructing xref table 1769 // Try again after reconstructing xref table
1769 - reconstruct_xref(e); 1770 + m->xref_table.reconstruct(e);
1770 if (m->xref_table.count(exp_og) && (m->xref_table[exp_og].getType() == 1)) { 1771 if (m->xref_table.count(exp_og) && (m->xref_table[exp_og].getType() == 1)) {
1771 qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset(); 1772 qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset();
1772 QPDFObjectHandle result = 1773 QPDFObjectHandle result =
libqpdf/qpdf/QPDF_private.hh
@@ -16,6 +16,8 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry> @@ -16,6 +16,8 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
16 void insert(int obj, int f0, qpdf_offset_t f1, int f2); 16 void insert(int obj, int f0, qpdf_offset_t f1, int f2);
17 void insert_free(QPDFObjGen); 17 void insert_free(QPDFObjGen);
18 18
  19 + void reconstruct(QPDFExc& e);
  20 +
19 QPDFObjectHandle trailer; 21 QPDFObjectHandle trailer;
20 bool reconstructed{false}; 22 bool reconstructed{false};
21 // Various tables are indexed by object id, with potential size id + 1 23 // Various tables are indexed by object id, with potential size id + 1
@@ -31,6 +33,17 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry> @@ -31,6 +33,17 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
31 qpdf_offset_t first_item_offset{0}; // actual value from file 33 qpdf_offset_t first_item_offset{0}; // actual value from file
32 34
33 private: 35 private:
  36 + QPDFExc
  37 + damaged_pdf(std::string const& msg)
  38 + {
  39 + return qpdf.damagedPDF("", 0, msg);
  40 + }
  41 +
  42 + void
  43 + warn_damaged(std::string const& msg)
  44 + {
  45 + qpdf.warn(damaged_pdf(msg));
  46 + }
34 QPDF& qpdf; 47 QPDF& qpdf;
35 }; 48 };
36 49
qpdf/qtest/object-stream.test
@@ -102,11 +102,10 @@ $td->runtest("recover file with xref stream", @@ -102,11 +102,10 @@ $td->runtest("recover file with xref stream",
102 {$td->COMMAND => "qpdf --static-id --compress-streams=n" . 102 {$td->COMMAND => "qpdf --static-id --compress-streams=n" .
103 " recover-xref-stream.pdf a.pdf"}, 103 " recover-xref-stream.pdf a.pdf"},
104 {$td->FILE => "recover-xref-stream.out", $td->EXIT_STATUS => 3}, 104 {$td->FILE => "recover-xref-stream.out", $td->EXIT_STATUS => 3},
105 - $td->EXPECT_FAILURE); 105 + $td->NORMALIZE_NEWLINES);
106 $td->runtest("check file", 106 $td->runtest("check file",
107 {$td->FILE => "a.pdf"}, 107 {$td->FILE => "a.pdf"},
108 - {$td->FILE => "recover-xref-stream-recovered.pdf"},  
109 - $td->EXPECT_FAILURE); 108 + {$td->FILE => "recover-xref-stream-recovered.pdf"});
110 109
111 # Self-referential object stream 110 # Self-referential object stream
112 $td->runtest("self-referential object stream", 111 $td->runtest("self-referential object stream",
qpdf/qtest/qpdf/recover-xref-stream.out
1 WARNING: recover-xref-stream.pdf: file is damaged 1 WARNING: recover-xref-stream.pdf: file is damaged
2 WARNING: recover-xref-stream.pdf: can't find startxref 2 WARNING: recover-xref-stream.pdf: can't find startxref
3 WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table 3 WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table
4 -WARNING: recover-xref-stream.pdf: reported number of objects (14) is not one plus the highest object number (15)  
5 qpdf: operation succeeded with warnings; resulting file may have some problems 4 qpdf: operation succeeded with warnings; resulting file may have some problems