Commit 3fbff845949c56180dfdafc0812da95a168f224b

Authored by m-holger
1 parent 1e072e22

Move QPDF::reconstruct_xref to QPDF::Xref_table

Also, when recovering trailer from xref streams, pick the last valid
trailer encountered rather than the first.
include/qpdf/QPDF.hh
... ... @@ -762,7 +762,6 @@ class QPDF
762 762 void setTrailer(QPDFObjectHandle obj);
763 763 void read_xref(qpdf_offset_t offset);
764 764 bool resolveXRefTable();
765   - void reconstruct_xref(QPDFExc& e);
766 765 bool parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes);
767 766 bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
768 767 bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
... ...
libqpdf/QPDF.cc
... ... @@ -476,7 +476,7 @@ QPDF::parse(char const* password)
476 476 }
477 477 } catch (QPDFExc& e) {
478 478 if (m->attempt_recovery) {
479   - reconstruct_xref(e);
  479 + m->xref_table.reconstruct(e);
480 480 QTC::TC("qpdf", "QPDF reconstructed xref table");
481 481 } else {
482 482 throw;
... ... @@ -535,40 +535,42 @@ QPDF::setTrailer(QPDFObjectHandle obj)
535 535 }
536 536  
537 537 void
538   -QPDF::reconstruct_xref(QPDFExc& e)
  538 +QPDF::Xref_table::reconstruct(QPDFExc& e)
539 539 {
540   - if (m->xref_table.reconstructed) {
  540 + if (reconstructed) {
541 541 // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because
542 542 // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now.
543 543 throw e;
544 544 }
545 545  
  546 + auto* m = qpdf.m.get();
  547 +
546 548 // If recovery generates more than 1000 warnings, the file is so severely damaged that there
547 549 // probably is no point trying to continue.
548 550 const auto max_warnings = m->warnings.size() + 1000U;
549 551 auto check_warnings = [this, max_warnings]() {
550   - if (m->warnings.size() > max_warnings) {
551   - throw damagedPDF("", 0, "too many errors while reconstructing cross-reference table");
  552 + if (qpdf.m->warnings.size() > max_warnings) {
  553 + throw damaged_pdf("too many errors while reconstructing cross-reference table");
552 554 }
553 555 };
554 556  
555   - m->xref_table.reconstructed = true;
  557 + reconstructed = true;
556 558 // We may find more objects, which may contain dangling references.
557 559 m->fixed_dangling_refs = false;
558 560  
559   - warn(damagedPDF("", 0, "file is damaged"));
560   - warn(e);
561   - warn(damagedPDF("", 0, "Attempting to reconstruct cross-reference table"));
  561 + warn_damaged("file is damaged");
  562 + qpdf.warn(e);
  563 + warn_damaged("Attempting to reconstruct cross-reference table");
562 564  
563 565 // Delete all references to type 1 (uncompressed) objects
564 566 std::set<QPDFObjGen> to_delete;
565   - for (auto const& iter: m->xref_table) {
  567 + for (auto const& iter: *this) {
566 568 if (iter.second.getType() == 1) {
567 569 to_delete.insert(iter.first);
568 570 }
569 571 }
570 572 for (auto const& iter: to_delete) {
571   - m->xref_table.erase(iter);
  573 + erase(iter);
572 574 }
573 575  
574 576 m->file->seek(0, SEEK_END);
... ... @@ -577,46 +579,45 @@ QPDF::reconstruct_xref(QPDFExc& e)
577 579 // Don't allow very long tokens here during recovery. All the interesting tokens are covered.
578 580 static size_t const MAX_LEN = 10;
579 581 while (m->file->tell() < eof) {
580   - QPDFTokenizer::Token t1 = readToken(*m->file, MAX_LEN);
  582 + QPDFTokenizer::Token t1 = qpdf.readToken(*m->file, MAX_LEN);
581 583 qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length());
582 584 if (t1.isInteger()) {
583 585 auto pos = m->file->tell();
584   - QPDFTokenizer::Token t2 = readToken(*m->file, MAX_LEN);
585   - if ((t2.isInteger()) && (readToken(*m->file, MAX_LEN).isWord("obj"))) {
  586 + QPDFTokenizer::Token t2 = qpdf.readToken(*m->file, MAX_LEN);
  587 + if (t2.isInteger() && qpdf.readToken(*m->file, MAX_LEN).isWord("obj")) {
586 588 int obj = QUtil::string_to_int(t1.getValue().c_str());
587 589 int gen = QUtil::string_to_int(t2.getValue().c_str());
588   - if (obj <= m->xref_table.max_id) {
589   - m->xref_table.insert_reconstructed(obj, token_start, gen);
  590 + if (obj <= max_id) {
  591 + insert_reconstructed(obj, token_start, gen);
590 592 } else {
591   - warn(damagedPDF(
592   - "", 0, "ignoring object with impossibly large id " + std::to_string(obj)));
  593 + warn_damaged("ignoring object with impossibly large id " + std::to_string(obj));
593 594 }
594 595 }
595 596 m->file->seek(pos, SEEK_SET);
596   - } else if (!m->xref_table.trailer && t1.isWord("trailer")) {
  597 + } else if (!trailer && t1.isWord("trailer")) {
597 598 auto pos = m->file->tell();
598   - QPDFObjectHandle t = readTrailer();
  599 + QPDFObjectHandle t = qpdf.readTrailer();
599 600 if (!t.isDictionary()) {
600 601 // Oh well. It was worth a try.
601 602 } else {
602   - setTrailer(t);
  603 + qpdf.setTrailer(t);
603 604 }
604 605 m->file->seek(pos, SEEK_SET);
605 606 }
606 607 check_warnings();
607 608 m->file->findAndSkipNextEOL();
608 609 }
609   - m->xref_table.deleted_objects.clear();
  610 + deleted_objects.clear();
610 611  
611   - if (!m->xref_table.trailer) {
  612 + if (!trailer) {
612 613 qpdf_offset_t max_offset{0};
613 614 // If there are any xref streams, take the last one to appear.
614   - for (auto const& iter: m->xref_table) {
  615 + for (auto const& iter: *this) {
615 616 auto entry = iter.second;
616 617 if (entry.getType() != 1) {
617 618 continue;
618 619 }
619   - auto oh = getObjectByObjGen(iter.first);
  620 + auto oh = qpdf.getObjectByObjGen(iter.first);
620 621 try {
621 622 if (!oh.isStreamOfType("/XRef")) {
622 623 continue;
... ... @@ -627,41 +628,41 @@ QPDF::reconstruct_xref(QPDFExc& e)
627 628 auto offset = entry.getOffset();
628 629 if (offset > max_offset) {
629 630 max_offset = offset;
630   - setTrailer(oh.getDict());
  631 + trailer = oh.getDict();
631 632 }
632 633 check_warnings();
633 634 }
634 635 if (max_offset > 0) {
635 636 try {
636   - read_xref(max_offset);
  637 + qpdf.read_xref(max_offset);
637 638 } catch (std::exception&) {
638   - throw damagedPDF(
639   - "", 0, "error decoding candidate xref stream while recovering damaged file");
  639 + throw damaged_pdf(
  640 + "error decoding candidate xref stream while recovering damaged file");
640 641 }
641 642 QTC::TC("qpdf", "QPDF recover xref stream");
642 643 }
643 644 }
644 645  
645   - if (!m->xref_table.trailer) {
  646 + if (!trailer) {
646 647 // We could check the last encountered object to see if it was an xref stream. If so, we
647 648 // could try to get the trailer from there. This may make it possible to recover files with
648 649 // bad startxref pointers even when they have object streams.
649 650  
650   - throw damagedPDF("", 0, "unable to find trailer dictionary while recovering damaged file");
  651 + throw damaged_pdf("unable to find trailer dictionary while recovering damaged file");
651 652 }
652   - if (m->xref_table.empty()) {
  653 + if (empty()) {
653 654 // We cannot check for an empty xref table in parse because empty tables are valid when
654 655 // creating QPDF objects from JSON.
655   - throw damagedPDF("", 0, "unable to find objects while recovering damaged file");
  656 + throw damaged_pdf("unable to find objects while recovering damaged file");
656 657 }
657 658 check_warnings();
658   - if (!m->xref_table.parsed) {
659   - m->xref_table.parsed = true;
660   - getAllPages();
  659 + if (!parsed) {
  660 + parsed = true;
  661 + qpdf.getAllPages();
661 662 check_warnings();
662 663 if (m->all_pages.empty()) {
663   - m->xref_table.parsed = false;
664   - throw damagedPDF("", 0, "unable to find any pages while recovering damaged file");
  664 + parsed = false;
  665 + throw damaged_pdf("unable to find any pages while recovering damaged file");
665 666 }
666 667 }
667 668 // We could iterate through the objects looking for streams and try to find objects inside of
... ... @@ -1766,7 +1767,7 @@ QPDF::readObjectAtOffset(
1766 1767 } catch (QPDFExc& e) {
1767 1768 if (try_recovery) {
1768 1769 // Try again after reconstructing xref table
1769   - reconstruct_xref(e);
  1770 + m->xref_table.reconstruct(e);
1770 1771 if (m->xref_table.count(exp_og) && (m->xref_table[exp_og].getType() == 1)) {
1771 1772 qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset();
1772 1773 QPDFObjectHandle result =
... ...
libqpdf/qpdf/QPDF_private.hh
... ... @@ -16,6 +16,8 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
16 16 void insert(int obj, int f0, qpdf_offset_t f1, int f2);
17 17 void insert_free(QPDFObjGen);
18 18  
  19 + void reconstruct(QPDFExc& e);
  20 +
19 21 QPDFObjectHandle trailer;
20 22 bool reconstructed{false};
21 23 // Various tables are indexed by object id, with potential size id + 1
... ... @@ -31,6 +33,17 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
31 33 qpdf_offset_t first_item_offset{0}; // actual value from file
32 34  
33 35 private:
  36 + QPDFExc
  37 + damaged_pdf(std::string const& msg)
  38 + {
  39 + return qpdf.damagedPDF("", 0, msg);
  40 + }
  41 +
  42 + void
  43 + warn_damaged(std::string const& msg)
  44 + {
  45 + qpdf.warn(damaged_pdf(msg));
  46 + }
34 47 QPDF& qpdf;
35 48 };
36 49  
... ...
qpdf/qtest/object-stream.test
... ... @@ -102,11 +102,10 @@ $td->runtest("recover file with xref stream",
102 102 {$td->COMMAND => "qpdf --static-id --compress-streams=n" .
103 103 " recover-xref-stream.pdf a.pdf"},
104 104 {$td->FILE => "recover-xref-stream.out", $td->EXIT_STATUS => 3},
105   - $td->EXPECT_FAILURE);
  105 + $td->NORMALIZE_NEWLINES);
106 106 $td->runtest("check file",
107 107 {$td->FILE => "a.pdf"},
108   - {$td->FILE => "recover-xref-stream-recovered.pdf"},
109   - $td->EXPECT_FAILURE);
  108 + {$td->FILE => "recover-xref-stream-recovered.pdf"});
110 109  
111 110 # Self-referential object stream
112 111 $td->runtest("self-referential object stream",
... ...
qpdf/qtest/qpdf/recover-xref-stream.out
1 1 WARNING: recover-xref-stream.pdf: file is damaged
2 2 WARNING: recover-xref-stream.pdf: can't find startxref
3 3 WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table
4   -WARNING: recover-xref-stream.pdf: reported number of objects (14) is not one plus the highest object number (15)
5 4 qpdf: operation succeeded with warnings; resulting file may have some problems
... ...