Commit 3fbff845949c56180dfdafc0812da95a168f224b (1 parent: 1e072e22)
Move QPDF::reconstruct_xref to QPDF::Xref_table
Also, when recovering trailer from xref streams, pick the last valid trailer encountered rather than the first.
Showing 5 changed files with 55 additions and 44 deletions
include/qpdf/QPDF.hh
| @@ -762,7 +762,6 @@ class QPDF | @@ -762,7 +762,6 @@ class QPDF | ||
| 762 | void setTrailer(QPDFObjectHandle obj); | 762 | void setTrailer(QPDFObjectHandle obj); |
| 763 | void read_xref(qpdf_offset_t offset); | 763 | void read_xref(qpdf_offset_t offset); |
| 764 | bool resolveXRefTable(); | 764 | bool resolveXRefTable(); |
| 765 | - void reconstruct_xref(QPDFExc& e); | ||
| 766 | bool parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes); | 765 | bool parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes); |
| 767 | bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type); | 766 | bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type); |
| 768 | bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type); | 767 | bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type); |
libqpdf/QPDF.cc
| @@ -476,7 +476,7 @@ QPDF::parse(char const* password) | @@ -476,7 +476,7 @@ QPDF::parse(char const* password) | ||
| 476 | } | 476 | } |
| 477 | } catch (QPDFExc& e) { | 477 | } catch (QPDFExc& e) { |
| 478 | if (m->attempt_recovery) { | 478 | if (m->attempt_recovery) { |
| 479 | - reconstruct_xref(e); | 479 | + m->xref_table.reconstruct(e); |
| 480 | QTC::TC("qpdf", "QPDF reconstructed xref table"); | 480 | QTC::TC("qpdf", "QPDF reconstructed xref table"); |
| 481 | } else { | 481 | } else { |
| 482 | throw; | 482 | throw; |
| @@ -535,40 +535,42 @@ QPDF::setTrailer(QPDFObjectHandle obj) | @@ -535,40 +535,42 @@ QPDF::setTrailer(QPDFObjectHandle obj) | ||
| 535 | } | 535 | } |
| 536 | 536 | ||
| 537 | void | 537 | void |
| 538 | -QPDF::reconstruct_xref(QPDFExc& e) | 538 | +QPDF::Xref_table::reconstruct(QPDFExc& e) |
| 539 | { | 539 | { |
| 540 | - if (m->xref_table.reconstructed) { | 540 | + if (reconstructed) { |
| 541 | // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because | 541 | // Avoid xref reconstruction infinite loops. This is getting very hard to reproduce because |
| 542 | // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now. | 542 | // qpdf is throwing many fewer exceptions while parsing. Most situations are warnings now. |
| 543 | throw e; | 543 | throw e; |
| 544 | } | 544 | } |
| 545 | 545 | ||
| 546 | + auto* m = qpdf.m.get(); | ||
| 547 | + | ||
| 546 | // If recovery generates more than 1000 warnings, the file is so severely damaged that there | 548 | // If recovery generates more than 1000 warnings, the file is so severely damaged that there |
| 547 | // probably is no point trying to continue. | 549 | // probably is no point trying to continue. |
| 548 | const auto max_warnings = m->warnings.size() + 1000U; | 550 | const auto max_warnings = m->warnings.size() + 1000U; |
| 549 | auto check_warnings = [this, max_warnings]() { | 551 | auto check_warnings = [this, max_warnings]() { |
| 550 | - if (m->warnings.size() > max_warnings) { | ||
| 551 | - throw damagedPDF("", 0, "too many errors while reconstructing cross-reference table"); | 552 | + if (qpdf.m->warnings.size() > max_warnings) { |
| 553 | + throw damaged_pdf("too many errors while reconstructing cross-reference table"); | ||
| 552 | } | 554 | } |
| 553 | }; | 555 | }; |
| 554 | 556 | ||
| 555 | - m->xref_table.reconstructed = true; | 557 | + reconstructed = true; |
| 556 | // We may find more objects, which may contain dangling references. | 558 | // We may find more objects, which may contain dangling references. |
| 557 | m->fixed_dangling_refs = false; | 559 | m->fixed_dangling_refs = false; |
| 558 | 560 | ||
| 559 | - warn(damagedPDF("", 0, "file is damaged")); | ||
| 560 | - warn(e); | ||
| 561 | - warn(damagedPDF("", 0, "Attempting to reconstruct cross-reference table")); | 561 | + warn_damaged("file is damaged"); |
| 562 | + qpdf.warn(e); | ||
| 563 | + warn_damaged("Attempting to reconstruct cross-reference table"); | ||
| 562 | 564 | ||
| 563 | // Delete all references to type 1 (uncompressed) objects | 565 | // Delete all references to type 1 (uncompressed) objects |
| 564 | std::set<QPDFObjGen> to_delete; | 566 | std::set<QPDFObjGen> to_delete; |
| 565 | - for (auto const& iter: m->xref_table) { | 567 | + for (auto const& iter: *this) { |
| 566 | if (iter.second.getType() == 1) { | 568 | if (iter.second.getType() == 1) { |
| 567 | to_delete.insert(iter.first); | 569 | to_delete.insert(iter.first); |
| 568 | } | 570 | } |
| 569 | } | 571 | } |
| 570 | for (auto const& iter: to_delete) { | 572 | for (auto const& iter: to_delete) { |
| 571 | - m->xref_table.erase(iter); | 573 | + erase(iter); |
| 572 | } | 574 | } |
| 573 | 575 | ||
| 574 | m->file->seek(0, SEEK_END); | 576 | m->file->seek(0, SEEK_END); |
| @@ -577,46 +579,45 @@ QPDF::reconstruct_xref(QPDFExc& e) | @@ -577,46 +579,45 @@ QPDF::reconstruct_xref(QPDFExc& e) | ||
| 577 | // Don't allow very long tokens here during recovery. All the interesting tokens are covered. | 579 | // Don't allow very long tokens here during recovery. All the interesting tokens are covered. |
| 578 | static size_t const MAX_LEN = 10; | 580 | static size_t const MAX_LEN = 10; |
| 579 | while (m->file->tell() < eof) { | 581 | while (m->file->tell() < eof) { |
| 580 | - QPDFTokenizer::Token t1 = readToken(*m->file, MAX_LEN); | 582 | + QPDFTokenizer::Token t1 = qpdf.readToken(*m->file, MAX_LEN); |
| 581 | qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length()); | 583 | qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length()); |
| 582 | if (t1.isInteger()) { | 584 | if (t1.isInteger()) { |
| 583 | auto pos = m->file->tell(); | 585 | auto pos = m->file->tell(); |
| 584 | - QPDFTokenizer::Token t2 = readToken(*m->file, MAX_LEN); | ||
| 585 | - if ((t2.isInteger()) && (readToken(*m->file, MAX_LEN).isWord("obj"))) { | 586 | + QPDFTokenizer::Token t2 = qpdf.readToken(*m->file, MAX_LEN); |
| 587 | + if (t2.isInteger() && qpdf.readToken(*m->file, MAX_LEN).isWord("obj")) { | ||
| 586 | int obj = QUtil::string_to_int(t1.getValue().c_str()); | 588 | int obj = QUtil::string_to_int(t1.getValue().c_str()); |
| 587 | int gen = QUtil::string_to_int(t2.getValue().c_str()); | 589 | int gen = QUtil::string_to_int(t2.getValue().c_str()); |
| 588 | - if (obj <= m->xref_table.max_id) { | ||
| 589 | - m->xref_table.insert_reconstructed(obj, token_start, gen); | 590 | + if (obj <= max_id) { |
| 591 | + insert_reconstructed(obj, token_start, gen); | ||
| 590 | } else { | 592 | } else { |
| 591 | - warn(damagedPDF( | ||
| 592 | - "", 0, "ignoring object with impossibly large id " + std::to_string(obj))); | 593 | + warn_damaged("ignoring object with impossibly large id " + std::to_string(obj)); |
| 593 | } | 594 | } |
| 594 | } | 595 | } |
| 595 | m->file->seek(pos, SEEK_SET); | 596 | m->file->seek(pos, SEEK_SET); |
| 596 | - } else if (!m->xref_table.trailer && t1.isWord("trailer")) { | 597 | + } else if (!trailer && t1.isWord("trailer")) { |
| 597 | auto pos = m->file->tell(); | 598 | auto pos = m->file->tell(); |
| 598 | - QPDFObjectHandle t = readTrailer(); | 599 | + QPDFObjectHandle t = qpdf.readTrailer(); |
| 599 | if (!t.isDictionary()) { | 600 | if (!t.isDictionary()) { |
| 600 | // Oh well. It was worth a try. | 601 | // Oh well. It was worth a try. |
| 601 | } else { | 602 | } else { |
| 602 | - setTrailer(t); | 603 | + qpdf.setTrailer(t); |
| 603 | } | 604 | } |
| 604 | m->file->seek(pos, SEEK_SET); | 605 | m->file->seek(pos, SEEK_SET); |
| 605 | } | 606 | } |
| 606 | check_warnings(); | 607 | check_warnings(); |
| 607 | m->file->findAndSkipNextEOL(); | 608 | m->file->findAndSkipNextEOL(); |
| 608 | } | 609 | } |
| 609 | - m->xref_table.deleted_objects.clear(); | 610 | + deleted_objects.clear(); |
| 610 | 611 | ||
| 611 | - if (!m->xref_table.trailer) { | 612 | + if (!trailer) { |
| 612 | qpdf_offset_t max_offset{0}; | 613 | qpdf_offset_t max_offset{0}; |
| 613 | // If there are any xref streams, take the last one to appear. | 614 | // If there are any xref streams, take the last one to appear. |
| 614 | - for (auto const& iter: m->xref_table) { | 615 | + for (auto const& iter: *this) { |
| 615 | auto entry = iter.second; | 616 | auto entry = iter.second; |
| 616 | if (entry.getType() != 1) { | 617 | if (entry.getType() != 1) { |
| 617 | continue; | 618 | continue; |
| 618 | } | 619 | } |
| 619 | - auto oh = getObjectByObjGen(iter.first); | 620 | + auto oh = qpdf.getObjectByObjGen(iter.first); |
| 620 | try { | 621 | try { |
| 621 | if (!oh.isStreamOfType("/XRef")) { | 622 | if (!oh.isStreamOfType("/XRef")) { |
| 622 | continue; | 623 | continue; |
| @@ -627,41 +628,41 @@ QPDF::reconstruct_xref(QPDFExc& e) | @@ -627,41 +628,41 @@ QPDF::reconstruct_xref(QPDFExc& e) | ||
| 627 | auto offset = entry.getOffset(); | 628 | auto offset = entry.getOffset(); |
| 628 | if (offset > max_offset) { | 629 | if (offset > max_offset) { |
| 629 | max_offset = offset; | 630 | max_offset = offset; |
| 630 | - setTrailer(oh.getDict()); | 631 | + trailer = oh.getDict(); |
| 631 | } | 632 | } |
| 632 | check_warnings(); | 633 | check_warnings(); |
| 633 | } | 634 | } |
| 634 | if (max_offset > 0) { | 635 | if (max_offset > 0) { |
| 635 | try { | 636 | try { |
| 636 | - read_xref(max_offset); | 637 | + qpdf.read_xref(max_offset); |
| 637 | } catch (std::exception&) { | 638 | } catch (std::exception&) { |
| 638 | - throw damagedPDF( | ||
| 639 | - "", 0, "error decoding candidate xref stream while recovering damaged file"); | 639 | + throw damaged_pdf( |
| 640 | + "error decoding candidate xref stream while recovering damaged file"); | ||
| 640 | } | 641 | } |
| 641 | QTC::TC("qpdf", "QPDF recover xref stream"); | 642 | QTC::TC("qpdf", "QPDF recover xref stream"); |
| 642 | } | 643 | } |
| 643 | } | 644 | } |
| 644 | 645 | ||
| 645 | - if (!m->xref_table.trailer) { | 646 | + if (!trailer) { |
| 646 | // We could check the last encountered object to see if it was an xref stream. If so, we | 647 | // We could check the last encountered object to see if it was an xref stream. If so, we |
| 647 | // could try to get the trailer from there. This may make it possible to recover files with | 648 | // could try to get the trailer from there. This may make it possible to recover files with |
| 648 | // bad startxref pointers even when they have object streams. | 649 | // bad startxref pointers even when they have object streams. |
| 649 | 650 | ||
| 650 | - throw damagedPDF("", 0, "unable to find trailer dictionary while recovering damaged file"); | 651 | + throw damaged_pdf("unable to find trailer dictionary while recovering damaged file"); |
| 651 | } | 652 | } |
| 652 | - if (m->xref_table.empty()) { | 653 | + if (empty()) { |
| 653 | // We cannot check for an empty xref table in parse because empty tables are valid when | 654 | // We cannot check for an empty xref table in parse because empty tables are valid when |
| 654 | // creating QPDF objects from JSON. | 655 | // creating QPDF objects from JSON. |
| 655 | - throw damagedPDF("", 0, "unable to find objects while recovering damaged file"); | 656 | + throw damaged_pdf("unable to find objects while recovering damaged file"); |
| 656 | } | 657 | } |
| 657 | check_warnings(); | 658 | check_warnings(); |
| 658 | - if (!m->xref_table.parsed) { | ||
| 659 | - m->xref_table.parsed = true; | ||
| 660 | - getAllPages(); | 659 | + if (!parsed) { |
| 660 | + parsed = true; | ||
| 661 | + qpdf.getAllPages(); | ||
| 661 | check_warnings(); | 662 | check_warnings(); |
| 662 | if (m->all_pages.empty()) { | 663 | if (m->all_pages.empty()) { |
| 663 | - m->xref_table.parsed = false; | ||
| 664 | - throw damagedPDF("", 0, "unable to find any pages while recovering damaged file"); | 664 | + parsed = false; |
| 665 | + throw damaged_pdf("unable to find any pages while recovering damaged file"); | ||
| 665 | } | 666 | } |
| 666 | } | 667 | } |
| 667 | // We could iterate through the objects looking for streams and try to find objects inside of | 668 | // We could iterate through the objects looking for streams and try to find objects inside of |
| @@ -1766,7 +1767,7 @@ QPDF::readObjectAtOffset( | @@ -1766,7 +1767,7 @@ QPDF::readObjectAtOffset( | ||
| 1766 | } catch (QPDFExc& e) { | 1767 | } catch (QPDFExc& e) { |
| 1767 | if (try_recovery) { | 1768 | if (try_recovery) { |
| 1768 | // Try again after reconstructing xref table | 1769 | // Try again after reconstructing xref table |
| 1769 | - reconstruct_xref(e); | 1770 | + m->xref_table.reconstruct(e); |
| 1770 | if (m->xref_table.count(exp_og) && (m->xref_table[exp_og].getType() == 1)) { | 1771 | if (m->xref_table.count(exp_og) && (m->xref_table[exp_og].getType() == 1)) { |
| 1771 | qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset(); | 1772 | qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset(); |
| 1772 | QPDFObjectHandle result = | 1773 | QPDFObjectHandle result = |
libqpdf/qpdf/QPDF_private.hh
| @@ -16,6 +16,8 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry> | @@ -16,6 +16,8 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry> | ||
| 16 | void insert(int obj, int f0, qpdf_offset_t f1, int f2); | 16 | void insert(int obj, int f0, qpdf_offset_t f1, int f2); |
| 17 | void insert_free(QPDFObjGen); | 17 | void insert_free(QPDFObjGen); |
| 18 | 18 | ||
| 19 | + void reconstruct(QPDFExc& e); | ||
| 20 | + | ||
| 19 | QPDFObjectHandle trailer; | 21 | QPDFObjectHandle trailer; |
| 20 | bool reconstructed{false}; | 22 | bool reconstructed{false}; |
| 21 | // Various tables are indexed by object id, with potential size id + 1 | 23 | // Various tables are indexed by object id, with potential size id + 1 |
| @@ -31,6 +33,17 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry> | @@ -31,6 +33,17 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry> | ||
| 31 | qpdf_offset_t first_item_offset{0}; // actual value from file | 33 | qpdf_offset_t first_item_offset{0}; // actual value from file |
| 32 | 34 | ||
| 33 | private: | 35 | private: |
| 36 | + QPDFExc | ||
| 37 | + damaged_pdf(std::string const& msg) | ||
| 38 | + { | ||
| 39 | + return qpdf.damagedPDF("", 0, msg); | ||
| 40 | + } | ||
| 41 | + | ||
| 42 | + void | ||
| 43 | + warn_damaged(std::string const& msg) | ||
| 44 | + { | ||
| 45 | + qpdf.warn(damaged_pdf(msg)); | ||
| 46 | + } | ||
| 34 | QPDF& qpdf; | 47 | QPDF& qpdf; |
| 35 | }; | 48 | }; |
| 36 | 49 |
qpdf/qtest/object-stream.test
| @@ -102,11 +102,10 @@ $td->runtest("recover file with xref stream", | @@ -102,11 +102,10 @@ $td->runtest("recover file with xref stream", | ||
| 102 | {$td->COMMAND => "qpdf --static-id --compress-streams=n" . | 102 | {$td->COMMAND => "qpdf --static-id --compress-streams=n" . |
| 103 | " recover-xref-stream.pdf a.pdf"}, | 103 | " recover-xref-stream.pdf a.pdf"}, |
| 104 | {$td->FILE => "recover-xref-stream.out", $td->EXIT_STATUS => 3}, | 104 | {$td->FILE => "recover-xref-stream.out", $td->EXIT_STATUS => 3}, |
| 105 | - $td->EXPECT_FAILURE); | 105 | + $td->NORMALIZE_NEWLINES); |
| 106 | $td->runtest("check file", | 106 | $td->runtest("check file", |
| 107 | {$td->FILE => "a.pdf"}, | 107 | {$td->FILE => "a.pdf"}, |
| 108 | - {$td->FILE => "recover-xref-stream-recovered.pdf"}, | ||
| 109 | - $td->EXPECT_FAILURE); | 108 | + {$td->FILE => "recover-xref-stream-recovered.pdf"}); |
| 110 | 109 | ||
| 111 | # Self-referential object stream | 110 | # Self-referential object stream |
| 112 | $td->runtest("self-referential object stream", | 111 | $td->runtest("self-referential object stream", |
qpdf/qtest/qpdf/recover-xref-stream.out
| 1 | WARNING: recover-xref-stream.pdf: file is damaged | 1 | WARNING: recover-xref-stream.pdf: file is damaged |
| 2 | WARNING: recover-xref-stream.pdf: can't find startxref | 2 | WARNING: recover-xref-stream.pdf: can't find startxref |
| 3 | WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table | 3 | WARNING: recover-xref-stream.pdf: Attempting to reconstruct cross-reference table |
| 4 | -WARNING: recover-xref-stream.pdf: reported number of objects (14) is not one plus the highest object number (15) | ||
| 5 | qpdf: operation succeeded with warnings; resulting file may have some problems | 4 | qpdf: operation succeeded with warnings; resulting file may have some problems |