Commit 1e2dcbf03e8f1114b42c1237b536019fff2e1f4c
1 parent
b1d845e7
Add QPDF::Xref_table members file and tokenizer
Showing
2 changed files
with
48 additions
and
49 deletions
libqpdf/QPDF.cc
| ... | ... | @@ -201,7 +201,7 @@ QPDF::Members::Members(QPDF& qpdf) : |
| 201 | 201 | file_sp(new InvalidInputSource()), |
| 202 | 202 | file(file_sp.get()), |
| 203 | 203 | encp(new EncryptionParameters), |
| 204 | - xref_table(qpdf) | |
| 204 | + xref_table(qpdf, file) | |
| 205 | 205 | { |
| 206 | 206 | } |
| 207 | 207 | |
| ... | ... | @@ -495,12 +495,10 @@ QPDF::warn( |
| 495 | 495 | void |
| 496 | 496 | QPDF::Xref_table::initialize() |
| 497 | 497 | { |
| 498 | - auto* m = qpdf.m.get(); | |
| 499 | - | |
| 500 | 498 | // PDF spec says %%EOF must be found within the last 1024 bytes of the file. We add an extra |
| 501 | 499 | // 30 characters to leave room for the startxref stuff. |
| 502 | - m->file->seek(0, SEEK_END); | |
| 503 | - qpdf_offset_t end_offset = m->file->tell(); | |
| 500 | + file->seek(0, SEEK_END); | |
| 501 | + qpdf_offset_t end_offset = file->tell(); | |
| 504 | 502 | max_offset = end_offset; |
| 505 | 503 | // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic |
| 506 | 504 | // scenarios at least 3 bytes are required. |
| ... | ... | @@ -510,8 +508,8 @@ QPDF::Xref_table::initialize() |
| 510 | 508 | qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0); |
| 511 | 509 | PatternFinder sf(qpdf, &QPDF::findStartxref); |
| 512 | 510 | qpdf_offset_t xref_offset = 0; |
| 513 | - if (m->file->findLast("startxref", start_offset, 0, sf)) { | |
| 514 | - xref_offset = QUtil::string_to_ll(qpdf.readToken(*m->file).getValue().c_str()); | |
| 511 | + if (file->findLast("startxref", start_offset, 0, sf)) { | |
| 512 | + xref_offset = QUtil::string_to_ll(read_token().getValue().c_str()); | |
| 515 | 513 | } |
| 516 | 514 | |
| 517 | 515 | try { |
| ... | ... | @@ -547,11 +545,9 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) |
| 547 | 545 | throw e; |
| 548 | 546 | } |
| 549 | 547 | |
| 550 | - auto* m = qpdf.m.get(); | |
| 551 | - | |
| 552 | 548 | // If recovery generates more than 1000 warnings, the file is so severely damaged that there |
| 553 | 549 | // probably is no point trying to continue. |
| 554 | - const auto max_warnings = m->warnings.size() + 1000U; | |
| 550 | + const auto max_warnings = qpdf.m->warnings.size() + 1000U; | |
| 555 | 551 | auto check_warnings = [this, max_warnings]() { |
| 556 | 552 | if (qpdf.m->warnings.size() > max_warnings) { |
| 557 | 553 | throw damaged_pdf("too many errors while reconstructing cross-reference table"); |
| ... | ... | @@ -560,7 +556,7 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) |
| 560 | 556 | |
| 561 | 557 | reconstructed = true; |
| 562 | 558 | // We may find more objects, which may contain dangling references. |
| 563 | - m->fixed_dangling_refs = false; | |
| 559 | + qpdf.m->fixed_dangling_refs = false; | |
| 564 | 560 | |
| 565 | 561 | warn_damaged("file is damaged"); |
| 566 | 562 | qpdf.warn(e); |
| ... | ... | @@ -577,18 +573,18 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) |
| 577 | 573 | erase(iter); |
| 578 | 574 | } |
| 579 | 575 | |
| 580 | - m->file->seek(0, SEEK_END); | |
| 581 | - qpdf_offset_t eof = m->file->tell(); | |
| 582 | - m->file->seek(0, SEEK_SET); | |
| 576 | + file->seek(0, SEEK_END); | |
| 577 | + qpdf_offset_t eof = file->tell(); | |
| 578 | + file->seek(0, SEEK_SET); | |
| 583 | 579 | // Don't allow very long tokens here during recovery. All the interesting tokens are covered. |
| 584 | 580 | static size_t const MAX_LEN = 10; |
| 585 | - while (m->file->tell() < eof) { | |
| 586 | - QPDFTokenizer::Token t1 = qpdf.readToken(*m->file, MAX_LEN); | |
| 587 | - qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length()); | |
| 581 | + while (file->tell() < eof) { | |
| 582 | + QPDFTokenizer::Token t1 = read_token(MAX_LEN); | |
| 583 | + qpdf_offset_t token_start = file->tell() - toO(t1.getValue().length()); | |
| 588 | 584 | if (t1.isInteger()) { |
| 589 | - auto pos = m->file->tell(); | |
| 590 | - QPDFTokenizer::Token t2 = qpdf.readToken(*m->file, MAX_LEN); | |
| 591 | - if (t2.isInteger() && qpdf.readToken(*m->file, MAX_LEN).isWord("obj")) { | |
| 585 | + auto pos = file->tell(); | |
| 586 | + QPDFTokenizer::Token t2 = read_token(MAX_LEN); | |
| 587 | + if (t2.isInteger() && read_token(MAX_LEN).isWord("obj")) { | |
| 592 | 588 | int obj = QUtil::string_to_int(t1.getValue().c_str()); |
| 593 | 589 | int gen = QUtil::string_to_int(t2.getValue().c_str()); |
| 594 | 590 | if (obj <= max_id) { |
| ... | ... | @@ -597,19 +593,19 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) |
| 597 | 593 | warn_damaged("ignoring object with impossibly large id " + std::to_string(obj)); |
| 598 | 594 | } |
| 599 | 595 | } |
| 600 | - m->file->seek(pos, SEEK_SET); | |
| 596 | + file->seek(pos, SEEK_SET); | |
| 601 | 597 | } else if (!trailer && t1.isWord("trailer")) { |
| 602 | - auto pos = m->file->tell(); | |
| 598 | + auto pos = file->tell(); | |
| 603 | 599 | QPDFObjectHandle t = qpdf.readTrailer(); |
| 604 | 600 | if (!t.isDictionary()) { |
| 605 | 601 | // Oh well. It was worth a try. |
| 606 | 602 | } else { |
| 607 | 603 | trailer = t; |
| 608 | 604 | } |
| 609 | - m->file->seek(pos, SEEK_SET); | |
| 605 | + file->seek(pos, SEEK_SET); | |
| 610 | 606 | } |
| 611 | 607 | check_warnings(); |
| 612 | - m->file->findAndSkipNextEOL(); | |
| 608 | + file->findAndSkipNextEOL(); | |
| 613 | 609 | } |
| 614 | 610 | deleted_objects.clear(); |
| 615 | 611 | |
| ... | ... | @@ -664,7 +660,7 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) |
| 664 | 660 | parsed = true; |
| 665 | 661 | qpdf.getAllPages(); |
| 666 | 662 | check_warnings(); |
| 667 | - if (m->all_pages.empty()) { | |
| 663 | + if (qpdf.m->all_pages.empty()) { | |
| 668 | 664 | parsed = false; |
| 669 | 665 | throw damaged_pdf("unable to find any pages while recovering damaged file"); |
| 670 | 666 | } |
| ... | ... | @@ -679,15 +675,13 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) |
| 679 | 675 | void |
| 680 | 676 | QPDF::Xref_table::read(qpdf_offset_t xref_offset) |
| 681 | 677 | { |
| 682 | - auto* m = qpdf.m.get(); | |
| 683 | - | |
| 684 | 678 | std::map<int, int> free_table; |
| 685 | 679 | std::set<qpdf_offset_t> visited; |
| 686 | 680 | while (xref_offset) { |
| 687 | 681 | visited.insert(xref_offset); |
| 688 | 682 | char buf[7]; |
| 689 | 683 | memset(buf, 0, sizeof(buf)); |
| 690 | - m->file->seek(xref_offset, SEEK_SET); | |
| 684 | + file->seek(xref_offset, SEEK_SET); | |
| 691 | 685 | // Some files miss the mark a little with startxref. We could do a better job of searching |
| 692 | 686 | // in the neighborhood for something that looks like either an xref table or stream, but the |
| 693 | 687 | // simple heuristic of skipping whitespace can help with the xref table case and is harmless |
| ... | ... | @@ -696,11 +690,11 @@ QPDF::Xref_table::read(qpdf_offset_t xref_offset) |
| 696 | 690 | bool skipped_space = false; |
| 697 | 691 | while (!done) { |
| 698 | 692 | char ch; |
| 699 | - if (1 == m->file->read(&ch, 1)) { | |
| 693 | + if (1 == file->read(&ch, 1)) { | |
| 700 | 694 | if (QUtil::is_space(ch)) { |
| 701 | 695 | skipped_space = true; |
| 702 | 696 | } else { |
| 703 | - m->file->unreadCh(ch); | |
| 697 | + file->unreadCh(ch); | |
| 704 | 698 | done = true; |
| 705 | 699 | } |
| 706 | 700 | } else { |
| ... | ... | @@ -709,7 +703,7 @@ QPDF::Xref_table::read(qpdf_offset_t xref_offset) |
| 709 | 703 | } |
| 710 | 704 | } |
| 711 | 705 | |
| 712 | - m->file->read(buf, sizeof(buf) - 1); | |
| 706 | + file->read(buf, sizeof(buf) - 1); | |
| 713 | 707 | // The PDF spec says xref must be followed by a line terminator, but files exist in the wild |
| 714 | 708 | // where it is terminated by arbitrary whitespace. |
| 715 | 709 | if ((strncmp(buf, "xref", 4) == 0) && QUtil::is_space(buf[4])) { |
| ... | ... | @@ -823,11 +817,9 @@ QPDF::Xref_table::parse_first(std::string const& line, int& obj, int& num, int& |
| 823 | 817 | bool |
| 824 | 818 | QPDF::Xref_table::read_bad_entry(qpdf_offset_t& f1, int& f2, char& type) |
| 825 | 819 | { |
| 826 | - auto* m = qpdf.m.get(); | |
| 827 | - | |
| 828 | 820 | // Reposition after initial read attempt and reread. |
| 829 | - m->file->seek(m->file->getLastOffset(), SEEK_SET); | |
| 830 | - auto line = m->file->readLine(30); | |
| 821 | + file->seek(file->getLastOffset(), SEEK_SET); | |
| 822 | + auto line = file->readLine(30); | |
| 831 | 823 | |
| 832 | 824 | // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated |
| 833 | 825 | // buffer. |
| ... | ... | @@ -907,10 +899,8 @@ QPDF::Xref_table::read_bad_entry(qpdf_offset_t& f1, int& f2, char& type) |
| 907 | 899 | bool |
| 908 | 900 | QPDF::Xref_table::read_entry(qpdf_offset_t& f1, int& f2, char& type) |
| 909 | 901 | { |
| 910 | - auto* m = qpdf.m.get(); | |
| 911 | - | |
| 912 | 902 | std::array<char, 21> line; |
| 913 | - if (m->file->read(line.data(), 20) != 20) { | |
| 903 | + if (file->read(line.data(), 20) != 20) { | |
| 914 | 904 | // C++20: [[unlikely]] |
| 915 | 905 | return false; |
| 916 | 906 | } |
| ... | ... | @@ -963,13 +953,11 @@ QPDF::Xref_table::read_entry(qpdf_offset_t& f1, int& f2, char& type) |
| 963 | 953 | qpdf_offset_t |
| 964 | 954 | QPDF::Xref_table::read_table(qpdf_offset_t xref_offset) |
| 965 | 955 | { |
| 966 | - auto* m = qpdf.m.get(); | |
| 967 | - | |
| 968 | - m->file->seek(xref_offset, SEEK_SET); | |
| 956 | + file->seek(xref_offset, SEEK_SET); | |
| 969 | 957 | std::string line; |
| 970 | 958 | while (true) { |
| 971 | 959 | line.assign(50, '\0'); |
| 972 | - m->file->read(line.data(), line.size()); | |
| 960 | + file->read(line.data(), line.size()); | |
| 973 | 961 | int obj = 0; |
| 974 | 962 | int num = 0; |
| 975 | 963 | int bytes = 0; |
| ... | ... | @@ -977,11 +965,11 @@ QPDF::Xref_table::read_table(qpdf_offset_t xref_offset) |
| 977 | 965 | QTC::TC("qpdf", "QPDF invalid xref"); |
| 978 | 966 | throw damaged_table("xref syntax invalid"); |
| 979 | 967 | } |
| 980 | - m->file->seek(m->file->getLastOffset() + bytes, SEEK_SET); | |
| 968 | + file->seek(file->getLastOffset() + bytes, SEEK_SET); | |
| 981 | 969 | for (qpdf_offset_t i = obj; i - num < obj; ++i) { |
| 982 | 970 | if (i == 0) { |
| 983 | 971 | // This is needed by checkLinearization() |
| 984 | - first_item_offset = m->file->tell(); | |
| 972 | + first_item_offset = file->tell(); | |
| 985 | 973 | } |
| 986 | 974 | // For xref_table, these will always be small enough to be ints |
| 987 | 975 | qpdf_offset_t f1 = 0; |
| ... | ... | @@ -997,11 +985,11 @@ QPDF::Xref_table::read_table(qpdf_offset_t xref_offset) |
| 997 | 985 | insert(toI(i), 1, f1, f2); |
| 998 | 986 | } |
| 999 | 987 | } |
| 1000 | - qpdf_offset_t pos = m->file->tell(); | |
| 1001 | - if (qpdf.readToken(*m->file).isWord("trailer")) { | |
| 988 | + qpdf_offset_t pos = file->tell(); | |
| 989 | + if (read_token().isWord("trailer")) { | |
| 1002 | 990 | break; |
| 1003 | 991 | } else { |
| 1004 | - m->file->seek(pos, SEEK_SET); | |
| 992 | + file->seek(pos, SEEK_SET); | |
| 1005 | 993 | } |
| 1006 | 994 | } |
| 1007 | 995 | ... | ... |
libqpdf/qpdf/QPDF_private.hh
| ... | ... | @@ -7,9 +7,11 @@ |
| 7 | 7 | class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry> |
| 8 | 8 | { |
| 9 | 9 | public: |
| 10 | - Xref_table(QPDF& qpdf) : | |
| 11 | - qpdf(qpdf) | |
| 10 | + Xref_table(QPDF& qpdf, InputSource* const& file) : | |
| 11 | + qpdf(qpdf), | |
| 12 | + file(file) | |
| 12 | 13 | { |
| 14 | + tokenizer.allowEOF(); | |
| 13 | 15 | } |
| 14 | 16 | |
| 15 | 17 | void initialize(); |
| ... | ... | @@ -50,6 +52,12 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry> |
| 50 | 52 | int max_num_entries, |
| 51 | 53 | std::function<QPDFExc(std::string_view)> damaged); |
| 52 | 54 | |
| 55 | + QPDFTokenizer::Token | |
| 56 | + read_token(size_t max_len = 0) | |
| 57 | + { | |
| 58 | + return tokenizer.readToken(*file, "", true, max_len); | |
| 59 | + } | |
| 60 | + | |
| 53 | 61 | // Methods to insert table entries |
| 54 | 62 | void insert_reconstructed(int obj, qpdf_offset_t f1, int f2); |
| 55 | 63 | void insert(int obj, int f0, qpdf_offset_t f1, int f2); |
| ... | ... | @@ -72,7 +80,10 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry> |
| 72 | 80 | { |
| 73 | 81 | qpdf.warn(damaged_pdf(msg)); |
| 74 | 82 | } |
| 83 | + | |
| 75 | 84 | QPDF& qpdf; |
| 85 | + InputSource* const& file; | |
| 86 | + QPDFTokenizer tokenizer; | |
| 76 | 87 | }; |
| 77 | 88 | |
| 78 | 89 | // Writer class is restricted to QPDFWriter so that only it can call certain methods. | ... | ... |