Commit 1e2dcbf03e8f1114b42c1237b536019fff2e1f4c
1 parent b1d845e7
Add QPDF::Xref_table members file and tokenizer
Showing 2 changed files with 48 additions and 49 deletions
libqpdf/QPDF.cc
| @@ -201,7 +201,7 @@ QPDF::Members::Members(QPDF& qpdf) : | @@ -201,7 +201,7 @@ QPDF::Members::Members(QPDF& qpdf) : | ||
| 201 | file_sp(new InvalidInputSource()), | 201 | file_sp(new InvalidInputSource()), |
| 202 | file(file_sp.get()), | 202 | file(file_sp.get()), |
| 203 | encp(new EncryptionParameters), | 203 | encp(new EncryptionParameters), |
| 204 | - xref_table(qpdf) | 204 | + xref_table(qpdf, file) |
| 205 | { | 205 | { |
| 206 | } | 206 | } |
| 207 | 207 | ||
| @@ -495,12 +495,10 @@ QPDF::warn( | @@ -495,12 +495,10 @@ QPDF::warn( | ||
| 495 | void | 495 | void |
| 496 | QPDF::Xref_table::initialize() | 496 | QPDF::Xref_table::initialize() |
| 497 | { | 497 | { |
| 498 | - auto* m = qpdf.m.get(); | ||
| 499 | - | ||
| 500 | // PDF spec says %%EOF must be found within the last 1024 bytes of the file. We add an extra | 498 | // PDF spec says %%EOF must be found within the last 1024 bytes of the file. We add an extra |
| 501 | // 30 characters to leave room for the startxref stuff. | 499 | // 30 characters to leave room for the startxref stuff. |
| 502 | - m->file->seek(0, SEEK_END); | ||
| 503 | - qpdf_offset_t end_offset = m->file->tell(); | 500 | + file->seek(0, SEEK_END); |
| 501 | + qpdf_offset_t end_offset = file->tell(); | ||
| 504 | max_offset = end_offset; | 502 | max_offset = end_offset; |
| 505 | // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic | 503 | // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic |
| 506 | // scenarios at least 3 bytes are required. | 504 | // scenarios at least 3 bytes are required. |
| @@ -510,8 +508,8 @@ QPDF::Xref_table::initialize() | @@ -510,8 +508,8 @@ QPDF::Xref_table::initialize() | ||
| 510 | qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0); | 508 | qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0); |
| 511 | PatternFinder sf(qpdf, &QPDF::findStartxref); | 509 | PatternFinder sf(qpdf, &QPDF::findStartxref); |
| 512 | qpdf_offset_t xref_offset = 0; | 510 | qpdf_offset_t xref_offset = 0; |
| 513 | - if (m->file->findLast("startxref", start_offset, 0, sf)) { | ||
| 514 | - xref_offset = QUtil::string_to_ll(qpdf.readToken(*m->file).getValue().c_str()); | 511 | + if (file->findLast("startxref", start_offset, 0, sf)) { |
| 512 | + xref_offset = QUtil::string_to_ll(read_token().getValue().c_str()); | ||
| 515 | } | 513 | } |
| 516 | 514 | ||
| 517 | try { | 515 | try { |
| @@ -547,11 +545,9 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) | @@ -547,11 +545,9 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) | ||
| 547 | throw e; | 545 | throw e; |
| 548 | } | 546 | } |
| 549 | 547 | ||
| 550 | - auto* m = qpdf.m.get(); | ||
| 551 | - | ||
| 552 | // If recovery generates more than 1000 warnings, the file is so severely damaged that there | 548 | // If recovery generates more than 1000 warnings, the file is so severely damaged that there |
| 553 | // probably is no point trying to continue. | 549 | // probably is no point trying to continue. |
| 554 | - const auto max_warnings = m->warnings.size() + 1000U; | 550 | + const auto max_warnings = qpdf.m->warnings.size() + 1000U; |
| 555 | auto check_warnings = [this, max_warnings]() { | 551 | auto check_warnings = [this, max_warnings]() { |
| 556 | if (qpdf.m->warnings.size() > max_warnings) { | 552 | if (qpdf.m->warnings.size() > max_warnings) { |
| 557 | throw damaged_pdf("too many errors while reconstructing cross-reference table"); | 553 | throw damaged_pdf("too many errors while reconstructing cross-reference table"); |
| @@ -560,7 +556,7 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) | @@ -560,7 +556,7 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) | ||
| 560 | 556 | ||
| 561 | reconstructed = true; | 557 | reconstructed = true; |
| 562 | // We may find more objects, which may contain dangling references. | 558 | // We may find more objects, which may contain dangling references. |
| 563 | - m->fixed_dangling_refs = false; | 559 | + qpdf.m->fixed_dangling_refs = false; |
| 564 | 560 | ||
| 565 | warn_damaged("file is damaged"); | 561 | warn_damaged("file is damaged"); |
| 566 | qpdf.warn(e); | 562 | qpdf.warn(e); |
| @@ -577,18 +573,18 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) | @@ -577,18 +573,18 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) | ||
| 577 | erase(iter); | 573 | erase(iter); |
| 578 | } | 574 | } |
| 579 | 575 | ||
| 580 | - m->file->seek(0, SEEK_END); | ||
| 581 | - qpdf_offset_t eof = m->file->tell(); | ||
| 582 | - m->file->seek(0, SEEK_SET); | 576 | + file->seek(0, SEEK_END); |
| 577 | + qpdf_offset_t eof = file->tell(); | ||
| 578 | + file->seek(0, SEEK_SET); | ||
| 583 | // Don't allow very long tokens here during recovery. All the interesting tokens are covered. | 579 | // Don't allow very long tokens here during recovery. All the interesting tokens are covered. |
| 584 | static size_t const MAX_LEN = 10; | 580 | static size_t const MAX_LEN = 10; |
| 585 | - while (m->file->tell() < eof) { | ||
| 586 | - QPDFTokenizer::Token t1 = qpdf.readToken(*m->file, MAX_LEN); | ||
| 587 | - qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length()); | 581 | + while (file->tell() < eof) { |
| 582 | + QPDFTokenizer::Token t1 = read_token(MAX_LEN); | ||
| 583 | + qpdf_offset_t token_start = file->tell() - toO(t1.getValue().length()); | ||
| 588 | if (t1.isInteger()) { | 584 | if (t1.isInteger()) { |
| 589 | - auto pos = m->file->tell(); | ||
| 590 | - QPDFTokenizer::Token t2 = qpdf.readToken(*m->file, MAX_LEN); | ||
| 591 | - if (t2.isInteger() && qpdf.readToken(*m->file, MAX_LEN).isWord("obj")) { | 585 | + auto pos = file->tell(); |
| 586 | + QPDFTokenizer::Token t2 = read_token(MAX_LEN); | ||
| 587 | + if (t2.isInteger() && read_token(MAX_LEN).isWord("obj")) { | ||
| 592 | int obj = QUtil::string_to_int(t1.getValue().c_str()); | 588 | int obj = QUtil::string_to_int(t1.getValue().c_str()); |
| 593 | int gen = QUtil::string_to_int(t2.getValue().c_str()); | 589 | int gen = QUtil::string_to_int(t2.getValue().c_str()); |
| 594 | if (obj <= max_id) { | 590 | if (obj <= max_id) { |
| @@ -597,19 +593,19 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) | @@ -597,19 +593,19 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) | ||
| 597 | warn_damaged("ignoring object with impossibly large id " + std::to_string(obj)); | 593 | warn_damaged("ignoring object with impossibly large id " + std::to_string(obj)); |
| 598 | } | 594 | } |
| 599 | } | 595 | } |
| 600 | - m->file->seek(pos, SEEK_SET); | 596 | + file->seek(pos, SEEK_SET); |
| 601 | } else if (!trailer && t1.isWord("trailer")) { | 597 | } else if (!trailer && t1.isWord("trailer")) { |
| 602 | - auto pos = m->file->tell(); | 598 | + auto pos = file->tell(); |
| 603 | QPDFObjectHandle t = qpdf.readTrailer(); | 599 | QPDFObjectHandle t = qpdf.readTrailer(); |
| 604 | if (!t.isDictionary()) { | 600 | if (!t.isDictionary()) { |
| 605 | // Oh well. It was worth a try. | 601 | // Oh well. It was worth a try. |
| 606 | } else { | 602 | } else { |
| 607 | trailer = t; | 603 | trailer = t; |
| 608 | } | 604 | } |
| 609 | - m->file->seek(pos, SEEK_SET); | 605 | + file->seek(pos, SEEK_SET); |
| 610 | } | 606 | } |
| 611 | check_warnings(); | 607 | check_warnings(); |
| 612 | - m->file->findAndSkipNextEOL(); | 608 | + file->findAndSkipNextEOL(); |
| 613 | } | 609 | } |
| 614 | deleted_objects.clear(); | 610 | deleted_objects.clear(); |
| 615 | 611 | ||
| @@ -664,7 +660,7 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) | @@ -664,7 +660,7 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) | ||
| 664 | parsed = true; | 660 | parsed = true; |
| 665 | qpdf.getAllPages(); | 661 | qpdf.getAllPages(); |
| 666 | check_warnings(); | 662 | check_warnings(); |
| 667 | - if (m->all_pages.empty()) { | 663 | + if (qpdf.m->all_pages.empty()) { |
| 668 | parsed = false; | 664 | parsed = false; |
| 669 | throw damaged_pdf("unable to find any pages while recovering damaged file"); | 665 | throw damaged_pdf("unable to find any pages while recovering damaged file"); |
| 670 | } | 666 | } |
| @@ -679,15 +675,13 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) | @@ -679,15 +675,13 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) | ||
| 679 | void | 675 | void |
| 680 | QPDF::Xref_table::read(qpdf_offset_t xref_offset) | 676 | QPDF::Xref_table::read(qpdf_offset_t xref_offset) |
| 681 | { | 677 | { |
| 682 | - auto* m = qpdf.m.get(); | ||
| 683 | - | ||
| 684 | std::map<int, int> free_table; | 678 | std::map<int, int> free_table; |
| 685 | std::set<qpdf_offset_t> visited; | 679 | std::set<qpdf_offset_t> visited; |
| 686 | while (xref_offset) { | 680 | while (xref_offset) { |
| 687 | visited.insert(xref_offset); | 681 | visited.insert(xref_offset); |
| 688 | char buf[7]; | 682 | char buf[7]; |
| 689 | memset(buf, 0, sizeof(buf)); | 683 | memset(buf, 0, sizeof(buf)); |
| 690 | - m->file->seek(xref_offset, SEEK_SET); | 684 | + file->seek(xref_offset, SEEK_SET); |
| 691 | // Some files miss the mark a little with startxref. We could do a better job of searching | 685 | // Some files miss the mark a little with startxref. We could do a better job of searching |
| 692 | // in the neighborhood for something that looks like either an xref table or stream, but the | 686 | // in the neighborhood for something that looks like either an xref table or stream, but the |
| 693 | // simple heuristic of skipping whitespace can help with the xref table case and is harmless | 687 | // simple heuristic of skipping whitespace can help with the xref table case and is harmless |
| @@ -696,11 +690,11 @@ QPDF::Xref_table::read(qpdf_offset_t xref_offset) | @@ -696,11 +690,11 @@ QPDF::Xref_table::read(qpdf_offset_t xref_offset) | ||
| 696 | bool skipped_space = false; | 690 | bool skipped_space = false; |
| 697 | while (!done) { | 691 | while (!done) { |
| 698 | char ch; | 692 | char ch; |
| 699 | - if (1 == m->file->read(&ch, 1)) { | 693 | + if (1 == file->read(&ch, 1)) { |
| 700 | if (QUtil::is_space(ch)) { | 694 | if (QUtil::is_space(ch)) { |
| 701 | skipped_space = true; | 695 | skipped_space = true; |
| 702 | } else { | 696 | } else { |
| 703 | - m->file->unreadCh(ch); | 697 | + file->unreadCh(ch); |
| 704 | done = true; | 698 | done = true; |
| 705 | } | 699 | } |
| 706 | } else { | 700 | } else { |
| @@ -709,7 +703,7 @@ QPDF::Xref_table::read(qpdf_offset_t xref_offset) | @@ -709,7 +703,7 @@ QPDF::Xref_table::read(qpdf_offset_t xref_offset) | ||
| 709 | } | 703 | } |
| 710 | } | 704 | } |
| 711 | 705 | ||
| 712 | - m->file->read(buf, sizeof(buf) - 1); | 706 | + file->read(buf, sizeof(buf) - 1); |
| 713 | // The PDF spec says xref must be followed by a line terminator, but files exist in the wild | 707 | // The PDF spec says xref must be followed by a line terminator, but files exist in the wild |
| 714 | // where it is terminated by arbitrary whitespace. | 708 | // where it is terminated by arbitrary whitespace. |
| 715 | if ((strncmp(buf, "xref", 4) == 0) && QUtil::is_space(buf[4])) { | 709 | if ((strncmp(buf, "xref", 4) == 0) && QUtil::is_space(buf[4])) { |
| @@ -823,11 +817,9 @@ QPDF::Xref_table::parse_first(std::string const& line, int& obj, int& num, int& | @@ -823,11 +817,9 @@ QPDF::Xref_table::parse_first(std::string const& line, int& obj, int& num, int& | ||
| 823 | bool | 817 | bool |
| 824 | QPDF::Xref_table::read_bad_entry(qpdf_offset_t& f1, int& f2, char& type) | 818 | QPDF::Xref_table::read_bad_entry(qpdf_offset_t& f1, int& f2, char& type) |
| 825 | { | 819 | { |
| 826 | - auto* m = qpdf.m.get(); | ||
| 827 | - | ||
| 828 | // Reposition after initial read attempt and reread. | 820 | // Reposition after initial read attempt and reread. |
| 829 | - m->file->seek(m->file->getLastOffset(), SEEK_SET); | ||
| 830 | - auto line = m->file->readLine(30); | 821 | + file->seek(file->getLastOffset(), SEEK_SET); |
| 822 | + auto line = file->readLine(30); | ||
| 831 | 823 | ||
| 832 | // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated | 824 | // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated |
| 833 | // buffer. | 825 | // buffer. |
| @@ -907,10 +899,8 @@ QPDF::Xref_table::read_bad_entry(qpdf_offset_t& f1, int& f2, char& type) | @@ -907,10 +899,8 @@ QPDF::Xref_table::read_bad_entry(qpdf_offset_t& f1, int& f2, char& type) | ||
| 907 | bool | 899 | bool |
| 908 | QPDF::Xref_table::read_entry(qpdf_offset_t& f1, int& f2, char& type) | 900 | QPDF::Xref_table::read_entry(qpdf_offset_t& f1, int& f2, char& type) |
| 909 | { | 901 | { |
| 910 | - auto* m = qpdf.m.get(); | ||
| 911 | - | ||
| 912 | std::array<char, 21> line; | 902 | std::array<char, 21> line; |
| 913 | - if (m->file->read(line.data(), 20) != 20) { | 903 | + if (file->read(line.data(), 20) != 20) { |
| 914 | // C++20: [[unlikely]] | 904 | // C++20: [[unlikely]] |
| 915 | return false; | 905 | return false; |
| 916 | } | 906 | } |
| @@ -963,13 +953,11 @@ QPDF::Xref_table::read_entry(qpdf_offset_t& f1, int& f2, char& type) | @@ -963,13 +953,11 @@ QPDF::Xref_table::read_entry(qpdf_offset_t& f1, int& f2, char& type) | ||
| 963 | qpdf_offset_t | 953 | qpdf_offset_t |
| 964 | QPDF::Xref_table::read_table(qpdf_offset_t xref_offset) | 954 | QPDF::Xref_table::read_table(qpdf_offset_t xref_offset) |
| 965 | { | 955 | { |
| 966 | - auto* m = qpdf.m.get(); | ||
| 967 | - | ||
| 968 | - m->file->seek(xref_offset, SEEK_SET); | 956 | + file->seek(xref_offset, SEEK_SET); |
| 969 | std::string line; | 957 | std::string line; |
| 970 | while (true) { | 958 | while (true) { |
| 971 | line.assign(50, '\0'); | 959 | line.assign(50, '\0'); |
| 972 | - m->file->read(line.data(), line.size()); | 960 | + file->read(line.data(), line.size()); |
| 973 | int obj = 0; | 961 | int obj = 0; |
| 974 | int num = 0; | 962 | int num = 0; |
| 975 | int bytes = 0; | 963 | int bytes = 0; |
| @@ -977,11 +965,11 @@ QPDF::Xref_table::read_table(qpdf_offset_t xref_offset) | @@ -977,11 +965,11 @@ QPDF::Xref_table::read_table(qpdf_offset_t xref_offset) | ||
| 977 | QTC::TC("qpdf", "QPDF invalid xref"); | 965 | QTC::TC("qpdf", "QPDF invalid xref"); |
| 978 | throw damaged_table("xref syntax invalid"); | 966 | throw damaged_table("xref syntax invalid"); |
| 979 | } | 967 | } |
| 980 | - m->file->seek(m->file->getLastOffset() + bytes, SEEK_SET); | 968 | + file->seek(file->getLastOffset() + bytes, SEEK_SET); |
| 981 | for (qpdf_offset_t i = obj; i - num < obj; ++i) { | 969 | for (qpdf_offset_t i = obj; i - num < obj; ++i) { |
| 982 | if (i == 0) { | 970 | if (i == 0) { |
| 983 | // This is needed by checkLinearization() | 971 | // This is needed by checkLinearization() |
| 984 | - first_item_offset = m->file->tell(); | 972 | + first_item_offset = file->tell(); |
| 985 | } | 973 | } |
| 986 | // For xref_table, these will always be small enough to be ints | 974 | // For xref_table, these will always be small enough to be ints |
| 987 | qpdf_offset_t f1 = 0; | 975 | qpdf_offset_t f1 = 0; |
| @@ -997,11 +985,11 @@ QPDF::Xref_table::read_table(qpdf_offset_t xref_offset) | @@ -997,11 +985,11 @@ QPDF::Xref_table::read_table(qpdf_offset_t xref_offset) | ||
| 997 | insert(toI(i), 1, f1, f2); | 985 | insert(toI(i), 1, f1, f2); |
| 998 | } | 986 | } |
| 999 | } | 987 | } |
| 1000 | - qpdf_offset_t pos = m->file->tell(); | ||
| 1001 | - if (qpdf.readToken(*m->file).isWord("trailer")) { | 988 | + qpdf_offset_t pos = file->tell(); |
| 989 | + if (read_token().isWord("trailer")) { | ||
| 1002 | break; | 990 | break; |
| 1003 | } else { | 991 | } else { |
| 1004 | - m->file->seek(pos, SEEK_SET); | 992 | + file->seek(pos, SEEK_SET); |
| 1005 | } | 993 | } |
| 1006 | } | 994 | } |
| 1007 | 995 |
libqpdf/qpdf/QPDF_private.hh
| @@ -7,9 +7,11 @@ | @@ -7,9 +7,11 @@ | ||
| 7 | class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry> | 7 | class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry> |
| 8 | { | 8 | { |
| 9 | public: | 9 | public: |
| 10 | - Xref_table(QPDF& qpdf) : | ||
| 11 | - qpdf(qpdf) | 10 | + Xref_table(QPDF& qpdf, InputSource* const& file) : |
| 11 | + qpdf(qpdf), | ||
| 12 | + file(file) | ||
| 12 | { | 13 | { |
| 14 | + tokenizer.allowEOF(); | ||
| 13 | } | 15 | } |
| 14 | 16 | ||
| 15 | void initialize(); | 17 | void initialize(); |
| @@ -50,6 +52,12 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry> | @@ -50,6 +52,12 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry> | ||
| 50 | int max_num_entries, | 52 | int max_num_entries, |
| 51 | std::function<QPDFExc(std::string_view)> damaged); | 53 | std::function<QPDFExc(std::string_view)> damaged); |
| 52 | 54 | ||
| 55 | + QPDFTokenizer::Token | ||
| 56 | + read_token(size_t max_len = 0) | ||
| 57 | + { | ||
| 58 | + return tokenizer.readToken(*file, "", true, max_len); | ||
| 59 | + } | ||
| 60 | + | ||
| 53 | // Methods to insert table entries | 61 | // Methods to insert table entries |
| 54 | void insert_reconstructed(int obj, qpdf_offset_t f1, int f2); | 62 | void insert_reconstructed(int obj, qpdf_offset_t f1, int f2); |
| 55 | void insert(int obj, int f0, qpdf_offset_t f1, int f2); | 63 | void insert(int obj, int f0, qpdf_offset_t f1, int f2); |
| @@ -72,7 +80,10 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry> | @@ -72,7 +80,10 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry> | ||
| 72 | { | 80 | { |
| 73 | qpdf.warn(damaged_pdf(msg)); | 81 | qpdf.warn(damaged_pdf(msg)); |
| 74 | } | 82 | } |
| 83 | + | ||
| 75 | QPDF& qpdf; | 84 | QPDF& qpdf; |
| 85 | + InputSource* const& file; | ||
| 86 | + QPDFTokenizer tokenizer; | ||
| 76 | }; | 87 | }; |
| 77 | 88 | ||
| 78 | // Writer class is restricted to QPDFWriter so that only it can call certain methods. | 89 | // Writer class is restricted to QPDFWriter so that only it can call certain methods. |