Commit 1e2dcbf03e8f1114b42c1237b536019fff2e1f4c

Authored by m-holger
1 parent b1d845e7

Add QPDF::Xref_table members file and tokenizer

libqpdf/QPDF.cc
... ... @@ -201,7 +201,7 @@ QPDF::Members::Members(QPDF& qpdf) :
201 201 file_sp(new InvalidInputSource()),
202 202 file(file_sp.get()),
203 203 encp(new EncryptionParameters),
204   - xref_table(qpdf)
  204 + xref_table(qpdf, file)
205 205 {
206 206 }
207 207  
... ... @@ -495,12 +495,10 @@ QPDF::warn(
495 495 void
496 496 QPDF::Xref_table::initialize()
497 497 {
498   - auto* m = qpdf.m.get();
499   -
500 498 // PDF spec says %%EOF must be found within the last 1024 bytes of the file. We add an extra
501 499 // 30 characters to leave room for the startxref stuff.
502   - m->file->seek(0, SEEK_END);
503   - qpdf_offset_t end_offset = m->file->tell();
  500 + file->seek(0, SEEK_END);
  501 + qpdf_offset_t end_offset = file->tell();
504 502 max_offset = end_offset;
505 503 // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic
506 504 // scenarios at least 3 bytes are required.
... ... @@ -510,8 +508,8 @@ QPDF::Xref_table::initialize()
510 508 qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
511 509 PatternFinder sf(qpdf, &QPDF::findStartxref);
512 510 qpdf_offset_t xref_offset = 0;
513   - if (m->file->findLast("startxref", start_offset, 0, sf)) {
514   - xref_offset = QUtil::string_to_ll(qpdf.readToken(*m->file).getValue().c_str());
  511 + if (file->findLast("startxref", start_offset, 0, sf)) {
  512 + xref_offset = QUtil::string_to_ll(read_token().getValue().c_str());
515 513 }
516 514  
517 515 try {
... ... @@ -547,11 +545,9 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
547 545 throw e;
548 546 }
549 547  
550   - auto* m = qpdf.m.get();
551   -
552 548 // If recovery generates more than 1000 warnings, the file is so severely damaged that there
553 549 // probably is no point trying to continue.
554   - const auto max_warnings = m->warnings.size() + 1000U;
  550 + const auto max_warnings = qpdf.m->warnings.size() + 1000U;
555 551 auto check_warnings = [this, max_warnings]() {
556 552 if (qpdf.m->warnings.size() > max_warnings) {
557 553 throw damaged_pdf("too many errors while reconstructing cross-reference table");
... ... @@ -560,7 +556,7 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
560 556  
561 557 reconstructed = true;
562 558 // We may find more objects, which may contain dangling references.
563   - m->fixed_dangling_refs = false;
  559 + qpdf.m->fixed_dangling_refs = false;
564 560  
565 561 warn_damaged("file is damaged");
566 562 qpdf.warn(e);
... ... @@ -577,18 +573,18 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
577 573 erase(iter);
578 574 }
579 575  
580   - m->file->seek(0, SEEK_END);
581   - qpdf_offset_t eof = m->file->tell();
582   - m->file->seek(0, SEEK_SET);
  576 + file->seek(0, SEEK_END);
  577 + qpdf_offset_t eof = file->tell();
  578 + file->seek(0, SEEK_SET);
583 579 // Don't allow very long tokens here during recovery. All the interesting tokens are covered.
584 580 static size_t const MAX_LEN = 10;
585   - while (m->file->tell() < eof) {
586   - QPDFTokenizer::Token t1 = qpdf.readToken(*m->file, MAX_LEN);
587   - qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length());
  581 + while (file->tell() < eof) {
  582 + QPDFTokenizer::Token t1 = read_token(MAX_LEN);
  583 + qpdf_offset_t token_start = file->tell() - toO(t1.getValue().length());
588 584 if (t1.isInteger()) {
589   - auto pos = m->file->tell();
590   - QPDFTokenizer::Token t2 = qpdf.readToken(*m->file, MAX_LEN);
591   - if (t2.isInteger() && qpdf.readToken(*m->file, MAX_LEN).isWord("obj")) {
  585 + auto pos = file->tell();
  586 + QPDFTokenizer::Token t2 = read_token(MAX_LEN);
  587 + if (t2.isInteger() && read_token(MAX_LEN).isWord("obj")) {
592 588 int obj = QUtil::string_to_int(t1.getValue().c_str());
593 589 int gen = QUtil::string_to_int(t2.getValue().c_str());
594 590 if (obj <= max_id) {
... ... @@ -597,19 +593,19 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
597 593 warn_damaged("ignoring object with impossibly large id " + std::to_string(obj));
598 594 }
599 595 }
600   - m->file->seek(pos, SEEK_SET);
  596 + file->seek(pos, SEEK_SET);
601 597 } else if (!trailer && t1.isWord("trailer")) {
602   - auto pos = m->file->tell();
  598 + auto pos = file->tell();
603 599 QPDFObjectHandle t = qpdf.readTrailer();
604 600 if (!t.isDictionary()) {
605 601 // Oh well. It was worth a try.
606 602 } else {
607 603 trailer = t;
608 604 }
609   - m->file->seek(pos, SEEK_SET);
  605 + file->seek(pos, SEEK_SET);
610 606 }
611 607 check_warnings();
612   - m->file->findAndSkipNextEOL();
  608 + file->findAndSkipNextEOL();
613 609 }
614 610 deleted_objects.clear();
615 611  
... ... @@ -664,7 +660,7 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
664 660 parsed = true;
665 661 qpdf.getAllPages();
666 662 check_warnings();
667   - if (m->all_pages.empty()) {
  663 + if (qpdf.m->all_pages.empty()) {
668 664 parsed = false;
669 665 throw damaged_pdf("unable to find any pages while recovering damaged file");
670 666 }
... ... @@ -679,15 +675,13 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
679 675 void
680 676 QPDF::Xref_table::read(qpdf_offset_t xref_offset)
681 677 {
682   - auto* m = qpdf.m.get();
683   -
684 678 std::map<int, int> free_table;
685 679 std::set<qpdf_offset_t> visited;
686 680 while (xref_offset) {
687 681 visited.insert(xref_offset);
688 682 char buf[7];
689 683 memset(buf, 0, sizeof(buf));
690   - m->file->seek(xref_offset, SEEK_SET);
  684 + file->seek(xref_offset, SEEK_SET);
691 685 // Some files miss the mark a little with startxref. We could do a better job of searching
692 686 // in the neighborhood for something that looks like either an xref table or stream, but the
693 687 // simple heuristic of skipping whitespace can help with the xref table case and is harmless
... ... @@ -696,11 +690,11 @@ QPDF::Xref_table::read(qpdf_offset_t xref_offset)
696 690 bool skipped_space = false;
697 691 while (!done) {
698 692 char ch;
699   - if (1 == m->file->read(&ch, 1)) {
  693 + if (1 == file->read(&ch, 1)) {
700 694 if (QUtil::is_space(ch)) {
701 695 skipped_space = true;
702 696 } else {
703   - m->file->unreadCh(ch);
  697 + file->unreadCh(ch);
704 698 done = true;
705 699 }
706 700 } else {
... ... @@ -709,7 +703,7 @@ QPDF::Xref_table::read(qpdf_offset_t xref_offset)
709 703 }
710 704 }
711 705  
712   - m->file->read(buf, sizeof(buf) - 1);
  706 + file->read(buf, sizeof(buf) - 1);
713 707 // The PDF spec says xref must be followed by a line terminator, but files exist in the wild
714 708 // where it is terminated by arbitrary whitespace.
715 709 if ((strncmp(buf, "xref", 4) == 0) && QUtil::is_space(buf[4])) {
... ... @@ -823,11 +817,9 @@ QPDF::Xref_table::parse_first(std::string const& line, int& obj, int& num, int&
823 817 bool
824 818 QPDF::Xref_table::read_bad_entry(qpdf_offset_t& f1, int& f2, char& type)
825 819 {
826   - auto* m = qpdf.m.get();
827   -
828 820 // Reposition after initial read attempt and reread.
829   - m->file->seek(m->file->getLastOffset(), SEEK_SET);
830   - auto line = m->file->readLine(30);
  821 + file->seek(file->getLastOffset(), SEEK_SET);
  822 + auto line = file->readLine(30);
831 823  
832 824 // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
833 825 // buffer.
... ... @@ -907,10 +899,8 @@ QPDF::Xref_table::read_bad_entry(qpdf_offset_t& f1, int& f2, char& type)
907 899 bool
908 900 QPDF::Xref_table::read_entry(qpdf_offset_t& f1, int& f2, char& type)
909 901 {
910   - auto* m = qpdf.m.get();
911   -
912 902 std::array<char, 21> line;
913   - if (m->file->read(line.data(), 20) != 20) {
  903 + if (file->read(line.data(), 20) != 20) {
914 904 // C++20: [[unlikely]]
915 905 return false;
916 906 }
... ... @@ -963,13 +953,11 @@ QPDF::Xref_table::read_entry(qpdf_offset_t& f1, int& f2, char& type)
963 953 qpdf_offset_t
964 954 QPDF::Xref_table::read_table(qpdf_offset_t xref_offset)
965 955 {
966   - auto* m = qpdf.m.get();
967   -
968   - m->file->seek(xref_offset, SEEK_SET);
  956 + file->seek(xref_offset, SEEK_SET);
969 957 std::string line;
970 958 while (true) {
971 959 line.assign(50, '\0');
972   - m->file->read(line.data(), line.size());
  960 + file->read(line.data(), line.size());
973 961 int obj = 0;
974 962 int num = 0;
975 963 int bytes = 0;
... ... @@ -977,11 +965,11 @@ QPDF::Xref_table::read_table(qpdf_offset_t xref_offset)
977 965 QTC::TC("qpdf", "QPDF invalid xref");
978 966 throw damaged_table("xref syntax invalid");
979 967 }
980   - m->file->seek(m->file->getLastOffset() + bytes, SEEK_SET);
  968 + file->seek(file->getLastOffset() + bytes, SEEK_SET);
981 969 for (qpdf_offset_t i = obj; i - num < obj; ++i) {
982 970 if (i == 0) {
983 971 // This is needed by checkLinearization()
984   - first_item_offset = m->file->tell();
  972 + first_item_offset = file->tell();
985 973 }
986 974 // For xref_table, these will always be small enough to be ints
987 975 qpdf_offset_t f1 = 0;
... ... @@ -997,11 +985,11 @@ QPDF::Xref_table::read_table(qpdf_offset_t xref_offset)
997 985 insert(toI(i), 1, f1, f2);
998 986 }
999 987 }
1000   - qpdf_offset_t pos = m->file->tell();
1001   - if (qpdf.readToken(*m->file).isWord("trailer")) {
  988 + qpdf_offset_t pos = file->tell();
  989 + if (read_token().isWord("trailer")) {
1002 990 break;
1003 991 } else {
1004   - m->file->seek(pos, SEEK_SET);
  992 + file->seek(pos, SEEK_SET);
1005 993 }
1006 994 }
1007 995  
... ...
libqpdf/qpdf/QPDF_private.hh
... ... @@ -7,9 +7,11 @@
7 7 class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
8 8 {
9 9 public:
10   - Xref_table(QPDF& qpdf) :
11   - qpdf(qpdf)
  10 + Xref_table(QPDF& qpdf, InputSource* const& file) :
  11 + qpdf(qpdf),
  12 + file(file)
12 13 {
  14 + tokenizer.allowEOF();
13 15 }
14 16  
15 17 void initialize();
... ... @@ -50,6 +52,12 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
50 52 int max_num_entries,
51 53 std::function<QPDFExc(std::string_view)> damaged);
52 54  
  55 + QPDFTokenizer::Token
  56 + read_token(size_t max_len = 0)
  57 + {
  58 + return tokenizer.readToken(*file, "", true, max_len);
  59 + }
  60 +
53 61 // Methods to insert table entries
54 62 void insert_reconstructed(int obj, qpdf_offset_t f1, int f2);
55 63 void insert(int obj, int f0, qpdf_offset_t f1, int f2);
... ... @@ -72,7 +80,10 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
72 80 {
73 81 qpdf.warn(damaged_pdf(msg));
74 82 }
  83 +
75 84 QPDF& qpdf;
  85 + InputSource* const& file;
  86 + QPDFTokenizer tokenizer;
76 87 };
77 88  
78 89 // Writer class is restricted to QPDFWriter so that only it can call certain methods.
... ...