Commit 1e2dcbf03e8f1114b42c1237b536019fff2e1f4c

Authored by m-holger
1 parent b1d845e7

Add QPDF::Xref_table members file and tokenizer

libqpdf/QPDF.cc
@@ -201,7 +201,7 @@ QPDF::Members::Members(QPDF& qpdf) : @@ -201,7 +201,7 @@ QPDF::Members::Members(QPDF& qpdf) :
201 file_sp(new InvalidInputSource()), 201 file_sp(new InvalidInputSource()),
202 file(file_sp.get()), 202 file(file_sp.get()),
203 encp(new EncryptionParameters), 203 encp(new EncryptionParameters),
204 - xref_table(qpdf) 204 + xref_table(qpdf, file)
205 { 205 {
206 } 206 }
207 207
@@ -495,12 +495,10 @@ QPDF::warn( @@ -495,12 +495,10 @@ QPDF::warn(
495 void 495 void
496 QPDF::Xref_table::initialize() 496 QPDF::Xref_table::initialize()
497 { 497 {
498 - auto* m = qpdf.m.get();  
499 -  
500 // PDF spec says %%EOF must be found within the last 1024 bytes of the file. We add an extra 498 // PDF spec says %%EOF must be found within the last 1024 bytes of the file. We add an extra
501 // 30 characters to leave room for the startxref stuff. 499 // 30 characters to leave room for the startxref stuff.
502 - m->file->seek(0, SEEK_END);  
503 - qpdf_offset_t end_offset = m->file->tell(); 500 + file->seek(0, SEEK_END);
  501 + qpdf_offset_t end_offset = file->tell();
504 max_offset = end_offset; 502 max_offset = end_offset;
505 // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic 503 // Sanity check on object ids. All objects must appear in xref table / stream. In all realistic
506 // scenarios at least 3 bytes are required. 504 // scenarios at least 3 bytes are required.
@@ -510,8 +508,8 @@ QPDF::Xref_table::initialize() @@ -510,8 +508,8 @@ QPDF::Xref_table::initialize()
510 qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0); 508 qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
511 PatternFinder sf(qpdf, &QPDF::findStartxref); 509 PatternFinder sf(qpdf, &QPDF::findStartxref);
512 qpdf_offset_t xref_offset = 0; 510 qpdf_offset_t xref_offset = 0;
513 - if (m->file->findLast("startxref", start_offset, 0, sf)) {  
514 - xref_offset = QUtil::string_to_ll(qpdf.readToken(*m->file).getValue().c_str()); 511 + if (file->findLast("startxref", start_offset, 0, sf)) {
  512 + xref_offset = QUtil::string_to_ll(read_token().getValue().c_str());
515 } 513 }
516 514
517 try { 515 try {
@@ -547,11 +545,9 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) @@ -547,11 +545,9 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
547 throw e; 545 throw e;
548 } 546 }
549 547
550 - auto* m = qpdf.m.get();  
551 -  
552 // If recovery generates more than 1000 warnings, the file is so severely damaged that there 548 // If recovery generates more than 1000 warnings, the file is so severely damaged that there
553 // probably is no point trying to continue. 549 // probably is no point trying to continue.
554 - const auto max_warnings = m->warnings.size() + 1000U; 550 + const auto max_warnings = qpdf.m->warnings.size() + 1000U;
555 auto check_warnings = [this, max_warnings]() { 551 auto check_warnings = [this, max_warnings]() {
556 if (qpdf.m->warnings.size() > max_warnings) { 552 if (qpdf.m->warnings.size() > max_warnings) {
557 throw damaged_pdf("too many errors while reconstructing cross-reference table"); 553 throw damaged_pdf("too many errors while reconstructing cross-reference table");
@@ -560,7 +556,7 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) @@ -560,7 +556,7 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
560 556
561 reconstructed = true; 557 reconstructed = true;
562 // We may find more objects, which may contain dangling references. 558 // We may find more objects, which may contain dangling references.
563 - m->fixed_dangling_refs = false; 559 + qpdf.m->fixed_dangling_refs = false;
564 560
565 warn_damaged("file is damaged"); 561 warn_damaged("file is damaged");
566 qpdf.warn(e); 562 qpdf.warn(e);
@@ -577,18 +573,18 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) @@ -577,18 +573,18 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
577 erase(iter); 573 erase(iter);
578 } 574 }
579 575
580 - m->file->seek(0, SEEK_END);  
581 - qpdf_offset_t eof = m->file->tell();  
582 - m->file->seek(0, SEEK_SET); 576 + file->seek(0, SEEK_END);
  577 + qpdf_offset_t eof = file->tell();
  578 + file->seek(0, SEEK_SET);
583 // Don't allow very long tokens here during recovery. All the interesting tokens are covered. 579 // Don't allow very long tokens here during recovery. All the interesting tokens are covered.
584 static size_t const MAX_LEN = 10; 580 static size_t const MAX_LEN = 10;
585 - while (m->file->tell() < eof) {  
586 - QPDFTokenizer::Token t1 = qpdf.readToken(*m->file, MAX_LEN);  
587 - qpdf_offset_t token_start = m->file->tell() - toO(t1.getValue().length()); 581 + while (file->tell() < eof) {
  582 + QPDFTokenizer::Token t1 = read_token(MAX_LEN);
  583 + qpdf_offset_t token_start = file->tell() - toO(t1.getValue().length());
588 if (t1.isInteger()) { 584 if (t1.isInteger()) {
589 - auto pos = m->file->tell();  
590 - QPDFTokenizer::Token t2 = qpdf.readToken(*m->file, MAX_LEN);  
591 - if (t2.isInteger() && qpdf.readToken(*m->file, MAX_LEN).isWord("obj")) { 585 + auto pos = file->tell();
  586 + QPDFTokenizer::Token t2 = read_token(MAX_LEN);
  587 + if (t2.isInteger() && read_token(MAX_LEN).isWord("obj")) {
592 int obj = QUtil::string_to_int(t1.getValue().c_str()); 588 int obj = QUtil::string_to_int(t1.getValue().c_str());
593 int gen = QUtil::string_to_int(t2.getValue().c_str()); 589 int gen = QUtil::string_to_int(t2.getValue().c_str());
594 if (obj <= max_id) { 590 if (obj <= max_id) {
@@ -597,19 +593,19 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) @@ -597,19 +593,19 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
597 warn_damaged("ignoring object with impossibly large id " + std::to_string(obj)); 593 warn_damaged("ignoring object with impossibly large id " + std::to_string(obj));
598 } 594 }
599 } 595 }
600 - m->file->seek(pos, SEEK_SET); 596 + file->seek(pos, SEEK_SET);
601 } else if (!trailer && t1.isWord("trailer")) { 597 } else if (!trailer && t1.isWord("trailer")) {
602 - auto pos = m->file->tell(); 598 + auto pos = file->tell();
603 QPDFObjectHandle t = qpdf.readTrailer(); 599 QPDFObjectHandle t = qpdf.readTrailer();
604 if (!t.isDictionary()) { 600 if (!t.isDictionary()) {
605 // Oh well. It was worth a try. 601 // Oh well. It was worth a try.
606 } else { 602 } else {
607 trailer = t; 603 trailer = t;
608 } 604 }
609 - m->file->seek(pos, SEEK_SET); 605 + file->seek(pos, SEEK_SET);
610 } 606 }
611 check_warnings(); 607 check_warnings();
612 - m->file->findAndSkipNextEOL(); 608 + file->findAndSkipNextEOL();
613 } 609 }
614 deleted_objects.clear(); 610 deleted_objects.clear();
615 611
@@ -664,7 +660,7 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) @@ -664,7 +660,7 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
664 parsed = true; 660 parsed = true;
665 qpdf.getAllPages(); 661 qpdf.getAllPages();
666 check_warnings(); 662 check_warnings();
667 - if (m->all_pages.empty()) { 663 + if (qpdf.m->all_pages.empty()) {
668 parsed = false; 664 parsed = false;
669 throw damaged_pdf("unable to find any pages while recovering damaged file"); 665 throw damaged_pdf("unable to find any pages while recovering damaged file");
670 } 666 }
@@ -679,15 +675,13 @@ QPDF::Xref_table::reconstruct(QPDFExc& e) @@ -679,15 +675,13 @@ QPDF::Xref_table::reconstruct(QPDFExc& e)
679 void 675 void
680 QPDF::Xref_table::read(qpdf_offset_t xref_offset) 676 QPDF::Xref_table::read(qpdf_offset_t xref_offset)
681 { 677 {
682 - auto* m = qpdf.m.get();  
683 -  
684 std::map<int, int> free_table; 678 std::map<int, int> free_table;
685 std::set<qpdf_offset_t> visited; 679 std::set<qpdf_offset_t> visited;
686 while (xref_offset) { 680 while (xref_offset) {
687 visited.insert(xref_offset); 681 visited.insert(xref_offset);
688 char buf[7]; 682 char buf[7];
689 memset(buf, 0, sizeof(buf)); 683 memset(buf, 0, sizeof(buf));
690 - m->file->seek(xref_offset, SEEK_SET); 684 + file->seek(xref_offset, SEEK_SET);
691 // Some files miss the mark a little with startxref. We could do a better job of searching 685 // Some files miss the mark a little with startxref. We could do a better job of searching
692 // in the neighborhood for something that looks like either an xref table or stream, but the 686 // in the neighborhood for something that looks like either an xref table or stream, but the
693 // simple heuristic of skipping whitespace can help with the xref table case and is harmless 687 // simple heuristic of skipping whitespace can help with the xref table case and is harmless
@@ -696,11 +690,11 @@ QPDF::Xref_table::read(qpdf_offset_t xref_offset) @@ -696,11 +690,11 @@ QPDF::Xref_table::read(qpdf_offset_t xref_offset)
696 bool skipped_space = false; 690 bool skipped_space = false;
697 while (!done) { 691 while (!done) {
698 char ch; 692 char ch;
699 - if (1 == m->file->read(&ch, 1)) { 693 + if (1 == file->read(&ch, 1)) {
700 if (QUtil::is_space(ch)) { 694 if (QUtil::is_space(ch)) {
701 skipped_space = true; 695 skipped_space = true;
702 } else { 696 } else {
703 - m->file->unreadCh(ch); 697 + file->unreadCh(ch);
704 done = true; 698 done = true;
705 } 699 }
706 } else { 700 } else {
@@ -709,7 +703,7 @@ QPDF::Xref_table::read(qpdf_offset_t xref_offset) @@ -709,7 +703,7 @@ QPDF::Xref_table::read(qpdf_offset_t xref_offset)
709 } 703 }
710 } 704 }
711 705
712 - m->file->read(buf, sizeof(buf) - 1); 706 + file->read(buf, sizeof(buf) - 1);
713 // The PDF spec says xref must be followed by a line terminator, but files exist in the wild 707 // The PDF spec says xref must be followed by a line terminator, but files exist in the wild
714 // where it is terminated by arbitrary whitespace. 708 // where it is terminated by arbitrary whitespace.
715 if ((strncmp(buf, "xref", 4) == 0) && QUtil::is_space(buf[4])) { 709 if ((strncmp(buf, "xref", 4) == 0) && QUtil::is_space(buf[4])) {
@@ -823,11 +817,9 @@ QPDF::Xref_table::parse_first(std::string const& line, int& obj, int& num, int& @@ -823,11 +817,9 @@ QPDF::Xref_table::parse_first(std::string const& line, int& obj, int& num, int&
823 bool 817 bool
824 QPDF::Xref_table::read_bad_entry(qpdf_offset_t& f1, int& f2, char& type) 818 QPDF::Xref_table::read_bad_entry(qpdf_offset_t& f1, int& f2, char& type)
825 { 819 {
826 - auto* m = qpdf.m.get();  
827 -  
828 // Reposition after initial read attempt and reread. 820 // Reposition after initial read attempt and reread.
829 - m->file->seek(m->file->getLastOffset(), SEEK_SET);  
830 - auto line = m->file->readLine(30); 821 + file->seek(file->getLastOffset(), SEEK_SET);
  822 + auto line = file->readLine(30);
831 823
832 // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated 824 // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
833 // buffer. 825 // buffer.
@@ -907,10 +899,8 @@ QPDF::Xref_table::read_bad_entry(qpdf_offset_t& f1, int& f2, char& type) @@ -907,10 +899,8 @@ QPDF::Xref_table::read_bad_entry(qpdf_offset_t& f1, int& f2, char& type)
907 bool 899 bool
908 QPDF::Xref_table::read_entry(qpdf_offset_t& f1, int& f2, char& type) 900 QPDF::Xref_table::read_entry(qpdf_offset_t& f1, int& f2, char& type)
909 { 901 {
910 - auto* m = qpdf.m.get();  
911 -  
912 std::array<char, 21> line; 902 std::array<char, 21> line;
913 - if (m->file->read(line.data(), 20) != 20) { 903 + if (file->read(line.data(), 20) != 20) {
914 // C++20: [[unlikely]] 904 // C++20: [[unlikely]]
915 return false; 905 return false;
916 } 906 }
@@ -963,13 +953,11 @@ QPDF::Xref_table::read_entry(qpdf_offset_t& f1, int& f2, char& type) @@ -963,13 +953,11 @@ QPDF::Xref_table::read_entry(qpdf_offset_t& f1, int& f2, char& type)
963 qpdf_offset_t 953 qpdf_offset_t
964 QPDF::Xref_table::read_table(qpdf_offset_t xref_offset) 954 QPDF::Xref_table::read_table(qpdf_offset_t xref_offset)
965 { 955 {
966 - auto* m = qpdf.m.get();  
967 -  
968 - m->file->seek(xref_offset, SEEK_SET); 956 + file->seek(xref_offset, SEEK_SET);
969 std::string line; 957 std::string line;
970 while (true) { 958 while (true) {
971 line.assign(50, '\0'); 959 line.assign(50, '\0');
972 - m->file->read(line.data(), line.size()); 960 + file->read(line.data(), line.size());
973 int obj = 0; 961 int obj = 0;
974 int num = 0; 962 int num = 0;
975 int bytes = 0; 963 int bytes = 0;
@@ -977,11 +965,11 @@ QPDF::Xref_table::read_table(qpdf_offset_t xref_offset) @@ -977,11 +965,11 @@ QPDF::Xref_table::read_table(qpdf_offset_t xref_offset)
977 QTC::TC("qpdf", "QPDF invalid xref"); 965 QTC::TC("qpdf", "QPDF invalid xref");
978 throw damaged_table("xref syntax invalid"); 966 throw damaged_table("xref syntax invalid");
979 } 967 }
980 - m->file->seek(m->file->getLastOffset() + bytes, SEEK_SET); 968 + file->seek(file->getLastOffset() + bytes, SEEK_SET);
981 for (qpdf_offset_t i = obj; i - num < obj; ++i) { 969 for (qpdf_offset_t i = obj; i - num < obj; ++i) {
982 if (i == 0) { 970 if (i == 0) {
983 // This is needed by checkLinearization() 971 // This is needed by checkLinearization()
984 - first_item_offset = m->file->tell(); 972 + first_item_offset = file->tell();
985 } 973 }
986 // For xref_table, these will always be small enough to be ints 974 // For xref_table, these will always be small enough to be ints
987 qpdf_offset_t f1 = 0; 975 qpdf_offset_t f1 = 0;
@@ -997,11 +985,11 @@ QPDF::Xref_table::read_table(qpdf_offset_t xref_offset) @@ -997,11 +985,11 @@ QPDF::Xref_table::read_table(qpdf_offset_t xref_offset)
997 insert(toI(i), 1, f1, f2); 985 insert(toI(i), 1, f1, f2);
998 } 986 }
999 } 987 }
1000 - qpdf_offset_t pos = m->file->tell();  
1001 - if (qpdf.readToken(*m->file).isWord("trailer")) { 988 + qpdf_offset_t pos = file->tell();
  989 + if (read_token().isWord("trailer")) {
1002 break; 990 break;
1003 } else { 991 } else {
1004 - m->file->seek(pos, SEEK_SET); 992 + file->seek(pos, SEEK_SET);
1005 } 993 }
1006 } 994 }
1007 995
libqpdf/qpdf/QPDF_private.hh
@@ -7,9 +7,11 @@ @@ -7,9 +7,11 @@
7 class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry> 7 class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
8 { 8 {
9 public: 9 public:
10 - Xref_table(QPDF& qpdf) :  
11 - qpdf(qpdf) 10 + Xref_table(QPDF& qpdf, InputSource* const& file) :
  11 + qpdf(qpdf),
  12 + file(file)
12 { 13 {
  14 + tokenizer.allowEOF();
13 } 15 }
14 16
15 void initialize(); 17 void initialize();
@@ -50,6 +52,12 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry> @@ -50,6 +52,12 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
50 int max_num_entries, 52 int max_num_entries,
51 std::function<QPDFExc(std::string_view)> damaged); 53 std::function<QPDFExc(std::string_view)> damaged);
52 54
  55 + QPDFTokenizer::Token
  56 + read_token(size_t max_len = 0)
  57 + {
  58 + return tokenizer.readToken(*file, "", true, max_len);
  59 + }
  60 +
53 // Methods to insert table entries 61 // Methods to insert table entries
54 void insert_reconstructed(int obj, qpdf_offset_t f1, int f2); 62 void insert_reconstructed(int obj, qpdf_offset_t f1, int f2);
55 void insert(int obj, int f0, qpdf_offset_t f1, int f2); 63 void insert(int obj, int f0, qpdf_offset_t f1, int f2);
@@ -72,7 +80,10 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry> @@ -72,7 +80,10 @@ class QPDF::Xref_table: public std::map<QPDFObjGen, QPDFXRefEntry>
72 { 80 {
73 qpdf.warn(damaged_pdf(msg)); 81 qpdf.warn(damaged_pdf(msg));
74 } 82 }
  83 +
75 QPDF& qpdf; 84 QPDF& qpdf;
  85 + InputSource* const& file;
  86 + QPDFTokenizer tokenizer;
76 }; 87 };
77 88
78 // Writer class is restricted to QPDFWriter so that only it can call certain methods. 89 // Writer class is restricted to QPDFWriter so that only it can call certain methods.