Commit 30f109e244f365111d5219903f13d64cf1a95054

Authored by Jay Berkenbilt
1 parent 98a843c2

Read xref table without PCRE

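Also accept more kinds of recoverable xref table errors than before (leading spaces, extra spaces between fields, wrong field widths), issuing a warning for each accepted entry.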
ChangeLog
1 2017-08-10 Jay Berkenbilt <ejb@ql.org> 1 2017-08-10 Jay Berkenbilt <ejb@ql.org>
2 2
  3 + * Be more forgiving of certain types of errors in the xref table
  4 + that don't interfere with interpreting the table.
  5 +
3 * Remove unused "tracing" parameter from PointerHolder's 6 * Remove unused "tracing" parameter from PointerHolder's
4 (T*, bool) constructor. This change breaks source code 7 (T*, bool) constructor. This change breaks source code
5 compatibility, but since this argument to PointerHolder has not 8 compatibility, but since this argument to PointerHolder has not
include/qpdf/QPDF.hh
@@ -652,6 +652,10 @@ class QPDF @@ -652,6 +652,10 @@ class QPDF
652 void setTrailer(QPDFObjectHandle obj); 652 void setTrailer(QPDFObjectHandle obj);
653 void read_xref(qpdf_offset_t offset); 653 void read_xref(qpdf_offset_t offset);
654 void reconstruct_xref(QPDFExc& e); 654 void reconstruct_xref(QPDFExc& e);
  655 + bool parse_xrefFirst(std::string const& line,
  656 + int& obj, int& num, int& bytes);
  657 + bool parse_xrefEntry(std::string const& line,
  658 + qpdf_offset_t& f1, int& f2, char& type);
655 qpdf_offset_t read_xrefTable(qpdf_offset_t offset); 659 qpdf_offset_t read_xrefTable(qpdf_offset_t offset);
656 qpdf_offset_t read_xrefStream(qpdf_offset_t offset); 660 qpdf_offset_t read_xrefStream(qpdf_offset_t offset);
657 qpdf_offset_t processXRefStream( 661 qpdf_offset_t processXRefStream(
libqpdf/QPDF.cc
@@ -9,7 +9,6 @@ @@ -9,7 +9,6 @@
9 9
10 #include <qpdf/QTC.hh> 10 #include <qpdf/QTC.hh>
11 #include <qpdf/QUtil.hh> 11 #include <qpdf/QUtil.hh>
12 -#include <qpdf/PCRE.hh>  
13 #include <qpdf/Pipeline.hh> 12 #include <qpdf/Pipeline.hh>
14 #include <qpdf/Pl_Discard.hh> 13 #include <qpdf/Pl_Discard.hh>
15 #include <qpdf/FileInputSource.hh> 14 #include <qpdf/FileInputSource.hh>
@@ -537,12 +536,162 @@ QPDF::read_xref(qpdf_offset_t xref_offset) @@ -537,12 +536,162 @@ QPDF::read_xref(qpdf_offset_t xref_offset)
537 this->deleted_objects.clear(); 536 this->deleted_objects.clear();
538 } 537 }
539 538
  539 +bool
  540 +QPDF::parse_xrefFirst(std::string const& line,
  541 + int& obj, int& num, int& bytes)
  542 +{
  543 + // is_space and is_digit both return false on '\0', so this will
  544 + // not overrun the null-terminated buffer.
  545 + char const* p = line.c_str();
  546 + char const* start = line.c_str();
  547 +
  548 + // Skip zero or more spaces
  549 + while (QUtil::is_space(*p))
  550 + {
  551 + ++p;
  552 + }
  553 + // Require digit
  554 + if (! QUtil::is_digit(*p))
  555 + {
  556 + return false;
  557 + }
  558 + // Gather digits
  559 + std::string obj_str;
  560 + while (QUtil::is_digit(*p))
  561 + {
  562 + obj_str.append(1, *p++);
  563 + }
  564 + // Require space
  565 + if (! QUtil::is_space(*p))
  566 + {
  567 + return false;
  568 + }
  569 + // Skip spaces
  570 + while (QUtil::is_space(*p))
  571 + {
  572 + ++p;
  573 + }
  574 + // Require digit
  575 + if (! QUtil::is_digit(*p))
  576 + {
  577 + return false;
  578 + }
  579 + // Gather digits
  580 + std::string num_str;
  581 + while (QUtil::is_digit(*p))
  582 + {
  583 + num_str.append(1, *p++);
  584 + }
  585 + // Skip any space including line terminators
  586 + while (QUtil::is_space(*p))
  587 + {
  588 + ++p;
  589 + }
  590 + bytes = p - start;
  591 + obj = atoi(obj_str.c_str());
  592 + num = atoi(num_str.c_str());
  593 + return true;
  594 +}
  595 +
  596 +bool
  597 +QPDF::parse_xrefEntry(std::string const& line,
  598 + qpdf_offset_t& f1, int& f2, char& type)
  599 +{
  600 + // is_space and is_digit both return false on '\0', so this will
  601 + // not overrun the null-terminated buffer.
  602 + char const* p = line.c_str();
  603 +
  604 + // Skip zero or more spaces. There aren't supposed to be any.
  605 + bool invalid = false;
  606 + while (QUtil::is_space(*p))
  607 + {
  608 + ++p;
  609 + QTC::TC("qpdf", "QPDF ignore first space in xref entry");
  610 + invalid = true;
  611 + }
  612 + // Require digit
  613 + if (! QUtil::is_digit(*p))
  614 + {
  615 + return false;
  616 + }
  617 + // Gather digits
  618 + std::string f1_str;
  619 + while (QUtil::is_digit(*p))
  620 + {
  621 + f1_str.append(1, *p++);
  622 + }
  623 + // Require space
  624 + if (! QUtil::is_space(*p))
  625 + {
  626 + return false;
  627 + }
  628 + if (QUtil::is_space(*(p+1)))
  629 + {
  630 + QTC::TC("qpdf", "QPDF ignore first extra space in xref entry");
  631 + invalid = true;
  632 + }
  633 + // Skip spaces
  634 + while (QUtil::is_space(*p))
  635 + {
  636 + ++p;
  637 + }
  638 + // Require digit
  639 + if (! QUtil::is_digit(*p))
  640 + {
  641 + return false;
  642 + }
  643 + // Gather digits
  644 + std::string f2_str;
  645 + while (QUtil::is_digit(*p))
  646 + {
  647 + f2_str.append(1, *p++);
  648 + }
  649 + // Require space
  650 + if (! QUtil::is_space(*p))
  651 + {
  652 + return false;
  653 + }
  654 + if (QUtil::is_space(*(p+1)))
  655 + {
  656 + QTC::TC("qpdf", "QPDF ignore second extra space in xref entry");
  657 + invalid = true;
  658 + }
  659 + // Skip spaces
  660 + while (QUtil::is_space(*p))
  661 + {
  662 + ++p;
  663 + }
  664 + if ((*p == 'f') || (*p == 'n'))
  665 + {
  666 + type = *p;
  667 + }
  668 + else
  669 + {
  670 + return false;
  671 + }
  672 + if ((f1_str.length() != 10) || (f2_str.length() != 5))
  673 + {
  674 + QTC::TC("qpdf", "QPDF ignore length error xref entry");
  675 + invalid = true;
  676 + }
  677 +
  678 + if (invalid)
  679 + {
  680 + warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(),
  681 + "xref table",
  682 + this->file->getLastOffset(),
  683 + "accepting invalid xref table entry"));
  684 + }
  685 +
  686 + f1 = QUtil::string_to_ll(f1_str.c_str());
  687 + f2 = atoi(f2_str.c_str());
  688 +
  689 + return true;
  690 +}
  691 +
540 qpdf_offset_t 692 qpdf_offset_t
541 QPDF::read_xrefTable(qpdf_offset_t xref_offset) 693 QPDF::read_xrefTable(qpdf_offset_t xref_offset)
542 { 694 {
543 - PCRE xref_first_re("^\\s*(\\d+)\\s+(\\d+)\\s*");  
544 - PCRE xref_entry_re("(?s:(^\\d{10}) (\\d{5}) ([fn])\\s*$)");  
545 -  
546 std::vector<QPDFObjGen> deleted_items; 695 std::vector<QPDFObjGen> deleted_items;
547 696
548 this->file->seek(xref_offset, SEEK_SET); 697 this->file->seek(xref_offset, SEEK_SET);
@@ -553,18 +702,17 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) @@ -553,18 +702,17 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset)
553 memset(linebuf, 0, sizeof(linebuf)); 702 memset(linebuf, 0, sizeof(linebuf));
554 this->file->read(linebuf, sizeof(linebuf) - 1); 703 this->file->read(linebuf, sizeof(linebuf) - 1);
555 std::string line = linebuf; 704 std::string line = linebuf;
556 - PCRE::Match m1 = xref_first_re.match(line.c_str());  
557 - if (! m1) 705 + int obj = 0;
  706 + int num = 0;
  707 + int bytes = 0;
  708 + if (! parse_xrefFirst(line, obj, num, bytes))
558 { 709 {
559 QTC::TC("qpdf", "QPDF invalid xref"); 710 QTC::TC("qpdf", "QPDF invalid xref");
560 throw QPDFExc(qpdf_e_damaged_pdf, this->file->getName(), 711 throw QPDFExc(qpdf_e_damaged_pdf, this->file->getName(),
561 "xref table", this->file->getLastOffset(), 712 "xref table", this->file->getLastOffset(),
562 "xref syntax invalid"); 713 "xref syntax invalid");
563 } 714 }
564 - file->seek(this->file->getLastOffset() + m1.getMatch(0).length(),  
565 - SEEK_SET);  
566 - int obj = atoi(m1.getMatch(1).c_str());  
567 - int num = atoi(m1.getMatch(2).c_str()); 715 + this->file->seek(this->file->getLastOffset() + bytes, SEEK_SET);
568 for (int i = obj; i < obj + num; ++i) 716 for (int i = obj; i < obj + num; ++i)
569 { 717 {
570 if (i == 0) 718 if (i == 0)
@@ -573,8 +721,11 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) @@ -573,8 +721,11 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset)
573 this->first_xref_item_offset = this->file->tell(); 721 this->first_xref_item_offset = this->file->tell();
574 } 722 }
575 std::string xref_entry = this->file->readLine(30); 723 std::string xref_entry = this->file->readLine(30);
576 - PCRE::Match m2 = xref_entry_re.match(xref_entry.c_str());  
577 - if (! m2) 724 + // For xref_table, these will always be small enough to be ints
  725 + qpdf_offset_t f1 = 0;
  726 + int f2 = 0;
  727 + char type = '\0';
  728 + if (! parse_xrefEntry(xref_entry, f1, f2, type))
578 { 729 {
579 QTC::TC("qpdf", "QPDF invalid xref entry"); 730 QTC::TC("qpdf", "QPDF invalid xref entry");
580 throw QPDFExc( 731 throw QPDFExc(
@@ -583,11 +734,6 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) @@ -583,11 +734,6 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset)
583 "invalid xref entry (obj=" + 734 "invalid xref entry (obj=" +
584 QUtil::int_to_string(i) + ")"); 735 QUtil::int_to_string(i) + ")");
585 } 736 }
586 -  
587 - // For xref_table, these will always be small enough to be ints  
588 - qpdf_offset_t f1 = QUtil::string_to_ll(m2.getMatch(1).c_str());  
589 - int f2 = atoi(m2.getMatch(2).c_str());  
590 - char type = m2.getMatch(3).at(0);  
591 if (type == 'f') 737 if (type == 'f')
592 { 738 {
593 // Save deleted items until after we've checked the 739 // Save deleted items until after we've checked the
qpdf/qpdf.testcov
@@ -289,3 +289,7 @@ qpdf single-pages %d 0 @@ -289,3 +289,7 @@ qpdf single-pages %d 0
289 qpdf single-pages .pdf 0 289 qpdf single-pages .pdf 0
290 qpdf single-pages other 0 290 qpdf single-pages other 0
291 QPDFTokenizer allowing bad token 0 291 QPDFTokenizer allowing bad token 0
  292 +QPDF ignore first space in xref entry 0
  293 +QPDF ignore first extra space in xref entry 0
  294 +QPDF ignore second extra space in xref entry 0
  295 +QPDF ignore length error xref entry 0
qpdf/qtest/qpdf.test
@@ -232,7 +232,7 @@ foreach my $d (@bug_tests) @@ -232,7 +232,7 @@ foreach my $d (@bug_tests)
232 show_ntests(); 232 show_ntests();
233 # ---------- 233 # ----------
234 $td->notify("--- Miscellaneous Tests ---"); 234 $td->notify("--- Miscellaneous Tests ---");
235 -$n_tests += 86; 235 +$n_tests += 87;
236 236
237 $td->runtest("qpdf version", 237 $td->runtest("qpdf version",
238 {$td->COMMAND => "qpdf --version"}, 238 {$td->COMMAND => "qpdf --version"},
@@ -669,6 +669,13 @@ $td->runtest("ignore bad token", @@ -669,6 +669,13 @@ $td->runtest("ignore bad token",
669 $td->EXIT_STATUS => 0}, 669 $td->EXIT_STATUS => 0},
670 $td->NORMALIZE_NEWLINES); 670 $td->NORMALIZE_NEWLINES);
671 671
  672 +$td->runtest("recoverable xref errors",
  673 + {$td->COMMAND =>
  674 + "qpdf --check --show-xref xref-errors.pdf"},
  675 + {$td->FILE => "xref-errors.out",
  676 + $td->EXIT_STATUS => 3},
  677 + $td->NORMALIZE_NEWLINES);
  678 +
672 show_ntests(); 679 show_ntests();
673 # ---------- 680 # ----------
674 $td->notify("--- Single Page ---"); 681 $td->notify("--- Single Page ---");
qpdf/qtest/qpdf/xref-errors.out 0 → 100644
  1 +WARNING: xref-errors.pdf (xref table, file position 585): accepting invalid xref table entry
  2 +WARNING: xref-errors.pdf (xref table, file position 606): accepting invalid xref table entry
  3 +WARNING: xref-errors.pdf (xref table, file position 627): accepting invalid xref table entry
  4 +WARNING: xref-errors.pdf (xref table, file position 648): accepting invalid xref table entry
  5 +WARNING: xref-errors.pdf (xref table, file position 667): accepting invalid xref table entry
  6 +checking xref-errors.pdf
  7 +PDF Version: 1.3
  8 +File is not encrypted
  9 +File is not linearized
  10 +1/0: uncompressed; offset = 9
  11 +2/0: uncompressed; offset = 63
  12 +3/0: uncompressed; offset = 135
  13 +4/0: uncompressed; offset = 307
  14 +5/0: uncompressed; offset = 403
  15 +6/0: uncompressed; offset = 438
qpdf/qtest/qpdf/xref-errors.pdf 0 → 100644
  1 +%PDF-1.3
  2 +1 0 obj
  3 +<<
  4 + /Type /Catalog
  5 + /Pages 2 0 R
  6 +>>
  7 +endobj
  8 +
  9 +2 0 obj
  10 +<<
  11 + /Type /Pages
  12 + /Kids [
  13 + 3 0 R
  14 + ]
  15 + /Count 1
  16 +>>
  17 +endobj
  18 +
  19 +3 0 obj
  20 +<<
  21 + /Type /Page
  22 + /Parent 2 0 R
  23 + /MediaBox [0 0 612 792]
  24 + /Contents 4 0 R
  25 + /Resources <<
  26 + /ProcSet 5 0 R
  27 + /Font <<
  28 + /F1 6 0 R
  29 + >>
  30 + >>
  31 +>>
  32 +endobj
  33 +
  34 +4 0 obj
  35 +<<
  36 + /Length 44
  37 +>>
  38 +stream
  39 +BT
  40 + /F1 24 Tf
  41 + 72 720 Td
  42 + (Potato) Tj
  43 +ET
  44 +endstream
  45 +endobj
  46 +
  47 +5 0 obj
  48 +[
  49 + /PDF
  50 + /Text
  51 +]
  52 +endobj
  53 +
  54 +6 0 obj
  55 +<<
  56 + /Type /Font
  57 + /Subtype /Type1
  58 + /Name /F1
  59 + /BaseFont /Helvetica
  60 + /Encoding /WinAnsiEncoding
  61 +>>
  62 +endobj
  63 +
  64 +xref
  65 +0 7
  66 +0000000000 65535 f
  67 + 0000000009 00000 n
  68 +0000000063 00000 n
  69 +0000000135 00000 n
  70 +000000307 00000 n
  71 +0000000403 0000 n
  72 +0000000438 00000 n
  73 +trailer <<
  74 + /Size 7
  75 + /Root 1 0 R
  76 +>>
  77 +startxref
  78 +556
  79 +%%EOF