Commit 30f109e244f365111d5219903f13d64cf1a95054
1 parent
98a843c2
Read xref table without PCRE
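Also tolerate more xref table errors than before: stray leading whitespace, extra spaces between fields, and wrong field widths are now accepted with a warning.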
Showing
7 changed files
with
276 additions
and
18 deletions
ChangeLog
| 1 | 2017-08-10 Jay Berkenbilt <ejb@ql.org> | 1 | 2017-08-10 Jay Berkenbilt <ejb@ql.org> |
| 2 | 2 | ||
| 3 | + * Be more forgiving of certain types of errors in the xref table | ||
| 4 | + that don't interfere with interpreting the table. | ||
| 5 | + | ||
| 3 | * Remove unused "tracing" parameter from PointerHolder's | 6 | * Remove unused "tracing" parameter from PointerHolder's |
| 4 | (T*, bool) constructor. This change breaks source code | 7 | (T*, bool) constructor. This change breaks source code |
| 5 | compatibility, but since this argument to PointerHolder has not | 8 | compatibility, but since this argument to PointerHolder has not |
include/qpdf/QPDF.hh
| @@ -652,6 +652,10 @@ class QPDF | @@ -652,6 +652,10 @@ class QPDF | ||
| 652 | void setTrailer(QPDFObjectHandle obj); | 652 | void setTrailer(QPDFObjectHandle obj); |
| 653 | void read_xref(qpdf_offset_t offset); | 653 | void read_xref(qpdf_offset_t offset); |
| 654 | void reconstruct_xref(QPDFExc& e); | 654 | void reconstruct_xref(QPDFExc& e); |
| 655 | + bool parse_xrefFirst(std::string const& line, | ||
| 656 | + int& obj, int& num, int& bytes); | ||
| 657 | + bool parse_xrefEntry(std::string const& line, | ||
| 658 | + qpdf_offset_t& f1, int& f2, char& type); | ||
| 655 | qpdf_offset_t read_xrefTable(qpdf_offset_t offset); | 659 | qpdf_offset_t read_xrefTable(qpdf_offset_t offset); |
| 656 | qpdf_offset_t read_xrefStream(qpdf_offset_t offset); | 660 | qpdf_offset_t read_xrefStream(qpdf_offset_t offset); |
| 657 | qpdf_offset_t processXRefStream( | 661 | qpdf_offset_t processXRefStream( |
libqpdf/QPDF.cc
| @@ -9,7 +9,6 @@ | @@ -9,7 +9,6 @@ | ||
| 9 | 9 | ||
| 10 | #include <qpdf/QTC.hh> | 10 | #include <qpdf/QTC.hh> |
| 11 | #include <qpdf/QUtil.hh> | 11 | #include <qpdf/QUtil.hh> |
| 12 | -#include <qpdf/PCRE.hh> | ||
| 13 | #include <qpdf/Pipeline.hh> | 12 | #include <qpdf/Pipeline.hh> |
| 14 | #include <qpdf/Pl_Discard.hh> | 13 | #include <qpdf/Pl_Discard.hh> |
| 15 | #include <qpdf/FileInputSource.hh> | 14 | #include <qpdf/FileInputSource.hh> |
| @@ -537,12 +536,162 @@ QPDF::read_xref(qpdf_offset_t xref_offset) | @@ -537,12 +536,162 @@ QPDF::read_xref(qpdf_offset_t xref_offset) | ||
| 537 | this->deleted_objects.clear(); | 536 | this->deleted_objects.clear(); |
| 538 | } | 537 | } |
| 539 | 538 | ||
| 539 | +bool | ||
| 540 | +QPDF::parse_xrefFirst(std::string const& line, | ||
| 541 | + int& obj, int& num, int& bytes) | ||
| 542 | +{ | ||
| 543 | + // is_space and is_digit both return false on '\0', so this will | ||
| 544 | + // not overrun the null-terminated buffer. | ||
| 545 | + char const* p = line.c_str(); | ||
| 546 | + char const* start = line.c_str(); | ||
| 547 | + | ||
| 548 | + // Skip zero or more spaces | ||
| 549 | + while (QUtil::is_space(*p)) | ||
| 550 | + { | ||
| 551 | + ++p; | ||
| 552 | + } | ||
| 553 | + // Require digit | ||
| 554 | + if (! QUtil::is_digit(*p)) | ||
| 555 | + { | ||
| 556 | + return false; | ||
| 557 | + } | ||
| 558 | + // Gather digits | ||
| 559 | + std::string obj_str; | ||
| 560 | + while (QUtil::is_digit(*p)) | ||
| 561 | + { | ||
| 562 | + obj_str.append(1, *p++); | ||
| 563 | + } | ||
| 564 | + // Require space | ||
| 565 | + if (! QUtil::is_space(*p)) | ||
| 566 | + { | ||
| 567 | + return false; | ||
| 568 | + } | ||
| 569 | + // Skip spaces | ||
| 570 | + while (QUtil::is_space(*p)) | ||
| 571 | + { | ||
| 572 | + ++p; | ||
| 573 | + } | ||
| 574 | + // Require digit | ||
| 575 | + if (! QUtil::is_digit(*p)) | ||
| 576 | + { | ||
| 577 | + return false; | ||
| 578 | + } | ||
| 579 | + // Gather digits | ||
| 580 | + std::string num_str; | ||
| 581 | + while (QUtil::is_digit(*p)) | ||
| 582 | + { | ||
| 583 | + num_str.append(1, *p++); | ||
| 584 | + } | ||
| 585 | + // Skip any space including line terminators | ||
| 586 | + while (QUtil::is_space(*p)) | ||
| 587 | + { | ||
| 588 | + ++p; | ||
| 589 | + } | ||
| 590 | + bytes = p - start; | ||
| 591 | + obj = atoi(obj_str.c_str()); | ||
| 592 | + num = atoi(num_str.c_str()); | ||
| 593 | + return true; | ||
| 594 | +} | ||
| 595 | + | ||
| 596 | +bool | ||
| 597 | +QPDF::parse_xrefEntry(std::string const& line, | ||
| 598 | + qpdf_offset_t& f1, int& f2, char& type) | ||
| 599 | +{ | ||
| 600 | + // is_space and is_digit both return false on '\0', so this will | ||
| 601 | + // not overrun the null-terminated buffer. | ||
| 602 | + char const* p = line.c_str(); | ||
| 603 | + | ||
| 604 | + // Skip zero or more spaces. There aren't supposed to be any. | ||
| 605 | + bool invalid = false; | ||
| 606 | + while (QUtil::is_space(*p)) | ||
| 607 | + { | ||
| 608 | + ++p; | ||
| 609 | + QTC::TC("qpdf", "QPDF ignore first space in xref entry"); | ||
| 610 | + invalid = true; | ||
| 611 | + } | ||
| 612 | + // Require digit | ||
| 613 | + if (! QUtil::is_digit(*p)) | ||
| 614 | + { | ||
| 615 | + return false; | ||
| 616 | + } | ||
| 617 | + // Gather digits | ||
| 618 | + std::string f1_str; | ||
| 619 | + while (QUtil::is_digit(*p)) | ||
| 620 | + { | ||
| 621 | + f1_str.append(1, *p++); | ||
| 622 | + } | ||
| 623 | + // Require space | ||
| 624 | + if (! QUtil::is_space(*p)) | ||
| 625 | + { | ||
| 626 | + return false; | ||
| 627 | + } | ||
| 628 | + if (QUtil::is_space(*(p+1))) | ||
| 629 | + { | ||
| 630 | + QTC::TC("qpdf", "QPDF ignore first extra space in xref entry"); | ||
| 631 | + invalid = true; | ||
| 632 | + } | ||
| 633 | + // Skip spaces | ||
| 634 | + while (QUtil::is_space(*p)) | ||
| 635 | + { | ||
| 636 | + ++p; | ||
| 637 | + } | ||
| 638 | + // Require digit | ||
| 639 | + if (! QUtil::is_digit(*p)) | ||
| 640 | + { | ||
| 641 | + return false; | ||
| 642 | + } | ||
| 643 | + // Gather digits | ||
| 644 | + std::string f2_str; | ||
| 645 | + while (QUtil::is_digit(*p)) | ||
| 646 | + { | ||
| 647 | + f2_str.append(1, *p++); | ||
| 648 | + } | ||
| 649 | + // Require space | ||
| 650 | + if (! QUtil::is_space(*p)) | ||
| 651 | + { | ||
| 652 | + return false; | ||
| 653 | + } | ||
| 654 | + if (QUtil::is_space(*(p+1))) | ||
| 655 | + { | ||
| 656 | + QTC::TC("qpdf", "QPDF ignore second extra space in xref entry"); | ||
| 657 | + invalid = true; | ||
| 658 | + } | ||
| 659 | + // Skip spaces | ||
| 660 | + while (QUtil::is_space(*p)) | ||
| 661 | + { | ||
| 662 | + ++p; | ||
| 663 | + } | ||
| 664 | + if ((*p == 'f') || (*p == 'n')) | ||
| 665 | + { | ||
| 666 | + type = *p; | ||
| 667 | + } | ||
| 668 | + else | ||
| 669 | + { | ||
| 670 | + return false; | ||
| 671 | + } | ||
| 672 | + if ((f1_str.length() != 10) || (f2_str.length() != 5)) | ||
| 673 | + { | ||
| 674 | + QTC::TC("qpdf", "QPDF ignore length error xref entry"); | ||
| 675 | + invalid = true; | ||
| 676 | + } | ||
| 677 | + | ||
| 678 | + if (invalid) | ||
| 679 | + { | ||
| 680 | + warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(), | ||
| 681 | + "xref table", | ||
| 682 | + this->file->getLastOffset(), | ||
| 683 | + "accepting invalid xref table entry")); | ||
| 684 | + } | ||
| 685 | + | ||
| 686 | + f1 = QUtil::string_to_ll(f1_str.c_str()); | ||
| 687 | + f2 = atoi(f2_str.c_str()); | ||
| 688 | + | ||
| 689 | + return true; | ||
| 690 | +} | ||
| 691 | + | ||
| 540 | qpdf_offset_t | 692 | qpdf_offset_t |
| 541 | QPDF::read_xrefTable(qpdf_offset_t xref_offset) | 693 | QPDF::read_xrefTable(qpdf_offset_t xref_offset) |
| 542 | { | 694 | { |
| 543 | - PCRE xref_first_re("^\\s*(\\d+)\\s+(\\d+)\\s*"); | ||
| 544 | - PCRE xref_entry_re("(?s:(^\\d{10}) (\\d{5}) ([fn])\\s*$)"); | ||
| 545 | - | ||
| 546 | std::vector<QPDFObjGen> deleted_items; | 695 | std::vector<QPDFObjGen> deleted_items; |
| 547 | 696 | ||
| 548 | this->file->seek(xref_offset, SEEK_SET); | 697 | this->file->seek(xref_offset, SEEK_SET); |
| @@ -553,18 +702,17 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) | @@ -553,18 +702,17 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) | ||
| 553 | memset(linebuf, 0, sizeof(linebuf)); | 702 | memset(linebuf, 0, sizeof(linebuf)); |
| 554 | this->file->read(linebuf, sizeof(linebuf) - 1); | 703 | this->file->read(linebuf, sizeof(linebuf) - 1); |
| 555 | std::string line = linebuf; | 704 | std::string line = linebuf; |
| 556 | - PCRE::Match m1 = xref_first_re.match(line.c_str()); | ||
| 557 | - if (! m1) | 705 | + int obj = 0; |
| 706 | + int num = 0; | ||
| 707 | + int bytes = 0; | ||
| 708 | + if (! parse_xrefFirst(line, obj, num, bytes)) | ||
| 558 | { | 709 | { |
| 559 | QTC::TC("qpdf", "QPDF invalid xref"); | 710 | QTC::TC("qpdf", "QPDF invalid xref"); |
| 560 | throw QPDFExc(qpdf_e_damaged_pdf, this->file->getName(), | 711 | throw QPDFExc(qpdf_e_damaged_pdf, this->file->getName(), |
| 561 | "xref table", this->file->getLastOffset(), | 712 | "xref table", this->file->getLastOffset(), |
| 562 | "xref syntax invalid"); | 713 | "xref syntax invalid"); |
| 563 | } | 714 | } |
| 564 | - file->seek(this->file->getLastOffset() + m1.getMatch(0).length(), | ||
| 565 | - SEEK_SET); | ||
| 566 | - int obj = atoi(m1.getMatch(1).c_str()); | ||
| 567 | - int num = atoi(m1.getMatch(2).c_str()); | 715 | + this->file->seek(this->file->getLastOffset() + bytes, SEEK_SET); |
| 568 | for (int i = obj; i < obj + num; ++i) | 716 | for (int i = obj; i < obj + num; ++i) |
| 569 | { | 717 | { |
| 570 | if (i == 0) | 718 | if (i == 0) |
| @@ -573,8 +721,11 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) | @@ -573,8 +721,11 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) | ||
| 573 | this->first_xref_item_offset = this->file->tell(); | 721 | this->first_xref_item_offset = this->file->tell(); |
| 574 | } | 722 | } |
| 575 | std::string xref_entry = this->file->readLine(30); | 723 | std::string xref_entry = this->file->readLine(30); |
| 576 | - PCRE::Match m2 = xref_entry_re.match(xref_entry.c_str()); | ||
| 577 | - if (! m2) | 724 | + // For xref_table, these will always be small enough to be ints |
| 725 | + qpdf_offset_t f1 = 0; | ||
| 726 | + int f2 = 0; | ||
| 727 | + char type = '\0'; | ||
| 728 | + if (! parse_xrefEntry(xref_entry, f1, f2, type)) | ||
| 578 | { | 729 | { |
| 579 | QTC::TC("qpdf", "QPDF invalid xref entry"); | 730 | QTC::TC("qpdf", "QPDF invalid xref entry"); |
| 580 | throw QPDFExc( | 731 | throw QPDFExc( |
| @@ -583,11 +734,6 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) | @@ -583,11 +734,6 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) | ||
| 583 | "invalid xref entry (obj=" + | 734 | "invalid xref entry (obj=" + |
| 584 | QUtil::int_to_string(i) + ")"); | 735 | QUtil::int_to_string(i) + ")"); |
| 585 | } | 736 | } |
| 586 | - | ||
| 587 | - // For xref_table, these will always be small enough to be ints | ||
| 588 | - qpdf_offset_t f1 = QUtil::string_to_ll(m2.getMatch(1).c_str()); | ||
| 589 | - int f2 = atoi(m2.getMatch(2).c_str()); | ||
| 590 | - char type = m2.getMatch(3).at(0); | ||
| 591 | if (type == 'f') | 737 | if (type == 'f') |
| 592 | { | 738 | { |
| 593 | // Save deleted items until after we've checked the | 739 | // Save deleted items until after we've checked the |
qpdf/qpdf.testcov
| @@ -289,3 +289,7 @@ qpdf single-pages %d 0 | @@ -289,3 +289,7 @@ qpdf single-pages %d 0 | ||
| 289 | qpdf single-pages .pdf 0 | 289 | qpdf single-pages .pdf 0 |
| 290 | qpdf single-pages other 0 | 290 | qpdf single-pages other 0 |
| 291 | QPDFTokenizer allowing bad token 0 | 291 | QPDFTokenizer allowing bad token 0 |
| 292 | +QPDF ignore first space in xref entry 0 | ||
| 293 | +QPDF ignore first extra space in xref entry 0 | ||
| 294 | +QPDF ignore second extra space in xref entry 0 | ||
| 295 | +QPDF ignore length error xref entry 0 |
qpdf/qtest/qpdf.test
| @@ -232,7 +232,7 @@ foreach my $d (@bug_tests) | @@ -232,7 +232,7 @@ foreach my $d (@bug_tests) | ||
| 232 | show_ntests(); | 232 | show_ntests(); |
| 233 | # ---------- | 233 | # ---------- |
| 234 | $td->notify("--- Miscellaneous Tests ---"); | 234 | $td->notify("--- Miscellaneous Tests ---"); |
| 235 | -$n_tests += 86; | 235 | +$n_tests += 87; |
| 236 | 236 | ||
| 237 | $td->runtest("qpdf version", | 237 | $td->runtest("qpdf version", |
| 238 | {$td->COMMAND => "qpdf --version"}, | 238 | {$td->COMMAND => "qpdf --version"}, |
| @@ -669,6 +669,13 @@ $td->runtest("ignore bad token", | @@ -669,6 +669,13 @@ $td->runtest("ignore bad token", | ||
| 669 | $td->EXIT_STATUS => 0}, | 669 | $td->EXIT_STATUS => 0}, |
| 670 | $td->NORMALIZE_NEWLINES); | 670 | $td->NORMALIZE_NEWLINES); |
| 671 | 671 | ||
| 672 | +$td->runtest("recoverable xref errors", | ||
| 673 | + {$td->COMMAND => | ||
| 674 | + "qpdf --check --show-xref xref-errors.pdf"}, | ||
| 675 | + {$td->FILE => "xref-errors.out", | ||
| 676 | + $td->EXIT_STATUS => 3}, | ||
| 677 | + $td->NORMALIZE_NEWLINES); | ||
| 678 | + | ||
| 672 | show_ntests(); | 679 | show_ntests(); |
| 673 | # ---------- | 680 | # ---------- |
| 674 | $td->notify("--- Single Page ---"); | 681 | $td->notify("--- Single Page ---"); |
qpdf/qtest/qpdf/xref-errors.out
0 → 100644
| 1 | +WARNING: xref-errors.pdf (xref table, file position 585): accepting invalid xref table entry | ||
| 2 | +WARNING: xref-errors.pdf (xref table, file position 606): accepting invalid xref table entry | ||
| 3 | +WARNING: xref-errors.pdf (xref table, file position 627): accepting invalid xref table entry | ||
| 4 | +WARNING: xref-errors.pdf (xref table, file position 648): accepting invalid xref table entry | ||
| 5 | +WARNING: xref-errors.pdf (xref table, file position 667): accepting invalid xref table entry | ||
| 6 | +checking xref-errors.pdf | ||
| 7 | +PDF Version: 1.3 | ||
| 8 | +File is not encrypted | ||
| 9 | +File is not linearized | ||
| 10 | +1/0: uncompressed; offset = 9 | ||
| 11 | +2/0: uncompressed; offset = 63 | ||
| 12 | +3/0: uncompressed; offset = 135 | ||
| 13 | +4/0: uncompressed; offset = 307 | ||
| 14 | +5/0: uncompressed; offset = 403 | ||
| 15 | +6/0: uncompressed; offset = 438 |
qpdf/qtest/qpdf/xref-errors.pdf
0 → 100644
| 1 | +%PDF-1.3 | ||
| 2 | +1 0 obj | ||
| 3 | +<< | ||
| 4 | + /Type /Catalog | ||
| 5 | + /Pages 2 0 R | ||
| 6 | +>> | ||
| 7 | +endobj | ||
| 8 | + | ||
| 9 | +2 0 obj | ||
| 10 | +<< | ||
| 11 | + /Type /Pages | ||
| 12 | + /Kids [ | ||
| 13 | + 3 0 R | ||
| 14 | + ] | ||
| 15 | + /Count 1 | ||
| 16 | +>> | ||
| 17 | +endobj | ||
| 18 | + | ||
| 19 | +3 0 obj | ||
| 20 | +<< | ||
| 21 | + /Type /Page | ||
| 22 | + /Parent 2 0 R | ||
| 23 | + /MediaBox [0 0 612 792] | ||
| 24 | + /Contents 4 0 R | ||
| 25 | + /Resources << | ||
| 26 | + /ProcSet 5 0 R | ||
| 27 | + /Font << | ||
| 28 | + /F1 6 0 R | ||
| 29 | + >> | ||
| 30 | + >> | ||
| 31 | +>> | ||
| 32 | +endobj | ||
| 33 | + | ||
| 34 | +4 0 obj | ||
| 35 | +<< | ||
| 36 | + /Length 44 | ||
| 37 | +>> | ||
| 38 | +stream | ||
| 39 | +BT | ||
| 40 | + /F1 24 Tf | ||
| 41 | + 72 720 Td | ||
| 42 | + (Potato) Tj | ||
| 43 | +ET | ||
| 44 | +endstream | ||
| 45 | +endobj | ||
| 46 | + | ||
| 47 | +5 0 obj | ||
| 48 | +[ | ||
| 49 | |||
| 50 | + /Text | ||
| 51 | +] | ||
| 52 | +endobj | ||
| 53 | + | ||
| 54 | +6 0 obj | ||
| 55 | +<< | ||
| 56 | + /Type /Font | ||
| 57 | + /Subtype /Type1 | ||
| 58 | + /Name /F1 | ||
| 59 | + /BaseFont /Helvetica | ||
| 60 | + /Encoding /WinAnsiEncoding | ||
| 61 | +>> | ||
| 62 | +endobj | ||
| 63 | + | ||
| 64 | +xref | ||
| 65 | +0 7 | ||
| 66 | +0000000000 65535 f | ||
| 67 | + 0000000009 00000 n | ||
| 68 | +0000000063 00000 n | ||
| 69 | +0000000135 00000 n | ||
| 70 | +000000307 00000 n | ||
| 71 | +0000000403 0000 n | ||
| 72 | +0000000438 00000 n | ||
| 73 | +trailer << | ||
| 74 | + /Size 7 | ||
| 75 | + /Root 1 0 R | ||
| 76 | +>> | ||
| 77 | +startxref | ||
| 78 | +556 | ||
| 79 | +%%EOF |