Commit 30f109e244f365111d5219903f13d64cf1a95054
1 parent
98a843c2
Read xref table without PCRE
Also accept more errors than before.
Showing
7 changed files
with
276 additions
and
18 deletions
ChangeLog
| 1 | 1 | 2017-08-10 Jay Berkenbilt <ejb@ql.org> |
| 2 | 2 | |
| 3 | + * Be more forgiving of certain types of errors in the xref table | |
| 4 | + that don't interfere with interpreting the table. | |
| 5 | + | |
| 3 | 6 | * Remove unused "tracing" parameter from PointerHolder's |
| 4 | 7 | (T*, bool) constructor. This change breaks source code |
| 5 | 8 | compatibility, but since this argument to PointerHolder has not | ... | ... |
include/qpdf/QPDF.hh
| ... | ... | @@ -652,6 +652,10 @@ class QPDF |
| 652 | 652 | void setTrailer(QPDFObjectHandle obj); |
| 653 | 653 | void read_xref(qpdf_offset_t offset); |
| 654 | 654 | void reconstruct_xref(QPDFExc& e); |
| 655 | + bool parse_xrefFirst(std::string const& line, | |
| 656 | + int& obj, int& num, int& bytes); | |
| 657 | + bool parse_xrefEntry(std::string const& line, | |
| 658 | + qpdf_offset_t& f1, int& f2, char& type); | |
| 655 | 659 | qpdf_offset_t read_xrefTable(qpdf_offset_t offset); |
| 656 | 660 | qpdf_offset_t read_xrefStream(qpdf_offset_t offset); |
| 657 | 661 | qpdf_offset_t processXRefStream( | ... | ... |
libqpdf/QPDF.cc
| ... | ... | @@ -9,7 +9,6 @@ |
| 9 | 9 | |
| 10 | 10 | #include <qpdf/QTC.hh> |
| 11 | 11 | #include <qpdf/QUtil.hh> |
| 12 | -#include <qpdf/PCRE.hh> | |
| 13 | 12 | #include <qpdf/Pipeline.hh> |
| 14 | 13 | #include <qpdf/Pl_Discard.hh> |
| 15 | 14 | #include <qpdf/FileInputSource.hh> |
| ... | ... | @@ -537,12 +536,162 @@ QPDF::read_xref(qpdf_offset_t xref_offset) |
| 537 | 536 | this->deleted_objects.clear(); |
| 538 | 537 | } |
| 539 | 538 | |
| 539 | +bool | |
| 540 | +QPDF::parse_xrefFirst(std::string const& line, | |
| 541 | + int& obj, int& num, int& bytes) | |
| 542 | +{ | |
| 543 | + // is_space and is_digit both return false on '\0', so this will | |
| 544 | + // not overrun the null-terminated buffer. | |
| 545 | + char const* p = line.c_str(); | |
| 546 | + char const* start = line.c_str(); | |
| 547 | + | |
| 548 | + // Skip zero or more spaces | |
| 549 | + while (QUtil::is_space(*p)) | |
| 550 | + { | |
| 551 | + ++p; | |
| 552 | + } | |
| 553 | + // Require digit | |
| 554 | + if (! QUtil::is_digit(*p)) | |
| 555 | + { | |
| 556 | + return false; | |
| 557 | + } | |
| 558 | + // Gather digits | |
| 559 | + std::string obj_str; | |
| 560 | + while (QUtil::is_digit(*p)) | |
| 561 | + { | |
| 562 | + obj_str.append(1, *p++); | |
| 563 | + } | |
| 564 | + // Require space | |
| 565 | + if (! QUtil::is_space(*p)) | |
| 566 | + { | |
| 567 | + return false; | |
| 568 | + } | |
| 569 | + // Skip spaces | |
| 570 | + while (QUtil::is_space(*p)) | |
| 571 | + { | |
| 572 | + ++p; | |
| 573 | + } | |
| 574 | + // Require digit | |
| 575 | + if (! QUtil::is_digit(*p)) | |
| 576 | + { | |
| 577 | + return false; | |
| 578 | + } | |
| 579 | + // Gather digits | |
| 580 | + std::string num_str; | |
| 581 | + while (QUtil::is_digit(*p)) | |
| 582 | + { | |
| 583 | + num_str.append(1, *p++); | |
| 584 | + } | |
| 585 | + // Skip any space including line terminators | |
| 586 | + while (QUtil::is_space(*p)) | |
| 587 | + { | |
| 588 | + ++p; | |
| 589 | + } | |
| 590 | + bytes = p - start; | |
| 591 | + obj = atoi(obj_str.c_str()); | |
| 592 | + num = atoi(num_str.c_str()); | |
| 593 | + return true; | |
| 594 | +} | |
| 595 | + | |
| 596 | +bool | |
| 597 | +QPDF::parse_xrefEntry(std::string const& line, | |
| 598 | + qpdf_offset_t& f1, int& f2, char& type) | |
| 599 | +{ | |
| 600 | + // is_space and is_digit both return false on '\0', so this will | |
| 601 | + // not overrun the null-terminated buffer. | |
| 602 | + char const* p = line.c_str(); | |
| 603 | + | |
| 604 | + // Skip zero or more spaces. There aren't supposed to be any. | |
| 605 | + bool invalid = false; | |
| 606 | + while (QUtil::is_space(*p)) | |
| 607 | + { | |
| 608 | + ++p; | |
| 609 | + QTC::TC("qpdf", "QPDF ignore first space in xref entry"); | |
| 610 | + invalid = true; | |
| 611 | + } | |
| 612 | + // Require digit | |
| 613 | + if (! QUtil::is_digit(*p)) | |
| 614 | + { | |
| 615 | + return false; | |
| 616 | + } | |
| 617 | + // Gather digits | |
| 618 | + std::string f1_str; | |
| 619 | + while (QUtil::is_digit(*p)) | |
| 620 | + { | |
| 621 | + f1_str.append(1, *p++); | |
| 622 | + } | |
| 623 | + // Require space | |
| 624 | + if (! QUtil::is_space(*p)) | |
| 625 | + { | |
| 626 | + return false; | |
| 627 | + } | |
| 628 | + if (QUtil::is_space(*(p+1))) | |
| 629 | + { | |
| 630 | + QTC::TC("qpdf", "QPDF ignore first extra space in xref entry"); | |
| 631 | + invalid = true; | |
| 632 | + } | |
| 633 | + // Skip spaces | |
| 634 | + while (QUtil::is_space(*p)) | |
| 635 | + { | |
| 636 | + ++p; | |
| 637 | + } | |
| 638 | + // Require digit | |
| 639 | + if (! QUtil::is_digit(*p)) | |
| 640 | + { | |
| 641 | + return false; | |
| 642 | + } | |
| 643 | + // Gather digits | |
| 644 | + std::string f2_str; | |
| 645 | + while (QUtil::is_digit(*p)) | |
| 646 | + { | |
| 647 | + f2_str.append(1, *p++); | |
| 648 | + } | |
| 649 | + // Require space | |
| 650 | + if (! QUtil::is_space(*p)) | |
| 651 | + { | |
| 652 | + return false; | |
| 653 | + } | |
| 654 | + if (QUtil::is_space(*(p+1))) | |
| 655 | + { | |
| 656 | + QTC::TC("qpdf", "QPDF ignore second extra space in xref entry"); | |
| 657 | + invalid = true; | |
| 658 | + } | |
| 659 | + // Skip spaces | |
| 660 | + while (QUtil::is_space(*p)) | |
| 661 | + { | |
| 662 | + ++p; | |
| 663 | + } | |
| 664 | + if ((*p == 'f') || (*p == 'n')) | |
| 665 | + { | |
| 666 | + type = *p; | |
| 667 | + } | |
| 668 | + else | |
| 669 | + { | |
| 670 | + return false; | |
| 671 | + } | |
| 672 | + if ((f1_str.length() != 10) || (f2_str.length() != 5)) | |
| 673 | + { | |
| 674 | + QTC::TC("qpdf", "QPDF ignore length error xref entry"); | |
| 675 | + invalid = true; | |
| 676 | + } | |
| 677 | + | |
| 678 | + if (invalid) | |
| 679 | + { | |
| 680 | + warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(), | |
| 681 | + "xref table", | |
| 682 | + this->file->getLastOffset(), | |
| 683 | + "accepting invalid xref table entry")); | |
| 684 | + } | |
| 685 | + | |
| 686 | + f1 = QUtil::string_to_ll(f1_str.c_str()); | |
| 687 | + f2 = atoi(f2_str.c_str()); | |
| 688 | + | |
| 689 | + return true; | |
| 690 | +} | |
| 691 | + | |
| 540 | 692 | qpdf_offset_t |
| 541 | 693 | QPDF::read_xrefTable(qpdf_offset_t xref_offset) |
| 542 | 694 | { |
| 543 | - PCRE xref_first_re("^\\s*(\\d+)\\s+(\\d+)\\s*"); | |
| 544 | - PCRE xref_entry_re("(?s:(^\\d{10}) (\\d{5}) ([fn])\\s*$)"); | |
| 545 | - | |
| 546 | 695 | std::vector<QPDFObjGen> deleted_items; |
| 547 | 696 | |
| 548 | 697 | this->file->seek(xref_offset, SEEK_SET); |
| ... | ... | @@ -553,18 +702,17 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) |
| 553 | 702 | memset(linebuf, 0, sizeof(linebuf)); |
| 554 | 703 | this->file->read(linebuf, sizeof(linebuf) - 1); |
| 555 | 704 | std::string line = linebuf; |
| 556 | - PCRE::Match m1 = xref_first_re.match(line.c_str()); | |
| 557 | - if (! m1) | |
| 705 | + int obj = 0; | |
| 706 | + int num = 0; | |
| 707 | + int bytes = 0; | |
| 708 | + if (! parse_xrefFirst(line, obj, num, bytes)) | |
| 558 | 709 | { |
| 559 | 710 | QTC::TC("qpdf", "QPDF invalid xref"); |
| 560 | 711 | throw QPDFExc(qpdf_e_damaged_pdf, this->file->getName(), |
| 561 | 712 | "xref table", this->file->getLastOffset(), |
| 562 | 713 | "xref syntax invalid"); |
| 563 | 714 | } |
| 564 | - file->seek(this->file->getLastOffset() + m1.getMatch(0).length(), | |
| 565 | - SEEK_SET); | |
| 566 | - int obj = atoi(m1.getMatch(1).c_str()); | |
| 567 | - int num = atoi(m1.getMatch(2).c_str()); | |
| 715 | + this->file->seek(this->file->getLastOffset() + bytes, SEEK_SET); | |
| 568 | 716 | for (int i = obj; i < obj + num; ++i) |
| 569 | 717 | { |
| 570 | 718 | if (i == 0) |
| ... | ... | @@ -573,8 +721,11 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) |
| 573 | 721 | this->first_xref_item_offset = this->file->tell(); |
| 574 | 722 | } |
| 575 | 723 | std::string xref_entry = this->file->readLine(30); |
| 576 | - PCRE::Match m2 = xref_entry_re.match(xref_entry.c_str()); | |
| 577 | - if (! m2) | |
| 724 | + // For xref_table, these will always be small enough to be ints | |
| 725 | + qpdf_offset_t f1 = 0; | |
| 726 | + int f2 = 0; | |
| 727 | + char type = '\0'; | |
| 728 | + if (! parse_xrefEntry(xref_entry, f1, f2, type)) | |
| 578 | 729 | { |
| 579 | 730 | QTC::TC("qpdf", "QPDF invalid xref entry"); |
| 580 | 731 | throw QPDFExc( |
| ... | ... | @@ -583,11 +734,6 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) |
| 583 | 734 | "invalid xref entry (obj=" + |
| 584 | 735 | QUtil::int_to_string(i) + ")"); |
| 585 | 736 | } |
| 586 | - | |
| 587 | - // For xref_table, these will always be small enough to be ints | |
| 588 | - qpdf_offset_t f1 = QUtil::string_to_ll(m2.getMatch(1).c_str()); | |
| 589 | - int f2 = atoi(m2.getMatch(2).c_str()); | |
| 590 | - char type = m2.getMatch(3).at(0); | |
| 591 | 737 | if (type == 'f') |
| 592 | 738 | { |
| 593 | 739 | // Save deleted items until after we've checked the | ... | ... |
qpdf/qpdf.testcov
| ... | ... | @@ -289,3 +289,7 @@ qpdf single-pages %d 0 |
| 289 | 289 | qpdf single-pages .pdf 0 |
| 290 | 290 | qpdf single-pages other 0 |
| 291 | 291 | QPDFTokenizer allowing bad token 0 |
| 292 | +QPDF ignore first space in xref entry 0 | |
| 293 | +QPDF ignore first extra space in xref entry 0 | |
| 294 | +QPDF ignore second extra space in xref entry 0 | |
| 295 | +QPDF ignore length error xref entry 0 | ... | ... |
qpdf/qtest/qpdf.test
| ... | ... | @@ -232,7 +232,7 @@ foreach my $d (@bug_tests) |
| 232 | 232 | show_ntests(); |
| 233 | 233 | # ---------- |
| 234 | 234 | $td->notify("--- Miscellaneous Tests ---"); |
| 235 | -$n_tests += 86; | |
| 235 | +$n_tests += 87; | |
| 236 | 236 | |
| 237 | 237 | $td->runtest("qpdf version", |
| 238 | 238 | {$td->COMMAND => "qpdf --version"}, |
| ... | ... | @@ -669,6 +669,13 @@ $td->runtest("ignore bad token", |
| 669 | 669 | $td->EXIT_STATUS => 0}, |
| 670 | 670 | $td->NORMALIZE_NEWLINES); |
| 671 | 671 | |
| 672 | +$td->runtest("recoverable xref errors", | |
| 673 | + {$td->COMMAND => | |
| 674 | + "qpdf --check --show-xref xref-errors.pdf"}, | |
| 675 | + {$td->FILE => "xref-errors.out", | |
| 676 | + $td->EXIT_STATUS => 3}, | |
| 677 | + $td->NORMALIZE_NEWLINES); | |
| 678 | + | |
| 672 | 679 | show_ntests(); |
| 673 | 680 | # ---------- |
| 674 | 681 | $td->notify("--- Single Page ---"); | ... | ... |
qpdf/qtest/qpdf/xref-errors.out
0 → 100644
| 1 | +WARNING: xref-errors.pdf (xref table, file position 585): accepting invalid xref table entry | |
| 2 | +WARNING: xref-errors.pdf (xref table, file position 606): accepting invalid xref table entry | |
| 3 | +WARNING: xref-errors.pdf (xref table, file position 627): accepting invalid xref table entry | |
| 4 | +WARNING: xref-errors.pdf (xref table, file position 648): accepting invalid xref table entry | |
| 5 | +WARNING: xref-errors.pdf (xref table, file position 667): accepting invalid xref table entry | |
| 6 | +checking xref-errors.pdf | |
| 7 | +PDF Version: 1.3 | |
| 8 | +File is not encrypted | |
| 9 | +File is not linearized | |
| 10 | +1/0: uncompressed; offset = 9 | |
| 11 | +2/0: uncompressed; offset = 63 | |
| 12 | +3/0: uncompressed; offset = 135 | |
| 13 | +4/0: uncompressed; offset = 307 | |
| 14 | +5/0: uncompressed; offset = 403 | |
| 15 | +6/0: uncompressed; offset = 438 | ... | ... |
qpdf/qtest/qpdf/xref-errors.pdf
0 → 100644
| 1 | +%PDF-1.3 | |
| 2 | +1 0 obj | |
| 3 | +<< | |
| 4 | + /Type /Catalog | |
| 5 | + /Pages 2 0 R | |
| 6 | +>> | |
| 7 | +endobj | |
| 8 | + | |
| 9 | +2 0 obj | |
| 10 | +<< | |
| 11 | + /Type /Pages | |
| 12 | + /Kids [ | |
| 13 | + 3 0 R | |
| 14 | + ] | |
| 15 | + /Count 1 | |
| 16 | +>> | |
| 17 | +endobj | |
| 18 | + | |
| 19 | +3 0 obj | |
| 20 | +<< | |
| 21 | + /Type /Page | |
| 22 | + /Parent 2 0 R | |
| 23 | + /MediaBox [0 0 612 792] | |
| 24 | + /Contents 4 0 R | |
| 25 | + /Resources << | |
| 26 | + /ProcSet 5 0 R | |
| 27 | + /Font << | |
| 28 | + /F1 6 0 R | |
| 29 | + >> | |
| 30 | + >> | |
| 31 | +>> | |
| 32 | +endobj | |
| 33 | + | |
| 34 | +4 0 obj | |
| 35 | +<< | |
| 36 | + /Length 44 | |
| 37 | +>> | |
| 38 | +stream | |
| 39 | +BT | |
| 40 | + /F1 24 Tf | |
| 41 | + 72 720 Td | |
| 42 | + (Potato) Tj | |
| 43 | +ET | |
| 44 | +endstream | |
| 45 | +endobj | |
| 46 | + | |
| 47 | +5 0 obj | |
| 48 | +[ | |
| 49 | ||
| 50 | + /Text | |
| 51 | +] | |
| 52 | +endobj | |
| 53 | + | |
| 54 | +6 0 obj | |
| 55 | +<< | |
| 56 | + /Type /Font | |
| 57 | + /Subtype /Type1 | |
| 58 | + /Name /F1 | |
| 59 | + /BaseFont /Helvetica | |
| 60 | + /Encoding /WinAnsiEncoding | |
| 61 | +>> | |
| 62 | +endobj | |
| 63 | + | |
| 64 | +xref | |
| 65 | +0 7 | |
| 66 | +0000000000 65535 f | |
| 67 | + 0000000009 00000 n | |
| 68 | +0000000063 00000 n | |
| 69 | +0000000135 00000 n | |
| 70 | +000000307 00000 n | |
| 71 | +0000000403 0000 n | |
| 72 | +0000000438 00000 n | |
| 73 | +trailer << | |
| 74 | + /Size 7 | |
| 75 | + /Root 1 0 R | |
| 76 | +>> | |
| 77 | +startxref | |
| 78 | +556 | |
| 79 | +%%EOF | ... | ... |