Commit 2266c6232b2ffd34c78c1924c1d7d3b932ea4a29
1 parent: 736bafbb
Rework InputSource::readLine to make it much more efficient
This rework makes xref reconstruction run much faster and use much less memory.
Showing 2 changed files with 160 additions and 64 deletions
include/qpdf/QPDF.hh
| ... | ... | @@ -433,8 +433,9 @@ class QPDF |
| 433 | 433 | |
| 434 | 434 | void setLastOffset(qpdf_offset_t); |
| 435 | 435 | qpdf_offset_t getLastOffset() const; |
| 436 | - std::string readLine(); | |
| 436 | + std::string readLine(size_t max_line_length); | |
| 437 | 437 | |
| 438 | + virtual qpdf_offset_t findAndSkipNextEOL() = 0; | |
| 438 | 439 | virtual std::string const& getName() const = 0; |
| 439 | 440 | virtual qpdf_offset_t tell() = 0; |
| 440 | 441 | virtual void seek(qpdf_offset_t offset, int whence) = 0; |
| ... | ... | @@ -453,6 +454,7 @@ class QPDF |
| 453 | 454 | void setFilename(char const* filename); |
| 454 | 455 | void setFile(char const* description, FILE* filep, bool close_file); |
| 455 | 456 | virtual ~FileInputSource(); |
| 457 | + virtual qpdf_offset_t findAndSkipNextEOL(); | |
| 456 | 458 | virtual std::string const& getName() const; |
| 457 | 459 | virtual qpdf_offset_t tell(); |
| 458 | 460 | virtual void seek(qpdf_offset_t offset, int whence); |
| ... | ... | @@ -477,6 +479,7 @@ class QPDF |
| 477 | 479 | BufferInputSource(std::string const& description, Buffer* buf, |
| 478 | 480 | bool own_memory = false); |
| 479 | 481 | virtual ~BufferInputSource(); |
| 482 | + virtual qpdf_offset_t findAndSkipNextEOL(); | |
| 480 | 483 | virtual std::string const& getName() const; |
| 481 | 484 | virtual qpdf_offset_t tell(); |
| 482 | 485 | virtual void seek(qpdf_offset_t offset, int whence); | ... | ... |
libqpdf/QPDF.cc
| ... | ... | @@ -49,53 +49,29 @@ QPDF::InputSource::getLastOffset() const |
| 49 | 49 | } |
| 50 | 50 | |
| 51 | 51 | std::string |
| 52 | -QPDF::InputSource::readLine() | |
| 52 | +QPDF::InputSource::readLine(size_t max_line_length) | |
| 53 | 53 | { |
| 54 | - // Read a line terminated by one or more \r or \n characters | |
| 55 | - // without caring what the exact terminator is. Consume the | |
| 56 | - // trailing newline characters but don't return them. | |
| 54 | + // Return at most max_line_length characters from the next line. | |
| 55 | + // Lines are terminated by one or more \r or \n characters. | |
| 56 | + // Consume the trailing newline characters but don't return them. | |
| 57 | + // After this is called, the file will be positioned after a line | |
| 58 | + // terminator or at the end of the file, and last_offset will | |
| 59 | + // point to the position the file had when this method was called. | |
| 57 | 60 | |
| 58 | 61 | qpdf_offset_t offset = this->tell(); |
| 59 | - std::string buf; | |
| 60 | - enum { st_before_nl, st_at_nl } state = st_before_nl; | |
| 61 | - char ch; | |
| 62 | - while (1) | |
| 62 | + char* buf = new char[max_line_length + 1]; | |
| 63 | + PointerHolder<char> bp(true, buf); | |
| 64 | + memset(buf, '\0', max_line_length + 1); | |
| 65 | + this->read(buf, max_line_length); | |
| 66 | + this->seek(offset, SEEK_SET); | |
| 67 | + qpdf_offset_t eol = this->findAndSkipNextEOL(); | |
| 68 | + this->last_offset = offset; | |
| 69 | + size_t line_length = eol - offset; | |
| 70 | + if (line_length < max_line_length) | |
| 63 | 71 | { |
| 64 | - size_t len = this->read(&ch, 1); | |
| 65 | - if (len == 0) | |
| 66 | - { | |
| 67 | - break; | |
| 68 | - } | |
| 69 | - | |
| 70 | - if (state == st_before_nl) | |
| 71 | - { | |
| 72 | - if ((ch == '\012') || (ch == '\015')) | |
| 73 | - { | |
| 74 | - state = st_at_nl; | |
| 75 | - } | |
| 76 | - else | |
| 77 | - { | |
| 78 | - buf += ch; | |
| 79 | - } | |
| 80 | - } | |
| 81 | - else if (state == st_at_nl) | |
| 82 | - { | |
| 83 | - if ((ch == '\012') || (ch == '\015')) | |
| 84 | - { | |
| 85 | - // do nothing | |
| 86 | - } | |
| 87 | - else | |
| 88 | - { | |
| 89 | - // unread this character | |
| 90 | - this->unreadCh(ch); | |
| 91 | - break; | |
| 92 | - } | |
| 93 | - } | |
| 72 | + buf[line_length] = '\0'; | |
| 94 | 73 | } |
| 95 | - // Override last offset to be where we started this line rather | |
| 96 | - // than before the last character read | |
| 97 | - this->last_offset = offset; | |
| 98 | - return buf; | |
| 74 | + return std::string(buf); | |
| 99 | 75 | } |
| 100 | 76 | |
| 101 | 77 | QPDF::FileInputSource::FileInputSource() : |
| ... | ... | @@ -140,6 +116,51 @@ QPDF::FileInputSource::destroy() |
| 140 | 116 | } |
| 141 | 117 | } |
| 142 | 118 | |
| 119 | +qpdf_offset_t | |
| 120 | +QPDF::FileInputSource::findAndSkipNextEOL() | |
| 121 | +{ | |
| 122 | + qpdf_offset_t result = 0; | |
| 123 | + bool done = false; | |
| 124 | + char buf[10240]; | |
| 125 | + while (! done) | |
| 126 | + { | |
| 127 | + qpdf_offset_t cur_offset = QUtil::tell(this->file); | |
| 128 | + size_t len = this->read(buf, sizeof(buf)); | |
| 129 | + if (len == 0) | |
| 130 | + { | |
| 131 | + done = true; | |
| 132 | + result = this->tell(); | |
| 133 | + } | |
| 134 | + else | |
| 135 | + { | |
| 136 | + char* p1 = (char*)memchr((void*)buf, '\r', len); | |
| 137 | + char* p2 = (char*)memchr((void*)buf, '\n', len); | |
| 138 | + char* p = (p1 && p2) ? std::min(p1, p2) : p1 ? p1 : p2; | |
| 139 | + if (p) | |
| 140 | + { | |
| 141 | + result = cur_offset + (p - buf); | |
| 142 | + // We found \r or \n. Keep reading until we get past | |
| 143 | + // \r and \n characters. | |
| 144 | + this->seek(result + 1, SEEK_SET); | |
| 145 | + char ch; | |
| 146 | + while (! done) | |
| 147 | + { | |
| 148 | + if (this->read(&ch, 1) == 0) | |
| 149 | + { | |
| 150 | + done = true; | |
| 151 | + } | |
| 152 | + else if (! ((ch == '\r') || (ch == '\n'))) | |
| 153 | + { | |
| 154 | + this->unreadCh(ch); | |
| 155 | + done = true; | |
| 156 | + } | |
| 157 | + } | |
| 158 | + } | |
| 159 | + } | |
| 160 | + } | |
| 161 | + return result; | |
| 162 | +} | |
| 163 | + | |
| 143 | 164 | std::string const& |
| 144 | 165 | QPDF::FileInputSource::getName() const |
| 145 | 166 | { |
| ... | ... | @@ -207,6 +228,45 @@ QPDF::BufferInputSource::~BufferInputSource() |
| 207 | 228 | } |
| 208 | 229 | } |
| 209 | 230 | |
| 231 | +qpdf_offset_t | |
| 232 | +QPDF::BufferInputSource::findAndSkipNextEOL() | |
| 233 | +{ | |
| 234 | + qpdf_offset_t end_pos = (qpdf_offset_t) this->buf->getSize(); | |
| 235 | + if (this->cur_offset >= end_pos) | |
| 236 | + { | |
| 237 | + this->last_offset = end_pos; | |
| 238 | + this->cur_offset = end_pos; | |
| 239 | + return end_pos; | |
| 240 | + } | |
| 241 | + | |
| 242 | + qpdf_offset_t result = 0; | |
| 243 | + size_t len = (size_t)(end_pos - this->cur_offset); | |
| 244 | + unsigned char const* buffer = this->buf->getBuffer(); | |
| 245 | + | |
| 246 | + void* start = (void*)(buffer + this->cur_offset); | |
| 247 | + unsigned char* p1 = (unsigned char*)memchr(start, '\r', len); | |
| 248 | + unsigned char* p2 = (unsigned char*)memchr(start, '\n', len); | |
| 249 | + unsigned char* p = (p1 && p2) ? std::min(p1, p2) : p1 ? p1 : p2; | |
| 250 | + if (p) | |
| 251 | + { | |
| 252 | + result = p - buffer; | |
| 253 | + this->cur_offset = result + 1; | |
| 254 | + ++p; | |
| 255 | + while ((this->cur_offset < end_pos) && | |
| 256 | + ((*p == '\r') || (*p == '\n'))) | |
| 257 | + { | |
| 258 | + ++p; | |
| 259 | + ++this->cur_offset; | |
| 260 | + } | |
| 261 | + } | |
| 262 | + else | |
| 263 | + { | |
| 264 | + this->cur_offset = end_pos; | |
| 265 | + result = end_pos; | |
| 266 | + } | |
| 267 | + return result; | |
| 268 | +} | |
| 269 | + | |
| 210 | 270 | std::string const& |
| 211 | 271 | QPDF::BufferInputSource::getName() const |
| 212 | 272 | { |
| ... | ... | @@ -420,7 +480,7 @@ QPDF::parse(char const* password) |
| 420 | 480 | this->provided_password = password; |
| 421 | 481 | } |
| 422 | 482 | |
| 423 | - std::string line = this->file->readLine(); | |
| 483 | + std::string line = this->file->readLine(20); | |
| 424 | 484 | PCRE::Match m1 = header_re.match(line.c_str()); |
| 425 | 485 | if (m1) |
| 426 | 486 | { |
| ... | ... | @@ -556,7 +616,7 @@ QPDF::reconstruct_xref(QPDFExc& e) |
| 556 | 616 | bool in_obj = false; |
| 557 | 617 | while (this->file->tell() < eof) |
| 558 | 618 | { |
| 559 | - std::string line = this->file->readLine(); | |
| 619 | + std::string line = this->file->readLine(50); | |
| 560 | 620 | if (in_obj) |
| 561 | 621 | { |
| 562 | 622 | if (endobj_re.match(line.c_str())) |
| ... | ... | @@ -624,7 +684,7 @@ QPDF::read_xref(qpdf_offset_t xref_offset) |
| 624 | 684 | while (xref_offset) |
| 625 | 685 | { |
| 626 | 686 | this->file->seek(xref_offset, SEEK_SET); |
| 627 | - std::string line = this->file->readLine(); | |
| 687 | + std::string line = this->file->readLine(50); | |
| 628 | 688 | if (line == "xref") |
| 629 | 689 | { |
| 630 | 690 | xref_offset = read_xrefTable(this->file->tell()); |
| ... | ... | @@ -677,7 +737,7 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) |
| 677 | 737 | bool done = false; |
| 678 | 738 | while (! done) |
| 679 | 739 | { |
| 680 | - std::string line = this->file->readLine(); | |
| 740 | + std::string line = this->file->readLine(50); | |
| 681 | 741 | PCRE::Match m1 = xref_first_re.match(line.c_str()); |
| 682 | 742 | if (! m1) |
| 683 | 743 | { |
| ... | ... | @@ -1528,27 +1588,60 @@ QPDF::recoverStreamLength(PointerHolder<InputSource> input, |
| 1528 | 1588 | input->seek(0, SEEK_END); |
| 1529 | 1589 | qpdf_offset_t eof = input->tell(); |
| 1530 | 1590 | input->seek(stream_offset, SEEK_SET); |
| 1531 | - std::string last_line; | |
| 1532 | 1591 | qpdf_offset_t last_line_offset = 0; |
| 1533 | 1592 | size_t length = 0; |
| 1593 | + static int const line_end_length = 12; // room for endstream\r\n\0 | |
| 1594 | + char last_line_end[line_end_length]; | |
| 1534 | 1595 | while (input->tell() < eof) |
| 1535 | 1596 | { |
| 1536 | - std::string line = input->readLine(); | |
| 1537 | - // Can't use regexp last_line since it might contain nulls | |
| 1538 | - if (endobj_re.match(line.c_str()) && | |
| 1539 | - (last_line.length() >= 9) && | |
| 1540 | - (last_line.substr(last_line.length() - 9, 9) == "endstream")) | |
| 1541 | - { | |
| 1542 | - // Stream probably ends right before "endstream", which | |
| 1543 | - // contains 9 characters. | |
| 1544 | - length = last_line_offset + last_line.length() - 9 - stream_offset; | |
| 1545 | - // Go back to where we would have been if we had just read | |
| 1546 | - // the endstream. | |
| 1547 | - input->seek(input->getLastOffset(), SEEK_SET); | |
| 1548 | - break; | |
| 1549 | - } | |
| 1550 | - last_line = line; | |
| 1551 | - last_line_offset = input->getLastOffset(); | |
| 1597 | + std::string line = input->readLine(50); | |
| 1598 | + qpdf_offset_t line_offset = input->getLastOffset(); | |
| 1599 | + if (endobj_re.match(line.c_str())) | |
| 1600 | + { | |
| 1601 | + qpdf_offset_t endstream_offset = 0; | |
| 1602 | + if (last_line_offset >= line_end_length) | |
| 1603 | + { | |
| 1604 | + qpdf_offset_t cur_offset = input->tell(); | |
| 1605 | + // Read from the end of the last line, guaranteeing | |
| 1606 | + // null termination | |
| 1607 | + qpdf_offset_t search_offset = | |
| 1608 | + line_offset - (line_end_length - 1); | |
| 1609 | + input->seek(search_offset, SEEK_SET); | |
| 1610 | + memset(last_line_end, '\0', line_end_length); | |
| 1611 | + input->read(last_line_end, line_end_length - 1); | |
| 1612 | + input->seek(cur_offset, SEEK_SET); | |
| 1613 | + // if endstream[\r\n] will fit in last_line_end, the | |
| 1614 | + // 'e' has to be in one of the first three spots. | |
| 1615 | + // Check explicitly rather than using strstr directly | |
| 1616 | + // in case there are nulls right before endstream. | |
| 1617 | + char* p = ((last_line_end[0] == 'e') ? last_line_end : | |
| 1618 | + (last_line_end[1] == 'e') ? last_line_end + 1 : | |
| 1619 | + (last_line_end[2] == 'e') ? last_line_end + 2 : | |
| 1620 | + 0); | |
| 1621 | + char* endstream_p = 0; | |
| 1622 | + if (p) | |
| 1623 | + { | |
| 1624 | + char* p1 = strstr(p, "endstream\n"); | |
| 1625 | + char* p2 = strstr(p, "endstream\r"); | |
| 1626 | + endstream_p = (p1 ? p1 : p2); | |
| 1627 | + } | |
| 1628 | + if (endstream_p) | |
| 1629 | + { | |
| 1630 | + endstream_offset = | |
| 1631 | + search_offset + (endstream_p - last_line_end); | |
| 1632 | + } | |
| 1633 | + } | |
| 1634 | + if (endstream_offset > 0) | |
| 1635 | + { | |
| 1636 | + // Stream probably ends right before "endstream" | |
| 1637 | + length = endstream_offset - stream_offset; | |
| 1638 | + // Go back to where we would have been if we had just | |
| 1639 | + // read the endstream. | |
| 1640 | + input->seek(line_offset, SEEK_SET); | |
| 1641 | + break; | |
| 1642 | + } | |
| 1643 | + } | |
| 1644 | + last_line_offset = line_offset; | |
| 1552 | 1645 | } |
| 1553 | 1646 | |
| 1554 | 1647 | if (length) | ... | ... |