Commit 2266c6232b2ffd34c78c1924c1d7d3b932ea4a29

Authored by Jay Berkenbilt
1 parent 736bafbb

Rework InputSource::readLine to make it much more efficient

This rework makes xref reconstruction run much faster and use much
less memory.
include/qpdf/QPDF.hh
... ... @@ -433,8 +433,9 @@ class QPDF
433 433  
434 434 void setLastOffset(qpdf_offset_t);
435 435 qpdf_offset_t getLastOffset() const;
436   - std::string readLine();
  436 + std::string readLine(size_t max_line_length);
437 437  
  438 + virtual qpdf_offset_t findAndSkipNextEOL() = 0;
438 439 virtual std::string const& getName() const = 0;
439 440 virtual qpdf_offset_t tell() = 0;
440 441 virtual void seek(qpdf_offset_t offset, int whence) = 0;
... ... @@ -453,6 +454,7 @@ class QPDF
453 454 void setFilename(char const* filename);
454 455 void setFile(char const* description, FILE* filep, bool close_file);
455 456 virtual ~FileInputSource();
  457 + virtual qpdf_offset_t findAndSkipNextEOL();
456 458 virtual std::string const& getName() const;
457 459 virtual qpdf_offset_t tell();
458 460 virtual void seek(qpdf_offset_t offset, int whence);
... ... @@ -477,6 +479,7 @@ class QPDF
477 479 BufferInputSource(std::string const& description, Buffer* buf,
478 480 bool own_memory = false);
479 481 virtual ~BufferInputSource();
  482 + virtual qpdf_offset_t findAndSkipNextEOL();
480 483 virtual std::string const& getName() const;
481 484 virtual qpdf_offset_t tell();
482 485 virtual void seek(qpdf_offset_t offset, int whence);
... ...
libqpdf/QPDF.cc
... ... @@ -49,53 +49,29 @@ QPDF::InputSource::getLastOffset() const
49 49 }
50 50  
51 51 std::string
52   -QPDF::InputSource::readLine()
  52 +QPDF::InputSource::readLine(size_t max_line_length)
53 53 {
54   - // Read a line terminated by one or more \r or \n characters
55   - // without caring what the exact terminator is. Consume the
56   - // trailing newline characters but don't return them.
  54 + // Return at most max_line_length characters from the next line.
  55 + // Lines are terminated by one or more \r or \n characters.
  56 + // Consume the trailing newline characters but don't return them.
  57 + // After this is called, the file will be positioned after a line
  58 + // terminator or at the end of the file, and last_offset will
  59 + // point to position the file had when this method was called.
57 60  
58 61 qpdf_offset_t offset = this->tell();
59   - std::string buf;
60   - enum { st_before_nl, st_at_nl } state = st_before_nl;
61   - char ch;
62   - while (1)
  62 + char* buf = new char[max_line_length + 1];
  63 + PointerHolder<char> bp(true, buf);
  64 + memset(buf, '\0', max_line_length + 1);
  65 + this->read(buf, max_line_length);
  66 + this->seek(offset, SEEK_SET);
  67 + qpdf_offset_t eol = this->findAndSkipNextEOL();
  68 + this->last_offset = offset;
  69 + size_t line_length = eol - offset;
  70 + if (line_length < max_line_length)
63 71 {
64   - size_t len = this->read(&ch, 1);
65   - if (len == 0)
66   - {
67   - break;
68   - }
69   -
70   - if (state == st_before_nl)
71   - {
72   - if ((ch == '\012') || (ch == '\015'))
73   - {
74   - state = st_at_nl;
75   - }
76   - else
77   - {
78   - buf += ch;
79   - }
80   - }
81   - else if (state == st_at_nl)
82   - {
83   - if ((ch == '\012') || (ch == '\015'))
84   - {
85   - // do nothing
86   - }
87   - else
88   - {
89   - // unread this character
90   - this->unreadCh(ch);
91   - break;
92   - }
93   - }
  72 + buf[line_length] = '\0';
94 73 }
95   - // Override last offset to be where we started this line rather
96   - // than before the last character read
97   - this->last_offset = offset;
98   - return buf;
  74 + return std::string(buf);
99 75 }
100 76  
101 77 QPDF::FileInputSource::FileInputSource() :
... ... @@ -140,6 +116,51 @@ QPDF::FileInputSource::destroy()
140 116 }
141 117 }
142 118  
  119 +qpdf_offset_t
  120 +QPDF::FileInputSource::findAndSkipNextEOL()
  121 +{
  122 + qpdf_offset_t result = 0;
  123 + bool done = false;
  124 + char buf[10240];
  125 + while (! done)
  126 + {
  127 + qpdf_offset_t cur_offset = QUtil::tell(this->file);
  128 + size_t len = this->read(buf, sizeof(buf));
  129 + if (len == 0)
  130 + {
  131 + done = true;
  132 + result = this->tell();
  133 + }
  134 + else
  135 + {
  136 + char* p1 = (char*)memchr((void*)buf, '\r', len);
  137 + char* p2 = (char*)memchr((void*)buf, '\n', len);
  138 + char* p = (p1 && p2) ? std::min(p1, p2) : p1 ? p1 : p2;
  139 + if (p)
  140 + {
  141 + result = cur_offset + (p - buf);
  142 + // We found \r or \n. Keep reading until we get past
  143 + // \r and \n characters.
  144 + this->seek(result + 1, SEEK_SET);
  145 + char ch;
  146 + while (! done)
  147 + {
  148 + if (this->read(&ch, 1) == 0)
  149 + {
  150 + done = true;
  151 + }
  152 + else if (! ((ch == '\r') || (ch == '\n')))
  153 + {
  154 + this->unreadCh(ch);
  155 + done = true;
  156 + }
  157 + }
  158 + }
  159 + }
  160 + }
  161 + return result;
  162 +}
  163 +
143 164 std::string const&
144 165 QPDF::FileInputSource::getName() const
145 166 {
... ... @@ -207,6 +228,45 @@ QPDF::BufferInputSource::~BufferInputSource()
207 228 }
208 229 }
209 230  
  231 +qpdf_offset_t
  232 +QPDF::BufferInputSource::findAndSkipNextEOL()
  233 +{
  234 + qpdf_offset_t end_pos = (qpdf_offset_t) this->buf->getSize();
  235 + if (this->cur_offset >= end_pos)
  236 + {
  237 + this->last_offset = end_pos;
  238 + this->cur_offset = end_pos;
  239 + return end_pos;
  240 + }
  241 +
  242 + qpdf_offset_t result = 0;
  243 + size_t len = (size_t)(end_pos - this->cur_offset);
  244 + unsigned char const* buffer = this->buf->getBuffer();
  245 +
  246 + void* start = (void*)(buffer + this->cur_offset);
  247 + unsigned char* p1 = (unsigned char*)memchr(start, '\r', len);
  248 + unsigned char* p2 = (unsigned char*)memchr(start, '\n', len);
  249 + unsigned char* p = (p1 && p2) ? std::min(p1, p2) : p1 ? p1 : p2;
  250 + if (p)
  251 + {
  252 + result = p - buffer;
  253 + this->cur_offset = result + 1;
  254 + ++p;
  255 + while ((this->cur_offset < end_pos) &&
  256 + ((*p == '\r') || (*p == '\n')))
  257 + {
  258 + ++p;
  259 + ++this->cur_offset;
  260 + }
  261 + }
  262 + else
  263 + {
  264 + this->cur_offset = end_pos;
  265 + result = end_pos;
  266 + }
  267 + return result;
  268 +}
  269 +
210 270 std::string const&
211 271 QPDF::BufferInputSource::getName() const
212 272 {
... ... @@ -420,7 +480,7 @@ QPDF::parse(char const* password)
420 480 this->provided_password = password;
421 481 }
422 482  
423   - std::string line = this->file->readLine();
  483 + std::string line = this->file->readLine(20);
424 484 PCRE::Match m1 = header_re.match(line.c_str());
425 485 if (m1)
426 486 {
... ... @@ -556,7 +616,7 @@ QPDF::reconstruct_xref(QPDFExc& e)
556 616 bool in_obj = false;
557 617 while (this->file->tell() < eof)
558 618 {
559   - std::string line = this->file->readLine();
  619 + std::string line = this->file->readLine(50);
560 620 if (in_obj)
561 621 {
562 622 if (endobj_re.match(line.c_str()))
... ... @@ -624,7 +684,7 @@ QPDF::read_xref(qpdf_offset_t xref_offset)
624 684 while (xref_offset)
625 685 {
626 686 this->file->seek(xref_offset, SEEK_SET);
627   - std::string line = this->file->readLine();
  687 + std::string line = this->file->readLine(50);
628 688 if (line == "xref")
629 689 {
630 690 xref_offset = read_xrefTable(this->file->tell());
... ... @@ -677,7 +737,7 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset)
677 737 bool done = false;
678 738 while (! done)
679 739 {
680   - std::string line = this->file->readLine();
  740 + std::string line = this->file->readLine(50);
681 741 PCRE::Match m1 = xref_first_re.match(line.c_str());
682 742 if (! m1)
683 743 {
... ... @@ -1528,27 +1588,60 @@ QPDF::recoverStreamLength(PointerHolder<InputSource> input,
1528 1588 input->seek(0, SEEK_END);
1529 1589 qpdf_offset_t eof = input->tell();
1530 1590 input->seek(stream_offset, SEEK_SET);
1531   - std::string last_line;
1532 1591 qpdf_offset_t last_line_offset = 0;
1533 1592 size_t length = 0;
  1593 + static int const line_end_length = 12; // room for endstream\r\n\0
  1594 + char last_line_end[line_end_length];
1534 1595 while (input->tell() < eof)
1535 1596 {
1536   - std::string line = input->readLine();
1537   - // Can't use regexp last_line since it might contain nulls
1538   - if (endobj_re.match(line.c_str()) &&
1539   - (last_line.length() >= 9) &&
1540   - (last_line.substr(last_line.length() - 9, 9) == "endstream"))
1541   - {
1542   - // Stream probably ends right before "endstream", which
1543   - // contains 9 characters.
1544   - length = last_line_offset + last_line.length() - 9 - stream_offset;
1545   - // Go back to where we would have been if we had just read
1546   - // the endstream.
1547   - input->seek(input->getLastOffset(), SEEK_SET);
1548   - break;
1549   - }
1550   - last_line = line;
1551   - last_line_offset = input->getLastOffset();
  1597 + std::string line = input->readLine(50);
  1598 + qpdf_offset_t line_offset = input->getLastOffset();
  1599 + if (endobj_re.match(line.c_str()))
  1600 + {
  1601 + qpdf_offset_t endstream_offset = 0;
  1602 + if (last_line_offset >= line_end_length)
  1603 + {
  1604 + qpdf_offset_t cur_offset = input->tell();
  1605 + // Read from the end of the last line, guaranteeing
  1606 + // null termination
  1607 + qpdf_offset_t search_offset =
  1608 + line_offset - (line_end_length - 1);
  1609 + input->seek(search_offset, SEEK_SET);
  1610 + memset(last_line_end, '\0', line_end_length);
  1611 + input->read(last_line_end, line_end_length - 1);
  1612 + input->seek(cur_offset, SEEK_SET);
  1613 + // if endstream[\r\n] will fit in last_line_end, the
  1614 + // 'e' has to be in one of the first three spots.
  1615 + // Check explicitly rather than using strstr directly
  1616 + // in case there are nulls right before endstream.
  1617 + char* p = ((last_line_end[0] == 'e') ? last_line_end :
  1618 + (last_line_end[1] == 'e') ? last_line_end + 1 :
  1619 + (last_line_end[2] == 'e') ? last_line_end + 2 :
  1620 + 0);
  1621 + char* endstream_p = 0;
  1622 + if (p)
  1623 + {
  1624 + char* p1 = strstr(p, "endstream\n");
  1625 + char* p2 = strstr(p, "endstream\r");
  1626 + endstream_p = (p1 ? p1 : p2);
  1627 + }
  1628 + if (endstream_p)
  1629 + {
  1630 + endstream_offset =
  1631 + search_offset + (endstream_p - last_line_end);
  1632 + }
  1633 + }
  1634 + if (endstream_offset > 0)
  1635 + {
  1636 + // Stream probably ends right before "endstream"
  1637 + length = endstream_offset - stream_offset;
  1638 + // Go back to where we would have been if we had just
  1639 + // read the endstream.
  1640 + input->seek(line_offset, SEEK_SET);
  1641 + break;
  1642 + }
  1643 + }
  1644 + last_line_offset = line_offset;
1552 1645 }
1553 1646  
1554 1647 if (length)
... ...