Commit 2266c6232b2ffd34c78c1924c1d7d3b932ea4a29

Authored by Jay Berkenbilt
1 parent 736bafbb

Rework InputSource::readLine to make it much more efficient

This rework makes xref reconstruction run much faster and use much
less memory.
include/qpdf/QPDF.hh
@@ -433,8 +433,9 @@ class QPDF @@ -433,8 +433,9 @@ class QPDF
433 433
434 void setLastOffset(qpdf_offset_t); 434 void setLastOffset(qpdf_offset_t);
435 qpdf_offset_t getLastOffset() const; 435 qpdf_offset_t getLastOffset() const;
436 - std::string readLine(); 436 + std::string readLine(size_t max_line_length);
437 437
  438 + virtual qpdf_offset_t findAndSkipNextEOL() = 0;
438 virtual std::string const& getName() const = 0; 439 virtual std::string const& getName() const = 0;
439 virtual qpdf_offset_t tell() = 0; 440 virtual qpdf_offset_t tell() = 0;
440 virtual void seek(qpdf_offset_t offset, int whence) = 0; 441 virtual void seek(qpdf_offset_t offset, int whence) = 0;
@@ -453,6 +454,7 @@ class QPDF @@ -453,6 +454,7 @@ class QPDF
453 void setFilename(char const* filename); 454 void setFilename(char const* filename);
454 void setFile(char const* description, FILE* filep, bool close_file); 455 void setFile(char const* description, FILE* filep, bool close_file);
455 virtual ~FileInputSource(); 456 virtual ~FileInputSource();
  457 + virtual qpdf_offset_t findAndSkipNextEOL();
456 virtual std::string const& getName() const; 458 virtual std::string const& getName() const;
457 virtual qpdf_offset_t tell(); 459 virtual qpdf_offset_t tell();
458 virtual void seek(qpdf_offset_t offset, int whence); 460 virtual void seek(qpdf_offset_t offset, int whence);
@@ -477,6 +479,7 @@ class QPDF @@ -477,6 +479,7 @@ class QPDF
477 BufferInputSource(std::string const& description, Buffer* buf, 479 BufferInputSource(std::string const& description, Buffer* buf,
478 bool own_memory = false); 480 bool own_memory = false);
479 virtual ~BufferInputSource(); 481 virtual ~BufferInputSource();
  482 + virtual qpdf_offset_t findAndSkipNextEOL();
480 virtual std::string const& getName() const; 483 virtual std::string const& getName() const;
481 virtual qpdf_offset_t tell(); 484 virtual qpdf_offset_t tell();
482 virtual void seek(qpdf_offset_t offset, int whence); 485 virtual void seek(qpdf_offset_t offset, int whence);
libqpdf/QPDF.cc
@@ -49,53 +49,29 @@ QPDF::InputSource::getLastOffset() const @@ -49,53 +49,29 @@ QPDF::InputSource::getLastOffset() const
49 } 49 }
50 50
51 std::string 51 std::string
52 -QPDF::InputSource::readLine() 52 +QPDF::InputSource::readLine(size_t max_line_length)
53 { 53 {
54 - // Read a line terminated by one or more \r or \n characters  
55 - // without caring what the exact terminator is. Consume the  
56 - // trailing newline characters but don't return them. 54 + // Return at most max_line_length characters from the next line.
  55 + // Lines are terminated by one or more \r or \n characters.
  56 + // Consume the trailing newline characters but don't return them.
  57 + // After this is called, the file will be positioned after a line
  58 + // terminator or at the end of the file, and last_offset will
  59 + // point to position the file had when this method was called.
57 60
58 qpdf_offset_t offset = this->tell(); 61 qpdf_offset_t offset = this->tell();
59 - std::string buf;  
60 - enum { st_before_nl, st_at_nl } state = st_before_nl;  
61 - char ch;  
62 - while (1) 62 + char* buf = new char[max_line_length + 1];
  63 + PointerHolder<char> bp(true, buf);
  64 + memset(buf, '\0', max_line_length + 1);
  65 + this->read(buf, max_line_length);
  66 + this->seek(offset, SEEK_SET);
  67 + qpdf_offset_t eol = this->findAndSkipNextEOL();
  68 + this->last_offset = offset;
  69 + size_t line_length = eol - offset;
  70 + if (line_length < max_line_length)
63 { 71 {
64 - size_t len = this->read(&ch, 1);  
65 - if (len == 0)  
66 - {  
67 - break;  
68 - }  
69 -  
70 - if (state == st_before_nl)  
71 - {  
72 - if ((ch == '\012') || (ch == '\015'))  
73 - {  
74 - state = st_at_nl;  
75 - }  
76 - else  
77 - {  
78 - buf += ch;  
79 - }  
80 - }  
81 - else if (state == st_at_nl)  
82 - {  
83 - if ((ch == '\012') || (ch == '\015'))  
84 - {  
85 - // do nothing  
86 - }  
87 - else  
88 - {  
89 - // unread this character  
90 - this->unreadCh(ch);  
91 - break;  
92 - }  
93 - } 72 + buf[line_length] = '\0';
94 } 73 }
95 - // Override last offset to be where we started this line rather  
96 - // than before the last character read  
97 - this->last_offset = offset;  
98 - return buf; 74 + return std::string(buf);
99 } 75 }
100 76
101 QPDF::FileInputSource::FileInputSource() : 77 QPDF::FileInputSource::FileInputSource() :
@@ -140,6 +116,51 @@ QPDF::FileInputSource::destroy() @@ -140,6 +116,51 @@ QPDF::FileInputSource::destroy()
140 } 116 }
141 } 117 }
142 118
  119 +qpdf_offset_t
  120 +QPDF::FileInputSource::findAndSkipNextEOL()
  121 +{
  122 + qpdf_offset_t result = 0;
  123 + bool done = false;
  124 + char buf[10240];
  125 + while (! done)
  126 + {
  127 + qpdf_offset_t cur_offset = QUtil::tell(this->file);
  128 + size_t len = this->read(buf, sizeof(buf));
  129 + if (len == 0)
  130 + {
  131 + done = true;
  132 + result = this->tell();
  133 + }
  134 + else
  135 + {
  136 + char* p1 = (char*)memchr((void*)buf, '\r', len);
  137 + char* p2 = (char*)memchr((void*)buf, '\n', len);
  138 + char* p = (p1 && p2) ? std::min(p1, p2) : p1 ? p1 : p2;
  139 + if (p)
  140 + {
  141 + result = cur_offset + (p - buf);
  142 + // We found \r or \n. Keep reading until we get past
  143 + // \r and \n characters.
  144 + this->seek(result + 1, SEEK_SET);
  145 + char ch;
  146 + while (! done)
  147 + {
  148 + if (this->read(&ch, 1) == 0)
  149 + {
  150 + done = true;
  151 + }
  152 + else if (! ((ch == '\r') || (ch == '\n')))
  153 + {
  154 + this->unreadCh(ch);
  155 + done = true;
  156 + }
  157 + }
  158 + }
  159 + }
  160 + }
  161 + return result;
  162 +}
  163 +
143 std::string const& 164 std::string const&
144 QPDF::FileInputSource::getName() const 165 QPDF::FileInputSource::getName() const
145 { 166 {
@@ -207,6 +228,45 @@ QPDF::BufferInputSource::~BufferInputSource() @@ -207,6 +228,45 @@ QPDF::BufferInputSource::~BufferInputSource()
207 } 228 }
208 } 229 }
209 230
  231 +qpdf_offset_t
  232 +QPDF::BufferInputSource::findAndSkipNextEOL()
  233 +{
  234 + qpdf_offset_t end_pos = (qpdf_offset_t) this->buf->getSize();
  235 + if (this->cur_offset >= end_pos)
  236 + {
  237 + this->last_offset = end_pos;
  238 + this->cur_offset = end_pos;
  239 + return end_pos;
  240 + }
  241 +
  242 + qpdf_offset_t result = 0;
  243 + size_t len = (size_t)(end_pos - this->cur_offset);
  244 + unsigned char const* buffer = this->buf->getBuffer();
  245 +
  246 + void* start = (void*)(buffer + this->cur_offset);
  247 + unsigned char* p1 = (unsigned char*)memchr(start, '\r', len);
  248 + unsigned char* p2 = (unsigned char*)memchr(start, '\n', len);
  249 + unsigned char* p = (p1 && p2) ? std::min(p1, p2) : p1 ? p1 : p2;
  250 + if (p)
  251 + {
  252 + result = p - buffer;
  253 + this->cur_offset = result + 1;
  254 + ++p;
  255 + while ((this->cur_offset < end_pos) &&
  256 + ((*p == '\r') || (*p == '\n')))
  257 + {
  258 + ++p;
  259 + ++this->cur_offset;
  260 + }
  261 + }
  262 + else
  263 + {
  264 + this->cur_offset = end_pos;
  265 + result = end_pos;
  266 + }
  267 + return result;
  268 +}
  269 +
210 std::string const& 270 std::string const&
211 QPDF::BufferInputSource::getName() const 271 QPDF::BufferInputSource::getName() const
212 { 272 {
@@ -420,7 +480,7 @@ QPDF::parse(char const* password) @@ -420,7 +480,7 @@ QPDF::parse(char const* password)
420 this->provided_password = password; 480 this->provided_password = password;
421 } 481 }
422 482
423 - std::string line = this->file->readLine(); 483 + std::string line = this->file->readLine(20);
424 PCRE::Match m1 = header_re.match(line.c_str()); 484 PCRE::Match m1 = header_re.match(line.c_str());
425 if (m1) 485 if (m1)
426 { 486 {
@@ -556,7 +616,7 @@ QPDF::reconstruct_xref(QPDFExc& e) @@ -556,7 +616,7 @@ QPDF::reconstruct_xref(QPDFExc& e)
556 bool in_obj = false; 616 bool in_obj = false;
557 while (this->file->tell() < eof) 617 while (this->file->tell() < eof)
558 { 618 {
559 - std::string line = this->file->readLine(); 619 + std::string line = this->file->readLine(50);
560 if (in_obj) 620 if (in_obj)
561 { 621 {
562 if (endobj_re.match(line.c_str())) 622 if (endobj_re.match(line.c_str()))
@@ -624,7 +684,7 @@ QPDF::read_xref(qpdf_offset_t xref_offset) @@ -624,7 +684,7 @@ QPDF::read_xref(qpdf_offset_t xref_offset)
624 while (xref_offset) 684 while (xref_offset)
625 { 685 {
626 this->file->seek(xref_offset, SEEK_SET); 686 this->file->seek(xref_offset, SEEK_SET);
627 - std::string line = this->file->readLine(); 687 + std::string line = this->file->readLine(50);
628 if (line == "xref") 688 if (line == "xref")
629 { 689 {
630 xref_offset = read_xrefTable(this->file->tell()); 690 xref_offset = read_xrefTable(this->file->tell());
@@ -677,7 +737,7 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset) @@ -677,7 +737,7 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset)
677 bool done = false; 737 bool done = false;
678 while (! done) 738 while (! done)
679 { 739 {
680 - std::string line = this->file->readLine(); 740 + std::string line = this->file->readLine(50);
681 PCRE::Match m1 = xref_first_re.match(line.c_str()); 741 PCRE::Match m1 = xref_first_re.match(line.c_str());
682 if (! m1) 742 if (! m1)
683 { 743 {
@@ -1528,27 +1588,60 @@ QPDF::recoverStreamLength(PointerHolder<InputSource> input, @@ -1528,27 +1588,60 @@ QPDF::recoverStreamLength(PointerHolder<InputSource> input,
1528 input->seek(0, SEEK_END); 1588 input->seek(0, SEEK_END);
1529 qpdf_offset_t eof = input->tell(); 1589 qpdf_offset_t eof = input->tell();
1530 input->seek(stream_offset, SEEK_SET); 1590 input->seek(stream_offset, SEEK_SET);
1531 - std::string last_line;  
1532 qpdf_offset_t last_line_offset = 0; 1591 qpdf_offset_t last_line_offset = 0;
1533 size_t length = 0; 1592 size_t length = 0;
  1593 + static int const line_end_length = 12; // room for endstream\r\n\0
  1594 + char last_line_end[line_end_length];
1534 while (input->tell() < eof) 1595 while (input->tell() < eof)
1535 { 1596 {
1536 - std::string line = input->readLine();  
1537 - // Can't use regexp last_line since it might contain nulls  
1538 - if (endobj_re.match(line.c_str()) &&  
1539 - (last_line.length() >= 9) &&  
1540 - (last_line.substr(last_line.length() - 9, 9) == "endstream"))  
1541 - {  
1542 - // Stream probably ends right before "endstream", which  
1543 - // contains 9 characters.  
1544 - length = last_line_offset + last_line.length() - 9 - stream_offset;  
1545 - // Go back to where we would have been if we had just read  
1546 - // the endstream.  
1547 - input->seek(input->getLastOffset(), SEEK_SET);  
1548 - break;  
1549 - }  
1550 - last_line = line;  
1551 - last_line_offset = input->getLastOffset(); 1597 + std::string line = input->readLine(50);
  1598 + qpdf_offset_t line_offset = input->getLastOffset();
  1599 + if (endobj_re.match(line.c_str()))
  1600 + {
  1601 + qpdf_offset_t endstream_offset = 0;
  1602 + if (last_line_offset >= line_end_length)
  1603 + {
  1604 + qpdf_offset_t cur_offset = input->tell();
  1605 + // Read from the end of the last line, guaranteeing
  1606 + // null termination
  1607 + qpdf_offset_t search_offset =
  1608 + line_offset - (line_end_length - 1);
  1609 + input->seek(search_offset, SEEK_SET);
  1610 + memset(last_line_end, '\0', line_end_length);
  1611 + input->read(last_line_end, line_end_length - 1);
  1612 + input->seek(cur_offset, SEEK_SET);
  1613 + // if endstream[\r\n] will fit in last_line_end, the
  1614 + // 'e' has to be in one of the first three spots.
  1615 + // Check explicitly rather than using strstr directly
  1616 + // in case there are nulls right before endstream.
  1617 + char* p = ((last_line_end[0] == 'e') ? last_line_end :
  1618 + (last_line_end[1] == 'e') ? last_line_end + 1 :
  1619 + (last_line_end[2] == 'e') ? last_line_end + 2 :
  1620 + 0);
  1621 + char* endstream_p = 0;
  1622 + if (p)
  1623 + {
  1624 + char* p1 = strstr(p, "endstream\n");
  1625 + char* p2 = strstr(p, "endstream\r");
  1626 + endstream_p = (p1 ? p1 : p2);
  1627 + }
  1628 + if (endstream_p)
  1629 + {
  1630 + endstream_offset =
  1631 + search_offset + (endstream_p - last_line_end);
  1632 + }
  1633 + }
  1634 + if (endstream_offset > 0)
  1635 + {
  1636 + // Stream probably ends right before "endstream"
  1637 + length = endstream_offset - stream_offset;
  1638 + // Go back to where we would have been if we had just
  1639 + // read the endstream.
  1640 + input->seek(line_offset, SEEK_SET);
  1641 + break;
  1642 + }
  1643 + }
  1644 + last_line_offset = line_offset;
1552 } 1645 }
1553 1646
1554 if (length) 1647 if (length)