Commit 03aa9679ac16be44348f29a97c2c36145ae5a35a

Authored by Jay Berkenbilt
1 parent 1765c6ec

Find startxref without PCRE

include/qpdf/QPDF.hh
@@ -1029,6 +1029,7 @@ class QPDF @@ -1029,6 +1029,7 @@ class QPDF
1029 1029
1030 // Methods to support pattern finding 1030 // Methods to support pattern finding
1031 bool findHeader(); 1031 bool findHeader();
  1032 + bool findStartxref();
1032 1033
1033 // methods to support linearization checking -- implemented in 1034 // methods to support linearization checking -- implemented in
1034 // QPDF_linearization.cc 1035 // QPDF_linearization.cc
libqpdf/QPDF.cc
@@ -254,11 +254,26 @@ QPDF::findHeader() @@ -254,11 +254,26 @@ QPDF::findHeader()
254 return valid; 254 return valid;
255 } 255 }
256 256
  257 +bool
  258 +QPDF::findStartxref()
  259 +{
  260 + QPDFTokenizer::Token t = readToken(this->file, true);
  261 + if (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "startxref"))
  262 + {
  263 + t = readToken(this->file, true);
  264 + if (t.getType() == QPDFTokenizer::tt_integer)
  265 + {
  266 + // Position in front of offset token
  267 + this->file->seek(this->file->getLastOffset(), SEEK_SET);
  268 + return true;
  269 + }
  270 + }
  271 + return false;
  272 +}
  273 +
257 void 274 void
258 QPDF::parse(char const* password) 275 QPDF::parse(char const* password)
259 { 276 {
260 - PCRE eof_re("(?s:startxref\\s+(\\d+)\\s+%%EOF\\b)");  
261 -  
262 if (password) 277 if (password)
263 { 278 {
264 this->provided_password = password; 279 this->provided_password = password;
@@ -283,47 +298,25 @@ QPDF::parse(char const* password) @@ -283,47 +298,25 @@ QPDF::parse(char const* password)
283 // PDF spec says %%EOF must be found within the last 1024 bytes of 298 // PDF spec says %%EOF must be found within the last 1024 bytes of
284 // the file. We add an extra 30 characters to leave room for the 299 // the file. We add an extra 30 characters to leave room for the
285 // startxref stuff. 300 // startxref stuff.
286 - static int const tbuf_size = 1054;  
287 this->file->seek(0, SEEK_END); 301 this->file->seek(0, SEEK_END);
288 - if (this->file->tell() > tbuf_size) 302 + qpdf_offset_t end_offset = this->file->tell();
  303 + qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
  304 + PatternFinder sf(*this, &QPDF::findStartxref);
  305 + qpdf_offset_t xref_offset = 0;
  306 + if (this->file->findLast("startxref", start_offset, 0, sf))
289 { 307 {
290 - this->file->seek(-tbuf_size, SEEK_END);  
291 - }  
292 - else  
293 - {  
294 - this->file->rewind();  
295 - }  
296 - char* buf = new char[tbuf_size + 1];  
297 - // Put buf in an array-style PointerHolder to guarantee deletion  
298 - // of buf.  
299 - PointerHolder<char> b(true, buf);  
300 - memset(buf, '\0', tbuf_size + 1);  
301 - this->file->read(buf, tbuf_size);  
302 -  
303 - // Since buf may contain null characters, we can't do a regexp  
304 - // search on buf directly. Find the last occurrence within buf  
305 - // where the regexp matches.  
306 - char* p = buf;  
307 - char const* candidate = "";  
308 - while ((p = static_cast<char*>(memchr(p, 's', tbuf_size - (p - buf)))) != 0)  
309 - {  
310 - if (eof_re.match(p))  
311 - {  
312 - candidate = p;  
313 - }  
314 - ++p; 308 + xref_offset = QUtil::string_to_ll(
  309 + readToken(this->file).getValue().c_str());
315 } 310 }
316 311
317 try 312 try
318 { 313 {
319 - PCRE::Match m2 = eof_re.match(candidate);  
320 - if (! m2) 314 + if (xref_offset == 0)
321 { 315 {
322 QTC::TC("qpdf", "QPDF can't find startxref"); 316 QTC::TC("qpdf", "QPDF can't find startxref");
323 throw QPDFExc(qpdf_e_damaged_pdf, this->file->getName(), "", 0, 317 throw QPDFExc(qpdf_e_damaged_pdf, this->file->getName(), "", 0,
324 "can't find startxref"); 318 "can't find startxref");
325 } 319 }
326 - qpdf_offset_t xref_offset = QUtil::string_to_ll(m2.getMatch(1).c_str());  
327 read_xref(xref_offset); 320 read_xref(xref_offset);
328 } 321 }
329 catch (QPDFExc& e) 322 catch (QPDFExc& e)
libqpdf/QPDFTokenizer.cc
@@ -521,7 +521,7 @@ QPDFTokenizer::readToken(PointerHolder<InputSource> input, @@ -521,7 +521,7 @@ QPDFTokenizer::readToken(PointerHolder<InputSource> input,
521 { 521 {
522 if (allow_bad) 522 if (allow_bad)
523 { 523 {
524 -// QTC::TC("qpdf", "QPDFTokenizer allowing bad token"); 524 + QTC::TC("qpdf", "QPDFTokenizer allowing bad token");
525 } 525 }
526 else 526 else
527 { 527 {
qpdf/qpdf.testcov
@@ -290,3 +290,4 @@ qpdf read args from file 0 @@ -290,3 +290,4 @@ qpdf read args from file 0
290 qpdf single-pages %d 0 290 qpdf single-pages %d 0
291 qpdf single-pages .pdf 0 291 qpdf single-pages .pdf 0
292 qpdf single-pages other 0 292 qpdf single-pages other 0
  293 +QPDFTokenizer allowing bad token 0
qpdf/qtest/qpdf/issue-117.out
1 WARNING: issue-117.pdf: file is damaged 1 WARNING: issue-117.pdf: file is damaged
2 -WARNING: issue-117.pdf: can't find startxref 2 +WARNING: issue-117.pdf (file position 3526): xref not found
3 WARNING: issue-117.pdf: Attempting to reconstruct cross-reference table 3 WARNING: issue-117.pdf: Attempting to reconstruct cross-reference table
4 WARNING: issue-117.pdf (file position 66): loop detected resolving object 2 0 4 WARNING: issue-117.pdf (file position 66): loop detected resolving object 2 0
5 WARNING: issue-117.pdf (object 2 0, file position 22): /Length key in stream dictionary is not an integer 5 WARNING: issue-117.pdf (object 2 0, file position 22): /Length key in stream dictionary is not an integer