Commit ca5b1d267ab77947cabebce0124a49480d4143b8

Authored by Jay Berkenbilt
1 parent 3082e4e6

Improve stream length recovery

Eliminate PCRE and find endobj not preceded by endstream. Be more lax
about placement of endstream and endobj.
ChangeLog
@@ -11,6 +11,12 @@ @@ -11,6 +11,12 @@
11 the (bool, T*) version of the constructor instead. If not, just 11 the (bool, T*) version of the constructor instead. If not, just
12 remove the second parameter. 12 remove the second parameter.
13 13
  14 +2017-08-09 Jay Berkenbilt <ejb@ql.org>
  15 +
  16 + * When recovering stream length, find endobj without endstream as
  17 + well as just looking for endstream. Be a little more lax about
  18 + where we allow it to be found.
  19 +
14 2017-08-05 Jay Berkenbilt <ejb@ql.org> 20 2017-08-05 Jay Berkenbilt <ejb@ql.org>
15 21
16 * Add --single-pages option to cause output to be written to a 22 * Add --single-pages option to cause output to be written to a
include/qpdf/QPDF.hh
@@ -1030,6 +1030,7 @@ class QPDF @@ -1030,6 +1030,7 @@ class QPDF
1030 // Methods to support pattern finding 1030 // Methods to support pattern finding
1031 bool findHeader(); 1031 bool findHeader();
1032 bool findStartxref(); 1032 bool findStartxref();
  1033 + bool findEndstream();
1033 1034
1034 // methods to support linearization checking -- implemented in 1035 // methods to support linearization checking -- implemented in
1035 // QPDF_linearization.cc 1036 // QPDF_linearization.cc
libqpdf/QPDF.cc
@@ -1231,76 +1231,43 @@ QPDF::readObject(PointerHolder<InputSource> input, @@ -1231,76 +1231,43 @@ QPDF::readObject(PointerHolder<InputSource> input,
1231 return object; 1231 return object;
1232 } 1232 }
1233 1233
  1234 +bool
  1235 +QPDF::findEndstream()
  1236 +{
  1237 + // Find endstream or endobj. Position the input at that token.
  1238 + QPDFTokenizer::Token t = readToken(this->file, true);
  1239 + if ((t.getType() == QPDFTokenizer::tt_word) &&
  1240 + ((t.getValue() == "endobj") ||
  1241 + (t.getValue() == "endstream")));
  1242 + {
  1243 + this->file->seek(this->file->getLastOffset(), SEEK_SET);
  1244 + return true;
  1245 + }
  1246 + return false;
  1247 +}
  1248 +
1234 size_t 1249 size_t
1235 QPDF::recoverStreamLength(PointerHolder<InputSource> input, 1250 QPDF::recoverStreamLength(PointerHolder<InputSource> input,
1236 int objid, int generation, 1251 int objid, int generation,
1237 qpdf_offset_t stream_offset) 1252 qpdf_offset_t stream_offset)
1238 { 1253 {
1239 - PCRE endobj_re("^\\s*endobj\\b");  
1240 -  
1241 // Try to reconstruct stream length by looking for 1254 // Try to reconstruct stream length by looking for
1242 - // endstream(\r\n?|\n)endobj 1255 + // endstream or endobj
1243 warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(), 1256 warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(),
1244 this->last_object_description, stream_offset, 1257 this->last_object_description, stream_offset,
1245 "attempting to recover stream length")); 1258 "attempting to recover stream length"));
1246 1259
1247 - input->seek(0, SEEK_END);  
1248 - qpdf_offset_t eof = input->tell();  
1249 - input->seek(stream_offset, SEEK_SET);  
1250 - qpdf_offset_t last_line_offset = 0; 1260 + PatternFinder ef(*this, &QPDF::findEndstream);
1251 size_t length = 0; 1261 size_t length = 0;
1252 - static int const line_end_length = 12; // room for endstream\r\n\0  
1253 - char last_line_end[line_end_length];  
1254 - while (input->tell() < eof) 1262 + if (this->file->findFirst("end", stream_offset, 0, ef))
1255 { 1263 {
1256 - std::string line = input->readLine(50);  
1257 - qpdf_offset_t line_offset = input->getLastOffset();  
1258 - if (endobj_re.match(line.c_str())) 1264 + length = this->file->tell() - stream_offset;
  1265 + // Reread endstream but, if it was endobj, don't skip that.
  1266 + QPDFTokenizer::Token t = readToken(this->file);
  1267 + if (t.getValue() == "endobj")
1259 { 1268 {
1260 - qpdf_offset_t endstream_offset = 0;  
1261 - if (last_line_offset >= line_end_length)  
1262 - {  
1263 - qpdf_offset_t cur_offset = input->tell();  
1264 - // Read from the end of the last line, guaranteeing  
1265 - // null termination  
1266 - qpdf_offset_t search_offset =  
1267 - line_offset - (line_end_length - 1);  
1268 - input->seek(search_offset, SEEK_SET);  
1269 - memset(last_line_end, '\0', line_end_length);  
1270 - input->read(last_line_end, line_end_length - 1);  
1271 - input->seek(cur_offset, SEEK_SET);  
1272 - // if endstream[\r\n] will fit in last_line_end, the  
1273 - // 'e' has to be in one of the first three spots.  
1274 - // Check explicitly rather than using strstr directly  
1275 - // in case there are nulls right before endstream.  
1276 - char* p = ((last_line_end[0] == 'e') ? last_line_end :  
1277 - (last_line_end[1] == 'e') ? last_line_end + 1 :  
1278 - (last_line_end[2] == 'e') ? last_line_end + 2 :  
1279 - 0);  
1280 - char* endstream_p = 0;  
1281 - if (p)  
1282 - {  
1283 - char* p1 = strstr(p, "endstream\n");  
1284 - char* p2 = strstr(p, "endstream\r");  
1285 - endstream_p = (p1 ? p1 : p2);  
1286 - }  
1287 - if (endstream_p)  
1288 - {  
1289 - endstream_offset =  
1290 - search_offset + (endstream_p - last_line_end);  
1291 - }  
1292 - }  
1293 - if (endstream_offset > 0)  
1294 - {  
1295 - // Stream probably ends right before "endstream"  
1296 - length = endstream_offset - stream_offset;  
1297 - // Go back to where we would have been if we had just  
1298 - // read the endstream.  
1299 - input->seek(line_offset, SEEK_SET);  
1300 - break;  
1301 - }  
1302 - }  
1303 - last_line_offset = line_offset; 1269 + this->file->seek(this->file->getLastOffset(), SEEK_SET);
  1270 + }
1304 } 1271 }
1305 1272
1306 if (length) 1273 if (length)
qpdf/qtest/qpdf/bad24-recover.out
1 WARNING: bad24.pdf (object 4 0, file position 385): expected endstream 1 WARNING: bad24.pdf (object 4 0, file position 385): expected endstream
2 WARNING: bad24.pdf (object 4 0, file position 341): attempting to recover stream length 2 WARNING: bad24.pdf (object 4 0, file position 341): attempting to recover stream length
3 -WARNING: bad24.pdf (object 4 0, file position 341): unable to recover stream data; treating stream as empty  
4 -WARNING: bad24.pdf (object 4 0, file position 778): EOF while reading token  
5 -/QTest is implicit  
6 -/QTest is indirect and has type null (2)  
7 -/QTest is null 3 +WARNING: bad24.pdf (object 4 0, file position 341): recovered stream length: 54
  4 +/QTest is indirect and has type stream (10)
  5 +/QTest is a stream. Dictionary: << /Length 44 >>
  6 +Raw stream data:
  7 +BT
  8 + /F1 24 Tf
  9 + 72 720 Td
  10 + (Potato) Tj
  11 +ET
  12 +enxstream
  13 +
  14 +Uncompressed stream data:
  15 +BT
  16 + /F1 24 Tf
  17 + 72 720 Td
  18 + (Potato) Tj
  19 +ET
  20 +enxstream
  21 +
  22 +End of stream data
8 unparse: 4 0 R 23 unparse: 4 0 R
9 -unparseResolved: null 24 +unparseResolved: 4 0 R
10 test 1 done 25 test 1 done
qpdf/qtest/qpdf/issue-101.out
@@ -5,10 +5,16 @@ WARNING: issue-101.pdf (file position 1242): expected dictionary key but found n @@ -5,10 +5,16 @@ WARNING: issue-101.pdf (file position 1242): expected dictionary key but found n
5 WARNING: issue-101.pdf (file position 1242): dictionary ended prematurely; using null as value for last key 5 WARNING: issue-101.pdf (file position 1242): dictionary ended prematurely; using null as value for last key
6 WARNING: issue-101.pdf (object 5 0, file position 1438): /Length key in stream dictionary is not an integer 6 WARNING: issue-101.pdf (object 5 0, file position 1438): /Length key in stream dictionary is not an integer
7 WARNING: issue-101.pdf (object 5 0, file position 1509): attempting to recover stream length 7 WARNING: issue-101.pdf (object 5 0, file position 1509): attempting to recover stream length
8 -WARNING: issue-101.pdf (object 5 0, file position 1509): recovered stream length: 205 8 +WARNING: issue-101.pdf (object 5 0, file position 1509): recovered stream length: 8
  9 +WARNING: issue-101.pdf (trailer, file position 1631): /Length key in stream dictionary is not an integer
  10 +WARNING: issue-101.pdf (trailer, file position 1702): attempting to recover stream length
  11 +WARNING: issue-101.pdf (trailer, file position 1702): recovered stream length: 12
9 WARNING: issue-101.pdf (trailer, file position 2026): /Length key in stream dictionary is not an integer 12 WARNING: issue-101.pdf (trailer, file position 2026): /Length key in stream dictionary is not an integer
10 WARNING: issue-101.pdf (trailer, file position 2097): attempting to recover stream length 13 WARNING: issue-101.pdf (trailer, file position 2097): attempting to recover stream length
11 -WARNING: issue-101.pdf (trailer, file position 2097): recovered stream length: 709 14 +WARNING: issue-101.pdf (trailer, file position 2097): recovered stream length: 12
  15 +WARNING: issue-101.pdf (trailer, file position 2613): /Length key in stream dictionary is not an integer
  16 +WARNING: issue-101.pdf (trailer, file position 2684): attempting to recover stream length
  17 +WARNING: issue-101.pdf (trailer, file position 2684): recovered stream length: 74
12 WARNING: issue-101.pdf (trailer, file position 2928): unknown token while reading object; treating as string 18 WARNING: issue-101.pdf (trailer, file position 2928): unknown token while reading object; treating as string
13 WARNING: issue-101.pdf (trailer, file position 2929): unknown token while reading object; treating as string 19 WARNING: issue-101.pdf (trailer, file position 2929): unknown token while reading object; treating as string
14 WARNING: issue-101.pdf (trailer, file position 2928): expected dictionary key but found non-name object; inserting key /QPDFFake1 20 WARNING: issue-101.pdf (trailer, file position 2928): expected dictionary key but found non-name object; inserting key /QPDFFake1
@@ -22,8 +28,32 @@ WARNING: issue-101.pdf (trailer, file position 3410): attempting to recover stre @@ -22,8 +28,32 @@ WARNING: issue-101.pdf (trailer, file position 3410): attempting to recover stre
22 WARNING: issue-101.pdf (trailer, file position 3410): recovered stream length: 12 28 WARNING: issue-101.pdf (trailer, file position 3410): recovered stream length: 12
23 WARNING: issue-101.pdf (trailer, file position 3560): /Length key in stream dictionary is not an integer 29 WARNING: issue-101.pdf (trailer, file position 3560): /Length key in stream dictionary is not an integer
24 WARNING: issue-101.pdf (trailer, file position 3631): attempting to recover stream length 30 WARNING: issue-101.pdf (trailer, file position 3631): attempting to recover stream length
25 -WARNING: issue-101.pdf (trailer, file position 3631): recovered stream length: 167 31 +WARNING: issue-101.pdf (trailer, file position 3631): recovered stream length: 8
26 WARNING: issue-101.pdf (trailer, file position 4113): /Length key in stream dictionary is not an integer 32 WARNING: issue-101.pdf (trailer, file position 4113): /Length key in stream dictionary is not an integer
27 WARNING: issue-101.pdf (trailer, file position 4184): attempting to recover stream length 33 WARNING: issue-101.pdf (trailer, file position 4184): attempting to recover stream length
28 -WARNING: issue-101.pdf (trailer, file position 4184): unable to recover stream data; treating stream as empty  
29 -issue-101.pdf: unable to find trailer dictionary while recovering damaged file 34 +WARNING: issue-101.pdf (trailer, file position 4184): recovered stream length: 8
  35 +WARNING: issue-101.pdf (file position 591): unknown token while reading object; treating as string
  36 +WARNING: issue-101.pdf (file position 625): treating unexpected brace token as null
  37 +WARNING: issue-101.pdf (file position 626): unknown token while reading object; treating as string
  38 +WARNING: issue-101.pdf (file position 637): unknown token while reading object; treating as string
  39 +WARNING: issue-101.pdf (file position 639): unknown token while reading object; treating as string
  40 +WARNING: issue-101.pdf (file position 644): unknown token while reading object; treating as string
  41 +WARNING: issue-101.pdf (file position 647): unknown token while reading object; treating as string
  42 +WARNING: issue-101.pdf (file position 687): unknown token while reading object; treating as string
  43 +WARNING: issue-101.pdf (file position 691): unknown token while reading object; treating as string
  44 +WARNING: issue-101.pdf (file position 696): unknown token while reading object; treating as string
  45 +WARNING: issue-101.pdf (file position 698): unknown token while reading object; treating as string
  46 +WARNING: issue-101.pdf (file position 701): unknown token while reading object; treating as string
  47 +WARNING: issue-101.pdf (file position 711): unknown token while reading object; treating as string
  48 +WARNING: issue-101.pdf (file position 742): unknown token while reading object; treating as string
  49 +WARNING: issue-101.pdf (file position 745): unknown token while reading object; treating as string
  50 +WARNING: issue-101.pdf (file position 747): unknown token while reading object; treating as string
  51 +WARNING: issue-101.pdf (file position 777): unknown token while reading object; treating as string
  52 +WARNING: issue-101.pdf (file position 790): unknown token while reading object; treating as string
  53 +WARNING: issue-101.pdf (file position 800): treating unexpected brace token as null
  54 +WARNING: issue-101.pdf (file position 801): unknown token while reading object; treating as string
  55 +WARNING: issue-101.pdf (file position 811): unknown token while reading object; treating as string
  56 +WARNING: issue-101.pdf (file position 819): unknown token while reading object; treating as string
  57 +WARNING: issue-101.pdf (file position 832): unknown token while reading object; treating as string
  58 +WARNING: issue-101.pdf (file position 856): unexpected >
  59 +issue-101.pdf (file position 856): unable to find /Root dictionary