Commit ca5b1d267ab77947cabebce0124a49480d4143b8

Authored by Jay Berkenbilt
1 parent 3082e4e6

Improve stream length recovery

Eliminate PCRE and find endobj not preceded by endstream. Be more lax
about placement of endstream and endobj.
ChangeLog
... ... @@ -11,6 +11,12 @@
11 11 the (bool, T*) version of the constructor instead. If not, just
12 12 remove the second parameter.
13 13  
  14 +2017-08-09 Jay Berkenbilt <ejb@ql.org>
  15 +
  16 + * When recovering stream length, find endobj without endstream as
  17 + well as just looking for endstream. Be a little more lax about
  18 + where we allow it to be found.
  19 +
14 20 2017-08-05 Jay Berkenbilt <ejb@ql.org>
15 21  
16 22 * Add --single-pages option to cause output to be written to a
... ...
include/qpdf/QPDF.hh
... ... @@ -1030,6 +1030,7 @@ class QPDF
1030 1030 // Methods to support pattern finding
1031 1031 bool findHeader();
1032 1032 bool findStartxref();
  1033 + bool findEndstream();
1033 1034  
1034 1035 // methods to support linearization checking -- implemented in
1035 1036 // QPDF_linearization.cc
... ...
libqpdf/QPDF.cc
... ... @@ -1231,76 +1231,43 @@ QPDF::readObject(PointerHolder<InputSource> input,
1231 1231 return object;
1232 1232 }
1233 1233  
  1234 +bool
  1235 +QPDF::findEndstream()
  1236 +{
  1237 + // Find endstream or endobj. Position the input at that token.
  1238 + QPDFTokenizer::Token t = readToken(this->file, true);
  1239 + if ((t.getType() == QPDFTokenizer::tt_word) &&
  1240 + ((t.getValue() == "endobj") ||
  1241 + (t.getValue() == "endstream")));
  1242 + {
  1243 + this->file->seek(this->file->getLastOffset(), SEEK_SET);
  1244 + return true;
  1245 + }
  1246 + return false;
  1247 +}
  1248 +
1234 1249 size_t
1235 1250 QPDF::recoverStreamLength(PointerHolder<InputSource> input,
1236 1251 int objid, int generation,
1237 1252 qpdf_offset_t stream_offset)
1238 1253 {
1239   - PCRE endobj_re("^\\s*endobj\\b");
1240   -
1241 1254 // Try to reconstruct stream length by looking for
1242   - // endstream(\r\n?|\n)endobj
  1255 + // endstream or endobj
1243 1256 warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(),
1244 1257 this->last_object_description, stream_offset,
1245 1258 "attempting to recover stream length"));
1246 1259  
1247   - input->seek(0, SEEK_END);
1248   - qpdf_offset_t eof = input->tell();
1249   - input->seek(stream_offset, SEEK_SET);
1250   - qpdf_offset_t last_line_offset = 0;
  1260 + PatternFinder ef(*this, &QPDF::findEndstream);
1251 1261 size_t length = 0;
1252   - static int const line_end_length = 12; // room for endstream\r\n\0
1253   - char last_line_end[line_end_length];
1254   - while (input->tell() < eof)
  1262 + if (this->file->findFirst("end", stream_offset, 0, ef))
1255 1263 {
1256   - std::string line = input->readLine(50);
1257   - qpdf_offset_t line_offset = input->getLastOffset();
1258   - if (endobj_re.match(line.c_str()))
  1264 + length = this->file->tell() - stream_offset;
  1265 + // Reread endstream but, if it was endobj, don't skip that.
  1266 + QPDFTokenizer::Token t = readToken(this->file);
  1267 + if (t.getValue() == "endobj")
1259 1268 {
1260   - qpdf_offset_t endstream_offset = 0;
1261   - if (last_line_offset >= line_end_length)
1262   - {
1263   - qpdf_offset_t cur_offset = input->tell();
1264   - // Read from the end of the last line, guaranteeing
1265   - // null termination
1266   - qpdf_offset_t search_offset =
1267   - line_offset - (line_end_length - 1);
1268   - input->seek(search_offset, SEEK_SET);
1269   - memset(last_line_end, '\0', line_end_length);
1270   - input->read(last_line_end, line_end_length - 1);
1271   - input->seek(cur_offset, SEEK_SET);
1272   - // if endstream[\r\n] will fit in last_line_end, the
1273   - // 'e' has to be in one of the first three spots.
1274   - // Check explicitly rather than using strstr directly
1275   - // in case there are nulls right before endstream.
1276   - char* p = ((last_line_end[0] == 'e') ? last_line_end :
1277   - (last_line_end[1] == 'e') ? last_line_end + 1 :
1278   - (last_line_end[2] == 'e') ? last_line_end + 2 :
1279   - 0);
1280   - char* endstream_p = 0;
1281   - if (p)
1282   - {
1283   - char* p1 = strstr(p, "endstream\n");
1284   - char* p2 = strstr(p, "endstream\r");
1285   - endstream_p = (p1 ? p1 : p2);
1286   - }
1287   - if (endstream_p)
1288   - {
1289   - endstream_offset =
1290   - search_offset + (endstream_p - last_line_end);
1291   - }
1292   - }
1293   - if (endstream_offset > 0)
1294   - {
1295   - // Stream probably ends right before "endstream"
1296   - length = endstream_offset - stream_offset;
1297   - // Go back to where we would have been if we had just
1298   - // read the endstream.
1299   - input->seek(line_offset, SEEK_SET);
1300   - break;
1301   - }
1302   - }
1303   - last_line_offset = line_offset;
  1269 + this->file->seek(this->file->getLastOffset(), SEEK_SET);
  1270 + }
1304 1271 }
1305 1272  
1306 1273 if (length)
... ...
qpdf/qtest/qpdf/bad24-recover.out
1 1 WARNING: bad24.pdf (object 4 0, file position 385): expected endstream
2 2 WARNING: bad24.pdf (object 4 0, file position 341): attempting to recover stream length
3   -WARNING: bad24.pdf (object 4 0, file position 341): unable to recover stream data; treating stream as empty
4   -WARNING: bad24.pdf (object 4 0, file position 778): EOF while reading token
5   -/QTest is implicit
6   -/QTest is indirect and has type null (2)
7   -/QTest is null
  3 +WARNING: bad24.pdf (object 4 0, file position 341): recovered stream length: 54
  4 +/QTest is indirect and has type stream (10)
  5 +/QTest is a stream. Dictionary: << /Length 44 >>
  6 +Raw stream data:
  7 +BT
  8 + /F1 24 Tf
  9 + 72 720 Td
  10 + (Potato) Tj
  11 +ET
  12 +enxstream
  13 +
  14 +Uncompressed stream data:
  15 +BT
  16 + /F1 24 Tf
  17 + 72 720 Td
  18 + (Potato) Tj
  19 +ET
  20 +enxstream
  21 +
  22 +End of stream data
8 23 unparse: 4 0 R
9   -unparseResolved: null
  24 +unparseResolved: 4 0 R
10 25 test 1 done
... ...
qpdf/qtest/qpdf/issue-101.out
... ... @@ -5,10 +5,16 @@ WARNING: issue-101.pdf (file position 1242): expected dictionary key but found n
5 5 WARNING: issue-101.pdf (file position 1242): dictionary ended prematurely; using null as value for last key
6 6 WARNING: issue-101.pdf (object 5 0, file position 1438): /Length key in stream dictionary is not an integer
7 7 WARNING: issue-101.pdf (object 5 0, file position 1509): attempting to recover stream length
8   -WARNING: issue-101.pdf (object 5 0, file position 1509): recovered stream length: 205
  8 +WARNING: issue-101.pdf (object 5 0, file position 1509): recovered stream length: 8
  9 +WARNING: issue-101.pdf (trailer, file position 1631): /Length key in stream dictionary is not an integer
  10 +WARNING: issue-101.pdf (trailer, file position 1702): attempting to recover stream length
  11 +WARNING: issue-101.pdf (trailer, file position 1702): recovered stream length: 12
9 12 WARNING: issue-101.pdf (trailer, file position 2026): /Length key in stream dictionary is not an integer
10 13 WARNING: issue-101.pdf (trailer, file position 2097): attempting to recover stream length
11   -WARNING: issue-101.pdf (trailer, file position 2097): recovered stream length: 709
  14 +WARNING: issue-101.pdf (trailer, file position 2097): recovered stream length: 12
  15 +WARNING: issue-101.pdf (trailer, file position 2613): /Length key in stream dictionary is not an integer
  16 +WARNING: issue-101.pdf (trailer, file position 2684): attempting to recover stream length
  17 +WARNING: issue-101.pdf (trailer, file position 2684): recovered stream length: 74
12 18 WARNING: issue-101.pdf (trailer, file position 2928): unknown token while reading object; treating as string
13 19 WARNING: issue-101.pdf (trailer, file position 2929): unknown token while reading object; treating as string
14 20 WARNING: issue-101.pdf (trailer, file position 2928): expected dictionary key but found non-name object; inserting key /QPDFFake1
... ... @@ -22,8 +28,32 @@ WARNING: issue-101.pdf (trailer, file position 3410): attempting to recover stre
22 28 WARNING: issue-101.pdf (trailer, file position 3410): recovered stream length: 12
23 29 WARNING: issue-101.pdf (trailer, file position 3560): /Length key in stream dictionary is not an integer
24 30 WARNING: issue-101.pdf (trailer, file position 3631): attempting to recover stream length
25   -WARNING: issue-101.pdf (trailer, file position 3631): recovered stream length: 167
  31 +WARNING: issue-101.pdf (trailer, file position 3631): recovered stream length: 8
26 32 WARNING: issue-101.pdf (trailer, file position 4113): /Length key in stream dictionary is not an integer
27 33 WARNING: issue-101.pdf (trailer, file position 4184): attempting to recover stream length
28   -WARNING: issue-101.pdf (trailer, file position 4184): unable to recover stream data; treating stream as empty
29   -issue-101.pdf: unable to find trailer dictionary while recovering damaged file
  34 +WARNING: issue-101.pdf (trailer, file position 4184): recovered stream length: 8
  35 +WARNING: issue-101.pdf (file position 591): unknown token while reading object; treating as string
  36 +WARNING: issue-101.pdf (file position 625): treating unexpected brace token as null
  37 +WARNING: issue-101.pdf (file position 626): unknown token while reading object; treating as string
  38 +WARNING: issue-101.pdf (file position 637): unknown token while reading object; treating as string
  39 +WARNING: issue-101.pdf (file position 639): unknown token while reading object; treating as string
  40 +WARNING: issue-101.pdf (file position 644): unknown token while reading object; treating as string
  41 +WARNING: issue-101.pdf (file position 647): unknown token while reading object; treating as string
  42 +WARNING: issue-101.pdf (file position 687): unknown token while reading object; treating as string
  43 +WARNING: issue-101.pdf (file position 691): unknown token while reading object; treating as string
  44 +WARNING: issue-101.pdf (file position 696): unknown token while reading object; treating as string
  45 +WARNING: issue-101.pdf (file position 698): unknown token while reading object; treating as string
  46 +WARNING: issue-101.pdf (file position 701): unknown token while reading object; treating as string
  47 +WARNING: issue-101.pdf (file position 711): unknown token while reading object; treating as string
  48 +WARNING: issue-101.pdf (file position 742): unknown token while reading object; treating as string
  49 +WARNING: issue-101.pdf (file position 745): unknown token while reading object; treating as string
  50 +WARNING: issue-101.pdf (file position 747): unknown token while reading object; treating as string
  51 +WARNING: issue-101.pdf (file position 777): unknown token while reading object; treating as string
  52 +WARNING: issue-101.pdf (file position 790): unknown token while reading object; treating as string
  53 +WARNING: issue-101.pdf (file position 800): treating unexpected brace token as null
  54 +WARNING: issue-101.pdf (file position 801): unknown token while reading object; treating as string
  55 +WARNING: issue-101.pdf (file position 811): unknown token while reading object; treating as string
  56 +WARNING: issue-101.pdf (file position 819): unknown token while reading object; treating as string
  57 +WARNING: issue-101.pdf (file position 832): unknown token while reading object; treating as string
  58 +WARNING: issue-101.pdf (file position 856): unexpected >
  59 +issue-101.pdf (file position 856): unable to find /Root dictionary
... ...