Commit ca5b1d267ab77947cabebce0124a49480d4143b8
1 parent
3082e4e6
Improve stream length recovery
Eliminate PCRE and find endobj not preceded by endstream. Be more lax about placement of endstream and endobj.
Showing
5 changed files
with
87 additions
and
68 deletions
ChangeLog
| @@ -11,6 +11,12 @@ | @@ -11,6 +11,12 @@ | ||
| 11 | the (bool, T*) version of the constructor instead. If not, just | 11 | the (bool, T*) version of the constructor instead. If not, just |
| 12 | remove the second parameter. | 12 | remove the second parameter. |
| 13 | 13 | ||
| 14 | +2017-08-09 Jay Berkenbilt <ejb@ql.org> | ||
| 15 | + | ||
| 16 | + * When recovering stream length, find endobj without endstream as | ||
| 17 | + well as just looking for endstream. Be a little more lax about | ||
| 18 | + where we allow it to be found. | ||
| 19 | + | ||
| 14 | 2017-08-05 Jay Berkenbilt <ejb@ql.org> | 20 | 2017-08-05 Jay Berkenbilt <ejb@ql.org> |
| 15 | 21 | ||
| 16 | * Add --single-pages option to cause output to be written to a | 22 | * Add --single-pages option to cause output to be written to a |
include/qpdf/QPDF.hh
| @@ -1030,6 +1030,7 @@ class QPDF | @@ -1030,6 +1030,7 @@ class QPDF | ||
| 1030 | // Methods to support pattern finding | 1030 | // Methods to support pattern finding |
| 1031 | bool findHeader(); | 1031 | bool findHeader(); |
| 1032 | bool findStartxref(); | 1032 | bool findStartxref(); |
| 1033 | + bool findEndstream(); | ||
| 1033 | 1034 | ||
| 1034 | // methods to support linearization checking -- implemented in | 1035 | // methods to support linearization checking -- implemented in |
| 1035 | // QPDF_linearization.cc | 1036 | // QPDF_linearization.cc |
libqpdf/QPDF.cc
| @@ -1231,76 +1231,43 @@ QPDF::readObject(PointerHolder<InputSource> input, | @@ -1231,76 +1231,43 @@ QPDF::readObject(PointerHolder<InputSource> input, | ||
| 1231 | return object; | 1231 | return object; |
| 1232 | } | 1232 | } |
| 1233 | 1233 | ||
| 1234 | +bool | ||
| 1235 | +QPDF::findEndstream() | ||
| 1236 | +{ | ||
| 1237 | + // PatternFinder callback: succeed only when the next token is the word endobj or endstream, and position the input at that token. | ||
| 1238 | + QPDFTokenizer::Token t = readToken(this->file, true); | ||
| 1239 | + if ((t.getType() == QPDFTokenizer::tt_word) && | ||
| 1240 | + ((t.getValue() == "endobj") || | ||
| 1241 | + (t.getValue() == "endstream"))) | ||
| 1242 | + { | ||
| 1243 | + this->file->seek(this->file->getLastOffset(), SEEK_SET); | ||
| 1244 | + return true; | ||
| 1245 | + } | ||
| 1246 | + return false; | ||
| 1247 | +} | ||
| 1248 | + | ||
| 1234 | size_t | 1249 | size_t |
| 1235 | QPDF::recoverStreamLength(PointerHolder<InputSource> input, | 1250 | QPDF::recoverStreamLength(PointerHolder<InputSource> input, |
| 1236 | int objid, int generation, | 1251 | int objid, int generation, |
| 1237 | qpdf_offset_t stream_offset) | 1252 | qpdf_offset_t stream_offset) |
| 1238 | { | 1253 | { |
| 1239 | - PCRE endobj_re("^\\s*endobj\\b"); | ||
| 1240 | - | ||
| 1241 | // Try to reconstruct stream length by looking for | 1254 | // Try to reconstruct stream length by looking for |
| 1242 | - // endstream(\r\n?|\n)endobj | 1255 | + // endstream or endobj |
| 1243 | warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(), | 1256 | warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(), |
| 1244 | this->last_object_description, stream_offset, | 1257 | this->last_object_description, stream_offset, |
| 1245 | "attempting to recover stream length")); | 1258 | "attempting to recover stream length")); |
| 1246 | 1259 | ||
| 1247 | - input->seek(0, SEEK_END); | ||
| 1248 | - qpdf_offset_t eof = input->tell(); | ||
| 1249 | - input->seek(stream_offset, SEEK_SET); | ||
| 1250 | - qpdf_offset_t last_line_offset = 0; | 1260 | + PatternFinder ef(*this, &QPDF::findEndstream); |
| 1251 | size_t length = 0; | 1261 | size_t length = 0; |
| 1252 | - static int const line_end_length = 12; // room for endstream\r\n\0 | ||
| 1253 | - char last_line_end[line_end_length]; | ||
| 1254 | - while (input->tell() < eof) | 1262 | + if (this->file->findFirst("end", stream_offset, 0, ef)) |
| 1255 | { | 1263 | { |
| 1256 | - std::string line = input->readLine(50); | ||
| 1257 | - qpdf_offset_t line_offset = input->getLastOffset(); | ||
| 1258 | - if (endobj_re.match(line.c_str())) | 1264 | + length = this->file->tell() - stream_offset; |
| 1265 | + // Reread endstream but, if it was endobj, don't skip that. | ||
| 1266 | + QPDFTokenizer::Token t = readToken(this->file); | ||
| 1267 | + if (t.getValue() == "endobj") | ||
| 1259 | { | 1268 | { |
| 1260 | - qpdf_offset_t endstream_offset = 0; | ||
| 1261 | - if (last_line_offset >= line_end_length) | ||
| 1262 | - { | ||
| 1263 | - qpdf_offset_t cur_offset = input->tell(); | ||
| 1264 | - // Read from the end of the last line, guaranteeing | ||
| 1265 | - // null termination | ||
| 1266 | - qpdf_offset_t search_offset = | ||
| 1267 | - line_offset - (line_end_length - 1); | ||
| 1268 | - input->seek(search_offset, SEEK_SET); | ||
| 1269 | - memset(last_line_end, '\0', line_end_length); | ||
| 1270 | - input->read(last_line_end, line_end_length - 1); | ||
| 1271 | - input->seek(cur_offset, SEEK_SET); | ||
| 1272 | - // if endstream[\r\n] will fit in last_line_end, the | ||
| 1273 | - // 'e' has to be in one of the first three spots. | ||
| 1274 | - // Check explicitly rather than using strstr directly | ||
| 1275 | - // in case there are nulls right before endstream. | ||
| 1276 | - char* p = ((last_line_end[0] == 'e') ? last_line_end : | ||
| 1277 | - (last_line_end[1] == 'e') ? last_line_end + 1 : | ||
| 1278 | - (last_line_end[2] == 'e') ? last_line_end + 2 : | ||
| 1279 | - 0); | ||
| 1280 | - char* endstream_p = 0; | ||
| 1281 | - if (p) | ||
| 1282 | - { | ||
| 1283 | - char* p1 = strstr(p, "endstream\n"); | ||
| 1284 | - char* p2 = strstr(p, "endstream\r"); | ||
| 1285 | - endstream_p = (p1 ? p1 : p2); | ||
| 1286 | - } | ||
| 1287 | - if (endstream_p) | ||
| 1288 | - { | ||
| 1289 | - endstream_offset = | ||
| 1290 | - search_offset + (endstream_p - last_line_end); | ||
| 1291 | - } | ||
| 1292 | - } | ||
| 1293 | - if (endstream_offset > 0) | ||
| 1294 | - { | ||
| 1295 | - // Stream probably ends right before "endstream" | ||
| 1296 | - length = endstream_offset - stream_offset; | ||
| 1297 | - // Go back to where we would have been if we had just | ||
| 1298 | - // read the endstream. | ||
| 1299 | - input->seek(line_offset, SEEK_SET); | ||
| 1300 | - break; | ||
| 1301 | - } | ||
| 1302 | - } | ||
| 1303 | - last_line_offset = line_offset; | 1269 | + this->file->seek(this->file->getLastOffset(), SEEK_SET); |
| 1270 | + } | ||
| 1304 | } | 1271 | } |
| 1305 | 1272 | ||
| 1306 | if (length) | 1273 | if (length) |
qpdf/qtest/qpdf/bad24-recover.out
| 1 | WARNING: bad24.pdf (object 4 0, file position 385): expected endstream | 1 | WARNING: bad24.pdf (object 4 0, file position 385): expected endstream |
| 2 | WARNING: bad24.pdf (object 4 0, file position 341): attempting to recover stream length | 2 | WARNING: bad24.pdf (object 4 0, file position 341): attempting to recover stream length |
| 3 | -WARNING: bad24.pdf (object 4 0, file position 341): unable to recover stream data; treating stream as empty | ||
| 4 | -WARNING: bad24.pdf (object 4 0, file position 778): EOF while reading token | ||
| 5 | -/QTest is implicit | ||
| 6 | -/QTest is indirect and has type null (2) | ||
| 7 | -/QTest is null | 3 | +WARNING: bad24.pdf (object 4 0, file position 341): recovered stream length: 54 |
| 4 | +/QTest is indirect and has type stream (10) | ||
| 5 | +/QTest is a stream. Dictionary: << /Length 44 >> | ||
| 6 | +Raw stream data: | ||
| 7 | +BT | ||
| 8 | + /F1 24 Tf | ||
| 9 | + 72 720 Td | ||
| 10 | + (Potato) Tj | ||
| 11 | +ET | ||
| 12 | +enxstream | ||
| 13 | + | ||
| 14 | +Uncompressed stream data: | ||
| 15 | +BT | ||
| 16 | + /F1 24 Tf | ||
| 17 | + 72 720 Td | ||
| 18 | + (Potato) Tj | ||
| 19 | +ET | ||
| 20 | +enxstream | ||
| 21 | + | ||
| 22 | +End of stream data | ||
| 8 | unparse: 4 0 R | 23 | unparse: 4 0 R |
| 9 | -unparseResolved: null | 24 | +unparseResolved: 4 0 R |
| 10 | test 1 done | 25 | test 1 done |
qpdf/qtest/qpdf/issue-101.out
| @@ -5,10 +5,16 @@ WARNING: issue-101.pdf (file position 1242): expected dictionary key but found n | @@ -5,10 +5,16 @@ WARNING: issue-101.pdf (file position 1242): expected dictionary key but found n | ||
| 5 | WARNING: issue-101.pdf (file position 1242): dictionary ended prematurely; using null as value for last key | 5 | WARNING: issue-101.pdf (file position 1242): dictionary ended prematurely; using null as value for last key |
| 6 | WARNING: issue-101.pdf (object 5 0, file position 1438): /Length key in stream dictionary is not an integer | 6 | WARNING: issue-101.pdf (object 5 0, file position 1438): /Length key in stream dictionary is not an integer |
| 7 | WARNING: issue-101.pdf (object 5 0, file position 1509): attempting to recover stream length | 7 | WARNING: issue-101.pdf (object 5 0, file position 1509): attempting to recover stream length |
| 8 | -WARNING: issue-101.pdf (object 5 0, file position 1509): recovered stream length: 205 | 8 | +WARNING: issue-101.pdf (object 5 0, file position 1509): recovered stream length: 8 |
| 9 | +WARNING: issue-101.pdf (trailer, file position 1631): /Length key in stream dictionary is not an integer | ||
| 10 | +WARNING: issue-101.pdf (trailer, file position 1702): attempting to recover stream length | ||
| 11 | +WARNING: issue-101.pdf (trailer, file position 1702): recovered stream length: 12 | ||
| 9 | WARNING: issue-101.pdf (trailer, file position 2026): /Length key in stream dictionary is not an integer | 12 | WARNING: issue-101.pdf (trailer, file position 2026): /Length key in stream dictionary is not an integer |
| 10 | WARNING: issue-101.pdf (trailer, file position 2097): attempting to recover stream length | 13 | WARNING: issue-101.pdf (trailer, file position 2097): attempting to recover stream length |
| 11 | -WARNING: issue-101.pdf (trailer, file position 2097): recovered stream length: 709 | 14 | +WARNING: issue-101.pdf (trailer, file position 2097): recovered stream length: 12 |
| 15 | +WARNING: issue-101.pdf (trailer, file position 2613): /Length key in stream dictionary is not an integer | ||
| 16 | +WARNING: issue-101.pdf (trailer, file position 2684): attempting to recover stream length | ||
| 17 | +WARNING: issue-101.pdf (trailer, file position 2684): recovered stream length: 74 | ||
| 12 | WARNING: issue-101.pdf (trailer, file position 2928): unknown token while reading object; treating as string | 18 | WARNING: issue-101.pdf (trailer, file position 2928): unknown token while reading object; treating as string |
| 13 | WARNING: issue-101.pdf (trailer, file position 2929): unknown token while reading object; treating as string | 19 | WARNING: issue-101.pdf (trailer, file position 2929): unknown token while reading object; treating as string |
| 14 | WARNING: issue-101.pdf (trailer, file position 2928): expected dictionary key but found non-name object; inserting key /QPDFFake1 | 20 | WARNING: issue-101.pdf (trailer, file position 2928): expected dictionary key but found non-name object; inserting key /QPDFFake1 |
| @@ -22,8 +28,32 @@ WARNING: issue-101.pdf (trailer, file position 3410): attempting to recover stre | @@ -22,8 +28,32 @@ WARNING: issue-101.pdf (trailer, file position 3410): attempting to recover stre | ||
| 22 | WARNING: issue-101.pdf (trailer, file position 3410): recovered stream length: 12 | 28 | WARNING: issue-101.pdf (trailer, file position 3410): recovered stream length: 12 |
| 23 | WARNING: issue-101.pdf (trailer, file position 3560): /Length key in stream dictionary is not an integer | 29 | WARNING: issue-101.pdf (trailer, file position 3560): /Length key in stream dictionary is not an integer |
| 24 | WARNING: issue-101.pdf (trailer, file position 3631): attempting to recover stream length | 30 | WARNING: issue-101.pdf (trailer, file position 3631): attempting to recover stream length |
| 25 | -WARNING: issue-101.pdf (trailer, file position 3631): recovered stream length: 167 | 31 | +WARNING: issue-101.pdf (trailer, file position 3631): recovered stream length: 8 |
| 26 | WARNING: issue-101.pdf (trailer, file position 4113): /Length key in stream dictionary is not an integer | 32 | WARNING: issue-101.pdf (trailer, file position 4113): /Length key in stream dictionary is not an integer |
| 27 | WARNING: issue-101.pdf (trailer, file position 4184): attempting to recover stream length | 33 | WARNING: issue-101.pdf (trailer, file position 4184): attempting to recover stream length |
| 28 | -WARNING: issue-101.pdf (trailer, file position 4184): unable to recover stream data; treating stream as empty | ||
| 29 | -issue-101.pdf: unable to find trailer dictionary while recovering damaged file | 34 | +WARNING: issue-101.pdf (trailer, file position 4184): recovered stream length: 8 |
| 35 | +WARNING: issue-101.pdf (file position 591): unknown token while reading object; treating as string | ||
| 36 | +WARNING: issue-101.pdf (file position 625): treating unexpected brace token as null | ||
| 37 | +WARNING: issue-101.pdf (file position 626): unknown token while reading object; treating as string | ||
| 38 | +WARNING: issue-101.pdf (file position 637): unknown token while reading object; treating as string | ||
| 39 | +WARNING: issue-101.pdf (file position 639): unknown token while reading object; treating as string | ||
| 40 | +WARNING: issue-101.pdf (file position 644): unknown token while reading object; treating as string | ||
| 41 | +WARNING: issue-101.pdf (file position 647): unknown token while reading object; treating as string | ||
| 42 | +WARNING: issue-101.pdf (file position 687): unknown token while reading object; treating as string | ||
| 43 | +WARNING: issue-101.pdf (file position 691): unknown token while reading object; treating as string | ||
| 44 | +WARNING: issue-101.pdf (file position 696): unknown token while reading object; treating as string | ||
| 45 | +WARNING: issue-101.pdf (file position 698): unknown token while reading object; treating as string | ||
| 46 | +WARNING: issue-101.pdf (file position 701): unknown token while reading object; treating as string | ||
| 47 | +WARNING: issue-101.pdf (file position 711): unknown token while reading object; treating as string | ||
| 48 | +WARNING: issue-101.pdf (file position 742): unknown token while reading object; treating as string | ||
| 49 | +WARNING: issue-101.pdf (file position 745): unknown token while reading object; treating as string | ||
| 50 | +WARNING: issue-101.pdf (file position 747): unknown token while reading object; treating as string | ||
| 51 | +WARNING: issue-101.pdf (file position 777): unknown token while reading object; treating as string | ||
| 52 | +WARNING: issue-101.pdf (file position 790): unknown token while reading object; treating as string | ||
| 53 | +WARNING: issue-101.pdf (file position 800): treating unexpected brace token as null | ||
| 54 | +WARNING: issue-101.pdf (file position 801): unknown token while reading object; treating as string | ||
| 55 | +WARNING: issue-101.pdf (file position 811): unknown token while reading object; treating as string | ||
| 56 | +WARNING: issue-101.pdf (file position 819): unknown token while reading object; treating as string | ||
| 57 | +WARNING: issue-101.pdf (file position 832): unknown token while reading object; treating as string | ||
| 58 | +WARNING: issue-101.pdf (file position 856): unexpected > | ||
| 59 | +issue-101.pdf (file position 856): unable to find /Root dictionary |