Commit ca5b1d267ab77947cabebce0124a49480d4143b8
1 parent
3082e4e6
Improve stream length recovery
Eliminate PCRE and find endobj not preceded by endstream. Be more lax about placement of endstream and endobj.
Showing
5 changed files
with
87 additions
and
68 deletions
ChangeLog
| ... | ... | @@ -11,6 +11,12 @@ |
| 11 | 11 | the (bool, T*) version of the constructor instead. If not, just |
| 12 | 12 | remove the second parameter. |
| 13 | 13 | |
| 14 | +2017-08-09 Jay Berkenbilt <ejb@ql.org> | |
| 15 | + | |
| 16 | + * When recovering stream length, find endobj without endstream as | |
| 17 | + well as just looking for endstream. Be a little more lax about | |
| 18 | + where we allow it to be found. | |
| 19 | + | |
| 14 | 20 | 2017-08-05 Jay Berkenbilt <ejb@ql.org> |
| 15 | 21 | |
| 16 | 22 | * Add --single-pages option to cause output to be written to a | ... | ... |
include/qpdf/QPDF.hh
| ... | ... | @@ -1030,6 +1030,7 @@ class QPDF |
| 1030 | 1030 | // Methods to support pattern finding |
| 1031 | 1031 | bool findHeader(); |
| 1032 | 1032 | bool findStartxref(); |
| 1033 | + bool findEndstream(); | |
| 1033 | 1034 | |
| 1034 | 1035 | // methods to support linearization checking -- implemented in |
| 1035 | 1036 | // QPDF_linearization.cc | ... | ... |
libqpdf/QPDF.cc
| ... | ... | @@ -1231,76 +1231,43 @@ QPDF::readObject(PointerHolder<InputSource> input, |
| 1231 | 1231 | return object; |
| 1232 | 1232 | } |
| 1233 | 1233 | |
| 1234 | +bool | |
| 1235 | +QPDF::findEndstream() | |
| 1236 | +{ | |
| 1237 | + // Find endstream or endobj. Position the input at that token. | |
| 1238 | + QPDFTokenizer::Token t = readToken(this->file, true); | |
| 1239 | + if ((t.getType() == QPDFTokenizer::tt_word) && | |
| 1240 | + ((t.getValue() == "endobj") || | |
| 1241 | + (t.getValue() == "endstream"))) | |
| 1242 | + { | |
| 1243 | + this->file->seek(this->file->getLastOffset(), SEEK_SET); | |
| 1244 | + return true; | |
| 1245 | + } | |
| 1246 | + return false; | |
| 1247 | +} | |
| 1248 | + | |
| 1234 | 1249 | size_t |
| 1235 | 1250 | QPDF::recoverStreamLength(PointerHolder<InputSource> input, |
| 1236 | 1251 | int objid, int generation, |
| 1237 | 1252 | qpdf_offset_t stream_offset) |
| 1238 | 1253 | { |
| 1239 | - PCRE endobj_re("^\\s*endobj\\b"); | |
| 1240 | - | |
| 1241 | 1254 | // Try to reconstruct stream length by looking for |
| 1242 | - // endstream(\r\n?|\n)endobj | |
| 1255 | + // endstream or endobj | |
| 1243 | 1256 | warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(), |
| 1244 | 1257 | this->last_object_description, stream_offset, |
| 1245 | 1258 | "attempting to recover stream length")); |
| 1246 | 1259 | |
| 1247 | - input->seek(0, SEEK_END); | |
| 1248 | - qpdf_offset_t eof = input->tell(); | |
| 1249 | - input->seek(stream_offset, SEEK_SET); | |
| 1250 | - qpdf_offset_t last_line_offset = 0; | |
| 1260 | + PatternFinder ef(*this, &QPDF::findEndstream); | |
| 1251 | 1261 | size_t length = 0; |
| 1252 | - static int const line_end_length = 12; // room for endstream\r\n\0 | |
| 1253 | - char last_line_end[line_end_length]; | |
| 1254 | - while (input->tell() < eof) | |
| 1262 | + if (this->file->findFirst("end", stream_offset, 0, ef)) | |
| 1255 | 1263 | { |
| 1256 | - std::string line = input->readLine(50); | |
| 1257 | - qpdf_offset_t line_offset = input->getLastOffset(); | |
| 1258 | - if (endobj_re.match(line.c_str())) | |
| 1264 | + length = this->file->tell() - stream_offset; | |
| 1265 | + // Reread endstream but, if it was endobj, don't skip that. | |
| 1266 | + QPDFTokenizer::Token t = readToken(this->file); | |
| 1267 | + if (t.getValue() == "endobj") | |
| 1259 | 1268 | { |
| 1260 | - qpdf_offset_t endstream_offset = 0; | |
| 1261 | - if (last_line_offset >= line_end_length) | |
| 1262 | - { | |
| 1263 | - qpdf_offset_t cur_offset = input->tell(); | |
| 1264 | - // Read from the end of the last line, guaranteeing | |
| 1265 | - // null termination | |
| 1266 | - qpdf_offset_t search_offset = | |
| 1267 | - line_offset - (line_end_length - 1); | |
| 1268 | - input->seek(search_offset, SEEK_SET); | |
| 1269 | - memset(last_line_end, '\0', line_end_length); | |
| 1270 | - input->read(last_line_end, line_end_length - 1); | |
| 1271 | - input->seek(cur_offset, SEEK_SET); | |
| 1272 | - // if endstream[\r\n] will fit in last_line_end, the | |
| 1273 | - // 'e' has to be in one of the first three spots. | |
| 1274 | - // Check explicitly rather than using strstr directly | |
| 1275 | - // in case there are nulls right before endstream. | |
| 1276 | - char* p = ((last_line_end[0] == 'e') ? last_line_end : | |
| 1277 | - (last_line_end[1] == 'e') ? last_line_end + 1 : | |
| 1278 | - (last_line_end[2] == 'e') ? last_line_end + 2 : | |
| 1279 | - 0); | |
| 1280 | - char* endstream_p = 0; | |
| 1281 | - if (p) | |
| 1282 | - { | |
| 1283 | - char* p1 = strstr(p, "endstream\n"); | |
| 1284 | - char* p2 = strstr(p, "endstream\r"); | |
| 1285 | - endstream_p = (p1 ? p1 : p2); | |
| 1286 | - } | |
| 1287 | - if (endstream_p) | |
| 1288 | - { | |
| 1289 | - endstream_offset = | |
| 1290 | - search_offset + (endstream_p - last_line_end); | |
| 1291 | - } | |
| 1292 | - } | |
| 1293 | - if (endstream_offset > 0) | |
| 1294 | - { | |
| 1295 | - // Stream probably ends right before "endstream" | |
| 1296 | - length = endstream_offset - stream_offset; | |
| 1297 | - // Go back to where we would have been if we had just | |
| 1298 | - // read the endstream. | |
| 1299 | - input->seek(line_offset, SEEK_SET); | |
| 1300 | - break; | |
| 1301 | - } | |
| 1302 | - } | |
| 1303 | - last_line_offset = line_offset; | |
| 1269 | + this->file->seek(this->file->getLastOffset(), SEEK_SET); | |
| 1270 | + } | |
| 1304 | 1271 | } |
| 1305 | 1272 | |
| 1306 | 1273 | if (length) | ... | ... |
qpdf/qtest/qpdf/bad24-recover.out
| 1 | 1 | WARNING: bad24.pdf (object 4 0, file position 385): expected endstream |
| 2 | 2 | WARNING: bad24.pdf (object 4 0, file position 341): attempting to recover stream length |
| 3 | -WARNING: bad24.pdf (object 4 0, file position 341): unable to recover stream data; treating stream as empty | |
| 4 | -WARNING: bad24.pdf (object 4 0, file position 778): EOF while reading token | |
| 5 | -/QTest is implicit | |
| 6 | -/QTest is indirect and has type null (2) | |
| 7 | -/QTest is null | |
| 3 | +WARNING: bad24.pdf (object 4 0, file position 341): recovered stream length: 54 | |
| 4 | +/QTest is indirect and has type stream (10) | |
| 5 | +/QTest is a stream. Dictionary: << /Length 44 >> | |
| 6 | +Raw stream data: | |
| 7 | +BT | |
| 8 | + /F1 24 Tf | |
| 9 | + 72 720 Td | |
| 10 | + (Potato) Tj | |
| 11 | +ET | |
| 12 | +enxstream | |
| 13 | + | |
| 14 | +Uncompressed stream data: | |
| 15 | +BT | |
| 16 | + /F1 24 Tf | |
| 17 | + 72 720 Td | |
| 18 | + (Potato) Tj | |
| 19 | +ET | |
| 20 | +enxstream | |
| 21 | + | |
| 22 | +End of stream data | |
| 8 | 23 | unparse: 4 0 R |
| 9 | -unparseResolved: null | |
| 24 | +unparseResolved: 4 0 R | |
| 10 | 25 | test 1 done | ... | ... |
qpdf/qtest/qpdf/issue-101.out
| ... | ... | @@ -5,10 +5,16 @@ WARNING: issue-101.pdf (file position 1242): expected dictionary key but found n |
| 5 | 5 | WARNING: issue-101.pdf (file position 1242): dictionary ended prematurely; using null as value for last key |
| 6 | 6 | WARNING: issue-101.pdf (object 5 0, file position 1438): /Length key in stream dictionary is not an integer |
| 7 | 7 | WARNING: issue-101.pdf (object 5 0, file position 1509): attempting to recover stream length |
| 8 | -WARNING: issue-101.pdf (object 5 0, file position 1509): recovered stream length: 205 | |
| 8 | +WARNING: issue-101.pdf (object 5 0, file position 1509): recovered stream length: 8 | |
| 9 | +WARNING: issue-101.pdf (trailer, file position 1631): /Length key in stream dictionary is not an integer | |
| 10 | +WARNING: issue-101.pdf (trailer, file position 1702): attempting to recover stream length | |
| 11 | +WARNING: issue-101.pdf (trailer, file position 1702): recovered stream length: 12 | |
| 9 | 12 | WARNING: issue-101.pdf (trailer, file position 2026): /Length key in stream dictionary is not an integer |
| 10 | 13 | WARNING: issue-101.pdf (trailer, file position 2097): attempting to recover stream length |
| 11 | -WARNING: issue-101.pdf (trailer, file position 2097): recovered stream length: 709 | |
| 14 | +WARNING: issue-101.pdf (trailer, file position 2097): recovered stream length: 12 | |
| 15 | +WARNING: issue-101.pdf (trailer, file position 2613): /Length key in stream dictionary is not an integer | |
| 16 | +WARNING: issue-101.pdf (trailer, file position 2684): attempting to recover stream length | |
| 17 | +WARNING: issue-101.pdf (trailer, file position 2684): recovered stream length: 74 | |
| 12 | 18 | WARNING: issue-101.pdf (trailer, file position 2928): unknown token while reading object; treating as string |
| 13 | 19 | WARNING: issue-101.pdf (trailer, file position 2929): unknown token while reading object; treating as string |
| 14 | 20 | WARNING: issue-101.pdf (trailer, file position 2928): expected dictionary key but found non-name object; inserting key /QPDFFake1 |
| ... | ... | @@ -22,8 +28,32 @@ WARNING: issue-101.pdf (trailer, file position 3410): attempting to recover stre |
| 22 | 28 | WARNING: issue-101.pdf (trailer, file position 3410): recovered stream length: 12 |
| 23 | 29 | WARNING: issue-101.pdf (trailer, file position 3560): /Length key in stream dictionary is not an integer |
| 24 | 30 | WARNING: issue-101.pdf (trailer, file position 3631): attempting to recover stream length |
| 25 | -WARNING: issue-101.pdf (trailer, file position 3631): recovered stream length: 167 | |
| 31 | +WARNING: issue-101.pdf (trailer, file position 3631): recovered stream length: 8 | |
| 26 | 32 | WARNING: issue-101.pdf (trailer, file position 4113): /Length key in stream dictionary is not an integer |
| 27 | 33 | WARNING: issue-101.pdf (trailer, file position 4184): attempting to recover stream length |
| 28 | -WARNING: issue-101.pdf (trailer, file position 4184): unable to recover stream data; treating stream as empty | |
| 29 | -issue-101.pdf: unable to find trailer dictionary while recovering damaged file | |
| 34 | +WARNING: issue-101.pdf (trailer, file position 4184): recovered stream length: 8 | |
| 35 | +WARNING: issue-101.pdf (file position 591): unknown token while reading object; treating as string | |
| 36 | +WARNING: issue-101.pdf (file position 625): treating unexpected brace token as null | |
| 37 | +WARNING: issue-101.pdf (file position 626): unknown token while reading object; treating as string | |
| 38 | +WARNING: issue-101.pdf (file position 637): unknown token while reading object; treating as string | |
| 39 | +WARNING: issue-101.pdf (file position 639): unknown token while reading object; treating as string | |
| 40 | +WARNING: issue-101.pdf (file position 644): unknown token while reading object; treating as string | |
| 41 | +WARNING: issue-101.pdf (file position 647): unknown token while reading object; treating as string | |
| 42 | +WARNING: issue-101.pdf (file position 687): unknown token while reading object; treating as string | |
| 43 | +WARNING: issue-101.pdf (file position 691): unknown token while reading object; treating as string | |
| 44 | +WARNING: issue-101.pdf (file position 696): unknown token while reading object; treating as string | |
| 45 | +WARNING: issue-101.pdf (file position 698): unknown token while reading object; treating as string | |
| 46 | +WARNING: issue-101.pdf (file position 701): unknown token while reading object; treating as string | |
| 47 | +WARNING: issue-101.pdf (file position 711): unknown token while reading object; treating as string | |
| 48 | +WARNING: issue-101.pdf (file position 742): unknown token while reading object; treating as string | |
| 49 | +WARNING: issue-101.pdf (file position 745): unknown token while reading object; treating as string | |
| 50 | +WARNING: issue-101.pdf (file position 747): unknown token while reading object; treating as string | |
| 51 | +WARNING: issue-101.pdf (file position 777): unknown token while reading object; treating as string | |
| 52 | +WARNING: issue-101.pdf (file position 790): unknown token while reading object; treating as string | |
| 53 | +WARNING: issue-101.pdf (file position 800): treating unexpected brace token as null | |
| 54 | +WARNING: issue-101.pdf (file position 801): unknown token while reading object; treating as string | |
| 55 | +WARNING: issue-101.pdf (file position 811): unknown token while reading object; treating as string | |
| 56 | +WARNING: issue-101.pdf (file position 819): unknown token while reading object; treating as string | |
| 57 | +WARNING: issue-101.pdf (file position 832): unknown token while reading object; treating as string | |
| 58 | +WARNING: issue-101.pdf (file position 856): unexpected > | |
| 59 | +issue-101.pdf (file position 856): unable to find /Root dictionary | ... | ... |