Commit b30deaeeaba3941d7615bc2cc89c664b1273e5df
1 parent 0dea2769
Avoid merging adjacent tokens when concatenating contents (fixes #444)
Showing 16 changed files with 541 additions and 43 deletions
ChangeLog
| 1 | 2020-10-23 Jay Berkenbilt <ejb@ql.org> | 1 | 2020-10-23 Jay Berkenbilt <ejb@ql.org> |
| 2 | 2 | ||
| 3 | + * Bug fix: when concatenating content streams, insert a newline if | ||
| 4 | + needed to prevent the last token from the old stream from being | ||
| 5 | + merged with the first token of the new stream. Qpdf was mistakenly | ||
| 6 | + concatenating the streams without regard to the specification that | ||
| 7 | + content streams are to be broken on token boundaries. Fixes #444. | ||
| 8 | + | ||
| 3 | * Bug fix: fix-qdf: properly handle empty streams with ignore | 9 | * Bug fix: fix-qdf: properly handle empty streams with ignore |
| 4 | newline. | 10 | newline. |
| 5 | 11 |
TODO
| @@ -4,7 +4,6 @@ Candidates for upcoming release | @@ -4,7 +4,6 @@ Candidates for upcoming release | ||
| 4 | * Open "next" issues | 4 | * Open "next" issues |
| 5 | * bugs | 5 | * bugs |
| 6 | * #473: zsh completion with directories | 6 | * #473: zsh completion with directories |
| 7 | - * #444: concatenated stream/whitespace bug | ||
| 8 | * Non-bugs | 7 | * Non-bugs |
| 9 | * #446: recognize edited QDF files | 8 | * #446: recognize edited QDF files |
| 10 | * #436: parsing of document with form xobject | 9 | * #436: parsing of document with form xobject |
libqpdf/QPDFObjectHandle.cc
| @@ -165,6 +165,47 @@ QPDFObjectHandle::ParserCallbacks::terminateParsing() | @@ -165,6 +165,47 @@ QPDFObjectHandle::ParserCallbacks::terminateParsing() | ||
| 165 | throw TerminateParsing(); | 165 | throw TerminateParsing(); |
| 166 | } | 166 | } |
| 167 | 167 | ||
| 168 | +class LastChar: public Pipeline | ||
| 169 | +{ | ||
| 170 | + public: | ||
| 171 | + LastChar(Pipeline* next); | ||
| 172 | + virtual ~LastChar() = default; | ||
| 173 | + virtual void write(unsigned char* data, size_t len); | ||
| 174 | + virtual void finish(); | ||
| 175 | + unsigned char getLastChar(); | ||
| 176 | + | ||
| 177 | + private: | ||
| 178 | + unsigned char last_char; | ||
| 179 | +}; | ||
| 180 | + | ||
| 181 | +LastChar::LastChar(Pipeline* next) : | ||
| 182 | + Pipeline("lastchar", next), | ||
| 183 | + last_char(0) | ||
| 184 | +{ | ||
| 185 | +} | ||
| 186 | + | ||
| 187 | +void | ||
| 188 | +LastChar::write(unsigned char* data, size_t len) | ||
| 189 | +{ | ||
| 190 | + if (len > 0) | ||
| 191 | + { | ||
| 192 | + this->last_char = data[len - 1]; | ||
| 193 | + } | ||
| 194 | + getNext()->write(data, len); | ||
| 195 | +} | ||
| 196 | + | ||
| 197 | +void | ||
| 198 | +LastChar::finish() | ||
| 199 | +{ | ||
| 200 | + getNext()->finish(); | ||
| 201 | +} | ||
| 202 | + | ||
| 203 | +unsigned char | ||
| 204 | +LastChar::getLastChar() | ||
| 205 | +{ | ||
| 206 | + return this->last_char; | ||
| 207 | +} | ||
| 208 | + | ||
| 168 | QPDFObjectHandle::QPDFObjectHandle() : | 209 | QPDFObjectHandle::QPDFObjectHandle() : |
| 169 | initialized(false), | 210 | initialized(false), |
| 170 | qpdf(0), | 211 | qpdf(0), |
| @@ -1600,21 +1641,31 @@ QPDFObjectHandle::pipeContentStreams( | @@ -1600,21 +1641,31 @@ QPDFObjectHandle::pipeContentStreams( | ||
| 1600 | std::vector<QPDFObjectHandle> streams = | 1641 | std::vector<QPDFObjectHandle> streams = |
| 1601 | arrayOrStreamToStreamArray( | 1642 | arrayOrStreamToStreamArray( |
| 1602 | description, all_description); | 1643 | description, all_description); |
| 1644 | + bool need_newline = false; | ||
| 1603 | for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin(); | 1645 | for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin(); |
| 1604 | iter != streams.end(); ++iter) | 1646 | iter != streams.end(); ++iter) |
| 1605 | { | 1647 | { |
| 1648 | + if (need_newline) | ||
| 1649 | + { | ||
| 1650 | + p->write(QUtil::unsigned_char_pointer("\n"), 1); | ||
| 1651 | + } | ||
| 1652 | + LastChar lc(p); | ||
| 1606 | QPDFObjectHandle stream = *iter; | 1653 | QPDFObjectHandle stream = *iter; |
| 1607 | std::string og = | 1654 | std::string og = |
| 1608 | QUtil::int_to_string(stream.getObjectID()) + " " + | 1655 | QUtil::int_to_string(stream.getObjectID()) + " " + |
| 1609 | QUtil::int_to_string(stream.getGeneration()); | 1656 | QUtil::int_to_string(stream.getGeneration()); |
| 1610 | std::string w_description = "content stream object " + og; | 1657 | std::string w_description = "content stream object " + og; |
| 1611 | - if (! stream.pipeStreamData(p, 0, qpdf_dl_specialized)) | 1658 | + if (! stream.pipeStreamData(&lc, 0, qpdf_dl_specialized)) |
| 1612 | { | 1659 | { |
| 1613 | QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent"); | 1660 | QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent"); |
| 1614 | throw QPDFExc(qpdf_e_damaged_pdf, "content stream", | 1661 | throw QPDFExc(qpdf_e_damaged_pdf, "content stream", |
| 1615 | w_description, 0, | 1662 | w_description, 0, |
| 1616 | "errors while decoding content stream"); | 1663 | "errors while decoding content stream"); |
| 1617 | } | 1664 | } |
| 1665 | + lc.finish(); | ||
| 1666 | + need_newline = (lc.getLastChar() != static_cast<unsigned char>('\n')); | ||
| 1667 | + QTC::TC("qpdf", "QPDFObjectHandle need_newline", | ||
| 1668 | + need_newline ? 0 : 1); | ||
| 1618 | } | 1669 | } |
| 1619 | } | 1670 | } |
| 1620 | 1671 |
manual/qpdf-manual.xml
| @@ -2090,14 +2090,9 @@ outfile.pdf</option> | @@ -2090,14 +2090,9 @@ outfile.pdf</option> | ||
| 2090 | option causes qpdf to combine them into a single stream. Use | 2090 | option causes qpdf to combine them into a single stream. Use |
| 2091 | of this option is never necessary for ordinary usage, but it | 2091 | of this option is never necessary for ordinary usage, but it |
| 2092 | can help when working with some files in some cases. For | 2092 | can help when working with some files in some cases. For |
| 2093 | - example, some PDF writers split page contents into small | ||
| 2094 | - streams at arbitrary points that may fall in the middle of | ||
| 2095 | - lexical tokens within the content, and some PDF readers may | ||
| 2096 | - get confused on such files. If you use qpdf to coalesce the | ||
| 2097 | - content streams, such readers may be able to work with the | ||
| 2098 | - file more easily. This can also be combined with QDF mode or | ||
| 2099 | - content normalization to make it easier to look at all of a | ||
| 2100 | - page's contents at once. | 2093 | + example, this can be combined with QDF mode or content |
| 2094 | + normalization to make it easier to look at all of a page's | ||
| 2095 | + contents at once. | ||
| 2101 | </para> | 2096 | </para> |
| 2102 | </listitem> | 2097 | </listitem> |
| 2103 | </varlistentry> | 2098 | </varlistentry> |
| @@ -2398,25 +2393,15 @@ outfile.pdf</option> | @@ -2398,25 +2393,15 @@ outfile.pdf</option> | ||
| 2398 | You should not use this for “production” PDF files. | 2393 | You should not use this for “production” PDF files. |
| 2399 | </para> | 2394 | </para> |
| 2400 | <para> | 2395 | <para> |
| 2401 | - This paragraph discusses edge cases of content normalization that | ||
| 2402 | - are not of concern to most users and are not relevant when content | ||
| 2403 | - normalization is not enabled. When normalizing content, if qpdf | ||
| 2404 | - runs into any lexical errors, it will print a warning indicating | ||
| 2405 | - that content may be damaged. The only situation in which qpdf is | ||
| 2406 | - known to cause damage during content normalization is when a | ||
| 2407 | - page's contents are split across multiple streams and streams are | ||
| 2408 | - split in the middle of a lexical token such as a string, name, or | ||
| 2409 | - inline image. There may be some pathological cases in which qpdf | ||
| 2410 | - could damage content without noticing this, such as if the partial | ||
| 2411 | - tokens at the end of one stream and the beginning of the next | ||
| 2412 | - stream are both valid, but usually qpdf will be able to detect | ||
| 2413 | - this case. For slightly increased safety, you can specify | ||
| 2414 | - <option>--coalesce-contents</option> in addition to | ||
| 2415 | - <option>--normalize-content</option> or <option>--qdf</option>. | ||
| 2416 | - This will cause qpdf to combine all the content streams into one, | ||
| 2417 | - thus recombining any split tokens. However doing this will prevent | ||
| 2418 | - you from being able to see the original layout of the content | ||
| 2419 | - streams. If you must inspect the original content streams in an | 2396 | + When normalizing content, if qpdf runs into any lexical errors, it |
| 2397 | + will print a warning indicating that content may be damaged. The | ||
| 2398 | + only situation in which qpdf is known to cause damage during | ||
| 2399 | + content normalization is when a page's contents are split across | ||
| 2400 | + multiple streams and streams are split in the middle of a lexical | ||
| 2401 | + token such as a string, name, or inline image. Note that files | ||
| 2402 | + that do this are invalid since the PDF specification states that | ||
| 2403 | + content streams are not to be split in the middle of a token. If | ||
| 2404 | + you want to inspect the original content streams in an | ||
| 2420 | uncompressed format, you can always run with <option>--qdf | 2405 | uncompressed format, you can always run with <option>--qdf |
| 2421 | --normalize-content=n</option> for a QDF file without content | 2406 | --normalize-content=n</option> for a QDF file without content |
| 2422 | normalization, or alternatively | 2407 | normalization, or alternatively |
qpdf/qpdf.testcov
| @@ -455,3 +455,4 @@ qpdf found shared resources in leaf 0 | @@ -455,3 +455,4 @@ qpdf found shared resources in leaf 0 | ||
| 455 | qpdf found shared xobject in leaf 0 | 455 | qpdf found shared xobject in leaf 0 |
| 456 | QPDF copy foreign with data 1 | 456 | QPDF copy foreign with data 1 |
| 457 | QPDF copy foreign with foreign_stream 1 | 457 | QPDF copy foreign with foreign_stream 1 |
| 458 | +QPDFObjectHandle need_newline 1 |
qpdf/qtest/qpdf.test
| @@ -1591,15 +1591,23 @@ $td->runtest("type checks with object streams", | @@ -1591,15 +1591,23 @@ $td->runtest("type checks with object streams", | ||
| 1591 | 1591 | ||
| 1592 | # ---------- | 1592 | # ---------- |
| 1593 | $td->notify("--- Coalesce contents ---"); | 1593 | $td->notify("--- Coalesce contents ---"); |
| 1594 | -$n_tests += 6; | 1594 | +$n_tests += 8; |
| 1595 | 1595 | ||
| 1596 | $td->runtest("qdf with normalize warnings", | 1596 | $td->runtest("qdf with normalize warnings", |
| 1597 | {$td->COMMAND => | 1597 | {$td->COMMAND => |
| 1598 | - "qpdf --qdf --static-id coalesce.pdf a.pdf"}, | 1598 | + "qpdf --qdf --static-id split-tokens.pdf a.pdf"}, |
| 1599 | {$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3}, | 1599 | {$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3}, |
| 1600 | $td->NORMALIZE_NEWLINES); | 1600 | $td->NORMALIZE_NEWLINES); |
| 1601 | $td->runtest("check output", | 1601 | $td->runtest("check output", |
| 1602 | {$td->FILE => "a.pdf"}, | 1602 | {$td->FILE => "a.pdf"}, |
| 1603 | + {$td->FILE => "split-tokens.qdf"}); | ||
| 1604 | +$td->runtest("coalesce to qdf", | ||
| 1605 | + {$td->COMMAND => | ||
| 1606 | + "qpdf --qdf --static-id coalesce.pdf a.pdf"}, | ||
| 1607 | + {$td->STRING => "", $td->EXIT_STATUS => 0}, | ||
| 1608 | + $td->NORMALIZE_NEWLINES); | ||
| 1609 | +$td->runtest("check output", | ||
| 1610 | + {$td->FILE => "a.pdf"}, | ||
| 1603 | {$td->FILE => "coalesce.qdf"}); | 1611 | {$td->FILE => "coalesce.qdf"}); |
| 1604 | $td->runtest("coalesce contents with qdf", | 1612 | $td->runtest("coalesce contents with qdf", |
| 1605 | {$td->COMMAND => | 1613 | {$td->COMMAND => |
| @@ -1831,12 +1839,12 @@ $td->runtest("unreferenced resources with bad token", | @@ -1831,12 +1839,12 @@ $td->runtest("unreferenced resources with bad token", | ||
| 1831 | {$td->COMMAND => | 1839 | {$td->COMMAND => |
| 1832 | "qpdf --qdf --static-id --split-pages=2" . | 1840 | "qpdf --qdf --static-id --split-pages=2" . |
| 1833 | " --remove-unreferenced-resources=yes" . | 1841 | " --remove-unreferenced-resources=yes" . |
| 1834 | - " coalesce.pdf split-out-bad-token.pdf"}, | ||
| 1835 | - {$td->FILE => "coalesce-split.out", $td->EXIT_STATUS => 3}, | 1842 | + " split-tokens.pdf split-out-bad-token.pdf"}, |
| 1843 | + {$td->FILE => "split-tokens-split.out", $td->EXIT_STATUS => 3}, | ||
| 1836 | $td->NORMALIZE_NEWLINES); | 1844 | $td->NORMALIZE_NEWLINES); |
| 1837 | $td->runtest("check output", | 1845 | $td->runtest("check output", |
| 1838 | {$td->FILE => "split-out-bad-token-1-2.pdf"}, | 1846 | {$td->FILE => "split-out-bad-token-1-2.pdf"}, |
| 1839 | - {$td->FILE => "coalesce-split-1-2.pdf"}); | 1847 | + {$td->FILE => "split-tokens-split-1-2.pdf"}); |
| 1840 | 1848 | ||
| 1841 | $td->runtest("shared images in form xobject", | 1849 | $td->runtest("shared images in form xobject", |
| 1842 | {$td->COMMAND => "qpdf --qdf --static-id --split-pages". | 1850 | {$td->COMMAND => "qpdf --qdf --static-id --split-pages". |
qpdf/qtest/qpdf/coalesce-out.pdf
No preview for this file type
qpdf/qtest/qpdf/coalesce-out.qdf
No preview for this file type
qpdf/qtest/qpdf/coalesce.pdf
No preview for this file type
qpdf/qtest/qpdf/coalesce.qdf
No preview for this file type
qpdf/qtest/qpdf/normalize-warnings.out
| 1 | -WARNING: coalesce.pdf (offset 671): content normalization encountered bad tokens | ||
| 2 | -WARNING: coalesce.pdf (offset 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | ||
| 3 | -WARNING: coalesce.pdf (offset 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | ||
| 4 | -WARNING: coalesce.pdf (offset 823): content normalization encountered bad tokens | ||
| 5 | -WARNING: coalesce.pdf (offset 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | ||
| 6 | -WARNING: coalesce.pdf (offset 962): content normalization encountered bad tokens | ||
| 7 | -WARNING: coalesce.pdf (offset 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | ||
| 8 | -WARNING: coalesce.pdf (offset 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | 1 | +WARNING: split-tokens.pdf (offset 671): content normalization encountered bad tokens |
| 2 | +WARNING: split-tokens.pdf (offset 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | ||
| 3 | +WARNING: split-tokens.pdf (offset 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | ||
| 4 | +WARNING: split-tokens.pdf (offset 823): content normalization encountered bad tokens | ||
| 5 | +WARNING: split-tokens.pdf (offset 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | ||
| 6 | +WARNING: split-tokens.pdf (offset 962): content normalization encountered bad tokens | ||
| 7 | +WARNING: split-tokens.pdf (offset 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | ||
| 8 | +WARNING: split-tokens.pdf (offset 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | ||
| 9 | qpdf: operation succeeded with warnings; resulting file may have some problems | 9 | qpdf: operation succeeded with warnings; resulting file may have some problems |
qpdf/qtest/qpdf/coalesce-split-1-2.pdf renamed to qpdf/qtest/qpdf/split-tokens-split-1-2.pdf
No preview for this file type
qpdf/qtest/qpdf/coalesce-split.out renamed to qpdf/qtest/qpdf/split-tokens-split.out
| 1 | -WARNING: coalesce.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this page | 1 | +WARNING: split-tokens.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this page |
| 2 | WARNING: empty PDF: content normalization encountered bad tokens | 2 | WARNING: empty PDF: content normalization encountered bad tokens |
| 3 | WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | 3 | WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents |
| 4 | WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | 4 | WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. |
qpdf/qtest/qpdf/split-tokens.pdf
0 → 100644
No preview for this file type
qpdf/qtest/qpdf/split-tokens.qdf
0 → 100644
No preview for this file type
qpdf/qtest/qpdf/token-filters-out.pdf
No preview for this file type