Commit b30deaeeaba3941d7615bc2cc89c664b1273e5df
1 parent
0dea2769
Avoid merging adjacent tokens when concatenating contents (fixes #444)
Showing 16 changed files with 541 additions and 43 deletions
ChangeLog
| 1 | 1 | 2020-10-23 Jay Berkenbilt <ejb@ql.org> |
| 2 | 2 | |
| 3 | + * Bug fix: when concatenating content streams, insert a newline if | |
| 4 | + needed to prevent the last token from the old stream from being | |
| 5 | + merged with the first token of the new stream. Qpdf was mistakenly | |
| 6 | + concatenating the streams without regard to the specification that | |
| 7 | + content streams are to be broken on token boundaries. Fixes #444. | |
| 8 | + | |
| 3 | 9 | * Bug fix: fix-qdf: properly handle empty streams with ignore |
| 4 | 10 | newline. |
| 5 | 11 | ... | ... |
TODO
libqpdf/QPDFObjectHandle.cc
| ... | ... | @@ -165,6 +165,47 @@ QPDFObjectHandle::ParserCallbacks::terminateParsing() |
| 165 | 165 | throw TerminateParsing(); |
| 166 | 166 | } |
| 167 | 167 | |
| 168 | +class LastChar: public Pipeline | |
| 169 | +{ | |
| 170 | + public: | |
| 171 | + LastChar(Pipeline* next); | |
| 172 | + virtual ~LastChar() = default; | |
| 173 | + virtual void write(unsigned char* data, size_t len); | |
| 174 | + virtual void finish(); | |
| 175 | + unsigned char getLastChar(); | |
| 176 | + | |
| 177 | + private: | |
| 178 | + unsigned char last_char; | |
| 179 | +}; | |
| 180 | + | |
| 181 | +LastChar::LastChar(Pipeline* next) : | |
| 182 | + Pipeline("lastchar", next), | |
| 183 | + last_char(0) | |
| 184 | +{ | |
| 185 | +} | |
| 186 | + | |
| 187 | +void | |
| 188 | +LastChar::write(unsigned char* data, size_t len) | |
| 189 | +{ | |
| 190 | + if (len > 0) | |
| 191 | + { | |
| 192 | + this->last_char = data[len - 1]; | |
| 193 | + } | |
| 194 | + getNext()->write(data, len); | |
| 195 | +} | |
| 196 | + | |
| 197 | +void | |
| 198 | +LastChar::finish() | |
| 199 | +{ | |
| 200 | + getNext()->finish(); | |
| 201 | +} | |
| 202 | + | |
| 203 | +unsigned char | |
| 204 | +LastChar::getLastChar() | |
| 205 | +{ | |
| 206 | + return this->last_char; | |
| 207 | +} | |
| 208 | + | |
| 168 | 209 | QPDFObjectHandle::QPDFObjectHandle() : |
| 169 | 210 | initialized(false), |
| 170 | 211 | qpdf(0), |
| ... | ... | @@ -1600,21 +1641,31 @@ QPDFObjectHandle::pipeContentStreams( |
| 1600 | 1641 | std::vector<QPDFObjectHandle> streams = |
| 1601 | 1642 | arrayOrStreamToStreamArray( |
| 1602 | 1643 | description, all_description); |
| 1644 | + bool need_newline = false; | |
| 1603 | 1645 | for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin(); |
| 1604 | 1646 | iter != streams.end(); ++iter) |
| 1605 | 1647 | { |
| 1648 | + if (need_newline) | |
| 1649 | + { | |
| 1650 | + p->write(QUtil::unsigned_char_pointer("\n"), 1); | |
| 1651 | + } | |
| 1652 | + LastChar lc(p); | |
| 1606 | 1653 | QPDFObjectHandle stream = *iter; |
| 1607 | 1654 | std::string og = |
| 1608 | 1655 | QUtil::int_to_string(stream.getObjectID()) + " " + |
| 1609 | 1656 | QUtil::int_to_string(stream.getGeneration()); |
| 1610 | 1657 | std::string w_description = "content stream object " + og; |
| 1611 | - if (! stream.pipeStreamData(p, 0, qpdf_dl_specialized)) | |
| 1658 | + if (! stream.pipeStreamData(&lc, 0, qpdf_dl_specialized)) | |
| 1612 | 1659 | { |
| 1613 | 1660 | QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent"); |
| 1614 | 1661 | throw QPDFExc(qpdf_e_damaged_pdf, "content stream", |
| 1615 | 1662 | w_description, 0, |
| 1616 | 1663 | "errors while decoding content stream"); |
| 1617 | 1664 | } |
| 1665 | + lc.finish(); | |
| 1666 | + need_newline = (lc.getLastChar() != static_cast<unsigned char>('\n')); | |
| 1667 | + QTC::TC("qpdf", "QPDFObjectHandle need_newline", | |
| 1668 | + need_newline ? 0 : 1); | |
| 1618 | 1669 | } |
| 1619 | 1670 | } |
| 1620 | 1671 | ... | ... |
manual/qpdf-manual.xml
| ... | ... | @@ -2090,14 +2090,9 @@ outfile.pdf</option> |
| 2090 | 2090 | option causes qpdf to combine them into a single stream. Use |
| 2091 | 2091 | of this option is never necessary for ordinary usage, but it |
| 2092 | 2092 | can help when working with some files in some cases. For |
| 2093 | - example, some PDF writers split page contents into small | |
| 2094 | - streams at arbitrary points that may fall in the middle of | |
| 2095 | - lexical tokens within the content, and some PDF readers may | |
| 2096 | - get confused on such files. If you use qpdf to coalesce the | |
| 2097 | - content streams, such readers may be able to work with the | |
| 2098 | - file more easily. This can also be combined with QDF mode or | |
| 2099 | - content normalization to make it easier to look at all of a | |
| 2100 | - page's contents at once. | |
| 2093 | + example, this can be combined with QDF mode or content | |
| 2094 | + normalization to make it easier to look at all of a page's | |
| 2095 | + contents at once. | |
| 2101 | 2096 | </para> |
| 2102 | 2097 | </listitem> |
| 2103 | 2098 | </varlistentry> |
| ... | ... | @@ -2398,25 +2393,15 @@ outfile.pdf</option> |
| 2398 | 2393 | You should not use this for “production” PDF files. |
| 2399 | 2394 | </para> |
| 2400 | 2395 | <para> |
| 2401 | - This paragraph discusses edge cases of content normalization that | |
| 2402 | - are not of concern to most users and are not relevant when content | |
| 2403 | - normalization is not enabled. When normalizing content, if qpdf | |
| 2404 | - runs into any lexical errors, it will print a warning indicating | |
| 2405 | - that content may be damaged. The only situation in which qpdf is | |
| 2406 | - known to cause damage during content normalization is when a | |
| 2407 | - page's contents are split across multiple streams and streams are | |
| 2408 | - split in the middle of a lexical token such as a string, name, or | |
| 2409 | - inline image. There may be some pathological cases in which qpdf | |
| 2410 | - could damage content without noticing this, such as if the partial | |
| 2411 | - tokens at the end of one stream and the beginning of the next | |
| 2412 | - stream are both valid, but usually qpdf will be able to detect | |
| 2413 | - this case. For slightly increased safety, you can specify | |
| 2414 | - <option>--coalesce-contents</option> in addition to | |
| 2415 | - <option>--normalize-content</option> or <option>--qdf</option>. | |
| 2416 | - This will cause qpdf to combine all the content streams into one, | |
| 2417 | - thus recombining any split tokens. However doing this will prevent | |
| 2418 | - you from being able to see the original layout of the content | |
| 2419 | - streams. If you must inspect the original content streams in an | |
| 2396 | + When normalizing content, if qpdf runs into any lexical errors, it | |
| 2397 | + will print a warning indicating that content may be damaged. The | |
| 2398 | + only situation in which qpdf is known to cause damage during | |
| 2399 | + content normalization is when a page's contents are split across | |
| 2400 | + multiple streams and streams are split in the middle of a lexical | |
| 2401 | + token such as a string, name, or inline image. Note that files | |
| 2402 | + that do this are invalid since the PDF specification states that | |
| 2403 | + content streams are not to be split in the middle of a token. If | |
| 2404 | + you want to inspect the original content streams in an | |
| 2420 | 2405 | uncompressed format, you can always run with <option>--qdf |
| 2421 | 2406 | --normalize-content=n</option> for a QDF file without content |
| 2422 | 2407 | normalization, or alternatively | ... | ... |
qpdf/qpdf.testcov
qpdf/qtest/qpdf.test
| ... | ... | @@ -1591,15 +1591,23 @@ $td->runtest("type checks with object streams", |
| 1591 | 1591 | |
| 1592 | 1592 | # ---------- |
| 1593 | 1593 | $td->notify("--- Coalesce contents ---"); |
| 1594 | -$n_tests += 6; | |
| 1594 | +$n_tests += 8; | |
| 1595 | 1595 | |
| 1596 | 1596 | $td->runtest("qdf with normalize warnings", |
| 1597 | 1597 | {$td->COMMAND => |
| 1598 | - "qpdf --qdf --static-id coalesce.pdf a.pdf"}, | |
| 1598 | + "qpdf --qdf --static-id split-tokens.pdf a.pdf"}, | |
| 1599 | 1599 | {$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3}, |
| 1600 | 1600 | $td->NORMALIZE_NEWLINES); |
| 1601 | 1601 | $td->runtest("check output", |
| 1602 | 1602 | {$td->FILE => "a.pdf"}, |
| 1603 | + {$td->FILE => "split-tokens.qdf"}); | |
| 1604 | +$td->runtest("coalesce to qdf", | |
| 1605 | + {$td->COMMAND => | |
| 1606 | + "qpdf --qdf --static-id coalesce.pdf a.pdf"}, | |
| 1607 | + {$td->STRING => "", $td->EXIT_STATUS => 0}, | |
| 1608 | + $td->NORMALIZE_NEWLINES); | |
| 1609 | +$td->runtest("check output", | |
| 1610 | + {$td->FILE => "a.pdf"}, | |
| 1603 | 1611 | {$td->FILE => "coalesce.qdf"}); |
| 1604 | 1612 | $td->runtest("coalesce contents with qdf", |
| 1605 | 1613 | {$td->COMMAND => |
| ... | ... | @@ -1831,12 +1839,12 @@ $td->runtest("unreferenced resources with bad token", |
| 1831 | 1839 | {$td->COMMAND => |
| 1832 | 1840 | "qpdf --qdf --static-id --split-pages=2" . |
| 1833 | 1841 | " --remove-unreferenced-resources=yes" . |
| 1834 | - " coalesce.pdf split-out-bad-token.pdf"}, | |
| 1835 | - {$td->FILE => "coalesce-split.out", $td->EXIT_STATUS => 3}, | |
| 1842 | + " split-tokens.pdf split-out-bad-token.pdf"}, | |
| 1843 | + {$td->FILE => "split-tokens-split.out", $td->EXIT_STATUS => 3}, | |
| 1836 | 1844 | $td->NORMALIZE_NEWLINES); |
| 1837 | 1845 | $td->runtest("check output", |
| 1838 | 1846 | {$td->FILE => "split-out-bad-token-1-2.pdf"}, |
| 1839 | - {$td->FILE => "coalesce-split-1-2.pdf"}); | |
| 1847 | + {$td->FILE => "split-tokens-split-1-2.pdf"}); | |
| 1840 | 1848 | |
| 1841 | 1849 | $td->runtest("shared images in form xobject", |
| 1842 | 1850 | {$td->COMMAND => "qpdf --qdf --static-id --split-pages". | ... | ... |
qpdf/qtest/qpdf/coalesce-out.pdf
No preview for this file type
qpdf/qtest/qpdf/coalesce-out.qdf
No preview for this file type
qpdf/qtest/qpdf/coalesce.pdf
No preview for this file type
qpdf/qtest/qpdf/coalesce.qdf
No preview for this file type
qpdf/qtest/qpdf/normalize-warnings.out
| 1 | -WARNING: coalesce.pdf (offset 671): content normalization encountered bad tokens | |
| 2 | -WARNING: coalesce.pdf (offset 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | |
| 3 | -WARNING: coalesce.pdf (offset 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | |
| 4 | -WARNING: coalesce.pdf (offset 823): content normalization encountered bad tokens | |
| 5 | -WARNING: coalesce.pdf (offset 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | |
| 6 | -WARNING: coalesce.pdf (offset 962): content normalization encountered bad tokens | |
| 7 | -WARNING: coalesce.pdf (offset 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | |
| 8 | -WARNING: coalesce.pdf (offset 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | |
| 1 | +WARNING: split-tokens.pdf (offset 671): content normalization encountered bad tokens | |
| 2 | +WARNING: split-tokens.pdf (offset 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | |
| 3 | +WARNING: split-tokens.pdf (offset 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | |
| 4 | +WARNING: split-tokens.pdf (offset 823): content normalization encountered bad tokens | |
| 5 | +WARNING: split-tokens.pdf (offset 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | |
| 6 | +WARNING: split-tokens.pdf (offset 962): content normalization encountered bad tokens | |
| 7 | +WARNING: split-tokens.pdf (offset 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | |
| 8 | +WARNING: split-tokens.pdf (offset 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | |
| 9 | 9 | qpdf: operation succeeded with warnings; resulting file may have some problems | ... | ... |
qpdf/qtest/qpdf/coalesce-split-1-2.pdf renamed to qpdf/qtest/qpdf/split-tokens-split-1-2.pdf
No preview for this file type
qpdf/qtest/qpdf/coalesce-split.out renamed to qpdf/qtest/qpdf/split-tokens-split.out
| 1 | -WARNING: coalesce.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this page | |
| 1 | +WARNING: split-tokens.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this page | |
| 2 | 2 | WARNING: empty PDF: content normalization encountered bad tokens |
| 3 | 3 | WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents |
| 4 | 4 | WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | ... | ... |
qpdf/qtest/qpdf/split-tokens.pdf
0 → 100644
No preview for this file type
qpdf/qtest/qpdf/split-tokens.qdf
0 → 100644
No preview for this file type
qpdf/qtest/qpdf/token-filters-out.pdf
No preview for this file type