Commit b30deaeeaba3941d7615bc2cc89c664b1273e5df

Authored by Jay Berkenbilt
1 parent 0dea2769

Avoid merging adjacent tokens when concatenating contents (fixes #444)

ChangeLog
1 2020-10-23 Jay Berkenbilt <ejb@ql.org> 1 2020-10-23 Jay Berkenbilt <ejb@ql.org>
2 2
  3 + * Bug fix: when concatenating content streams, insert a newline if
  4 + needed to prevent the last token from the old stream from being
  5 + merged with the first token of the new stream. Qpdf was mistakenly
  6 + concatenating the streams without regard to the specification that
  7 + content streams are to be broken on token boundaries. Fixes #444.
  8 +
3 * Bug fix: fix-qdf: properly handle empty streams with ignore 9 * Bug fix: fix-qdf: properly handle empty streams with ignore
4 newline. 10 newline.
5 11
@@ -4,7 +4,6 @@ Candidates for upcoming release @@ -4,7 +4,6 @@ Candidates for upcoming release
4 * Open "next" issues 4 * Open "next" issues
5 * bugs 5 * bugs
6 * #473: zsh completion with directories 6 * #473: zsh completion with directories
7 - * #444: concatenated stream/whitespace bug  
8 * Non-bugs 7 * Non-bugs
9 * #446: recognize edited QDF files 8 * #446: recognize edited QDF files
10 * #436: parsing of document with form xobject 9 * #436: parsing of document with form xobject
libqpdf/QPDFObjectHandle.cc
@@ -165,6 +165,47 @@ QPDFObjectHandle::ParserCallbacks::terminateParsing() @@ -165,6 +165,47 @@ QPDFObjectHandle::ParserCallbacks::terminateParsing()
165 throw TerminateParsing(); 165 throw TerminateParsing();
166 } 166 }
167 167
  168 +class LastChar: public Pipeline
  169 +{
  170 + public:
  171 + LastChar(Pipeline* next);
  172 + virtual ~LastChar() = default;
  173 + virtual void write(unsigned char* data, size_t len);
  174 + virtual void finish();
  175 + unsigned char getLastChar();
  176 +
  177 + private:
  178 + unsigned char last_char;
  179 +};
  180 +
  181 +LastChar::LastChar(Pipeline* next) :
  182 + Pipeline("lastchar", next),
  183 + last_char(0)
  184 +{
  185 +}
  186 +
  187 +void
  188 +LastChar::write(unsigned char* data, size_t len)
  189 +{
  190 + if (len > 0)
  191 + {
  192 + this->last_char = data[len - 1];
  193 + }
  194 + getNext()->write(data, len);
  195 +}
  196 +
  197 +void
  198 +LastChar::finish()
  199 +{
  200 + getNext()->finish();
  201 +}
  202 +
  203 +unsigned char
  204 +LastChar::getLastChar()
  205 +{
  206 + return this->last_char;
  207 +}
  208 +
168 QPDFObjectHandle::QPDFObjectHandle() : 209 QPDFObjectHandle::QPDFObjectHandle() :
169 initialized(false), 210 initialized(false),
170 qpdf(0), 211 qpdf(0),
@@ -1600,21 +1641,31 @@ QPDFObjectHandle::pipeContentStreams( @@ -1600,21 +1641,31 @@ QPDFObjectHandle::pipeContentStreams(
1600 std::vector<QPDFObjectHandle> streams = 1641 std::vector<QPDFObjectHandle> streams =
1601 arrayOrStreamToStreamArray( 1642 arrayOrStreamToStreamArray(
1602 description, all_description); 1643 description, all_description);
  1644 + bool need_newline = false;
1603 for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin(); 1645 for (std::vector<QPDFObjectHandle>::iterator iter = streams.begin();
1604 iter != streams.end(); ++iter) 1646 iter != streams.end(); ++iter)
1605 { 1647 {
  1648 + if (need_newline)
  1649 + {
  1650 + p->write(QUtil::unsigned_char_pointer("\n"), 1);
  1651 + }
  1652 + LastChar lc(p);
1606 QPDFObjectHandle stream = *iter; 1653 QPDFObjectHandle stream = *iter;
1607 std::string og = 1654 std::string og =
1608 QUtil::int_to_string(stream.getObjectID()) + " " + 1655 QUtil::int_to_string(stream.getObjectID()) + " " +
1609 QUtil::int_to_string(stream.getGeneration()); 1656 QUtil::int_to_string(stream.getGeneration());
1610 std::string w_description = "content stream object " + og; 1657 std::string w_description = "content stream object " + og;
1611 - if (! stream.pipeStreamData(p, 0, qpdf_dl_specialized)) 1658 + if (! stream.pipeStreamData(&lc, 0, qpdf_dl_specialized))
1612 { 1659 {
1613 QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent"); 1660 QTC::TC("qpdf", "QPDFObjectHandle errors in parsecontent");
1614 throw QPDFExc(qpdf_e_damaged_pdf, "content stream", 1661 throw QPDFExc(qpdf_e_damaged_pdf, "content stream",
1615 w_description, 0, 1662 w_description, 0,
1616 "errors while decoding content stream"); 1663 "errors while decoding content stream");
1617 } 1664 }
  1665 + lc.finish();
  1666 + need_newline = (lc.getLastChar() != static_cast<unsigned char>('\n'));
  1667 + QTC::TC("qpdf", "QPDFObjectHandle need_newline",
  1668 + need_newline ? 0 : 1);
1618 } 1669 }
1619 } 1670 }
1620 1671
manual/qpdf-manual.xml
@@ -2090,14 +2090,9 @@ outfile.pdf&lt;/option&gt; @@ -2090,14 +2090,9 @@ outfile.pdf&lt;/option&gt;
2090 option causes qpdf to combine them into a single stream. Use 2090 option causes qpdf to combine them into a single stream. Use
2091 of this option is never necessary for ordinary usage, but it 2091 of this option is never necessary for ordinary usage, but it
2092 can help when working with some files in some cases. For 2092 can help when working with some files in some cases. For
2093 - example, some PDF writers split page contents into small  
2094 - streams at arbitrary points that may fall in the middle of  
2095 - lexical tokens within the content, and some PDF readers may  
2096 - get confused on such files. If you use qpdf to coalesce the  
2097 - content streams, such readers may be able to work with the  
2098 - file more easily. This can also be combined with QDF mode or  
2099 - content normalization to make it easier to look at all of a  
2100 - page's contents at once. 2093 + example, this can be combined with QDF mode or content
  2094 + normalization to make it easier to look at all of a page's
  2095 + contents at once.
2101 </para> 2096 </para>
2102 </listitem> 2097 </listitem>
2103 </varlistentry> 2098 </varlistentry>
@@ -2398,25 +2393,15 @@ outfile.pdf&lt;/option&gt; @@ -2398,25 +2393,15 @@ outfile.pdf&lt;/option&gt;
2398 You should not use this for &ldquo;production&rdquo; PDF files. 2393 You should not use this for &ldquo;production&rdquo; PDF files.
2399 </para> 2394 </para>
2400 <para> 2395 <para>
2401 - This paragraph discusses edge cases of content normalization that  
2402 - are not of concern to most users and are not relevant when content  
2403 - normalization is not enabled. When normalizing content, if qpdf  
2404 - runs into any lexical errors, it will print a warning indicating  
2405 - that content may be damaged. The only situation in which qpdf is  
2406 - known to cause damage during content normalization is when a  
2407 - page's contents are split across multiple streams and streams are  
2408 - split in the middle of a lexical token such as a string, name, or  
2409 - inline image. There may be some pathological cases in which qpdf  
2410 - could damage content without noticing this, such as if the partial  
2411 - tokens at the end of one stream and the beginning of the next  
2412 - stream are both valid, but usually qpdf will be able to detect  
2413 - this case. For slightly increased safety, you can specify  
2414 - <option>--coalesce-contents</option> in addition to  
2415 - <option>--normalize-content</option> or <option>--qdf</option>.  
2416 - This will cause qpdf to combine all the content streams into one,  
2417 - thus recombining any split tokens. However doing this will prevent  
2418 - you from being able to see the original layout of the content  
2419 - streams. If you must inspect the original content streams in an 2396 + When normalizing content, if qpdf runs into any lexical errors, it
  2397 + will print a warning indicating that content may be damaged. The
  2398 + only situation in which qpdf is known to cause damage during
  2399 + content normalization is when a page's contents are split across
  2400 + multiple streams and streams are split in the middle of a lexical
  2401 + token such as a string, name, or inline image. Note that files
  2402 + that do this are invalid since the PDF specification states that
  2403 + content streams are not to be split in the middle of a token. If
  2404 + you want to inspect the original content streams in an
2420 uncompressed format, you can always run with <option>--qdf 2405 uncompressed format, you can always run with <option>--qdf
2421 --normalize-content=n</option> for a QDF file without content 2406 --normalize-content=n</option> for a QDF file without content
2422 normalization, or alternatively 2407 normalization, or alternatively
qpdf/qpdf.testcov
@@ -455,3 +455,4 @@ qpdf found shared resources in leaf 0 @@ -455,3 +455,4 @@ qpdf found shared resources in leaf 0
455 qpdf found shared xobject in leaf 0 455 qpdf found shared xobject in leaf 0
456 QPDF copy foreign with data 1 456 QPDF copy foreign with data 1
457 QPDF copy foreign with foreign_stream 1 457 QPDF copy foreign with foreign_stream 1
  458 +QPDFObjectHandle need_newline 1
qpdf/qtest/qpdf.test
@@ -1591,15 +1591,23 @@ $td-&gt;runtest(&quot;type checks with object streams&quot;, @@ -1591,15 +1591,23 @@ $td-&gt;runtest(&quot;type checks with object streams&quot;,
1591 1591
1592 # ---------- 1592 # ----------
1593 $td->notify("--- Coalesce contents ---"); 1593 $td->notify("--- Coalesce contents ---");
1594 -$n_tests += 6; 1594 +$n_tests += 8;
1595 1595
1596 $td->runtest("qdf with normalize warnings", 1596 $td->runtest("qdf with normalize warnings",
1597 {$td->COMMAND => 1597 {$td->COMMAND =>
1598 - "qpdf --qdf --static-id coalesce.pdf a.pdf"}, 1598 + "qpdf --qdf --static-id split-tokens.pdf a.pdf"},
1599 {$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3}, 1599 {$td->FILE => "normalize-warnings.out", $td->EXIT_STATUS => 3},
1600 $td->NORMALIZE_NEWLINES); 1600 $td->NORMALIZE_NEWLINES);
1601 $td->runtest("check output", 1601 $td->runtest("check output",
1602 {$td->FILE => "a.pdf"}, 1602 {$td->FILE => "a.pdf"},
  1603 + {$td->FILE => "split-tokens.qdf"});
  1604 +$td->runtest("coalesce to qdf",
  1605 + {$td->COMMAND =>
  1606 + "qpdf --qdf --static-id coalesce.pdf a.pdf"},
  1607 + {$td->STRING => "", $td->EXIT_STATUS => 0},
  1608 + $td->NORMALIZE_NEWLINES);
  1609 +$td->runtest("check output",
  1610 + {$td->FILE => "a.pdf"},
1603 {$td->FILE => "coalesce.qdf"}); 1611 {$td->FILE => "coalesce.qdf"});
1604 $td->runtest("coalesce contents with qdf", 1612 $td->runtest("coalesce contents with qdf",
1605 {$td->COMMAND => 1613 {$td->COMMAND =>
@@ -1831,12 +1839,12 @@ $td-&gt;runtest(&quot;unreferenced resources with bad token&quot;, @@ -1831,12 +1839,12 @@ $td-&gt;runtest(&quot;unreferenced resources with bad token&quot;,
1831 {$td->COMMAND => 1839 {$td->COMMAND =>
1832 "qpdf --qdf --static-id --split-pages=2" . 1840 "qpdf --qdf --static-id --split-pages=2" .
1833 " --remove-unreferenced-resources=yes" . 1841 " --remove-unreferenced-resources=yes" .
1834 - " coalesce.pdf split-out-bad-token.pdf"},  
1835 - {$td->FILE => "coalesce-split.out", $td->EXIT_STATUS => 3}, 1842 + " split-tokens.pdf split-out-bad-token.pdf"},
  1843 + {$td->FILE => "split-tokens-split.out", $td->EXIT_STATUS => 3},
1836 $td->NORMALIZE_NEWLINES); 1844 $td->NORMALIZE_NEWLINES);
1837 $td->runtest("check output", 1845 $td->runtest("check output",
1838 {$td->FILE => "split-out-bad-token-1-2.pdf"}, 1846 {$td->FILE => "split-out-bad-token-1-2.pdf"},
1839 - {$td->FILE => "coalesce-split-1-2.pdf"}); 1847 + {$td->FILE => "split-tokens-split-1-2.pdf"});
1840 1848
1841 $td->runtest("shared images in form xobject", 1849 $td->runtest("shared images in form xobject",
1842 {$td->COMMAND => "qpdf --qdf --static-id --split-pages". 1850 {$td->COMMAND => "qpdf --qdf --static-id --split-pages".
qpdf/qtest/qpdf/coalesce-out.pdf
No preview for this file type
qpdf/qtest/qpdf/coalesce-out.qdf
No preview for this file type
qpdf/qtest/qpdf/coalesce.pdf
No preview for this file type
qpdf/qtest/qpdf/coalesce.qdf
No preview for this file type
qpdf/qtest/qpdf/normalize-warnings.out
1 -WARNING: coalesce.pdf (offset 671): content normalization encountered bad tokens  
2 -WARNING: coalesce.pdf (offset 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents  
3 -WARNING: coalesce.pdf (offset 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.  
4 -WARNING: coalesce.pdf (offset 823): content normalization encountered bad tokens  
5 -WARNING: coalesce.pdf (offset 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.  
6 -WARNING: coalesce.pdf (offset 962): content normalization encountered bad tokens  
7 -WARNING: coalesce.pdf (offset 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents  
8 -WARNING: coalesce.pdf (offset 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. 1 +WARNING: split-tokens.pdf (offset 671): content normalization encountered bad tokens
  2 +WARNING: split-tokens.pdf (offset 671): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
  3 +WARNING: split-tokens.pdf (offset 671): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
  4 +WARNING: split-tokens.pdf (offset 823): content normalization encountered bad tokens
  5 +WARNING: split-tokens.pdf (offset 823): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
  6 +WARNING: split-tokens.pdf (offset 962): content normalization encountered bad tokens
  7 +WARNING: split-tokens.pdf (offset 962): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
  8 +WARNING: split-tokens.pdf (offset 962): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
9 qpdf: operation succeeded with warnings; resulting file may have some problems 9 qpdf: operation succeeded with warnings; resulting file may have some problems
qpdf/qtest/qpdf/coalesce-split-1-2.pdf renamed to qpdf/qtest/qpdf/split-tokens-split-1-2.pdf
No preview for this file type
qpdf/qtest/qpdf/coalesce-split.out renamed to qpdf/qtest/qpdf/split-tokens-split.out
1 -WARNING: coalesce.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this page 1 +WARNING: split-tokens.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this page
2 WARNING: empty PDF: content normalization encountered bad tokens 2 WARNING: empty PDF: content normalization encountered bad tokens
3 WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents 3 WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
4 WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. 4 WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
qpdf/qtest/qpdf/split-tokens.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/split-tokens.qdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/token-filters-out.pdf
No preview for this file type