Commit 5cfcd4f361063df8e216489915758ce40a15f15b
1 parent
e09ae710
Additional checks for unreferenced resources
Explicitly abandon removal of unreferenced resources if there are any lexical errors in the page's contents. This case always generated a warning, but it now also prevents removal of unreferenced resources, thus strongly decreasing the likelihood of data loss.
Showing
5 changed files
with
272 additions
and
3 deletions
libqpdf/QPDFPageObjectHelper.cc
| @@ -99,11 +99,16 @@ QPDFPageObjectHelper::addContentTokenFilter( | @@ -99,11 +99,16 @@ QPDFPageObjectHelper::addContentTokenFilter( | ||
| 99 | class NameWatcher: public QPDFObjectHandle::TokenFilter | 99 | class NameWatcher: public QPDFObjectHandle::TokenFilter |
| 100 | { | 100 | { |
| 101 | public: | 101 | public: |
| 102 | + NameWatcher() : | ||
| 103 | + saw_bad(false) | ||
| 104 | + { | ||
| 105 | + } | ||
| 102 | virtual ~NameWatcher() | 106 | virtual ~NameWatcher() |
| 103 | { | 107 | { |
| 104 | } | 108 | } |
| 105 | virtual void handleToken(QPDFTokenizer::Token const&); | 109 | virtual void handleToken(QPDFTokenizer::Token const&); |
| 106 | std::set<std::string> names; | 110 | std::set<std::string> names; |
| 111 | + bool saw_bad; | ||
| 107 | }; | 112 | }; |
| 108 | 113 | ||
| 109 | void | 114 | void |
| @@ -116,6 +121,10 @@ NameWatcher::handleToken(QPDFTokenizer::Token const& token) | @@ -116,6 +121,10 @@ NameWatcher::handleToken(QPDFTokenizer::Token const& token) | ||
| 116 | this->names.insert( | 121 | this->names.insert( |
| 117 | QPDFObjectHandle::newName(token.getValue()).getName()); | 122 | QPDFObjectHandle::newName(token.getValue()).getName()); |
| 118 | } | 123 | } |
| 124 | + else if (token.getType() == QPDFTokenizer::tt_bad) | ||
| 125 | + { | ||
| 126 | + saw_bad = true; | ||
| 127 | + } | ||
| 119 | writeToken(token); | 128 | writeToken(token); |
| 120 | } | 129 | } |
| 121 | 130 | ||
| @@ -134,6 +143,14 @@ QPDFPageObjectHelper::removeUnreferencedResources() | @@ -134,6 +143,14 @@ QPDFPageObjectHelper::removeUnreferencedResources() | ||
| 134 | "; not attempting to remove unreferenced objects from this page"); | 143 | "; not attempting to remove unreferenced objects from this page"); |
| 135 | return; | 144 | return; |
| 136 | } | 145 | } |
| 146 | + if (nw.saw_bad) | ||
| 147 | + { | ||
| 148 | + QTC::TC("qpdf", "QPDFPageObjectHelper bad token finding names"); | ||
| 149 | + this->oh.warnIfPossible( | ||
| 150 | + "Bad token found while scanning content stream; " | ||
| 151 | + "not attempting to remove unreferenced objects from this page"); | ||
| 152 | + return; | ||
| 153 | + } | ||
| 137 | // Walk through /Font and /XObject dictionaries, removing any | 154 | // Walk through /Font and /XObject dictionaries, removing any |
| 138 | // resources that are not referenced. We must make copies of | 155 | // resources that are not referenced. We must make copies of |
| 139 | // resource dictionaries down into the dictionaries are mutating | 156 | // resource dictionaries down into the dictionaries are mutating |
qpdf/qpdf.testcov
| @@ -412,3 +412,4 @@ QPDF copy foreign stream with provider 0 | @@ -412,3 +412,4 @@ QPDF copy foreign stream with provider 0 | ||
| 412 | QPDF copy foreign stream with buffer 0 | 412 | QPDF copy foreign stream with buffer 0 |
| 413 | QPDF immediate copy stream data 0 | 413 | QPDF immediate copy stream data 0 |
| 414 | qpdf copy same page more than once 1 | 414 | qpdf copy same page more than once 1 |
| 415 | +QPDFPageObjectHelper bad token finding names 0 |
qpdf/qtest/qpdf.test
| @@ -1384,7 +1384,7 @@ my @sp_cases = ( | @@ -1384,7 +1384,7 @@ my @sp_cases = ( | ||
| 1384 | [11, 'pdf extension', '', 'split-out.Pdf'], | 1384 | [11, 'pdf extension', '', 'split-out.Pdf'], |
| 1385 | [4, 'fallback', '--pages 11-pages.pdf 1-3 minimal.pdf --', 'split-out'], | 1385 | [4, 'fallback', '--pages 11-pages.pdf 1-3 minimal.pdf --', 'split-out'], |
| 1386 | ); | 1386 | ); |
| 1387 | -$n_tests += 21; | 1387 | +$n_tests += 23; |
| 1388 | for (@sp_cases) | 1388 | for (@sp_cases) |
| 1389 | { | 1389 | { |
| 1390 | $n_tests += 1 + $_->[0]; | 1390 | $n_tests += 1 + $_->[0]; |
| @@ -1482,10 +1482,20 @@ $td->runtest("split shared font, xobject", | @@ -1482,10 +1482,20 @@ $td->runtest("split shared font, xobject", | ||
| 1482 | foreach my $i (qw(1 2 3 4)) | 1482 | foreach my $i (qw(1 2 3 4)) |
| 1483 | { | 1483 | { |
| 1484 | $td->runtest("check output ($i)", | 1484 | $td->runtest("check output ($i)", |
| 1485 | - {$td->FILE => "shared-font-xobject-split-$i.pdf"}, | ||
| 1486 | - {$td->FILE => "split-out-shared-font-xobject-$i.pdf"}); | 1485 | + {$td->FILE => "split-out-shared-font-xobject-$i.pdf"}, |
| 1486 | + {$td->FILE => "shared-font-xobject-split-$i.pdf"}); | ||
| 1487 | } | 1487 | } |
| 1488 | 1488 | ||
| 1489 | +$td->runtest("unreferenced resources with bad token", | ||
| 1490 | + {$td->COMMAND => | ||
| 1491 | + "qpdf --qdf --static-id --split-pages=2" . | ||
| 1492 | + " coalesce.pdf split-out-bad-token.pdf"}, | ||
| 1493 | + {$td->FILE => "coalesce-split.out", $td->EXIT_STATUS => 3}, | ||
| 1494 | + $td->NORMALIZE_NEWLINES); | ||
| 1495 | +$td->runtest("check output", | ||
| 1496 | + {$td->FILE => "split-out-bad-token-1-2.pdf"}, | ||
| 1497 | + {$td->FILE => "coalesce-split-1-2.pdf"}); | ||
| 1498 | + | ||
| 1489 | show_ntests(); | 1499 | show_ntests(); |
| 1490 | # ---------- | 1500 | # ---------- |
| 1491 | $td->notify("--- Keep Files Open ---"); | 1501 | $td->notify("--- Keep Files Open ---"); |
qpdf/qtest/qpdf/coalesce-split-1-2.pdf
0 → 100644
No preview for this file type
qpdf/qtest/qpdf/coalesce-split.out
0 → 100644
| 1 | +WARNING: coalesce.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this page | ||
| 2 | +WARNING: empty PDF: content normalization encountered bad tokens | ||
| 3 | +WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | ||
| 4 | +WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | ||
| 5 | +WARNING: empty PDF: content normalization encountered bad tokens | ||
| 6 | +WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | ||
| 7 | +WARNING: empty PDF: content normalization encountered bad tokens | ||
| 8 | +WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents | ||
| 9 | +WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual. | ||
| 10 | +qpdf: operation succeeded with warnings; resulting file may have some problems |