Commit 5cfcd4f361063df8e216489915758ce40a15f15b

Authored by Jay Berkenbilt
1 parent e09ae710

Additional checks for unreferenced resources

Explicitly abandon removal of unreferenced resources if there are any
lexical errors in the page's contents. This case always generated a
warning, but it now also prevents removal of unreferenced resources,
thus strongly decreasing the likelihood of data loss.
libqpdf/QPDFPageObjectHelper.cc
@@ -99,11 +99,16 @@ QPDFPageObjectHelper::addContentTokenFilter( @@ -99,11 +99,16 @@ QPDFPageObjectHelper::addContentTokenFilter(
99 class NameWatcher: public QPDFObjectHandle::TokenFilter 99 class NameWatcher: public QPDFObjectHandle::TokenFilter
100 { 100 {
101 public: 101 public:
  102 + NameWatcher() :
  103 + saw_bad(false)
  104 + {
  105 + }
102 virtual ~NameWatcher() 106 virtual ~NameWatcher()
103 { 107 {
104 } 108 }
105 virtual void handleToken(QPDFTokenizer::Token const&); 109 virtual void handleToken(QPDFTokenizer::Token const&);
106 std::set<std::string> names; 110 std::set<std::string> names;
  111 + bool saw_bad;
107 }; 112 };
108 113
109 void 114 void
@@ -116,6 +121,10 @@ NameWatcher::handleToken(QPDFTokenizer::Token const& token) @@ -116,6 +121,10 @@ NameWatcher::handleToken(QPDFTokenizer::Token const& token)
116 this->names.insert( 121 this->names.insert(
117 QPDFObjectHandle::newName(token.getValue()).getName()); 122 QPDFObjectHandle::newName(token.getValue()).getName());
118 } 123 }
  124 + else if (token.getType() == QPDFTokenizer::tt_bad)
  125 + {
  126 + saw_bad = true;
  127 + }
119 writeToken(token); 128 writeToken(token);
120 } 129 }
121 130
@@ -134,6 +143,14 @@ QPDFPageObjectHelper::removeUnreferencedResources() @@ -134,6 +143,14 @@ QPDFPageObjectHelper::removeUnreferencedResources()
134 "; not attempting to remove unreferenced objects from this page"); 143 "; not attempting to remove unreferenced objects from this page");
135 return; 144 return;
136 } 145 }
  146 + if (nw.saw_bad)
  147 + {
  148 + QTC::TC("qpdf", "QPDFPageObjectHelper bad token finding names");
  149 + this->oh.warnIfPossible(
  150 + "Bad token found while scanning content stream; "
  151 + "not attempting to remove unreferenced objects from this page");
  152 + return;
  153 + }
137 // Walk through /Font and /XObject dictionaries, removing any 154 // Walk through /Font and /XObject dictionaries, removing any
138 // resources that are not referenced. We must make copies of 155 // resources that are not referenced. We must make copies of
139 // resource dictionaries down into the dictionaries are mutating 156 // resource dictionaries down into the dictionaries are mutating
qpdf/qpdf.testcov
@@ -412,3 +412,4 @@ QPDF copy foreign stream with provider 0 @@ -412,3 +412,4 @@ QPDF copy foreign stream with provider 0
412 QPDF copy foreign stream with buffer 0 412 QPDF copy foreign stream with buffer 0
413 QPDF immediate copy stream data 0 413 QPDF immediate copy stream data 0
414 qpdf copy same page more than once 1 414 qpdf copy same page more than once 1
  415 +QPDFPageObjectHelper bad token finding names 0
qpdf/qtest/qpdf.test
@@ -1384,7 +1384,7 @@ my @sp_cases = ( @@ -1384,7 +1384,7 @@ my @sp_cases = (
1384 [11, 'pdf extension', '', 'split-out.Pdf'], 1384 [11, 'pdf extension', '', 'split-out.Pdf'],
1385 [4, 'fallback', '--pages 11-pages.pdf 1-3 minimal.pdf --', 'split-out'], 1385 [4, 'fallback', '--pages 11-pages.pdf 1-3 minimal.pdf --', 'split-out'],
1386 ); 1386 );
1387 -$n_tests += 21; 1387 +$n_tests += 23;
1388 for (@sp_cases) 1388 for (@sp_cases)
1389 { 1389 {
1390 $n_tests += 1 + $_->[0]; 1390 $n_tests += 1 + $_->[0];
@@ -1482,10 +1482,20 @@ $td->runtest("split shared font, xobject", @@ -1482,10 +1482,20 @@ $td->runtest("split shared font, xobject",
1482 foreach my $i (qw(1 2 3 4)) 1482 foreach my $i (qw(1 2 3 4))
1483 { 1483 {
1484 $td->runtest("check output ($i)", 1484 $td->runtest("check output ($i)",
1485 - {$td->FILE => "shared-font-xobject-split-$i.pdf"},  
1486 - {$td->FILE => "split-out-shared-font-xobject-$i.pdf"}); 1485 + {$td->FILE => "split-out-shared-font-xobject-$i.pdf"},
  1486 + {$td->FILE => "shared-font-xobject-split-$i.pdf"});
1487 } 1487 }
1488 1488
  1489 +$td->runtest("unreferenced resources with bad token",
  1490 + {$td->COMMAND =>
  1491 + "qpdf --qdf --static-id --split-pages=2" .
  1492 + " coalesce.pdf split-out-bad-token.pdf"},
  1493 + {$td->FILE => "coalesce-split.out", $td->EXIT_STATUS => 3},
  1494 + $td->NORMALIZE_NEWLINES);
  1495 +$td->runtest("check output",
  1496 + {$td->FILE => "split-out-bad-token-1-2.pdf"},
  1497 + {$td->FILE => "coalesce-split-1-2.pdf"});
  1498 +
1489 show_ntests(); 1499 show_ntests();
1490 # ---------- 1500 # ----------
1491 $td->notify("--- Keep Files Open ---"); 1501 $td->notify("--- Keep Files Open ---");
qpdf/qtest/qpdf/coalesce-split-1-2.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/coalesce-split.out 0 → 100644
  1 +WARNING: coalesce.pdf, object 3 0 at offset 181: Bad token found while scanning content stream; not attempting to remove unreferenced objects from this page
  2 +WARNING: empty PDF: content normalization encountered bad tokens
  3 +WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
  4 +WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
  5 +WARNING: empty PDF: content normalization encountered bad tokens
  6 +WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
  7 +WARNING: empty PDF: content normalization encountered bad tokens
  8 +WARNING: empty PDF: normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
  9 +WARNING: empty PDF: Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
  10 +qpdf: operation succeeded with warnings; resulting file may have some problems