Commit 891751f618fb95b82af289edfd2e1219e3522e6f

Authored by Jay Berkenbilt
1 parent dc92574c

Remove unreferenced resources only from relevant pages

ChangeLog
1 2021-01-04 Jay Berkenbilt <ejb@ql.org> 1 2021-01-04 Jay Berkenbilt <ejb@ql.org>
2 2
  3 + * When qpdf CLI extracts pages, it now only attempts to remove
  4 + unreferenced resourecs from the pages that it is keeping. This
  5 + change dramatically reduces the time it takes to extract a small
  6 + number of pages from a large, complex file.
  7 +
3 * Move getNext()->write() calls in some pipelines to ensure that 8 * Move getNext()->write() calls in some pipelines to ensure that
4 state gates properly reset even if the next pipeline's write 9 state gates properly reset even if the next pipeline's write
5 throws an exception (fuzz issue 28262). 10 throws an exception (fuzz issue 28262).
1 -Candidates for upcoming release  
2 -===============================  
3 -  
4 -* Remember to check work `qpdf` project for private issues  
5 - * file with very slow page extraction  
6 - * big page even with --remove-unreferenced-resources=yes, even with --empty  
7 -  
8 Fuzz Errors 1 Fuzz Errors
9 =========== 2 ===========
10 3
manual/qpdf-manual.xml
@@ -5001,6 +5001,15 @@ print &quot;\n&quot;; @@ -5001,6 +5001,15 @@ print &quot;\n&quot;;
5001 <literal>/DecodeParms</literal>. 5001 <literal>/DecodeParms</literal>.
5002 </para> 5002 </para>
5003 </listitem> 5003 </listitem>
  5004 + <listitem>
  5005 + <para>
  5006 + When extracting pages, the <command>qpdf</command> CLI only
  5007 + removes unreferenced resources from the pages that are being
  5008 + kept, resulting in a significant performance improvement
  5009 + when extracting small numbers of pages from large, complex
  5010 + documents.
  5011 + </para>
  5012 + </listitem>
5004 </itemizedlist> 5013 </itemizedlist>
5005 </listitem> 5014 </listitem>
5006 <listitem> 5015 <listitem>
qpdf/qpdf.cc
@@ -5120,6 +5120,7 @@ static void handle_page_specs(QPDF&amp; pdf, Options&amp; o) @@ -5120,6 +5120,7 @@ static void handle_page_specs(QPDF&amp; pdf, Options&amp; o)
5120 page_spec.range)); 5120 page_spec.range));
5121 } 5121 }
5122 5122
  5123 + std::map<unsigned long long, bool> remove_unreferenced;
5123 if (o.remove_unreferenced_page_resources != re_no) 5124 if (o.remove_unreferenced_page_resources != re_no)
5124 { 5125 {
5125 for (std::map<std::string, QPDF*>::iterator iter = 5126 for (std::map<std::string, QPDF*>::iterator iter =
@@ -5134,10 +5135,11 @@ static void handle_page_specs(QPDF&amp; pdf, Options&amp; o) @@ -5134,10 +5135,11 @@ static void handle_page_specs(QPDF&amp; pdf, Options&amp; o)
5134 cis->stayOpen(true); 5135 cis->stayOpen(true);
5135 } 5136 }
5136 QPDF& other(*((*iter).second)); 5137 QPDF& other(*((*iter).second));
5137 - if (should_remove_unreferenced_resources(other, o)) 5138 + auto other_uuid = other.getUniqueId();
  5139 + if (remove_unreferenced.count(other_uuid) == 0)
5138 { 5140 {
5139 - QPDFPageDocumentHelper dh(other);  
5140 - dh.removeUnreferencedResources(); 5141 + remove_unreferenced[other_uuid] =
  5142 + should_remove_unreferenced_resources(other, o);
5141 } 5143 }
5142 if (cis) 5144 if (cis)
5143 { 5145 {
@@ -5246,6 +5248,10 @@ static void handle_page_specs(QPDF&amp; pdf, Options&amp; o) @@ -5246,6 +5248,10 @@ static void handle_page_specs(QPDF&amp; pdf, Options&amp; o)
5246 else 5248 else
5247 { 5249 {
5248 copied_pages[from_uuid].insert(to_copy_og); 5250 copied_pages[from_uuid].insert(to_copy_og);
  5251 + if (remove_unreferenced[from_uuid])
  5252 + {
  5253 + to_copy.removeUnreferencedResources();
  5254 + }
5249 } 5255 }
5250 dh.addPage(to_copy, false); 5256 dh.addPage(to_copy, false);
5251 if (page_data.qpdf == &pdf) 5257 if (page_data.qpdf == &pdf)
qpdf/qtest/qpdf.test
@@ -2247,12 +2247,15 @@ $td-&gt;runtest(&quot;check output&quot;, @@ -2247,12 +2247,15 @@ $td-&gt;runtest(&quot;check output&quot;,
2247 {$td->FILE => "a.pdf"}, 2247 {$td->FILE => "a.pdf"},
2248 {$td->FILE => "shared-images-errors-2-out.pdf"}); 2248 {$td->FILE => "shared-images-errors-2-out.pdf"});
2249 2249
  2250 +# This test used to generate warnings about images on pages we didn't
  2251 +# care about, but qpdf was modified not to process those pages, so the
  2252 +# "irrelevant" errors went away.
2250 $td->runtest("shared resources irrelevant errors", 2253 $td->runtest("shared resources irrelevant errors",
2251 {$td->COMMAND => 2254 {$td->COMMAND =>
2252 "qpdf --qdf --static-id" . 2255 "qpdf --qdf --static-id" .
2253 " shared-images-errors.pdf --pages . 1 -- a.pdf"}, 2256 " shared-images-errors.pdf --pages . 1 -- a.pdf"},
2254 - {$td->FILE => "shared-images-errors-1.out",  
2255 - $td->EXIT_STATUS => 3}, 2257 + {$td->STRING => "",
  2258 + $td->EXIT_STATUS => 0},
2256 $td->NORMALIZE_NEWLINES); 2259 $td->NORMALIZE_NEWLINES);
2257 $td->runtest("check output", 2260 $td->runtest("check output",
2258 {$td->FILE => "a.pdf"}, 2261 {$td->FILE => "a.pdf"},
qpdf/qtest/qpdf/shared-images-errors-1.out deleted
1 -WARNING: shared-images-errors.pdf (offset 4933): error decoding stream data for object 19 0: stream inflate: inflate: data: incorrect header check  
2 -WARNING: shared-images-errors.pdf, object 4 0 at offset 676: Unable to parse content stream: content stream (content stream object 19 0): errors while decoding content stream; not attempting to remove unreferenced objects from this page  
3 -qpdf: operation succeeded with warnings; resulting file may have some problems