Commit 891751f618fb95b82af289edfd2e1219e3522e6f

Authored by Jay Berkenbilt
1 parent dc92574c

Remove unreferenced resources only from relevant pages

ChangeLog
1 1 2021-01-04 Jay Berkenbilt <ejb@ql.org>
2 2  
  3 + * When qpdf CLI extracts pages, it now only attempts to remove
  4 + unreferenced resources from the pages that it is keeping. This
  5 + change dramatically reduces the time it takes to extract a small
  6 + number of pages from a large, complex file.
  7 +
3 8 * Move getNext()->write() calls in some pipelines to ensure that
4 9 state gates properly reset even if the next pipeline's write
5 10 throws an exception (fuzz issue 28262).
... ...
1   -Candidates for upcoming release
2   -===============================
3   -
4   -* Remember to check work `qpdf` project for private issues
5   - * file with very slow page extraction
6   - * big page even with --remove-unreferenced-resources=yes, even with --empty
7   -
8 1 Fuzz Errors
9 2 ===========
10 3  
... ...
manual/qpdf-manual.xml
... ... @@ -5001,6 +5001,15 @@ print &quot;\n&quot;;
5001 5001 <literal>/DecodeParms</literal>.
5002 5002 </para>
5003 5003 </listitem>
  5004 + <listitem>
  5005 + <para>
  5006 + When extracting pages, the <command>qpdf</command> CLI only
  5007 + removes unreferenced resources from the pages that are being
  5008 + kept, resulting in a significant performance improvement
  5009 + when extracting small numbers of pages from large, complex
  5010 + documents.
  5011 + </para>
  5012 + </listitem>
5004 5013 </itemizedlist>
5005 5014 </listitem>
5006 5015 <listitem>
... ...
qpdf/qpdf.cc
... ... @@ -5120,6 +5120,7 @@ static void handle_page_specs(QPDF&amp; pdf, Options&amp; o)
5120 5120 page_spec.range));
5121 5121 }
5122 5122  
  5123 + std::map<unsigned long long, bool> remove_unreferenced;
5123 5124 if (o.remove_unreferenced_page_resources != re_no)
5124 5125 {
5125 5126 for (std::map<std::string, QPDF*>::iterator iter =
... ... @@ -5134,10 +5135,11 @@ static void handle_page_specs(QPDF&amp; pdf, Options&amp; o)
5134 5135 cis->stayOpen(true);
5135 5136 }
5136 5137 QPDF& other(*((*iter).second));
5137   - if (should_remove_unreferenced_resources(other, o))
  5138 + auto other_uuid = other.getUniqueId();
  5139 + if (remove_unreferenced.count(other_uuid) == 0)
5138 5140 {
5139   - QPDFPageDocumentHelper dh(other);
5140   - dh.removeUnreferencedResources();
  5141 + remove_unreferenced[other_uuid] =
  5142 + should_remove_unreferenced_resources(other, o);
5141 5143 }
5142 5144 if (cis)
5143 5145 {
... ... @@ -5246,6 +5248,10 @@ static void handle_page_specs(QPDF&amp; pdf, Options&amp; o)
5246 5248 else
5247 5249 {
5248 5250 copied_pages[from_uuid].insert(to_copy_og);
  5251 + if (remove_unreferenced[from_uuid])
  5252 + {
  5253 + to_copy.removeUnreferencedResources();
  5254 + }
5249 5255 }
5250 5256 dh.addPage(to_copy, false);
5251 5257 if (page_data.qpdf == &pdf)
... ...
qpdf/qtest/qpdf.test
... ... @@ -2247,12 +2247,15 @@ $td-&gt;runtest(&quot;check output&quot;,
2247 2247 {$td->FILE => "a.pdf"},
2248 2248 {$td->FILE => "shared-images-errors-2-out.pdf"});
2249 2249  
  2250 +# This test used to generate warnings about images on pages we didn't
  2251 +# care about, but qpdf was modified not to process those pages, so the
  2252 +# "irrelevant" errors went away.
2250 2253 $td->runtest("shared resources irrelevant errors",
2251 2254 {$td->COMMAND =>
2252 2255 "qpdf --qdf --static-id" .
2253 2256 " shared-images-errors.pdf --pages . 1 -- a.pdf"},
2254   - {$td->FILE => "shared-images-errors-1.out",
2255   - $td->EXIT_STATUS => 3},
  2257 + {$td->STRING => "",
  2258 + $td->EXIT_STATUS => 0},
2256 2259 $td->NORMALIZE_NEWLINES);
2257 2260 $td->runtest("check output",
2258 2261 {$td->FILE => "a.pdf"},
... ...
qpdf/qtest/qpdf/shared-images-errors-1.out deleted
1   -WARNING: shared-images-errors.pdf (offset 4933): error decoding stream data for object 19 0: stream inflate: inflate: data: incorrect header check
2   -WARNING: shared-images-errors.pdf, object 4 0 at offset 676: Unable to parse content stream: content stream (content stream object 19 0): errors while decoding content stream; not attempting to remove unreferenced objects from this page
3   -qpdf: operation succeeded with warnings; resulting file may have some problems