Commit 891751f618fb95b82af289edfd2e1219e3522e6f
1 parent
dc92574c
Remove unreferenced resources only from relevant pages
Showing
6 changed files
with
28 additions
and
15 deletions
ChangeLog
| 1 | 2021-01-04 Jay Berkenbilt <ejb@ql.org> | 1 | 2021-01-04 Jay Berkenbilt <ejb@ql.org> |
| 2 | 2 | ||
| 3 | + * When qpdf CLI extracts pages, it now only attempts to remove | ||
| 4 | + unreferenced resourecs from the pages that it is keeping. This | ||
| 5 | + change dramatically reduces the time it takes to extract a small | ||
| 6 | + number of pages from a large, complex file. | ||
| 7 | + | ||
| 3 | * Move getNext()->write() calls in some pipelines to ensure that | 8 | * Move getNext()->write() calls in some pipelines to ensure that |
| 4 | state gates properly reset even if the next pipeline's write | 9 | state gates properly reset even if the next pipeline's write |
| 5 | throws an exception (fuzz issue 28262). | 10 | throws an exception (fuzz issue 28262). |
TODO
| 1 | -Candidates for upcoming release | ||
| 2 | -=============================== | ||
| 3 | - | ||
| 4 | -* Remember to check work `qpdf` project for private issues | ||
| 5 | - * file with very slow page extraction | ||
| 6 | - * big page even with --remove-unreferenced-resources=yes, even with --empty | ||
| 7 | - | ||
| 8 | Fuzz Errors | 1 | Fuzz Errors |
| 9 | =========== | 2 | =========== |
| 10 | 3 |
manual/qpdf-manual.xml
| @@ -5001,6 +5001,15 @@ print "\n"; | @@ -5001,6 +5001,15 @@ print "\n"; | ||
| 5001 | <literal>/DecodeParms</literal>. | 5001 | <literal>/DecodeParms</literal>. |
| 5002 | </para> | 5002 | </para> |
| 5003 | </listitem> | 5003 | </listitem> |
| 5004 | + <listitem> | ||
| 5005 | + <para> | ||
| 5006 | + When extracting pages, the <command>qpdf</command> CLI only | ||
| 5007 | + removes unreferenced resources from the pages that are being | ||
| 5008 | + kept, resulting in a significant performance improvement | ||
| 5009 | + when extracting small numbers of pages from large, complex | ||
| 5010 | + documents. | ||
| 5011 | + </para> | ||
| 5012 | + </listitem> | ||
| 5004 | </itemizedlist> | 5013 | </itemizedlist> |
| 5005 | </listitem> | 5014 | </listitem> |
| 5006 | <listitem> | 5015 | <listitem> |
qpdf/qpdf.cc
| @@ -5120,6 +5120,7 @@ static void handle_page_specs(QPDF& pdf, Options& o) | @@ -5120,6 +5120,7 @@ static void handle_page_specs(QPDF& pdf, Options& o) | ||
| 5120 | page_spec.range)); | 5120 | page_spec.range)); |
| 5121 | } | 5121 | } |
| 5122 | 5122 | ||
| 5123 | + std::map<unsigned long long, bool> remove_unreferenced; | ||
| 5123 | if (o.remove_unreferenced_page_resources != re_no) | 5124 | if (o.remove_unreferenced_page_resources != re_no) |
| 5124 | { | 5125 | { |
| 5125 | for (std::map<std::string, QPDF*>::iterator iter = | 5126 | for (std::map<std::string, QPDF*>::iterator iter = |
| @@ -5134,10 +5135,11 @@ static void handle_page_specs(QPDF& pdf, Options& o) | @@ -5134,10 +5135,11 @@ static void handle_page_specs(QPDF& pdf, Options& o) | ||
| 5134 | cis->stayOpen(true); | 5135 | cis->stayOpen(true); |
| 5135 | } | 5136 | } |
| 5136 | QPDF& other(*((*iter).second)); | 5137 | QPDF& other(*((*iter).second)); |
| 5137 | - if (should_remove_unreferenced_resources(other, o)) | 5138 | + auto other_uuid = other.getUniqueId(); |
| 5139 | + if (remove_unreferenced.count(other_uuid) == 0) | ||
| 5138 | { | 5140 | { |
| 5139 | - QPDFPageDocumentHelper dh(other); | ||
| 5140 | - dh.removeUnreferencedResources(); | 5141 | + remove_unreferenced[other_uuid] = |
| 5142 | + should_remove_unreferenced_resources(other, o); | ||
| 5141 | } | 5143 | } |
| 5142 | if (cis) | 5144 | if (cis) |
| 5143 | { | 5145 | { |
| @@ -5246,6 +5248,10 @@ static void handle_page_specs(QPDF& pdf, Options& o) | @@ -5246,6 +5248,10 @@ static void handle_page_specs(QPDF& pdf, Options& o) | ||
| 5246 | else | 5248 | else |
| 5247 | { | 5249 | { |
| 5248 | copied_pages[from_uuid].insert(to_copy_og); | 5250 | copied_pages[from_uuid].insert(to_copy_og); |
| 5251 | + if (remove_unreferenced[from_uuid]) | ||
| 5252 | + { | ||
| 5253 | + to_copy.removeUnreferencedResources(); | ||
| 5254 | + } | ||
| 5249 | } | 5255 | } |
| 5250 | dh.addPage(to_copy, false); | 5256 | dh.addPage(to_copy, false); |
| 5251 | if (page_data.qpdf == &pdf) | 5257 | if (page_data.qpdf == &pdf) |
qpdf/qtest/qpdf.test
| @@ -2247,12 +2247,15 @@ $td->runtest("check output", | @@ -2247,12 +2247,15 @@ $td->runtest("check output", | ||
| 2247 | {$td->FILE => "a.pdf"}, | 2247 | {$td->FILE => "a.pdf"}, |
| 2248 | {$td->FILE => "shared-images-errors-2-out.pdf"}); | 2248 | {$td->FILE => "shared-images-errors-2-out.pdf"}); |
| 2249 | 2249 | ||
| 2250 | +# This test used to generate warnings about images on pages we didn't | ||
| 2251 | +# care about, but qpdf was modified not to process those pages, so the | ||
| 2252 | +# "irrelevant" errors went away. | ||
| 2250 | $td->runtest("shared resources irrelevant errors", | 2253 | $td->runtest("shared resources irrelevant errors", |
| 2251 | {$td->COMMAND => | 2254 | {$td->COMMAND => |
| 2252 | "qpdf --qdf --static-id" . | 2255 | "qpdf --qdf --static-id" . |
| 2253 | " shared-images-errors.pdf --pages . 1 -- a.pdf"}, | 2256 | " shared-images-errors.pdf --pages . 1 -- a.pdf"}, |
| 2254 | - {$td->FILE => "shared-images-errors-1.out", | ||
| 2255 | - $td->EXIT_STATUS => 3}, | 2257 | + {$td->STRING => "", |
| 2258 | + $td->EXIT_STATUS => 0}, | ||
| 2256 | $td->NORMALIZE_NEWLINES); | 2259 | $td->NORMALIZE_NEWLINES); |
| 2257 | $td->runtest("check output", | 2260 | $td->runtest("check output", |
| 2258 | {$td->FILE => "a.pdf"}, | 2261 | {$td->FILE => "a.pdf"}, |
qpdf/qtest/qpdf/shared-images-errors-1.out deleted
| 1 | -WARNING: shared-images-errors.pdf (offset 4933): error decoding stream data for object 19 0: stream inflate: inflate: data: incorrect header check | ||
| 2 | -WARNING: shared-images-errors.pdf, object 4 0 at offset 676: Unable to parse content stream: content stream (content stream object 19 0): errors while decoding content stream; not attempting to remove unreferenced objects from this page | ||
| 3 | -qpdf: operation succeeded with warnings; resulting file may have some problems |