Commit 891751f618fb95b82af289edfd2e1219e3522e6f
1 parent
dc92574c
Remove unreferenced resources only from relevant pages
Showing
6 changed files
with
28 additions
and
15 deletions
ChangeLog
| 1 | 1 | 2021-01-04 Jay Berkenbilt <ejb@ql.org> |
| 2 | 2 | |
| 3 | + * When qpdf CLI extracts pages, it now only attempts to remove | |
| 4 | + unreferenced resourecs from the pages that it is keeping. This | |
| 5 | + change dramatically reduces the time it takes to extract a small | |
| 6 | + number of pages from a large, complex file. | |
| 7 | + | |
| 3 | 8 | * Move getNext()->write() calls in some pipelines to ensure that |
| 4 | 9 | state gates properly reset even if the next pipeline's write |
| 5 | 10 | throws an exception (fuzz issue 28262). | ... | ... |
TODO
| 1 | -Candidates for upcoming release | |
| 2 | -=============================== | |
| 3 | - | |
| 4 | -* Remember to check work `qpdf` project for private issues | |
| 5 | - * file with very slow page extraction | |
| 6 | - * big page even with --remove-unreferenced-resources=yes, even with --empty | |
| 7 | - | |
| 8 | 1 | Fuzz Errors |
| 9 | 2 | =========== |
| 10 | 3 | ... | ... |
manual/qpdf-manual.xml
| ... | ... | @@ -5001,6 +5001,15 @@ print "\n"; |
| 5001 | 5001 | <literal>/DecodeParms</literal>. |
| 5002 | 5002 | </para> |
| 5003 | 5003 | </listitem> |
| 5004 | + <listitem> | |
| 5005 | + <para> | |
| 5006 | + When extracting pages, the <command>qpdf</command> CLI only | |
| 5007 | + removes unreferenced resources from the pages that are being | |
| 5008 | + kept, resulting in a significant performance improvement | |
| 5009 | + when extracting small numbers of pages from large, complex | |
| 5010 | + documents. | |
| 5011 | + </para> | |
| 5012 | + </listitem> | |
| 5004 | 5013 | </itemizedlist> |
| 5005 | 5014 | </listitem> |
| 5006 | 5015 | <listitem> | ... | ... |
qpdf/qpdf.cc
| ... | ... | @@ -5120,6 +5120,7 @@ static void handle_page_specs(QPDF& pdf, Options& o) |
| 5120 | 5120 | page_spec.range)); |
| 5121 | 5121 | } |
| 5122 | 5122 | |
| 5123 | + std::map<unsigned long long, bool> remove_unreferenced; | |
| 5123 | 5124 | if (o.remove_unreferenced_page_resources != re_no) |
| 5124 | 5125 | { |
| 5125 | 5126 | for (std::map<std::string, QPDF*>::iterator iter = |
| ... | ... | @@ -5134,10 +5135,11 @@ static void handle_page_specs(QPDF& pdf, Options& o) |
| 5134 | 5135 | cis->stayOpen(true); |
| 5135 | 5136 | } |
| 5136 | 5137 | QPDF& other(*((*iter).second)); |
| 5137 | - if (should_remove_unreferenced_resources(other, o)) | |
| 5138 | + auto other_uuid = other.getUniqueId(); | |
| 5139 | + if (remove_unreferenced.count(other_uuid) == 0) | |
| 5138 | 5140 | { |
| 5139 | - QPDFPageDocumentHelper dh(other); | |
| 5140 | - dh.removeUnreferencedResources(); | |
| 5141 | + remove_unreferenced[other_uuid] = | |
| 5142 | + should_remove_unreferenced_resources(other, o); | |
| 5141 | 5143 | } |
| 5142 | 5144 | if (cis) |
| 5143 | 5145 | { |
| ... | ... | @@ -5246,6 +5248,10 @@ static void handle_page_specs(QPDF& pdf, Options& o) |
| 5246 | 5248 | else |
| 5247 | 5249 | { |
| 5248 | 5250 | copied_pages[from_uuid].insert(to_copy_og); |
| 5251 | + if (remove_unreferenced[from_uuid]) | |
| 5252 | + { | |
| 5253 | + to_copy.removeUnreferencedResources(); | |
| 5254 | + } | |
| 5249 | 5255 | } |
| 5250 | 5256 | dh.addPage(to_copy, false); |
| 5251 | 5257 | if (page_data.qpdf == &pdf) | ... | ... |
qpdf/qtest/qpdf.test
| ... | ... | @@ -2247,12 +2247,15 @@ $td->runtest("check output", |
| 2247 | 2247 | {$td->FILE => "a.pdf"}, |
| 2248 | 2248 | {$td->FILE => "shared-images-errors-2-out.pdf"}); |
| 2249 | 2249 | |
| 2250 | +# This test used to generate warnings about images on pages we didn't | |
| 2251 | +# care about, but qpdf was modified not to process those pages, so the | |
| 2252 | +# "irrelevant" errors went away. | |
| 2250 | 2253 | $td->runtest("shared resources irrelevant errors", |
| 2251 | 2254 | {$td->COMMAND => |
| 2252 | 2255 | "qpdf --qdf --static-id" . |
| 2253 | 2256 | " shared-images-errors.pdf --pages . 1 -- a.pdf"}, |
| 2254 | - {$td->FILE => "shared-images-errors-1.out", | |
| 2255 | - $td->EXIT_STATUS => 3}, | |
| 2257 | + {$td->STRING => "", | |
| 2258 | + $td->EXIT_STATUS => 0}, | |
| 2256 | 2259 | $td->NORMALIZE_NEWLINES); |
| 2257 | 2260 | $td->runtest("check output", |
| 2258 | 2261 | {$td->FILE => "a.pdf"}, | ... | ... |
qpdf/qtest/qpdf/shared-images-errors-1.out deleted
| 1 | -WARNING: shared-images-errors.pdf (offset 4933): error decoding stream data for object 19 0: stream inflate: inflate: data: incorrect header check | |
| 2 | -WARNING: shared-images-errors.pdf, object 4 0 at offset 676: Unable to parse content stream: content stream (content stream object 19 0): errors while decoding content stream; not attempting to remove unreferenced objects from this page | |
| 3 | -qpdf: operation succeeded with warnings; resulting file may have some problems |