Commit 858c7b89bc67698a112b3b07885310d8e0132eb0
1 parent
1a62cce9
Let optimize filter stream parameters instead of making them direct
Also removes preclusion of stream references in stream parameters of filterable streams and reduces write times by about 8% by eliminating an extra traversal of the objects.
Showing
5 changed files
with
53 additions
and
114 deletions
ChangeLog
| 1 | +2020-12-25 Jay Berkenbilt <ejb@ql.org> | |
| 2 | + | |
| 3 | + * Refactor write code to eliminate an extra full traversal of | |
| 4 | + objects in the file and to remove assumptions that preclude stream | |
| 5 | + references from appearing in /DecodeParms of filterable streams. | |
| 6 | + This results in an approximately 8% performance reduction in write | |
| 7 | + times. | |
| 8 | + | |
| 1 | 9 | 2020-12-23 Jay Berkenbilt <ejb@ql.org> |
| 2 | 10 | |
| 3 | 11 | * Allow library users to provide their own decoders for stream | ... | ... |
libqpdf/QPDFWriter.cc
| ... | ... | @@ -2452,115 +2452,36 @@ QPDFWriter::getTrimmedTrailer() |
| 2452 | 2452 | void |
| 2453 | 2453 | QPDFWriter::prepareFileForWrite() |
| 2454 | 2454 | { |
| 2455 | - // Do a traversal of the entire PDF file structure replacing all | |
| 2456 | - // indirect objects that QPDFWriter wants to be direct. This | |
| 2457 | - // includes stream lengths, stream filtering parameters, and | |
| 2458 | - // document extension level information. | |
| 2455 | + // Make document extension level information direct as required by | |
| 2456 | + // the spec. | |
| 2459 | 2457 | |
| 2460 | 2458 | this->m->pdf.fixDanglingReferences(true); |
| 2461 | - std::list<QPDFObjectHandle> queue; | |
| 2462 | - queue.push_back(getTrimmedTrailer()); | |
| 2463 | - std::set<int> visited; | |
| 2464 | - | |
| 2465 | - while (! queue.empty()) | |
| 2459 | + QPDFObjectHandle root = this->m->pdf.getRoot(); | |
| 2460 | + for (auto const& key: root.getKeys()) | |
| 2466 | 2461 | { |
| 2467 | - QPDFObjectHandle node = queue.front(); | |
| 2468 | - queue.pop_front(); | |
| 2469 | - if (node.isIndirect()) | |
| 2470 | - { | |
| 2471 | - if (visited.count(node.getObjectID()) > 0) | |
| 2472 | - { | |
| 2473 | - continue; | |
| 2474 | - } | |
| 2475 | - indicateProgress(false, false); | |
| 2476 | - visited.insert(node.getObjectID()); | |
| 2477 | - } | |
| 2478 | - | |
| 2479 | - if (node.isArray()) | |
| 2480 | - { | |
| 2481 | - int nitems = node.getArrayNItems(); | |
| 2482 | - for (int i = 0; i < nitems; ++i) | |
| 2483 | - { | |
| 2484 | - QPDFObjectHandle oh = node.getArrayItem(i); | |
| 2485 | - if (! oh.isScalar()) | |
| 2486 | - { | |
| 2487 | - queue.push_back(oh); | |
| 2488 | - } | |
| 2489 | - } | |
| 2490 | - } | |
| 2491 | - else if (node.isDictionary() || node.isStream()) | |
| 2492 | - { | |
| 2493 | - bool is_stream = false; | |
| 2494 | - bool is_root = false; | |
| 2495 | - bool filterable = false; | |
| 2496 | - QPDFObjectHandle dict = node; | |
| 2497 | - if (node.isStream()) | |
| 2498 | - { | |
| 2499 | - is_stream = true; | |
| 2500 | - dict = node.getDict(); | |
| 2501 | - // See whether we are able to filter this stream. | |
| 2502 | - filterable = node.pipeStreamData( | |
| 2503 | - 0, 0, this->m->stream_decode_level, true); | |
| 2504 | - } | |
| 2505 | - else if (this->m->pdf.getRoot().getObjectID() == node.getObjectID()) | |
| 2462 | + QPDFObjectHandle oh = root.getKey(key); | |
| 2463 | + if ((key == "/Extensions") && (oh.isDictionary())) | |
| 2464 | + { | |
| 2465 | + bool extensions_indirect = false; | |
| 2466 | + if (oh.isIndirect()) | |
| 2506 | 2467 | { |
| 2507 | - is_root = true; | |
| 2468 | + QTC::TC("qpdf", "QPDFWriter make Extensions direct"); | |
| 2469 | + extensions_indirect = true; | |
| 2470 | + oh = oh.shallowCopy(); | |
| 2471 | + root.replaceKey(key, oh); | |
| 2508 | 2472 | } |
| 2509 | - | |
| 2510 | - std::set<std::string> keys = dict.getKeys(); | |
| 2511 | - for (std::set<std::string>::iterator iter = keys.begin(); | |
| 2512 | - iter != keys.end(); ++iter) | |
| 2513 | - { | |
| 2514 | - std::string const& key = *iter; | |
| 2515 | - QPDFObjectHandle oh = dict.getKey(key); | |
| 2516 | - bool add_to_queue = true; | |
| 2517 | - if (is_stream) | |
| 2518 | - { | |
| 2519 | - if (oh.isIndirect() && | |
| 2520 | - ((key == "/Length") || | |
| 2521 | - (filterable && | |
| 2522 | - ((key == "/Filter") || | |
| 2523 | - (key == "/DecodeParms"))))) | |
| 2524 | - { | |
| 2525 | - QTC::TC("qpdf", "QPDFWriter make stream key direct"); | |
| 2526 | - add_to_queue = false; | |
| 2527 | - oh.makeDirect(); | |
| 2528 | - dict.replaceKey(key, oh); | |
| 2529 | - } | |
| 2530 | - } | |
| 2531 | - else if (is_root) | |
| 2473 | + if (oh.hasKey("/ADBE")) | |
| 2474 | + { | |
| 2475 | + QPDFObjectHandle adbe = oh.getKey("/ADBE"); | |
| 2476 | + if (adbe.isIndirect()) | |
| 2532 | 2477 | { |
| 2533 | - if ((key == "/Extensions") && (oh.isDictionary())) | |
| 2534 | - { | |
| 2535 | - bool extensions_indirect = false; | |
| 2536 | - if (oh.isIndirect()) | |
| 2537 | - { | |
| 2538 | - QTC::TC("qpdf", "QPDFWriter make Extensions direct"); | |
| 2539 | - extensions_indirect = true; | |
| 2540 | - add_to_queue = false; | |
| 2541 | - oh = oh.shallowCopy(); | |
| 2542 | - dict.replaceKey(key, oh); | |
| 2543 | - } | |
| 2544 | - if (oh.hasKey("/ADBE")) | |
| 2545 | - { | |
| 2546 | - QPDFObjectHandle adbe = oh.getKey("/ADBE"); | |
| 2547 | - if (adbe.isIndirect()) | |
| 2548 | - { | |
| 2549 | - QTC::TC("qpdf", "QPDFWriter make ADBE direct", | |
| 2550 | - extensions_indirect ? 0 : 1); | |
| 2551 | - adbe.makeDirect(); | |
| 2552 | - oh.replaceKey("/ADBE", adbe); | |
| 2553 | - } | |
| 2554 | - } | |
| 2555 | - } | |
| 2478 | + QTC::TC("qpdf", "QPDFWriter make ADBE direct", | |
| 2479 | + extensions_indirect ? 0 : 1); | |
| 2480 | + adbe.makeDirect(); | |
| 2481 | + oh.replaceKey("/ADBE", adbe); | |
| 2556 | 2482 | } |
| 2557 | - | |
| 2558 | - if (add_to_queue) | |
| 2559 | - { | |
| 2560 | - queue.push_back(oh); | |
| 2561 | - } | |
| 2562 | - } | |
| 2563 | - } | |
| 2483 | + } | |
| 2484 | + } | |
| 2564 | 2485 | } |
| 2565 | 2486 | } |
| 2566 | 2487 | |
| ... | ... | @@ -2737,14 +2658,11 @@ QPDFWriter::write() |
| 2737 | 2658 | { |
| 2738 | 2659 | doWriteSetup(); |
| 2739 | 2660 | |
| 2740 | - // Set up progress reporting. We spent about equal amounts of time | |
| 2741 | - // preparing and writing one pass. To get a rough estimate of | |
| 2742 | - // progress, we track handling of indirect objects. For linearized | |
| 2743 | - // files, we write two passes. events_expected is an | |
| 2744 | - // approximation, but it's good enough for progress reporting, | |
| 2745 | - // which is mostly a guess anyway. | |
| 2661 | + // Set up progress reporting. For linearized files, we write two | |
| 2662 | + // passes. events_expected is an approximation, but it's good | |
| 2663 | + // enough for progress reporting, which is mostly a guess anyway. | |
| 2746 | 2664 | this->m->events_expected = QIntC::to_int( |
| 2747 | - this->m->pdf.getObjectCount() * (this->m->linearized ? 3 : 2)); | |
| 2665 | + this->m->pdf.getObjectCount() * (this->m->linearized ? 2 : 1)); | |
| 2748 | 2666 | |
| 2749 | 2667 | prepareFileForWrite(); |
| 2750 | 2668 | |
| ... | ... | @@ -3138,8 +3056,21 @@ QPDFWriter::writeLinearized() |
| 3138 | 3056 | discardGeneration(this->m->object_to_object_stream, |
| 3139 | 3057 | this->m->object_to_object_stream_no_gen); |
| 3140 | 3058 | |
| 3141 | - bool need_xref_stream = (! this->m->object_to_object_stream.empty()); | |
| 3142 | - this->m->pdf.optimize(this->m->object_to_object_stream_no_gen); | |
| 3059 | + auto skip_stream_parameters = [this](QPDFObjectHandle& stream) { | |
| 3060 | + bool compress_stream; | |
| 3061 | + bool is_metadata; | |
| 3062 | + if (willFilterStream(stream, compress_stream, is_metadata, nullptr)) | |
| 3063 | + { | |
| 3064 | + return 2; | |
| 3065 | + } | |
| 3066 | + else | |
| 3067 | + { | |
| 3068 | + return 1; | |
| 3069 | + } | |
| 3070 | + }; | |
| 3071 | + | |
| 3072 | + this->m->pdf.optimize(this->m->object_to_object_stream_no_gen, | |
| 3073 | + true, skip_stream_parameters); | |
| 3143 | 3074 | |
| 3144 | 3075 | std::vector<QPDFObjectHandle> part4; |
| 3145 | 3076 | std::vector<QPDFObjectHandle> part6; |
| ... | ... | @@ -3173,6 +3104,7 @@ QPDFWriter::writeLinearized() |
| 3173 | 3104 | int after_second_half = 1 + second_half_uncompressed; |
| 3174 | 3105 | this->m->next_objid = after_second_half; |
| 3175 | 3106 | int second_half_xref = 0; |
| 3107 | + bool need_xref_stream = (! this->m->object_to_object_stream.empty()); | |
| 3176 | 3108 | if (need_xref_stream) |
| 3177 | 3109 | { |
| 3178 | 3110 | second_half_xref = this->m->next_objid++; | ... | ... |
qpdf/qpdf.testcov
| ... | ... | @@ -234,7 +234,6 @@ QPDFWriter extra header text no newline 0 |
| 234 | 234 | QPDFWriter extra header text add newline 0 |
| 235 | 235 | QPDF bogus 0 offset 0 |
| 236 | 236 | QPDF global offset 0 |
| 237 | -QPDFWriter make stream key direct 0 | |
| 238 | 237 | QPDFWriter copy V5 0 |
| 239 | 238 | QPDFWriter increasing extension level 0 |
| 240 | 239 | QPDFWriter make Extensions direct 0 | ... | ... |
qpdf/qtest/qpdf.test
| ... | ... | @@ -716,7 +716,7 @@ my @bug_tests = ( |
| 716 | 716 | ["99b", "object 0", 2], |
| 717 | 717 | ["100", "xref reconstruction loop", 2], |
| 718 | 718 | ["101", "resolve for exception text", 2], |
| 719 | - ["117", "other infinite loop", 2], | |
| 719 | + ["117", "other infinite loop", 3], | |
| 720 | 720 | ["118", "other infinite loop", 2], |
| 721 | 721 | ["119", "other infinite loop", 3], |
| 722 | 722 | ["120", "other infinite loop", 3], | ... | ... |
qpdf/qtest/qpdf/issue-117.out
| ... | ... | @@ -13,4 +13,4 @@ WARNING: issue-117.pdf (object 7 0, offset 1791): unknown token while reading ob |
| 13 | 13 | WARNING: issue-117.pdf (object 7 0, offset 1267): /Length key in stream dictionary is not an integer |
| 14 | 14 | WARNING: issue-117.pdf (object 7 0, offset 1418): attempting to recover stream length |
| 15 | 15 | WARNING: issue-117.pdf (object 7 0, offset 1418): recovered stream length: 347 |
| 16 | -attempt to make a stream into a direct object | |
| 16 | +qpdf: operation succeeded with warnings; resulting file may have some problems | ... | ... |