Commit 858c7b89bc67698a112b3b07885310d8e0132eb0

Authored by Jay Berkenbilt
1 parent 1a62cce9

Let optimize filter stream parameters instead of making them direct

Also removes preclusion of stream references in stream parameters of
filterable streams and reduces write times by about 8% by eliminating
an extra traversal of the objects.
ChangeLog
  1 +2020-12-25 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Refactor write code to eliminate an extra full traversal of
  4 + objects in the file and to remove assumptions that preclude stream
  5 + references from appearing in /DecodeParms of filterable streams.
  6 + This results in an approximately 8% performance reduction in write
  7 + times.
  8 +
1 2020-12-23 Jay Berkenbilt <ejb@ql.org> 9 2020-12-23 Jay Berkenbilt <ejb@ql.org>
2 10
3 * Allow library users to provide their own decoders for stream 11 * Allow library users to provide their own decoders for stream
libqpdf/QPDFWriter.cc
@@ -2452,115 +2452,36 @@ QPDFWriter::getTrimmedTrailer() @@ -2452,115 +2452,36 @@ QPDFWriter::getTrimmedTrailer()
2452 void 2452 void
2453 QPDFWriter::prepareFileForWrite() 2453 QPDFWriter::prepareFileForWrite()
2454 { 2454 {
2455 - // Do a traversal of the entire PDF file structure replacing all  
2456 - // indirect objects that QPDFWriter wants to be direct. This  
2457 - // includes stream lengths, stream filtering parameters, and  
2458 - // document extension level information. 2455 + // Make document extension level information direct as required by
  2456 + // the spec.
2459 2457
2460 this->m->pdf.fixDanglingReferences(true); 2458 this->m->pdf.fixDanglingReferences(true);
2461 - std::list<QPDFObjectHandle> queue;  
2462 - queue.push_back(getTrimmedTrailer());  
2463 - std::set<int> visited;  
2464 -  
2465 - while (! queue.empty()) 2459 + QPDFObjectHandle root = this->m->pdf.getRoot();
  2460 + for (auto const& key: root.getKeys())
2466 { 2461 {
2467 - QPDFObjectHandle node = queue.front();  
2468 - queue.pop_front();  
2469 - if (node.isIndirect())  
2470 - {  
2471 - if (visited.count(node.getObjectID()) > 0)  
2472 - {  
2473 - continue;  
2474 - }  
2475 - indicateProgress(false, false);  
2476 - visited.insert(node.getObjectID());  
2477 - }  
2478 -  
2479 - if (node.isArray())  
2480 - {  
2481 - int nitems = node.getArrayNItems();  
2482 - for (int i = 0; i < nitems; ++i)  
2483 - {  
2484 - QPDFObjectHandle oh = node.getArrayItem(i);  
2485 - if (! oh.isScalar())  
2486 - {  
2487 - queue.push_back(oh);  
2488 - }  
2489 - }  
2490 - }  
2491 - else if (node.isDictionary() || node.isStream())  
2492 - {  
2493 - bool is_stream = false;  
2494 - bool is_root = false;  
2495 - bool filterable = false;  
2496 - QPDFObjectHandle dict = node;  
2497 - if (node.isStream())  
2498 - {  
2499 - is_stream = true;  
2500 - dict = node.getDict();  
2501 - // See whether we are able to filter this stream.  
2502 - filterable = node.pipeStreamData(  
2503 - 0, 0, this->m->stream_decode_level, true);  
2504 - }  
2505 - else if (this->m->pdf.getRoot().getObjectID() == node.getObjectID()) 2462 + QPDFObjectHandle oh = root.getKey(key);
  2463 + if ((key == "/Extensions") && (oh.isDictionary()))
  2464 + {
  2465 + bool extensions_indirect = false;
  2466 + if (oh.isIndirect())
2506 { 2467 {
2507 - is_root = true; 2468 + QTC::TC("qpdf", "QPDFWriter make Extensions direct");
  2469 + extensions_indirect = true;
  2470 + oh = oh.shallowCopy();
  2471 + root.replaceKey(key, oh);
2508 } 2472 }
2509 -  
2510 - std::set<std::string> keys = dict.getKeys();  
2511 - for (std::set<std::string>::iterator iter = keys.begin();  
2512 - iter != keys.end(); ++iter)  
2513 - {  
2514 - std::string const& key = *iter;  
2515 - QPDFObjectHandle oh = dict.getKey(key);  
2516 - bool add_to_queue = true;  
2517 - if (is_stream)  
2518 - {  
2519 - if (oh.isIndirect() &&  
2520 - ((key == "/Length") ||  
2521 - (filterable &&  
2522 - ((key == "/Filter") ||  
2523 - (key == "/DecodeParms")))))  
2524 - {  
2525 - QTC::TC("qpdf", "QPDFWriter make stream key direct");  
2526 - add_to_queue = false;  
2527 - oh.makeDirect();  
2528 - dict.replaceKey(key, oh);  
2529 - }  
2530 - }  
2531 - else if (is_root) 2473 + if (oh.hasKey("/ADBE"))
  2474 + {
  2475 + QPDFObjectHandle adbe = oh.getKey("/ADBE");
  2476 + if (adbe.isIndirect())
2532 { 2477 {
2533 - if ((key == "/Extensions") && (oh.isDictionary()))  
2534 - {  
2535 - bool extensions_indirect = false;  
2536 - if (oh.isIndirect())  
2537 - {  
2538 - QTC::TC("qpdf", "QPDFWriter make Extensions direct");  
2539 - extensions_indirect = true;  
2540 - add_to_queue = false;  
2541 - oh = oh.shallowCopy();  
2542 - dict.replaceKey(key, oh);  
2543 - }  
2544 - if (oh.hasKey("/ADBE"))  
2545 - {  
2546 - QPDFObjectHandle adbe = oh.getKey("/ADBE");  
2547 - if (adbe.isIndirect())  
2548 - {  
2549 - QTC::TC("qpdf", "QPDFWriter make ADBE direct",  
2550 - extensions_indirect ? 0 : 1);  
2551 - adbe.makeDirect();  
2552 - oh.replaceKey("/ADBE", adbe);  
2553 - }  
2554 - }  
2555 - } 2478 + QTC::TC("qpdf", "QPDFWriter make ADBE direct",
  2479 + extensions_indirect ? 0 : 1);
  2480 + adbe.makeDirect();
  2481 + oh.replaceKey("/ADBE", adbe);
2556 } 2482 }
2557 -  
2558 - if (add_to_queue)  
2559 - {  
2560 - queue.push_back(oh);  
2561 - }  
2562 - }  
2563 - } 2483 + }
  2484 + }
2564 } 2485 }
2565 } 2486 }
2566 2487
@@ -2737,14 +2658,11 @@ QPDFWriter::write() @@ -2737,14 +2658,11 @@ QPDFWriter::write()
2737 { 2658 {
2738 doWriteSetup(); 2659 doWriteSetup();
2739 2660
2740 - // Set up progress reporting. We spent about equal amounts of time  
2741 - // preparing and writing one pass. To get a rough estimate of  
2742 - // progress, we track handling of indirect objects. For linearized  
2743 - // files, we write two passes. events_expected is an  
2744 - // approximation, but it's good enough for progress reporting,  
2745 - // which is mostly a guess anyway. 2661 + // Set up progress reporting. For linearized files, we write two
  2662 + // passes. events_expected is an approximation, but it's good
  2663 + // enough for progress reporting, which is mostly a guess anyway.
2746 this->m->events_expected = QIntC::to_int( 2664 this->m->events_expected = QIntC::to_int(
2747 - this->m->pdf.getObjectCount() * (this->m->linearized ? 3 : 2)); 2665 + this->m->pdf.getObjectCount() * (this->m->linearized ? 2 : 1));
2748 2666
2749 prepareFileForWrite(); 2667 prepareFileForWrite();
2750 2668
@@ -3138,8 +3056,21 @@ QPDFWriter::writeLinearized() @@ -3138,8 +3056,21 @@ QPDFWriter::writeLinearized()
3138 discardGeneration(this->m->object_to_object_stream, 3056 discardGeneration(this->m->object_to_object_stream,
3139 this->m->object_to_object_stream_no_gen); 3057 this->m->object_to_object_stream_no_gen);
3140 3058
3141 - bool need_xref_stream = (! this->m->object_to_object_stream.empty());  
3142 - this->m->pdf.optimize(this->m->object_to_object_stream_no_gen); 3059 + auto skip_stream_parameters = [this](QPDFObjectHandle& stream) {
  3060 + bool compress_stream;
  3061 + bool is_metadata;
  3062 + if (willFilterStream(stream, compress_stream, is_metadata, nullptr))
  3063 + {
  3064 + return 2;
  3065 + }
  3066 + else
  3067 + {
  3068 + return 1;
  3069 + }
  3070 + };
  3071 +
  3072 + this->m->pdf.optimize(this->m->object_to_object_stream_no_gen,
  3073 + true, skip_stream_parameters);
3143 3074
3144 std::vector<QPDFObjectHandle> part4; 3075 std::vector<QPDFObjectHandle> part4;
3145 std::vector<QPDFObjectHandle> part6; 3076 std::vector<QPDFObjectHandle> part6;
@@ -3173,6 +3104,7 @@ QPDFWriter::writeLinearized() @@ -3173,6 +3104,7 @@ QPDFWriter::writeLinearized()
3173 int after_second_half = 1 + second_half_uncompressed; 3104 int after_second_half = 1 + second_half_uncompressed;
3174 this->m->next_objid = after_second_half; 3105 this->m->next_objid = after_second_half;
3175 int second_half_xref = 0; 3106 int second_half_xref = 0;
  3107 + bool need_xref_stream = (! this->m->object_to_object_stream.empty());
3176 if (need_xref_stream) 3108 if (need_xref_stream)
3177 { 3109 {
3178 second_half_xref = this->m->next_objid++; 3110 second_half_xref = this->m->next_objid++;
qpdf/qpdf.testcov
@@ -234,7 +234,6 @@ QPDFWriter extra header text no newline 0 @@ -234,7 +234,6 @@ QPDFWriter extra header text no newline 0
234 QPDFWriter extra header text add newline 0 234 QPDFWriter extra header text add newline 0
235 QPDF bogus 0 offset 0 235 QPDF bogus 0 offset 0
236 QPDF global offset 0 236 QPDF global offset 0
237 -QPDFWriter make stream key direct 0  
238 QPDFWriter copy V5 0 237 QPDFWriter copy V5 0
239 QPDFWriter increasing extension level 0 238 QPDFWriter increasing extension level 0
240 QPDFWriter make Extensions direct 0 239 QPDFWriter make Extensions direct 0
qpdf/qtest/qpdf.test
@@ -716,7 +716,7 @@ my @bug_tests = ( @@ -716,7 +716,7 @@ my @bug_tests = (
716 ["99b", "object 0", 2], 716 ["99b", "object 0", 2],
717 ["100", "xref reconstruction loop", 2], 717 ["100", "xref reconstruction loop", 2],
718 ["101", "resolve for exception text", 2], 718 ["101", "resolve for exception text", 2],
719 - ["117", "other infinite loop", 2], 719 + ["117", "other infinite loop", 3],
720 ["118", "other infinite loop", 2], 720 ["118", "other infinite loop", 2],
721 ["119", "other infinite loop", 3], 721 ["119", "other infinite loop", 3],
722 ["120", "other infinite loop", 3], 722 ["120", "other infinite loop", 3],
qpdf/qtest/qpdf/issue-117.out
@@ -13,4 +13,4 @@ WARNING: issue-117.pdf (object 7 0, offset 1791): unknown token while reading ob @@ -13,4 +13,4 @@ WARNING: issue-117.pdf (object 7 0, offset 1791): unknown token while reading ob
13 WARNING: issue-117.pdf (object 7 0, offset 1267): /Length key in stream dictionary is not an integer 13 WARNING: issue-117.pdf (object 7 0, offset 1267): /Length key in stream dictionary is not an integer
14 WARNING: issue-117.pdf (object 7 0, offset 1418): attempting to recover stream length 14 WARNING: issue-117.pdf (object 7 0, offset 1418): attempting to recover stream length
15 WARNING: issue-117.pdf (object 7 0, offset 1418): recovered stream length: 347 15 WARNING: issue-117.pdf (object 7 0, offset 1418): recovered stream length: 347
16 -attempt to make a stream into a direct object 16 +qpdf: operation succeeded with warnings; resulting file may have some problems