Commit 858c7b89bc67698a112b3b07885310d8e0132eb0

Authored by Jay Berkenbilt
1 parent 1a62cce9

Let optimize filter stream parameters instead of making them direct

Also removes preclusion of stream references in stream parameters of
filterable streams and reduces write times by about 8% by eliminating
an extra traversal of the objects.
ChangeLog
  1 +2020-12-25 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Refactor write code to eliminate an extra full traversal of
  4 + objects in the file and to remove assumptions that preclude stream
  5 + references from appearing in /DecodeParms of filterable streams.
  6 + This results in an approximately 8% performance reduction in write
  7 + times.
  8 +
1 9 2020-12-23 Jay Berkenbilt <ejb@ql.org>
2 10  
3 11 * Allow library users to provide their own decoders for stream
... ...
libqpdf/QPDFWriter.cc
... ... @@ -2452,115 +2452,36 @@ QPDFWriter::getTrimmedTrailer()
2452 2452 void
2453 2453 QPDFWriter::prepareFileForWrite()
2454 2454 {
2455   - // Do a traversal of the entire PDF file structure replacing all
2456   - // indirect objects that QPDFWriter wants to be direct. This
2457   - // includes stream lengths, stream filtering parameters, and
2458   - // document extension level information.
  2455 + // Make document extension level information direct as required by
  2456 + // the spec.
2459 2457  
2460 2458 this->m->pdf.fixDanglingReferences(true);
2461   - std::list<QPDFObjectHandle> queue;
2462   - queue.push_back(getTrimmedTrailer());
2463   - std::set<int> visited;
2464   -
2465   - while (! queue.empty())
  2459 + QPDFObjectHandle root = this->m->pdf.getRoot();
  2460 + for (auto const& key: root.getKeys())
2466 2461 {
2467   - QPDFObjectHandle node = queue.front();
2468   - queue.pop_front();
2469   - if (node.isIndirect())
2470   - {
2471   - if (visited.count(node.getObjectID()) > 0)
2472   - {
2473   - continue;
2474   - }
2475   - indicateProgress(false, false);
2476   - visited.insert(node.getObjectID());
2477   - }
2478   -
2479   - if (node.isArray())
2480   - {
2481   - int nitems = node.getArrayNItems();
2482   - for (int i = 0; i < nitems; ++i)
2483   - {
2484   - QPDFObjectHandle oh = node.getArrayItem(i);
2485   - if (! oh.isScalar())
2486   - {
2487   - queue.push_back(oh);
2488   - }
2489   - }
2490   - }
2491   - else if (node.isDictionary() || node.isStream())
2492   - {
2493   - bool is_stream = false;
2494   - bool is_root = false;
2495   - bool filterable = false;
2496   - QPDFObjectHandle dict = node;
2497   - if (node.isStream())
2498   - {
2499   - is_stream = true;
2500   - dict = node.getDict();
2501   - // See whether we are able to filter this stream.
2502   - filterable = node.pipeStreamData(
2503   - 0, 0, this->m->stream_decode_level, true);
2504   - }
2505   - else if (this->m->pdf.getRoot().getObjectID() == node.getObjectID())
  2462 + QPDFObjectHandle oh = root.getKey(key);
  2463 + if ((key == "/Extensions") && (oh.isDictionary()))
  2464 + {
  2465 + bool extensions_indirect = false;
  2466 + if (oh.isIndirect())
2506 2467 {
2507   - is_root = true;
  2468 + QTC::TC("qpdf", "QPDFWriter make Extensions direct");
  2469 + extensions_indirect = true;
  2470 + oh = oh.shallowCopy();
  2471 + root.replaceKey(key, oh);
2508 2472 }
2509   -
2510   - std::set<std::string> keys = dict.getKeys();
2511   - for (std::set<std::string>::iterator iter = keys.begin();
2512   - iter != keys.end(); ++iter)
2513   - {
2514   - std::string const& key = *iter;
2515   - QPDFObjectHandle oh = dict.getKey(key);
2516   - bool add_to_queue = true;
2517   - if (is_stream)
2518   - {
2519   - if (oh.isIndirect() &&
2520   - ((key == "/Length") ||
2521   - (filterable &&
2522   - ((key == "/Filter") ||
2523   - (key == "/DecodeParms")))))
2524   - {
2525   - QTC::TC("qpdf", "QPDFWriter make stream key direct");
2526   - add_to_queue = false;
2527   - oh.makeDirect();
2528   - dict.replaceKey(key, oh);
2529   - }
2530   - }
2531   - else if (is_root)
  2473 + if (oh.hasKey("/ADBE"))
  2474 + {
  2475 + QPDFObjectHandle adbe = oh.getKey("/ADBE");
  2476 + if (adbe.isIndirect())
2532 2477 {
2533   - if ((key == "/Extensions") && (oh.isDictionary()))
2534   - {
2535   - bool extensions_indirect = false;
2536   - if (oh.isIndirect())
2537   - {
2538   - QTC::TC("qpdf", "QPDFWriter make Extensions direct");
2539   - extensions_indirect = true;
2540   - add_to_queue = false;
2541   - oh = oh.shallowCopy();
2542   - dict.replaceKey(key, oh);
2543   - }
2544   - if (oh.hasKey("/ADBE"))
2545   - {
2546   - QPDFObjectHandle adbe = oh.getKey("/ADBE");
2547   - if (adbe.isIndirect())
2548   - {
2549   - QTC::TC("qpdf", "QPDFWriter make ADBE direct",
2550   - extensions_indirect ? 0 : 1);
2551   - adbe.makeDirect();
2552   - oh.replaceKey("/ADBE", adbe);
2553   - }
2554   - }
2555   - }
  2478 + QTC::TC("qpdf", "QPDFWriter make ADBE direct",
  2479 + extensions_indirect ? 0 : 1);
  2480 + adbe.makeDirect();
  2481 + oh.replaceKey("/ADBE", adbe);
2556 2482 }
2557   -
2558   - if (add_to_queue)
2559   - {
2560   - queue.push_back(oh);
2561   - }
2562   - }
2563   - }
  2483 + }
  2484 + }
2564 2485 }
2565 2486 }
2566 2487  
... ... @@ -2737,14 +2658,11 @@ QPDFWriter::write()
2737 2658 {
2738 2659 doWriteSetup();
2739 2660  
2740   - // Set up progress reporting. We spent about equal amounts of time
2741   - // preparing and writing one pass. To get a rough estimate of
2742   - // progress, we track handling of indirect objects. For linearized
2743   - // files, we write two passes. events_expected is an
2744   - // approximation, but it's good enough for progress reporting,
2745   - // which is mostly a guess anyway.
  2661 + // Set up progress reporting. For linearized files, we write two
  2662 + // passes. events_expected is an approximation, but it's good
  2663 + // enough for progress reporting, which is mostly a guess anyway.
2746 2664 this->m->events_expected = QIntC::to_int(
2747   - this->m->pdf.getObjectCount() * (this->m->linearized ? 3 : 2));
  2665 + this->m->pdf.getObjectCount() * (this->m->linearized ? 2 : 1));
2748 2666  
2749 2667 prepareFileForWrite();
2750 2668  
... ... @@ -3138,8 +3056,21 @@ QPDFWriter::writeLinearized()
3138 3056 discardGeneration(this->m->object_to_object_stream,
3139 3057 this->m->object_to_object_stream_no_gen);
3140 3058  
3141   - bool need_xref_stream = (! this->m->object_to_object_stream.empty());
3142   - this->m->pdf.optimize(this->m->object_to_object_stream_no_gen);
  3059 + auto skip_stream_parameters = [this](QPDFObjectHandle& stream) {
  3060 + bool compress_stream;
  3061 + bool is_metadata;
  3062 + if (willFilterStream(stream, compress_stream, is_metadata, nullptr))
  3063 + {
  3064 + return 2;
  3065 + }
  3066 + else
  3067 + {
  3068 + return 1;
  3069 + }
  3070 + };
  3071 +
  3072 + this->m->pdf.optimize(this->m->object_to_object_stream_no_gen,
  3073 + true, skip_stream_parameters);
3143 3074  
3144 3075 std::vector<QPDFObjectHandle> part4;
3145 3076 std::vector<QPDFObjectHandle> part6;
... ... @@ -3173,6 +3104,7 @@ QPDFWriter::writeLinearized()
3173 3104 int after_second_half = 1 + second_half_uncompressed;
3174 3105 this->m->next_objid = after_second_half;
3175 3106 int second_half_xref = 0;
  3107 + bool need_xref_stream = (! this->m->object_to_object_stream.empty());
3176 3108 if (need_xref_stream)
3177 3109 {
3178 3110 second_half_xref = this->m->next_objid++;
... ...
qpdf/qpdf.testcov
... ... @@ -234,7 +234,6 @@ QPDFWriter extra header text no newline 0
234 234 QPDFWriter extra header text add newline 0
235 235 QPDF bogus 0 offset 0
236 236 QPDF global offset 0
237   -QPDFWriter make stream key direct 0
238 237 QPDFWriter copy V5 0
239 238 QPDFWriter increasing extension level 0
240 239 QPDFWriter make Extensions direct 0
... ...
qpdf/qtest/qpdf.test
... ... @@ -716,7 +716,7 @@ my @bug_tests = (
716 716 ["99b", "object 0", 2],
717 717 ["100", "xref reconstruction loop", 2],
718 718 ["101", "resolve for exception text", 2],
719   - ["117", "other infinite loop", 2],
  719 + ["117", "other infinite loop", 3],
720 720 ["118", "other infinite loop", 2],
721 721 ["119", "other infinite loop", 3],
722 722 ["120", "other infinite loop", 3],
... ...
qpdf/qtest/qpdf/issue-117.out
... ... @@ -13,4 +13,4 @@ WARNING: issue-117.pdf (object 7 0, offset 1791): unknown token while reading ob
13 13 WARNING: issue-117.pdf (object 7 0, offset 1267): /Length key in stream dictionary is not an integer
14 14 WARNING: issue-117.pdf (object 7 0, offset 1418): attempting to recover stream length
15 15 WARNING: issue-117.pdf (object 7 0, offset 1418): recovered stream length: 347
16   -attempt to make a stream into a direct object
  16 +qpdf: operation succeeded with warnings; resulting file may have some problems
... ...