Commit b8723e97f4b94fe03e631aab0309382ead3137ed
1 parent
25988e8d
Add coalesce contents capability
Showing
9 changed files
with
566 additions
and
21 deletions
ChangeLog
| ... | ... | @@ -78,6 +78,35 @@ |
| 78 | 78 | production use. Even if it did, it would be very unusual for a PDF |
| 79 | 79 | file to actually be adversely affected by this issue. |
| 80 | 80 | |
| 81 | + * Add support for coalescing a page's contents into a single | |
| 82 | + stream if they are represented as an array of streams. This can be | |
| 83 | + performed from the command line using the --coalesce-contents | |
| 84 | + option. Coalescing content streams can simplify things for | |
| 85 | + software that wants to operate on a page's content streams without | |
| 86 | + having to handle weird edge cases like content streams split in | |
| 87 | + the middle of tokens. Note that | |
| 88 | + QPDFObjectHandle::parsePageContents and | |
| 89 | + QPDFObjectHandle::parseContentStream already handled split content | |
| 90 | + streams. This is mainly to set the stage for new methods of | |
| 91 | + operating on page contents. The new method | |
| 92 | + QPDFObjectHandle::pipeContentStreams will pipe all of a page's | |
| 93 | + content streams through a single pipeline. The new method | |
| 94 | + QPDFObjectHandle::coalesceContentStreams, when called on a page | |
| 95 | + object, will do nothing if the page's contents are a single | |
| 96 | + stream, but if they are an array of streams, it will replace the | |
| 97 | + page's contents with a single stream whose contents are the | |
| 98 | + concatenation of the original streams. | |
| 99 | + | |
| 100 | + * A few library routines throw exceptions if called on non-page | |
| 101 | + objects. These constraints have been relaxed somewhat to make qpdf | |
| 102 | + more tolerant of files whose page dictionaries are not properly | |
| 103 | + marked as such. Mostly exceptions about page operations being | |
| 104 | + called on non-page objects will only be thrown in cases where the | |
| 105 | + operation had no chance of succeeding anyway. This change has no | |
| 106 | + impact on any default mode operations, but it could allow | |
| 107 | + applications that use page-level APIs in QPDFObjectHandle to be | |
| 108 | + more tolerant of certain types of damaged files. | |
| 109 | + | |
| 81 | 110 | 2018-02-04 Jay Berkenbilt <ejb@ql.org> |
| 82 | 111 | |
| 83 | 112 | * Add QPDFWriter::setLinearizationPass1Filename method and | ... | ... |
include/qpdf/QPDFObjectHandle.hh
| ... | ... | @@ -200,6 +200,20 @@ class QPDFObjectHandle |
| 200 | 200 | QPDF_DLL |
| 201 | 201 | void pipePageContents(Pipeline* p); |
| 202 | 202 | |
| 203 | + // When called on a stream or stream array that is some page's | |
| 204 | + // content streams, do the same as pipePageContents. This method | |
| 205 | + // is a lower level way to do what pipePageContents does, but it | |
| 206 | + // allows you to perform this operation on a contents object that | |
| 207 | + // is disconnected from a page object. The description argument | |
| 208 | + // should describe the containing page and is used in error | |
| 209 | + // messages. The all_description argument is initialized to | |
| 210 | + // something that could be used to describe the result of the | |
| 211 | + // pipeline. It is the description amended with the identifiers of | |
| 212 | + // the underlying objects. | |
| 213 | + QPDF_DLL | |
| 214 | + void pipeContentStreams(Pipeline* p, std::string const& description, | |
| 215 | + std::string& all_description); | |
| 216 | + | |
| 203 | 217 | // Older method: stream_or_array should be the value of /Contents |
| 204 | 218 | // from a page object. It's more convenient to just call |
| 205 | 219 | // parsePageContents on the page object, and error messages will |
| ... | ... | @@ -556,30 +570,30 @@ class QPDFObjectHandle |
| 556 | 570 | |
| 557 | 571 | // Convenience routines for commonly performed functions |
| 558 | 572 | |
| 559 | - // Throws an exception if this is not a Page object. Returns an | |
| 560 | - // empty map if there are no images or no resources. This | |
| 561 | - // function does not presently support inherited resources. If | |
| 562 | - // this is a significant concern, call | |
| 573 | + // Returns an empty map if there are no images or no resources. | |
| 574 | + // This function does not presently support inherited resources. | |
| 575 | + // If this is a significant concern, call | |
| 563 | 576 | // pushInheritedAttributesToPage() on the QPDF object that owns |
| 564 | - // this page. See comment in the source for details. Return | |
| 565 | - // value is a map from XObject name to the image object, which is | |
| 566 | - // always a stream. | |
| 577 | + // this page. See comment in the source for details. Return value | |
| 578 | + // is a map from XObject name to the image object, which is always | |
| 579 | + // a stream. | |
| 567 | 580 | QPDF_DLL |
| 568 | 581 | std::map<std::string, QPDFObjectHandle> getPageImages(); |
| 569 | 582 | |
| 570 | 583 | // Returns a vector of stream objects representing the content |
| 571 | 584 | // streams for the given page. This routine allows the caller to |
| 572 | 585 | // not care whether there are one or more than one content streams |
| 573 | - // for a page. Throws an exception if this is not a Page object. | |
| 586 | + // for a page. | |
| 574 | 587 | QPDF_DLL |
| 575 | 588 | std::vector<QPDFObjectHandle> getPageContents(); |
| 576 | 589 | |
| 577 | - // Add the given object as a new content stream for this page. If | |
| 578 | - // parameter 'first' is true, add to the beginning. Otherwise, | |
| 579 | - // add to the end. This routine automatically converts the page | |
| 590 | + // Add the given object as a new content stream for this page. If | |
| 591 | + // parameter 'first' is true, add to the beginning. Otherwise, add | |
| 592 | + // to the end. This routine automatically converts the page | |
| 580 | 593 | // contents to an array if it is a scalar, allowing the caller not |
| 581 | - // to care what the initial structure is. Throws an exception if | |
| 582 | - // this is not a Page object. | |
| 594 | + // to care what the initial structure is. You can call | |
| 595 | + // coalesceContentStreams() afterwards if you want to force it to | |
| 596 | + // be a single stream. | |
| 583 | 597 | QPDF_DLL |
| 584 | 598 | void addPageContents(QPDFObjectHandle contents, bool first); |
| 585 | 599 | |
| ... | ... | @@ -590,6 +604,16 @@ class QPDFObjectHandle |
| 590 | 604 | QPDF_DLL |
| 591 | 605 | void rotatePage(int angle, bool relative); |
| 592 | 606 | |
| 607 | + // Coalesce a page's content streams. A page's content may be a | |
| 608 | + // stream or an array of streams. If this page's content is an | |
| 609 | + // array, concatenate the streams into a single stream. This can | |
| 610 | + // be useful when working with files that split content streams in | |
| 611 | + // arbitrary spots, such as in the middle of a token, as that can | |
| 612 | + // confuse some software. You could also call this after calling | |
| 613 | + // addPageContents. | |
| 614 | + QPDF_DLL | |
| 615 | + void coalesceContentStreams(); | |
| 616 | + | |
| 593 | 617 | // Initializers for objects. This Factory class gives the QPDF |
| 594 | 618 | // class specific permission to call factory methods without |
| 595 | 619 | // making it a friend of the whole QPDFObjectHandle class. |
| ... | ... | @@ -724,8 +748,6 @@ class QPDFObjectHandle |
| 724 | 748 | ParserCallbacks* callbacks); |
| 725 | 749 | std::vector<QPDFObjectHandle> arrayOrStreamToStreamArray( |
| 726 | 750 | std::string const& description, std::string& all_description); |
| 727 | - void pipeContentStreams(Pipeline* p, std::string const& description, | |
| 728 | - std::string& all_description); | |
| 729 | 751 | static void warn(QPDF*, QPDFExc const&); |
| 730 | 752 | |
| 731 | 753 | bool initialized; | ... | ... |
libqpdf/QPDFObjectHandle.cc
| ... | ... | @@ -14,6 +14,7 @@ |
| 14 | 14 | #include <qpdf/QPDF_Stream.hh> |
| 15 | 15 | #include <qpdf/QPDF_Reserved.hh> |
| 16 | 16 | #include <qpdf/Pl_Buffer.hh> |
| 17 | +#include <qpdf/Pl_Concatenate.hh> | |
| 17 | 18 | #include <qpdf/BufferInputSource.hh> |
| 18 | 19 | #include <qpdf/QPDFExc.hh> |
| 19 | 20 | |
| ... | ... | @@ -28,6 +29,39 @@ class TerminateParsing |
| 28 | 29 | { |
| 29 | 30 | }; |
| 30 | 31 | |
| 32 | +class CoalesceProvider: public QPDFObjectHandle::StreamDataProvider | |
| 33 | +{ | |
| 34 | + public: | |
| 35 | + CoalesceProvider(QPDFObjectHandle containing_page, | |
| 36 | + QPDFObjectHandle old_contents) : | |
| 37 | + containing_page(containing_page), | |
| 38 | + old_contents(old_contents) | |
| 39 | + { | |
| 40 | + } | |
| 41 | + virtual ~CoalesceProvider() | |
| 42 | + { | |
| 43 | + } | |
| 44 | + virtual void provideStreamData(int objid, int generation, | |
| 45 | + Pipeline* pipeline); | |
| 46 | + | |
| 47 | + private: | |
| 48 | + QPDFObjectHandle containing_page; | |
| 49 | + QPDFObjectHandle old_contents; | |
| 50 | +}; | |
| 51 | + | |
| 52 | +void | |
| 53 | +CoalesceProvider::provideStreamData(int, int, Pipeline* p) | |
| 54 | +{ | |
| 55 | + QTC::TC("qpdf", "QPDFObjectHandle coalesce provide stream data"); | |
| 56 | + Pl_Concatenate concat("concatenate", p); | |
| 57 | + std::string description = "page object " + | |
| 58 | + QUtil::int_to_string(containing_page.getObjectID()) + " " + | |
| 59 | + QUtil::int_to_string(containing_page.getGeneration()); | |
| 60 | + std::string all_description; | |
| 61 | + old_contents.pipeContentStreams(&concat, description, all_description); | |
| 62 | + concat.manualFinish(); | |
| 63 | +} | |
| 64 | + | |
| 31 | 65 | void |
| 32 | 66 | QPDFObjectHandle::ParserCallbacks::terminateParsing() |
| 33 | 67 | { |
| ... | ... | @@ -691,7 +725,6 @@ QPDFObjectHandle::arrayOrStreamToStreamArray( |
| 691 | 725 | std::vector<QPDFObjectHandle> |
| 692 | 726 | QPDFObjectHandle::getPageContents() |
| 693 | 727 | { |
| 694 | - assertPageObject(); | |
| 695 | 728 | std::string description = "page object " + |
| 696 | 729 | QUtil::int_to_string(this->objid) + " " + |
| 697 | 730 | QUtil::int_to_string(this->generation); |
| ... | ... | @@ -703,7 +736,6 @@ QPDFObjectHandle::getPageContents() |
| 703 | 736 | void |
| 704 | 737 | QPDFObjectHandle::addPageContents(QPDFObjectHandle new_contents, bool first) |
| 705 | 738 | { |
| 706 | - assertPageObject(); | |
| 707 | 739 | new_contents.assertStream(); |
| 708 | 740 | |
| 709 | 741 | std::vector<QPDFObjectHandle> orig_contents = getPageContents(); |
| ... | ... | @@ -785,6 +817,33 @@ QPDFObjectHandle::rotatePage(int angle, bool relative) |
| 785 | 817 | replaceKey("/Rotate", QPDFObjectHandle::newInteger(new_angle)); |
| 786 | 818 | } |
| 787 | 819 | |
| 820 | +void | |
| 821 | +QPDFObjectHandle::coalesceContentStreams() | |
| 822 | +{ | |
| 823 | + assertPageObject(); | |
| 824 | + QPDFObjectHandle contents = this->getKey("/Contents"); | |
| 825 | + if (contents.isStream()) | |
| 826 | + { | |
| 827 | + QTC::TC("qpdf", "QPDFObjectHandle coalesce called on stream"); | |
| 828 | + return; | |
| 829 | + } | |
| 830 | + QPDF* qpdf = getOwningQPDF(); | |
| 831 | + if (qpdf == 0) | |
| 832 | + { | |
| 833 | + // Should not be possible for a page object to not have an | |
| 834 | + // owning PDF unless it was manually constructed in some | |
| 835 | + // incorrect way. | |
| 836 | + throw std::logic_error("coalesceContentStreams called on object" | |
| 837 | + " with no associated PDF file"); | |
| 838 | + } | |
| 839 | + QPDFObjectHandle new_contents = newStream(qpdf); | |
| 840 | + this->replaceKey("/Contents", new_contents); | |
| 841 | + | |
| 842 | + PointerHolder<StreamDataProvider> provider = | |
| 843 | + new CoalesceProvider(*this, contents); | |
| 844 | + new_contents.replaceStreamData(provider, newNull(), newNull()); | |
| 845 | +} | |
| 846 | + | |
| 788 | 847 | std::string |
| 789 | 848 | QPDFObjectHandle::unparse() |
| 790 | 849 | { |
| ... | ... | @@ -842,6 +901,7 @@ QPDFObjectHandle::parse(std::string const& object_str, |
| 842 | 901 | void |
| 843 | 902 | QPDFObjectHandle::pipePageContents(Pipeline* p) |
| 844 | 903 | { |
| 904 | + assertPageObject(); | |
| 845 | 905 | std::string description = "page object " + |
| 846 | 906 | QUtil::int_to_string(this->objid) + " " + |
| 847 | 907 | QUtil::int_to_string(this->generation); |
| ... | ... | @@ -879,6 +939,7 @@ QPDFObjectHandle::pipeContentStreams( |
| 879 | 939 | void |
| 880 | 940 | QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks) |
| 881 | 941 | { |
| 942 | + assertPageObject(); | |
| 882 | 943 | std::string description = "page object " + |
| 883 | 944 | QUtil::int_to_string(this->objid) + " " + |
| 884 | 945 | QUtil::int_to_string(this->generation); |
| ... | ... | @@ -1728,15 +1789,15 @@ QPDFObjectHandle::assertNumber() |
| 1728 | 1789 | bool |
| 1729 | 1790 | QPDFObjectHandle::isPageObject() |
| 1730 | 1791 | { |
| 1731 | - return (this->isDictionary() && this->hasKey("/Type") && | |
| 1732 | - (this->getKey("/Type").getName() == "/Page")); | |
| 1792 | + // Some PDF files have /Type broken on pages. | |
| 1793 | + return (this->isDictionary() && this->hasKey("/Contents")); | |
| 1733 | 1794 | } |
| 1734 | 1795 | |
| 1735 | 1796 | bool |
| 1736 | 1797 | QPDFObjectHandle::isPagesObject() |
| 1737 | 1798 | { |
| 1738 | - return (this->isDictionary() && this->hasKey("/Type") && | |
| 1739 | - (this->getKey("/Type").getName() == "/Pages")); | |
| 1799 | + // Some PDF files have /Type broken on pages. | |
| 1800 | + return (this->isDictionary() && this->hasKey("/Kids")); | |
| 1740 | 1801 | } |
| 1741 | 1802 | |
| 1742 | 1803 | void | ... | ... |
qpdf/qpdf.cc
| ... | ... | @@ -90,6 +90,7 @@ struct Options |
| 90 | 90 | qdf_mode(false), |
| 91 | 91 | preserve_unreferenced_objects(false), |
| 92 | 92 | newline_before_endstream(false), |
| 93 | + coalesce_contents(false), | |
| 93 | 94 | show_npages(false), |
| 94 | 95 | deterministic_id(false), |
| 95 | 96 | static_id(false), |
| ... | ... | @@ -154,6 +155,7 @@ struct Options |
| 154 | 155 | bool preserve_unreferenced_objects; |
| 155 | 156 | bool newline_before_endstream; |
| 156 | 157 | std::string linearize_pass1; |
| 158 | + bool coalesce_contents; | |
| 157 | 159 | std::string min_version; |
| 158 | 160 | std::string force_version; |
| 159 | 161 | bool show_npages; |
| ... | ... | @@ -391,6 +393,7 @@ familiar with the PDF file format or who are PDF developers.\n\ |
| 391 | 393 | --object-streams=mode controls handing of object streams\n\ |
| 392 | 394 | --preserve-unreferenced preserve unreferenced objects\n\ |
| 393 | 395 | --newline-before-endstream always put a newline before endstream\n\ |
| 396 | +--coalesce-contents force all pages' content to be a single stream\n\ | |
| 394 | 397 | --qdf turns on \"QDF mode\" (below)\n\ |
| 395 | 398 | --linearize-pass1=file write intermediate pass of linearized file\n\ |
| 396 | 399 | for debugging\n\ |
| ... | ... | @@ -1543,6 +1546,10 @@ static void parse_options(int argc, char* argv[], Options& o) |
| 1543 | 1546 | } |
| 1544 | 1547 | o.linearize_pass1 = parameter; |
| 1545 | 1548 | } |
| 1549 | + else if (strcmp(arg, "coalesce-contents") == 0) | |
| 1550 | + { | |
| 1551 | + o.coalesce_contents = true; | |
| 1552 | + } | |
| 1546 | 1553 | else if (strcmp(arg, "min-version") == 0) |
| 1547 | 1554 | { |
| 1548 | 1555 | if (parameter == 0) |
| ... | ... | @@ -1960,6 +1967,19 @@ static void do_inspection(QPDF& pdf, Options& o) |
| 1960 | 1967 | } |
| 1961 | 1968 | } |
| 1962 | 1969 | |
| 1970 | +static void handle_transformations(QPDF& pdf, Options& o) | |
| 1971 | +{ | |
| 1972 | + if (o.coalesce_contents) | |
| 1973 | + { | |
| 1974 | + std::vector<QPDFObjectHandle> pages = pdf.getAllPages(); | |
| 1975 | + for (std::vector<QPDFObjectHandle>::iterator iter = pages.begin(); | |
| 1976 | + iter != pages.end(); ++iter) | |
| 1977 | + { | |
| 1978 | + (*iter).coalesceContentStreams(); | |
| 1979 | + } | |
| 1980 | + } | |
| 1981 | +} | |
| 1982 | + | |
| 1963 | 1983 | static void handle_page_specs(QPDF& pdf, Options& o, |
| 1964 | 1984 | std::vector<PointerHolder<QPDF> >& page_heap) |
| 1965 | 1985 | { |
| ... | ... | @@ -2382,6 +2402,7 @@ int main(int argc, char* argv[]) |
| 2382 | 2402 | pdf.processFile(o.infilename, o.password); |
| 2383 | 2403 | } |
| 2384 | 2404 | |
| 2405 | + handle_transformations(pdf, o); | |
| 2385 | 2406 | std::vector<PointerHolder<QPDF> > page_heap; |
| 2386 | 2407 | if (! o.page_specs.empty()) |
| 2387 | 2408 | { | ... | ... |
qpdf/qpdf.testcov
| ... | ... | @@ -304,3 +304,5 @@ QPDFTokenizer EOF when not allowed 0 |
| 304 | 304 | QPDFTokenizer inline image at EOF 0 |
| 305 | 305 | Pl_QPDFTokenizer found ID 0 |
| 306 | 306 | QPDFObjectHandle non-stream in stream array 0 |
| 307 | +QPDFObjectHandle coalesce called on stream 0 | |
| 308 | +QPDFObjectHandle coalesce provide stream data 0 | ... | ... |
qpdf/qtest/qpdf.test
| ... | ... | @@ -736,6 +736,28 @@ $td->runtest("stream with tiff predictor", |
| 736 | 736 | |
| 737 | 737 | show_ntests(); |
| 738 | 738 | # ---------- |
| 739 | +$td->notify("--- Coalesce contents ---"); | |
| 740 | +$n_tests += 4; | |
| 741 | + | |
| 742 | +$td->runtest("coalesce contents with qdf", | |
| 743 | + {$td->COMMAND => | |
| 744 | + "qpdf --qdf --static-id" . | |
| 745 | + " --coalesce-contents coalesce.pdf a.pdf"}, | |
| 746 | + {$td->STRING => "", $td->EXIT_STATUS => 0}); | |
| 747 | +$td->runtest("check output", | |
| 748 | + {$td->FILE => "a.pdf"}, | |
| 749 | + {$td->FILE => "coalesce-out.qdf"}); | |
| 750 | +$td->runtest("coalesce contents without qdf", | |
| 751 | + {$td->COMMAND => | |
| 752 | + "qpdf --static-id" . | |
| 753 | + " --coalesce-contents coalesce.pdf a.pdf"}, | |
| 754 | + {$td->STRING => "", $td->EXIT_STATUS => 0}); | |
| 755 | +$td->runtest("check output", | |
| 756 | + {$td->FILE => "a.pdf"}, | |
| 757 | + {$td->FILE => "coalesce-out.pdf"}); | |
| 758 | + | |
| 759 | +show_ntests(); | |
| 760 | +# ---------- | |
| 739 | 761 | $td->notify("--- Newline before endstream ---"); |
| 740 | 762 | $n_tests += 10; |
| 741 | 763 | ... | ... |
qpdf/qtest/qpdf/coalesce-out.pdf
0 → 100644
No preview for this file type
qpdf/qtest/qpdf/coalesce-out.qdf
0 → 100644
No preview for this file type
qpdf/qtest/qpdf/coalesce.pdf
0 → 100644
No preview for this file type