Commit b8723e97f4b94fe03e631aab0309382ead3137ed

Authored by Jay Berkenbilt
1 parent 25988e8d

Add coalesce contents capability

ChangeLog
@@ -78,6 +78,35 @@ @@ -78,6 +78,35 @@
78 production use. Even if it did, it would be very unusual for a PDF 78 production use. Even if it did, it would be very unusual for a PDF
79 file to actually be adversely affected by this issue. 79 file to actually be adversely affected by this issue.
80 80
  81 + * Add support for coalescing a page's contents into a single
  82 + stream if they are represented as an array of streams. This can be
  83 + performed from the command line using the --coalesce-contents
  84 + option. Coalescing content streams can simplify things for
  85 + software that wants to operate on a page's content streams without
  86 + having to handle weird edge cases like content streams split in
  87 + the middle of tokens. Note that
  88 + QPDFObjectHandle::parsePageContents and
  89 + QPDFObjectHandle::parseContentStream already handled split content
  90 + streams. This is mainly to set the stage for new methods of
  91 + operating on page contents. The new method
  92 + QPDFObjectHandle::pipeContentStreams will pipe all of a page's
  93 + content streams through a single pipeline. The new method
  94 + QPDFObjectHandle::coalesceContentStreams, when called on a page
  95 + object, will do nothing if the page's contents are a single
  96 + stream, but if they are an array of streams, it will replace the
  97 + page's contents with a single stream whose contents are the
  98 + concatenation of the original streams.
  99 +
  100 + * A few library routines throw exceptions if called on non-page
  101 + objects. These constraints have been relaxed somewhat to make qpdf
  102 + more tolerant of files whose page dictionaries are not properly
  103 + marked as such. Mostly exceptions about page operations being
  104 + called on non-page objects will only be thrown in cases where the
  105 + operation had no chance of succeeding anyway. This change has no
  106 + impact on any default mode operations, but it could allow
  107 + applications that use page-level APIs in QPDFObjectHandle to be
  108 + more tolerant of certain types of damaged files.
  109 +
81 2018-02-04 Jay Berkenbilt <ejb@ql.org> 110 2018-02-04 Jay Berkenbilt <ejb@ql.org>
82 111
83 * Add QPDFWriter::setLinearizationPass1Filename method and 112 * Add QPDFWriter::setLinearizationPass1Filename method and
include/qpdf/QPDFObjectHandle.hh
@@ -200,6 +200,20 @@ class QPDFObjectHandle @@ -200,6 +200,20 @@ class QPDFObjectHandle
200 QPDF_DLL 200 QPDF_DLL
201 void pipePageContents(Pipeline* p); 201 void pipePageContents(Pipeline* p);
202 202
  203 + // When called on a stream or stream array that is some page's
  204 + // content streams, do the same as pipePageContents. This method
  205 + // is a lower level way to do what pipePageContents does, but it
  206 + // allows you to perform this operation on a contents object that
  207 + // is disconnected from a page object. The description argument
  208 + // should describe the containing page and is used in error
  209 + // messages. The all_description argument is initialized to
  210 + // something that could be used to describe the result of the
  211 + // pipeline. It is the description amended with the identifiers of
  212 + // the underlying objects.
  213 + QPDF_DLL
  214 + void pipeContentStreams(Pipeline* p, std::string const& description,
  215 + std::string& all_description);
  216 +
203 // Older method: stream_or_array should be the value of /Contents 217 // Older method: stream_or_array should be the value of /Contents
204 // from a page object. It's more convenient to just call 218 // from a page object. It's more convenient to just call
205 // parsePageContents on the page object, and error messages will 219 // parsePageContents on the page object, and error messages will
@@ -556,30 +570,30 @@ class QPDFObjectHandle @@ -556,30 +570,30 @@ class QPDFObjectHandle
556 570
557 // Convenience routines for commonly performed functions 571 // Convenience routines for commonly performed functions
558 572
559 - // Throws an exception if this is not a Page object. Returns an  
560 - // empty map if there are no images or no resources. This  
561 - // function does not presently support inherited resources. If  
562 - // this is a significant concern, call 573 + // Returns an empty map if there are no images or no resources.
  574 + // This function does not presently support inherited resources.
  575 + // If this is a significant concern, call
563 // pushInheritedAttributesToPage() on the QPDF object that owns 576 // pushInheritedAttributesToPage() on the QPDF object that owns
564 - // this page. See comment in the source for details. Return  
565 - // value is a map from XObject name to the image object, which is  
566 - // always a stream. 577 + // this page. See comment in the source for details. Return value
  578 + // is a map from XObject name to the image object, which is always
  579 + // a stream.
567 QPDF_DLL 580 QPDF_DLL
568 std::map<std::string, QPDFObjectHandle> getPageImages(); 581 std::map<std::string, QPDFObjectHandle> getPageImages();
569 582
570 // Returns a vector of stream objects representing the content 583 // Returns a vector of stream objects representing the content
571 // streams for the given page. This routine allows the caller to 584 // streams for the given page. This routine allows the caller to
572 // not care whether there are one or more than one content streams 585 // not care whether there are one or more than one content streams
573 - // for a page. Throws an exception if this is not a Page object. 586 + // for a page.
574 QPDF_DLL 587 QPDF_DLL
575 std::vector<QPDFObjectHandle> getPageContents(); 588 std::vector<QPDFObjectHandle> getPageContents();
576 589
577 - // Add the given object as a new content stream for this page. If  
578 - // parameter 'first' is true, add to the beginning. Otherwise,  
579 - // add to the end. This routine automatically converts the page 590 + // Add the given object as a new content stream for this page. If
  591 + // parameter 'first' is true, add to the beginning. Otherwise, add
  592 + // to the end. This routine automatically converts the page
580 // contents to an array if it is a scalar, allowing the caller not 593 // contents to an array if it is a scalar, allowing the caller not
581 - // to care what the initial structure is. Throws an exception if  
582 - // this is not a Page object. 594 + // to care what the initial structure is. You can call
  595 + // coalesceContentStreams() afterwards if you want to force it to
  596 + // be a single stream.
583 QPDF_DLL 597 QPDF_DLL
584 void addPageContents(QPDFObjectHandle contents, bool first); 598 void addPageContents(QPDFObjectHandle contents, bool first);
585 599
@@ -590,6 +604,16 @@ class QPDFObjectHandle @@ -590,6 +604,16 @@ class QPDFObjectHandle
590 QPDF_DLL 604 QPDF_DLL
591 void rotatePage(int angle, bool relative); 605 void rotatePage(int angle, bool relative);
592 606
  607 + // Coalesce a page's content streams. A page's content may be a
  608 + // stream or an array of streams. If this page's content is an
  609 + // array, concatenate the streams into a single stream. This can
  610 + // be useful when working with files that split content streams in
  611 + // arbitrary spots, such as in the middle of a token, as that can
  612 + // confuse some software. You could also call this after calling
  613 + // addPageContents.
  614 + QPDF_DLL
  615 + void coalesceContentStreams();
  616 +
593 // Initializers for objects. This Factory class gives the QPDF 617 // Initializers for objects. This Factory class gives the QPDF
594 // class specific permission to call factory methods without 618 // class specific permission to call factory methods without
595 // making it a friend of the whole QPDFObjectHandle class. 619 // making it a friend of the whole QPDFObjectHandle class.
@@ -724,8 +748,6 @@ class QPDFObjectHandle @@ -724,8 +748,6 @@ class QPDFObjectHandle
724 ParserCallbacks* callbacks); 748 ParserCallbacks* callbacks);
725 std::vector<QPDFObjectHandle> arrayOrStreamToStreamArray( 749 std::vector<QPDFObjectHandle> arrayOrStreamToStreamArray(
726 std::string const& description, std::string& all_description); 750 std::string const& description, std::string& all_description);
727 - void pipeContentStreams(Pipeline* p, std::string const& description,  
728 - std::string& all_description);  
729 static void warn(QPDF*, QPDFExc const&); 751 static void warn(QPDF*, QPDFExc const&);
730 752
731 bool initialized; 753 bool initialized;
libqpdf/QPDFObjectHandle.cc
@@ -14,6 +14,7 @@ @@ -14,6 +14,7 @@
14 #include <qpdf/QPDF_Stream.hh> 14 #include <qpdf/QPDF_Stream.hh>
15 #include <qpdf/QPDF_Reserved.hh> 15 #include <qpdf/QPDF_Reserved.hh>
16 #include <qpdf/Pl_Buffer.hh> 16 #include <qpdf/Pl_Buffer.hh>
  17 +#include <qpdf/Pl_Concatenate.hh>
17 #include <qpdf/BufferInputSource.hh> 18 #include <qpdf/BufferInputSource.hh>
18 #include <qpdf/QPDFExc.hh> 19 #include <qpdf/QPDFExc.hh>
19 20
@@ -28,6 +29,39 @@ class TerminateParsing @@ -28,6 +29,39 @@ class TerminateParsing
28 { 29 {
29 }; 30 };
30 31
  32 +class CoalesceProvider: public QPDFObjectHandle::StreamDataProvider
  33 +{
  34 + public:
  35 + CoalesceProvider(QPDFObjectHandle containing_page,
  36 + QPDFObjectHandle old_contents) :
  37 + containing_page(containing_page),
  38 + old_contents(old_contents)
  39 + {
  40 + }
  41 + virtual ~CoalesceProvider()
  42 + {
  43 + }
  44 + virtual void provideStreamData(int objid, int generation,
  45 + Pipeline* pipeline);
  46 +
  47 + private:
  48 + QPDFObjectHandle containing_page;
  49 + QPDFObjectHandle old_contents;
  50 +};
  51 +
  52 +void
  53 +CoalesceProvider::provideStreamData(int, int, Pipeline* p)
  54 +{
  55 + QTC::TC("qpdf", "QPDFObjectHandle coalesce provide stream data");
  56 + Pl_Concatenate concat("concatenate", p);
  57 + std::string description = "page object " +
  58 + QUtil::int_to_string(containing_page.getObjectID()) + " " +
  59 + QUtil::int_to_string(containing_page.getGeneration());
  60 + std::string all_description;
  61 + old_contents.pipeContentStreams(&concat, description, all_description);
  62 + concat.manualFinish();
  63 +}
  64 +
31 void 65 void
32 QPDFObjectHandle::ParserCallbacks::terminateParsing() 66 QPDFObjectHandle::ParserCallbacks::terminateParsing()
33 { 67 {
@@ -691,7 +725,6 @@ QPDFObjectHandle::arrayOrStreamToStreamArray( @@ -691,7 +725,6 @@ QPDFObjectHandle::arrayOrStreamToStreamArray(
691 std::vector<QPDFObjectHandle> 725 std::vector<QPDFObjectHandle>
692 QPDFObjectHandle::getPageContents() 726 QPDFObjectHandle::getPageContents()
693 { 727 {
694 - assertPageObject();  
695 std::string description = "page object " + 728 std::string description = "page object " +
696 QUtil::int_to_string(this->objid) + " " + 729 QUtil::int_to_string(this->objid) + " " +
697 QUtil::int_to_string(this->generation); 730 QUtil::int_to_string(this->generation);
@@ -703,7 +736,6 @@ QPDFObjectHandle::getPageContents() @@ -703,7 +736,6 @@ QPDFObjectHandle::getPageContents()
703 void 736 void
704 QPDFObjectHandle::addPageContents(QPDFObjectHandle new_contents, bool first) 737 QPDFObjectHandle::addPageContents(QPDFObjectHandle new_contents, bool first)
705 { 738 {
706 - assertPageObject();  
707 new_contents.assertStream(); 739 new_contents.assertStream();
708 740
709 std::vector<QPDFObjectHandle> orig_contents = getPageContents(); 741 std::vector<QPDFObjectHandle> orig_contents = getPageContents();
@@ -785,6 +817,33 @@ QPDFObjectHandle::rotatePage(int angle, bool relative) @@ -785,6 +817,33 @@ QPDFObjectHandle::rotatePage(int angle, bool relative)
785 replaceKey("/Rotate", QPDFObjectHandle::newInteger(new_angle)); 817 replaceKey("/Rotate", QPDFObjectHandle::newInteger(new_angle));
786 } 818 }
787 819
  820 +void
  821 +QPDFObjectHandle::coalesceContentStreams()
  822 +{
  823 + assertPageObject();
  824 + QPDFObjectHandle contents = this->getKey("/Contents");
  825 + if (contents.isStream())
  826 + {
  827 + QTC::TC("qpdf", "QPDFObjectHandle coalesce called on stream");
  828 + return;
  829 + }
  830 + QPDF* qpdf = getOwningQPDF();
  831 + if (qpdf == 0)
  832 + {
  833 + // Should not be possible for a page object to not have an
  834 + // owning PDF unless it was manually constructed in some
  835 + // incorrect way.
  836 + throw std::logic_error("coalesceContentStreams called on object"
  837 + " with no associated PDF file");
  838 + }
  839 + QPDFObjectHandle new_contents = newStream(qpdf);
  840 + this->replaceKey("/Contents", new_contents);
  841 +
  842 + PointerHolder<StreamDataProvider> provider =
  843 + new CoalesceProvider(*this, contents);
  844 + new_contents.replaceStreamData(provider, newNull(), newNull());
  845 +}
  846 +
788 std::string 847 std::string
789 QPDFObjectHandle::unparse() 848 QPDFObjectHandle::unparse()
790 { 849 {
@@ -842,6 +901,7 @@ QPDFObjectHandle::parse(std::string const& object_str, @@ -842,6 +901,7 @@ QPDFObjectHandle::parse(std::string const& object_str,
842 void 901 void
843 QPDFObjectHandle::pipePageContents(Pipeline* p) 902 QPDFObjectHandle::pipePageContents(Pipeline* p)
844 { 903 {
  904 + assertPageObject();
845 std::string description = "page object " + 905 std::string description = "page object " +
846 QUtil::int_to_string(this->objid) + " " + 906 QUtil::int_to_string(this->objid) + " " +
847 QUtil::int_to_string(this->generation); 907 QUtil::int_to_string(this->generation);
@@ -879,6 +939,7 @@ QPDFObjectHandle::pipeContentStreams( @@ -879,6 +939,7 @@ QPDFObjectHandle::pipeContentStreams(
879 void 939 void
880 QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks) 940 QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks)
881 { 941 {
  942 + assertPageObject();
882 std::string description = "page object " + 943 std::string description = "page object " +
883 QUtil::int_to_string(this->objid) + " " + 944 QUtil::int_to_string(this->objid) + " " +
884 QUtil::int_to_string(this->generation); 945 QUtil::int_to_string(this->generation);
@@ -1728,15 +1789,15 @@ QPDFObjectHandle::assertNumber() @@ -1728,15 +1789,15 @@ QPDFObjectHandle::assertNumber()
1728 bool 1789 bool
1729 QPDFObjectHandle::isPageObject() 1790 QPDFObjectHandle::isPageObject()
1730 { 1791 {
1731 - return (this->isDictionary() && this->hasKey("/Type") &&  
1732 - (this->getKey("/Type").getName() == "/Page")); 1792 + // Some PDF files have /Type broken on pages.
  1793 + return (this->isDictionary() && this->hasKey("/Contents"));
1733 } 1794 }
1734 1795
1735 bool 1796 bool
1736 QPDFObjectHandle::isPagesObject() 1797 QPDFObjectHandle::isPagesObject()
1737 { 1798 {
1738 - return (this->isDictionary() && this->hasKey("/Type") &&  
1739 - (this->getKey("/Type").getName() == "/Pages")); 1799 + // Some PDF files have /Type broken on pages.
  1800 + return (this->isDictionary() && this->hasKey("/Kids"));
1740 } 1801 }
1741 1802
1742 void 1803 void
qpdf/qpdf.cc
@@ -90,6 +90,7 @@ struct Options @@ -90,6 +90,7 @@ struct Options
90 qdf_mode(false), 90 qdf_mode(false),
91 preserve_unreferenced_objects(false), 91 preserve_unreferenced_objects(false),
92 newline_before_endstream(false), 92 newline_before_endstream(false),
  93 + coalesce_contents(false),
93 show_npages(false), 94 show_npages(false),
94 deterministic_id(false), 95 deterministic_id(false),
95 static_id(false), 96 static_id(false),
@@ -154,6 +155,7 @@ struct Options @@ -154,6 +155,7 @@ struct Options
154 bool preserve_unreferenced_objects; 155 bool preserve_unreferenced_objects;
155 bool newline_before_endstream; 156 bool newline_before_endstream;
156 std::string linearize_pass1; 157 std::string linearize_pass1;
  158 + bool coalesce_contents;
157 std::string min_version; 159 std::string min_version;
158 std::string force_version; 160 std::string force_version;
159 bool show_npages; 161 bool show_npages;
@@ -391,6 +393,7 @@ familiar with the PDF file format or who are PDF developers.\n\ @@ -391,6 +393,7 @@ familiar with the PDF file format or who are PDF developers.\n\
391 --object-streams=mode controls handing of object streams\n\ 393 --object-streams=mode controls handing of object streams\n\
392 --preserve-unreferenced preserve unreferenced objects\n\ 394 --preserve-unreferenced preserve unreferenced objects\n\
393 --newline-before-endstream always put a newline before endstream\n\ 395 --newline-before-endstream always put a newline before endstream\n\
  396 +--coalesce-contents force all pages' content to be a single stream\n\
394 --qdf turns on \"QDF mode\" (below)\n\ 397 --qdf turns on \"QDF mode\" (below)\n\
395 --linearize-pass1=file write intermediate pass of linearized file\n\ 398 --linearize-pass1=file write intermediate pass of linearized file\n\
396 for debugging\n\ 399 for debugging\n\
@@ -1543,6 +1546,10 @@ static void parse_options(int argc, char* argv[], Options& o) @@ -1543,6 +1546,10 @@ static void parse_options(int argc, char* argv[], Options& o)
1543 } 1546 }
1544 o.linearize_pass1 = parameter; 1547 o.linearize_pass1 = parameter;
1545 } 1548 }
  1549 + else if (strcmp(arg, "coalesce-contents") == 0)
  1550 + {
  1551 + o.coalesce_contents = true;
  1552 + }
1546 else if (strcmp(arg, "min-version") == 0) 1553 else if (strcmp(arg, "min-version") == 0)
1547 { 1554 {
1548 if (parameter == 0) 1555 if (parameter == 0)
@@ -1960,6 +1967,19 @@ static void do_inspection(QPDF& pdf, Options& o) @@ -1960,6 +1967,19 @@ static void do_inspection(QPDF& pdf, Options& o)
1960 } 1967 }
1961 } 1968 }
1962 1969
  1970 +static void handle_transformations(QPDF& pdf, Options& o)
  1971 +{
  1972 + if (o.coalesce_contents)
  1973 + {
  1974 + std::vector<QPDFObjectHandle> pages = pdf.getAllPages();
  1975 + for (std::vector<QPDFObjectHandle>::iterator iter = pages.begin();
  1976 + iter != pages.end(); ++iter)
  1977 + {
  1978 + (*iter).coalesceContentStreams();
  1979 + }
  1980 + }
  1981 +}
  1982 +
1963 static void handle_page_specs(QPDF& pdf, Options& o, 1983 static void handle_page_specs(QPDF& pdf, Options& o,
1964 std::vector<PointerHolder<QPDF> >& page_heap) 1984 std::vector<PointerHolder<QPDF> >& page_heap)
1965 { 1985 {
@@ -2382,6 +2402,7 @@ int main(int argc, char* argv[]) @@ -2382,6 +2402,7 @@ int main(int argc, char* argv[])
2382 pdf.processFile(o.infilename, o.password); 2402 pdf.processFile(o.infilename, o.password);
2383 } 2403 }
2384 2404
  2405 + handle_transformations(pdf, o);
2385 std::vector<PointerHolder<QPDF> > page_heap; 2406 std::vector<PointerHolder<QPDF> > page_heap;
2386 if (! o.page_specs.empty()) 2407 if (! o.page_specs.empty())
2387 { 2408 {
qpdf/qpdf.testcov
@@ -304,3 +304,5 @@ QPDFTokenizer EOF when not allowed 0 @@ -304,3 +304,5 @@ QPDFTokenizer EOF when not allowed 0
304 QPDFTokenizer inline image at EOF 0 304 QPDFTokenizer inline image at EOF 0
305 Pl_QPDFTokenizer found ID 0 305 Pl_QPDFTokenizer found ID 0
306 QPDFObjectHandle non-stream in stream array 0 306 QPDFObjectHandle non-stream in stream array 0
  307 +QPDFObjectHandle coalesce called on stream 0
  308 +QPDFObjectHandle coalesce provide stream data 0
qpdf/qtest/qpdf.test
@@ -736,6 +736,28 @@ $td->runtest("stream with tiff predictor", @@ -736,6 +736,28 @@ $td->runtest("stream with tiff predictor",
736 736
737 show_ntests(); 737 show_ntests();
738 # ---------- 738 # ----------
  739 +$td->notify("--- Coalesce contents ---");
  740 +$n_tests += 4;
  741 +
  742 +$td->runtest("coalesce contents with qdf",
  743 + {$td->COMMAND =>
  744 + "qpdf --qdf --static-id" .
  745 + " --coalesce-contents coalesce.pdf a.pdf"},
  746 + {$td->STRING => "", $td->EXIT_STATUS => 0});
  747 +$td->runtest("check output",
  748 + {$td->FILE => "a.pdf"},
  749 + {$td->FILE => "coalesce-out.qdf"});
  750 +$td->runtest("coalesce contents without qdf",
  751 + {$td->COMMAND =>
  752 + "qpdf --static-id" .
  753 + " --coalesce-contents coalesce.pdf a.pdf"},
  754 + {$td->STRING => "", $td->EXIT_STATUS => 0});
  755 +$td->runtest("check output",
  756 + {$td->FILE => "a.pdf"},
  757 + {$td->FILE => "coalesce-out.pdf"});
  758 +
  759 +show_ntests();
  760 +# ----------
739 $td->notify("--- Newline before endstream ---"); 761 $td->notify("--- Newline before endstream ---");
740 $n_tests += 10; 762 $n_tests += 10;
741 763
qpdf/qtest/qpdf/coalesce-out.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/coalesce-out.qdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/coalesce.pdf 0 → 100644
No preview for this file type