Commit 5211bcb5eaa2b6b3c8aa48580f1b97314c37bb4a

Authored by Jay Berkenbilt
1 parent 22bcdbe7

Externalize inline images (fixes #278)

ChangeLog
1 2019-01-31 Jay Berkenbilt <ejb@ql.org> 1 2019-01-31 Jay Berkenbilt <ejb@ql.org>
2 2
  3 + * Add new options --externalize-inline-images, which converts
  4 + inline images of at least a specified size to regular images, and
  5 + --ii-min-bytes, which sets that size threshold.
  6 +
  7 + * When optimizing images, inline images are now included in the
  8 + optimization, first being converted to regular images. Use
  9 + --keep-inline-images to exclude them from optimization. Fixes #278.
  10 +
  11 + * Add method QPDFPageObjectHelper::externalizeInlineImages, which
  12 + converts inline images whose size is at least a specified amount
  13 + to regular images.
  14 +
3 * Remove traces of acroread, which hasn't been available in Linux 15 * Remove traces of acroread, which hasn't been available in Linux
4 for a long time. 16 for a long time.
5 17
  1 +Now
  2 +===
  3 +
  4 +* Deal with compiler warnings
  5 +
1 Soon 6 Soon
2 ==== 7 ====
3 8
@@ -96,23 +101,6 @@ directory or that are otherwise not publicly accessible. This includes @@ -96,23 +101,6 @@ directory or that are otherwise not publicly accessible. This includes
96 things sent to me by email that are specifically not public. Even so, 101 things sent to me by email that are specifically not public. Even so,
97 I find it useful to make reference to them in this list 102 I find it useful to make reference to them in this list
98 103
99 - * Do something better for inline images (see #278)  
100 - * Figure out a way to add an expectInlineImage method that takes  
101 - the offset of the EI image so an external system can locate the  
102 - end tag. Hopefully Both QPDFObjectHandle and Pl_QPDFTokenizer can  
103 - do this. Somewhere we might want something that uses an input  
104 - source to do it, but for the pipeline, it will also have to be  
105 - possible to do it as we go.  
106 - * Improve location of EI to handle EI embedded in the image data;  
107 - consider trying to parse after EI and, if errors, keep looking.  
108 - Will have to look at what happens with random binary characters  
109 - regarding token type.  
110 - * Add a method to replace inline images with real images. Look at  
111 - existing code for adding new resources used with form XObjects  
112 - and reuse if possible  
113 - * Have image optimization replace inline images that are of more  
114 - than a certain size prior to optimizing  
115 -  
116 * Add support for writing name and number trees 104 * Add support for writing name and number trees
117 105
118 * Figure out how to render Gajić correctly in the PDF version of the 106 * Figure out how to render Gajić correctly in the PDF version of the
include/qpdf/QPDFPageObjectHelper.hh
@@ -73,6 +73,11 @@ class QPDFPageObjectHelper: public QPDFObjectHelper @@ -73,6 +73,11 @@ class QPDFPageObjectHelper: public QPDFObjectHelper
73 QPDF_DLL 73 QPDF_DLL
74 std::map<std::string, QPDFObjectHandle> getPageImages(); 74 std::map<std::string, QPDFObjectHandle> getPageImages();
75 75
  76 + // Convert each inline image to an external (normal) image if the
  77 + // size is at least the specified number of bytes.
  78 + QPDF_DLL
  79 + void externalizeInlineImages(size_t min_size = 0);
  80 +
76 // Return the annotations in the page's "/Annots" list, if any. If 81 // Return the annotations in the page's "/Annots" list, if any. If
77 // only_subtype is non-empty, only include annotations of the 82 // only_subtype is non-empty, only include annotations of the
78 // given subtype. 83 // given subtype.
libqpdf/QPDFPageObjectHelper.cc
@@ -2,6 +2,7 @@ @@ -2,6 +2,7 @@
2 #include <qpdf/QTC.hh> 2 #include <qpdf/QTC.hh>
3 #include <qpdf/QPDF.hh> 3 #include <qpdf/QPDF.hh>
4 #include <qpdf/Pl_Concatenate.hh> 4 #include <qpdf/Pl_Concatenate.hh>
  5 +#include <qpdf/Pl_Buffer.hh>
5 #include <qpdf/QUtil.hh> 6 #include <qpdf/QUtil.hh>
6 #include <qpdf/QPDFExc.hh> 7 #include <qpdf/QPDFExc.hh>
7 #include <qpdf/QPDFMatrix.hh> 8 #include <qpdf/QPDFMatrix.hh>
@@ -36,6 +37,251 @@ ContentProvider::provideStreamData(int, int, Pipeline* p) @@ -36,6 +37,251 @@ ContentProvider::provideStreamData(int, int, Pipeline* p)
36 concat.manualFinish(); 37 concat.manualFinish();
37 } 38 }
38 39
  40 +class InlineImageTracker: public QPDFObjectHandle::TokenFilter
  41 +{
  42 + public:
  43 + InlineImageTracker(QPDF*, size_t min_size, QPDFObjectHandle resources);
  44 + virtual ~InlineImageTracker()
  45 + {
  46 + }
  47 + virtual void handleToken(QPDFTokenizer::Token const&);
  48 + QPDFObjectHandle convertIIDict(QPDFObjectHandle odict);
  49 +
  50 + QPDF* qpdf;
  51 + size_t min_size;
  52 + QPDFObjectHandle resources;
  53 + std::string dict_str;
  54 + std::string bi_str;
  55 + int min_suffix;
  56 + bool any_images;
  57 + enum { st_top, st_bi } state;
  58 +};
  59 +
  60 +InlineImageTracker::InlineImageTracker(QPDF* qpdf, size_t min_size,
  61 + QPDFObjectHandle resources) :
  62 + qpdf(qpdf),
  63 + min_size(min_size),
  64 + resources(resources),
  65 + min_suffix(1),
  66 + any_images(false),
  67 + state(st_top)
  68 +{
  69 +}
  70 +
  71 +QPDFObjectHandle
  72 +InlineImageTracker::convertIIDict(QPDFObjectHandle odict)
  73 +{
  74 + QPDFObjectHandle dict = QPDFObjectHandle::newDictionary();
  75 + dict.replaceKey("/Type", QPDFObjectHandle::newName("/XObject"));
  76 + dict.replaceKey("/Subtype", QPDFObjectHandle::newName("/Image"));
  77 + std::set<std::string> keys = odict.getKeys();
  78 + for (std::set<std::string>::iterator iter = keys.begin();
  79 + iter != keys.end(); ++iter)
  80 + {
  81 + std::string key = *iter;
  82 + QPDFObjectHandle value = odict.getKey(key);
  83 + if (key == "/BPC")
  84 + {
  85 + key = "/BitsPerComponent";
  86 + }
  87 + else if (key == "/CS")
  88 + {
  89 + key = "/ColorSpace";
  90 + }
  91 + else if (key == "/D")
  92 + {
  93 + key = "/Decode";
  94 + }
  95 + else if (key == "/DP")
  96 + {
  97 + key = "/DecodeParms";
  98 + }
  99 + else if (key == "/F")
  100 + {
  101 + key = "/Filter";
  102 + }
  103 + else if (key == "/H")
  104 + {
  105 + key = "/Height";
  106 + }
  107 + else if (key == "/IM")
  108 + {
  109 + key = "/ImageMask";
  110 + }
  111 + else if (key == "/I")
  112 + {
  113 + key = "/Interpolate";
  114 + }
  115 + else if (key == "/W")
  116 + {
  117 + key = "/Width";
  118 + }
  119 +
  120 + if (key == "/ColorSpace")
  121 + {
  122 + if (value.isName())
  123 + {
  124 + std::string name = value.getName();
  125 + if (name == "/G")
  126 + {
  127 + name = "/DeviceGray";
  128 + }
  129 + else if (name == "/RGB")
  130 + {
  131 + name = "/DeviceRGB";
  132 + }
  133 + else if (name == "/CMYK")
  134 + {
  135 + name = "/DeviceCMYK";
  136 + }
  137 + else if (name == "/I")
  138 + {
  139 + name = "/Indexed";
  140 + }
  141 + else
  142 + {
  143 + name.clear();
  144 + }
  145 + if (! name.empty())
  146 + {
  147 + value = QPDFObjectHandle::newName(name);
  148 + }
  149 + }
  150 + }
  151 + else if (key == "/Filter")
  152 + {
  153 + std::vector<QPDFObjectHandle> filters;
  154 + if (value.isName())
  155 + {
  156 + filters.push_back(value);
  157 + }
  158 + else if (value.isArray())
  159 + {
  160 + filters = value.getArrayAsVector();
  161 + }
  162 + for (std::vector<QPDFObjectHandle>::iterator iter =
  163 + filters.begin();
  164 + iter != filters.end(); ++iter)
  165 + {
  166 + std::string name;
  167 + if ((*iter).isName())
  168 + {
  169 + name = (*iter).getName();
  170 + }
  171 + if (name == "/AHx")
  172 + {
  173 + name = "/ASCIIHexDecode";
  174 + }
  175 + else if (name == "/A85")
  176 + {
  177 + name = "/ASCII85Decode";
  178 + }
  179 + else if (name == "/LZW")
  180 + {
  181 + name = "/LZWDecode";
  182 + }
  183 + else if (name == "/Fl")
  184 + {
  185 + name = "/FlateDecode";
  186 + }
  187 + else if (name == "/RL")
  188 + {
  189 + name = "/RunLengthDecode";
  190 + }
  191 + else if (name == "/CCF")
  192 + {
  193 + name = "/CCITTFaxDecode";
  194 + }
  195 + else if (name == "/DCT")
  196 + {
  197 + name = "/DCTDecode";
  198 + }
  199 + else
  200 + {
  201 + name.clear();
  202 + }
  203 + if (! name.empty())
  204 + {
  205 + *iter = QPDFObjectHandle::newName(name);
  206 + }
  207 + }
  208 + if (value.isName() && (filters.size() == 1))
  209 + {
  210 + value = filters.at(0);
  211 + }
  212 + else if (value.isArray())
  213 + {
  214 + value = QPDFObjectHandle::newArray(filters);
  215 + }
  216 + }
  217 + dict.replaceKey(key, value);
  218 + }
  219 + return dict;
  220 +}
  221 +
  222 +void
  223 +InlineImageTracker::handleToken(QPDFTokenizer::Token const& token)
  224 +{
  225 + if (state == st_bi)
  226 + {
  227 + if (token.getType() == QPDFTokenizer::tt_inline_image)
  228 + {
  229 + std::string image_data(token.getValue());
  230 + size_t len = image_data.length();
  231 + // The token ends with delimiter followed by EI, so it
  232 + // will always be at least 3 bytes long. We want to
  233 + // exclude the EI and preceding delimiter.
  234 + len = (len >= 3 ? len - 3 : 0);
  235 + if (len >= this->min_size)
  236 + {
  237 + QTC::TC("qpdf", "QPDFPageObjectHelper externalize inline image");
  238 + Pl_Buffer b("image_data");
  239 + b.write(QUtil::unsigned_char_pointer(image_data), len);
  240 + b.finish();
  241 + QPDFObjectHandle dict =
  242 + convertIIDict(QPDFObjectHandle::parse(dict_str));
  243 + dict.replaceKey("/Length", QPDFObjectHandle::newInteger(len));
  244 + std::string name = resources.getUniqueResourceName(
  245 + "/IIm", this->min_suffix);
  246 + QPDFObjectHandle image = QPDFObjectHandle::newStream(
  247 + this->qpdf, b.getBuffer());
  248 + image.replaceDict(dict);
  249 + resources.getKey("/XObject").replaceKey(name, image);
  250 + write(name);
  251 + write(" Do\n");
  252 + any_images = true;
  253 + }
  254 + else
  255 + {
  256 + QTC::TC("qpdf", "QPDFPageObjectHelper keep inline image");
  257 + write(bi_str);
  258 + writeToken(token);
  259 + }
  260 + state = st_top;
  261 + }
  262 + else if (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "ID"))
  263 + {
  264 + bi_str += token.getValue();
  265 + dict_str += " >>";
  266 + }
  267 + else
  268 + {
  269 + bi_str += token.getValue();
  270 + dict_str += token.getValue();
  271 + }
  272 + }
  273 + else if (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "BI"))
  274 + {
  275 + bi_str = token.getValue();
  276 + dict_str = "<< ";
  277 + state = st_bi;
  278 + }
  279 + else
  280 + {
  281 + writeToken(token);
  282 + }
  283 +}
  284 +
39 QPDFPageObjectHelper::Members::~Members() 285 QPDFPageObjectHelper::Members::~Members()
40 { 286 {
41 } 287 }
@@ -112,13 +358,32 @@ QPDFPageObjectHelper::getMediaBox(bool copy_if_shared) @@ -112,13 +358,32 @@ QPDFPageObjectHelper::getMediaBox(bool copy_if_shared)
112 return getAttribute("/MediaBox", copy_if_shared); 358 return getAttribute("/MediaBox", copy_if_shared);
113 } 359 }
114 360
115 -  
116 std::map<std::string, QPDFObjectHandle> 361 std::map<std::string, QPDFObjectHandle>
117 QPDFPageObjectHelper::getPageImages() 362 QPDFPageObjectHelper::getPageImages()
118 { 363 {
119 return this->oh.getPageImages(); 364 return this->oh.getPageImages();
120 } 365 }
121 366
  367 +void
  368 +QPDFPageObjectHelper::externalizeInlineImages(size_t min_size)
  369 +{
  370 + QPDFObjectHandle resources = getAttribute("/Resources", true);
  371 + // Calling mergeResources also ensures that /XObject becomes
  372 + // direct and is not shared with other pages.
  373 + resources.mergeResources(
  374 + QPDFObjectHandle::parse("<< /XObject << >> >>"));
  375 + InlineImageTracker iit(this->oh.getOwningQPDF(), min_size, resources);
  376 + Pl_Buffer b("new page content");
  377 + filterPageContents(&iit, &b);
  378 + if (iit.any_images)
  379 + {
  380 + getObjectHandle().replaceKey(
  381 + "/Contents",
  382 + QPDFObjectHandle::newStream(
  383 + this->oh.getOwningQPDF(), b.getBuffer()));
  384 + }
  385 +}
  386 +
122 std::vector<QPDFAnnotationObjectHelper> 387 std::vector<QPDFAnnotationObjectHelper>
123 QPDFPageObjectHelper::getAnnotations(std::string const& only_subtype) 388 QPDFPageObjectHelper::getAnnotations(std::string const& only_subtype)
124 { 389 {
manual/qpdf-manual.xml
@@ -1746,7 +1746,11 @@ outfile.pdf&lt;/option&gt; @@ -1746,7 +1746,11 @@ outfile.pdf&lt;/option&gt;
1746 <option>--verbose</option>. See also the 1746 <option>--verbose</option>. See also the
1747 <option>--oi-min-width</option>, 1747 <option>--oi-min-width</option>,
1748 <option>--oi-min-height</option>, and 1748 <option>--oi-min-height</option>, and
1749 - <option>--oi-min-area</option> options. 1749 + <option>--oi-min-area</option> options. By default, starting
  1750 + in qpdf 8.4, inline images are converted to regular images
  1751 + and optimized as well. Use
  1752 + <option>--keep-inline-images</option> to prevent inline images
  1753 + from being included.
1750 </para> 1754 </para>
1751 </listitem> 1755 </listitem>
1752 </varlistentry> 1756 </varlistentry>
@@ -1780,6 +1784,43 @@ outfile.pdf&lt;/option&gt; @@ -1780,6 +1784,43 @@ outfile.pdf&lt;/option&gt;
1780 </para> 1784 </para>
1781 </listitem> 1785 </listitem>
1782 </varlistentry> 1786 </varlistentry>
  1787 +
  1788 +
  1789 +
  1790 + <varlistentry>
  1791 + <term><option>--externalize-inline-images</option></term>
  1792 + <listitem>
  1793 + <para>
  1794 + Convert inline images to regular images. By default, images
  1795 + whose data is at least 1,024 bytes are converted when this
  1796 + option is selected. Use <option>--ii-min-bytes</option> to
  1797 + change the size threshold. This option is implicitly selected
  1798 + when <option>--optimize-images</option> is selected. Use
  1799 + <option>--keep-inline-images</option> to exclude inline images
  1800 + from image optimization.
  1801 + </para>
  1802 + </listitem>
  1803 + </varlistentry>
  1804 + <varlistentry>
  1805 + <term><option>--ii-min-bytes=<replaceable>bytes</replaceable></option></term>
  1806 + <listitem>
  1807 + <para>
  1808 + Avoid converting inline images whose size is below the
  1809 + specified minimum size to regular images. If omitted, the
  1810 + default is 1,024 bytes. Use 0 for no minimum.
  1811 + </para>
  1812 + </listitem>
  1813 + </varlistentry>
  1814 + <varlistentry>
  1815 + <term><option>--keep-inline-images</option></term>
  1816 + <listitem>
  1817 + <para>
  1818 + Prevent inline images from being included in image
  1819 + optimization. This option has no effect when
  1820 + <option>--optimize-images</option> is not specified.
  1821 + </para>
  1822 + </listitem>
  1823 + </varlistentry>
1783 <varlistentry> 1824 <varlistentry>
1784 <term><option>--qdf</option></term> 1825 <term><option>--qdf</option></term>
1785 <listitem> 1826 <listitem>
@@ -4323,6 +4364,18 @@ print &quot;\n&quot;; @@ -4323,6 +4364,18 @@ print &quot;\n&quot;;
4323 </listitem> 4364 </listitem>
4324 <listitem> 4365 <listitem>
4325 <para> 4366 <para>
  4367 + New options <option>--externalize-inline-images</option>,
  4368 + <option>--ii-min-bytes</option>, and
  4369 + <option>--keep-inline-images</option> control qpdf's
  4370 + handling of inline images and possible conversion of them to
  4371 + regular images. By default,
  4372 + <option>--optimize-images</option> now also applies to
  4373 + inline images. These options are discussed in <xref
  4374 + linkend="ref.advanced-transformation"/>.
  4375 + </para>
  4376 + </listitem>
  4377 + <listitem>
  4378 + <para>
4326 Add options <option>--overlay</option> and 4379 Add options <option>--overlay</option> and
4327 <option>--underlay</option> for overlaying or underlaying 4380 <option>--underlay</option> for overlaying or underlaying
4328 pages of other files onto output pages. See <xref 4381 pages of other files onto output pages. See <xref
@@ -4415,6 +4468,14 @@ print &quot;\n&quot;; @@ -4415,6 +4468,14 @@ print &quot;\n&quot;;
4415 not compressed. 4468 not compressed.
4416 </para> 4469 </para>
4417 </listitem> 4470 </listitem>
  4471 + <listitem>
  4472 + <para>
  4473 + When the tokenizer returns inline image tokens, delimiters
  4474 + following <literal>ID</literal> and <literal>EI</literal>
  4475 + operators are no longer excluded. This makes it possible to
  4476 + reliably extract the actual image data.
  4477 + </para>
  4478 + </listitem>
4418 </itemizedlist> 4479 </itemizedlist>
4419 </listitem> 4480 </listitem>
4420 <listitem> 4481 <listitem>
@@ -4425,6 +4486,13 @@ print &quot;\n&quot;; @@ -4425,6 +4486,13 @@ print &quot;\n&quot;;
4425 <listitem> 4486 <listitem>
4426 <para> 4487 <para>
4427 Add method 4488 Add method
  4489 + <function>QPDFPageObjectHelper::externalizeInlineImages</function>
  4490 + to convert inline images to regular images.
  4491 + </para>
  4492 + </listitem>
  4493 + <listitem>
  4494 + <para>
  4495 + Add method
4428 <function>QUtil::possible_repaired_encodings()</function> to 4496 <function>QUtil::possible_repaired_encodings()</function> to
4429 generate a list of strings that represent other ways the 4497 generate a list of strings that represent other ways the
4430 given string could have been encoded. This is the method the 4498 given string could have been encoded. This is the method the
qpdf/qpdf.cc
@@ -161,9 +161,12 @@ struct Options @@ -161,9 +161,12 @@ struct Options
161 json(false), 161 json(false),
162 check(false), 162 check(false),
163 optimize_images(false), 163 optimize_images(false),
  164 + externalize_inline_images(false),
  165 + keep_inline_images(false),
164 oi_min_width(128), // Default values for these 166 oi_min_width(128), // Default values for these
165 oi_min_height(128), // oi flags are in --help 167 oi_min_height(128), // oi flags are in --help
166 oi_min_area(16384), // and in the manual. 168 oi_min_area(16384), // and in the manual.
  169 + ii_min_bytes(1024), //
167 underlay("underlay"), 170 underlay("underlay"),
168 overlay("overlay"), 171 overlay("overlay"),
169 under_overlay(0), 172 under_overlay(0),
@@ -254,9 +257,12 @@ struct Options @@ -254,9 +257,12 @@ struct Options
254 std::set<std::string> json_objects; 257 std::set<std::string> json_objects;
255 bool check; 258 bool check;
256 bool optimize_images; 259 bool optimize_images;
  260 + bool externalize_inline_images;
  261 + bool keep_inline_images;
257 size_t oi_min_width; 262 size_t oi_min_width;
258 size_t oi_min_height; 263 size_t oi_min_height;
259 size_t oi_min_area; 264 size_t oi_min_area;
  265 + size_t ii_min_bytes;
260 UnderOverlay underlay; 266 UnderOverlay underlay;
261 UnderOverlay overlay; 267 UnderOverlay overlay;
262 UnderOverlay* under_overlay; 268 UnderOverlay* under_overlay;
@@ -659,9 +665,12 @@ class ArgParser @@ -659,9 +665,12 @@ class ArgParser
659 void argJsonObject(char* parameter); 665 void argJsonObject(char* parameter);
660 void argCheck(); 666 void argCheck();
661 void argOptimizeImages(); 667 void argOptimizeImages();
  668 + void argExternalizeInlineImages();
  669 + void argKeepInlineImages();
662 void argOiMinWidth(char* parameter); 670 void argOiMinWidth(char* parameter);
663 void argOiMinHeight(char* parameter); 671 void argOiMinHeight(char* parameter);
664 void argOiMinArea(char* parameter); 672 void argOiMinArea(char* parameter);
  673 + void argIiMinBytes(char* parameter);
665 void arg40Print(char* parameter); 674 void arg40Print(char* parameter);
666 void arg40Modify(char* parameter); 675 void arg40Modify(char* parameter);
667 void arg40Extract(char* parameter); 676 void arg40Extract(char* parameter);
@@ -894,12 +903,17 @@ ArgParser::initOptionTable() @@ -894,12 +903,17 @@ ArgParser::initOptionTable()
894 &ArgParser::argJsonObject, "trailer|obj[,gen]"); 903 &ArgParser::argJsonObject, "trailer|obj[,gen]");
895 (*t)["check"] = oe_bare(&ArgParser::argCheck); 904 (*t)["check"] = oe_bare(&ArgParser::argCheck);
896 (*t)["optimize-images"] = oe_bare(&ArgParser::argOptimizeImages); 905 (*t)["optimize-images"] = oe_bare(&ArgParser::argOptimizeImages);
  906 + (*t)["externalize-inline-images"] =
  907 + oe_bare(&ArgParser::argExternalizeInlineImages);
  908 + (*t)["keep-inline-images"] = oe_bare(&ArgParser::argKeepInlineImages);
897 (*t)["oi-min-width"] = oe_requiredParameter( 909 (*t)["oi-min-width"] = oe_requiredParameter(
898 &ArgParser::argOiMinWidth, "minimum-width"); 910 &ArgParser::argOiMinWidth, "minimum-width");
899 (*t)["oi-min-height"] = oe_requiredParameter( 911 (*t)["oi-min-height"] = oe_requiredParameter(
900 &ArgParser::argOiMinHeight, "minimum-height"); 912 &ArgParser::argOiMinHeight, "minimum-height");
901 (*t)["oi-min-area"] = oe_requiredParameter( 913 (*t)["oi-min-area"] = oe_requiredParameter(
902 &ArgParser::argOiMinArea, "minimum-area"); 914 &ArgParser::argOiMinArea, "minimum-area");
  915 + (*t)["ii-min-bytes"] = oe_requiredParameter(
  916 + &ArgParser::argIiMinBytes, "minimum-bytes");
903 (*t)["overlay"] = oe_bare(&ArgParser::argOverlay); 917 (*t)["overlay"] = oe_bare(&ArgParser::argOverlay);
904 (*t)["underlay"] = oe_bare(&ArgParser::argUnderlay); 918 (*t)["underlay"] = oe_bare(&ArgParser::argUnderlay);
905 919
@@ -1308,6 +1322,12 @@ ArgParser::argHelp() @@ -1308,6 +1322,12 @@ ArgParser::argHelp()
1308 << " default is 128. Use 0 to mean no minimum\n" 1322 << " default is 128. Use 0 to mean no minimum\n"
1309 << "--oi-min-area=a do not optimize images whose pixel count is below a\n" 1323 << "--oi-min-area=a do not optimize images whose pixel count is below a\n"
1310 << " default is 16,384. Use 0 to mean no minimum\n" 1324 << " default is 16,384. Use 0 to mean no minimum\n"
  1325 + << "--externalize-inline-images convert inline images to regular images; by\n"
  1326 + << " default, images of at least 1,024 bytes are\n"
  1327 + << " externalized\n"
  1328 + << "--ii-min-bytes=bytes specify minimum size of inline images to be\n"
  1329 + << " converted to regular images\n"
  1330 + << "--keep-inline-images exclude inline images from image optimization\n"
1311 << "--qdf turns on \"QDF mode\" (below)\n" 1331 << "--qdf turns on \"QDF mode\" (below)\n"
1312 << "--linearize-pass1=file write intermediate pass of linearized file\n" 1332 << "--linearize-pass1=file write intermediate pass of linearized file\n"
1313 << " for debugging\n" 1333 << " for debugging\n"
@@ -1966,6 +1986,18 @@ ArgParser::argOptimizeImages() @@ -1966,6 +1986,18 @@ ArgParser::argOptimizeImages()
1966 } 1986 }
1967 1987
1968 void 1988 void
  1989 +ArgParser::argExternalizeInlineImages()
  1990 +{
  1991 + o.externalize_inline_images = true;
  1992 +}
  1993 +
  1994 +void
  1995 +ArgParser::argKeepInlineImages()
  1996 +{
  1997 + o.keep_inline_images = true;
  1998 +}
  1999 +
  2000 +void
1969 ArgParser::argOiMinWidth(char* parameter) 2001 ArgParser::argOiMinWidth(char* parameter)
1970 { 2002 {
1971 o.oi_min_width = QUtil::string_to_int(parameter); 2003 o.oi_min_width = QUtil::string_to_int(parameter);
@@ -1984,6 +2016,12 @@ ArgParser::argOiMinArea(char* parameter) @@ -1984,6 +2016,12 @@ ArgParser::argOiMinArea(char* parameter)
1984 } 2016 }
1985 2017
1986 void 2018 void
  2019 +ArgParser::argIiMinBytes(char* parameter)
  2020 +{
  2021 + o.ii_min_bytes = QUtil::string_to_int(parameter);
  2022 +}
  2023 +
  2024 +void
1987 ArgParser::arg40Print(char* parameter) 2025 ArgParser::arg40Print(char* parameter)
1988 { 2026 {
1989 o.r2_print = (strcmp(parameter, "y") == 0); 2027 o.r2_print = (strcmp(parameter, "y") == 0);
@@ -2933,6 +2971,10 @@ ArgParser::doFinalChecks() @@ -2933,6 +2971,10 @@ ArgParser::doFinalChecks()
2933 { 2971 {
2934 usage("no output file may be given for this option"); 2972 usage("no output file may be given for this option");
2935 } 2973 }
  2974 + if (o.optimize_images && (! o.keep_inline_images))
  2975 + {
  2976 + o.externalize_inline_images = true;
  2977 + }
2936 2978
2937 if (o.require_outfile && (strcmp(o.outfilename, "-") == 0)) 2979 if (o.require_outfile && (strcmp(o.outfilename, "-") == 0))
2938 { 2980 {
@@ -3764,10 +3806,7 @@ ImageOptimizer::makePipeline(std::string const&amp; description, Pipeline* next) @@ -3764,10 +3806,7 @@ ImageOptimizer::makePipeline(std::string const&amp; description, Pipeline* next)
3764 QPDFObjectHandle w_obj = dict.getKey("/Width"); 3806 QPDFObjectHandle w_obj = dict.getKey("/Width");
3765 QPDFObjectHandle h_obj = dict.getKey("/Height"); 3807 QPDFObjectHandle h_obj = dict.getKey("/Height");
3766 QPDFObjectHandle colorspace_obj = dict.getKey("/ColorSpace"); 3808 QPDFObjectHandle colorspace_obj = dict.getKey("/ColorSpace");
3767 - QPDFObjectHandle components_obj = dict.getKey("/BitsPerComponent");  
3768 - if (! (w_obj.isInteger() &&  
3769 - h_obj.isInteger() &&  
3770 - components_obj.isInteger())) 3809 + if (! (w_obj.isNumber() && h_obj.isNumber()))
3771 { 3810 {
3772 if (o.verbose && (! description.empty())) 3811 if (o.verbose && (! description.empty()))
3773 { 3812 {
@@ -3777,8 +3816,12 @@ ImageOptimizer::makePipeline(std::string const&amp; description, Pipeline* next) @@ -3777,8 +3816,12 @@ ImageOptimizer::makePipeline(std::string const&amp; description, Pipeline* next)
3777 } 3816 }
3778 return result; 3817 return result;
3779 } 3818 }
3780 - JDIMENSION w = w_obj.getIntValue();  
3781 - JDIMENSION h = h_obj.getIntValue(); 3819 + // Files have been seen in the wild whose width and height are
  3820 + // floating point, which is goofy, but we can deal with it.
  3821 + JDIMENSION w = static_cast<JDIMENSION>(
  3822 + w_obj.isInteger() ? w_obj.getIntValue() : w_obj.getNumericValue());
  3823 + JDIMENSION h = static_cast<JDIMENSION>(
  3824 + h_obj.isInteger() ? h_obj.getIntValue() : h_obj.getNumericValue());
3782 std::string colorspace = (colorspace_obj.isName() ? 3825 std::string colorspace = (colorspace_obj.isName() ?
3783 colorspace_obj.getName() : 3826 colorspace_obj.getName() :
3784 ""); 3827 "");
@@ -4198,6 +4241,16 @@ static void handle_under_overlay(QPDF&amp; pdf, Options&amp; o) @@ -4198,6 +4241,16 @@ static void handle_under_overlay(QPDF&amp; pdf, Options&amp; o)
4198 static void handle_transformations(QPDF& pdf, Options& o) 4241 static void handle_transformations(QPDF& pdf, Options& o)
4199 { 4242 {
4200 QPDFPageDocumentHelper dh(pdf); 4243 QPDFPageDocumentHelper dh(pdf);
  4244 + if (o.externalize_inline_images)
  4245 + {
  4246 + std::vector<QPDFPageObjectHelper> pages = dh.getAllPages();
  4247 + for (std::vector<QPDFPageObjectHelper>::iterator iter = pages.begin();
  4248 + iter != pages.end(); ++iter)
  4249 + {
  4250 + QPDFPageObjectHelper& ph(*iter);
  4251 + ph.externalizeInlineImages(o.ii_min_bytes);
  4252 + }
  4253 + }
4201 if (o.optimize_images) 4254 if (o.optimize_images)
4202 { 4255 {
4203 int pageno = 0; 4256 int pageno = 0;
qpdf/qpdf.testcov
@@ -436,3 +436,5 @@ QPDFTokenizer found EI the old way 0 @@ -436,3 +436,5 @@ QPDFTokenizer found EI the old way 0
436 QPDFTokenizer found EI by byte count 0 436 QPDFTokenizer found EI by byte count 0
437 QPDFTokenizer inline image at EOF the old way 0 437 QPDFTokenizer inline image at EOF the old way 0
438 QPDFTokenizer found EI after more than one try 0 438 QPDFTokenizer found EI after more than one try 0
  439 +QPDFPageObjectHelper externalize inline image 0
  440 +QPDFPageObjectHelper keep inline image 0
qpdf/qtest/qpdf.test
@@ -679,7 +679,7 @@ $td-&gt;runtest(&quot;check pass1 file&quot;, @@ -679,7 +679,7 @@ $td-&gt;runtest(&quot;check pass1 file&quot;,
679 show_ntests(); 679 show_ntests();
680 # ---------- 680 # ----------
681 $td->notify("--- Inline Images ---"); 681 $td->notify("--- Inline Images ---");
682 -$n_tests += 2; 682 +$n_tests += 8;
683 683
684 # The file large-inline-image.pdf is a hand-crafted file with several 684 # The file large-inline-image.pdf is a hand-crafted file with several
685 # inline images of various sizes including one that is two megabytes, 685 # inline images of various sizes including one that is two megabytes,
@@ -696,6 +696,69 @@ $td-&gt;runtest(&quot;check output&quot;, @@ -696,6 +696,69 @@ $td-&gt;runtest(&quot;check output&quot;,
696 {$td->FILE => "a.pdf"}, 696 {$td->FILE => "a.pdf"},
697 {$td->FILE => "large-inline-image.qdf"}); 697 {$td->FILE => "large-inline-image.qdf"});
698 698
  699 +$td->runtest("eof in inline image",
  700 + {$td->COMMAND =>
  701 + "qpdf --qdf --static-id eof-in-inline-image.pdf a.pdf"},
  702 + {$td->FILE => "eof-inline-qdf.out", $td->EXIT_STATUS => 3},
  703 + $td->NORMALIZE_NEWLINES);
  704 +$td->runtest("check output",
  705 + {$td->FILE => "a.pdf"},
  706 + {$td->FILE => "eof-in-inline-image.qdf"});
  707 +$td->runtest("externalize eof in inline image",
  708 + {$td->COMMAND =>
  709 + "qpdf --qdf --externalize-inline-images" .
  710 + " --static-id eof-in-inline-image.pdf a.pdf"},
  711 + {$td->FILE => "eof-inline-qdf.out", $td->EXIT_STATUS => 3},
  712 + $td->NORMALIZE_NEWLINES);
  713 +$td->runtest("check output",
  714 + {$td->FILE => "a.pdf"},
  715 + {$td->FILE => "eof-in-inline-image-ii.qdf"});
  716 +$td->runtest("externalize damaged image",
  717 + {$td->COMMAND =>
  718 + "qpdf --externalize-inline-images" .
  719 + " --compress-streams=n --static-id" .
  720 + " damaged-inline-image.pdf a.pdf"},
  721 + {$td->STRING => "", $td->EXIT_STATUS => 0},
  722 + $td->NORMALIZE_NEWLINES);
  723 +$td->runtest("check output",
  724 + {$td->FILE => "a.pdf"},
  725 + {$td->FILE => "damaged-inline-image-out.pdf"});
  726 +
  727 +my @eii_tests = (
  728 + ['inline-images', 80],
  729 + ['large-inline-image', 1024],
  730 + );
  731 +$n_tests += 4 * scalar(@eii_tests);
  732 +$n_compare_pdfs += 2 * scalar(@eii_tests);
  733 +
  734 +foreach my $d (@eii_tests)
  735 +{
  736 + my ($file, $threshold) = @$d;
  737 + $td->runtest("inline image $file (all)",
  738 + {$td->COMMAND =>
  739 + "qpdf --qdf --static-id --externalize-inline-images" .
  740 + " --ii-min-bytes=0 $file.pdf a.pdf"},
  741 + {$td->STRING => "", $td->EXIT_STATUS => 0},
  742 + $td->NORMALIZE_NEWLINES);
  743 + $td->runtest("check output",
  744 + {$td->FILE => "a.pdf"},
  745 + {$td->FILE => "$file-ii-all.pdf"});
  746 + compare_pdfs("$file.pdf", "a.pdf");
  747 +
  748 + $td->runtest("inline image $file (some)",
  749 + {$td->COMMAND =>
  750 + "qpdf --qdf --static-id --externalize-inline-images" .
  751 + " --ii-min-bytes=$threshold $file.pdf a.pdf"},
  752 + {$td->STRING => "", $td->EXIT_STATUS => 0},
  753 + $td->NORMALIZE_NEWLINES);
  754 + $td->runtest("check output",
  755 + {$td->FILE => "a.pdf"},
  756 + {$td->FILE => "$file-ii-some.pdf"});
  757 + compare_pdfs("$file.pdf", "a.pdf");
  758 +}
  759 +
  760 +# QXXXQ externalize tests with min size
  761 +
699 show_ntests(); 762 show_ntests();
700 # ---------- 763 # ----------
701 $td->notify("--- Tokenizer ---"); 764 $td->notify("--- Tokenizer ---");
@@ -2019,6 +2082,12 @@ my @image_opt = ( @@ -2019,6 +2082,12 @@ my @image_opt = (
2019 '--oi-min-width=0 --oi-min-height=0 --oi-min-area=30000'], 2082 '--oi-min-width=0 --oi-min-height=0 --oi-min-area=30000'],
2020 ['small-images', 'min-area-all', 2083 ['small-images', 'min-area-all',
2021 '--oi-min-width=0 --oi-min-height=0 --oi-min-area=30000'], 2084 '--oi-min-width=0 --oi-min-height=0 --oi-min-area=30000'],
  2085 + ['large-inline-image', 'inline-images',
  2086 + '--ii-min-bytes=0'],
  2087 + ['large-inline-image', 'inline-images-all-size',
  2088 + '--oi-min-width=0 --oi-min-height=0 --oi-min-area=0 --ii-min-bytes=0'],
  2089 + ['large-inline-image', 'inline-images-keep-some', ''],
  2090 + ['large-inline-image', 'inline-images-keep-all', '--keep-inline-images'],
2022 ); 2091 );
2023 2092
2024 $n_tests += 2 * scalar(@image_opt); 2093 $n_tests += 2 * scalar(@image_opt);
qpdf/qtest/qpdf/damaged-inline-image-out.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/damaged-inline-image.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/eof-in-inline-image-ii.qdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/eof-in-inline-image.qdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/eof-inline-qdf.out 0 → 100644
  1 +WARNING: eof-in-inline-image.pdf (offset 299): content normalization encountered bad tokens
  2 +WARNING: eof-in-inline-image.pdf (offset 299): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
  3 +WARNING: eof-in-inline-image.pdf (offset 299): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
  4 +qpdf: operation succeeded with warnings; resulting file may have some problems
qpdf/qtest/qpdf/inline-images-ii-all.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/inline-images-ii-some.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/large-inline-image-ii-all.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/large-inline-image-ii-some.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/optimize-images-inline-images-all-size-json.out 0 → 100644
  1 +{
  2 + "pages": [
  3 + {
  4 + "contents": [
  5 + "7 0 R"
  6 + ],
  7 + "images": [
  8 + {
  9 + "bitspercomponent": 8,
  10 + "colorspace": "/DeviceRGB",
  11 + "decodeparms": [
  12 + null
  13 + ],
  14 + "filter": [
  15 + "/DCTDecode"
  16 + ],
  17 + "filterable": false,
  18 + "height": 56,
  19 + "name": "/IIm1",
  20 + "object": "8 0 R",
  21 + "width": 49
  22 + },
  23 + {
  24 + "bitspercomponent": 8,
  25 + "colorspace": "/DeviceRGB",
  26 + "decodeparms": [
  27 + null
  28 + ],
  29 + "filter": [
  30 + "/DCTDecode"
  31 + ],
  32 + "filterable": false,
  33 + "height": 675,
  34 + "name": "/IIm2",
  35 + "object": "9 0 R",
  36 + "width": 1200
  37 + },
  38 + {
  39 + "bitspercomponent": 8,
  40 + "colorspace": "/DeviceRGB",
  41 + "decodeparms": [
  42 + null
  43 + ],
  44 + "filter": [
  45 + "/DCTDecode"
  46 + ],
  47 + "filterable": false,
  48 + "height": 56,
  49 + "name": "/IIm3",
  50 + "object": "10 0 R",
  51 + "width": 49
  52 + },
  53 + {
  54 + "bitspercomponent": 8,
  55 + "colorspace": "/DeviceGray",
  56 + "decodeparms": [
  57 + null
  58 + ],
  59 + "filter": [
  60 + "/FlateDecode"
  61 + ],
  62 + "filterable": true,
  63 + "height": 8,
  64 + "name": "/IIm4",
  65 + "object": "11 0 R",
  66 + "width": 8
  67 + }
  68 + ],
  69 + "label": null,
  70 + "object": "4 0 R",
  71 + "outlines": [],
  72 + "pageposfrom1": 1
  73 + }
  74 + ],
  75 + "parameters": {
  76 + "decodelevel": "generalized"
  77 + },
  78 + "version": 1
  79 +}
qpdf/qtest/qpdf/optimize-images-inline-images-all-size.out 0 → 100644
  1 +qpdf: image /IIm1 on page 1: optimizing image reduces size from 2391 to ...
  2 +qpdf: image /IIm2 on page 1: optimizing image reduces size from 2134996 to ...
  3 +qpdf: image /IIm3 on page 1: not optimizing because unable to decode data or data already uses DCT
  4 +qpdf: image /IIm4 on page 1: not optimizing because DCT compression does not reduce image size
  5 +qpdf: wrote file a.pdf
qpdf/qtest/qpdf/optimize-images-inline-images-json.out 0 → 100644
  1 +{
  2 + "pages": [
  3 + {
  4 + "contents": [
  5 + "7 0 R"
  6 + ],
  7 + "images": [
  8 + {
  9 + "bitspercomponent": 8,
  10 + "colorspace": "/DeviceRGB",
  11 + "decodeparms": [
  12 + null
  13 + ],
  14 + "filter": [
  15 + "/FlateDecode"
  16 + ],
  17 + "filterable": true,
  18 + "height": 56,
  19 + "name": "/IIm1",
  20 + "object": "8 0 R",
  21 + "width": 49
  22 + },
  23 + {
  24 + "bitspercomponent": 8,
  25 + "colorspace": "/DeviceRGB",
  26 + "decodeparms": [
  27 + null
  28 + ],
  29 + "filter": [
  30 + "/DCTDecode"
  31 + ],
  32 + "filterable": false,
  33 + "height": 675,
  34 + "name": "/IIm2",
  35 + "object": "9 0 R",
  36 + "width": 1200
  37 + },
  38 + {
  39 + "bitspercomponent": 8,
  40 + "colorspace": "/DeviceRGB",
  41 + "decodeparms": [
  42 + null
  43 + ],
  44 + "filter": [
  45 + "/DCTDecode"
  46 + ],
  47 + "filterable": false,
  48 + "height": 56,
  49 + "name": "/IIm3",
  50 + "object": "10 0 R",
  51 + "width": 49
  52 + },
  53 + {
  54 + "bitspercomponent": 8,
  55 + "colorspace": "/DeviceGray",
  56 + "decodeparms": [
  57 + null
  58 + ],
  59 + "filter": [
  60 + "/FlateDecode"
  61 + ],
  62 + "filterable": true,
  63 + "height": 8,
  64 + "name": "/IIm4",
  65 + "object": "11 0 R",
  66 + "width": 8
  67 + }
  68 + ],
  69 + "label": null,
  70 + "object": "4 0 R",
  71 + "outlines": [],
  72 + "pageposfrom1": 1
  73 + }
  74 + ],
  75 + "parameters": {
  76 + "decodelevel": "generalized"
  77 + },
  78 + "version": 1
  79 +}
qpdf/qtest/qpdf/optimize-images-inline-images-keep-all-json.out 0 → 100644
  1 +{
  2 + "pages": [
  3 + {
  4 + "contents": [
  5 + "7 0 R"
  6 + ],
  7 + "images": [],
  8 + "label": null,
  9 + "object": "4 0 R",
  10 + "outlines": [],
  11 + "pageposfrom1": 1
  12 + }
  13 + ],
  14 + "parameters": {
  15 + "decodelevel": "generalized"
  16 + },
  17 + "version": 1
  18 +}
qpdf/qtest/qpdf/optimize-images-inline-images-keep-all.out 0 → 100644
  1 +qpdf: wrote file a.pdf
qpdf/qtest/qpdf/optimize-images-inline-images-keep-some-json.out 0 → 100644
  1 +{
  2 + "pages": [
  3 + {
  4 + "contents": [
  5 + "7 0 R"
  6 + ],
  7 + "images": [
  8 + {
  9 + "bitspercomponent": 8,
  10 + "colorspace": "/DeviceRGB",
  11 + "decodeparms": [
  12 + null
  13 + ],
  14 + "filter": [
  15 + "/FlateDecode"
  16 + ],
  17 + "filterable": true,
  18 + "height": 56,
  19 + "name": "/IIm1",
  20 + "object": "8 0 R",
  21 + "width": 49
  22 + },
  23 + {
  24 + "bitspercomponent": 8,
  25 + "colorspace": "/DeviceRGB",
  26 + "decodeparms": [
  27 + null
  28 + ],
  29 + "filter": [
  30 + "/DCTDecode"
  31 + ],
  32 + "filterable": false,
  33 + "height": 675,
  34 + "name": "/IIm2",
  35 + "object": "9 0 R",
  36 + "width": 1200
  37 + },
  38 + {
  39 + "bitspercomponent": 8,
  40 + "colorspace": "/DeviceRGB",
  41 + "decodeparms": [
  42 + null
  43 + ],
  44 + "filter": [
  45 + "/DCTDecode"
  46 + ],
  47 + "filterable": false,
  48 + "height": 56,
  49 + "name": "/IIm3",
  50 + "object": "10 0 R",
  51 + "width": 49
  52 + }
  53 + ],
  54 + "label": null,
  55 + "object": "4 0 R",
  56 + "outlines": [],
  57 + "pageposfrom1": 1
  58 + }
  59 + ],
  60 + "parameters": {
  61 + "decodelevel": "generalized"
  62 + },
  63 + "version": 1
  64 +}
qpdf/qtest/qpdf/optimize-images-inline-images-keep-some.out 0 → 100644
  1 +qpdf: image /IIm1 on page 1: not optimizing because image is smaller than requested minimum dimensions
  2 +qpdf: image /IIm2 on page 1: optimizing image reduces size from 2134996 to ...
  3 +qpdf: image /IIm3 on page 1: not optimizing because unable to decode data or data already uses DCT
  4 +qpdf: wrote file a.pdf
qpdf/qtest/qpdf/optimize-images-inline-images.out 0 → 100644
  1 +qpdf: image /IIm1 on page 1: not optimizing because image is smaller than requested minimum dimensions
  2 +qpdf: image /IIm2 on page 1: optimizing image reduces size from 2134996 to ...
  3 +qpdf: image /IIm3 on page 1: not optimizing because unable to decode data or data already uses DCT
  4 +qpdf: image /IIm4 on page 1: not optimizing because image is smaller than requested minimum dimensions
  5 +qpdf: wrote file a.pdf