Commit 5211bcb5eaa2b6b3c8aa48580f1b97314c37bb4a

Authored by Jay Berkenbilt
1 parent 22bcdbe7

Externalize inline images (fixes #278)

ChangeLog
1 1 2019-01-31 Jay Berkenbilt <ejb@ql.org>
2 2  
  3 + * Add new options --externalize-inline-images, which converts
  4 + inline images of at least a specified size to regular images, and
  5 + --ii-min-bytes, which sets that size threshold.
  6 +
  7 + * When optimizing images, inline images are now included in the
  8 + optimization, first being converted to regular images. Use
  9 + --keep-inline-images to exclude them from optimization. Fixes #278.
  10 +
  11 + * Add method QPDFPageObjectHelper::externalizeInlineImages, which
  12 + converts inline images whose size is at least a specified amount
  13 + to regular images.
  14 +
3 15 * Remove traces of acroread, which hasn't been available in Linux
4 16 for a long time.
5 17  
... ...
  1 +Now
  2 +===
  3 +
  4 +* Deal with compiler warnings
  5 +
1 6 Soon
2 7 ====
3 8  
... ... @@ -96,23 +101,6 @@ directory or that are otherwise not publicly accessible. This includes
96 101 things sent to me by email that are specifically not public. Even so,
97 102 I find it useful to make reference to them in this list.
98 103  
99   - * Do something better for inline images (see #278)
100   - * Figure out a way to add an expectInlineImage method that takes
101   - the offset of the EI image so an external system can locate the
102   - end tag. Hopefully both QPDFObjectHandle and Pl_QPDFTokenizer can
103   - do this. Somewhere we might want something that uses an input
104   - source to do it, but for the pipeline, it will also have to be
105   - possible to do it as we go.
106   - * Improve location of EI to handle EI embedded in the image data;
107   - consider trying to parse after EI and, if errors, keep looking.
108   - Will have to look at what happens with random binary characters
109   - regarding token type.
110   - * Add a method to replace inline images with real images. Look at
111   - existing code for adding new resources used with form XObjects
112   - and reuse if possible
113   - * Have image optimization replace inline images that are of more
114   - than a certain size prior to optimizing
115   -
116 104 * Add support for writing name and number trees
117 105  
118 106 * Figure out how to render Gajić correctly in the PDF version of the
... ...
include/qpdf/QPDFPageObjectHelper.hh
... ... @@ -73,6 +73,11 @@ class QPDFPageObjectHelper: public QPDFObjectHelper
73 73 QPDF_DLL
74 74 std::map<std::string, QPDFObjectHandle> getPageImages();
75 75  
  76 + // Convert each inline image to an external (normal) image if the
  77 + // size is at least the specified number of bytes.
  78 + QPDF_DLL
  79 + void externalizeInlineImages(size_t min_size = 0);
  80 +
76 81 // Return the annotations in the page's "/Annots" list, if any. If
77 82 // only_subtype is non-empty, only include annotations of the
78 83 // given subtype.
... ...
libqpdf/QPDFPageObjectHelper.cc
... ... @@ -2,6 +2,7 @@
2 2 #include <qpdf/QTC.hh>
3 3 #include <qpdf/QPDF.hh>
4 4 #include <qpdf/Pl_Concatenate.hh>
  5 +#include <qpdf/Pl_Buffer.hh>
5 6 #include <qpdf/QUtil.hh>
6 7 #include <qpdf/QPDFExc.hh>
7 8 #include <qpdf/QPDFMatrix.hh>
... ... @@ -36,6 +37,251 @@ ContentProvider::provideStreamData(int, int, Pipeline* p)
36 37 concat.manualFinish();
37 38 }
38 39  
  40 +class InlineImageTracker: public QPDFObjectHandle::TokenFilter
  41 +{
  42 + public:
  43 + InlineImageTracker(QPDF*, size_t min_size, QPDFObjectHandle resources);
  44 + virtual ~InlineImageTracker()
  45 + {
  46 + }
  47 + virtual void handleToken(QPDFTokenizer::Token const&);
  48 + QPDFObjectHandle convertIIDict(QPDFObjectHandle odict);
  49 +
  50 + QPDF* qpdf;
  51 + size_t min_size;
  52 + QPDFObjectHandle resources;
  53 + std::string dict_str;
  54 + std::string bi_str;
  55 + int min_suffix;
  56 + bool any_images;
  57 + enum { st_top, st_bi } state;
  58 +};
  59 +
  60 +InlineImageTracker::InlineImageTracker(QPDF* qpdf, size_t min_size,
  61 + QPDFObjectHandle resources) :
  62 + qpdf(qpdf),
  63 + min_size(min_size),
  64 + resources(resources),
  65 + min_suffix(1),
  66 + any_images(false),
  67 + state(st_top)
  68 +{
  69 +}
  70 +
  71 +QPDFObjectHandle
  72 +InlineImageTracker::convertIIDict(QPDFObjectHandle odict)
  73 +{
  74 + QPDFObjectHandle dict = QPDFObjectHandle::newDictionary();
  75 + dict.replaceKey("/Type", QPDFObjectHandle::newName("/XObject"));
  76 + dict.replaceKey("/Subtype", QPDFObjectHandle::newName("/Image"));
  77 + std::set<std::string> keys = odict.getKeys();
  78 + for (std::set<std::string>::iterator iter = keys.begin();
  79 + iter != keys.end(); ++iter)
  80 + {
  81 + std::string key = *iter;
  82 + QPDFObjectHandle value = odict.getKey(key);
  83 + if (key == "/BPC")
  84 + {
  85 + key = "/BitsPerComponent";
  86 + }
  87 + else if (key == "/CS")
  88 + {
  89 + key = "/ColorSpace";
  90 + }
  91 + else if (key == "/D")
  92 + {
  93 + key = "/Decode";
  94 + }
  95 + else if (key == "/DP")
  96 + {
  97 + key = "/DecodeParms";
  98 + }
  99 + else if (key == "/F")
  100 + {
  101 + key = "/Filter";
  102 + }
  103 + else if (key == "/H")
  104 + {
  105 + key = "/Height";
  106 + }
  107 + else if (key == "/IM")
  108 + {
  109 + key = "/ImageMask";
  110 + }
  111 + else if (key == "/I")
  112 + {
  113 + key = "/Interpolate";
  114 + }
  115 + else if (key == "/W")
  116 + {
  117 + key = "/Width";
  118 + }
  119 +
  120 + if (key == "/ColorSpace")
  121 + {
  122 + if (value.isName())
  123 + {
  124 + std::string name = value.getName();
  125 + if (name == "/G")
  126 + {
  127 + name = "/DeviceGray";
  128 + }
  129 + else if (name == "/RGB")
  130 + {
  131 + name = "/DeviceRGB";
  132 + }
  133 + else if (name == "/CMYK")
  134 + {
  135 + name = "/DeviceCMYK";
  136 + }
  137 + else if (name == "/I")
  138 + {
  139 + name = "/Indexed";
  140 + }
  141 + else
  142 + {
  143 + name.clear();
  144 + }
  145 + if (! name.empty())
  146 + {
  147 + value = QPDFObjectHandle::newName(name);
  148 + }
  149 + }
  150 + }
  151 + else if (key == "/Filter")
  152 + {
  153 + std::vector<QPDFObjectHandle> filters;
  154 + if (value.isName())
  155 + {
  156 + filters.push_back(value);
  157 + }
  158 + else if (value.isArray())
  159 + {
  160 + filters = value.getArrayAsVector();
  161 + }
  162 + for (std::vector<QPDFObjectHandle>::iterator iter =
  163 + filters.begin();
  164 + iter != filters.end(); ++iter)
  165 + {
  166 + std::string name;
  167 + if ((*iter).isName())
  168 + {
  169 + name = (*iter).getName();
  170 + }
  171 + if (name == "/AHx")
  172 + {
  173 + name = "/ASCIIHexDecode";
  174 + }
  175 + else if (name == "/A85")
  176 + {
  177 + name = "/ASCII85Decode";
  178 + }
  179 + else if (name == "/LZW")
  180 + {
  181 + name = "/LZWDecode";
  182 + }
  183 + else if (name == "/Fl")
  184 + {
  185 + name = "/FlateDecode";
  186 + }
  187 + else if (name == "/RL")
  188 + {
  189 + name = "/RunLengthDecode";
  190 + }
  191 + else if (name == "/CCF")
  192 + {
  193 + name = "/CCITTFaxDecode";
  194 + }
  195 + else if (name == "/DCT")
  196 + {
  197 + name = "/DCTDecode";
  198 + }
  199 + else
  200 + {
  201 + name.clear();
  202 + }
  203 + if (! name.empty())
  204 + {
  205 + *iter = QPDFObjectHandle::newName(name);
  206 + }
  207 + }
  208 + if (value.isName() && (filters.size() == 1))
  209 + {
  210 + value = filters.at(0);
  211 + }
  212 + else if (value.isArray())
  213 + {
  214 + value = QPDFObjectHandle::newArray(filters);
  215 + }
  216 + }
  217 + dict.replaceKey(key, value);
  218 + }
  219 + return dict;
  220 +}
  221 +
  222 +void
  223 +InlineImageTracker::handleToken(QPDFTokenizer::Token const& token)
  224 +{
  225 + if (state == st_bi)
  226 + {
  227 + if (token.getType() == QPDFTokenizer::tt_inline_image)
  228 + {
  229 + std::string image_data(token.getValue());
  230 + size_t len = image_data.length();
  231 + // The token ends with delimiter followed by EI, so it
  232 + // will always be at least 3 bytes long. We want to
  233 + // exclude the EI and preceding delimiter.
  234 + len = (len >= 3 ? len - 3 : 0);
  235 + if (len >= this->min_size)
  236 + {
  237 + QTC::TC("qpdf", "QPDFPageObjectHelper externalize inline image");
  238 + Pl_Buffer b("image_data");
  239 + b.write(QUtil::unsigned_char_pointer(image_data), len);
  240 + b.finish();
  241 + QPDFObjectHandle dict =
  242 + convertIIDict(QPDFObjectHandle::parse(dict_str));
  243 + dict.replaceKey("/Length", QPDFObjectHandle::newInteger(len));
  244 + std::string name = resources.getUniqueResourceName(
  245 + "/IIm", this->min_suffix);
  246 + QPDFObjectHandle image = QPDFObjectHandle::newStream(
  247 + this->qpdf, b.getBuffer());
  248 + image.replaceDict(dict);
  249 + resources.getKey("/XObject").replaceKey(name, image);
  250 + write(name);
  251 + write(" Do\n");
  252 + any_images = true;
  253 + }
  254 + else
  255 + {
  256 + QTC::TC("qpdf", "QPDFPageObjectHelper keep inline image");
  257 + write(bi_str);
  258 + writeToken(token);
  259 + }
  260 + state = st_top;
  261 + }
  262 + else if (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "ID"))
  263 + {
  264 + bi_str += token.getValue();
  265 + dict_str += " >>";
  266 + }
  267 + else
  268 + {
  269 + bi_str += token.getValue();
  270 + dict_str += token.getValue();
  271 + }
  272 + }
  273 + else if (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "BI"))
  274 + {
  275 + bi_str = token.getValue();
  276 + dict_str = "<< ";
  277 + state = st_bi;
  278 + }
  279 + else
  280 + {
  281 + writeToken(token);
  282 + }
  283 +}
  284 +
39 285 QPDFPageObjectHelper::Members::~Members()
40 286 {
41 287 }
... ... @@ -112,13 +358,32 @@ QPDFPageObjectHelper::getMediaBox(bool copy_if_shared)
112 358 return getAttribute("/MediaBox", copy_if_shared);
113 359 }
114 360  
115   -
116 361 std::map<std::string, QPDFObjectHandle>
117 362 QPDFPageObjectHelper::getPageImages()
118 363 {
119 364 return this->oh.getPageImages();
120 365 }
121 366  
  367 +void
  368 +QPDFPageObjectHelper::externalizeInlineImages(size_t min_size)
  369 +{
  370 + QPDFObjectHandle resources = getAttribute("/Resources", true);
  371 + // Calling mergeResources also ensures that /XObject becomes
  372 + // direct and is not shared with other pages.
  373 + resources.mergeResources(
  374 + QPDFObjectHandle::parse("<< /XObject << >> >>"));
  375 + InlineImageTracker iit(this->oh.getOwningQPDF(), min_size, resources);
  376 + Pl_Buffer b("new page content");
  377 + filterPageContents(&iit, &b);
  378 + if (iit.any_images)
  379 + {
  380 + getObjectHandle().replaceKey(
  381 + "/Contents",
  382 + QPDFObjectHandle::newStream(
  383 + this->oh.getOwningQPDF(), b.getBuffer()));
  384 + }
  385 +}
  386 +
122 387 std::vector<QPDFAnnotationObjectHelper>
123 388 QPDFPageObjectHelper::getAnnotations(std::string const& only_subtype)
124 389 {
... ...
manual/qpdf-manual.xml
... ... @@ -1746,7 +1746,11 @@ outfile.pdf&lt;/option&gt;
1746 1746 <option>--verbose</option>. See also the
1747 1747 <option>--oi-min-width</option>,
1748 1748 <option>--oi-min-height</option>, and
1749   - <option>--oi-min-area</option> options.
  1749 + <option>--oi-min-area</option> options. By default, starting
  1750 + in qpdf 8.4, inline images are converted to regular images
  1751 + and optimized as well. Use
  1752 + <option>--keep-inline-images</option> to prevent inline images
  1753 + from being included.
1750 1754 </para>
1751 1755 </listitem>
1752 1756 </varlistentry>
... ... @@ -1780,6 +1784,43 @@ outfile.pdf&lt;/option&gt;
1780 1784 </para>
1781 1785 </listitem>
1782 1786 </varlistentry>
  1787 +
  1788 +
  1789 +
  1790 + <varlistentry>
  1791 + <term><option>--externalize-inline-images</option></term>
  1792 + <listitem>
  1793 + <para>
  1794 + Convert inline images to regular images. By default, images
  1795 + whose data is at least 1,024 bytes are converted when this
  1796 + option is selected. Use <option>--ii-min-bytes</option> to
  1797 + change the size threshold. This option is implicitly selected
  1798 + when <option>--optimize-images</option> is selected. Use
  1799 + <option>--keep-inline-images</option> to exclude inline images
  1800 + from image optimization.
  1801 + </para>
  1802 + </listitem>
  1803 + </varlistentry>
  1804 + <varlistentry>
  1805 + <term><option>--ii-min-bytes=<replaceable>bytes</replaceable></option></term>
  1806 + <listitem>
  1807 + <para>
  1808 + Do not convert inline images to regular images if their size
  1809 + is below the specified minimum number of bytes. If omitted, the
  1810 + default is 1,024 bytes. Use 0 for no minimum.
  1811 + </para>
  1812 + </listitem>
  1813 + </varlistentry>
  1814 + <varlistentry>
  1815 + <term><option>--keep-inline-images</option></term>
  1816 + <listitem>
  1817 + <para>
  1818 + Prevent inline images from being included in image
  1819 + optimization. This option has no effect when
  1820 + <option>--optimize-images</option> is not specified.
  1821 + </para>
  1822 + </listitem>
  1823 + </varlistentry>
1783 1824 <varlistentry>
1784 1825 <term><option>--qdf</option></term>
1785 1826 <listitem>
... ... @@ -4323,6 +4364,18 @@ print &quot;\n&quot;;
4323 4364 </listitem>
4324 4365 <listitem>
4325 4366 <para>
  4367 + New options <option>--externalize-inline-images</option>,
  4368 + <option>--ii-min-bytes</option>, and
  4369 + <option>--keep-inline-images</option> control qpdf's
  4370 + handling of inline images and possible conversion of them to
  4371 + regular images. By default,
  4372 + <option>--optimize-images</option> now also applies to
  4373 + inline images. These options are discussed in <xref
  4374 + linkend="ref.advanced-transformation"/>.
  4375 + </para>
  4376 + </listitem>
  4377 + <listitem>
  4378 + <para>
4326 4379 Add options <option>--overlay</option> and
4327 4380 <option>--underlay</option> for overlaying or underlaying
4328 4381 pages of other files onto output pages. See <xref
... ... @@ -4415,6 +4468,14 @@ print &quot;\n&quot;;
4415 4468 not compressed.
4416 4469 </para>
4417 4470 </listitem>
  4471 + <listitem>
  4472 + <para>
  4473 + When the tokenizer returns inline image tokens, delimiters
  4474 + following <literal>ID</literal> and <literal>EI</literal>
  4475 + operators are no longer excluded. This makes it possible to
  4476 + reliably extract the actual image data.
  4477 + </para>
  4478 + </listitem>
4418 4479 </itemizedlist>
4419 4480 </listitem>
4420 4481 <listitem>
... ... @@ -4425,6 +4486,13 @@ print &quot;\n&quot;;
4425 4486 <listitem>
4426 4487 <para>
4427 4488 Add method
  4489 + <function>QPDFPageObjectHelper::externalizeInlineImages</function>
  4490 + to convert inline images to regular images.
  4491 + </para>
  4492 + </listitem>
  4493 + <listitem>
  4494 + <para>
  4495 + Add method
4428 4496 <function>QUtil::possible_repaired_encodings()</function> to
4429 4497 generate a list of strings that represent other ways the
4430 4498 given string could have been encoded. This is the method the
... ...
qpdf/qpdf.cc
... ... @@ -161,9 +161,12 @@ struct Options
161 161 json(false),
162 162 check(false),
163 163 optimize_images(false),
  164 + externalize_inline_images(false),
  165 + keep_inline_images(false),
164 166 oi_min_width(128), // Default values for these
165 167 oi_min_height(128), // oi flags are in --help
166 168 oi_min_area(16384), // and in the manual.
  169 + ii_min_bytes(1024), //
167 170 underlay("underlay"),
168 171 overlay("overlay"),
169 172 under_overlay(0),
... ... @@ -254,9 +257,12 @@ struct Options
254 257 std::set<std::string> json_objects;
255 258 bool check;
256 259 bool optimize_images;
  260 + bool externalize_inline_images;
  261 + bool keep_inline_images;
257 262 size_t oi_min_width;
258 263 size_t oi_min_height;
259 264 size_t oi_min_area;
  265 + size_t ii_min_bytes;
260 266 UnderOverlay underlay;
261 267 UnderOverlay overlay;
262 268 UnderOverlay* under_overlay;
... ... @@ -659,9 +665,12 @@ class ArgParser
659 665 void argJsonObject(char* parameter);
660 666 void argCheck();
661 667 void argOptimizeImages();
  668 + void argExternalizeInlineImages();
  669 + void argKeepInlineImages();
662 670 void argOiMinWidth(char* parameter);
663 671 void argOiMinHeight(char* parameter);
664 672 void argOiMinArea(char* parameter);
  673 + void argIiMinBytes(char* parameter);
665 674 void arg40Print(char* parameter);
666 675 void arg40Modify(char* parameter);
667 676 void arg40Extract(char* parameter);
... ... @@ -894,12 +903,17 @@ ArgParser::initOptionTable()
894 903 &ArgParser::argJsonObject, "trailer|obj[,gen]");
895 904 (*t)["check"] = oe_bare(&ArgParser::argCheck);
896 905 (*t)["optimize-images"] = oe_bare(&ArgParser::argOptimizeImages);
  906 + (*t)["externalize-inline-images"] =
  907 + oe_bare(&ArgParser::argExternalizeInlineImages);
  908 + (*t)["keep-inline-images"] = oe_bare(&ArgParser::argKeepInlineImages);
897 909 (*t)["oi-min-width"] = oe_requiredParameter(
898 910 &ArgParser::argOiMinWidth, "minimum-width");
899 911 (*t)["oi-min-height"] = oe_requiredParameter(
900 912 &ArgParser::argOiMinHeight, "minimum-height");
901 913 (*t)["oi-min-area"] = oe_requiredParameter(
902 914 &ArgParser::argOiMinArea, "minimum-area");
  915 + (*t)["ii-min-bytes"] = oe_requiredParameter(
  916 + &ArgParser::argIiMinBytes, "minimum-bytes");
903 917 (*t)["overlay"] = oe_bare(&ArgParser::argOverlay);
904 918 (*t)["underlay"] = oe_bare(&ArgParser::argUnderlay);
905 919  
... ... @@ -1308,6 +1322,12 @@ ArgParser::argHelp()
1308 1322 << " default is 128. Use 0 to mean no minimum\n"
1309 1323 << "--oi-min-area=a do not optimize images whose pixel count is below a\n"
1310 1324 << " default is 16,384. Use 0 to mean no minimum\n"
  1325 + << "--externalize-inline-images convert inline images to regular images; by\n"
  1326 + << " default, images of at least 1,024 bytes are\n"
  1327 + << " externalized\n"
  1328 + << "--ii-min-bytes=bytes specify minimum size of inline images to be\n"
  1329 + << " converted to regular images\n"
  1330 + << "--keep-inline-images exclude inline images from image optimization\n"
1311 1331 << "--qdf turns on \"QDF mode\" (below)\n"
1312 1332 << "--linearize-pass1=file write intermediate pass of linearized file\n"
1313 1333 << " for debugging\n"
... ... @@ -1966,6 +1986,18 @@ ArgParser::argOptimizeImages()
1966 1986 }
1967 1987  
1968 1988 void
  1989 +ArgParser::argExternalizeInlineImages()
  1990 +{
  1991 + o.externalize_inline_images = true;
  1992 +}
  1993 +
  1994 +void
  1995 +ArgParser::argKeepInlineImages()
  1996 +{
  1997 + o.keep_inline_images = true;
  1998 +}
  1999 +
  2000 +void
1969 2001 ArgParser::argOiMinWidth(char* parameter)
1970 2002 {
1971 2003 o.oi_min_width = QUtil::string_to_int(parameter);
... ... @@ -1984,6 +2016,12 @@ ArgParser::argOiMinArea(char* parameter)
1984 2016 }
1985 2017  
1986 2018 void
  2019 +ArgParser::argIiMinBytes(char* parameter)
  2020 +{
  2021 + o.ii_min_bytes = QUtil::string_to_int(parameter);
  2022 +}
  2023 +
  2024 +void
1987 2025 ArgParser::arg40Print(char* parameter)
1988 2026 {
1989 2027 o.r2_print = (strcmp(parameter, "y") == 0);
... ... @@ -2933,6 +2971,10 @@ ArgParser::doFinalChecks()
2933 2971 {
2934 2972 usage("no output file may be given for this option");
2935 2973 }
  2974 + if (o.optimize_images && (! o.keep_inline_images))
  2975 + {
  2976 + o.externalize_inline_images = true;
  2977 + }
2936 2978  
2937 2979 if (o.require_outfile && (strcmp(o.outfilename, "-") == 0))
2938 2980 {
... ... @@ -3764,10 +3806,7 @@ ImageOptimizer::makePipeline(std::string const&amp; description, Pipeline* next)
3764 3806 QPDFObjectHandle w_obj = dict.getKey("/Width");
3765 3807 QPDFObjectHandle h_obj = dict.getKey("/Height");
3766 3808 QPDFObjectHandle colorspace_obj = dict.getKey("/ColorSpace");
3767   - QPDFObjectHandle components_obj = dict.getKey("/BitsPerComponent");
3768   - if (! (w_obj.isInteger() &&
3769   - h_obj.isInteger() &&
3770   - components_obj.isInteger()))
  3809 + if (! (w_obj.isNumber() && h_obj.isNumber()))
3771 3810 {
3772 3811 if (o.verbose && (! description.empty()))
3773 3812 {
... ... @@ -3777,8 +3816,12 @@ ImageOptimizer::makePipeline(std::string const&amp; description, Pipeline* next)
3777 3816 }
3778 3817 return result;
3779 3818 }
3780   - JDIMENSION w = w_obj.getIntValue();
3781   - JDIMENSION h = h_obj.getIntValue();
  3819 + // Files have been seen in the wild whose width and height are
  3820 + // floating point, which is goofy, but we can deal with it.
  3821 + JDIMENSION w = static_cast<JDIMENSION>(
  3822 + w_obj.isInteger() ? w_obj.getIntValue() : w_obj.getNumericValue());
  3823 + JDIMENSION h = static_cast<JDIMENSION>(
  3824 + h_obj.isInteger() ? h_obj.getIntValue() : h_obj.getNumericValue());
3782 3825 std::string colorspace = (colorspace_obj.isName() ?
3783 3826 colorspace_obj.getName() :
3784 3827 "");
... ... @@ -4198,6 +4241,16 @@ static void handle_under_overlay(QPDF&amp; pdf, Options&amp; o)
4198 4241 static void handle_transformations(QPDF& pdf, Options& o)
4199 4242 {
4200 4243 QPDFPageDocumentHelper dh(pdf);
  4244 + if (o.externalize_inline_images)
  4245 + {
  4246 + std::vector<QPDFPageObjectHelper> pages = dh.getAllPages();
  4247 + for (std::vector<QPDFPageObjectHelper>::iterator iter = pages.begin();
  4248 + iter != pages.end(); ++iter)
  4249 + {
  4250 + QPDFPageObjectHelper& ph(*iter);
  4251 + ph.externalizeInlineImages(o.ii_min_bytes);
  4252 + }
  4253 + }
4201 4254 if (o.optimize_images)
4202 4255 {
4203 4256 int pageno = 0;
... ...
qpdf/qpdf.testcov
... ... @@ -436,3 +436,5 @@ QPDFTokenizer found EI the old way 0
436 436 QPDFTokenizer found EI by byte count 0
437 437 QPDFTokenizer inline image at EOF the old way 0
438 438 QPDFTokenizer found EI after more than one try 0
  439 +QPDFPageObjectHelper externalize inline image 0
  440 +QPDFPageObjectHelper keep inline image 0
... ...
qpdf/qtest/qpdf.test
... ... @@ -679,7 +679,7 @@ $td-&gt;runtest(&quot;check pass1 file&quot;,
679 679 show_ntests();
680 680 # ----------
681 681 $td->notify("--- Inline Images ---");
682   -$n_tests += 2;
  682 +$n_tests += 8;
683 683  
684 684 # The file large-inline-image.pdf is a hand-crafted file with several
685 685 # inline images of various sizes including one that is two megabytes,
... ... @@ -696,6 +696,69 @@ $td-&gt;runtest(&quot;check output&quot;,
696 696 {$td->FILE => "a.pdf"},
697 697 {$td->FILE => "large-inline-image.qdf"});
698 698  
  699 +$td->runtest("eof in inline image",
  700 + {$td->COMMAND =>
  701 + "qpdf --qdf --static-id eof-in-inline-image.pdf a.pdf"},
  702 + {$td->FILE => "eof-inline-qdf.out", $td->EXIT_STATUS => 3},
  703 + $td->NORMALIZE_NEWLINES);
  704 +$td->runtest("check output",
  705 + {$td->FILE => "a.pdf"},
  706 + {$td->FILE => "eof-in-inline-image.qdf"});
  707 +$td->runtest("externalize eof in inline image",
  708 + {$td->COMMAND =>
  709 + "qpdf --qdf --externalize-inline-images" .
  710 + " --static-id eof-in-inline-image.pdf a.pdf"},
  711 + {$td->FILE => "eof-inline-qdf.out", $td->EXIT_STATUS => 3},
  712 + $td->NORMALIZE_NEWLINES);
  713 +$td->runtest("check output",
  714 + {$td->FILE => "a.pdf"},
  715 + {$td->FILE => "eof-in-inline-image-ii.qdf"});
  716 +$td->runtest("externalize damaged image",
  717 + {$td->COMMAND =>
  718 + "qpdf --externalize-inline-images" .
  719 + " --compress-streams=n --static-id" .
  720 + " damaged-inline-image.pdf a.pdf"},
  721 + {$td->STRING => "", $td->EXIT_STATUS => 0},
  722 + $td->NORMALIZE_NEWLINES);
  723 +$td->runtest("check output",
  724 + {$td->FILE => "a.pdf"},
  725 + {$td->FILE => "damaged-inline-image-out.pdf"});
  726 +
  727 +my @eii_tests = (
  728 + ['inline-images', 80],
  729 + ['large-inline-image', 1024],
  730 + );
  731 +$n_tests += 4 * scalar(@eii_tests);
  732 +$n_compare_pdfs += 2 * scalar(@eii_tests);
  733 +
  734 +foreach my $d (@eii_tests)
  735 +{
  736 + my ($file, $threshold) = @$d;
  737 + $td->runtest("inline image $file (all)",
  738 + {$td->COMMAND =>
  739 + "qpdf --qdf --static-id --externalize-inline-images" .
  740 + " --ii-min-bytes=0 $file.pdf a.pdf"},
  741 + {$td->STRING => "", $td->EXIT_STATUS => 0},
  742 + $td->NORMALIZE_NEWLINES);
  743 + $td->runtest("check output",
  744 + {$td->FILE => "a.pdf"},
  745 + {$td->FILE => "$file-ii-all.pdf"});
  746 + compare_pdfs("$file.pdf", "a.pdf");
  747 +
  748 + $td->runtest("inline image $file (some)",
  749 + {$td->COMMAND =>
  750 + "qpdf --qdf --static-id --externalize-inline-images" .
  751 + " --ii-min-bytes=$threshold $file.pdf a.pdf"},
  752 + {$td->STRING => "", $td->EXIT_STATUS => 0},
  753 + $td->NORMALIZE_NEWLINES);
  754 + $td->runtest("check output",
  755 + {$td->FILE => "a.pdf"},
  756 + {$td->FILE => "$file-ii-some.pdf"});
  757 + compare_pdfs("$file.pdf", "a.pdf");
  758 +}
  759 +
  760 +# QXXXQ externalize tests with min size
  761 +
699 762 show_ntests();
700 763 # ----------
701 764 $td->notify("--- Tokenizer ---");
... ... @@ -2019,6 +2082,12 @@ my @image_opt = (
2019 2082 '--oi-min-width=0 --oi-min-height=0 --oi-min-area=30000'],
2020 2083 ['small-images', 'min-area-all',
2021 2084 '--oi-min-width=0 --oi-min-height=0 --oi-min-area=30000'],
  2085 + ['large-inline-image', 'inline-images',
  2086 + '--ii-min-bytes=0'],
  2087 + ['large-inline-image', 'inline-images-all-size',
  2088 + '--oi-min-width=0 --oi-min-height=0 --oi-min-area=0 --ii-min-bytes=0'],
  2089 + ['large-inline-image', 'inline-images-keep-some', ''],
  2090 + ['large-inline-image', 'inline-images-keep-all', '--keep-inline-images'],
2022 2091 );
2023 2092  
2024 2093 $n_tests += 2 * scalar(@image_opt);
... ...
qpdf/qtest/qpdf/damaged-inline-image-out.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/damaged-inline-image.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/eof-in-inline-image-ii.qdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/eof-in-inline-image.qdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/eof-inline-qdf.out 0 → 100644
  1 +WARNING: eof-in-inline-image.pdf (offset 299): content normalization encountered bad tokens
  2 +WARNING: eof-in-inline-image.pdf (offset 299): normalized content ended with a bad token; you may be able to resolve this by coalescing content streams in combination with normalizing content. From the command line, specify --coalesce-contents
  3 +WARNING: eof-in-inline-image.pdf (offset 299): Resulting stream data may be corrupted but is may still useful for manual inspection. For more information on this warning, search for content normalization in the manual.
  4 +qpdf: operation succeeded with warnings; resulting file may have some problems
... ...
qpdf/qtest/qpdf/inline-images-ii-all.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/inline-images-ii-some.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/large-inline-image-ii-all.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/large-inline-image-ii-some.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/optimize-images-inline-images-all-size-json.out 0 → 100644
  1 +{
  2 + "pages": [
  3 + {
  4 + "contents": [
  5 + "7 0 R"
  6 + ],
  7 + "images": [
  8 + {
  9 + "bitspercomponent": 8,
  10 + "colorspace": "/DeviceRGB",
  11 + "decodeparms": [
  12 + null
  13 + ],
  14 + "filter": [
  15 + "/DCTDecode"
  16 + ],
  17 + "filterable": false,
  18 + "height": 56,
  19 + "name": "/IIm1",
  20 + "object": "8 0 R",
  21 + "width": 49
  22 + },
  23 + {
  24 + "bitspercomponent": 8,
  25 + "colorspace": "/DeviceRGB",
  26 + "decodeparms": [
  27 + null
  28 + ],
  29 + "filter": [
  30 + "/DCTDecode"
  31 + ],
  32 + "filterable": false,
  33 + "height": 675,
  34 + "name": "/IIm2",
  35 + "object": "9 0 R",
  36 + "width": 1200
  37 + },
  38 + {
  39 + "bitspercomponent": 8,
  40 + "colorspace": "/DeviceRGB",
  41 + "decodeparms": [
  42 + null
  43 + ],
  44 + "filter": [
  45 + "/DCTDecode"
  46 + ],
  47 + "filterable": false,
  48 + "height": 56,
  49 + "name": "/IIm3",
  50 + "object": "10 0 R",
  51 + "width": 49
  52 + },
  53 + {
  54 + "bitspercomponent": 8,
  55 + "colorspace": "/DeviceGray",
  56 + "decodeparms": [
  57 + null
  58 + ],
  59 + "filter": [
  60 + "/FlateDecode"
  61 + ],
  62 + "filterable": true,
  63 + "height": 8,
  64 + "name": "/IIm4",
  65 + "object": "11 0 R",
  66 + "width": 8
  67 + }
  68 + ],
  69 + "label": null,
  70 + "object": "4 0 R",
  71 + "outlines": [],
  72 + "pageposfrom1": 1
  73 + }
  74 + ],
  75 + "parameters": {
  76 + "decodelevel": "generalized"
  77 + },
  78 + "version": 1
  79 +}
... ...
qpdf/qtest/qpdf/optimize-images-inline-images-all-size.out 0 → 100644
  1 +qpdf: image /IIm1 on page 1: optimizing image reduces size from 2391 to ...
  2 +qpdf: image /IIm2 on page 1: optimizing image reduces size from 2134996 to ...
  3 +qpdf: image /IIm3 on page 1: not optimizing because unable to decode data or data already uses DCT
  4 +qpdf: image /IIm4 on page 1: not optimizing because DCT compression does not reduce image size
  5 +qpdf: wrote file a.pdf
... ...
qpdf/qtest/qpdf/optimize-images-inline-images-json.out 0 → 100644
  1 +{
  2 + "pages": [
  3 + {
  4 + "contents": [
  5 + "7 0 R"
  6 + ],
  7 + "images": [
  8 + {
  9 + "bitspercomponent": 8,
  10 + "colorspace": "/DeviceRGB",
  11 + "decodeparms": [
  12 + null
  13 + ],
  14 + "filter": [
  15 + "/FlateDecode"
  16 + ],
  17 + "filterable": true,
  18 + "height": 56,
  19 + "name": "/IIm1",
  20 + "object": "8 0 R",
  21 + "width": 49
  22 + },
  23 + {
  24 + "bitspercomponent": 8,
  25 + "colorspace": "/DeviceRGB",
  26 + "decodeparms": [
  27 + null
  28 + ],
  29 + "filter": [
  30 + "/DCTDecode"
  31 + ],
  32 + "filterable": false,
  33 + "height": 675,
  34 + "name": "/IIm2",
  35 + "object": "9 0 R",
  36 + "width": 1200
  37 + },
  38 + {
  39 + "bitspercomponent": 8,
  40 + "colorspace": "/DeviceRGB",
  41 + "decodeparms": [
  42 + null
  43 + ],
  44 + "filter": [
  45 + "/DCTDecode"
  46 + ],
  47 + "filterable": false,
  48 + "height": 56,
  49 + "name": "/IIm3",
  50 + "object": "10 0 R",
  51 + "width": 49
  52 + },
  53 + {
  54 + "bitspercomponent": 8,
  55 + "colorspace": "/DeviceGray",
  56 + "decodeparms": [
  57 + null
  58 + ],
  59 + "filter": [
  60 + "/FlateDecode"
  61 + ],
  62 + "filterable": true,
  63 + "height": 8,
  64 + "name": "/IIm4",
  65 + "object": "11 0 R",
  66 + "width": 8
  67 + }
  68 + ],
  69 + "label": null,
  70 + "object": "4 0 R",
  71 + "outlines": [],
  72 + "pageposfrom1": 1
  73 + }
  74 + ],
  75 + "parameters": {
  76 + "decodelevel": "generalized"
  77 + },
  78 + "version": 1
  79 +}
... ...
qpdf/qtest/qpdf/optimize-images-inline-images-keep-all-json.out 0 → 100644
  1 +{
  2 + "pages": [
  3 + {
  4 + "contents": [
  5 + "7 0 R"
  6 + ],
  7 + "images": [],
  8 + "label": null,
  9 + "object": "4 0 R",
  10 + "outlines": [],
  11 + "pageposfrom1": 1
  12 + }
  13 + ],
  14 + "parameters": {
  15 + "decodelevel": "generalized"
  16 + },
  17 + "version": 1
  18 +}
... ...
qpdf/qtest/qpdf/optimize-images-inline-images-keep-all.out 0 → 100644
  1 +qpdf: wrote file a.pdf
... ...
qpdf/qtest/qpdf/optimize-images-inline-images-keep-some-json.out 0 → 100644
  1 +{
  2 + "pages": [
  3 + {
  4 + "contents": [
  5 + "7 0 R"
  6 + ],
  7 + "images": [
  8 + {
  9 + "bitspercomponent": 8,
  10 + "colorspace": "/DeviceRGB",
  11 + "decodeparms": [
  12 + null
  13 + ],
  14 + "filter": [
  15 + "/FlateDecode"
  16 + ],
  17 + "filterable": true,
  18 + "height": 56,
  19 + "name": "/IIm1",
  20 + "object": "8 0 R",
  21 + "width": 49
  22 + },
  23 + {
  24 + "bitspercomponent": 8,
  25 + "colorspace": "/DeviceRGB",
  26 + "decodeparms": [
  27 + null
  28 + ],
  29 + "filter": [
  30 + "/DCTDecode"
  31 + ],
  32 + "filterable": false,
  33 + "height": 675,
  34 + "name": "/IIm2",
  35 + "object": "9 0 R",
  36 + "width": 1200
  37 + },
  38 + {
  39 + "bitspercomponent": 8,
  40 + "colorspace": "/DeviceRGB",
  41 + "decodeparms": [
  42 + null
  43 + ],
  44 + "filter": [
  45 + "/DCTDecode"
  46 + ],
  47 + "filterable": false,
  48 + "height": 56,
  49 + "name": "/IIm3",
  50 + "object": "10 0 R",
  51 + "width": 49
  52 + }
  53 + ],
  54 + "label": null,
  55 + "object": "4 0 R",
  56 + "outlines": [],
  57 + "pageposfrom1": 1
  58 + }
  59 + ],
  60 + "parameters": {
  61 + "decodelevel": "generalized"
  62 + },
  63 + "version": 1
  64 +}
... ...
qpdf/qtest/qpdf/optimize-images-inline-images-keep-some.out 0 → 100644
  1 +qpdf: image /IIm1 on page 1: not optimizing because image is smaller than requested minimum dimensions
  2 +qpdf: image /IIm2 on page 1: optimizing image reduces size from 2134996 to ...
  3 +qpdf: image /IIm3 on page 1: not optimizing because unable to decode data or data already uses DCT
  4 +qpdf: wrote file a.pdf
... ...
qpdf/qtest/qpdf/optimize-images-inline-images.out 0 → 100644
  1 +qpdf: image /IIm1 on page 1: not optimizing because image is smaller than requested minimum dimensions
  2 +qpdf: image /IIm2 on page 1: optimizing image reduces size from 2134996 to ...
  3 +qpdf: image /IIm3 on page 1: not optimizing because unable to decode data or data already uses DCT
  4 +qpdf: image /IIm4 on page 1: not optimizing because image is smaller than requested minimum dimensions
  5 +qpdf: wrote file a.pdf
... ...