Commit 2794bfb1a665cad93a38144bea0ba0daea7152e7

Authored by Jay Berkenbilt
1 parent dac0598b

Add flags to control zlib compression level (fixes #113)

ChangeLog
1 1 2019-08-23 Jay Berkenbilt <ejb@ql.org>
2 2  
  3 + * Add --recompress-flate option to qpdf and
  4 + QPDFWriter::setRecompressFlate to cause QPDFWriter to recompress
  5 + streams that are already compressed with /FlateDecode.
  6 +
3 7 * Add option Pl_Flate::setCompressionLevel to globally set the
4 8 zlib compression level used by all Pl_Flate pipelines.
5 9  
  10 + * Add --compression-level flag to qpdf to set the zlib compression
  11 + level. When combined with --recompress-flate, this will cause most
  12 + of qpdf's streams to use the maximum compression level. This
  13 + results in only a very small amount of savings in size that comes
  14 + at a fairly significant performance cost, but it could be useful
  15 + for archival files or other cases where every byte counts and
  16 + creation time doesn't matter so much. Note that using
  17 + --object-streams=generate in combination with these options gives
  18 + you the biggest advantage. Fixes #113.
  19 +
6 20 2019-08-22 Jay Berkenbilt <ejb@ql.org>
7 21  
8 22 * In QPDFObjectHandle::ParserCallbacks, in addition to
... ...
include/qpdf/QPDFWriter.hh
... ... @@ -189,10 +189,11 @@ class QPDFWriter
189 189 // filters on the input. When combined with
190 190 // setCompressStreams(true), which is the default, the effect of this
191 191 // is that streams filtered with these older and less efficient
192   - // filters will be recompressed with the Flate filter. As a
193   - // special case, if a stream is already compressed with
  192 + // filters will be recompressed with the Flate filter. By default,
  193 + // as a special case, if a stream is already compressed with
194 194 // FlateDecode and setCompressStreams is enabled, the original
195   - // compressed data will be preserved.
  195 + // compressed data will be preserved. This behavior can be
  196 + // overridden by calling setRecompressFlate(true).
196 197 //
197 198 // qpdf_dl_specialized: In addition to uncompressing the
198 199 // generalized compression formats, supported non-lossy
... ... @@ -209,6 +210,15 @@ class QPDFWriter
209 210 QPDF_DLL
210 211 void setDecodeLevel(qpdf_stream_decode_level_e);
211 212  
  213 + // By default, when both the input and output contents of a stream
  214 + // are compressed with Flate, qpdf does not uncompress and
  215 + // recompress the stream. Passing true here causes it to do so.
  216 + // This can be useful if recompressing all streams with a higher
  217 + // compression level, which can be set by calling the static
  218 + // method Pl_Flate::setCompressionLevel.
  219 + QPDF_DLL
  220 + void setRecompressFlate(bool);
  221 +
212 222 // Set value of content stream normalization. The default is
213 223 // "false". If true, we attempt to normalize newlines inside of
214 224 // content streams. Some constructs such as inline images may
... ... @@ -597,6 +607,7 @@ class QPDFWriter
597 607 bool compress_streams_set;
598 608 qpdf_stream_decode_level_e stream_decode_level;
599 609 bool stream_decode_level_set;
  610 + bool recompress_flate;
600 611 bool qdf_mode;
601 612 bool preserve_unreferenced_objects;
602 613 bool newline_before_endstream;
... ...
libqpdf/QPDFWriter.cc
... ... @@ -37,6 +37,7 @@ QPDFWriter::Members::Members(QPDF& pdf) :
37 37 compress_streams_set(false),
38 38 stream_decode_level(qpdf_dl_none),
39 39 stream_decode_level_set(false),
  40 + recompress_flate(false),
40 41 qdf_mode(false),
41 42 preserve_unreferenced_objects(false),
42 43 newline_before_endstream(false),
... ... @@ -207,6 +208,12 @@ QPDFWriter::setDecodeLevel(qpdf_stream_decode_level_e val)
207 208 }
208 209  
// Record whether streams that are already compressed with Flate
// should be uncompressed and recompressed on output. The stored flag
// is consulted in unparseObject: when it is true, the check that
// leaves existing /FlateDecode (or /Fl) stream data alone is skipped.
// The flag is initialized to false in the Members constructor.
209 210 void
  211 +QPDFWriter::setRecompressFlate(bool val)
  212 +{
  213 + this->m->recompress_flate = val;
  214 +}
  215 +
  216 +void
210 217 QPDFWriter::setContentNormalization(bool val)
211 218 {
212 219 this->m->normalize_content_set = true;
... ... @@ -1716,13 +1723,14 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level,
1716 1723 if (this->m->compress_streams)
1717 1724 {
1718 1725 // Don't filter if the stream is already compressed with
1719   - // FlateDecode. We don't want to make it worse by getting
1720   - // rid of a predictor or otherwise messing with it. We
1721   - // should also avoid messing with anything that's
1722   - // compressed with a lossy compression scheme, but we
1723   - // don't support any of those right now.
  1726 + // FlateDecode. This way we don't make it worse if the
  1727 + // original file used a better Flate algorithm, and we
  1728 + // don't spend time and CPU cycles uncompressing and
  1729 + // recompressing stuff. This can be overridden with
  1730 + // setRecompressFlate(true).
1724 1731 QPDFObjectHandle filter_obj = stream_dict.getKey("/Filter");
1725   - if ((! object.isDataModified()) &&
  1732 + if ((! this->m->recompress_flate) &&
  1733 + (! object.isDataModified()) &&
1726 1734 filter_obj.isName() &&
1727 1735 ((filter_obj.getName() == "/FlateDecode") ||
1728 1736 (filter_obj.getName() == "/Fl")))
... ...
manual/build.mk
... ... @@ -26,7 +26,8 @@ endif
26 26  
27 27 $(OUTDOC).pdf: $(OUTDOC).fo qpdf/build/qpdf
28 28 $(FOP) $< -pdf $@.tmp
29   - qpdf/build/qpdf --linearize $@.tmp $@
  29 + qpdf/build/qpdf --linearize --object-streams=generate \
  30 + --recompress-flate --compression-level=9 $@.tmp $@
30 31  
31 32 $(OUTDOC).html: $(INDOC).xml manual/html.xsl $(VALIDATE)
32 33 $(XSLTPROC) --output $@ manual/html.xsl $<
... ...
manual/qpdf-manual.xml
... ... @@ -1433,27 +1433,32 @@ outfile.pdf&lt;/option&gt;
1433 1433 <listitem>
1434 1434 <para>
1435 1435 <option>generalized</option>: decode streams filtered with
1436   - supported generalized filters: <option>/LZWDecode</option>,
1437   - <option>/FlateDecode</option>,
1438   - <option>/ASCII85Decode</option>, and
1439   - <option>/ASCIIHexDecode</option>. We define generalized
  1436 + supported generalized filters:
  1437 + <literal>/LZWDecode</literal>,
  1438 + <literal>/FlateDecode</literal>,
  1439 + <literal>/ASCII85Decode</literal>, and
  1440 + <literal>/ASCIIHexDecode</literal>. We define generalized
1440 1441 filters as those to be used for general-purpose compression
1441 1442 or encoding, as opposed to filters specifically designed
1442   - for image data.
  1443 + for image data. Note that, by default, streams already
  1444 + compressed with <literal>/FlateDecode</literal> are not
  1445 + uncompressed and recompressed unless you also specify
  1446 + <option>--recompress-flate</option>.
1443 1447 </para>
1444 1448 </listitem>
1445 1449 <listitem>
1446 1450 <para>
1447 1451 <option>specialized</option>: in addition to generalized,
1448 1452 decode streams with supported non-lossy specialized
1449   - filters; currently this is just <option>/RunLengthDecode</option>
  1453 + filters; currently this is just
  1454 + <literal>/RunLengthDecode</literal>
1450 1455 </para>
1451 1456 </listitem>
1452 1457 <listitem>
1453 1458 <para>
1454 1459 <option>all</option>: in addition to generalized and
1455 1460 specialized, decode streams with supported lossy filters;
1456   - currently this is just <option>/DCTDecode</option> (JPEG)
  1461 + currently this is just <literal>/DCTDecode</literal> (JPEG)
1457 1462 </para>
1458 1463 </listitem>
1459 1464 </itemizedlist>
... ... @@ -1476,7 +1481,10 @@ outfile.pdf&lt;/option&gt;
1476 1481 <option>compress</option>: recompress stream data when
1477 1482 possible (default); equivalent to
1478 1483 <option>--compress-streams=y</option>
1479   - <option>--decode-level=generalized</option>
  1484 + <option>--decode-level=generalized</option>. Does not
  1485 + recompress streams already compressed with
  1486 + <literal>/FlateDecode</literal> unless
  1487 + <option>--recompress-flate</option> is also specified.
1480 1488 </para>
1481 1489 </listitem>
1482 1490 <listitem>
... ... @@ -1499,6 +1507,37 @@ outfile.pdf&lt;/option&gt;
1499 1507 </listitem>
1500 1508 </varlistentry>
1501 1509 <varlistentry>
  1510 + <term><option>--recompress-flate</option></term>
  1511 + <listitem>
  1512 + <para>
  1513 + By default, streams already compressed with
  1514 + <literal>/FlateDecode</literal> are left alone rather than
  1515 + being uncompressed and recompressed. This option causes qpdf
  1516 + to uncompress and recompress the streams. There is a
  1517 + significant performance cost to using this option, but you
  1518 + probably want to use it if you specify
  1519 + <option>--compression-level</option>.
  1520 + </para>
  1521 + </listitem>
  1522 + </varlistentry>
  1523 + <varlistentry>
  1524 + <term><option>--compression-level=<replaceable>level</replaceable></option></term>
  1525 + <listitem>
  1526 + <para>
  1527 + When writing new streams that are compressed with
  1528 + <literal>/FlateDecode</literal>, use the specified compression
  1529 + level. The value of <replaceable>level</replaceable> should be a number
  1530 + from 1 to 9 and is passed directly to zlib, which implements
  1531 + deflate compression. Note that qpdf doesn't uncompress and
  1532 + recompress streams by default. To have this option apply to
  1533 + already compressed streams, you should also specify
  1534 + <option>--recompress-flate</option>. If your goal is to shrink
  1535 + the size of PDF files, you should also use
  1536 + <option>--object-streams=generate</option>.
  1537 + </para>
  1538 + </listitem>
  1539 + </varlistentry>
  1540 + <varlistentry>
1502 1541 <term><option>--normalize-content=[yn]</option></term>
1503 1542 <listitem>
1504 1543 <para>
... ... @@ -4449,7 +4488,7 @@ print &quot;\n&quot;;
4449 4488 </listitem>
4450 4489 <listitem>
4451 4490 <para>
4452   - Library Enhancements
  4491 + Library and CLI Enhancements
4453 4492 </para>
4454 4493 <itemizedlist>
4455 4494 <listitem>
... ... @@ -4510,6 +4549,41 @@ print &quot;\n&quot;;
4510 4549 </listitem>
4511 4550 <listitem>
4512 4551 <para>
  4552 + Static method
  4553 + <function>Pl_Flate::setCompressionLevel</function> can be
  4554 + called to set the zlib compression level globally used by
  4555 + all instances of Pl_Flate in deflate mode.
  4556 + </para>
  4557 + </listitem>
  4558 + <listitem>
  4559 + <para>
  4560 + The method
  4561 + <function>QPDFWriter::setRecompressFlate</function> can be
  4562 + called to tell <classname>QPDFWriter</classname> to
  4563 + uncompress and recompress streams already compressed with
  4564 + <literal>/FlateDecode</literal>.
  4565 + </para>
  4566 + </listitem>
  4567 + <listitem>
  4568 + <para>
  4569 + CLI enhancement: the <option>--recompress-flate</option>
  4570 + option instructs <command>qpdf</command> to recompress streams that
  4571 + are already compressed with <literal>/FlateDecode</literal>.
  4572 + Useful with <option>--compression-level</option>.
  4573 + </para>
  4574 + </listitem>
  4575 + <listitem>
  4576 + <para>
  4577 + CLI enhancement: the
  4578 + <option>--compression-level=<replaceable>level</replaceable></option>
  4579 + option sets the zlib compression level used for any streams
  4580 + compressed by <literal>/FlateDecode</literal>. Most
  4581 + effective when combined with
  4582 + <option>--recompress-flate</option>.
  4583 + </para>
  4584 + </listitem>
  4585 + <listitem>
  4586 + <para>
4513 4587 The underlying implementation of QPDF arrays has been
4514 4588 enhanced to be much more memory efficient when dealing with
4515 4589 arrays with lots of nulls. This enables qpdf to use
... ... @@ -5699,9 +5773,9 @@ print &quot;\n&quot;;
5699 5773 <listitem>
5700 5774 <para>
5701 5775 Disregard data check errors when uncompressing
5702   - <option>/FlateDecode</option> streams. This is consistent with
5703   - most other PDF readers and allows qpdf to recover data from
5704   - another class of malformed PDF files.
  5776 + <literal>/FlateDecode</literal> streams. This is consistent
  5777 + with most other PDF readers and allows qpdf to recover data
  5778 + from another class of malformed PDF files.
5705 5779 </para>
5706 5780 </listitem>
5707 5781 <listitem>
... ...
qpdf/qpdf.cc
... ... @@ -13,6 +13,7 @@
13 13 #include <qpdf/Pl_Discard.hh>
14 14 #include <qpdf/Pl_DCT.hh>
15 15 #include <qpdf/Pl_Count.hh>
  16 +#include <qpdf/Pl_Flate.hh>
16 17 #include <qpdf/PointerHolder.hh>
17 18  
18 19 #include <qpdf/QPDF.hh>
... ... @@ -124,6 +125,9 @@ struct Options
124 125 stream_data_mode(qpdf_s_compress),
125 126 compress_streams(true),
126 127 compress_streams_set(false),
  128 + recompress_flate(false),
  129 + recompress_flate_set(false),
  130 + compression_level(-1),
127 131 decode_level(qpdf_dl_generalized),
128 132 decode_level_set(false),
129 133 normalize_set(false),
... ... @@ -217,6 +221,9 @@ struct Options
217 221 qpdf_stream_data_e stream_data_mode;
218 222 bool compress_streams;
219 223 bool compress_streams_set;
  224 + bool recompress_flate;
  225 + bool recompress_flate_set;
  226 + int compression_level;
220 227 qpdf_stream_decode_level_e decode_level;
221 228 bool decode_level_set;
222 229 bool normalize_set;
... ... @@ -632,6 +639,8 @@ class ArgParser
632 639 void argCollate();
633 640 void argStreamData(char* parameter);
634 641 void argCompressStreams(char* parameter);
  642 + void argRecompressFlate();
  643 + void argCompressionLevel(char* parameter);
635 644 void argDecodeLevel(char* parameter);
636 645 void argNormalizeContent(char* parameter);
637 646 void argSuppressRecovery();
... ... @@ -847,6 +856,9 @@ ArgParser::initOptionTable()
847 856 &ArgParser::argStreamData, stream_data_choices);
848 857 (*t)["compress-streams"] = oe_requiredChoices(
849 858 &ArgParser::argCompressStreams, yn);
  859 + (*t)["recompress-flate"] = oe_bare(&ArgParser::argRecompressFlate);
  860 + (*t)["compression-level"] = oe_requiredParameter(
  861 + &ArgParser::argCompressionLevel, "level");
850 862 char const* decode_level_choices[] =
851 863 {"none", "generalized", "specialized", "all", 0};
852 864 (*t)["decode-level"] = oe_requiredChoices(
... ... @@ -1328,6 +1340,9 @@ ArgParser::argHelp()
1328 1340 << "--stream-data=option controls transformation of stream data (below)\n"
1329 1341 << "--compress-streams=[yn] controls whether to compress streams on output\n"
1330 1342 << "--decode-level=option controls how to filter streams from the input\n"
  1343 + << "--recompress-flate recompress streams already compressed with Flate\n"
  1344 + << "--compression-level=n set zlib compression level; most effective with\n"
  1345 + << " --recompress-flate --object-streams=generate\n"
1331 1346 << "--normalize-content=[yn] enables or disables normalization of content streams\n"
1332 1347 << "--object-streams=mode controls handing of object streams\n"
1333 1348 << "--preserve-unreferenced preserve unreferenced objects\n"
... ... @@ -1725,6 +1740,19 @@ ArgParser::argCompressStreams(char* parameter)
1725 1740 }
1726 1741  
// Handler registered in the option table for the bare
// --recompress-flate option. Marks the option as explicitly given and
// enabled; set_writer_options forwards the value to
// QPDFWriter::setRecompressFlate only when recompress_flate_set is
// true.
1727 1742 void
  1743 +ArgParser::argRecompressFlate()
  1744 +{
  1745 + o.recompress_flate_set = true;
  1746 + o.recompress_flate = true;
  1747 +}
  1748 +
  // Handler registered in the option table for
  // --compression-level=level. Converts the parameter to an int and
  // stores it in the options; the default of -1 means "not given", and
  // set_writer_options calls Pl_Flate::setCompressionLevel only when
  // the stored value is >= 0.
  // NOTE(review): no range validation is done here, although the
  // manual describes valid levels as 1 to 9. A negative argument is
  // silently ignored by the >= 0 test; how larger out-of-range values
  // and non-numeric input are handled depends on Pl_Flate/zlib and
  // QUtil::string_to_int -- confirm.
  1749 +void
  1750 +ArgParser::argCompressionLevel(char* parameter)
  1751 +{
  1752 + o.compression_level = QUtil::string_to_int(parameter);
  1753 +}
  1754 +
  1755 +void
1728 1756 ArgParser::argDecodeLevel(char* parameter)
1729 1757 {
1730 1758 o.decode_level_set = true;
... ... @@ -4889,6 +4917,10 @@ static void set_encryption_options(QPDF& pdf, Options& o, QPDFWriter& w)
4889 4917  
4890 4918 static void set_writer_options(QPDF& pdf, Options& o, QPDFWriter& w)
4891 4919 {
  4920 + if (o.compression_level >= 0)
  4921 + {
  4922 + Pl_Flate::setCompressionLevel(o.compression_level);
  4923 + }
4892 4924 if (o.qdf_mode)
4893 4925 {
4894 4926 w.setQDFMode(true);
... ... @@ -4913,6 +4945,10 @@ static void set_writer_options(QPDF& pdf, Options& o, QPDFWriter& w)
4913 4945 {
4914 4946 w.setCompressStreams(o.compress_streams);
4915 4947 }
  4948 + if (o.recompress_flate_set)
  4949 + {
  4950 + w.setRecompressFlate(o.recompress_flate);
  4951 + }
4916 4952 if (o.decode_level_set)
4917 4953 {
4918 4954 w.setDecodeLevel(o.decode_level);
... ...
qpdf/qtest/qpdf.test
... ... @@ -3876,8 +3876,20 @@ $td->runtest("convert inline-images to qdf",
3876 3876 compare_pdfs("inline-images.pdf", "a.pdf");
3877 3877  
3878 3878 show_ntests();
  3879 +# ----------
  3880 +$td->notify("--- Compression Level ---");
  3881 +$n_tests += 4;
3879 3882  
  3883 +check_pdf("recompress with level",
  3884 + "qpdf --static-id --recompress-flate --compression-level=9" .
  3885 + " --object-streams=generate minimal.pdf",
  3886 + "minimal-9.pdf", 0);
  3887 +check_pdf("recompress with level",
  3888 + "qpdf --static-id --recompress-flate --compression-level=1" .
  3889 + " --object-streams=generate minimal.pdf",
  3890 + "minimal-1.pdf", 0);
3880 3891  
  3892 +show_ntests();
3881 3893 # ----------
3882 3894 $td->notify("--- Specialized filtering Tests ---");
3883 3895 $n_tests += 3;
... ...
qpdf/qtest/qpdf/minimal-1.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/minimal-9.pdf 0 → 100644
No preview for this file type