Commit 2794bfb1a665cad93a38144bea0ba0daea7152e7
1 parent
dac0598b
Add flags to control zlib compression level (fixes #113)
Showing
9 changed files
with
178 additions
and
22 deletions
ChangeLog
| 1 | 1 | 2019-08-23 Jay Berkenbilt <ejb@ql.org> |
| 2 | 2 | |
| 3 | + * Add --recompress-flate option to qpdf and | |
| 4 | + QPDFWriter::setRecompressFlate to cause QPDFWriter to recompress | |
| 5 | + streams that are already compressed with /FlateDecode. | |
| 6 | + | |
| 3 | 7 | * Add option Pl_Flate::setCompressionLevel to globally set the |
| 4 | 8 | zlib compression level used by all Pl_Flate pipelines. |
| 5 | 9 | |
| 10 | + * Add --compression-level flag to qpdf to set the zlib compression | |
| 11 | + level. When combined with --recompress-flate, this will cause most | |
| 12 | + of qpdf's streams to use the maximum compression level. This | |
| 13 | + results in only a very small amount of savings in size that comes | |
| 14 | + at a fairly significant performance cost, but it could be useful | |
| 15 | + for archival files or other cases where every byte counts and | |
| 16 | + creation time doesn't matter so much. Note that using | |
| 17 | + --object-streams=generate in combination with these options gives | |
| 18 | + you the biggest advantage. Fixes #113. | |
| 19 | + | |
| 6 | 20 | 2019-08-22 Jay Berkenbilt <ejb@ql.org> |
| 7 | 21 | |
| 8 | 22 | * In QPDFObjectHandle::ParserCallbacks, in addition to | ... | ... |
include/qpdf/QPDFWriter.hh
| ... | ... | @@ -189,10 +189,11 @@ class QPDFWriter |
| 189 | 189 | // filters on the input. When combined with |
| 190 | 190 | // setCompressStreams(true), which is the default, the effect of this |
| 191 | 191 | // is that streams filtered with these older and less efficient |
| 192 | - // filters will be recompressed with the Flate filter. As a | |
| 193 | - // special case, if a stream is already compressed with | |
| 192 | + // filters will be recompressed with the Flate filter. By default, | |
| 193 | + // as a special case, if a stream is already compressed with | |
| 194 | 194 | // FlateDecode and setCompressStreams is enabled, the original |
| 195 | - // compressed data will be preserved. | |
| 195 | + // compressed data will be preserved. This behavior can be | |
| 196 | + // overridden by calling setRecompressFlate(true). | |
| 196 | 197 | // |
| 197 | 198 | // qpdf_dl_specialized: In addition to uncompressing the |
| 198 | 199 | // generalized compression formats, supported non-lossy |
| ... | ... | @@ -209,6 +210,15 @@ class QPDFWriter |
| 209 | 210 | QPDF_DLL |
| 210 | 211 | void setDecodeLevel(qpdf_stream_decode_level_e); |
| 211 | 212 | |
| 213 | + // By default, when both the input and output contents of a stream | |
| 214 | + // are compressed with Flate, qpdf does not uncompress and | |
| 215 | + // recompress the stream. Passing true here causes it to do so. | |
| 216 | + // This can be useful if recompressing all streams with a higher | |
| 217 | + // compression level, which can be set by calling the static | |
| 218 | + // method Pl_Flate::setCompressionLevel. | |
| 219 | + QPDF_DLL | |
| 220 | + void setRecompressFlate(bool); | |
| 221 | + | |
| 212 | 222 | // Set value of content stream normalization. The default is |
| 213 | 223 | // "false". If true, we attempt to normalize newlines inside of |
| 214 | 224 | // content streams. Some constructs such as inline images may |
| ... | ... | @@ -597,6 +607,7 @@ class QPDFWriter |
| 597 | 607 | bool compress_streams_set; |
| 598 | 608 | qpdf_stream_decode_level_e stream_decode_level; |
| 599 | 609 | bool stream_decode_level_set; |
| 610 | + bool recompress_flate; | |
| 600 | 611 | bool qdf_mode; |
| 601 | 612 | bool preserve_unreferenced_objects; |
| 602 | 613 | bool newline_before_endstream; | ... | ... |
libqpdf/QPDFWriter.cc
| ... | ... | @@ -37,6 +37,7 @@ QPDFWriter::Members::Members(QPDF& pdf) : |
| 37 | 37 | compress_streams_set(false), |
| 38 | 38 | stream_decode_level(qpdf_dl_none), |
| 39 | 39 | stream_decode_level_set(false), |
| 40 | + recompress_flate(false), | |
| 40 | 41 | qdf_mode(false), |
| 41 | 42 | preserve_unreferenced_objects(false), |
| 42 | 43 | newline_before_endstream(false), |
| ... | ... | @@ -207,6 +208,12 @@ QPDFWriter::setDecodeLevel(qpdf_stream_decode_level_e val) |
| 207 | 208 | } |
| 208 | 209 | |
| 209 | 210 | void |
| 211 | +QPDFWriter::setRecompressFlate(bool val) | |
| 212 | +{ | |
| 213 | + this->m->recompress_flate = val; | |
| 214 | +} | |
| 215 | + | |
| 216 | +void | |
| 210 | 217 | QPDFWriter::setContentNormalization(bool val) |
| 211 | 218 | { |
| 212 | 219 | this->m->normalize_content_set = true; |
| ... | ... | @@ -1716,13 +1723,14 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level, |
| 1716 | 1723 | if (this->m->compress_streams) |
| 1717 | 1724 | { |
| 1718 | 1725 | // Don't filter if the stream is already compressed with |
| 1719 | - // FlateDecode. We don't want to make it worse by getting | |
| 1720 | - // rid of a predictor or otherwise messing with it. We | |
| 1721 | - // should also avoid messing with anything that's | |
| 1722 | - // compressed with a lossy compression scheme, but we | |
| 1723 | - // don't support any of those right now. | |
| 1726 | + // FlateDecode. This way we don't make it worse if the | |
| 1727 | + // original file used a better Flate algorithm, and we | |
| 1728 | + // don't spend time and CPU cycles uncompressing and | |
| 1729 | + // recompressing stuff. This can be overridden with | |
| 1730 | + // setRecompressFlate(true). | |
| 1724 | 1731 | QPDFObjectHandle filter_obj = stream_dict.getKey("/Filter"); |
| 1725 | - if ((! object.isDataModified()) && | |
| 1732 | + if ((! this->m->recompress_flate) && | |
| 1733 | + (! object.isDataModified()) && | |
| 1726 | 1734 | filter_obj.isName() && |
| 1727 | 1735 | ((filter_obj.getName() == "/FlateDecode") || |
| 1728 | 1736 | (filter_obj.getName() == "/Fl"))) | ... | ... |
manual/build.mk
| ... | ... | @@ -26,7 +26,8 @@ endif |
| 26 | 26 | |
| 27 | 27 | $(OUTDOC).pdf: $(OUTDOC).fo qpdf/build/qpdf |
| 28 | 28 | $(FOP) $< -pdf $@.tmp |
| 29 | - qpdf/build/qpdf --linearize $@.tmp $@ | |
| 29 | + qpdf/build/qpdf --linearize --object-streams=generate \ | |
| 30 | + --recompress-flate --compression-level=9 $@.tmp $@ | |
| 30 | 31 | |
| 31 | 32 | $(OUTDOC).html: $(INDOC).xml manual/html.xsl $(VALIDATE) |
| 32 | 33 | $(XSLTPROC) --output $@ manual/html.xsl $< | ... | ... |
manual/qpdf-manual.xml
| ... | ... | @@ -1433,27 +1433,32 @@ outfile.pdf</option> |
| 1433 | 1433 | <listitem> |
| 1434 | 1434 | <para> |
| 1435 | 1435 | <option>generalized</option>: decode streams filtered with |
| 1436 | - supported generalized filters: <option>/LZWDecode</option>, | |
| 1437 | - <option>/FlateDecode</option>, | |
| 1438 | - <option>/ASCII85Decode</option>, and | |
| 1439 | - <option>/ASCIIHexDecode</option>. We define generalized | |
| 1436 | + supported generalized filters: | |
| 1437 | + <literal>/LZWDecode</literal>, | |
| 1438 | + <literal>/FlateDecode</literal>, | |
| 1439 | + <literal>/ASCII85Decode</literal>, and | |
| 1440 | + <literal>/ASCIIHexDecode</literal>. We define generalized | |
| 1440 | 1441 | filters as those to be used for general-purpose compression |
| 1441 | 1442 | or encoding, as opposed to filters specifically designed |
| 1442 | - for image data. | |
| 1443 | + for image data. Note that, by default, streams already | |
| 1444 | + compressed with <literal>/FlateDecode</literal> are not | |
| 1445 | + uncompressed and recompressed unless you also specify | |
| 1446 | + <option>--recompress-flate</option>. | |
| 1443 | 1447 | </para> |
| 1444 | 1448 | </listitem> |
| 1445 | 1449 | <listitem> |
| 1446 | 1450 | <para> |
| 1447 | 1451 | <option>specialized</option>: in addition to generalized, |
| 1448 | 1452 | decode streams with supported non-lossy specialized |
| 1449 | - filters; currently this is just <option>/RunLengthDecode</option> | |
| 1453 | + filters; currently this is just | |
| 1454 | + <literal>/RunLengthDecode</literal> | |
| 1450 | 1455 | </para> |
| 1451 | 1456 | </listitem> |
| 1452 | 1457 | <listitem> |
| 1453 | 1458 | <para> |
| 1454 | 1459 | <option>all</option>: in addition to generalized and |
| 1455 | 1460 | specialized, decode streams with supported lossy filters; |
| 1456 | - currently this is just <option>/DCTDecode</option> (JPEG) | |
| 1461 | + currently this is just <literal>/DCTDecode</literal> (JPEG) | |
| 1457 | 1462 | </para> |
| 1458 | 1463 | </listitem> |
| 1459 | 1464 | </itemizedlist> |
| ... | ... | @@ -1476,7 +1481,10 @@ outfile.pdf</option> |
| 1476 | 1481 | <option>compress</option>: recompress stream data when |
| 1477 | 1482 | possible (default); equivalent to |
| 1478 | 1483 | <option>--compress-streams=y</option> |
| 1479 | - <option>--decode-level=generalized</option> | |
| 1484 | + <option>--decode-level=generalized</option>. Does not | |
| 1485 | + recompress streams already compressed with | |
| 1486 | + <literal>/FlateDecode</literal> unless | |
| 1487 | + <option>--recompress-flate</option> is also specified. | |
| 1480 | 1488 | </para> |
| 1481 | 1489 | </listitem> |
| 1482 | 1490 | <listitem> |
| ... | ... | @@ -1499,6 +1507,37 @@ outfile.pdf</option> |
| 1499 | 1507 | </listitem> |
| 1500 | 1508 | </varlistentry> |
| 1501 | 1509 | <varlistentry> |
| 1510 | + <term><option>--recompress-flate</option></term> | |
| 1511 | + <listitem> | |
| 1512 | + <para> | |
| 1513 | + By default, streams already compressed with | |
| 1514 | + <literal>/FlateDecode</literal> are left alone rather than | |
| 1515 | + being uncompressed and recompressed. This option causes qpdf | |
| 1516 | + to uncompress and recompress the streams. There is a | |
| 1517 | + significant performance cost to using this option, but you | |
| 1518 | + probably want to use it if you specify | |
| 1519 | + <option>--compression-level</option>. | |
| 1520 | + </para> | |
| 1521 | + </listitem> | |
| 1522 | + </varlistentry> | |
| 1523 | + <varlistentry> | |
| 1524 | + <term><option>--compression-level=<replaceable>level</replaceable></option></term> | |
| 1525 | + <listitem> | |
| 1526 | + <para> | |
| 1527 | + When writing new streams that are compressed with | |
| 1528 | + <literal>/FlateDecode</literal>, use the specified compression | |
| 1529 | + level. The value of <option>level</option> should be a number | |
| 1530 | + from 1 to 9 and is passed directly to zlib, which implements | |
| 1531 | + deflate compression. Note that qpdf doesn't uncompress and | |
| 1532 | + recompress streams by default. To have this option apply to | |
| 1533 | + already compressed streams, you should also specify | |
| 1534 | + <option>--recompress-flate</option>. If your goal is to shrink | |
| 1535 | + the size of PDF files, you should also use | |
| 1536 | + <option>--object-streams=generate</option>. | |
| 1537 | + </para> | |
| 1538 | + </listitem> | |
| 1539 | + </varlistentry> | |
| 1540 | + <varlistentry> | |
| 1502 | 1541 | <term><option>--normalize-content=[yn]</option></term> |
| 1503 | 1542 | <listitem> |
| 1504 | 1543 | <para> |
| ... | ... | @@ -4449,7 +4488,7 @@ print "\n"; |
| 4449 | 4488 | </listitem> |
| 4450 | 4489 | <listitem> |
| 4451 | 4490 | <para> |
| 4452 | - Library Enhancements | |
| 4491 | + Library and CLI Enhancements | |
| 4453 | 4492 | </para> |
| 4454 | 4493 | <itemizedlist> |
| 4455 | 4494 | <listitem> |
| ... | ... | @@ -4510,6 +4549,41 @@ print "\n"; |
| 4510 | 4549 | </listitem> |
| 4511 | 4550 | <listitem> |
| 4512 | 4551 | <para> |
| 4552 | + Static method | |
| 4553 | + <function>Pl_Flate::setCompressionLevel</function> can be | |
| 4554 | + called to set the zlib compression level globally used by | |
| 4555 | + all instances of Pl_Flate in deflate mode. | |
| 4556 | + </para> | |
| 4557 | + </listitem> | |
| 4558 | + <listitem> | |
| 4559 | + <para> | |
| 4560 | + The method | |
| 4561 | + <function>QPDFWriter::setRecompressFlate</function> can be | |
| 4562 | + called to tell <classname>QPDFWriter</classname> to | |
| 4563 | + uncompress and recompress streams already compressed with | |
| 4564 | + <literal>/FlateDecode</literal>. | |
| 4565 | + </para> | |
| 4566 | + </listitem> | |
| 4567 | + <listitem> | |
| 4568 | + <para> | |
| 4569 | + CLI enhancement: the <option>--recompress-flate</option> | |
| 4570 | + option instructs <command>qpdf</command> to recompress streams that | |
| 4571 | + are already compressed with <literal>/FlateDecode</literal>. | |
| 4572 | + Useful with <option>--compression-level</option>. | |
| 4573 | + </para> | |
| 4574 | + </listitem> | |
| 4575 | + <listitem> | |
| 4576 | + <para> | |
| 4577 | + CLI enhancement: the | |
| 4578 | + <option>--compression-level=<replaceable>level</replaceable></option> | |
| 4579 | + option sets the zlib compression level used for any streams | |
| 4580 | + compressed by <literal>/FlateDecode</literal>. Most | |
| 4581 | + effective when combined with | |
| 4582 | + <option>--recompress-flate</option>. | |
| 4583 | + </para> | |
| 4584 | + </listitem> | |
| 4585 | + <listitem> | |
| 4586 | + <para> | |
| 4513 | 4587 | The underlying implementation of QPDF arrays has been |
| 4514 | 4588 | enhanced to be much more memory efficient when dealing with |
| 4515 | 4589 | arrays with lots of nulls. This enables qpdf to use |
| ... | ... | @@ -5699,9 +5773,9 @@ print "\n"; |
| 5699 | 5773 | <listitem> |
| 5700 | 5774 | <para> |
| 5701 | 5775 | Disregard data check errors when uncompressing |
| 5702 | - <option>/FlateDecode</option> streams. This is consistent with | |
| 5703 | - most other PDF readers and allows qpdf to recover data from | |
| 5704 | - another class of malformed PDF files. | |
| 5776 | + <literal>/FlateDecode</literal> streams. This is consistent | |
| 5777 | + with most other PDF readers and allows qpdf to recover data | |
| 5778 | + from another class of malformed PDF files. | |
| 5705 | 5779 | </para> |
| 5706 | 5780 | </listitem> |
| 5707 | 5781 | <listitem> | ... | ... |
qpdf/qpdf.cc
| ... | ... | @@ -13,6 +13,7 @@ |
| 13 | 13 | #include <qpdf/Pl_Discard.hh> |
| 14 | 14 | #include <qpdf/Pl_DCT.hh> |
| 15 | 15 | #include <qpdf/Pl_Count.hh> |
| 16 | +#include <qpdf/Pl_Flate.hh> | |
| 16 | 17 | #include <qpdf/PointerHolder.hh> |
| 17 | 18 | |
| 18 | 19 | #include <qpdf/QPDF.hh> |
| ... | ... | @@ -124,6 +125,9 @@ struct Options |
| 124 | 125 | stream_data_mode(qpdf_s_compress), |
| 125 | 126 | compress_streams(true), |
| 126 | 127 | compress_streams_set(false), |
| 128 | + recompress_flate(false), | |
| 129 | + recompress_flate_set(false), | |
| 130 | + compression_level(-1), | |
| 127 | 131 | decode_level(qpdf_dl_generalized), |
| 128 | 132 | decode_level_set(false), |
| 129 | 133 | normalize_set(false), |
| ... | ... | @@ -217,6 +221,9 @@ struct Options |
| 217 | 221 | qpdf_stream_data_e stream_data_mode; |
| 218 | 222 | bool compress_streams; |
| 219 | 223 | bool compress_streams_set; |
| 224 | + bool recompress_flate; | |
| 225 | + bool recompress_flate_set; | |
| 226 | + int compression_level; | |
| 220 | 227 | qpdf_stream_decode_level_e decode_level; |
| 221 | 228 | bool decode_level_set; |
| 222 | 229 | bool normalize_set; |
| ... | ... | @@ -632,6 +639,8 @@ class ArgParser |
| 632 | 639 | void argCollate(); |
| 633 | 640 | void argStreamData(char* parameter); |
| 634 | 641 | void argCompressStreams(char* parameter); |
| 642 | + void argRecompressFlate(); | |
| 643 | + void argCompressionLevel(char* parameter); | |
| 635 | 644 | void argDecodeLevel(char* parameter); |
| 636 | 645 | void argNormalizeContent(char* parameter); |
| 637 | 646 | void argSuppressRecovery(); |
| ... | ... | @@ -847,6 +856,9 @@ ArgParser::initOptionTable() |
| 847 | 856 | &ArgParser::argStreamData, stream_data_choices); |
| 848 | 857 | (*t)["compress-streams"] = oe_requiredChoices( |
| 849 | 858 | &ArgParser::argCompressStreams, yn); |
| 859 | + (*t)["recompress-flate"] = oe_bare(&ArgParser::argRecompressFlate); | |
| 860 | + (*t)["compression-level"] = oe_requiredParameter( | |
| 861 | + &ArgParser::argCompressionLevel, "level"); | |
| 850 | 862 | char const* decode_level_choices[] = |
| 851 | 863 | {"none", "generalized", "specialized", "all", 0}; |
| 852 | 864 | (*t)["decode-level"] = oe_requiredChoices( |
| ... | ... | @@ -1328,6 +1340,9 @@ ArgParser::argHelp() |
| 1328 | 1340 | << "--stream-data=option controls transformation of stream data (below)\n" |
| 1329 | 1341 | << "--compress-streams=[yn] controls whether to compress streams on output\n" |
| 1330 | 1342 | << "--decode-level=option controls how to filter streams from the input\n" |
| 1343 | + << "--recompress-flate recompress streams already compressed with Flate\n" | |
| 1344 | + << "--compression-level=n set zlib compression level; most effective with\n" | |
| 1345 | + << " --recompress-flate --object-streams=generate\n" | |
| 1331 | 1346 | << "--normalize-content=[yn] enables or disables normalization of content streams\n" |
| 1332 | 1347 | << "--object-streams=mode controls handing of object streams\n" |
| 1333 | 1348 | << "--preserve-unreferenced preserve unreferenced objects\n" |
| ... | ... | @@ -1725,6 +1740,19 @@ ArgParser::argCompressStreams(char* parameter) |
| 1725 | 1740 | } |
| 1726 | 1741 | |
| 1727 | 1742 | void |
| 1743 | +ArgParser::argRecompressFlate() | |
| 1744 | +{ | |
| 1745 | + o.recompress_flate_set = true; | |
| 1746 | + o.recompress_flate = true; | |
| 1747 | +} | |
| 1748 | + | |
| 1749 | +void | |
| 1750 | +ArgParser::argCompressionLevel(char* parameter) | |
| 1751 | +{ | |
| 1752 | + o.compression_level = QUtil::string_to_int(parameter); | |
| 1753 | +} | |
| 1754 | + | |
| 1755 | +void | |
| 1728 | 1756 | ArgParser::argDecodeLevel(char* parameter) |
| 1729 | 1757 | { |
| 1730 | 1758 | o.decode_level_set = true; |
| ... | ... | @@ -4889,6 +4917,10 @@ static void set_encryption_options(QPDF& pdf, Options& o, QPDFWriter& w) |
| 4889 | 4917 | |
| 4890 | 4918 | static void set_writer_options(QPDF& pdf, Options& o, QPDFWriter& w) |
| 4891 | 4919 | { |
| 4920 | + if (o.compression_level >= 0) | |
| 4921 | + { | |
| 4922 | + Pl_Flate::setCompressionLevel(o.compression_level); | |
| 4923 | + } | |
| 4892 | 4924 | if (o.qdf_mode) |
| 4893 | 4925 | { |
| 4894 | 4926 | w.setQDFMode(true); |
| ... | ... | @@ -4913,6 +4945,10 @@ static void set_writer_options(QPDF& pdf, Options& o, QPDFWriter& w) |
| 4913 | 4945 | { |
| 4914 | 4946 | w.setCompressStreams(o.compress_streams); |
| 4915 | 4947 | } |
| 4948 | + if (o.recompress_flate_set) | |
| 4949 | + { | |
| 4950 | + w.setRecompressFlate(o.recompress_flate); | |
| 4951 | + } | |
| 4916 | 4952 | if (o.decode_level_set) |
| 4917 | 4953 | { |
| 4918 | 4954 | w.setDecodeLevel(o.decode_level); | ... | ... |
qpdf/qtest/qpdf.test
| ... | ... | @@ -3876,8 +3876,20 @@ $td->runtest("convert inline-images to qdf", |
| 3876 | 3876 | compare_pdfs("inline-images.pdf", "a.pdf"); |
| 3877 | 3877 | |
| 3878 | 3878 | show_ntests(); |
| 3879 | +# ---------- | |
| 3880 | +$td->notify("--- Compression Level ---"); | |
| 3881 | +$n_tests += 4; | |
| 3879 | 3882 | |
| 3883 | +check_pdf("recompress with level", | |
| 3884 | + "qpdf --static-id --recompress-flate --compression-level=9" . | |
| 3885 | + " --object-streams=generate minimal.pdf", | |
| 3886 | + "minimal-9.pdf", 0); | |
| 3887 | +check_pdf("recompress with level", | |
| 3888 | + "qpdf --static-id --recompress-flate --compression-level=1" . | |
| 3889 | + " --object-streams=generate minimal.pdf", | |
| 3890 | + "minimal-1.pdf", 0); | |
| 3880 | 3891 | |
| 3892 | +show_ntests(); | |
| 3881 | 3893 | # ---------- |
| 3882 | 3894 | $td->notify("--- Specialized filtering Tests ---"); |
| 3883 | 3895 | $n_tests += 3; | ... | ... |
qpdf/qtest/qpdf/minimal-1.pdf
0 → 100644
No preview for this file type
qpdf/qtest/qpdf/minimal-9.pdf
0 → 100644
No preview for this file type