Commit 88a62f78fd2e7f30409ded65eb9565c28c18388f

Authored by m-holger
Committed by GitHub
2 parents 885674ab c97da2df

Merge pull request #1550 from m-holger/writer

Refactor QPDFWriter
libqpdf/QPDFWriter.cc
... ... @@ -22,6 +22,7 @@
22 22 #include <concepts>
23 23 #include <cstdlib>
24 24 #include <stdexcept>
  25 +#include <tuple>
25 26  
26 27 using namespace std::literals;
27 28 using namespace qpdf;
... ... @@ -258,7 +259,76 @@ Pl_stack::Popper::pop()
258 259 stack = nullptr;
259 260 }
260 261  
261   -class QPDFWriter::Members
  262 +// Writer class is restricted to QPDFWriter so that only it can call certain methods.
        // It stores a reference to the QPDF being written and every method below simply forwards to
        // a member function of that QPDF. QPDFWriter::Members derives from this class and passes its
        // QPDF to the constructor.
  263 +class QPDF::Writer
  264 +{
  265 + friend class QPDFWriter;
        // Declared before any access specifier, so the constructor is private; access is granted
        // through the friend declaration above.
  266 + Writer(QPDF& pdf) :
  267 + pdf(pdf)
  268 + {
  269 + }
  270 +
  271 + protected:
        // Forwards to QPDF::optimize with the writer's object table and the per-stream callback.
  272 + void
  273 + optimize(
  274 + QPDFWriter::ObjTable const& obj,
  275 + std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
  276 + {
  277 + pdf.optimize(obj, skip_stream_parameters);
  278 + }
  279 +
        // Forwards to QPDF::getLinearizedParts. part4..part9 are taken by non-const reference;
        // presumably they are filled by the callee (not visible here).
  280 + void
  281 + getLinearizedParts(
  282 + QPDFWriter::ObjTable const& obj,
  283 + std::vector<QPDFObjectHandle>& part4,
  284 + std::vector<QPDFObjectHandle>& part6,
  285 + std::vector<QPDFObjectHandle>& part7,
  286 + std::vector<QPDFObjectHandle>& part8,
  287 + std::vector<QPDFObjectHandle>& part9)
  288 + {
  289 + pdf.getLinearizedParts(obj, part4, part6, part7, part8, part9);
  290 + }
  291 +
        // Forwards to QPDF::generateHintStream. hint_stream, S and O are taken by non-const
        // reference; presumably they are outputs of the callee (not visible here).
  292 + void
  293 + generateHintStream(
  294 + QPDFWriter::NewObjTable const& new_obj,
  295 + QPDFWriter::ObjTable const& obj,
  296 + std::string& hint_stream,
  297 + int& S,
  298 + int& O,
  299 + bool compressed)
  300 + {
  301 + pdf.generateHintStream(new_obj, obj, hint_stream, S, O, compressed);
  302 + }
  303 +
        // Returns the result of QPDF::getCompressibleObjVector().
  304 + std::vector<QPDFObjGen>
  305 + getCompressibleObjGens()
  306 + {
  307 + return pdf.getCompressibleObjVector();
  308 + }
  309 +
        // Returns the result of QPDF::getCompressibleObjSet().
  310 + std::vector<bool>
  311 + getCompressibleObjSet()
  312 + {
  313 + return pdf.getCompressibleObjSet();
  314 + }
  315 +
        // Returns QPDF::getXRefTableInternal() by const reference (no copy is made here).
  316 + std::map<QPDFObjGen, QPDFXRefEntry> const&
  317 + getXRefTable()
  318 + {
  319 + return pdf.getXRefTableInternal();
  320 + }
  321 +
        // Returns the result of QPDF::tableSize().
  322 + size_t
  323 + tableSize()
  324 + {
  325 + return pdf.tableSize();
  326 + }
  327 +
        // The document being written. Protected, so the derived QPDFWriter::Members uses this
        // reference instead of keeping its own `pdf` member.
  328 + QPDF& pdf;
  329 +};
  330 +
  331 +class QPDFWriter::Members: QPDF::Writer
262 332 {
263 333 friend class QPDFWriter;
264 334  
... ... @@ -273,8 +343,8 @@ class QPDFWriter::Members
273 343 enum trailer_e { t_normal, t_lin_first, t_lin_second };
274 344  
275 345 Members(QPDFWriter& w, QPDF& pdf) :
  346 + QPDF::Writer(pdf),
276 347 w(w),
277   - pdf(pdf),
278 348 root_og(
279 349 pdf.getRoot().getObjGen().isIndirect() ? pdf.getRoot().getObjGen() : QPDFObjGen(-1, 0)),
280 350 pipeline_stack(pipeline)
... ... @@ -323,13 +393,14 @@ class QPDFWriter::Members
323 393 void enqueueObjectsPCLm();
324 394 void enqueuePart(std::vector<QPDFObjectHandle>& part);
325 395 void assignCompressedObjectNumbers(QPDFObjGen og);
326   - QPDFObjectHandle getTrimmedTrailer();
  396 + Dictionary trimmed_trailer();
327 397  
328   - bool willFilterStream(
329   - QPDFObjectHandle stream,
330   - bool& compress_stream,
331   - bool& is_metadata,
332   - std::string* stream_data);
  398 + // Returns tuple<filter, compress_stream, is_root_metadata>
  399 + std::tuple<const bool, const bool, const bool>
  400 + will_filter_stream(QPDFObjectHandle stream, std::string* stream_data);
  401 +
  402 + // Test whether stream would be filtered if it were written.
  403 + bool will_filter_stream(QPDFObjectHandle stream);
333 404 unsigned int bytesNeeded(long long n);
334 405 void writeBinary(unsigned long long val, unsigned int bytes);
335 406 Members& write(std::string_view str);
... ... @@ -409,7 +480,6 @@ class QPDFWriter::Members
409 480  
410 481 private:
411 482 QPDFWriter& w;
412   - QPDF& pdf;
413 483 QPDFObjGen root_og{-1, 0};
414 484 char const* filename{"unspecified"};
415 485 FILE* file{nullptr};
... ... @@ -1375,7 +1445,7 @@ void
1375 1445 QPDFWriter::Members::writeTrailer(
1376 1446 trailer_e which, int size, bool xref_stream, qpdf_offset_t prev, int linearization_pass)
1377 1447 {
1378   - QPDFObjectHandle trailer = getTrimmedTrailer();
  1448 + auto trailer = trimmed_trailer();
1379 1449 if (xref_stream) {
1380 1450 cur_data_key.clear();
1381 1451 } else {
... ... @@ -1385,7 +1455,7 @@ QPDFWriter::Members::writeTrailer(
1385 1455 if (which == t_lin_second) {
1386 1456 write(" /Size ").write(size);
1387 1457 } else {
1388   - for (auto const& [key, value]: trailer.as_dictionary()) {
  1458 + for (auto const& [key, value]: trailer) {
1389 1459 if (value.null()) {
1390 1460 continue;
1391 1461 }
... ... @@ -1439,97 +1509,84 @@ QPDFWriter::Members::writeTrailer(
1439 1509 }
1440 1510  
1441 1511 bool
1442   -QPDFWriter::Members::willFilterStream(
1443   - QPDFObjectHandle stream,
1444   - bool& compress_stream, // out only
1445   - bool& is_root_metadata, // out only
1446   - std::string* stream_data)
1447   -{
1448   - compress_stream = false;
1449   - is_root_metadata = false;
1450   -
1451   - QPDFObjGen old_og = stream.getObjGen();
1452   - QPDFObjectHandle stream_dict = stream.getDict();
1453   -
1454   - if (stream.isRootMetadata()) {
1455   - is_root_metadata = true;
1456   - }
1457   - bool filter = stream.isDataModified() || compress_streams || stream_decode_level;
1458   - bool filter_on_write = stream.getFilterOnWrite();
1459   - if (!filter_on_write) {
1460   - filter = false;
1461   - }
1462   - if (filter_on_write && compress_streams) {
1463   - // Don't filter if the stream is already compressed with FlateDecode. This way we don't make
1464   - // it worse if the original file used a better Flate algorithm, and we don't spend time and
1465   - // CPU cycles uncompressing and recompressing stuff. This can be overridden with
1466   - // setRecompressFlate(true).
1467   - QPDFObjectHandle filter_obj = stream_dict.getKey("/Filter");
1468   - if (!recompress_flate && !stream.isDataModified() && filter_obj.isName() &&
1469   - (filter_obj.getName() == "/FlateDecode" || filter_obj.getName() == "/Fl")) {
1470   - filter = false;
  // Convenience overload: runs the two-argument will_filter_stream() and returns only the first
  // element of its result tuple, discarding the compress_stream and is_root_metadata elements.
  // NOTE(review): a local std::string is passed as stream_data, so the two-argument overload
  // activates the pipeline stack on that string; the previous caller in writeLinearized passed
  // nullptr instead. Confirm that buffering the data here is intended.
  1512 +QPDFWriter::Members::will_filter_stream(QPDFObjectHandle stream)
  1513 +{
  1514 + std::string s;
  1515 + [[maybe_unused]] auto [filter, ignore1, ignore2] = will_filter_stream(stream, &s);
  1516 + return filter;
  1517 +}
  1518 +
  // Decide how `stream` is to be filtered and pipe its data through the pipeline stack.
  // Returns {filtered, compress_stream, is_root_metadata}:
  //   filtered         - true only when stream.pipeStreamData() returned true;
  //   compress_stream  - the qpdf_ef_compress bit of the encode flags chosen below (false whenever
  //                      filtered is false);
  //   is_root_metadata - the value of stream.isRootMetadata().
  // If stream_data is non-null the pipeline stack is activated on *stream_data, otherwise it is
  // activated with activate(true). When a filtering attempt does not succeed,
  // stream.setFilterOnWrite(false) is called and *stream_data (if any) is cleared before retrying.
  // Throws std::runtime_error if piping fails on an attempt that was not a first, filtered attempt.
  1519 +std::tuple<const bool, const bool, const bool>
  1520 +QPDFWriter::Members::will_filter_stream(QPDFObjectHandle stream, std::string* stream_data)
  1521 +{
  1522 + const bool is_root_metadata = stream.isRootMetadata();
  1523 + bool filter = false;
  1524 + auto decode_level = stream_decode_level;
  1525 + int encode_flags = 0;
  1526 + Dictionary stream_dict = stream.getDict();
  1527 +
        // The filtering decisions in this block are only made when stream.getFilterOnWrite() is true.
  1528 + if (stream.getFilterOnWrite()) {
  1529 + filter = stream.isDataModified() || compress_streams || decode_level != qpdf_dl_none;
  1530 + if (compress_streams) {
  1531 + // Don't filter if the stream is already compressed with FlateDecode. This way we don't
  1532 + // make it worse if the original file used a better Flate algorithm, and we don't spend
  1533 + // time and CPU cycles uncompressing and recompressing stuff. This can be overridden
  1534 + // with setRecompressFlate(true).
  1535 + Name Filter = stream_dict["/Filter"];
  1536 + if (Filter && !recompress_flate && !stream.isDataModified() &&
  1537 + (Filter == "/FlateDecode" || Filter == "/Fl")) {
  1538 + filter = false;
  1539 + }
  1540 + }
        // Priority order: root metadata (when there is no encryption, or getEncryptMetadata() is
        // false) is always filtered with decode level qpdf_dl_all; otherwise a stream found in
        // normalized_streams (with normalize_content set) is filtered with qpdf_ef_normalize;
        // otherwise qpdf_ef_compress is requested if filtering is still on and compress_streams is
        // set.
  1541 + if (is_root_metadata && (!encryption || !encryption->getEncryptMetadata())) {
  1542 + filter = true;
  1543 + decode_level = qpdf_dl_all;
  1544 + } else if (normalize_content && normalized_streams.contains(stream)) {
  1545 + encode_flags = qpdf_ef_normalize;
  1546 + filter = true;
  1547 + } else if (filter && compress_streams) {
  1548 + encode_flags = qpdf_ef_compress;
1471 1549 }
1472   - }
1473   - bool normalize = false;
1474   - bool uncompress = false;
1475   - if (filter_on_write && is_root_metadata && (!encryption || !encryption->getEncryptMetadata())) {
1476   - filter = true;
1477   - compress_stream = false;
1478   - uncompress = true;
1479   - } else if (filter_on_write && normalize_content && normalized_streams.contains(old_og)) {
1480   - normalize = true;
1481   - filter = true;
1482   - } else if (filter_on_write && filter && compress_streams) {
1483   - compress_stream = true;
1484 1550 }
1485 1551  
1486 1552 // Disable compression for empty streams to improve compatibility
        // Note that this check runs regardless of getFilterOnWrite(): it turns filtering on and
        // clears all encode flags (decode_level is left as chosen above).
1487   - if (stream_dict.getKey("/Length").isInteger() &&
1488   - stream_dict.getKey("/Length").getIntValue() == 0) {
  1553 + if (Integer(stream_dict["/Length"]) == 0) {
1489 1554 filter = true;
1490   - compress_stream = false;
  1555 + encode_flags = 0;
1491 1556 }
1492 1557  
1493   - bool filtered = false;
        // At most two passes: the first uses the settings chosen above; if that pass was a filtering
        // attempt and it did not succeed, the second pass runs with filtering disabled.
1494 1558 for (bool first_attempt: {true, false}) {
1495 1559 auto pp_stream_data =
1496 1560 stream_data ? pipeline_stack.activate(*stream_data) : pipeline_stack.activate(true);
1497 1561  
1498 1562 try {
1499   - filtered = stream.pipeStreamData(
1500   - pipeline,
1501   - !filter ? 0
1502   - : ((normalize ? qpdf_ef_normalize : 0) |
1503   - (compress_stream ? qpdf_ef_compress : 0)),
1504   - !filter ? qpdf_dl_none : (uncompress ? qpdf_dl_all : stream_decode_level),
1505   - false,
1506   - first_attempt);
1507   - if (filter && !filtered) {
1508   - // Try again
1509   - filter = false;
1510   - stream.setFilterOnWrite(false);
1511   - } else {
        // Success path: report filtered == true together with the compress bit of encode_flags.
  1563 + if (stream.pipeStreamData(
  1564 + pipeline,
  1565 + filter ? encode_flags : 0,
  1566 + filter ? decode_level : qpdf_dl_none,
  1567 + false,
  1568 + first_attempt)) {
  1569 + return {true, encode_flags & qpdf_ef_compress, is_root_metadata};
  1570 + }
        // pipeStreamData() returned false without filtering requested: nothing left to retry.
  1571 + if (!filter) {
1512 1572 break;
1513 1573 }
1514 1574 } catch (std::runtime_error& e) {
1515   - if (filter && first_attempt) {
1516   - stream.warn("error while getting stream data: "s + e.what());
1517   - stream.warn("qpdf will attempt to write the damaged stream unchanged");
1518   - filter = false;
1519   - stream.setFilterOnWrite(false);
1520   - continue;
        // Only an error on the first, filtered attempt is downgraded to warnings and retried;
        // any other error is rethrown with the stream's unparse() text in the message.
  1575 + if (!(filter && first_attempt)) {
  1576 + throw std::runtime_error(
  1577 + "error while getting stream data for " + stream.unparse() + ": " + e.what());
1521 1578 }
1522   - throw std::runtime_error(
1523   - "error while getting stream data for " + stream.unparse() + ": " + e.what());
  1579 + stream.warn("error while getting stream data: "s + e.what());
  1580 + stream.warn("qpdf will attempt to write the damaged stream unchanged");
1524 1581 }
  1582 + // Try again
  1583 + filter = false;
  1584 + stream.setFilterOnWrite(false);
1525 1585 if (stream_data) {
1526 1586 stream_data->clear();
1527 1587 }
1528 1588 }
1529   - if (!filtered) {
1530   - compress_stream = false;
1531   - }
1532   - return filtered;
  1589 + return {false, false, is_root_metadata};
1533 1590 }
1534 1591  
1535 1592 void
... ... @@ -1724,16 +1781,15 @@ QPDFWriter::Members::unparseObject(
1724 1781 }
1725 1782  
1726 1783 flags |= f_stream;
1727   - bool compress_stream = false;
1728   - bool is_metadata = false;
1729 1784 std::string stream_data;
1730   - if (willFilterStream(object, compress_stream, is_metadata, &stream_data)) {
  1785 + auto [filter, compress_stream, is_root_metadata] = will_filter_stream(object, &stream_data);
  1786 + if (filter) {
1731 1787 flags |= f_filtered;
1732 1788 }
1733 1789 QPDFObjectHandle stream_dict = object.getDict();
1734 1790  
1735 1791 cur_stream_length = stream_data.size();
1736   - if (is_metadata && encryption && !encryption->getEncryptMetadata()) {
  1792 + if (is_root_metadata && encryption && !encryption->getEncryptMetadata()) {
1737 1793 // Don't encrypt stream data for the metadata stream
1738 1794 cur_data_key.clear();
1739 1795 }
... ... @@ -2089,7 +2145,7 @@ QPDFWriter::Members::initializeSpecialStreams()
2089 2145 void
2090 2146 QPDFWriter::Members::preserveObjectStreams()
2091 2147 {
2092   - auto const& xref = QPDF::Writer::getXRefTable(pdf);
  2148 + auto const& xref = getXRefTable();
2093 2149 // Our object_to_object_stream map has to map ObjGen -> ObjGen since we may be generating object
2094 2150 // streams out of old objects that have generation numbers greater than zero. However in an
2095 2151 // existing PDF, all object stream objects and all objects in them must have generation 0
... ... @@ -2114,7 +2170,7 @@ QPDFWriter::Members::preserveObjectStreams()
2114 2170 if (iter->second.getType() == 2) {
2115 2171 // Pdf contains object streams.
2116 2172 obj.streams_empty = false;
2117   - auto eligible = QPDF::Writer::getCompressibleObjSet(pdf);
  2173 + auto eligible = getCompressibleObjSet();
2118 2174 // The object pointed to by iter may be a previous generation, in which case it is
2119 2175 // removed by getCompressibleObjSet. We need to restart the loop (while the object
2120 2176 // table may contain multiple generations of an object).
... ... @@ -2145,7 +2201,7 @@ QPDFWriter::Members::generateObjectStreams()
2145 2201  
2146 2202 // This code doesn't do anything with /Extends.
2147 2203  
2148   - std::vector<QPDFObjGen> eligible = QPDF::Writer::getCompressibleObjGens(pdf);
  2204 + std::vector<QPDFObjGen> eligible = getCompressibleObjGens();
2149 2205 size_t n_object_streams = (eligible.size() + 99U) / 100U;
2150 2206  
2151 2207 initializeTables(2U * n_object_streams);
... ... @@ -2173,28 +2229,28 @@ QPDFWriter::Members::generateObjectStreams()
2173 2229 }
2174 2230 }
2175 2231  
2176   -QPDFObjectHandle
2177   -QPDFWriter::Members::getTrimmedTrailer()
// Returns a Dictionary built from pdf.getTrailer().unsafeShallowCopy() with the keys listed below
// erased. Callers in this file iterate over the result and look up "/Root" in it.
// NOTE(review): whether erasing keys on the shallow copy can affect the original trailer depends
// on unsafeShallowCopy(), which is not visible here.
  2232 +Dictionary
  2233 +QPDFWriter::Members::trimmed_trailer()
2178 2234 {
2179 2235 // Remove keys from the trailer that necessarily have to be replaced when writing the file.
2180 2236  
2181   - QPDFObjectHandle trailer = pdf.getTrailer().unsafeShallowCopy();
  2237 + Dictionary trailer = pdf.getTrailer().unsafeShallowCopy();
2182 2238  
2183 2239 // Remove encryption keys
2184   - trailer.removeKey("/ID");
2185   - trailer.removeKey("/Encrypt");
  2240 + trailer.erase("/ID");
  2241 + trailer.erase("/Encrypt");
2186 2242  
2187 2243 // Remove modification information
2188   - trailer.removeKey("/Prev");
  2244 + trailer.erase("/Prev");
2189 2245  
2190 2246 // Remove all trailer keys that potentially come from a cross-reference stream
2191   - trailer.removeKey("/Index");
2192   - trailer.removeKey("/W");
2193   - trailer.removeKey("/Length");
2194   - trailer.removeKey("/Filter");
2195   - trailer.removeKey("/DecodeParms");
2196   - trailer.removeKey("/Type");
2197   - trailer.removeKey("/XRefStm");
  2247 + trailer.erase("/Index");
  2248 + trailer.erase("/W");
  2249 + trailer.erase("/Length");
  2250 + trailer.erase("/Filter");
  2251 + trailer.erase("/DecodeParms");
  2252 + trailer.erase("/Type");
  2253 + trailer.erase("/XRefStm");
2198 2254  
2199 2255 return trailer;
2200 2256 }
... ... @@ -2226,7 +2282,7 @@ QPDFWriter::Members::prepareFileForWrite()
// Resize both object tables (obj and new_obj) to tableSize() + 100 + extra entries.
// `extra` is additional room requested by the caller; generateObjectStreams passes
// 2 * n_object_streams. NOTE(review): the rationale for the fixed 100-entry margin is not visible
// here.
2226 2282 void
2227 2283 QPDFWriter::Members::initializeTables(size_t extra)
2228 2284 {
2229   - auto size = QIntC::to_size(QPDF::Writer::tableSize(pdf) + 100) + extra;
  2285 + auto size = QIntC::to_size(tableSize() + 100) + extra;
2230 2286 obj.resize(size);
2231 2287 new_obj.resize(size);
2232 2288 }
... ... @@ -2505,8 +2561,8 @@ QPDFWriter::Members::writeHintStream(int hint_id)
2505 2561 std::string hint_buffer;
2506 2562 int S = 0;
2507 2563 int O = 0;
2508   - bool compressed = compress_streams && !qdf_mode;
2509   - QPDF::Writer::generateHintStream(pdf, new_obj, obj, hint_buffer, S, O, compressed);
  2564 + bool compressed = compress_streams;
  2565 + generateHintStream(new_obj, obj, hint_buffer, S, O, compressed);
2510 2566  
2511 2567 openObject(hint_id);
2512 2568 setDataKey(hint_id);
... ... @@ -2702,27 +2758,21 @@ QPDFWriter::Members::writeLinearized()
2702 2758 std::map<int, int> stream_cache;
2703 2759  
2704 2760 auto skip_stream_parameters = [this, &stream_cache](QPDFObjectHandle& stream) {
2705   - auto& result = stream_cache[stream.getObjectID()];
2706   - if (result == 0) {
2707   - bool compress_stream;
2708   - bool is_metadata;
2709   - if (willFilterStream(stream, compress_stream, is_metadata, nullptr)) {
2710   - result = 2;
2711   - } else {
2712   - result = 1;
2713   - }
  2761 + if (auto& result = stream_cache[stream.getObjectID()]) {
  2762 + return result;
  2763 + } else {
  2764 + return result = will_filter_stream(stream) ? 2 : 1;
2714 2765 }
2715   - return result;
2716 2766 };
2717 2767  
2718   - QPDF::Writer::optimize(pdf, obj, skip_stream_parameters);
  2768 + optimize(obj, skip_stream_parameters);
2719 2769  
2720 2770 std::vector<QPDFObjectHandle> part4;
2721 2771 std::vector<QPDFObjectHandle> part6;
2722 2772 std::vector<QPDFObjectHandle> part7;
2723 2773 std::vector<QPDFObjectHandle> part8;
2724 2774 std::vector<QPDFObjectHandle> part9;
2725   - QPDF::Writer::getLinearizedParts(pdf, obj, part4, part6, part7, part8, part9);
  2775 + getLinearizedParts(obj, part4, part6, part7, part8, part9);
2726 2776  
2727 2777 // Object number sequence:
2728 2778 //
... ... @@ -3060,12 +3110,12 @@ QPDFWriter::Members::enqueueObjectsStandard()
3060 3110 }
3061 3111  
3062 3112 // Put root first on queue.
3063   - QPDFObjectHandle trailer = getTrimmedTrailer();
3064   - enqueueObject(trailer.getKey("/Root"));
  3113 + auto trailer = trimmed_trailer();
  3114 + enqueueObject(trailer["/Root"]);
3065 3115  
3066 3116 // Next place any other objects referenced from the trailer dictionary into the queue, handling
3067 3117 // direct objects recursively. Root is already there, so enqueuing it a second time is a no-op.
3068   - for (auto& item: trailer.as_dictionary()) {
  3118 + for (auto& item: trailer) {
3069 3119 if (!item.second.null()) {
3070 3120 enqueueObject(item.second);
3071 3121 }
... ... @@ -3098,9 +3148,7 @@ QPDFWriter::Members::enqueueObjectsPCLm()
3098 3148 }
3099 3149 }
3100 3150  
3101   - // Put root in queue.
3102   - QPDFObjectHandle trailer = getTrimmedTrailer();
3103   - enqueueObject(trailer.getKey("/Root"));
  3151 + enqueueObject(trimmed_trailer()["/Root"]);
3104 3152 }
3105 3153  
3106 3154 void
... ...
libqpdf/qpdf/QPDF_private.hh
... ... @@ -13,72 +13,6 @@
13 13  
14 14 using namespace qpdf;
15 15  
16   -// Writer class is restricted to QPDFWriter so that only it can call certain methods.
17   -class QPDF::Writer
18   -{
19   - friend class QPDFWriter;
20   -
21   - private:
22   - static void
23   - optimize(
24   - QPDF& qpdf,
25   - QPDFWriter::ObjTable const& obj,
26   - std::function<int(QPDFObjectHandle&)> skip_stream_parameters)
27   - {
28   - qpdf.optimize(obj, skip_stream_parameters);
29   - }
30   -
31   - static void
32   - getLinearizedParts(
33   - QPDF& qpdf,
34   - QPDFWriter::ObjTable const& obj,
35   - std::vector<QPDFObjectHandle>& part4,
36   - std::vector<QPDFObjectHandle>& part6,
37   - std::vector<QPDFObjectHandle>& part7,
38   - std::vector<QPDFObjectHandle>& part8,
39   - std::vector<QPDFObjectHandle>& part9)
40   - {
41   - qpdf.getLinearizedParts(obj, part4, part6, part7, part8, part9);
42   - }
43   -
44   - static void
45   - generateHintStream(
46   - QPDF& qpdf,
47   - QPDFWriter::NewObjTable const& new_obj,
48   - QPDFWriter::ObjTable const& obj,
49   - std::string& hint_stream,
50   - int& S,
51   - int& O,
52   - bool compressed)
53   - {
54   - qpdf.generateHintStream(new_obj, obj, hint_stream, S, O, compressed);
55   - }
56   -
57   - static std::vector<QPDFObjGen>
58   - getCompressibleObjGens(QPDF& qpdf)
59   - {
60   - return qpdf.getCompressibleObjVector();
61   - }
62   -
63   - static std::vector<bool>
64   - getCompressibleObjSet(QPDF& qpdf)
65   - {
66   - return qpdf.getCompressibleObjSet();
67   - }
68   -
69   - static std::map<QPDFObjGen, QPDFXRefEntry> const&
70   - getXRefTable(QPDF& qpdf)
71   - {
72   - return qpdf.getXRefTableInternal();
73   - }
74   -
75   - static size_t
76   - tableSize(QPDF& qpdf)
77   - {
78   - return qpdf.tableSize();
79   - }
80   -};
81   -
82 16 // The Resolver class is restricted to QPDFObject so that only it can resolve indirect
83 17 // references.
84 18 class QPDF::Resolver
... ...