Commit 626d5061de49471a62a05438f6d122d17fad2aa9

Authored by m-holger
1 parent db87f3ca

Refactor object stream warnings and object descriptions

Only build strings when needed.
include/qpdf/QPDF.hh
... ... @@ -785,7 +785,7 @@ class QPDF
785 785 QPDFObjectHandle readObject(std::string const& description, QPDFObjGen og);
786 786 void readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);
787 787 void validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);
788   - QPDFObjectHandle readObjectInStream(BufferInputSource& input, int obj);
  788 + QPDFObjectHandle readObjectInStream(BufferInputSource& input, int stream_id, int obj_id);
789 789 size_t recoverStreamLength(
790 790 std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset);
791 791 QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0);
... ...
libqpdf/QPDFObject.cc
... ... @@ -3,6 +3,10 @@
3 3 std::string
4 4 QPDFObject::getDescription()
5 5 {
  6 + qpdf_offset_t shift = (getTypeCode() == ::ot_dictionary) ? 2
  7 + : (getTypeCode() == ::ot_array) ? 1
  8 + : 0;
  9 +
6 10 if (object_description) {
7 11 switch (object_description->index()) {
8 12 case 0:
... ... @@ -14,10 +18,6 @@ QPDFObject::getDescription()
14 18 description.replace(pos, 3, og.unparse(' '));
15 19 }
16 20 if (auto pos = description.find("$PO"); pos != std::string::npos) {
17   - qpdf_offset_t shift = (getTypeCode() == ::ot_dictionary) ? 2
18   - : (getTypeCode() == ::ot_array) ? 1
19   - : 0;
20   -
21 21 description.replace(pos, 3, std::to_string(parsed_offset + shift));
22 22 }
23 23 return description;
... ... @@ -44,7 +44,14 @@ QPDFObject::getDescription()
44 44 }
45 45 return result;
46 46 }
  47 + case 3:
  48 + auto [stream_id, obj_id] = std::get<3>(*object_description);
  49 + std::string result = qpdf ? qpdf->getFilename() : "";
  50 + result += " object stream " + std::to_string(stream_id) + ", object " +
  51 + std::to_string(obj_id) + " 0 at offset " + std::to_string(parsed_offset + shift);
  52 + return result;
47 53 }
  54 +
48 55 } else if (og.isIndirect()) {
49 56 return "object " + og.unparse(' ');
50 57 }
... ...
libqpdf/QPDFParser.cc
... ... @@ -10,6 +10,8 @@
10 10  
11 11 #include <memory>
12 12  
  13 +using namespace std::literals;
  14 +
13 15 using ObjectPtr = std::shared_ptr<QPDFObject>;
14 16  
15 17 QPDFObjectHandle
... ... @@ -524,7 +526,13 @@ QPDFParser::warnDuplicateKey()
524 526 void
525 527 QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const
526 528 {
527   - warn(QPDFExc(qpdf_e_damaged_pdf, input.getName(), object_description, offset, msg));
  529 + if (stream_id) {
  530 + std::string descr = "object "s + std::to_string(obj_id) + " 0";
  531 + std::string name = context->getFilename() + " object stream " + std::to_string(stream_id);
  532 + warn(QPDFExc(qpdf_e_damaged_pdf, name, descr, offset, msg));
  533 + } else {
  534 + warn(QPDFExc(qpdf_e_damaged_pdf, input.getName(), object_description, offset, msg));
  535 + }
528 536 }
529 537  
530 538 void
... ...
libqpdf/QPDF_objects.cc
... ... @@ -1292,19 +1292,22 @@ QPDF::validateStreamLineEnd(QPDFObjectHandle&amp; object, QPDFObjGen og, qpdf_offset
1292 1292 }
1293 1293  
1294 1294 QPDFObjectHandle
1295   -QPDF::readObjectInStream(BufferInputSource& input, int obj)
  1295 +QPDF::readObjectInStream(BufferInputSource& input, int stream_id, int obj_id)
1296 1296 {
1297   - m->last_object_description.erase(7); // last_object_description starts with "object "
1298   - m->last_object_description += std::to_string(obj);
1299   - m->last_object_description += " 0";
1300   -
1301 1297 bool empty = false;
1302   - auto object = QPDFParser(input, m->last_object_description, m->tokenizer, nullptr, this, true)
1303   - .parse(empty, false);
  1298 + auto object =
  1299 + QPDFParser(input, stream_id, obj_id, m->last_object_description, m->tokenizer, this)
  1300 + .parse(empty, false);
1304 1301 if (empty) {
1305 1302 // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
1306 1303 // actual PDF files and Adobe Reader appears to ignore them.
1307   - warn(damagedPDF(input, input.getLastOffset(), "empty object treated as null"));
  1304 + warn(QPDFExc(
  1305 + qpdf_e_damaged_pdf,
  1306 + m->file->getName() + " object stream " + std::to_string(stream_id),
  1307 + +"object " + std::to_string(obj_id) + " 0, offset " +
  1308 + std::to_string(input.getLastOffset()),
  1309 + 0,
  1310 + "empty object treated as null"));
1308 1311 }
1309 1312 return object;
1310 1313 }
... ... @@ -1605,13 +1608,23 @@ QPDF::resolve(QPDFObjGen og)
1605 1608 void
1606 1609 QPDF::resolveObjectsInStream(int obj_stream_number)
1607 1610 {
  1611 + auto damaged =
  1612 + [this, obj_stream_number](int id, qpdf_offset_t offset, std::string const& msg) -> QPDFExc {
  1613 + return {
  1614 + qpdf_e_damaged_pdf,
  1615 + m->file->getName() + " object stream " + std::to_string(obj_stream_number),
  1616 + +"object " + std::to_string(id) + " 0",
  1617 + offset,
  1618 + msg};
  1619 + };
  1620 +
1608 1621 if (m->resolved_object_streams.count(obj_stream_number)) {
1609 1622 return;
1610 1623 }
1611 1624 m->resolved_object_streams.insert(obj_stream_number);
1612 1625 // Force resolution of object stream
1613   - QPDFObjectHandle obj_stream = getObject(obj_stream_number, 0);
1614   - if (!obj_stream.isStream()) {
  1626 + auto obj_stream = getObject(obj_stream_number, 0).as_stream();
  1627 + if (!obj_stream) {
1615 1628 throw damagedPDF(
1616 1629 "object " + std::to_string(obj_stream_number) + " 0",
1617 1630 "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream");
... ... @@ -1642,19 +1655,14 @@ QPDF::resolveObjectsInStream(int obj_stream_number)
1642 1655 std::vector<std::pair<int, long long>> offsets;
1643 1656  
1644 1657 auto bp = obj_stream.getStreamData(qpdf_dl_specialized);
1645   - BufferInputSource input(
1646   - (m->file->getName() + " object stream " + std::to_string(obj_stream_number)), bp.get());
  1658 + BufferInputSource input("", bp.get());
1647 1659  
1648 1660 long long last_offset = -1;
1649 1661 for (unsigned int i = 0; i < n; ++i) {
1650 1662 auto tnum = readToken(input);
1651 1663 auto toffset = readToken(input);
1652 1664 if (!(tnum.isInteger() && toffset.isInteger())) {
1653   - throw damagedPDF(
1654   - input,
1655   - "object " + std::to_string(obj_stream_number) + " 0",
1656   - input.getLastOffset(),
1657   - "expected integer in object stream header");
  1665 + throw damaged(0, input.getLastOffset(), "expected integer in object stream header");
1658 1666 }
1659 1667  
1660 1668 int num = QUtil::string_to_int(tnum.getValue().c_str());
... ... @@ -1662,29 +1670,20 @@ QPDF::resolveObjectsInStream(int obj_stream_number)
1662 1670  
1663 1671 if (num == obj_stream_number) {
1664 1672 QTC::TC("qpdf", "QPDF ignore self-referential object stream");
1665   - warn(damagedPDF(
1666   - input,
1667   - "object " + std::to_string(obj_stream_number) + " 0",
1668   - input.getLastOffset(),
1669   - "object stream claims to contain itself"));
  1673 + warn(damaged(num, input.getLastOffset(), "object stream claims to contain itself"));
1670 1674 continue;
1671 1675 }
1672 1676  
1673 1677 if (num < 1) {
1674 1678 QTC::TC("qpdf", "QPDF object stream contains id < 1");
1675   - warn(damagedPDF(
1676   - input,
1677   - "object " + std::to_string(num) + " 0",
1678   - input.getLastOffset(),
1679   - "object id is invalid"s));
  1679 + warn(damaged(num, input.getLastOffset(), "object id is invalid"s));
1680 1680 continue;
1681 1681 }
1682 1682  
1683 1683 if (offset <= last_offset) {
1684 1684 QTC::TC("qpdf", "QPDF object stream offsets not increasing");
1685   - warn(damagedPDF(
1686   - input,
1687   - "object " + std::to_string(num) + " 0",
  1685 + warn(damaged(
  1686 + num,
1688 1687 offset,
1689 1688 "offset is invalid (must be larger than previous offset " +
1690 1689 std::to_string(last_offset) + ")"));
... ... @@ -1703,15 +1702,13 @@ QPDF::resolveObjectsInStream(int obj_stream_number)
1703 1702 // found here in the cache. Remember that some objects stored here might have been overridden
1704 1703 // by new objects appended to the file, so it is necessary to recheck the xref table and only
1705 1704 // cache what would actually be resolved here.
1706   - m->last_object_description.clear();
1707   - m->last_object_description += "object ";
1708 1705 for (auto const& [id, offset]: offsets) {
1709 1706 QPDFObjGen og(id, 0);
1710 1707 auto entry = m->xref_table.find(og);
1711 1708 if (entry != m->xref_table.end() && entry->second.getType() == 2 &&
1712 1709 entry->second.getObjStreamNumber() == obj_stream_number) {
1713 1710 input.seek(offset, SEEK_SET);
1714   - QPDFObjectHandle oh = readObjectInStream(input, id);
  1711 + QPDFObjectHandle oh = readObjectInStream(input, obj_stream_number, id);
1715 1712 updateCache(og, oh.getObj(), end_before_space, end_after_space);
1716 1713 } else {
1717 1714 QTC::TC("qpdf", "QPDF not caching overridden objstm object");
... ...
libqpdf/qpdf/QPDFObject_private.hh
... ... @@ -381,7 +381,17 @@ class QPDFObject
381 381 std::string var_descr;
382 382 };
383 383  
384   - using Description = std::variant<std::string, JSON_Descr, ChildDescr>;
  384 + struct ObjStreamDescr
  385 + {
  386 + ObjStreamDescr(int stream_id, int obj_id) :
  387 + stream_id(stream_id),
  388 + obj_id(obj_id) {};
  389 +
  390 + int stream_id;
  391 + int obj_id;
  392 + };
  393 +
  394 + using Description = std::variant<std::string, JSON_Descr, ChildDescr, ObjStreamDescr>;
385 395  
386 396 void
387 397 setDescription(
... ...
libqpdf/qpdf/QPDFParser.hh
... ... @@ -62,9 +62,32 @@ class QPDFParser
62 62 decrypter(nullptr),
63 63 context(context),
64 64 description(std::move(sp_description)),
65   - parse_pdf(false)
  65 + parse_pdf(true)
66 66 {
67 67 }
  68 +
  69 + // Used by readObjectInStream only
  70 + QPDFParser(
  71 + InputSource& input,
  72 + int stream_id,
  73 + int obj_id,
  74 + std::string const& object_description,
  75 + qpdf::Tokenizer& tokenizer,
  76 + QPDF* context) :
  77 + input(input),
  78 + object_description(object_description),
  79 + tokenizer(tokenizer),
  80 + decrypter(nullptr),
  81 + context(context),
  82 + description(
  83 + std::make_shared<QPDFObject::Description>(
  84 + QPDFObject::ObjStreamDescr(stream_id, obj_id))),
  85 + parse_pdf(true),
  86 + stream_id(stream_id),
  87 + obj_id(obj_id)
  88 + {
  89 + }
  90 +
68 91 ~QPDFParser() = default;
69 92  
70 93 QPDFObjectHandle parse(bool& empty, bool content_stream);
... ... @@ -124,6 +147,8 @@ class QPDFParser
124 147 QPDF* context;
125 148 std::shared_ptr<QPDFObject::Description> description;
126 149 bool parse_pdf;
  150 + int stream_id{0};
  151 + int obj_id{0};
127 152  
128 153 std::vector<StackFrame> stack;
129 154 StackFrame* frame{nullptr};
... ...
libqpdf/qpdf/QPDF_private.hh
... ... @@ -3,6 +3,7 @@
3 3  
4 4 #include <qpdf/QPDF.hh>
5 5  
  6 +#include <qpdf/QPDFObject_private.hh>
6 7 #include <qpdf/QPDFTokenizer_private.hh>
7 8  
8 9 // Writer class is restricted to QPDFWriter so that only it can call certain methods.
... ... @@ -457,6 +458,7 @@ class QPDF::Members
457 458 qpdf::Tokenizer tokenizer;
458 459 std::shared_ptr<InputSource> file;
459 460 std::string last_object_description;
  461 + std::shared_ptr<QPDFObject::Description> last_ostream_description;
460 462 bool provided_password_is_hex_key{false};
461 463 bool ignore_xref_streams{false};
462 464 bool suppress_warnings{false};
... ...
manual/release-notes.rst
... ... @@ -29,6 +29,12 @@ more detail.
29 29 - There have been further enhancements to how files with damaged xref
30 30 tables are recovered.
31 31  
  32 + - Other changes
  33 +
  34 + - The parsing of object streams including the creation of error/warning
  35 + messages and object descriptions has been refactored with some
  36 + improvement both in runtime and memory usage.
  37 +
32 38 - There has been some refactoring of how object streams are written with
33 39 some performance improvement.
34 40  
... ...