Commit 626d5061de49471a62a05438f6d122d17fad2aa9

Authored by m-holger
1 parent db87f3ca

Refactor object stream warnings and object descriptions

Build warning and object description strings only when they are actually needed.
include/qpdf/QPDF.hh
@@ -785,7 +785,7 @@ class QPDF @@ -785,7 +785,7 @@ class QPDF
785 QPDFObjectHandle readObject(std::string const& description, QPDFObjGen og); 785 QPDFObjectHandle readObject(std::string const& description, QPDFObjGen og);
786 void readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset); 786 void readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);
787 void validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset); 787 void validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);
788 - QPDFObjectHandle readObjectInStream(BufferInputSource& input, int obj); 788 + QPDFObjectHandle readObjectInStream(BufferInputSource& input, int stream_id, int obj_id);
789 size_t recoverStreamLength( 789 size_t recoverStreamLength(
790 std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset); 790 std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset);
791 QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0); 791 QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0);
libqpdf/QPDFObject.cc
@@ -3,6 +3,10 @@ @@ -3,6 +3,10 @@
3 std::string 3 std::string
4 QPDFObject::getDescription() 4 QPDFObject::getDescription()
5 { 5 {
  6 + qpdf_offset_t shift = (getTypeCode() == ::ot_dictionary) ? 2
  7 + : (getTypeCode() == ::ot_array) ? 1
  8 + : 0;
  9 +
6 if (object_description) { 10 if (object_description) {
7 switch (object_description->index()) { 11 switch (object_description->index()) {
8 case 0: 12 case 0:
@@ -14,10 +18,6 @@ QPDFObject::getDescription() @@ -14,10 +18,6 @@ QPDFObject::getDescription()
14 description.replace(pos, 3, og.unparse(' ')); 18 description.replace(pos, 3, og.unparse(' '));
15 } 19 }
16 if (auto pos = description.find("$PO"); pos != std::string::npos) { 20 if (auto pos = description.find("$PO"); pos != std::string::npos) {
17 - qpdf_offset_t shift = (getTypeCode() == ::ot_dictionary) ? 2  
18 - : (getTypeCode() == ::ot_array) ? 1  
19 - : 0;  
20 -  
21 description.replace(pos, 3, std::to_string(parsed_offset + shift)); 21 description.replace(pos, 3, std::to_string(parsed_offset + shift));
22 } 22 }
23 return description; 23 return description;
@@ -44,7 +44,14 @@ QPDFObject::getDescription() @@ -44,7 +44,14 @@ QPDFObject::getDescription()
44 } 44 }
45 return result; 45 return result;
46 } 46 }
  47 + case 3:
  48 + auto [stream_id, obj_id] = std::get<3>(*object_description);
  49 + std::string result = qpdf ? qpdf->getFilename() : "";
  50 + result += " object stream " + std::to_string(stream_id) + ", object " +
  51 + std::to_string(obj_id) + " 0 at offset " + std::to_string(parsed_offset + shift);
  52 + return result;
47 } 53 }
  54 +
48 } else if (og.isIndirect()) { 55 } else if (og.isIndirect()) {
49 return "object " + og.unparse(' '); 56 return "object " + og.unparse(' ');
50 } 57 }
libqpdf/QPDFParser.cc
@@ -10,6 +10,8 @@ @@ -10,6 +10,8 @@
10 10
11 #include <memory> 11 #include <memory>
12 12
  13 +using namespace std::literals;
  14 +
13 using ObjectPtr = std::shared_ptr<QPDFObject>; 15 using ObjectPtr = std::shared_ptr<QPDFObject>;
14 16
15 QPDFObjectHandle 17 QPDFObjectHandle
@@ -524,7 +526,13 @@ QPDFParser::warnDuplicateKey() @@ -524,7 +526,13 @@ QPDFParser::warnDuplicateKey()
524 void 526 void
525 QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const 527 QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const
526 { 528 {
527 - warn(QPDFExc(qpdf_e_damaged_pdf, input.getName(), object_description, offset, msg)); 529 + if (stream_id) {
  530 + std::string descr = "object "s + std::to_string(obj_id) + " 0";
  531 + std::string name = context->getFilename() + " object stream " + std::to_string(stream_id);
  532 + warn(QPDFExc(qpdf_e_damaged_pdf, name, descr, offset, msg));
  533 + } else {
  534 + warn(QPDFExc(qpdf_e_damaged_pdf, input.getName(), object_description, offset, msg));
  535 + }
528 } 536 }
529 537
530 void 538 void
libqpdf/QPDF_objects.cc
@@ -1292,19 +1292,22 @@ QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset @@ -1292,19 +1292,22 @@ QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset
1292 } 1292 }
1293 1293
1294 QPDFObjectHandle 1294 QPDFObjectHandle
1295 -QPDF::readObjectInStream(BufferInputSource& input, int obj) 1295 +QPDF::readObjectInStream(BufferInputSource& input, int stream_id, int obj_id)
1296 { 1296 {
1297 - m->last_object_description.erase(7); // last_object_description starts with "object "  
1298 - m->last_object_description += std::to_string(obj);  
1299 - m->last_object_description += " 0";  
1300 -  
1301 bool empty = false; 1297 bool empty = false;
1302 - auto object = QPDFParser(input, m->last_object_description, m->tokenizer, nullptr, this, true)  
1303 - .parse(empty, false); 1298 + auto object =
  1299 + QPDFParser(input, stream_id, obj_id, m->last_object_description, m->tokenizer, this)
  1300 + .parse(empty, false);
1304 if (empty) { 1301 if (empty) {
1305 // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in 1302 // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
1306 // actual PDF files and Adobe Reader appears to ignore them. 1303 // actual PDF files and Adobe Reader appears to ignore them.
1307 - warn(damagedPDF(input, input.getLastOffset(), "empty object treated as null")); 1304 + warn(QPDFExc(
  1305 + qpdf_e_damaged_pdf,
  1306 + m->file->getName() + " object stream " + std::to_string(stream_id),
  1307 + +"object " + std::to_string(obj_id) + " 0, offset " +
  1308 + std::to_string(input.getLastOffset()),
  1309 + 0,
  1310 + "empty object treated as null"));
1308 } 1311 }
1309 return object; 1312 return object;
1310 } 1313 }
@@ -1605,13 +1608,23 @@ QPDF::resolve(QPDFObjGen og) @@ -1605,13 +1608,23 @@ QPDF::resolve(QPDFObjGen og)
1605 void 1608 void
1606 QPDF::resolveObjectsInStream(int obj_stream_number) 1609 QPDF::resolveObjectsInStream(int obj_stream_number)
1607 { 1610 {
  1611 + auto damaged =
  1612 + [this, obj_stream_number](int id, qpdf_offset_t offset, std::string const& msg) -> QPDFExc {
  1613 + return {
  1614 + qpdf_e_damaged_pdf,
  1615 + m->file->getName() + " object stream " + std::to_string(obj_stream_number),
  1616 + +"object " + std::to_string(id) + " 0",
  1617 + offset,
  1618 + msg};
  1619 + };
  1620 +
1608 if (m->resolved_object_streams.count(obj_stream_number)) { 1621 if (m->resolved_object_streams.count(obj_stream_number)) {
1609 return; 1622 return;
1610 } 1623 }
1611 m->resolved_object_streams.insert(obj_stream_number); 1624 m->resolved_object_streams.insert(obj_stream_number);
1612 // Force resolution of object stream 1625 // Force resolution of object stream
1613 - QPDFObjectHandle obj_stream = getObject(obj_stream_number, 0);  
1614 - if (!obj_stream.isStream()) { 1626 + auto obj_stream = getObject(obj_stream_number, 0).as_stream();
  1627 + if (!obj_stream) {
1615 throw damagedPDF( 1628 throw damagedPDF(
1616 "object " + std::to_string(obj_stream_number) + " 0", 1629 "object " + std::to_string(obj_stream_number) + " 0",
1617 "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream"); 1630 "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream");
@@ -1642,19 +1655,14 @@ QPDF::resolveObjectsInStream(int obj_stream_number) @@ -1642,19 +1655,14 @@ QPDF::resolveObjectsInStream(int obj_stream_number)
1642 std::vector<std::pair<int, long long>> offsets; 1655 std::vector<std::pair<int, long long>> offsets;
1643 1656
1644 auto bp = obj_stream.getStreamData(qpdf_dl_specialized); 1657 auto bp = obj_stream.getStreamData(qpdf_dl_specialized);
1645 - BufferInputSource input(  
1646 - (m->file->getName() + " object stream " + std::to_string(obj_stream_number)), bp.get()); 1658 + BufferInputSource input("", bp.get());
1647 1659
1648 long long last_offset = -1; 1660 long long last_offset = -1;
1649 for (unsigned int i = 0; i < n; ++i) { 1661 for (unsigned int i = 0; i < n; ++i) {
1650 auto tnum = readToken(input); 1662 auto tnum = readToken(input);
1651 auto toffset = readToken(input); 1663 auto toffset = readToken(input);
1652 if (!(tnum.isInteger() && toffset.isInteger())) { 1664 if (!(tnum.isInteger() && toffset.isInteger())) {
1653 - throw damagedPDF(  
1654 - input,  
1655 - "object " + std::to_string(obj_stream_number) + " 0",  
1656 - input.getLastOffset(),  
1657 - "expected integer in object stream header"); 1665 + throw damaged(0, input.getLastOffset(), "expected integer in object stream header");
1658 } 1666 }
1659 1667
1660 int num = QUtil::string_to_int(tnum.getValue().c_str()); 1668 int num = QUtil::string_to_int(tnum.getValue().c_str());
@@ -1662,29 +1670,20 @@ QPDF::resolveObjectsInStream(int obj_stream_number) @@ -1662,29 +1670,20 @@ QPDF::resolveObjectsInStream(int obj_stream_number)
1662 1670
1663 if (num == obj_stream_number) { 1671 if (num == obj_stream_number) {
1664 QTC::TC("qpdf", "QPDF ignore self-referential object stream"); 1672 QTC::TC("qpdf", "QPDF ignore self-referential object stream");
1665 - warn(damagedPDF(  
1666 - input,  
1667 - "object " + std::to_string(obj_stream_number) + " 0",  
1668 - input.getLastOffset(),  
1669 - "object stream claims to contain itself")); 1673 + warn(damaged(num, input.getLastOffset(), "object stream claims to contain itself"));
1670 continue; 1674 continue;
1671 } 1675 }
1672 1676
1673 if (num < 1) { 1677 if (num < 1) {
1674 QTC::TC("qpdf", "QPDF object stream contains id < 1"); 1678 QTC::TC("qpdf", "QPDF object stream contains id < 1");
1675 - warn(damagedPDF(  
1676 - input,  
1677 - "object " + std::to_string(num) + " 0",  
1678 - input.getLastOffset(),  
1679 - "object id is invalid"s)); 1679 + warn(damaged(num, input.getLastOffset(), "object id is invalid"s));
1680 continue; 1680 continue;
1681 } 1681 }
1682 1682
1683 if (offset <= last_offset) { 1683 if (offset <= last_offset) {
1684 QTC::TC("qpdf", "QPDF object stream offsets not increasing"); 1684 QTC::TC("qpdf", "QPDF object stream offsets not increasing");
1685 - warn(damagedPDF(  
1686 - input,  
1687 - "object " + std::to_string(num) + " 0", 1685 + warn(damaged(
  1686 + num,
1688 offset, 1687 offset,
1689 "offset is invalid (must be larger than previous offset " + 1688 "offset is invalid (must be larger than previous offset " +
1690 std::to_string(last_offset) + ")")); 1689 std::to_string(last_offset) + ")"));
@@ -1703,15 +1702,13 @@ QPDF::resolveObjectsInStream(int obj_stream_number) @@ -1703,15 +1702,13 @@ QPDF::resolveObjectsInStream(int obj_stream_number)
1703 // found here in the cache. Remember that some objects stored here might have been overridden 1702 // found here in the cache. Remember that some objects stored here might have been overridden
1704 // by new objects appended to the file, so it is necessary to recheck the xref table and only 1703 // by new objects appended to the file, so it is necessary to recheck the xref table and only
1705 // cache what would actually be resolved here. 1704 // cache what would actually be resolved here.
1706 - m->last_object_description.clear();  
1707 - m->last_object_description += "object ";  
1708 for (auto const& [id, offset]: offsets) { 1705 for (auto const& [id, offset]: offsets) {
1709 QPDFObjGen og(id, 0); 1706 QPDFObjGen og(id, 0);
1710 auto entry = m->xref_table.find(og); 1707 auto entry = m->xref_table.find(og);
1711 if (entry != m->xref_table.end() && entry->second.getType() == 2 && 1708 if (entry != m->xref_table.end() && entry->second.getType() == 2 &&
1712 entry->second.getObjStreamNumber() == obj_stream_number) { 1709 entry->second.getObjStreamNumber() == obj_stream_number) {
1713 input.seek(offset, SEEK_SET); 1710 input.seek(offset, SEEK_SET);
1714 - QPDFObjectHandle oh = readObjectInStream(input, id); 1711 + QPDFObjectHandle oh = readObjectInStream(input, obj_stream_number, id);
1715 updateCache(og, oh.getObj(), end_before_space, end_after_space); 1712 updateCache(og, oh.getObj(), end_before_space, end_after_space);
1716 } else { 1713 } else {
1717 QTC::TC("qpdf", "QPDF not caching overridden objstm object"); 1714 QTC::TC("qpdf", "QPDF not caching overridden objstm object");
libqpdf/qpdf/QPDFObject_private.hh
@@ -381,7 +381,17 @@ class QPDFObject @@ -381,7 +381,17 @@ class QPDFObject
381 std::string var_descr; 381 std::string var_descr;
382 }; 382 };
383 383
384 - using Description = std::variant<std::string, JSON_Descr, ChildDescr>; 384 + struct ObjStreamDescr
  385 + {
  386 + ObjStreamDescr(int stream_id, int obj_id) :
  387 + stream_id(stream_id),
  388 + obj_id(obj_id) {};
  389 +
  390 + int stream_id;
  391 + int obj_id;
  392 + };
  393 +
  394 + using Description = std::variant<std::string, JSON_Descr, ChildDescr, ObjStreamDescr>;
385 395
386 void 396 void
387 setDescription( 397 setDescription(
libqpdf/qpdf/QPDFParser.hh
@@ -62,9 +62,32 @@ class QPDFParser @@ -62,9 +62,32 @@ class QPDFParser
62 decrypter(nullptr), 62 decrypter(nullptr),
63 context(context), 63 context(context),
64 description(std::move(sp_description)), 64 description(std::move(sp_description)),
65 - parse_pdf(false) 65 + parse_pdf(true)
66 { 66 {
67 } 67 }
  68 +
  69 + // Used by readObjectInStream only
  70 + QPDFParser(
  71 + InputSource& input,
  72 + int stream_id,
  73 + int obj_id,
  74 + std::string const& object_description,
  75 + qpdf::Tokenizer& tokenizer,
  76 + QPDF* context) :
  77 + input(input),
  78 + object_description(object_description),
  79 + tokenizer(tokenizer),
  80 + decrypter(nullptr),
  81 + context(context),
  82 + description(
  83 + std::make_shared<QPDFObject::Description>(
  84 + QPDFObject::ObjStreamDescr(stream_id, obj_id))),
  85 + parse_pdf(true),
  86 + stream_id(stream_id),
  87 + obj_id(obj_id)
  88 + {
  89 + }
  90 +
68 ~QPDFParser() = default; 91 ~QPDFParser() = default;
69 92
70 QPDFObjectHandle parse(bool& empty, bool content_stream); 93 QPDFObjectHandle parse(bool& empty, bool content_stream);
@@ -124,6 +147,8 @@ class QPDFParser @@ -124,6 +147,8 @@ class QPDFParser
124 QPDF* context; 147 QPDF* context;
125 std::shared_ptr<QPDFObject::Description> description; 148 std::shared_ptr<QPDFObject::Description> description;
126 bool parse_pdf; 149 bool parse_pdf;
  150 + int stream_id{0};
  151 + int obj_id{0};
127 152
128 std::vector<StackFrame> stack; 153 std::vector<StackFrame> stack;
129 StackFrame* frame{nullptr}; 154 StackFrame* frame{nullptr};
libqpdf/qpdf/QPDF_private.hh
@@ -3,6 +3,7 @@ @@ -3,6 +3,7 @@
3 3
4 #include <qpdf/QPDF.hh> 4 #include <qpdf/QPDF.hh>
5 5
  6 +#include <qpdf/QPDFObject_private.hh>
6 #include <qpdf/QPDFTokenizer_private.hh> 7 #include <qpdf/QPDFTokenizer_private.hh>
7 8
8 // Writer class is restricted to QPDFWriter so that only it can call certain methods. 9 // Writer class is restricted to QPDFWriter so that only it can call certain methods.
@@ -457,6 +458,7 @@ class QPDF::Members @@ -457,6 +458,7 @@ class QPDF::Members
457 qpdf::Tokenizer tokenizer; 458 qpdf::Tokenizer tokenizer;
458 std::shared_ptr<InputSource> file; 459 std::shared_ptr<InputSource> file;
459 std::string last_object_description; 460 std::string last_object_description;
  461 + std::shared_ptr<QPDFObject::Description> last_ostream_description;
460 bool provided_password_is_hex_key{false}; 462 bool provided_password_is_hex_key{false};
461 bool ignore_xref_streams{false}; 463 bool ignore_xref_streams{false};
462 bool suppress_warnings{false}; 464 bool suppress_warnings{false};
manual/release-notes.rst
@@ -29,6 +29,12 @@ more detail. @@ -29,6 +29,12 @@ more detail.
29 - There have been further enhancements to how files with damaged xref 29 - There have been further enhancements to how files with damaged xref
30 tables are recovered. 30 tables are recovered.
31 31
  32 + - Other changes
  33 +
  34 + - The parsing of object streams, including the creation of error/warning
  35 + messages and object descriptions, has been refactored, with some
  36 + improvement in both runtime and memory usage.
  37 +
32 - There has been some refactoring of how object streams are written with 38 - There has been some refactoring of how object streams are written with
33 some performance improvement. 39 some performance improvement.
34 40