Commit 626d5061de49471a62a05438f6d122d17fad2aa9
1 parent
db87f3ca
Refactor object stream warnings and object descriptions
Only build strings when needed.
Showing
8 changed files
with
96 additions
and
41 deletions
include/qpdf/QPDF.hh
| ... | ... | @@ -785,7 +785,7 @@ class QPDF |
| 785 | 785 | QPDFObjectHandle readObject(std::string const& description, QPDFObjGen og); |
| 786 | 786 | void readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset); |
| 787 | 787 | void validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset); |
| 788 | - QPDFObjectHandle readObjectInStream(BufferInputSource& input, int obj); | |
| 788 | + QPDFObjectHandle readObjectInStream(BufferInputSource& input, int stream_id, int obj_id); | |
| 789 | 789 | size_t recoverStreamLength( |
| 790 | 790 | std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset); |
| 791 | 791 | QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0); | ... | ... |
libqpdf/QPDFObject.cc
| ... | ... | @@ -3,6 +3,10 @@ |
| 3 | 3 | std::string |
| 4 | 4 | QPDFObject::getDescription() |
| 5 | 5 | { |
| 6 | + qpdf_offset_t shift = (getTypeCode() == ::ot_dictionary) ? 2 | |
| 7 | + : (getTypeCode() == ::ot_array) ? 1 | |
| 8 | + : 0; | |
| 9 | + | |
| 6 | 10 | if (object_description) { |
| 7 | 11 | switch (object_description->index()) { |
| 8 | 12 | case 0: |
| ... | ... | @@ -14,10 +18,6 @@ QPDFObject::getDescription() |
| 14 | 18 | description.replace(pos, 3, og.unparse(' ')); |
| 15 | 19 | } |
| 16 | 20 | if (auto pos = description.find("$PO"); pos != std::string::npos) { |
| 17 | - qpdf_offset_t shift = (getTypeCode() == ::ot_dictionary) ? 2 | |
| 18 | - : (getTypeCode() == ::ot_array) ? 1 | |
| 19 | - : 0; | |
| 20 | - | |
| 21 | 21 | description.replace(pos, 3, std::to_string(parsed_offset + shift)); |
| 22 | 22 | } |
| 23 | 23 | return description; |
| ... | ... | @@ -44,7 +44,14 @@ QPDFObject::getDescription() |
| 44 | 44 | } |
| 45 | 45 | return result; |
| 46 | 46 | } |
| 47 | + case 3: | |
| 48 | + auto [stream_id, obj_id] = std::get<3>(*object_description); | |
| 49 | + std::string result = qpdf ? qpdf->getFilename() : ""; | |
| 50 | + result += " object stream " + std::to_string(stream_id) + ", object " + | |
| 51 | + std::to_string(obj_id) + " 0 at offset " + std::to_string(parsed_offset + shift); | |
| 52 | + return result; | |
| 47 | 53 | } |
| 54 | + | |
| 48 | 55 | } else if (og.isIndirect()) { |
| 49 | 56 | return "object " + og.unparse(' '); |
| 50 | 57 | } | ... | ... |
libqpdf/QPDFParser.cc
| ... | ... | @@ -10,6 +10,8 @@ |
| 10 | 10 | |
| 11 | 11 | #include <memory> |
| 12 | 12 | |
| 13 | +using namespace std::literals; | |
| 14 | + | |
| 13 | 15 | using ObjectPtr = std::shared_ptr<QPDFObject>; |
| 14 | 16 | |
| 15 | 17 | QPDFObjectHandle |
| ... | ... | @@ -524,7 +526,13 @@ QPDFParser::warnDuplicateKey() |
| 524 | 526 | void |
| 525 | 527 | QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const |
| 526 | 528 | { |
| 527 | - warn(QPDFExc(qpdf_e_damaged_pdf, input.getName(), object_description, offset, msg)); | |
| 529 | + if (stream_id) { | |
| 530 | + std::string descr = "object "s + std::to_string(obj_id) + " 0"; | |
| 531 | + std::string name = context->getFilename() + " object stream " + std::to_string(stream_id); | |
| 532 | + warn(QPDFExc(qpdf_e_damaged_pdf, name, descr, offset, msg)); | |
| 533 | + } else { | |
| 534 | + warn(QPDFExc(qpdf_e_damaged_pdf, input.getName(), object_description, offset, msg)); | |
| 535 | + } | |
| 528 | 536 | } |
| 529 | 537 | |
| 530 | 538 | void | ... | ... |
libqpdf/QPDF_objects.cc
| ... | ... | @@ -1292,19 +1292,22 @@ QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset |
| 1292 | 1292 | } |
| 1293 | 1293 | |
| 1294 | 1294 | QPDFObjectHandle |
| 1295 | -QPDF::readObjectInStream(BufferInputSource& input, int obj) | |
| 1295 | +QPDF::readObjectInStream(BufferInputSource& input, int stream_id, int obj_id) | |
| 1296 | 1296 | { |
| 1297 | - m->last_object_description.erase(7); // last_object_description starts with "object " | |
| 1298 | - m->last_object_description += std::to_string(obj); | |
| 1299 | - m->last_object_description += " 0"; | |
| 1300 | - | |
| 1301 | 1297 | bool empty = false; |
| 1302 | - auto object = QPDFParser(input, m->last_object_description, m->tokenizer, nullptr, this, true) | |
| 1303 | - .parse(empty, false); | |
| 1298 | + auto object = | |
| 1299 | + QPDFParser(input, stream_id, obj_id, m->last_object_description, m->tokenizer, this) | |
| 1300 | + .parse(empty, false); | |
| 1304 | 1301 | if (empty) { |
| 1305 | 1302 | // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in |
| 1306 | 1303 | // actual PDF files and Adobe Reader appears to ignore them. |
| 1307 | - warn(damagedPDF(input, input.getLastOffset(), "empty object treated as null")); | |
| 1304 | + warn(QPDFExc( | |
| 1305 | + qpdf_e_damaged_pdf, | |
| 1306 | + m->file->getName() + " object stream " + std::to_string(stream_id), | |
| 1307 | + +"object " + std::to_string(obj_id) + " 0, offset " + | |
| 1308 | + std::to_string(input.getLastOffset()), | |
| 1309 | + 0, | |
| 1310 | + "empty object treated as null")); | |
| 1308 | 1311 | } |
| 1309 | 1312 | return object; |
| 1310 | 1313 | } |
| ... | ... | @@ -1605,13 +1608,23 @@ QPDF::resolve(QPDFObjGen og) |
| 1605 | 1608 | void |
| 1606 | 1609 | QPDF::resolveObjectsInStream(int obj_stream_number) |
| 1607 | 1610 | { |
| 1611 | + auto damaged = | |
| 1612 | + [this, obj_stream_number](int id, qpdf_offset_t offset, std::string const& msg) -> QPDFExc { | |
| 1613 | + return { | |
| 1614 | + qpdf_e_damaged_pdf, | |
| 1615 | + m->file->getName() + " object stream " + std::to_string(obj_stream_number), | |
| 1616 | + +"object " + std::to_string(id) + " 0", | |
| 1617 | + offset, | |
| 1618 | + msg}; | |
| 1619 | + }; | |
| 1620 | + | |
| 1608 | 1621 | if (m->resolved_object_streams.count(obj_stream_number)) { |
| 1609 | 1622 | return; |
| 1610 | 1623 | } |
| 1611 | 1624 | m->resolved_object_streams.insert(obj_stream_number); |
| 1612 | 1625 | // Force resolution of object stream |
| 1613 | - QPDFObjectHandle obj_stream = getObject(obj_stream_number, 0); | |
| 1614 | - if (!obj_stream.isStream()) { | |
| 1626 | + auto obj_stream = getObject(obj_stream_number, 0).as_stream(); | |
| 1627 | + if (!obj_stream) { | |
| 1615 | 1628 | throw damagedPDF( |
| 1616 | 1629 | "object " + std::to_string(obj_stream_number) + " 0", |
| 1617 | 1630 | "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream"); |
| ... | ... | @@ -1642,19 +1655,14 @@ QPDF::resolveObjectsInStream(int obj_stream_number) |
| 1642 | 1655 | std::vector<std::pair<int, long long>> offsets; |
| 1643 | 1656 | |
| 1644 | 1657 | auto bp = obj_stream.getStreamData(qpdf_dl_specialized); |
| 1645 | - BufferInputSource input( | |
| 1646 | - (m->file->getName() + " object stream " + std::to_string(obj_stream_number)), bp.get()); | |
| 1658 | + BufferInputSource input("", bp.get()); | |
| 1647 | 1659 | |
| 1648 | 1660 | long long last_offset = -1; |
| 1649 | 1661 | for (unsigned int i = 0; i < n; ++i) { |
| 1650 | 1662 | auto tnum = readToken(input); |
| 1651 | 1663 | auto toffset = readToken(input); |
| 1652 | 1664 | if (!(tnum.isInteger() && toffset.isInteger())) { |
| 1653 | - throw damagedPDF( | |
| 1654 | - input, | |
| 1655 | - "object " + std::to_string(obj_stream_number) + " 0", | |
| 1656 | - input.getLastOffset(), | |
| 1657 | - "expected integer in object stream header"); | |
| 1665 | + throw damaged(0, input.getLastOffset(), "expected integer in object stream header"); | |
| 1658 | 1666 | } |
| 1659 | 1667 | |
| 1660 | 1668 | int num = QUtil::string_to_int(tnum.getValue().c_str()); |
| ... | ... | @@ -1662,29 +1670,20 @@ QPDF::resolveObjectsInStream(int obj_stream_number) |
| 1662 | 1670 | |
| 1663 | 1671 | if (num == obj_stream_number) { |
| 1664 | 1672 | QTC::TC("qpdf", "QPDF ignore self-referential object stream"); |
| 1665 | - warn(damagedPDF( | |
| 1666 | - input, | |
| 1667 | - "object " + std::to_string(obj_stream_number) + " 0", | |
| 1668 | - input.getLastOffset(), | |
| 1669 | - "object stream claims to contain itself")); | |
| 1673 | + warn(damaged(num, input.getLastOffset(), "object stream claims to contain itself")); | |
| 1670 | 1674 | continue; |
| 1671 | 1675 | } |
| 1672 | 1676 | |
| 1673 | 1677 | if (num < 1) { |
| 1674 | 1678 | QTC::TC("qpdf", "QPDF object stream contains id < 1"); |
| 1675 | - warn(damagedPDF( | |
| 1676 | - input, | |
| 1677 | - "object " + std::to_string(num) + " 0", | |
| 1678 | - input.getLastOffset(), | |
| 1679 | - "object id is invalid"s)); | |
| 1679 | + warn(damaged(num, input.getLastOffset(), "object id is invalid"s)); | |
| 1680 | 1680 | continue; |
| 1681 | 1681 | } |
| 1682 | 1682 | |
| 1683 | 1683 | if (offset <= last_offset) { |
| 1684 | 1684 | QTC::TC("qpdf", "QPDF object stream offsets not increasing"); |
| 1685 | - warn(damagedPDF( | |
| 1686 | - input, | |
| 1687 | - "object " + std::to_string(num) + " 0", | |
| 1685 | + warn(damaged( | |
| 1686 | + num, | |
| 1688 | 1687 | offset, |
| 1689 | 1688 | "offset is invalid (must be larger than previous offset " + |
| 1690 | 1689 | std::to_string(last_offset) + ")")); |
| ... | ... | @@ -1703,15 +1702,13 @@ QPDF::resolveObjectsInStream(int obj_stream_number) |
| 1703 | 1702 | // found here in the cache. Remember that some objects stored here might have been overridden |
| 1704 | 1703 | // by new objects appended to the file, so it is necessary to recheck the xref table and only |
| 1705 | 1704 | // cache what would actually be resolved here. |
| 1706 | - m->last_object_description.clear(); | |
| 1707 | - m->last_object_description += "object "; | |
| 1708 | 1705 | for (auto const& [id, offset]: offsets) { |
| 1709 | 1706 | QPDFObjGen og(id, 0); |
| 1710 | 1707 | auto entry = m->xref_table.find(og); |
| 1711 | 1708 | if (entry != m->xref_table.end() && entry->second.getType() == 2 && |
| 1712 | 1709 | entry->second.getObjStreamNumber() == obj_stream_number) { |
| 1713 | 1710 | input.seek(offset, SEEK_SET); |
| 1714 | - QPDFObjectHandle oh = readObjectInStream(input, id); | |
| 1711 | + QPDFObjectHandle oh = readObjectInStream(input, obj_stream_number, id); | |
| 1715 | 1712 | updateCache(og, oh.getObj(), end_before_space, end_after_space); |
| 1716 | 1713 | } else { |
| 1717 | 1714 | QTC::TC("qpdf", "QPDF not caching overridden objstm object"); | ... | ... |
libqpdf/qpdf/QPDFObject_private.hh
| ... | ... | @@ -381,7 +381,17 @@ class QPDFObject |
| 381 | 381 | std::string var_descr; |
| 382 | 382 | }; |
| 383 | 383 | |
| 384 | - using Description = std::variant<std::string, JSON_Descr, ChildDescr>; | |
| 384 | + struct ObjStreamDescr | |
| 385 | + { | |
| 386 | + ObjStreamDescr(int stream_id, int obj_id) : | |
| 387 | + stream_id(stream_id), | |
| 388 | + obj_id(obj_id) {}; | |
| 389 | + | |
| 390 | + int stream_id; | |
| 391 | + int obj_id; | |
| 392 | + }; | |
| 393 | + | |
| 394 | + using Description = std::variant<std::string, JSON_Descr, ChildDescr, ObjStreamDescr>; | |
| 385 | 395 | |
| 386 | 396 | void |
| 387 | 397 | setDescription( | ... | ... |
libqpdf/qpdf/QPDFParser.hh
| ... | ... | @@ -62,9 +62,32 @@ class QPDFParser |
| 62 | 62 | decrypter(nullptr), |
| 63 | 63 | context(context), |
| 64 | 64 | description(std::move(sp_description)), |
| 65 | - parse_pdf(false) | |
| 65 | + parse_pdf(true) | |
| 66 | 66 | { |
| 67 | 67 | } |
| 68 | + | |
| 69 | + // Used by readObjectInStream only | |
| 70 | + QPDFParser( | |
| 71 | + InputSource& input, | |
| 72 | + int stream_id, | |
| 73 | + int obj_id, | |
| 74 | + std::string const& object_description, | |
| 75 | + qpdf::Tokenizer& tokenizer, | |
| 76 | + QPDF* context) : | |
| 77 | + input(input), | |
| 78 | + object_description(object_description), | |
| 79 | + tokenizer(tokenizer), | |
| 80 | + decrypter(nullptr), | |
| 81 | + context(context), | |
| 82 | + description( | |
| 83 | + std::make_shared<QPDFObject::Description>( | |
| 84 | + QPDFObject::ObjStreamDescr(stream_id, obj_id))), | |
| 85 | + parse_pdf(true), | |
| 86 | + stream_id(stream_id), | |
| 87 | + obj_id(obj_id) | |
| 88 | + { | |
| 89 | + } | |
| 90 | + | |
| 68 | 91 | ~QPDFParser() = default; |
| 69 | 92 | |
| 70 | 93 | QPDFObjectHandle parse(bool& empty, bool content_stream); |
| ... | ... | @@ -124,6 +147,8 @@ class QPDFParser |
| 124 | 147 | QPDF* context; |
| 125 | 148 | std::shared_ptr<QPDFObject::Description> description; |
| 126 | 149 | bool parse_pdf; |
| 150 | + int stream_id{0}; | |
| 151 | + int obj_id{0}; | |
| 127 | 152 | |
| 128 | 153 | std::vector<StackFrame> stack; |
| 129 | 154 | StackFrame* frame{nullptr}; | ... | ... |
libqpdf/qpdf/QPDF_private.hh
| ... | ... | @@ -3,6 +3,7 @@ |
| 3 | 3 | |
| 4 | 4 | #include <qpdf/QPDF.hh> |
| 5 | 5 | |
| 6 | +#include <qpdf/QPDFObject_private.hh> | |
| 6 | 7 | #include <qpdf/QPDFTokenizer_private.hh> |
| 7 | 8 | |
| 8 | 9 | // Writer class is restricted to QPDFWriter so that only it can call certain methods. |
| ... | ... | @@ -457,6 +458,7 @@ class QPDF::Members |
| 457 | 458 | qpdf::Tokenizer tokenizer; |
| 458 | 459 | std::shared_ptr<InputSource> file; |
| 459 | 460 | std::string last_object_description; |
| 461 | + std::shared_ptr<QPDFObject::Description> last_ostream_description; | |
| 460 | 462 | bool provided_password_is_hex_key{false}; |
| 461 | 463 | bool ignore_xref_streams{false}; |
| 462 | 464 | bool suppress_warnings{false}; | ... | ... |
manual/release-notes.rst
| ... | ... | @@ -29,6 +29,12 @@ more detail. |
| 29 | 29 | - There have been further enhancements to how files with damaged xref |
| 30 | 30 | tables are recovered. |
| 31 | 31 | |
| 32 | + - Other changes | |
| 33 | + | |
| 34 | + - The parsing of object streams including the creation of error/warning | |
| 35 | + messages and object descriptions has been refactored with some | |
| 36 | + improvement both in runtime and memory usage. | |
| 37 | + | |
| 32 | 38 | - There has been some refactoring of how object streams are written with |
| 33 | 39 | some performance improvement. |
| 34 | 40 | ... | ... |