Commit 626d5061de49471a62a05438f6d122d17fad2aa9
1 parent
db87f3ca
Refactor object stream warnings and object descriptions
Only build strings when needed.
Showing
8 changed files
with
96 additions
and
41 deletions
include/qpdf/QPDF.hh
| @@ -785,7 +785,7 @@ class QPDF | @@ -785,7 +785,7 @@ class QPDF | ||
| 785 | QPDFObjectHandle readObject(std::string const& description, QPDFObjGen og); | 785 | QPDFObjectHandle readObject(std::string const& description, QPDFObjGen og); |
| 786 | void readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset); | 786 | void readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset); |
| 787 | void validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset); | 787 | void validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset); |
| 788 | - QPDFObjectHandle readObjectInStream(BufferInputSource& input, int obj); | 788 | + QPDFObjectHandle readObjectInStream(BufferInputSource& input, int stream_id, int obj_id); |
| 789 | size_t recoverStreamLength( | 789 | size_t recoverStreamLength( |
| 790 | std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset); | 790 | std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset); |
| 791 | QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0); | 791 | QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0); |
libqpdf/QPDFObject.cc
| @@ -3,6 +3,10 @@ | @@ -3,6 +3,10 @@ | ||
| 3 | std::string | 3 | std::string |
| 4 | QPDFObject::getDescription() | 4 | QPDFObject::getDescription() |
| 5 | { | 5 | { |
| 6 | + qpdf_offset_t shift = (getTypeCode() == ::ot_dictionary) ? 2 | ||
| 7 | + : (getTypeCode() == ::ot_array) ? 1 | ||
| 8 | + : 0; | ||
| 9 | + | ||
| 6 | if (object_description) { | 10 | if (object_description) { |
| 7 | switch (object_description->index()) { | 11 | switch (object_description->index()) { |
| 8 | case 0: | 12 | case 0: |
| @@ -14,10 +18,6 @@ QPDFObject::getDescription() | @@ -14,10 +18,6 @@ QPDFObject::getDescription() | ||
| 14 | description.replace(pos, 3, og.unparse(' ')); | 18 | description.replace(pos, 3, og.unparse(' ')); |
| 15 | } | 19 | } |
| 16 | if (auto pos = description.find("$PO"); pos != std::string::npos) { | 20 | if (auto pos = description.find("$PO"); pos != std::string::npos) { |
| 17 | - qpdf_offset_t shift = (getTypeCode() == ::ot_dictionary) ? 2 | ||
| 18 | - : (getTypeCode() == ::ot_array) ? 1 | ||
| 19 | - : 0; | ||
| 20 | - | ||
| 21 | description.replace(pos, 3, std::to_string(parsed_offset + shift)); | 21 | description.replace(pos, 3, std::to_string(parsed_offset + shift)); |
| 22 | } | 22 | } |
| 23 | return description; | 23 | return description; |
| @@ -44,7 +44,14 @@ QPDFObject::getDescription() | @@ -44,7 +44,14 @@ QPDFObject::getDescription() | ||
| 44 | } | 44 | } |
| 45 | return result; | 45 | return result; |
| 46 | } | 46 | } |
| 47 | + case 3: | ||
| 48 | + auto [stream_id, obj_id] = std::get<3>(*object_description); | ||
| 49 | + std::string result = qpdf ? qpdf->getFilename() : ""; | ||
| 50 | + result += " object stream " + std::to_string(stream_id) + ", object " + | ||
| 51 | + std::to_string(obj_id) + " 0 at offset " + std::to_string(parsed_offset + shift); | ||
| 52 | + return result; | ||
| 47 | } | 53 | } |
| 54 | + | ||
| 48 | } else if (og.isIndirect()) { | 55 | } else if (og.isIndirect()) { |
| 49 | return "object " + og.unparse(' '); | 56 | return "object " + og.unparse(' '); |
| 50 | } | 57 | } |
libqpdf/QPDFParser.cc
| @@ -10,6 +10,8 @@ | @@ -10,6 +10,8 @@ | ||
| 10 | 10 | ||
| 11 | #include <memory> | 11 | #include <memory> |
| 12 | 12 | ||
| 13 | +using namespace std::literals; | ||
| 14 | + | ||
| 13 | using ObjectPtr = std::shared_ptr<QPDFObject>; | 15 | using ObjectPtr = std::shared_ptr<QPDFObject>; |
| 14 | 16 | ||
| 15 | QPDFObjectHandle | 17 | QPDFObjectHandle |
| @@ -524,7 +526,13 @@ QPDFParser::warnDuplicateKey() | @@ -524,7 +526,13 @@ QPDFParser::warnDuplicateKey() | ||
| 524 | void | 526 | void |
| 525 | QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const | 527 | QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const |
| 526 | { | 528 | { |
| 527 | - warn(QPDFExc(qpdf_e_damaged_pdf, input.getName(), object_description, offset, msg)); | 529 | + if (stream_id) { |
| 530 | + std::string descr = "object "s + std::to_string(obj_id) + " 0"; | ||
| 531 | + std::string name = context->getFilename() + " object stream " + std::to_string(stream_id); | ||
| 532 | + warn(QPDFExc(qpdf_e_damaged_pdf, name, descr, offset, msg)); | ||
| 533 | + } else { | ||
| 534 | + warn(QPDFExc(qpdf_e_damaged_pdf, input.getName(), object_description, offset, msg)); | ||
| 535 | + } | ||
| 528 | } | 536 | } |
| 529 | 537 | ||
| 530 | void | 538 | void |
libqpdf/QPDF_objects.cc
| @@ -1292,19 +1292,22 @@ QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset | @@ -1292,19 +1292,22 @@ QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset | ||
| 1292 | } | 1292 | } |
| 1293 | 1293 | ||
| 1294 | QPDFObjectHandle | 1294 | QPDFObjectHandle |
| 1295 | -QPDF::readObjectInStream(BufferInputSource& input, int obj) | 1295 | +QPDF::readObjectInStream(BufferInputSource& input, int stream_id, int obj_id) |
| 1296 | { | 1296 | { |
| 1297 | - m->last_object_description.erase(7); // last_object_description starts with "object " | ||
| 1298 | - m->last_object_description += std::to_string(obj); | ||
| 1299 | - m->last_object_description += " 0"; | ||
| 1300 | - | ||
| 1301 | bool empty = false; | 1297 | bool empty = false; |
| 1302 | - auto object = QPDFParser(input, m->last_object_description, m->tokenizer, nullptr, this, true) | ||
| 1303 | - .parse(empty, false); | 1298 | + auto object = |
| 1299 | + QPDFParser(input, stream_id, obj_id, m->last_object_description, m->tokenizer, this) | ||
| 1300 | + .parse(empty, false); | ||
| 1304 | if (empty) { | 1301 | if (empty) { |
| 1305 | // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in | 1302 | // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in |
| 1306 | // actual PDF files and Adobe Reader appears to ignore them. | 1303 | // actual PDF files and Adobe Reader appears to ignore them. |
| 1307 | - warn(damagedPDF(input, input.getLastOffset(), "empty object treated as null")); | 1304 | + warn(QPDFExc( |
| 1305 | + qpdf_e_damaged_pdf, | ||
| 1306 | + m->file->getName() + " object stream " + std::to_string(stream_id), | ||
| 1307 | + +"object " + std::to_string(obj_id) + " 0, offset " + | ||
| 1308 | + std::to_string(input.getLastOffset()), | ||
| 1309 | + 0, | ||
| 1310 | + "empty object treated as null")); | ||
| 1308 | } | 1311 | } |
| 1309 | return object; | 1312 | return object; |
| 1310 | } | 1313 | } |
| @@ -1605,13 +1608,23 @@ QPDF::resolve(QPDFObjGen og) | @@ -1605,13 +1608,23 @@ QPDF::resolve(QPDFObjGen og) | ||
| 1605 | void | 1608 | void |
| 1606 | QPDF::resolveObjectsInStream(int obj_stream_number) | 1609 | QPDF::resolveObjectsInStream(int obj_stream_number) |
| 1607 | { | 1610 | { |
| 1611 | + auto damaged = | ||
| 1612 | + [this, obj_stream_number](int id, qpdf_offset_t offset, std::string const& msg) -> QPDFExc { | ||
| 1613 | + return { | ||
| 1614 | + qpdf_e_damaged_pdf, | ||
| 1615 | + m->file->getName() + " object stream " + std::to_string(obj_stream_number), | ||
| 1616 | + +"object " + std::to_string(id) + " 0", | ||
| 1617 | + offset, | ||
| 1618 | + msg}; | ||
| 1619 | + }; | ||
| 1620 | + | ||
| 1608 | if (m->resolved_object_streams.count(obj_stream_number)) { | 1621 | if (m->resolved_object_streams.count(obj_stream_number)) { |
| 1609 | return; | 1622 | return; |
| 1610 | } | 1623 | } |
| 1611 | m->resolved_object_streams.insert(obj_stream_number); | 1624 | m->resolved_object_streams.insert(obj_stream_number); |
| 1612 | // Force resolution of object stream | 1625 | // Force resolution of object stream |
| 1613 | - QPDFObjectHandle obj_stream = getObject(obj_stream_number, 0); | ||
| 1614 | - if (!obj_stream.isStream()) { | 1626 | + auto obj_stream = getObject(obj_stream_number, 0).as_stream(); |
| 1627 | + if (!obj_stream) { | ||
| 1615 | throw damagedPDF( | 1628 | throw damagedPDF( |
| 1616 | "object " + std::to_string(obj_stream_number) + " 0", | 1629 | "object " + std::to_string(obj_stream_number) + " 0", |
| 1617 | "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream"); | 1630 | "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream"); |
| @@ -1642,19 +1655,14 @@ QPDF::resolveObjectsInStream(int obj_stream_number) | @@ -1642,19 +1655,14 @@ QPDF::resolveObjectsInStream(int obj_stream_number) | ||
| 1642 | std::vector<std::pair<int, long long>> offsets; | 1655 | std::vector<std::pair<int, long long>> offsets; |
| 1643 | 1656 | ||
| 1644 | auto bp = obj_stream.getStreamData(qpdf_dl_specialized); | 1657 | auto bp = obj_stream.getStreamData(qpdf_dl_specialized); |
| 1645 | - BufferInputSource input( | ||
| 1646 | - (m->file->getName() + " object stream " + std::to_string(obj_stream_number)), bp.get()); | 1658 | + BufferInputSource input("", bp.get()); |
| 1647 | 1659 | ||
| 1648 | long long last_offset = -1; | 1660 | long long last_offset = -1; |
| 1649 | for (unsigned int i = 0; i < n; ++i) { | 1661 | for (unsigned int i = 0; i < n; ++i) { |
| 1650 | auto tnum = readToken(input); | 1662 | auto tnum = readToken(input); |
| 1651 | auto toffset = readToken(input); | 1663 | auto toffset = readToken(input); |
| 1652 | if (!(tnum.isInteger() && toffset.isInteger())) { | 1664 | if (!(tnum.isInteger() && toffset.isInteger())) { |
| 1653 | - throw damagedPDF( | ||
| 1654 | - input, | ||
| 1655 | - "object " + std::to_string(obj_stream_number) + " 0", | ||
| 1656 | - input.getLastOffset(), | ||
| 1657 | - "expected integer in object stream header"); | 1665 | + throw damaged(0, input.getLastOffset(), "expected integer in object stream header"); |
| 1658 | } | 1666 | } |
| 1659 | 1667 | ||
| 1660 | int num = QUtil::string_to_int(tnum.getValue().c_str()); | 1668 | int num = QUtil::string_to_int(tnum.getValue().c_str()); |
| @@ -1662,29 +1670,20 @@ QPDF::resolveObjectsInStream(int obj_stream_number) | @@ -1662,29 +1670,20 @@ QPDF::resolveObjectsInStream(int obj_stream_number) | ||
| 1662 | 1670 | ||
| 1663 | if (num == obj_stream_number) { | 1671 | if (num == obj_stream_number) { |
| 1664 | QTC::TC("qpdf", "QPDF ignore self-referential object stream"); | 1672 | QTC::TC("qpdf", "QPDF ignore self-referential object stream"); |
| 1665 | - warn(damagedPDF( | ||
| 1666 | - input, | ||
| 1667 | - "object " + std::to_string(obj_stream_number) + " 0", | ||
| 1668 | - input.getLastOffset(), | ||
| 1669 | - "object stream claims to contain itself")); | 1673 | + warn(damaged(num, input.getLastOffset(), "object stream claims to contain itself")); |
| 1670 | continue; | 1674 | continue; |
| 1671 | } | 1675 | } |
| 1672 | 1676 | ||
| 1673 | if (num < 1) { | 1677 | if (num < 1) { |
| 1674 | QTC::TC("qpdf", "QPDF object stream contains id < 1"); | 1678 | QTC::TC("qpdf", "QPDF object stream contains id < 1"); |
| 1675 | - warn(damagedPDF( | ||
| 1676 | - input, | ||
| 1677 | - "object " + std::to_string(num) + " 0", | ||
| 1678 | - input.getLastOffset(), | ||
| 1679 | - "object id is invalid"s)); | 1679 | + warn(damaged(num, input.getLastOffset(), "object id is invalid"s)); |
| 1680 | continue; | 1680 | continue; |
| 1681 | } | 1681 | } |
| 1682 | 1682 | ||
| 1683 | if (offset <= last_offset) { | 1683 | if (offset <= last_offset) { |
| 1684 | QTC::TC("qpdf", "QPDF object stream offsets not increasing"); | 1684 | QTC::TC("qpdf", "QPDF object stream offsets not increasing"); |
| 1685 | - warn(damagedPDF( | ||
| 1686 | - input, | ||
| 1687 | - "object " + std::to_string(num) + " 0", | 1685 | + warn(damaged( |
| 1686 | + num, | ||
| 1688 | offset, | 1687 | offset, |
| 1689 | "offset is invalid (must be larger than previous offset " + | 1688 | "offset is invalid (must be larger than previous offset " + |
| 1690 | std::to_string(last_offset) + ")")); | 1689 | std::to_string(last_offset) + ")")); |
| @@ -1703,15 +1702,13 @@ QPDF::resolveObjectsInStream(int obj_stream_number) | @@ -1703,15 +1702,13 @@ QPDF::resolveObjectsInStream(int obj_stream_number) | ||
| 1703 | // found here in the cache. Remember that some objects stored here might have been overridden | 1702 | // found here in the cache. Remember that some objects stored here might have been overridden |
| 1704 | // by new objects appended to the file, so it is necessary to recheck the xref table and only | 1703 | // by new objects appended to the file, so it is necessary to recheck the xref table and only |
| 1705 | // cache what would actually be resolved here. | 1704 | // cache what would actually be resolved here. |
| 1706 | - m->last_object_description.clear(); | ||
| 1707 | - m->last_object_description += "object "; | ||
| 1708 | for (auto const& [id, offset]: offsets) { | 1705 | for (auto const& [id, offset]: offsets) { |
| 1709 | QPDFObjGen og(id, 0); | 1706 | QPDFObjGen og(id, 0); |
| 1710 | auto entry = m->xref_table.find(og); | 1707 | auto entry = m->xref_table.find(og); |
| 1711 | if (entry != m->xref_table.end() && entry->second.getType() == 2 && | 1708 | if (entry != m->xref_table.end() && entry->second.getType() == 2 && |
| 1712 | entry->second.getObjStreamNumber() == obj_stream_number) { | 1709 | entry->second.getObjStreamNumber() == obj_stream_number) { |
| 1713 | input.seek(offset, SEEK_SET); | 1710 | input.seek(offset, SEEK_SET); |
| 1714 | - QPDFObjectHandle oh = readObjectInStream(input, id); | 1711 | + QPDFObjectHandle oh = readObjectInStream(input, obj_stream_number, id); |
| 1715 | updateCache(og, oh.getObj(), end_before_space, end_after_space); | 1712 | updateCache(og, oh.getObj(), end_before_space, end_after_space); |
| 1716 | } else { | 1713 | } else { |
| 1717 | QTC::TC("qpdf", "QPDF not caching overridden objstm object"); | 1714 | QTC::TC("qpdf", "QPDF not caching overridden objstm object"); |
libqpdf/qpdf/QPDFObject_private.hh
| @@ -381,7 +381,17 @@ class QPDFObject | @@ -381,7 +381,17 @@ class QPDFObject | ||
| 381 | std::string var_descr; | 381 | std::string var_descr; |
| 382 | }; | 382 | }; |
| 383 | 383 | ||
| 384 | - using Description = std::variant<std::string, JSON_Descr, ChildDescr>; | 384 | + struct ObjStreamDescr |
| 385 | + { | ||
| 386 | + ObjStreamDescr(int stream_id, int obj_id) : | ||
| 387 | + stream_id(stream_id), | ||
| 388 | + obj_id(obj_id) {}; | ||
| 389 | + | ||
| 390 | + int stream_id; | ||
| 391 | + int obj_id; | ||
| 392 | + }; | ||
| 393 | + | ||
| 394 | + using Description = std::variant<std::string, JSON_Descr, ChildDescr, ObjStreamDescr>; | ||
| 385 | 395 | ||
| 386 | void | 396 | void |
| 387 | setDescription( | 397 | setDescription( |
libqpdf/qpdf/QPDFParser.hh
| @@ -62,9 +62,32 @@ class QPDFParser | @@ -62,9 +62,32 @@ class QPDFParser | ||
| 62 | decrypter(nullptr), | 62 | decrypter(nullptr), |
| 63 | context(context), | 63 | context(context), |
| 64 | description(std::move(sp_description)), | 64 | description(std::move(sp_description)), |
| 65 | - parse_pdf(false) | 65 | + parse_pdf(true) |
| 66 | { | 66 | { |
| 67 | } | 67 | } |
| 68 | + | ||
| 69 | + // Used by readObjectInStream only | ||
| 70 | + QPDFParser( | ||
| 71 | + InputSource& input, | ||
| 72 | + int stream_id, | ||
| 73 | + int obj_id, | ||
| 74 | + std::string const& object_description, | ||
| 75 | + qpdf::Tokenizer& tokenizer, | ||
| 76 | + QPDF* context) : | ||
| 77 | + input(input), | ||
| 78 | + object_description(object_description), | ||
| 79 | + tokenizer(tokenizer), | ||
| 80 | + decrypter(nullptr), | ||
| 81 | + context(context), | ||
| 82 | + description( | ||
| 83 | + std::make_shared<QPDFObject::Description>( | ||
| 84 | + QPDFObject::ObjStreamDescr(stream_id, obj_id))), | ||
| 85 | + parse_pdf(true), | ||
| 86 | + stream_id(stream_id), | ||
| 87 | + obj_id(obj_id) | ||
| 88 | + { | ||
| 89 | + } | ||
| 90 | + | ||
| 68 | ~QPDFParser() = default; | 91 | ~QPDFParser() = default; |
| 69 | 92 | ||
| 70 | QPDFObjectHandle parse(bool& empty, bool content_stream); | 93 | QPDFObjectHandle parse(bool& empty, bool content_stream); |
| @@ -124,6 +147,8 @@ class QPDFParser | @@ -124,6 +147,8 @@ class QPDFParser | ||
| 124 | QPDF* context; | 147 | QPDF* context; |
| 125 | std::shared_ptr<QPDFObject::Description> description; | 148 | std::shared_ptr<QPDFObject::Description> description; |
| 126 | bool parse_pdf; | 149 | bool parse_pdf; |
| 150 | + int stream_id{0}; | ||
| 151 | + int obj_id{0}; | ||
| 127 | 152 | ||
| 128 | std::vector<StackFrame> stack; | 153 | std::vector<StackFrame> stack; |
| 129 | StackFrame* frame{nullptr}; | 154 | StackFrame* frame{nullptr}; |
libqpdf/qpdf/QPDF_private.hh
| @@ -3,6 +3,7 @@ | @@ -3,6 +3,7 @@ | ||
| 3 | 3 | ||
| 4 | #include <qpdf/QPDF.hh> | 4 | #include <qpdf/QPDF.hh> |
| 5 | 5 | ||
| 6 | +#include <qpdf/QPDFObject_private.hh> | ||
| 6 | #include <qpdf/QPDFTokenizer_private.hh> | 7 | #include <qpdf/QPDFTokenizer_private.hh> |
| 7 | 8 | ||
| 8 | // Writer class is restricted to QPDFWriter so that only it can call certain methods. | 9 | // Writer class is restricted to QPDFWriter so that only it can call certain methods. |
| @@ -457,6 +458,7 @@ class QPDF::Members | @@ -457,6 +458,7 @@ class QPDF::Members | ||
| 457 | qpdf::Tokenizer tokenizer; | 458 | qpdf::Tokenizer tokenizer; |
| 458 | std::shared_ptr<InputSource> file; | 459 | std::shared_ptr<InputSource> file; |
| 459 | std::string last_object_description; | 460 | std::string last_object_description; |
| 461 | + std::shared_ptr<QPDFObject::Description> last_ostream_description; | ||
| 460 | bool provided_password_is_hex_key{false}; | 462 | bool provided_password_is_hex_key{false}; |
| 461 | bool ignore_xref_streams{false}; | 463 | bool ignore_xref_streams{false}; |
| 462 | bool suppress_warnings{false}; | 464 | bool suppress_warnings{false}; |
manual/release-notes.rst
| @@ -29,6 +29,12 @@ more detail. | @@ -29,6 +29,12 @@ more detail. | ||
| 29 | - There have been further enhancements to how files with damaged xref | 29 | - There have been further enhancements to how files with damaged xref |
| 30 | tables are recovered. | 30 | tables are recovered. |
| 31 | 31 | ||
| 32 | + - Other changes | ||
| 33 | + | ||
| 34 | + - The parsing of object streams including the creation of error/warning | ||
| 35 | + messages and object descriptions has been refactored with some | ||
| 36 | + improvement both in runtime and memory usage. | ||
| 37 | + | ||
| 32 | - There has been some refactoring of how object streams are written with | 38 | - There has been some refactoring of how object streams are written with |
| 33 | some performance improvement. | 39 | some performance improvement. |
| 34 | 40 |