// Reviewer commentary for the patch below (commentary only; the patch text itself is unchanged).
//
// Purpose: object-stream parsing now carries the pair (stream_id, obj_id) through the parser and
// the object description, instead of rewriting m->last_object_description for every object and
// encoding the object stream in the BufferInputSource name.
//
// Per-file summary:
//  * include/qpdf/QPDF.hh
//      readObjectInStream(input, obj) becomes readObjectInStream(input, stream_id, obj_id).
//  * libqpdf/QPDFObject.cc
//      getDescription(): the `shift` computation (2 for dictionaries, 1 for arrays, otherwise 0) is
//      hoisted to the top of the function so that both the "$PO" substitution and the new `case 3`
//      can use it. `case 3` formats
//      "<filename> object stream <stream_id>, object <obj_id> 0 at offset <parsed_offset + shift>",
//      where <filename> is qpdf->getFilename() or "" when qpdf is null.
//  * libqpdf/QPDFParser.cc
//      warn(offset, msg): when stream_id is non-zero the QPDFExc is built with the name
//      "<context filename> object stream <stream_id>" and the description "object <obj_id> 0";
//      otherwise the previous input.getName() / object_description form is used.
//  * libqpdf/QPDF_objects.cc
//      readObjectInStream(): no longer edits m->last_object_description; it uses the new QPDFParser
//      constructor. The "empty object treated as null" warning is built directly as a QPDFExc with
//      the offset embedded in the description text and 0 passed as the offset argument.
//      resolveObjectsInStream(): adds a local `damaged` lambda producing a QPDFExc named
//      "<file name> object stream <n>" with description "object <id> 0"; the stream is obtained via
//      as_stream() and tested in boolean context; the BufferInputSource is created with an empty
//      name; several damagedPDF(...) calls are replaced by damaged(...).
//  * libqpdf/qpdf/QPDFObject_private.hh
//      Adds struct ObjStreamDescr {stream_id, obj_id} and changes the Description variant alias.
//  * libqpdf/qpdf/QPDFParser.hh
//      An existing constructor's initializer changes from parse_pdf(false) to parse_pdf(true); a new
//      constructor ("Used by readObjectInStream only") stores stream_id / obj_id and creates an
//      ObjStreamDescr description; new members stream_id{0} and obj_id{0} are added.
//  * libqpdf/qpdf/QPDF_private.hh
//      Adds one #include and a `last_ostream_description` member to QPDF::Members.
//  * manual/release-notes.rst
//      Adds an "Other changes" entry describing the refactoring.
//
// NOTE(review): this copy of the patch appears to have lost its line breaks and any text between
//   angle brackets (e.g. `#include `, `std::variant;`, `std::shared_ptr input`) - confirm against
//   the original commit before applying.
// NOTE(review): two added expressions start with a unary plus, `+"object " + std::to_string(...)`
//   (in readObjectInStream and in the `damaged` lambda). This looks unintentional - confirm and drop.
// NOTE(review): `throw damaged(0, ..., "expected integer in object stream header")` yields the
//   description "object 0 0", whereas the removed code used "object <obj_stream_number> 0" -
//   confirm this is intended.
// NOTE(review): the new `case 3:` declares initialized locals without a brace scope; in the visible
//   hunk it is the last label before the switch closes - consider bracing it to guard against
//   later additions.
// NOTE(review): the ObjStreamDescr constructor body is written `{};` - the trailing semicolon looks
//   redundant.
// NOTE(review): the parse_pdf(false) -> parse_pdf(true) change in the existing constructor is not
//   mentioned in the release note - confirm it is intentional.
// NOTE(review): `last_ostream_description` is not referenced anywhere else in this patch - confirm
//   it is needed.
diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh index 9d1e90e..9f076af 100644 --- a/include/qpdf/QPDF.hh +++ b/include/qpdf/QPDF.hh @@ -785,7 +785,7 @@ class QPDF QPDFObjectHandle readObject(std::string const& description, QPDFObjGen og); void readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset); void validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset); - QPDFObjectHandle readObjectInStream(BufferInputSource& input, int obj); + QPDFObjectHandle readObjectInStream(BufferInputSource& input, int stream_id, int obj_id); size_t recoverStreamLength( std::shared_ptr input, QPDFObjGen og, qpdf_offset_t stream_offset); QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0); diff --git a/libqpdf/QPDFObject.cc b/libqpdf/QPDFObject.cc index a5cb986..ef568cc 100644 --- a/libqpdf/QPDFObject.cc +++ b/libqpdf/QPDFObject.cc @@ -3,6 +3,10 @@ std::string QPDFObject::getDescription() { + qpdf_offset_t shift = (getTypeCode() == ::ot_dictionary) ? 2 + : (getTypeCode() == ::ot_array) ? 1 + : 0; + if (object_description) { switch (object_description->index()) { case 0: @@ -14,10 +18,6 @@ QPDFObject::getDescription() description.replace(pos, 3, og.unparse(' ')); } if (auto pos = description.find("$PO"); pos != std::string::npos) { - qpdf_offset_t shift = (getTypeCode() == ::ot_dictionary) ? 2 - : (getTypeCode() == ::ot_array) ? 1 - : 0; - description.replace(pos, 3, std::to_string(parsed_offset + shift)); } return description; @@ -44,7 +44,14 @@ QPDFObject::getDescription() } return result; } + case 3: + auto [stream_id, obj_id] = std::get<3>(*object_description); + std::string result = qpdf ? 
qpdf->getFilename() : ""; + result += " object stream " + std::to_string(stream_id) + ", object " + + std::to_string(obj_id) + " 0 at offset " + std::to_string(parsed_offset + shift); + return result; } + } else if (og.isIndirect()) { return "object " + og.unparse(' '); } diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index 3edbc44..fdb4827 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -10,6 +10,8 @@ #include +using namespace std::literals; + using ObjectPtr = std::shared_ptr; QPDFObjectHandle @@ -524,7 +526,13 @@ QPDFParser::warnDuplicateKey() void QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const { - warn(QPDFExc(qpdf_e_damaged_pdf, input.getName(), object_description, offset, msg)); + if (stream_id) { + std::string descr = "object "s + std::to_string(obj_id) + " 0"; + std::string name = context->getFilename() + " object stream " + std::to_string(stream_id); + warn(QPDFExc(qpdf_e_damaged_pdf, name, descr, offset, msg)); + } else { + warn(QPDFExc(qpdf_e_damaged_pdf, input.getName(), object_description, offset, msg)); + } } void diff --git a/libqpdf/QPDF_objects.cc b/libqpdf/QPDF_objects.cc index e67b85f..8142bd2 100644 --- a/libqpdf/QPDF_objects.cc +++ b/libqpdf/QPDF_objects.cc @@ -1292,19 +1292,22 @@ QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset } QPDFObjectHandle -QPDF::readObjectInStream(BufferInputSource& input, int obj) +QPDF::readObjectInStream(BufferInputSource& input, int stream_id, int obj_id) { - m->last_object_description.erase(7); // last_object_description starts with "object " - m->last_object_description += std::to_string(obj); - m->last_object_description += " 0"; - bool empty = false; - auto object = QPDFParser(input, m->last_object_description, m->tokenizer, nullptr, this, true) - .parse(empty, false); + auto object = + QPDFParser(input, stream_id, obj_id, m->last_object_description, m->tokenizer, this) + .parse(empty, false); if (empty) { // Nothing in the PDF 
spec appears to allow empty objects, but they have been encountered in // actual PDF files and Adobe Reader appears to ignore them. - warn(damagedPDF(input, input.getLastOffset(), "empty object treated as null")); + warn(QPDFExc( + qpdf_e_damaged_pdf, + m->file->getName() + " object stream " + std::to_string(stream_id), + +"object " + std::to_string(obj_id) + " 0, offset " + + std::to_string(input.getLastOffset()), + 0, + "empty object treated as null")); } return object; } @@ -1605,13 +1608,23 @@ QPDF::resolve(QPDFObjGen og) void QPDF::resolveObjectsInStream(int obj_stream_number) { + auto damaged = + [this, obj_stream_number](int id, qpdf_offset_t offset, std::string const& msg) -> QPDFExc { + return { + qpdf_e_damaged_pdf, + m->file->getName() + " object stream " + std::to_string(obj_stream_number), + +"object " + std::to_string(id) + " 0", + offset, + msg}; + }; + if (m->resolved_object_streams.count(obj_stream_number)) { return; } m->resolved_object_streams.insert(obj_stream_number); // Force resolution of object stream - QPDFObjectHandle obj_stream = getObject(obj_stream_number, 0); - if (!obj_stream.isStream()) { + auto obj_stream = getObject(obj_stream_number, 0).as_stream(); + if (!obj_stream) { throw damagedPDF( "object " + std::to_string(obj_stream_number) + " 0", "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream"); @@ -1642,19 +1655,14 @@ QPDF::resolveObjectsInStream(int obj_stream_number) std::vector> offsets; auto bp = obj_stream.getStreamData(qpdf_dl_specialized); - BufferInputSource input( - (m->file->getName() + " object stream " + std::to_string(obj_stream_number)), bp.get()); + BufferInputSource input("", bp.get()); long long last_offset = -1; for (unsigned int i = 0; i < n; ++i) { auto tnum = readToken(input); auto toffset = readToken(input); if (!(tnum.isInteger() && toffset.isInteger())) { - throw damagedPDF( - input, - "object " + std::to_string(obj_stream_number) + " 0", - input.getLastOffset(), - "expected 
integer in object stream header"); + throw damaged(0, input.getLastOffset(), "expected integer in object stream header"); } int num = QUtil::string_to_int(tnum.getValue().c_str()); @@ -1662,29 +1670,20 @@ QPDF::resolveObjectsInStream(int obj_stream_number) if (num == obj_stream_number) { QTC::TC("qpdf", "QPDF ignore self-referential object stream"); - warn(damagedPDF( - input, - "object " + std::to_string(obj_stream_number) + " 0", - input.getLastOffset(), - "object stream claims to contain itself")); + warn(damaged(num, input.getLastOffset(), "object stream claims to contain itself")); continue; } if (num < 1) { QTC::TC("qpdf", "QPDF object stream contains id < 1"); - warn(damagedPDF( - input, - "object " + std::to_string(num) + " 0", - input.getLastOffset(), - "object id is invalid"s)); + warn(damaged(num, input.getLastOffset(), "object id is invalid"s)); continue; } if (offset <= last_offset) { QTC::TC("qpdf", "QPDF object stream offsets not increasing"); - warn(damagedPDF( - input, - "object " + std::to_string(num) + " 0", + warn(damaged( + num, offset, "offset is invalid (must be larger than previous offset " + std::to_string(last_offset) + ")")); @@ -1703,15 +1702,13 @@ QPDF::resolveObjectsInStream(int obj_stream_number) // found here in the cache. Remember that some objects stored here might have been overridden // by new objects appended to the file, so it is necessary to recheck the xref table and only // cache what would actually be resolved here. 
- m->last_object_description.clear(); - m->last_object_description += "object "; for (auto const& [id, offset]: offsets) { QPDFObjGen og(id, 0); auto entry = m->xref_table.find(og); if (entry != m->xref_table.end() && entry->second.getType() == 2 && entry->second.getObjStreamNumber() == obj_stream_number) { input.seek(offset, SEEK_SET); - QPDFObjectHandle oh = readObjectInStream(input, id); + QPDFObjectHandle oh = readObjectInStream(input, obj_stream_number, id); updateCache(og, oh.getObj(), end_before_space, end_after_space); } else { QTC::TC("qpdf", "QPDF not caching overridden objstm object"); diff --git a/libqpdf/qpdf/QPDFObject_private.hh b/libqpdf/qpdf/QPDFObject_private.hh index 888f8ca..61b8d60 100644 --- a/libqpdf/qpdf/QPDFObject_private.hh +++ b/libqpdf/qpdf/QPDFObject_private.hh @@ -381,7 +381,17 @@ class QPDFObject std::string var_descr; }; - using Description = std::variant; + struct ObjStreamDescr + { + ObjStreamDescr(int stream_id, int obj_id) : + stream_id(stream_id), + obj_id(obj_id) {}; + + int stream_id; + int obj_id; + }; + + using Description = std::variant; void setDescription( diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh index 184308e..f422fda 100644 --- a/libqpdf/qpdf/QPDFParser.hh +++ b/libqpdf/qpdf/QPDFParser.hh @@ -62,9 +62,32 @@ class QPDFParser decrypter(nullptr), context(context), description(std::move(sp_description)), - parse_pdf(false) + parse_pdf(true) { } + + // Used by readObjectInStream only + QPDFParser( + InputSource& input, + int stream_id, + int obj_id, + std::string const& object_description, + qpdf::Tokenizer& tokenizer, + QPDF* context) : + input(input), + object_description(object_description), + tokenizer(tokenizer), + decrypter(nullptr), + context(context), + description( + std::make_shared( + QPDFObject::ObjStreamDescr(stream_id, obj_id))), + parse_pdf(true), + stream_id(stream_id), + obj_id(obj_id) + { + } + ~QPDFParser() = default; QPDFObjectHandle parse(bool& empty, bool content_stream); @@ 
-124,6 +147,8 @@ class QPDFParser QPDF* context; std::shared_ptr description; bool parse_pdf; + int stream_id{0}; + int obj_id{0}; std::vector stack; StackFrame* frame{nullptr}; diff --git a/libqpdf/qpdf/QPDF_private.hh b/libqpdf/qpdf/QPDF_private.hh index a7d9989..33621af 100644 --- a/libqpdf/qpdf/QPDF_private.hh +++ b/libqpdf/qpdf/QPDF_private.hh @@ -3,6 +3,7 @@ #include +#include #include // Writer class is restricted to QPDFWriter so that only it can call certain methods. @@ -457,6 +458,7 @@ class QPDF::Members qpdf::Tokenizer tokenizer; std::shared_ptr file; std::string last_object_description; + std::shared_ptr last_ostream_description; bool provided_password_is_hex_key{false}; bool ignore_xref_streams{false}; bool suppress_warnings{false}; diff --git a/manual/release-notes.rst b/manual/release-notes.rst index e0e889d..0f1a7c2 100644 --- a/manual/release-notes.rst +++ b/manual/release-notes.rst @@ -29,6 +29,12 @@ more detail. - There have been further enhancements to how files with damaged xref tables are recovered. + - Other changes + + - The parsing of object streams including the creation of error/warning + messages and object descriptions has been refactored with some + improvement both in runtime and memory usage. + - There has been some refactoring of how object streams are written with some performance improvement.