diff --git a/include/qpdf/QPDF.hh b/include/qpdf/QPDF.hh index 150f7e6..9f076af 100644 --- a/include/qpdf/QPDF.hh +++ b/include/qpdf/QPDF.hh @@ -48,6 +48,7 @@ class QPDF_Stream; class BitStream; class BitWriter; +class BufferInputSource; class QPDFLogger; class QPDFParser; @@ -784,7 +785,7 @@ class QPDF QPDFObjectHandle readObject(std::string const& description, QPDFObjGen og); void readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset); void validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset); - QPDFObjectHandle readObjectInStream(std::shared_ptr& input, int obj); + QPDFObjectHandle readObjectInStream(BufferInputSource& input, int stream_id, int obj_id); size_t recoverStreamLength( std::shared_ptr input, QPDFObjGen og, qpdf_offset_t stream_offset); QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0); diff --git a/libqpdf/QPDFObject.cc b/libqpdf/QPDFObject.cc index a5cb986..ef568cc 100644 --- a/libqpdf/QPDFObject.cc +++ b/libqpdf/QPDFObject.cc @@ -3,6 +3,10 @@ std::string QPDFObject::getDescription() { + qpdf_offset_t shift = (getTypeCode() == ::ot_dictionary) ? 2 + : (getTypeCode() == ::ot_array) ? 1 + : 0; + if (object_description) { switch (object_description->index()) { case 0: @@ -14,10 +18,6 @@ QPDFObject::getDescription() description.replace(pos, 3, og.unparse(' ')); } if (auto pos = description.find("$PO"); pos != std::string::npos) { - qpdf_offset_t shift = (getTypeCode() == ::ot_dictionary) ? 2 - : (getTypeCode() == ::ot_array) ? 1 - : 0; - description.replace(pos, 3, std::to_string(parsed_offset + shift)); } return description; @@ -44,7 +44,14 @@ QPDFObject::getDescription() } return result; } + case 3: + auto [stream_id, obj_id] = std::get<3>(*object_description); + std::string result = qpdf ? 
qpdf->getFilename() : ""; + result += " object stream " + std::to_string(stream_id) + ", object " + + std::to_string(obj_id) + " 0 at offset " + std::to_string(parsed_offset + shift); + return result; } + } else if (og.isIndirect()) { return "object " + og.unparse(' '); } diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index 3edbc44..fdb4827 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -10,6 +10,8 @@ #include +using namespace std::literals; + using ObjectPtr = std::shared_ptr; QPDFObjectHandle @@ -524,7 +526,13 @@ QPDFParser::warnDuplicateKey() void QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const { - warn(QPDFExc(qpdf_e_damaged_pdf, input.getName(), object_description, offset, msg)); + if (stream_id) { + std::string descr = "object "s + std::to_string(obj_id) + " 0"; + std::string name = context->getFilename() + " object stream " + std::to_string(stream_id); + warn(QPDFExc(qpdf_e_damaged_pdf, name, descr, offset, msg)); + } else { + warn(QPDFExc(qpdf_e_damaged_pdf, input.getName(), object_description, offset, msg)); + } } void diff --git a/libqpdf/QPDF_objects.cc b/libqpdf/QPDF_objects.cc index 8fc86f7..8142bd2 100644 --- a/libqpdf/QPDF_objects.cc +++ b/libqpdf/QPDF_objects.cc @@ -1292,19 +1292,22 @@ QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset } QPDFObjectHandle -QPDF::readObjectInStream(std::shared_ptr& input, int obj) +QPDF::readObjectInStream(BufferInputSource& input, int stream_id, int obj_id) { - m->last_object_description.erase(7); // last_object_description starts with "object " - m->last_object_description += std::to_string(obj); - m->last_object_description += " 0"; - bool empty = false; - auto object = QPDFParser(*input, m->last_object_description, m->tokenizer, nullptr, this, true) - .parse(empty, false); + auto object = + QPDFParser(input, stream_id, obj_id, m->last_object_description, m->tokenizer, this) + .parse(empty, false); if (empty) { // Nothing in the PDF 
spec appears to allow empty objects, but they have been encountered in // actual PDF files and Adobe Reader appears to ignore them. - warn(damagedPDF(*input, input->getLastOffset(), "empty object treated as null")); + warn(QPDFExc( + qpdf_e_damaged_pdf, + m->file->getName() + " object stream " + std::to_string(stream_id), + +"object " + std::to_string(obj_id) + " 0, offset " + + std::to_string(input.getLastOffset()), + 0, + "empty object treated as null")); } return object; } @@ -1605,13 +1608,23 @@ QPDF::resolve(QPDFObjGen og) void QPDF::resolveObjectsInStream(int obj_stream_number) { + auto damaged = + [this, obj_stream_number](int id, qpdf_offset_t offset, std::string const& msg) -> QPDFExc { + return { + qpdf_e_damaged_pdf, + m->file->getName() + " object stream " + std::to_string(obj_stream_number), + +"object " + std::to_string(id) + " 0", + offset, + msg}; + }; + if (m->resolved_object_streams.count(obj_stream_number)) { return; } m->resolved_object_streams.insert(obj_stream_number); // Force resolution of object stream - QPDFObjectHandle obj_stream = getObjectByID(obj_stream_number, 0); - if (!obj_stream.isStream()) { + auto obj_stream = getObject(obj_stream_number, 0).as_stream(); + if (!obj_stream) { throw damagedPDF( "object " + std::to_string(obj_stream_number) + " 0", "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream"); @@ -1631,34 +1644,25 @@ QPDF::resolveObjectsInStream(int obj_stream_number) "supposed object stream " + std::to_string(obj_stream_number) + " has wrong type")); } - if (!(dict.getKey("/N").isInteger() && dict.getKey("/First").isInteger())) { + unsigned int n{0}; + int first{0}; + if (!(dict.getKey("/N").getValueAsUInt(n) && dict.getKey("/First").getValueAsInt(first))) { throw damagedPDF( "object " + std::to_string(obj_stream_number) + " 0", "object stream " + std::to_string(obj_stream_number) + " has incorrect keys"); } - int n = dict.getKey("/N").getIntValueAsInt(); - int first = 
dict.getKey("/First").getIntValueAsInt(); + std::vector> offsets; - std::map offsets; - - std::shared_ptr bp = obj_stream.getStreamData(qpdf_dl_specialized); - auto input = std::shared_ptr( - // line-break - new BufferInputSource( - (m->file->getName() + " object stream " + std::to_string(obj_stream_number)), - bp.get())); + auto bp = obj_stream.getStreamData(qpdf_dl_specialized); + BufferInputSource input("", bp.get()); long long last_offset = -1; - for (int i = 0; i < n; ++i) { - QPDFTokenizer::Token tnum = readToken(*input); - QPDFTokenizer::Token toffset = readToken(*input); + for (unsigned int i = 0; i < n; ++i) { + auto tnum = readToken(input); + auto toffset = readToken(input); if (!(tnum.isInteger() && toffset.isInteger())) { - throw damagedPDF( - *input, - "object " + std::to_string(obj_stream_number) + " 0", - input->getLastOffset(), - "expected integer in object stream header"); + throw damaged(0, input.getLastOffset(), "expected integer in object stream header"); } int num = QUtil::string_to_int(tnum.getValue().c_str()); @@ -1666,29 +1670,20 @@ QPDF::resolveObjectsInStream(int obj_stream_number) if (num == obj_stream_number) { QTC::TC("qpdf", "QPDF ignore self-referential object stream"); - warn(damagedPDF( - *input, - "object " + std::to_string(obj_stream_number) + " 0", - input->getLastOffset(), - "object stream claims to contain itself")); + warn(damaged(num, input.getLastOffset(), "object stream claims to contain itself")); continue; } if (num < 1) { QTC::TC("qpdf", "QPDF object stream contains id < 1"); - warn(damagedPDF( - *input, - "object " + std::to_string(num) + " 0", - input->getLastOffset(), - "object id is invalid"s)); + warn(damaged(num, input.getLastOffset(), "object id is invalid"s)); continue; } if (offset <= last_offset) { QTC::TC("qpdf", "QPDF object stream offsets not increasing"); - warn(damagedPDF( - *input, - "object " + std::to_string(num) + " 0", + warn(damaged( + num, offset, "offset is invalid (must be larger than previous 
offset " + std::to_string(last_offset) + ")")); @@ -1700,23 +1695,20 @@ QPDF::resolveObjectsInStream(int obj_stream_number) continue; } - offsets[num] = toI(offset + first); + offsets.emplace_back(num, offset + first); } // To avoid having to read the object stream multiple times, store all objects that would be // found here in the cache. Remember that some objects stored here might have been overridden // by new objects appended to the file, so it is necessary to recheck the xref table and only // cache what would actually be resolved here. - m->last_object_description.clear(); - m->last_object_description += "object "; - for (auto const& iter: offsets) { - QPDFObjGen og(iter.first, 0); + for (auto const& [id, offset]: offsets) { + QPDFObjGen og(id, 0); auto entry = m->xref_table.find(og); if (entry != m->xref_table.end() && entry->second.getType() == 2 && entry->second.getObjStreamNumber() == obj_stream_number) { - int offset = iter.second; - input->seek(offset, SEEK_SET); - QPDFObjectHandle oh = readObjectInStream(input, iter.first); + input.seek(offset, SEEK_SET); + QPDFObjectHandle oh = readObjectInStream(input, obj_stream_number, id); updateCache(og, oh.getObj(), end_before_space, end_after_space); } else { QTC::TC("qpdf", "QPDF not caching overridden objstm object"); diff --git a/libqpdf/qpdf/QPDFObjectHandle_private.hh b/libqpdf/qpdf/QPDFObjectHandle_private.hh index 009f43b..2473c2b 100644 --- a/libqpdf/qpdf/QPDFObjectHandle_private.hh +++ b/libqpdf/qpdf/QPDFObjectHandle_private.hh @@ -4,6 +4,7 @@ #include #include +#include #include namespace qpdf @@ -428,6 +429,18 @@ QPDFObject::create(Args&&... 
args) return std::make_shared(std::forward(T(std::forward(args)...))); } +inline qpdf_object_type_e +QPDFObject::getResolvedTypeCode() const +{ + if (getTypeCode() == ::ot_unresolved) { + return QPDF::Resolver::resolved(qpdf, og)->getTypeCode(); + } + if (getTypeCode() == ::ot_reference) { + return std::get(value).obj->getTypeCode(); + } + return getTypeCode(); +} + inline qpdf::Array QPDFObjectHandle::as_array(qpdf::typed options) const { diff --git a/libqpdf/qpdf/QPDFObject_private.hh b/libqpdf/qpdf/QPDFObject_private.hh index 8a9be46..61b8d60 100644 --- a/libqpdf/qpdf/QPDFObject_private.hh +++ b/libqpdf/qpdf/QPDFObject_private.hh @@ -7,8 +7,8 @@ #include #include #include +#include #include -#include #include #include @@ -301,17 +301,8 @@ class QPDFObject std::string getStringValue() const; // Return a unique type code for the resolved object - qpdf_object_type_e - getResolvedTypeCode() const - { - if (getTypeCode() == ::ot_unresolved) { - return QPDF::Resolver::resolved(qpdf, og)->getTypeCode(); - } - if (getTypeCode() == ::ot_reference) { - return std::get(value).obj->getTypeCode(); - } - return getTypeCode(); - } + inline qpdf_object_type_e getResolvedTypeCode() const; + // Return a unique type code for the object qpdf_object_type_e getTypeCode() const @@ -390,7 +381,17 @@ class QPDFObject std::string var_descr; }; - using Description = std::variant; + struct ObjStreamDescr + { + ObjStreamDescr(int stream_id, int obj_id) : + stream_id(stream_id), + obj_id(obj_id) {}; + + int stream_id; + int obj_id; + }; + + using Description = std::variant; void setDescription( diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh index 184308e..f422fda 100644 --- a/libqpdf/qpdf/QPDFParser.hh +++ b/libqpdf/qpdf/QPDFParser.hh @@ -62,9 +62,32 @@ class QPDFParser decrypter(nullptr), context(context), description(std::move(sp_description)), - parse_pdf(false) + parse_pdf(true) { } + + // Used by readObjectInStream only + QPDFParser( + InputSource& input, + int 
stream_id, + int obj_id, + std::string const& object_description, + qpdf::Tokenizer& tokenizer, + QPDF* context) : + input(input), + object_description(object_description), + tokenizer(tokenizer), + decrypter(nullptr), + context(context), + description( + std::make_shared( + QPDFObject::ObjStreamDescr(stream_id, obj_id))), + parse_pdf(true), + stream_id(stream_id), + obj_id(obj_id) + { + } + ~QPDFParser() = default; QPDFObjectHandle parse(bool& empty, bool content_stream); @@ -124,6 +147,8 @@ class QPDFParser QPDF* context; std::shared_ptr description; bool parse_pdf; + int stream_id{0}; + int obj_id{0}; std::vector stack; StackFrame* frame{nullptr}; diff --git a/libqpdf/qpdf/QPDF_private.hh b/libqpdf/qpdf/QPDF_private.hh index a7d9989..33621af 100644 --- a/libqpdf/qpdf/QPDF_private.hh +++ b/libqpdf/qpdf/QPDF_private.hh @@ -3,6 +3,7 @@ #include +#include #include // Writer class is restricted to QPDFWriter so that only it can call certain methods. @@ -457,6 +458,7 @@ class QPDF::Members qpdf::Tokenizer tokenizer; std::shared_ptr file; std::string last_object_description; + std::shared_ptr last_ostream_description; bool provided_password_is_hex_key{false}; bool ignore_xref_streams{false}; bool suppress_warnings{false}; diff --git a/manual/release-notes.rst b/manual/release-notes.rst index e0e889d..0f1a7c2 100644 --- a/manual/release-notes.rst +++ b/manual/release-notes.rst @@ -29,6 +29,12 @@ more detail. - There have been further enhancements to how files with damaged xref tables are recovered. + - Other changes + + - The parsing of object streams, including the creation of error/warning + messages and object descriptions, has been refactored with some + improvement in both runtime and memory usage. + - There has been some refactoring of how object streams are written with some performance improvement.