Commit be25fc30d471b55132eab7f590f3db7a51c7fbd2

Authored by m-holger
Committed by GitHub
2 parents 49043112 626d5061

Merge pull request #1397 from m-holger/ostream

Refactor QPDF::resolveObjectsInStream
include/qpdf/QPDF.hh
... ... @@ -48,6 +48,7 @@
48 48 class QPDF_Stream;
49 49 class BitStream;
50 50 class BitWriter;
  51 +class BufferInputSource;
51 52 class QPDFLogger;
52 53 class QPDFParser;
53 54  
... ... @@ -784,7 +785,7 @@ class QPDF
784 785 QPDFObjectHandle readObject(std::string const& description, QPDFObjGen og);
785 786 void readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);
786 787 void validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);
787   - QPDFObjectHandle readObjectInStream(std::shared_ptr<InputSource>& input, int obj);
  788 + QPDFObjectHandle readObjectInStream(BufferInputSource& input, int stream_id, int obj_id);
788 789 size_t recoverStreamLength(
789 790 std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset);
790 791 QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0);
... ...
libqpdf/QPDFObject.cc
... ... @@ -3,6 +3,10 @@
3 3 std::string
4 4 QPDFObject::getDescription()
5 5 {
  6 + qpdf_offset_t shift = (getTypeCode() == ::ot_dictionary) ? 2
  7 + : (getTypeCode() == ::ot_array) ? 1
  8 + : 0;
  9 +
6 10 if (object_description) {
7 11 switch (object_description->index()) {
8 12 case 0:
... ... @@ -14,10 +18,6 @@ QPDFObject::getDescription()
14 18 description.replace(pos, 3, og.unparse(' '));
15 19 }
16 20 if (auto pos = description.find("$PO"); pos != std::string::npos) {
17   - qpdf_offset_t shift = (getTypeCode() == ::ot_dictionary) ? 2
18   - : (getTypeCode() == ::ot_array) ? 1
19   - : 0;
20   -
21 21 description.replace(pos, 3, std::to_string(parsed_offset + shift));
22 22 }
23 23 return description;
... ... @@ -44,7 +44,14 @@ QPDFObject::getDescription()
44 44 }
45 45 return result;
46 46 }
  47 + case 3:
  48 + auto [stream_id, obj_id] = std::get<3>(*object_description);
  49 + std::string result = qpdf ? qpdf->getFilename() : "";
  50 + result += " object stream " + std::to_string(stream_id) + ", object " +
  51 + std::to_string(obj_id) + " 0 at offset " + std::to_string(parsed_offset + shift);
  52 + return result;
47 53 }
  54 +
48 55 } else if (og.isIndirect()) {
49 56 return "object " + og.unparse(' ');
50 57 }
... ...
libqpdf/QPDFParser.cc
... ... @@ -10,6 +10,8 @@
10 10  
11 11 #include <memory>
12 12  
  13 +using namespace std::literals;
  14 +
13 15 using ObjectPtr = std::shared_ptr<QPDFObject>;
14 16  
15 17 QPDFObjectHandle
... ... @@ -524,7 +526,13 @@ QPDFParser::warnDuplicateKey()
524 526 void
525 527 QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const
526 528 {
527   - warn(QPDFExc(qpdf_e_damaged_pdf, input.getName(), object_description, offset, msg));
  529 + if (stream_id) {
  530 + std::string descr = "object "s + std::to_string(obj_id) + " 0";
  531 + std::string name = context->getFilename() + " object stream " + std::to_string(stream_id);
  532 + warn(QPDFExc(qpdf_e_damaged_pdf, name, descr, offset, msg));
  533 + } else {
  534 + warn(QPDFExc(qpdf_e_damaged_pdf, input.getName(), object_description, offset, msg));
  535 + }
528 536 }
529 537  
530 538 void
... ...
libqpdf/QPDF_objects.cc
... ... @@ -1292,19 +1292,22 @@ QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset
1292 1292 }
1293 1293  
1294 1294 QPDFObjectHandle
1295   -QPDF::readObjectInStream(std::shared_ptr<InputSource>& input, int obj)
  1295 +QPDF::readObjectInStream(BufferInputSource& input, int stream_id, int obj_id)
1296 1296 {
1297   - m->last_object_description.erase(7); // last_object_description starts with "object "
1298   - m->last_object_description += std::to_string(obj);
1299   - m->last_object_description += " 0";
1300   -
1301 1297 bool empty = false;
1302   - auto object = QPDFParser(*input, m->last_object_description, m->tokenizer, nullptr, this, true)
1303   - .parse(empty, false);
  1298 + auto object =
  1299 + QPDFParser(input, stream_id, obj_id, m->last_object_description, m->tokenizer, this)
  1300 + .parse(empty, false);
1304 1301 if (empty) {
1305 1302 // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
1306 1303 // actual PDF files and Adobe Reader appears to ignore them.
1307   - warn(damagedPDF(*input, input->getLastOffset(), "empty object treated as null"));
  1304 + warn(QPDFExc(
  1305 + qpdf_e_damaged_pdf,
  1306 + m->file->getName() + " object stream " + std::to_string(stream_id),
  1307 + +"object " + std::to_string(obj_id) + " 0, offset " +
  1308 + std::to_string(input.getLastOffset()),
  1309 + 0,
  1310 + "empty object treated as null"));
1308 1311 }
1309 1312 return object;
1310 1313 }
... ... @@ -1605,13 +1608,23 @@ QPDF::resolve(QPDFObjGen og)
1605 1608 void
1606 1609 QPDF::resolveObjectsInStream(int obj_stream_number)
1607 1610 {
  1611 + auto damaged =
  1612 + [this, obj_stream_number](int id, qpdf_offset_t offset, std::string const& msg) -> QPDFExc {
  1613 + return {
  1614 + qpdf_e_damaged_pdf,
  1615 + m->file->getName() + " object stream " + std::to_string(obj_stream_number),
  1616 + +"object " + std::to_string(id) + " 0",
  1617 + offset,
  1618 + msg};
  1619 + };
  1620 +
1608 1621 if (m->resolved_object_streams.count(obj_stream_number)) {
1609 1622 return;
1610 1623 }
1611 1624 m->resolved_object_streams.insert(obj_stream_number);
1612 1625 // Force resolution of object stream
1613   - QPDFObjectHandle obj_stream = getObjectByID(obj_stream_number, 0);
1614   - if (!obj_stream.isStream()) {
  1626 + auto obj_stream = getObject(obj_stream_number, 0).as_stream();
  1627 + if (!obj_stream) {
1615 1628 throw damagedPDF(
1616 1629 "object " + std::to_string(obj_stream_number) + " 0",
1617 1630 "supposed object stream " + std::to_string(obj_stream_number) + " is not a stream");
... ... @@ -1631,34 +1644,25 @@ QPDF::resolveObjectsInStream(int obj_stream_number)
1631 1644 "supposed object stream " + std::to_string(obj_stream_number) + " has wrong type"));
1632 1645 }
1633 1646  
1634   - if (!(dict.getKey("/N").isInteger() && dict.getKey("/First").isInteger())) {
  1647 + unsigned int n{0};
  1648 + int first{0};
  1649 + if (!(dict.getKey("/N").getValueAsUInt(n) && dict.getKey("/First").getValueAsInt(first))) {
1635 1650 throw damagedPDF(
1636 1651 "object " + std::to_string(obj_stream_number) + " 0",
1637 1652 "object stream " + std::to_string(obj_stream_number) + " has incorrect keys");
1638 1653 }
1639 1654  
1640   - int n = dict.getKey("/N").getIntValueAsInt();
1641   - int first = dict.getKey("/First").getIntValueAsInt();
  1655 + std::vector<std::pair<int, long long>> offsets;
1642 1656  
1643   - std::map<int, int> offsets;
1644   -
1645   - std::shared_ptr<Buffer> bp = obj_stream.getStreamData(qpdf_dl_specialized);
1646   - auto input = std::shared_ptr<InputSource>(
1647   - // line-break
1648   - new BufferInputSource(
1649   - (m->file->getName() + " object stream " + std::to_string(obj_stream_number)),
1650   - bp.get()));
  1657 + auto bp = obj_stream.getStreamData(qpdf_dl_specialized);
  1658 + BufferInputSource input("", bp.get());
1651 1659  
1652 1660 long long last_offset = -1;
1653   - for (int i = 0; i < n; ++i) {
1654   - QPDFTokenizer::Token tnum = readToken(*input);
1655   - QPDFTokenizer::Token toffset = readToken(*input);
  1661 + for (unsigned int i = 0; i < n; ++i) {
  1662 + auto tnum = readToken(input);
  1663 + auto toffset = readToken(input);
1656 1664 if (!(tnum.isInteger() && toffset.isInteger())) {
1657   - throw damagedPDF(
1658   - *input,
1659   - "object " + std::to_string(obj_stream_number) + " 0",
1660   - input->getLastOffset(),
1661   - "expected integer in object stream header");
  1665 + throw damaged(0, input.getLastOffset(), "expected integer in object stream header");
1662 1666 }
1663 1667  
1664 1668 int num = QUtil::string_to_int(tnum.getValue().c_str());
... ... @@ -1666,29 +1670,20 @@ QPDF::resolveObjectsInStream(int obj_stream_number)
1666 1670  
1667 1671 if (num == obj_stream_number) {
1668 1672 QTC::TC("qpdf", "QPDF ignore self-referential object stream");
1669   - warn(damagedPDF(
1670   - *input,
1671   - "object " + std::to_string(obj_stream_number) + " 0",
1672   - input->getLastOffset(),
1673   - "object stream claims to contain itself"));
  1673 + warn(damaged(num, input.getLastOffset(), "object stream claims to contain itself"));
1674 1674 continue;
1675 1675 }
1676 1676  
1677 1677 if (num < 1) {
1678 1678 QTC::TC("qpdf", "QPDF object stream contains id < 1");
1679   - warn(damagedPDF(
1680   - *input,
1681   - "object " + std::to_string(num) + " 0",
1682   - input->getLastOffset(),
1683   - "object id is invalid"s));
  1679 + warn(damaged(num, input.getLastOffset(), "object id is invalid"s));
1684 1680 continue;
1685 1681 }
1686 1682  
1687 1683 if (offset <= last_offset) {
1688 1684 QTC::TC("qpdf", "QPDF object stream offsets not increasing");
1689   - warn(damagedPDF(
1690   - *input,
1691   - "object " + std::to_string(num) + " 0",
  1685 + warn(damaged(
  1686 + num,
1692 1687 offset,
1693 1688 "offset is invalid (must be larger than previous offset " +
1694 1689 std::to_string(last_offset) + ")"));
... ... @@ -1700,23 +1695,20 @@ QPDF::resolveObjectsInStream(int obj_stream_number)
1700 1695 continue;
1701 1696 }
1702 1697  
1703   - offsets[num] = toI(offset + first);
  1698 + offsets.emplace_back(num, offset + first);
1704 1699 }
1705 1700  
1706 1701 // To avoid having to read the object stream multiple times, store all objects that would be
1707 1702 // found here in the cache. Remember that some objects stored here might have been overridden
1708 1703 // by new objects appended to the file, so it is necessary to recheck the xref table and only
1709 1704 // cache what would actually be resolved here.
1710   - m->last_object_description.clear();
1711   - m->last_object_description += "object ";
1712   - for (auto const& iter: offsets) {
1713   - QPDFObjGen og(iter.first, 0);
  1705 + for (auto const& [id, offset]: offsets) {
  1706 + QPDFObjGen og(id, 0);
1714 1707 auto entry = m->xref_table.find(og);
1715 1708 if (entry != m->xref_table.end() && entry->second.getType() == 2 &&
1716 1709 entry->second.getObjStreamNumber() == obj_stream_number) {
1717   - int offset = iter.second;
1718   - input->seek(offset, SEEK_SET);
1719   - QPDFObjectHandle oh = readObjectInStream(input, iter.first);
  1710 + input.seek(offset, SEEK_SET);
  1711 + QPDFObjectHandle oh = readObjectInStream(input, obj_stream_number, id);
1720 1712 updateCache(og, oh.getObj(), end_before_space, end_after_space);
1721 1713 } else {
1722 1714 QTC::TC("qpdf", "QPDF not caching overridden objstm object");
... ...
libqpdf/qpdf/QPDFObjectHandle_private.hh
... ... @@ -4,6 +4,7 @@
4 4 #include <qpdf/QPDFObjectHandle.hh>
5 5  
6 6 #include <qpdf/QPDFObject_private.hh>
  7 +#include <qpdf/QPDF_private.hh>
7 8 #include <qpdf/QUtil.hh>
8 9  
9 10 namespace qpdf
... ... @@ -428,6 +429,18 @@ QPDFObject::create(Args&&... args)
428 429 return std::make_shared<QPDFObject>(std::forward<T>(T(std::forward<Args>(args)...)));
429 430 }
430 431  
  432 +inline qpdf_object_type_e
  433 +QPDFObject::getResolvedTypeCode() const
  434 +{
  435 + if (getTypeCode() == ::ot_unresolved) {
  436 + return QPDF::Resolver::resolved(qpdf, og)->getTypeCode();
  437 + }
  438 + if (getTypeCode() == ::ot_reference) {
  439 + return std::get<QPDF_Reference>(value).obj->getTypeCode();
  440 + }
  441 + return getTypeCode();
  442 +}
  443 +
431 444 inline qpdf::Array
432 445 QPDFObjectHandle::as_array(qpdf::typed options) const
433 446 {
... ...
libqpdf/qpdf/QPDFObject_private.hh
... ... @@ -7,8 +7,8 @@
7 7 #include <qpdf/Constants.h>
8 8 #include <qpdf/JSON.hh>
9 9 #include <qpdf/JSON_writer.hh>
  10 +#include <qpdf/QPDF.hh>
10 11 #include <qpdf/QPDFObjGen.hh>
11   -#include <qpdf/QPDF_private.hh>
12 12 #include <qpdf/Types.h>
13 13  
14 14 #include <map>
... ... @@ -301,17 +301,8 @@ class QPDFObject
301 301 std::string getStringValue() const;
302 302  
303 303 // Return a unique type code for the resolved object
304   - qpdf_object_type_e
305   - getResolvedTypeCode() const
306   - {
307   - if (getTypeCode() == ::ot_unresolved) {
308   - return QPDF::Resolver::resolved(qpdf, og)->getTypeCode();
309   - }
310   - if (getTypeCode() == ::ot_reference) {
311   - return std::get<QPDF_Reference>(value).obj->getTypeCode();
312   - }
313   - return getTypeCode();
314   - }
  304 + inline qpdf_object_type_e getResolvedTypeCode() const;
  305 +
315 306 // Return a unique type code for the object
316 307 qpdf_object_type_e
317 308 getTypeCode() const
... ... @@ -390,7 +381,17 @@ class QPDFObject
390 381 std::string var_descr;
391 382 };
392 383  
393   - using Description = std::variant<std::string, JSON_Descr, ChildDescr>;
  384 + struct ObjStreamDescr
  385 + {
  386 + ObjStreamDescr(int stream_id, int obj_id) :
  387 + stream_id(stream_id),
  388 + obj_id(obj_id) {};
  389 +
  390 + int stream_id;
  391 + int obj_id;
  392 + };
  393 +
  394 + using Description = std::variant<std::string, JSON_Descr, ChildDescr, ObjStreamDescr>;
394 395  
395 396 void
396 397 setDescription(
... ...
libqpdf/qpdf/QPDFParser.hh
... ... @@ -62,9 +62,32 @@ class QPDFParser
62 62 decrypter(nullptr),
63 63 context(context),
64 64 description(std::move(sp_description)),
65   - parse_pdf(false)
  65 + parse_pdf(true)
66 66 {
67 67 }
  68 +
  69 + // Used by readObjectInStream only
  70 + QPDFParser(
  71 + InputSource& input,
  72 + int stream_id,
  73 + int obj_id,
  74 + std::string const& object_description,
  75 + qpdf::Tokenizer& tokenizer,
  76 + QPDF* context) :
  77 + input(input),
  78 + object_description(object_description),
  79 + tokenizer(tokenizer),
  80 + decrypter(nullptr),
  81 + context(context),
  82 + description(
  83 + std::make_shared<QPDFObject::Description>(
  84 + QPDFObject::ObjStreamDescr(stream_id, obj_id))),
  85 + parse_pdf(true),
  86 + stream_id(stream_id),
  87 + obj_id(obj_id)
  88 + {
  89 + }
  90 +
68 91 ~QPDFParser() = default;
69 92  
70 93 QPDFObjectHandle parse(bool& empty, bool content_stream);
... ... @@ -124,6 +147,8 @@ class QPDFParser
124 147 QPDF* context;
125 148 std::shared_ptr<QPDFObject::Description> description;
126 149 bool parse_pdf;
  150 + int stream_id{0};
  151 + int obj_id{0};
127 152  
128 153 std::vector<StackFrame> stack;
129 154 StackFrame* frame{nullptr};
... ...
libqpdf/qpdf/QPDF_private.hh
... ... @@ -3,6 +3,7 @@
3 3  
4 4 #include <qpdf/QPDF.hh>
5 5  
  6 +#include <qpdf/QPDFObject_private.hh>
6 7 #include <qpdf/QPDFTokenizer_private.hh>
7 8  
8 9 // Writer class is restricted to QPDFWriter so that only it can call certain methods.
... ... @@ -457,6 +458,7 @@ class QPDF::Members
457 458 qpdf::Tokenizer tokenizer;
458 459 std::shared_ptr<InputSource> file;
459 460 std::string last_object_description;
  461 + std::shared_ptr<QPDFObject::Description> last_ostream_description;
460 462 bool provided_password_is_hex_key{false};
461 463 bool ignore_xref_streams{false};
462 464 bool suppress_warnings{false};
... ...
manual/release-notes.rst
... ... @@ -29,6 +29,12 @@ more detail.
29 29 - There have been further enhancements to how files with damaged xref
30 30 tables are recovered.
31 31  
  32 + - Other changes
  33 +
  34 + - The parsing of object streams, including the creation of error/warning
  35 + messages and object descriptions, has been refactored with some
  36 + improvement in both runtime and memory usage.
  37 +
32 38 - There has been some refactoring of how object streams are written with
33 39 some performance improvement.
34 40  
... ...