Commit b37fc7174c3a10725c39b5a0d9e9587adf26e996

Authored by m-holger
1 parent 5eb9a18a

Fix parsing of object streams

... containing objects with no white-space between them.

To enforce the rule that objects end at the start-offset of the next
object, each object is parsed in its own object stream.

To facilitate this, a new private API input source is::OffsetBuffer has
been added which only contains the object but reports offsets relative to
the start of the object stream. This is adapted from OffsetInputSource by
changing the direction of the offset, endowing it with its own
BufferInputSource and stripping out checks duplicated in BufferInputSource.

Fixes the expected failure in the test case added in #1266.
include/qpdf/QPDF.hh
... ... @@ -45,6 +45,11 @@
45 45 #include <qpdf/QPDFWriter.hh>
46 46 #include <qpdf/QPDFXRefEntry.hh>
47 47  
  48 +namespace qpdf::is
  49 +{
  50 + class OffsetBuffer;
  51 +}
  52 +
48 53 class QPDF_Stream;
49 54 class BitStream;
50 55 class BitWriter;
... ... @@ -785,7 +790,7 @@ class QPDF
785 790 QPDFObjectHandle readObject(std::string const& description, QPDFObjGen og);
786 791 void readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);
787 792 void validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);
788   - QPDFObjectHandle readObjectInStream(BufferInputSource& input, int stream_id, int obj_id);
  793 + QPDFObjectHandle readObjectInStream(qpdf::is::OffsetBuffer& input, int stream_id, int obj_id);
789 794 size_t recoverStreamLength(
790 795 std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset);
791 796 QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0);
... ...
libqpdf/QPDFParser.cc
... ... @@ -12,6 +12,7 @@
12 12 #include <memory>
13 13  
14 14 using namespace std::literals;
  15 +using namespace qpdf;
15 16  
16 17 using ObjectPtr = std::shared_ptr<QPDFObject>;
17 18  
... ... @@ -87,7 +88,7 @@ QPDFParser::parse(
87 88  
88 89 std::pair<QPDFObjectHandle, bool>
89 90 QPDFParser::parse(
90   - BufferInputSource& input, int stream_id, int obj_id, qpdf::Tokenizer& tokenizer, QPDF& context)
  91 + is::OffsetBuffer& input, int stream_id, int obj_id, qpdf::Tokenizer& tokenizer, QPDF& context)
91 92 {
92 93 bool empty{false};
93 94 auto result = QPDFParser(
... ...
libqpdf/QPDF_objects.cc
... ... @@ -1288,7 +1288,7 @@ QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset
1288 1288 }
1289 1289  
1290 1290 QPDFObjectHandle
1291   -QPDF::readObjectInStream(BufferInputSource& input, int stream_id, int obj_id)
  1291 +QPDF::readObjectInStream(is::OffsetBuffer& input, int stream_id, int obj_id)
1292 1292 {
1293 1293 auto [object, empty] = QPDFParser::parse(input, stream_id, obj_id, m->tokenizer, *this);
1294 1294 if (empty) {
... ... @@ -1645,12 +1645,26 @@ QPDF::resolveObjectsInStream(int obj_stream_number)
1645 1645 "object stream " + std::to_string(obj_stream_number) + " has incorrect keys");
1646 1646 }
1647 1647  
1648   - std::vector<std::pair<int, long long>> offsets;
  1648 + // id, offset, size
  1649 + std::vector<std::tuple<int, qpdf_offset_t, size_t>> offsets;
1649 1650  
1650 1651 auto bp = obj_stream.getStreamData(qpdf_dl_specialized);
  1652 +
1651 1653 BufferInputSource input("", bp.get());
1652 1654  
  1655 + const auto b_size = bp->getSize();
  1656 + const auto end_offset = static_cast<qpdf_offset_t>(b_size);
  1657 + auto b_start = bp->getBuffer();
  1658 +
  1659 + if (first >= end_offset) {
  1660 + throw damagedPDF(
  1661 + "object " + std::to_string(obj_stream_number) + " 0",
  1662 + "object stream " + std::to_string(obj_stream_number) + " has invalid /First entry");
  1663 + }
  1664 +
  1665 + int id = 0;
1653 1666 long long last_offset = -1;
  1667 + bool is_first = true;
1654 1668 for (unsigned int i = 0; i < n; ++i) {
1655 1669 auto tnum = readToken(input);
1656 1670 auto toffset = readToken(input);
... ... @@ -1682,26 +1696,45 @@ QPDF::resolveObjectsInStream(int obj_stream_number)
1682 1696 std::to_string(last_offset) + ")"));
1683 1697 continue;
1684 1698 }
1685   - last_offset = offset;
1686 1699  
1687 1700 if (num > m->xref_table_max_id) {
1688 1701 continue;
1689 1702 }
1690 1703  
1691   - offsets.emplace_back(num, offset + first);
  1704 + if (first + offset >= end_offset) {
  1705 + warn(damaged(num, offset, "offset is too large"));
  1706 + continue;
  1707 + }
  1708 +
  1709 + if (is_first) {
  1710 + is_first = false;
  1711 + } else {
  1712 + offsets.emplace_back(
  1713 + id, last_offset + first, static_cast<size_t>(offset - last_offset));
  1714 + }
  1715 +
  1716 + last_offset = offset;
  1717 + id = num;
  1718 + }
  1719 +
  1720 + if (!is_first) {
  1721 + // We found at least one valid entry.
  1722 + offsets.emplace_back(
  1723 + id, last_offset + first, b_size - static_cast<size_t>(last_offset + first));
1692 1724 }
1693 1725  
1694 1726 // To avoid having to read the object stream multiple times, store all objects that would be
1695 1727 // found here in the cache. Remember that some objects stored here might have been overridden
1696 1728 // by new objects appended to the file, so it is necessary to recheck the xref table and only
1697 1729 // cache what would actually be resolved here.
1698   - for (auto const& [id, offset]: offsets) {
1699   - QPDFObjGen og(id, 0);
  1730 + for (auto const& [obj_id, obj_offset, obj_size]: offsets) {
  1731 + QPDFObjGen og(obj_id, 0);
1700 1732 auto entry = m->xref_table.find(og);
1701 1733 if (entry != m->xref_table.end() && entry->second.getType() == 2 &&
1702 1734 entry->second.getObjStreamNumber() == obj_stream_number) {
1703   - input.seek(offset, SEEK_SET);
1704   - QPDFObjectHandle oh = readObjectInStream(input, obj_stream_number, id);
  1735 + Buffer obj_buffer{b_start + obj_offset, obj_size};
  1736 + is::OffsetBuffer in("", &obj_buffer, obj_offset);
  1737 + auto oh = readObjectInStream(in, obj_stream_number, obj_id);
1705 1738 updateCache(og, oh.getObj(), end_before_space, end_after_space);
1706 1739 } else {
1707 1740 QTC::TC("qpdf", "QPDF not caching overridden objstm object");
... ...
libqpdf/qpdf/InputSource_private.hh
1 1 #ifndef QPDF_INPUTSOURCE_PRIVATE_HH
2 2 #define QPDF_INPUTSOURCE_PRIVATE_HH
3 3  
  4 +#include <qpdf/BufferInputSource.hh>
4 5 #include <qpdf/InputSource.hh>
5 6  
  7 +#include <limits>
  8 +#include <sstream>
  9 +#include <stdexcept>
  10 +
  11 +namespace qpdf::is
  12 +{
  13 + class OffsetBuffer final: public InputSource
  14 + {
  15 + public:
  16 + OffsetBuffer(std::string const& description, Buffer* buf, qpdf_offset_t global_offset) :
  17 + proxied(description, buf),
  18 + global_offset(global_offset)
  19 + {
  20 + if (global_offset < 0) {
  21 + throw std::logic_error("is::OffsetBuffer constructed with negative offset");
  22 + }
  23 + last_offset = global_offset;
  24 + }
  25 +
  26 + ~OffsetBuffer() final = default;
  27 +
  28 + qpdf_offset_t
  29 + findAndSkipNextEOL() final
  30 + {
  31 + return proxied.findAndSkipNextEOL() + global_offset;
  32 + }
  33 +
  34 + std::string const&
  35 + getName() const final
  36 + {
  37 + return proxied.getName();
  38 + }
  39 +
  40 + qpdf_offset_t
  41 + tell() final
  42 + {
  43 + return proxied.tell() + global_offset;
  44 + }
  45 +
  46 + void
  47 + seek(qpdf_offset_t offset, int whence) final
  48 + {
  49 + if (whence == SEEK_SET) {
  50 + proxied.seek(offset - global_offset, whence);
  51 + } else {
  52 + proxied.seek(offset, whence);
  53 + }
  54 + }
  55 +
  56 + void
  57 + rewind() final
  58 + {
  59 + seek(0, SEEK_SET);
  60 + }
  61 +
  62 + size_t
  63 + read(char* buffer, size_t length) final
  64 + {
  65 + size_t result = proxied.read(buffer, length);
  66 + setLastOffset(proxied.getLastOffset() + global_offset);
  67 + return result;
  68 + }
  69 +
  70 + void
  71 + unreadCh(char ch) final
  72 + {
  73 + proxied.unreadCh(ch);
  74 + }
  75 +
  76 + private:
  77 + BufferInputSource proxied;
  78 + qpdf_offset_t global_offset;
  79 + };
  80 +
  81 +} // namespace qpdf::is
  82 +
6 83 inline size_t
7 84 InputSource::read(std::string& str, size_t count, qpdf_offset_t at)
8 85 {
... ...
libqpdf/qpdf/QPDFParser.hh
1 1 #ifndef QPDFPARSER_HH
2 2 #define QPDFPARSER_HH
3 3  
  4 +#include <qpdf/InputSource_private.hh>
4 5 #include <qpdf/QPDFObjectHandle_private.hh>
5 6 #include <qpdf/QPDFObject_private.hh>
6 7 #include <qpdf/QPDFTokenizer_private.hh>
... ... @@ -38,7 +39,7 @@ class QPDFParser
38 39 QPDF& context);
39 40  
40 41 static std::pair<QPDFObjectHandle, bool> parse(
41   - BufferInputSource& input,
  42 + qpdf::is::OffsetBuffer& input,
42 43 int stream_id,
43 44 int obj_id,
44 45 qpdf::Tokenizer& tokenizer,
... ...
manual/release-notes.rst
... ... @@ -21,6 +21,11 @@ more detail.
21 21 integer object. Previously the method returned false if the first
22 22 dictionary object was not a linearization parameter dictionary.
23 23  
  24 + - Fix parsing of object streams containing objects not separated by
  25 + white-space. Pre-2020 editions of the PDF specification incorrectly
  26 + stated that white-space was required between objects. qpdf relied on this
  27 + when parsing object streams.
  28 +
24 29 - Fix two object stream error/warning messages that reported the wrong
25 30 object id.
26 31  
... ...
qpdf/qtest/object-stream.test
... ... @@ -124,7 +124,7 @@ $td->runtest("adjacent compressed objects",
124 124 {$td->COMMAND => "test_driver 99 no-space-compressed-object.pdf"},
125 125 {$td->FILE => "no-space-compressed-object.out",
126 126 $td->EXIT_STATUS => 0},
127   - $td->EXPECT_FAILURE);
  127 + $td->NORMALIZE_NEWLINES);
128 128  
129 129 cleanup();
130 130 $td->report(calc_ntests($n_tests, $n_compare_pdfs));
... ...