Commit 65e7e3db36ac731fb56d3c675c3bacfcd21dec61

Authored by m-holger
Committed by GitHub
2 parents 5eb9a18a b37fc717

Merge pull request #1400 from m-holger/pr1266

Fix parsing of object streams containing objects with no white-space between them
include/qpdf/QPDF.hh
... ... @@ -45,6 +45,11 @@
45 45 #include <qpdf/QPDFWriter.hh>
46 46 #include <qpdf/QPDFXRefEntry.hh>
47 47  
  48 +namespace qpdf::is
  49 +{
  50 + class OffsetBuffer;
  51 +}
  52 +
48 53 class QPDF_Stream;
49 54 class BitStream;
50 55 class BitWriter;
... ... @@ -785,7 +790,7 @@ class QPDF
785 790 QPDFObjectHandle readObject(std::string const& description, QPDFObjGen og);
786 791 void readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);
787 792 void validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);
788   - QPDFObjectHandle readObjectInStream(BufferInputSource& input, int stream_id, int obj_id);
  793 + QPDFObjectHandle readObjectInStream(qpdf::is::OffsetBuffer& input, int stream_id, int obj_id);
789 794 size_t recoverStreamLength(
790 795 std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset);
791 796 QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0);
... ...
libqpdf/QPDFParser.cc
... ... @@ -12,6 +12,7 @@
12 12 #include <memory>
13 13  
14 14 using namespace std::literals;
  15 +using namespace qpdf;
15 16  
16 17 using ObjectPtr = std::shared_ptr<QPDFObject>;
17 18  
... ... @@ -87,7 +88,7 @@ QPDFParser::parse(
87 88  
88 89 std::pair<QPDFObjectHandle, bool>
89 90 QPDFParser::parse(
90   - BufferInputSource& input, int stream_id, int obj_id, qpdf::Tokenizer& tokenizer, QPDF& context)
  91 + is::OffsetBuffer& input, int stream_id, int obj_id, qpdf::Tokenizer& tokenizer, QPDF& context)
91 92 {
92 93 bool empty{false};
93 94 auto result = QPDFParser(
... ...
libqpdf/QPDF_objects.cc
... ... @@ -1288,7 +1288,7 @@ QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset
1288 1288 }
1289 1289  
1290 1290 QPDFObjectHandle
1291   -QPDF::readObjectInStream(BufferInputSource& input, int stream_id, int obj_id)
  1291 +QPDF::readObjectInStream(is::OffsetBuffer& input, int stream_id, int obj_id)
1292 1292 {
1293 1293 auto [object, empty] = QPDFParser::parse(input, stream_id, obj_id, m->tokenizer, *this);
1294 1294 if (empty) {
... ... @@ -1645,12 +1645,26 @@ QPDF::resolveObjectsInStream(int obj_stream_number)
1645 1645 "object stream " + std::to_string(obj_stream_number) + " has incorrect keys");
1646 1646 }
1647 1647  
1648   - std::vector<std::pair<int, long long>> offsets;
  1648 + // id, offset, size
  1649 + std::vector<std::tuple<int, qpdf_offset_t, size_t>> offsets;
1649 1650  
1650 1651 auto bp = obj_stream.getStreamData(qpdf_dl_specialized);
  1652 +
1651 1653 BufferInputSource input("", bp.get());
1652 1654  
  1655 + const auto b_size = bp->getSize();
  1656 + const auto end_offset = static_cast<qpdf_offset_t>(b_size);
  1657 + auto b_start = bp->getBuffer();
  1658 +
  1659 + if (first >= end_offset) {
  1660 + throw damagedPDF(
  1661 + "object " + std::to_string(obj_stream_number) + " 0",
  1662 + "object stream " + std::to_string(obj_stream_number) + " has invalid /First entry");
  1663 + }
  1664 +
  1665 + int id = 0;
1653 1666 long long last_offset = -1;
  1667 + bool is_first = true;
1654 1668 for (unsigned int i = 0; i < n; ++i) {
1655 1669 auto tnum = readToken(input);
1656 1670 auto toffset = readToken(input);
... ... @@ -1682,26 +1696,45 @@ QPDF::resolveObjectsInStream(int obj_stream_number)
1682 1696 std::to_string(last_offset) + ")"));
1683 1697 continue;
1684 1698 }
1685   - last_offset = offset;
1686 1699  
1687 1700 if (num > m->xref_table_max_id) {
1688 1701 continue;
1689 1702 }
1690 1703  
1691   - offsets.emplace_back(num, offset + first);
  1704 + if (first + offset >= end_offset) {
  1705 + warn(damaged(num, offset, "offset is too large"));
  1706 + continue;
  1707 + }
  1708 +
  1709 + if (is_first) {
  1710 + is_first = false;
  1711 + } else {
  1712 + offsets.emplace_back(
  1713 + id, last_offset + first, static_cast<size_t>(offset - last_offset));
  1714 + }
  1715 +
  1716 + last_offset = offset;
  1717 + id = num;
  1718 + }
  1719 +
  1720 + if (!is_first) {
  1721 + // We found at least one valid entry.
  1722 + offsets.emplace_back(
  1723 + id, last_offset + first, b_size - static_cast<size_t>(last_offset + first));
1692 1724 }
1693 1725  
1694 1726 // To avoid having to read the object stream multiple times, store all objects that would be
1695 1727 // found here in the cache. Remember that some objects stored here might have been overridden
1696 1728 // by new objects appended to the file, so it is necessary to recheck the xref table and only
1697 1729 // cache what would actually be resolved here.
1698   - for (auto const& [id, offset]: offsets) {
1699   - QPDFObjGen og(id, 0);
  1730 + for (auto const& [obj_id, obj_offset, obj_size]: offsets) {
  1731 + QPDFObjGen og(obj_id, 0);
1700 1732 auto entry = m->xref_table.find(og);
1701 1733 if (entry != m->xref_table.end() && entry->second.getType() == 2 &&
1702 1734 entry->second.getObjStreamNumber() == obj_stream_number) {
1703   - input.seek(offset, SEEK_SET);
1704   - QPDFObjectHandle oh = readObjectInStream(input, obj_stream_number, id);
  1735 + Buffer obj_buffer{b_start + obj_offset, obj_size};
  1736 + is::OffsetBuffer in("", &obj_buffer, obj_offset);
  1737 + auto oh = readObjectInStream(in, obj_stream_number, obj_id);
1705 1738 updateCache(og, oh.getObj(), end_before_space, end_after_space);
1706 1739 } else {
1707 1740 QTC::TC("qpdf", "QPDF not caching overridden objstm object");
... ...
libqpdf/qpdf/InputSource_private.hh
1 1 #ifndef QPDF_INPUTSOURCE_PRIVATE_HH
2 2 #define QPDF_INPUTSOURCE_PRIVATE_HH
3 3  
  4 +#include <qpdf/BufferInputSource.hh>
4 5 #include <qpdf/InputSource.hh>
5 6  
  7 +#include <limits>
  8 +#include <sstream>
  9 +#include <stdexcept>
  10 +
  11 +namespace qpdf::is
  12 +{
  13 + class OffsetBuffer final: public InputSource // InputSource that proxies a BufferInputSource and shifts the offsets it reports by global_offset
  14 + {
  15 + public:
  16 + OffsetBuffer(std::string const& description, Buffer* buf, qpdf_offset_t global_offset) : // description and buf are handed to the proxied BufferInputSource
  17 + proxied(description, buf),
  18 + global_offset(global_offset)
  19 + {
  20 + if (global_offset < 0) { // a negative global offset is rejected as a programming error
  21 + throw std::logic_error("is::OffsetBuffer constructed with negative offset");
  22 + }
  23 + last_offset = global_offset; // last_offset starts out at global_offset
  24 + }
  25 +
  26 + ~OffsetBuffer() final = default;
  27 +
  28 + qpdf_offset_t
  29 + findAndSkipNextEOL() final
  30 + {
  31 + return proxied.findAndSkipNextEOL() + global_offset; // proxied result shifted by global_offset
  32 + }
  33 +
  34 + std::string const&
  35 + getName() const final
  36 + {
  37 + return proxied.getName(); // name comes straight from the proxied source
  38 + }
  39 +
  40 + qpdf_offset_t
  41 + tell() final
  42 + {
  43 + return proxied.tell() + global_offset; // proxied position shifted by global_offset
  44 + }
  45 +
  46 + void
  47 + seek(qpdf_offset_t offset, int whence) final
  48 + {
  49 + if (whence == SEEK_SET) {
  50 + proxied.seek(offset - global_offset, whence); // SEEK_SET: global_offset is subtracted before forwarding
  51 + } else {
  52 + proxied.seek(offset, whence); // any other whence: offset is forwarded unchanged
  53 + }
  54 + }
  55 +
  56 + void
  57 + rewind() final
  58 + {
  59 + seek(0, SEEK_SET); // NOTE(review): this forwards (0 - global_offset) to proxied.seek, which is negative whenever global_offset > 0 — confirm this is intended
  60 + }
  61 +
  62 + size_t
  63 + read(char* buffer, size_t length) final
  64 + {
  65 + size_t result = proxied.read(buffer, length); // the read itself is done by the proxied source
  66 + setLastOffset(proxied.getLastOffset() + global_offset); // record the proxied last offset shifted by global_offset
  67 + return result;
  68 + }
  69 +
  70 + void
  71 + unreadCh(char ch) final
  72 + {
  73 + proxied.unreadCh(ch); // forwarded unchanged to the proxied source
  74 + }
  75 +
  76 + private:
  77 + BufferInputSource proxied; // source that all operations are delegated to
  78 + qpdf_offset_t global_offset; // amount added to reported offsets and subtracted from SEEK_SET targets
  79 + };
  80 +
  81 +} // namespace qpdf::is
  82 +
6 83 inline size_t
7 84 InputSource::read(std::string& str, size_t count, qpdf_offset_t at)
8 85 {
... ...
libqpdf/qpdf/QPDFParser.hh
1 1 #ifndef QPDFPARSER_HH
2 2 #define QPDFPARSER_HH
3 3  
  4 +#include <qpdf/InputSource_private.hh>
4 5 #include <qpdf/QPDFObjectHandle_private.hh>
5 6 #include <qpdf/QPDFObject_private.hh>
6 7 #include <qpdf/QPDFTokenizer_private.hh>
... ... @@ -38,7 +39,7 @@ class QPDFParser
38 39 QPDF& context);
39 40  
40 41 static std::pair<QPDFObjectHandle, bool> parse(
41   - BufferInputSource& input,
  42 + qpdf::is::OffsetBuffer& input,
42 43 int stream_id,
43 44 int obj_id,
44 45 qpdf::Tokenizer& tokenizer,
... ...
manual/release-notes.rst
... ... @@ -21,6 +21,11 @@ more detail.
21 21 integer object. Previously the method returned false if the first
22 22 dictionary object was not a linearization parameter dictionary.
23 23  
  24 + - Fix parsing of object streams containing objects not separated by
  25 + white-space. Pre-2020 editions of the PDF specification incorrectly
  26 + stated that white-space was required between objects. qpdf relied on this
  27 + when parsing object streams.
  28 +
24 29 - Fix two object stream error/warning messages that reported the wrong
25 30 object id.
26 31  
... ...
qpdf/qtest/object-stream.test
... ... @@ -124,7 +124,7 @@ $td->runtest("adjacent compressed objects",
124 124 {$td->COMMAND => "test_driver 99 no-space-compressed-object.pdf"},
125 125 {$td->FILE => "no-space-compressed-object.out",
126 126 $td->EXIT_STATUS => 0},
127   - $td->EXPECT_FAILURE);
  127 + $td->NORMALIZE_NEWLINES);
128 128  
129 129 cleanup();
130 130 $td->report(calc_ntests($n_tests, $n_compare_pdfs));
... ...