Commit b37fc7174c3a10725c39b5a0d9e9587adf26e996

Authored by m-holger
1 parent 5eb9a18a

Fix parsing of object streams

... containing objects with no white-space between them.

To enforce the rule that objects end at the start-offset of the next
object, each object is parsed in its own object stream.

To facilitate this, a new private API input source, is::OffsetBuffer, has
been added which only contains the object but reports offsets relative to
the start of the object stream. This is adapted from OffsetInputSource by
changing the direction of the offset, endowing it with its own
BufferInputSource and stripping out checks duplicated in BufferInputSource.

Fixes the expected failure in the test case added in #1266.
include/qpdf/QPDF.hh
@@ -45,6 +45,11 @@ @@ -45,6 +45,11 @@
45 #include <qpdf/QPDFWriter.hh> 45 #include <qpdf/QPDFWriter.hh>
46 #include <qpdf/QPDFXRefEntry.hh> 46 #include <qpdf/QPDFXRefEntry.hh>
47 47
  48 +namespace qpdf::is
  49 +{
  50 + class OffsetBuffer;
  51 +}
  52 +
48 class QPDF_Stream; 53 class QPDF_Stream;
49 class BitStream; 54 class BitStream;
50 class BitWriter; 55 class BitWriter;
@@ -785,7 +790,7 @@ class QPDF @@ -785,7 +790,7 @@ class QPDF
785 QPDFObjectHandle readObject(std::string const& description, QPDFObjGen og); 790 QPDFObjectHandle readObject(std::string const& description, QPDFObjGen og);
786 void readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset); 791 void readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);
787 void validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset); 792 void validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset);
788 - QPDFObjectHandle readObjectInStream(BufferInputSource& input, int stream_id, int obj_id); 793 + QPDFObjectHandle readObjectInStream(qpdf::is::OffsetBuffer& input, int stream_id, int obj_id);
789 size_t recoverStreamLength( 794 size_t recoverStreamLength(
790 std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset); 795 std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset);
791 QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0); 796 QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0);
libqpdf/QPDFParser.cc
@@ -12,6 +12,7 @@ @@ -12,6 +12,7 @@
12 #include <memory> 12 #include <memory>
13 13
14 using namespace std::literals; 14 using namespace std::literals;
  15 +using namespace qpdf;
15 16
16 using ObjectPtr = std::shared_ptr<QPDFObject>; 17 using ObjectPtr = std::shared_ptr<QPDFObject>;
17 18
@@ -87,7 +88,7 @@ QPDFParser::parse( @@ -87,7 +88,7 @@ QPDFParser::parse(
87 88
88 std::pair<QPDFObjectHandle, bool> 89 std::pair<QPDFObjectHandle, bool>
89 QPDFParser::parse( 90 QPDFParser::parse(
90 - BufferInputSource& input, int stream_id, int obj_id, qpdf::Tokenizer& tokenizer, QPDF& context) 91 + is::OffsetBuffer& input, int stream_id, int obj_id, qpdf::Tokenizer& tokenizer, QPDF& context)
91 { 92 {
92 bool empty{false}; 93 bool empty{false};
93 auto result = QPDFParser( 94 auto result = QPDFParser(
libqpdf/QPDF_objects.cc
@@ -1288,7 +1288,7 @@ QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset @@ -1288,7 +1288,7 @@ QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset
1288 } 1288 }
1289 1289
1290 QPDFObjectHandle 1290 QPDFObjectHandle
1291 -QPDF::readObjectInStream(BufferInputSource& input, int stream_id, int obj_id) 1291 +QPDF::readObjectInStream(is::OffsetBuffer& input, int stream_id, int obj_id)
1292 { 1292 {
1293 auto [object, empty] = QPDFParser::parse(input, stream_id, obj_id, m->tokenizer, *this); 1293 auto [object, empty] = QPDFParser::parse(input, stream_id, obj_id, m->tokenizer, *this);
1294 if (empty) { 1294 if (empty) {
@@ -1645,12 +1645,26 @@ QPDF::resolveObjectsInStream(int obj_stream_number) @@ -1645,12 +1645,26 @@ QPDF::resolveObjectsInStream(int obj_stream_number)
1645 "object stream " + std::to_string(obj_stream_number) + " has incorrect keys"); 1645 "object stream " + std::to_string(obj_stream_number) + " has incorrect keys");
1646 } 1646 }
1647 1647
1648 - std::vector<std::pair<int, long long>> offsets; 1648 + // id, offset, size
  1649 + std::vector<std::tuple<int, qpdf_offset_t, size_t>> offsets;
1649 1650
1650 auto bp = obj_stream.getStreamData(qpdf_dl_specialized); 1651 auto bp = obj_stream.getStreamData(qpdf_dl_specialized);
  1652 +
1651 BufferInputSource input("", bp.get()); 1653 BufferInputSource input("", bp.get());
1652 1654
  1655 + const auto b_size = bp->getSize();
  1656 + const auto end_offset = static_cast<qpdf_offset_t>(b_size);
  1657 + auto b_start = bp->getBuffer();
  1658 +
  1659 + if (first >= end_offset) {
  1660 + throw damagedPDF(
  1661 + "object " + std::to_string(obj_stream_number) + " 0",
  1662 + "object stream " + std::to_string(obj_stream_number) + " has invalid /First entry");
  1663 + }
  1664 +
  1665 + int id = 0;
1653 long long last_offset = -1; 1666 long long last_offset = -1;
  1667 + bool is_first = true;
1654 for (unsigned int i = 0; i < n; ++i) { 1668 for (unsigned int i = 0; i < n; ++i) {
1655 auto tnum = readToken(input); 1669 auto tnum = readToken(input);
1656 auto toffset = readToken(input); 1670 auto toffset = readToken(input);
@@ -1682,26 +1696,45 @@ QPDF::resolveObjectsInStream(int obj_stream_number) @@ -1682,26 +1696,45 @@ QPDF::resolveObjectsInStream(int obj_stream_number)
1682 std::to_string(last_offset) + ")")); 1696 std::to_string(last_offset) + ")"));
1683 continue; 1697 continue;
1684 } 1698 }
1685 - last_offset = offset;  
1686 1699
1687 if (num > m->xref_table_max_id) { 1700 if (num > m->xref_table_max_id) {
1688 continue; 1701 continue;
1689 } 1702 }
1690 1703
1691 - offsets.emplace_back(num, offset + first); 1704 + if (first + offset >= end_offset) {
  1705 + warn(damaged(num, offset, "offset is too large"));
  1706 + continue;
  1707 + }
  1708 +
  1709 + if (is_first) {
  1710 + is_first = false;
  1711 + } else {
  1712 + offsets.emplace_back(
  1713 + id, last_offset + first, static_cast<size_t>(offset - last_offset));
  1714 + }
  1715 +
  1716 + last_offset = offset;
  1717 + id = num;
  1718 + }
  1719 +
  1720 + if (!is_first) {
  1721 + // We found at least one valid entry.
  1722 + offsets.emplace_back(
  1723 + id, last_offset + first, b_size - static_cast<size_t>(last_offset + first));
1692 } 1724 }
1693 1725
1694 // To avoid having to read the object stream multiple times, store all objects that would be 1726 // To avoid having to read the object stream multiple times, store all objects that would be
1695 // found here in the cache. Remember that some objects stored here might have been overridden 1727 // found here in the cache. Remember that some objects stored here might have been overridden
1696 // by new objects appended to the file, so it is necessary to recheck the xref table and only 1728 // by new objects appended to the file, so it is necessary to recheck the xref table and only
1697 // cache what would actually be resolved here. 1729 // cache what would actually be resolved here.
1698 - for (auto const& [id, offset]: offsets) {  
1699 - QPDFObjGen og(id, 0); 1730 + for (auto const& [obj_id, obj_offset, obj_size]: offsets) {
  1731 + QPDFObjGen og(obj_id, 0);
1700 auto entry = m->xref_table.find(og); 1732 auto entry = m->xref_table.find(og);
1701 if (entry != m->xref_table.end() && entry->second.getType() == 2 && 1733 if (entry != m->xref_table.end() && entry->second.getType() == 2 &&
1702 entry->second.getObjStreamNumber() == obj_stream_number) { 1734 entry->second.getObjStreamNumber() == obj_stream_number) {
1703 - input.seek(offset, SEEK_SET);  
1704 - QPDFObjectHandle oh = readObjectInStream(input, obj_stream_number, id); 1735 + Buffer obj_buffer{b_start + obj_offset, obj_size};
  1736 + is::OffsetBuffer in("", &obj_buffer, obj_offset);
  1737 + auto oh = readObjectInStream(in, obj_stream_number, obj_id);
1705 updateCache(og, oh.getObj(), end_before_space, end_after_space); 1738 updateCache(og, oh.getObj(), end_before_space, end_after_space);
1706 } else { 1739 } else {
1707 QTC::TC("qpdf", "QPDF not caching overridden objstm object"); 1740 QTC::TC("qpdf", "QPDF not caching overridden objstm object");
libqpdf/qpdf/InputSource_private.hh
1 #ifndef QPDF_INPUTSOURCE_PRIVATE_HH 1 #ifndef QPDF_INPUTSOURCE_PRIVATE_HH
2 #define QPDF_INPUTSOURCE_PRIVATE_HH 2 #define QPDF_INPUTSOURCE_PRIVATE_HH
3 3
  4 +#include <qpdf/BufferInputSource.hh>
4 #include <qpdf/InputSource.hh> 5 #include <qpdf/InputSource.hh>
5 6
  7 +#include <limits>
  8 +#include <sstream>
  9 +#include <stdexcept>
  10 +
  11 +namespace qpdf::is
  12 +{
  13 + class OffsetBuffer final: public InputSource
  14 + {
  15 + public:
  16 + OffsetBuffer(std::string const& description, Buffer* buf, qpdf_offset_t global_offset) :
  17 + proxied(description, buf),
  18 + global_offset(global_offset)
  19 + {
  20 + if (global_offset < 0) {
  21 + throw std::logic_error("is::OffsetBuffer constructed with negative offset");
  22 + }
  23 + last_offset = global_offset;
  24 + }
  25 +
  26 + ~OffsetBuffer() final = default;
  27 +
  28 + qpdf_offset_t
  29 + findAndSkipNextEOL() final
  30 + {
  31 + return proxied.findAndSkipNextEOL() + global_offset;
  32 + }
  33 +
  34 + std::string const&
  35 + getName() const final
  36 + {
  37 + return proxied.getName();
  38 + }
  39 +
  40 + qpdf_offset_t
  41 + tell() final
  42 + {
  43 + return proxied.tell() + global_offset;
  44 + }
  45 +
  46 + void
  47 + seek(qpdf_offset_t offset, int whence) final
  48 + {
  49 + if (whence == SEEK_SET) {
  50 + proxied.seek(offset - global_offset, whence);
  51 + } else {
  52 + proxied.seek(offset, whence);
  53 + }
  54 + }
  55 +
  56 + void
  57 + rewind() final
  58 + {
  59 + seek(0, SEEK_SET);
  60 + }
  61 +
  62 + size_t
  63 + read(char* buffer, size_t length) final
  64 + {
  65 + size_t result = proxied.read(buffer, length);
  66 + setLastOffset(proxied.getLastOffset() + global_offset);
  67 + return result;
  68 + }
  69 +
  70 + void
  71 + unreadCh(char ch) final
  72 + {
  73 + proxied.unreadCh(ch);
  74 + }
  75 +
  76 + private:
  77 + BufferInputSource proxied;
  78 + qpdf_offset_t global_offset;
  79 + };
  80 +
  81 +} // namespace qpdf::is
  82 +
6 inline size_t 83 inline size_t
7 InputSource::read(std::string& str, size_t count, qpdf_offset_t at) 84 InputSource::read(std::string& str, size_t count, qpdf_offset_t at)
8 { 85 {
libqpdf/qpdf/QPDFParser.hh
1 #ifndef QPDFPARSER_HH 1 #ifndef QPDFPARSER_HH
2 #define QPDFPARSER_HH 2 #define QPDFPARSER_HH
3 3
  4 +#include <qpdf/InputSource_private.hh>
4 #include <qpdf/QPDFObjectHandle_private.hh> 5 #include <qpdf/QPDFObjectHandle_private.hh>
5 #include <qpdf/QPDFObject_private.hh> 6 #include <qpdf/QPDFObject_private.hh>
6 #include <qpdf/QPDFTokenizer_private.hh> 7 #include <qpdf/QPDFTokenizer_private.hh>
@@ -38,7 +39,7 @@ class QPDFParser @@ -38,7 +39,7 @@ class QPDFParser
38 QPDF& context); 39 QPDF& context);
39 40
40 static std::pair<QPDFObjectHandle, bool> parse( 41 static std::pair<QPDFObjectHandle, bool> parse(
41 - BufferInputSource& input, 42 + qpdf::is::OffsetBuffer& input,
42 int stream_id, 43 int stream_id,
43 int obj_id, 44 int obj_id,
44 qpdf::Tokenizer& tokenizer, 45 qpdf::Tokenizer& tokenizer,
manual/release-notes.rst
@@ -21,6 +21,11 @@ more detail. @@ -21,6 +21,11 @@ more detail.
21 integer object. Previously the method returned false if the first 21 integer object. Previously the method returned false if the first
22 dictionary object was not a linearization parameter dictionary. 22 dictionary object was not a linearization parameter dictionary.
23 23
  24 + - Fix parsing of object streams containing objects not separated by
  25 + white-space. Pre-2020 editions of the PDF specification incorrectly
  26 + stated that white-space was required between objects. qpdf relied on this
  27 + when parsing object streams.
  28 +
24 - Fix two object stream error/warning messages that reported the wrong 29 - Fix two object stream error/warning messages that reported the wrong
25 object id. 30 object id.
26 31
qpdf/qtest/object-stream.test
@@ -124,7 +124,7 @@ $td->runtest("adjacent compressed objects", @@ -124,7 +124,7 @@ $td->runtest("adjacent compressed objects",
124 {$td->COMMAND => "test_driver 99 no-space-compressed-object.pdf"}, 124 {$td->COMMAND => "test_driver 99 no-space-compressed-object.pdf"},
125 {$td->FILE => "no-space-compressed-object.out", 125 {$td->FILE => "no-space-compressed-object.out",
126 $td->EXIT_STATUS => 0}, 126 $td->EXIT_STATUS => 0},
127 - $td->EXPECT_FAILURE); 127 + $td->NORMALIZE_NEWLINES);
128 128
129 cleanup(); 129 cleanup();
130 $td->report(calc_ntests($n_tests, $n_compare_pdfs)); 130 $td->report(calc_ntests($n_tests, $n_compare_pdfs));