Commit b37fc7174c3a10725c39b5a0d9e9587adf26e996
1 parent
5eb9a18a
Fix parsing of object streams
... containing objects with no white-space between them. To enforce the rule that objects end at the start-offset of the next object, each object is parsed in it own object stream. To facilitate this, a new private API input source is::OffsetBuffer has been added which only contains the object but reports offsets relative to the start of the object stream. This is adapted from OffsetInputSource by changing the direction of the offset, endowing it with its own BufferInputSource and striooing out checks duplicated in BufferInputSource. Fixes the expected failure in the test case added in #1266.
Showing
7 changed files
with
134 additions
and
12 deletions
include/qpdf/QPDF.hh
| @@ -45,6 +45,11 @@ | @@ -45,6 +45,11 @@ | ||
| 45 | #include <qpdf/QPDFWriter.hh> | 45 | #include <qpdf/QPDFWriter.hh> |
| 46 | #include <qpdf/QPDFXRefEntry.hh> | 46 | #include <qpdf/QPDFXRefEntry.hh> |
| 47 | 47 | ||
| 48 | +namespace qpdf::is | ||
| 49 | +{ | ||
| 50 | + class OffsetBuffer; | ||
| 51 | +} | ||
| 52 | + | ||
| 48 | class QPDF_Stream; | 53 | class QPDF_Stream; |
| 49 | class BitStream; | 54 | class BitStream; |
| 50 | class BitWriter; | 55 | class BitWriter; |
| @@ -785,7 +790,7 @@ class QPDF | @@ -785,7 +790,7 @@ class QPDF | ||
| 785 | QPDFObjectHandle readObject(std::string const& description, QPDFObjGen og); | 790 | QPDFObjectHandle readObject(std::string const& description, QPDFObjGen og); |
| 786 | void readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset); | 791 | void readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset); |
| 787 | void validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset); | 792 | void validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset); |
| 788 | - QPDFObjectHandle readObjectInStream(BufferInputSource& input, int stream_id, int obj_id); | 793 | + QPDFObjectHandle readObjectInStream(qpdf::is::OffsetBuffer& input, int stream_id, int obj_id); |
| 789 | size_t recoverStreamLength( | 794 | size_t recoverStreamLength( |
| 790 | std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset); | 795 | std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset); |
| 791 | QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0); | 796 | QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0); |
libqpdf/QPDFParser.cc
| @@ -12,6 +12,7 @@ | @@ -12,6 +12,7 @@ | ||
| 12 | #include <memory> | 12 | #include <memory> |
| 13 | 13 | ||
| 14 | using namespace std::literals; | 14 | using namespace std::literals; |
| 15 | +using namespace qpdf; | ||
| 15 | 16 | ||
| 16 | using ObjectPtr = std::shared_ptr<QPDFObject>; | 17 | using ObjectPtr = std::shared_ptr<QPDFObject>; |
| 17 | 18 | ||
| @@ -87,7 +88,7 @@ QPDFParser::parse( | @@ -87,7 +88,7 @@ QPDFParser::parse( | ||
| 87 | 88 | ||
| 88 | std::pair<QPDFObjectHandle, bool> | 89 | std::pair<QPDFObjectHandle, bool> |
| 89 | QPDFParser::parse( | 90 | QPDFParser::parse( |
| 90 | - BufferInputSource& input, int stream_id, int obj_id, qpdf::Tokenizer& tokenizer, QPDF& context) | 91 | + is::OffsetBuffer& input, int stream_id, int obj_id, qpdf::Tokenizer& tokenizer, QPDF& context) |
| 91 | { | 92 | { |
| 92 | bool empty{false}; | 93 | bool empty{false}; |
| 93 | auto result = QPDFParser( | 94 | auto result = QPDFParser( |
libqpdf/QPDF_objects.cc
| @@ -1288,7 +1288,7 @@ QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset | @@ -1288,7 +1288,7 @@ QPDF::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset | ||
| 1288 | } | 1288 | } |
| 1289 | 1289 | ||
| 1290 | QPDFObjectHandle | 1290 | QPDFObjectHandle |
| 1291 | -QPDF::readObjectInStream(BufferInputSource& input, int stream_id, int obj_id) | 1291 | +QPDF::readObjectInStream(is::OffsetBuffer& input, int stream_id, int obj_id) |
| 1292 | { | 1292 | { |
| 1293 | auto [object, empty] = QPDFParser::parse(input, stream_id, obj_id, m->tokenizer, *this); | 1293 | auto [object, empty] = QPDFParser::parse(input, stream_id, obj_id, m->tokenizer, *this); |
| 1294 | if (empty) { | 1294 | if (empty) { |
| @@ -1645,12 +1645,26 @@ QPDF::resolveObjectsInStream(int obj_stream_number) | @@ -1645,12 +1645,26 @@ QPDF::resolveObjectsInStream(int obj_stream_number) | ||
| 1645 | "object stream " + std::to_string(obj_stream_number) + " has incorrect keys"); | 1645 | "object stream " + std::to_string(obj_stream_number) + " has incorrect keys"); |
| 1646 | } | 1646 | } |
| 1647 | 1647 | ||
| 1648 | - std::vector<std::pair<int, long long>> offsets; | 1648 | + // id, offset, size |
| 1649 | + std::vector<std::tuple<int, qpdf_offset_t, size_t>> offsets; | ||
| 1649 | 1650 | ||
| 1650 | auto bp = obj_stream.getStreamData(qpdf_dl_specialized); | 1651 | auto bp = obj_stream.getStreamData(qpdf_dl_specialized); |
| 1652 | + | ||
| 1651 | BufferInputSource input("", bp.get()); | 1653 | BufferInputSource input("", bp.get()); |
| 1652 | 1654 | ||
| 1655 | + const auto b_size = bp->getSize(); | ||
| 1656 | + const auto end_offset = static_cast<qpdf_offset_t>(b_size); | ||
| 1657 | + auto b_start = bp->getBuffer(); | ||
| 1658 | + | ||
| 1659 | + if (first >= end_offset) { | ||
| 1660 | + throw damagedPDF( | ||
| 1661 | + "object " + std::to_string(obj_stream_number) + " 0", | ||
| 1662 | + "object stream " + std::to_string(obj_stream_number) + " has invalid /First entry"); | ||
| 1663 | + } | ||
| 1664 | + | ||
| 1665 | + int id = 0; | ||
| 1653 | long long last_offset = -1; | 1666 | long long last_offset = -1; |
| 1667 | + bool is_first = true; | ||
| 1654 | for (unsigned int i = 0; i < n; ++i) { | 1668 | for (unsigned int i = 0; i < n; ++i) { |
| 1655 | auto tnum = readToken(input); | 1669 | auto tnum = readToken(input); |
| 1656 | auto toffset = readToken(input); | 1670 | auto toffset = readToken(input); |
| @@ -1682,26 +1696,45 @@ QPDF::resolveObjectsInStream(int obj_stream_number) | @@ -1682,26 +1696,45 @@ QPDF::resolveObjectsInStream(int obj_stream_number) | ||
| 1682 | std::to_string(last_offset) + ")")); | 1696 | std::to_string(last_offset) + ")")); |
| 1683 | continue; | 1697 | continue; |
| 1684 | } | 1698 | } |
| 1685 | - last_offset = offset; | ||
| 1686 | 1699 | ||
| 1687 | if (num > m->xref_table_max_id) { | 1700 | if (num > m->xref_table_max_id) { |
| 1688 | continue; | 1701 | continue; |
| 1689 | } | 1702 | } |
| 1690 | 1703 | ||
| 1691 | - offsets.emplace_back(num, offset + first); | 1704 | + if (first + offset >= end_offset) { |
| 1705 | + warn(damaged(num, offset, "offset is too large")); | ||
| 1706 | + continue; | ||
| 1707 | + } | ||
| 1708 | + | ||
| 1709 | + if (is_first) { | ||
| 1710 | + is_first = false; | ||
| 1711 | + } else { | ||
| 1712 | + offsets.emplace_back( | ||
| 1713 | + id, last_offset + first, static_cast<size_t>(offset - last_offset)); | ||
| 1714 | + } | ||
| 1715 | + | ||
| 1716 | + last_offset = offset; | ||
| 1717 | + id = num; | ||
| 1718 | + } | ||
| 1719 | + | ||
| 1720 | + if (!is_first) { | ||
| 1721 | + // We found at least one valid entry. | ||
| 1722 | + offsets.emplace_back( | ||
| 1723 | + id, last_offset + first, b_size - static_cast<size_t>(last_offset + first)); | ||
| 1692 | } | 1724 | } |
| 1693 | 1725 | ||
| 1694 | // To avoid having to read the object stream multiple times, store all objects that would be | 1726 | // To avoid having to read the object stream multiple times, store all objects that would be |
| 1695 | // found here in the cache. Remember that some objects stored here might have been overridden | 1727 | // found here in the cache. Remember that some objects stored here might have been overridden |
| 1696 | // by new objects appended to the file, so it is necessary to recheck the xref table and only | 1728 | // by new objects appended to the file, so it is necessary to recheck the xref table and only |
| 1697 | // cache what would actually be resolved here. | 1729 | // cache what would actually be resolved here. |
| 1698 | - for (auto const& [id, offset]: offsets) { | ||
| 1699 | - QPDFObjGen og(id, 0); | 1730 | + for (auto const& [obj_id, obj_offset, obj_size]: offsets) { |
| 1731 | + QPDFObjGen og(obj_id, 0); | ||
| 1700 | auto entry = m->xref_table.find(og); | 1732 | auto entry = m->xref_table.find(og); |
| 1701 | if (entry != m->xref_table.end() && entry->second.getType() == 2 && | 1733 | if (entry != m->xref_table.end() && entry->second.getType() == 2 && |
| 1702 | entry->second.getObjStreamNumber() == obj_stream_number) { | 1734 | entry->second.getObjStreamNumber() == obj_stream_number) { |
| 1703 | - input.seek(offset, SEEK_SET); | ||
| 1704 | - QPDFObjectHandle oh = readObjectInStream(input, obj_stream_number, id); | 1735 | + Buffer obj_buffer{b_start + obj_offset, obj_size}; |
| 1736 | + is::OffsetBuffer in("", &obj_buffer, obj_offset); | ||
| 1737 | + auto oh = readObjectInStream(in, obj_stream_number, obj_id); | ||
| 1705 | updateCache(og, oh.getObj(), end_before_space, end_after_space); | 1738 | updateCache(og, oh.getObj(), end_before_space, end_after_space); |
| 1706 | } else { | 1739 | } else { |
| 1707 | QTC::TC("qpdf", "QPDF not caching overridden objstm object"); | 1740 | QTC::TC("qpdf", "QPDF not caching overridden objstm object"); |
libqpdf/qpdf/InputSource_private.hh
| 1 | #ifndef QPDF_INPUTSOURCE_PRIVATE_HH | 1 | #ifndef QPDF_INPUTSOURCE_PRIVATE_HH |
| 2 | #define QPDF_INPUTSOURCE_PRIVATE_HH | 2 | #define QPDF_INPUTSOURCE_PRIVATE_HH |
| 3 | 3 | ||
| 4 | +#include <qpdf/BufferInputSource.hh> | ||
| 4 | #include <qpdf/InputSource.hh> | 5 | #include <qpdf/InputSource.hh> |
| 5 | 6 | ||
| 7 | +#include <limits> | ||
| 8 | +#include <sstream> | ||
| 9 | +#include <stdexcept> | ||
| 10 | + | ||
| 11 | +namespace qpdf::is | ||
| 12 | +{ | ||
| 13 | + class OffsetBuffer final: public InputSource | ||
| 14 | + { | ||
| 15 | + public: | ||
| 16 | + OffsetBuffer(std::string const& description, Buffer* buf, qpdf_offset_t global_offset) : | ||
| 17 | + proxied(description, buf), | ||
| 18 | + global_offset(global_offset) | ||
| 19 | + { | ||
| 20 | + if (global_offset < 0) { | ||
| 21 | + throw std::logic_error("is::OffsetBuffer constructed with negative offset"); | ||
| 22 | + } | ||
| 23 | + last_offset = global_offset; | ||
| 24 | + } | ||
| 25 | + | ||
| 26 | + ~OffsetBuffer() final = default; | ||
| 27 | + | ||
| 28 | + qpdf_offset_t | ||
| 29 | + findAndSkipNextEOL() final | ||
| 30 | + { | ||
| 31 | + return proxied.findAndSkipNextEOL() + global_offset; | ||
| 32 | + } | ||
| 33 | + | ||
| 34 | + std::string const& | ||
| 35 | + getName() const final | ||
| 36 | + { | ||
| 37 | + return proxied.getName(); | ||
| 38 | + } | ||
| 39 | + | ||
| 40 | + qpdf_offset_t | ||
| 41 | + tell() final | ||
| 42 | + { | ||
| 43 | + return proxied.tell() + global_offset; | ||
| 44 | + } | ||
| 45 | + | ||
| 46 | + void | ||
| 47 | + seek(qpdf_offset_t offset, int whence) final | ||
| 48 | + { | ||
| 49 | + if (whence == SEEK_SET) { | ||
| 50 | + proxied.seek(offset - global_offset, whence); | ||
| 51 | + } else { | ||
| 52 | + proxied.seek(offset, whence); | ||
| 53 | + } | ||
| 54 | + } | ||
| 55 | + | ||
| 56 | + void | ||
| 57 | + rewind() final | ||
| 58 | + { | ||
| 59 | + seek(0, SEEK_SET); | ||
| 60 | + } | ||
| 61 | + | ||
| 62 | + size_t | ||
| 63 | + read(char* buffer, size_t length) final | ||
| 64 | + { | ||
| 65 | + size_t result = proxied.read(buffer, length); | ||
| 66 | + setLastOffset(proxied.getLastOffset() + global_offset); | ||
| 67 | + return result; | ||
| 68 | + } | ||
| 69 | + | ||
| 70 | + void | ||
| 71 | + unreadCh(char ch) final | ||
| 72 | + { | ||
| 73 | + proxied.unreadCh(ch); | ||
| 74 | + } | ||
| 75 | + | ||
| 76 | + private: | ||
| 77 | + BufferInputSource proxied; | ||
| 78 | + qpdf_offset_t global_offset; | ||
| 79 | + }; | ||
| 80 | + | ||
| 81 | +} // namespace qpdf::is | ||
| 82 | + | ||
| 6 | inline size_t | 83 | inline size_t |
| 7 | InputSource::read(std::string& str, size_t count, qpdf_offset_t at) | 84 | InputSource::read(std::string& str, size_t count, qpdf_offset_t at) |
| 8 | { | 85 | { |
libqpdf/qpdf/QPDFParser.hh
| 1 | #ifndef QPDFPARSER_HH | 1 | #ifndef QPDFPARSER_HH |
| 2 | #define QPDFPARSER_HH | 2 | #define QPDFPARSER_HH |
| 3 | 3 | ||
| 4 | +#include <qpdf/InputSource_private.hh> | ||
| 4 | #include <qpdf/QPDFObjectHandle_private.hh> | 5 | #include <qpdf/QPDFObjectHandle_private.hh> |
| 5 | #include <qpdf/QPDFObject_private.hh> | 6 | #include <qpdf/QPDFObject_private.hh> |
| 6 | #include <qpdf/QPDFTokenizer_private.hh> | 7 | #include <qpdf/QPDFTokenizer_private.hh> |
| @@ -38,7 +39,7 @@ class QPDFParser | @@ -38,7 +39,7 @@ class QPDFParser | ||
| 38 | QPDF& context); | 39 | QPDF& context); |
| 39 | 40 | ||
| 40 | static std::pair<QPDFObjectHandle, bool> parse( | 41 | static std::pair<QPDFObjectHandle, bool> parse( |
| 41 | - BufferInputSource& input, | 42 | + qpdf::is::OffsetBuffer& input, |
| 42 | int stream_id, | 43 | int stream_id, |
| 43 | int obj_id, | 44 | int obj_id, |
| 44 | qpdf::Tokenizer& tokenizer, | 45 | qpdf::Tokenizer& tokenizer, |
manual/release-notes.rst
| @@ -21,6 +21,11 @@ more detail. | @@ -21,6 +21,11 @@ more detail. | ||
| 21 | integer object. Previously the method returned false if the first | 21 | integer object. Previously the method returned false if the first |
| 22 | dictionary object was not a linearization parameter dictionary. | 22 | dictionary object was not a linearization parameter dictionary. |
| 23 | 23 | ||
| 24 | + = Fix parsing of object streams containing objects not seperated by | ||
| 25 | + white-space. Pre-2020 editions of the PDF specification incorrectly | ||
| 26 | + stated that white-space was required between objects. qpdf relied on this | ||
| 27 | + when parsing object streams. | ||
| 28 | + | ||
| 24 | - Fix two object stream error/warning messages that reported the wrong | 29 | - Fix two object stream error/warning messages that reported the wrong |
| 25 | object id. | 30 | object id. |
| 26 | 31 |
qpdf/qtest/object-stream.test
| @@ -124,7 +124,7 @@ $td->runtest("adjacent compressed objects", | @@ -124,7 +124,7 @@ $td->runtest("adjacent compressed objects", | ||
| 124 | {$td->COMMAND => "test_driver 99 no-space-compressed-object.pdf"}, | 124 | {$td->COMMAND => "test_driver 99 no-space-compressed-object.pdf"}, |
| 125 | {$td->FILE => "no-space-compressed-object.out", | 125 | {$td->FILE => "no-space-compressed-object.out", |
| 126 | $td->EXIT_STATUS => 0}, | 126 | $td->EXIT_STATUS => 0}, |
| 127 | - $td->EXPECT_FAILURE); | 127 | + $td->NORMALIZE_NEWLINES); |
| 128 | 128 | ||
| 129 | cleanup(); | 129 | cleanup(); |
| 130 | $td->report(calc_ntests($n_tests, $n_compare_pdfs)); | 130 | $td->report(calc_ntests($n_tests, $n_compare_pdfs)); |