#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace std::literals; using namespace qpdf; // Pipe class is restricted to QPDF_Stream. class QPDF::Doc::Streams { public: static bool pipeStreamData( QPDF* qpdf, QPDFObjGen og, qpdf_offset_t offset, size_t length, QPDFObjectHandle dict, bool is_root_metadata, Pipeline* pipeline, bool suppress_warnings, bool will_retry) { return qpdf->pipeStreamData( og, offset, length, dict, is_root_metadata, pipeline, suppress_warnings, will_retry); } }; namespace { class SF_Crypt final: public QPDFStreamFilter { public: SF_Crypt() = default; ~SF_Crypt() final = default; bool setDecodeParms(QPDFObjectHandle decode_parms) final { // we only validate here - processing happens in decryptStream if (Dictionary dict = decode_parms) { for (auto const& [key, value]: dict) { if (key == "/Type" && (value.null() || Name(value) == "/CryptFilterDecodeParms")) { continue; } if (key == "/Name") { continue; } if (!value.null()) { return false; } } return true; } return decode_parms.null(); } Pipeline* getDecodePipeline(Pipeline*) final { // Not used -- handled by pipeStreamData return nullptr; } }; class StreamBlobProvider { public: StreamBlobProvider(Stream stream, qpdf_stream_decode_level_e decode_level) : stream(stream), decode_level(decode_level) { } void operator()(Pipeline* p) { stream.pipeStreamData(p, nullptr, 0, decode_level, false, false); } private: Stream stream; qpdf_stream_decode_level_e decode_level; }; /// User defined streamfilter factories std::map()>> filter_factories; bool filter_factories_registered = false; } // namespace std::string QPDF_Stream::Members::expand_filter_name(std::string const& name) const { // The PDF specification provides these filter abbreviations for use in inline images, but // according to table H.1 in the pre-ISO versions of the PDF specification, Adobe Reader also // accepts them for stream filters. if (name == "/AHx") { return "/ASCIIHexDecode"; } if (name == "/A85") { return "/ASCII85Decode"; } if (name == "/LZW") { return "/LZWDecode"; } if (name == "/Fl") { return "/FlateDecode"; } if (name == "/RL") { return "/RunLengthDecode"; } if (name == "/CCF") { return "/CCITTFaxDecode"; } if (name == "/DCT") { return "/DCTDecode"; } return name; }; std::function()> QPDF_Stream::Members::filter_factory(std::string const& name) const { if (filter_factories_registered) [[unlikely]] { // We need to check user provided filters first as we allow users to replace qpdf provided // default filters. This will have a performance impact if the facility to register stream // filters is actually used. We can optimize this away if necessary. auto ff = filter_factories.find(expand_filter_name(name)); if (ff != filter_factories.end()) { return ff->second; } } if (name == "/FlateDecode") { return SF_FlateLzwDecode::flate_factory; } if (name == "/Crypt") { return []() { return std::make_shared(); }; } if (name == "/LZWDecode") { return SF_FlateLzwDecode::lzw_factory; } if (name == "/RunLengthDecode") { return SF_RunLengthDecode::factory; } if (name == "/DCTDecode") { return SF_DCTDecode::factory; } if (name == "/ASCII85Decode") { return SF_ASCII85Decode::factory; } if (name == "/ASCIIHexDecode") { return SF_ASCIIHexDecode::factory; } // The PDF specification provides these filter abbreviations for use in inline images, but // according to table H.1 in the pre-ISO versions of the PDF specification, Adobe Reader // also accepts them for stream filters. if (name == "/Fl") { return SF_FlateLzwDecode::flate_factory; } if (name == "/AHx") { return SF_ASCIIHexDecode::factory; } if (name == "/A85") { return SF_ASCII85Decode::factory; } if (name == "/LZW") { return SF_FlateLzwDecode::lzw_factory; } if (name == "/RL") { return SF_RunLengthDecode::factory; } if (name == "/DCT") { return SF_DCTDecode::factory; } return nullptr; } Stream::Stream( QPDF& qpdf, QPDFObjGen og, QPDFObjectHandle stream_dict, qpdf_offset_t offset, size_t length) : BaseHandle(QPDFObject::create(&qpdf, og, std::move(stream_dict), length)) { auto descr = std::make_shared( qpdf.getFilename() + ", stream object " + og.unparse(' ')); obj->setDescription(&qpdf, descr, offset); setDictDescription(); } void Stream::registerStreamFilter( std::string const& filter_name, std::function()> factory) { filter_factories[filter_name] = factory; filter_factories_registered = true; } JSON Stream::getStreamJSON( int json_version, qpdf_json_stream_data_e json_data, qpdf_stream_decode_level_e decode_level, Pipeline* p, std::string const& data_filename) { Pl_Buffer pb{"streamjson"}; JSON::Writer jw{&pb, 0}; decode_level = writeStreamJSON(json_version, jw, json_data, decode_level, p, data_filename, true); pb.finish(); auto result = JSON::parse(pb.getString()); if (json_data == qpdf_sj_inline) { result.addDictionaryMember("data", JSON::makeBlob(StreamBlobProvider(*this, decode_level))); } return result; } qpdf_stream_decode_level_e Stream::writeStreamJSON( int json_version, JSON::Writer& jw, qpdf_json_stream_data_e json_data, qpdf_stream_decode_level_e decode_level, Pipeline* p, std::string const& data_filename, bool no_data_key) { auto s = stream(); switch (json_data) { case qpdf_sj_none: case qpdf_sj_inline: if (p != nullptr) { throw std::logic_error( "QPDF_Stream::writeStreamJSON: pipeline should only be supplied " "when json_data is file"); } break; case qpdf_sj_file: if (p == nullptr) { throw std::logic_error( "QPDF_Stream::writeStreamJSON: pipeline must be supplied when json_data is file"); } if (data_filename.empty()) { throw std::logic_error( "QPDF_Stream::writeStreamJSON: data_filename must be supplied " "when json_data is file"); } break; } jw.writeStart('{'); if (json_data == qpdf_sj_none) { jw.writeNext(); jw << R"("dict": )"; s->stream_dict.writeJSON(json_version, jw); jw.writeEnd('}'); return decode_level; } Pl_Discard discard; Pl_Buffer buf_pl{"stream data"}; Pipeline* data_pipeline = &buf_pl; if (no_data_key && json_data == qpdf_sj_inline) { data_pipeline = &discard; } // pipeStreamData produced valid data. bool buf_pl_ready = false; bool filtered = false; bool filter = (decode_level != qpdf_dl_none); for (int attempt = 1; attempt <= 2; ++attempt) { bool succeeded = pipeStreamData(data_pipeline, &filtered, 0, decode_level, false, (attempt == 1)); if (!succeeded || (filter && !filtered)) { // Try again filter = false; decode_level = qpdf_dl_none; buf_pl.getString(); // reset buf_pl } else { buf_pl_ready = true; break; } } if (!buf_pl_ready) { throw std::logic_error("QPDF_Stream: failed to get stream data"); } // We can use unsafeShallowCopy because we are only touching top-level keys. auto dict = s->stream_dict.unsafeShallowCopy(); dict.removeKey("/Length"); if (filter && filtered) { dict.removeKey("/Filter"); dict.removeKey("/DecodeParms"); } if (json_data == qpdf_sj_file) { jw.writeNext() << R"("datafile": ")" << JSON::Writer::encode_string(data_filename) << "\""; p->writeString(buf_pl.getString()); } else if (json_data == qpdf_sj_inline) { if (!no_data_key) { jw.writeNext() << R"("data": ")"; jw.writeBase64(buf_pl.getString()) << "\""; } } else { throw std::logic_error("QPDF_Stream::writeStreamJSON : unexpected value of json_data"); } jw.writeNext() << R"("dict": )"; dict.writeJSON(json_version, jw); jw.writeEnd('}'); return decode_level; } void qpdf::Stream::setDictDescription() { auto s = stream(); if (!s->stream_dict.hasObjectDescription()) { s->stream_dict.setObjectDescription( obj->getQPDF(), obj->getDescription() + " -> stream dictionary"); } } std::string Stream::getStreamData(qpdf_stream_decode_level_e decode_level) { std::string result; pl::String buf(result); bool filtered; pipeStreamData(&buf, &filtered, 0, decode_level, false, false); if (!filtered) { throw QPDFExc( qpdf_e_unsupported, obj->getQPDF()->getFilename(), "", obj->getParsedOffset(), "getStreamData called on unfilterable stream"); } QTC::TC("qpdf", "QPDF_Stream getStreamData"); return result; } std::string Stream::getRawStreamData() { std::string result; pl::String buf(result); if (!pipeStreamData(&buf, nullptr, 0, qpdf_dl_none, false, false)) { throw QPDFExc( qpdf_e_unsupported, obj->getQPDF()->getFilename(), "", obj->getParsedOffset(), "error getting raw stream data"); } QTC::TC("qpdf", "QPDF_Stream getRawStreamData"); return result; } bool Stream::isRootMetadata() const { if (!getDict().isDictionaryOfType("/Metadata", "/XML")) { return false; } auto root_metadata = qpdf()->getRoot().getKey("/Metadata"); return root_metadata.isSameObjectAs(obj); } bool Stream::filterable( qpdf_stream_decode_level_e decode_level, std::vector>& filters) { auto s = stream(); // Check filters auto const& filter_obj = s->stream_dict["/Filter"]; if (filter_obj.null()) { // No filters return true; } if (filter_obj.isName()) { // One filter auto ff = s->filter_factory(filter_obj.getName()); if (!ff) { return false; } filters.emplace_back(ff()); } else if (Array array = filter_obj) { // Potentially multiple filters for (Name item: array) { if (!item) { warn("stream filter type is not name or array"); return false; } auto ff = s->filter_factory(item); if (!ff) { filters.clear(); return false; } filters.emplace_back(ff()); } } else { warn("stream filter type is not name or array"); return false; } // filters now contains a list of filters to be applied in order. See which ones we can support. // See if we can support any decode parameters that are specified. auto decode_obj = s->stream_dict.getKey("/DecodeParms"); auto can_filter = // linebreak [](auto d_level, auto& filter, auto& d_obj) -> bool { if (!filter.setDecodeParms(d_obj) || (d_level < qpdf_dl_all && filter.isLossyCompression()) || (d_level < qpdf_dl_specialized && filter.isSpecializedCompression())) { return false; } return true; }; auto decode_array = decode_obj.as_array(strict); if (!decode_array || decode_array.size() == 0) { if (decode_array) { decode_obj = QPDFObjectHandle::newNull(); } for (auto& filter: filters) { if (!can_filter(decode_level, *filter, decode_obj)) { return false; } } } else { // Ignore /DecodeParms entirely if /Filters is empty. At least one case of a file whose // /DecodeParms was [ << >> ] when /Filters was empty has been seen in the wild. if (!filters.empty() && QIntC::to_size(decode_array.size()) != filters.size()) { warn("stream /DecodeParms length is inconsistent with filters"); return false; } int i = -1; for (auto& filter: filters) { auto d_obj = decode_array.get(++i); if (!can_filter(decode_level, *filter, d_obj)) { return false; } } } return true; } bool Stream::pipeStreamData( Pipeline* pipeline, bool* filterp, int encode_flags, qpdf_stream_decode_level_e decode_level, bool suppress_warnings, bool will_retry) { auto s = stream(); std::vector> filters; bool ignored; if (!filterp) { filterp = &ignored; } bool& filter = *filterp; const bool empty_stream = !s->stream_provider && !s->stream_data && s->length == 0; const bool empty_stream_data = s->stream_data && s->stream_data->getSize() == 0; const bool empty = empty_stream || empty_stream_data; if (empty_stream || empty_stream_data) { filter = true; } filter = empty || encode_flags || decode_level != qpdf_dl_none; if (filter) { filter = filterable(decode_level, filters); } if (!pipeline) { QTC::TC("qpdf", "QPDF_Stream pipeStreamData with null pipeline"); // Return value is whether we can filter in this case. return filter; } // Construct the pipeline in reverse order. Force pipelines we create to be deleted when this // function finishes. Pipelines created by QPDFStreamFilter objects will be deleted by those // objects. std::vector> to_delete; ContentNormalizer normalizer; if (filter) { if (encode_flags & qpdf_ef_compress) { auto new_pipeline = std::make_unique("compress stream", pipeline, Pl_Flate::a_deflate); pipeline = new_pipeline.get(); to_delete.push_back(std::move(new_pipeline)); } if (encode_flags & qpdf_ef_normalize) { auto new_pipeline = std::make_unique("normalizer", &normalizer, pipeline); pipeline = new_pipeline.get(); to_delete.push_back(std::move(new_pipeline)); } for (auto iter = s->token_filters.rbegin(); iter != s->token_filters.rend(); ++iter) { auto new_pipeline = std::make_unique("token filter", (*iter).get(), pipeline); pipeline = new_pipeline.get(); to_delete.push_back(std::move(new_pipeline)); } for (auto f_iter = filters.rbegin(); f_iter != filters.rend(); ++f_iter) { if (auto decode_pipeline = (*f_iter)->getDecodePipeline(pipeline)) { pipeline = decode_pipeline; } auto* flate = dynamic_cast(pipeline); if (flate) { flate->setWarnCallback([this](char const* msg, int code) { warn(msg); }); } } } if (s->stream_data.get()) { QTC::TC("qpdf", "QPDF_Stream pipe replaced stream data"); pipeline->write(s->stream_data->getBuffer(), s->stream_data->getSize()); pipeline->finish(); } else if (s->stream_provider.get()) { Pl_Count count("stream provider count", pipeline); if (s->stream_provider->supportsRetry()) { if (!s->stream_provider->provideStreamData( obj->getObjGen(), &count, suppress_warnings, will_retry)) { filter = false; return false; } } else { s->stream_provider->provideStreamData(obj->getObjGen(), &count); } qpdf_offset_t actual_length = count.getCount(); if (s->stream_dict.hasKey("/Length")) { auto desired_length = s->stream_dict.getKey("/Length").getIntValue(); if (actual_length != desired_length) { QTC::TC("qpdf", "QPDF_Stream provider length mismatch"); // This would be caused by programmer error on the part of a library user, not by // invalid input data. throw std::runtime_error( "stream data provider for " + obj->getObjGen().unparse(' ') + " provided " + std::to_string(actual_length) + " bytes instead of expected " + std::to_string(desired_length) + " bytes"); } } else { QTC::TC("qpdf", "QPDF_Stream provider length not provided"); s->stream_dict.replaceKey("/Length", QPDFObjectHandle::newInteger(actual_length)); } } else { if (obj->getParsedOffset() == 0) { QTC::TC("qpdf", "QPDF_Stream pipe no stream data"); throw std::logic_error("pipeStreamData called for stream with no data"); } QTC::TC("qpdf", "QPDF_Stream pipe original stream data"); if (!QPDF::Doc::Streams::pipeStreamData( obj->getQPDF(), obj->getObjGen(), obj->getParsedOffset(), s->length, s->stream_dict, isRootMetadata(), pipeline, suppress_warnings, will_retry)) { filter = false; return false; } } if (filter && !suppress_warnings && normalizer.anyBadTokens()) { warn("content normalization encountered bad tokens"); if (normalizer.lastTokenWasBad()) { QTC::TC("qpdf", "QPDF_Stream bad token at end during normalize"); warn( "normalized content ended with a bad token; you may be able to resolve this by " "coalescing content streams in combination with normalizing content. From the " "command line, specify --coalesce-contents"); } warn( "Resulting stream data may be corrupted but is may still useful for manual " "inspection. For more information on this warning, search for content normalization " "in the manual."); } return true; } void Stream::replaceStreamData( std::shared_ptr data, QPDFObjectHandle const& filter, QPDFObjectHandle const& decode_parms) { auto s = stream(); s->stream_data = data; s->stream_provider = nullptr; replaceFilterData(filter, decode_parms, data->getSize()); } void Stream::replaceStreamData( std::shared_ptr provider, QPDFObjectHandle const& filter, QPDFObjectHandle const& decode_parms) { auto s = stream(); s->stream_provider = provider; s->stream_data = nullptr; replaceFilterData(filter, decode_parms, 0); } void Stream::replaceFilterData( QPDFObjectHandle const& filter, QPDFObjectHandle const& decode_parms, size_t length) { auto s = stream(); if (filter) { s->stream_dict.replaceKey("/Filter", filter); } if (decode_parms) { s->stream_dict.replaceKey("/DecodeParms", decode_parms); } if (length == 0) { QTC::TC("qpdf", "QPDF_Stream unknown stream length"); s->stream_dict.removeKey("/Length"); } else { s->stream_dict.replaceKey( "/Length", QPDFObjectHandle::newInteger(QIntC::to_longlong(length))); } } void Stream::warn(std::string const& message) { obj->getQPDF()->warn(qpdf_e_damaged_pdf, "", obj->getParsedOffset(), message); } QPDFObjectHandle QPDFObjectHandle::getDict() const { return as_stream(error).getDict(); } void QPDFObjectHandle::setFilterOnWrite(bool val) { as_stream(error).setFilterOnWrite(val); } bool QPDFObjectHandle::getFilterOnWrite() { return as_stream(error).getFilterOnWrite(); } bool QPDFObjectHandle::isDataModified() { return as_stream(error).isDataModified(); } void QPDFObjectHandle::replaceDict(QPDFObjectHandle const& new_dict) { as_stream(error).replaceDict(new_dict); } bool QPDFObjectHandle::isRootMetadata() const { return as_stream(error).isRootMetadata(); } std::shared_ptr QPDFObjectHandle::getStreamData(qpdf_stream_decode_level_e level) { return std::make_shared(as_stream(error).getStreamData(level)); } std::shared_ptr QPDFObjectHandle::getRawStreamData() { return std::make_shared(as_stream(error).getRawStreamData()); } bool QPDFObjectHandle::pipeStreamData( Pipeline* p, bool* filtering_attempted, int encode_flags, qpdf_stream_decode_level_e decode_level, bool suppress_warnings, bool will_retry) { return as_stream(error).pipeStreamData( p, filtering_attempted, encode_flags, decode_level, suppress_warnings, will_retry); } bool QPDFObjectHandle::pipeStreamData( Pipeline* p, int encode_flags, qpdf_stream_decode_level_e decode_level, bool suppress_warnings, bool will_retry) { bool filtering_attempted; as_stream(error).pipeStreamData( p, &filtering_attempted, encode_flags, decode_level, suppress_warnings, will_retry); return filtering_attempted; } bool QPDFObjectHandle::pipeStreamData(Pipeline* p, bool filter, bool normalize, bool compress) { int encode_flags = 0; qpdf_stream_decode_level_e decode_level = qpdf_dl_none; if (filter) { decode_level = qpdf_dl_generalized; if (normalize) { encode_flags |= qpdf_ef_normalize; } if (compress) { encode_flags |= qpdf_ef_compress; } } return pipeStreamData(p, encode_flags, decode_level, false); } void QPDFObjectHandle::replaceStreamData( std::shared_ptr data, QPDFObjectHandle const& filter, QPDFObjectHandle const& decode_parms) { as_stream(error).replaceStreamData(data, filter, decode_parms); } void QPDFObjectHandle::replaceStreamData( std::string const& data, QPDFObjectHandle const& filter, QPDFObjectHandle const& decode_parms) { auto b = std::make_shared(data.length()); unsigned char* bp = b->getBuffer(); if (bp) { memcpy(bp, data.c_str(), data.length()); } as_stream(error).replaceStreamData(b, filter, decode_parms); } void QPDFObjectHandle::replaceStreamData( std::shared_ptr provider, QPDFObjectHandle const& filter, QPDFObjectHandle const& decode_parms) { as_stream(error).replaceStreamData(provider, filter, decode_parms); } namespace { class FunctionProvider: public QPDFObjectHandle::StreamDataProvider { public: FunctionProvider(std::function provider) : StreamDataProvider(false), p1(provider), p2(nullptr) { } FunctionProvider(std::function provider) : StreamDataProvider(true), p1(nullptr), p2(provider) { } void provideStreamData(QPDFObjGen const&, Pipeline* pipeline) override { p1(pipeline); } bool provideStreamData( QPDFObjGen const&, Pipeline* pipeline, bool suppress_warnings, bool will_retry) override { return p2(pipeline, suppress_warnings, will_retry); } private: std::function p1; std::function p2; }; } // namespace void QPDFObjectHandle::replaceStreamData( std::function provider, QPDFObjectHandle const& filter, QPDFObjectHandle const& decode_parms) { auto sdp = std::shared_ptr(new FunctionProvider(provider)); as_stream(error).replaceStreamData(sdp, filter, decode_parms); } void QPDFObjectHandle::replaceStreamData( std::function provider, QPDFObjectHandle const& filter, QPDFObjectHandle const& decode_parms) { auto sdp = std::shared_ptr(new FunctionProvider(provider)); as_stream(error).replaceStreamData(sdp, filter, decode_parms); } JSON QPDFObjectHandle::getStreamJSON( int json_version, qpdf_json_stream_data_e json_data, qpdf_stream_decode_level_e decode_level, Pipeline* p, std::string const& data_filename) { return as_stream(error).getStreamJSON(json_version, json_data, decode_level, p, data_filename); }