Add pdf-custom-filter example

Jay Berkenbilt
1 parent ac042d16
Showing 9 changed files with 752 additions and 11 deletions
ChangeLog
TODO
examples/build.mk
examples/pdf-custom-filter.cc
examples/qtest/custom-filter.test
examples/qtest/custom-filter/generalized.pdf
examples/qtest/custom-filter/input.pdf
examples/qtest/custom-filter/specialized.pdf
include/qpdf/QPDFObjectHandle.hh
@@ -27,7 +27,8 @@
 	provide code to validate and interpret /DecodeParms for a specific
 	/Filter and also to provide a pipeline that will decode. Note that
 	it is possible to encode to a filter type that is not supported
-	even without this feature.
+	even without this feature. See examples/pdf-custom-filter.cc for
+	an example of using custom stream filters.
  
 2020-12-22  Jay Berkenbilt  <ejb@ql.org>
  
@@ -589,6 +589,8 @@ I find it useful to make reference to them in this list
      a stream data provider is especially expensive, it can implement
      its own cache.
  
-   The implementation of pluggable stream filters includes an example
-   that illustrates how a program might handle making decisions about
-   filters and decode parameters based on the input data.
+   The example examples/pdf-custom-filter.cc demonstrates the use of
+   custom stream filters. This includes a custom pipeline, a custom
+   stream filter, as well as modification of a stream's dictionary to
+   include creation of a new stream that is referenced from
+   /DecodeParms.
 BINS_examples = \
 	pdf-bookmarks \
-	pdf-mod-info \
-	pdf-npages \
+	pdf-count-strings \
+	pdf-create \
+	pdf-custom-filter \
 	pdf-double-page-size \
+	pdf-filter-tokens \
 	pdf-invert-images \
-	pdf-create \
+	pdf-mod-info \
+	pdf-npages \
+	pdf-overlay-page \
 	pdf-parse-content \
-	pdf-split-pages \
-	pdf-filter-tokens \
-	pdf-count-strings \
 	pdf-set-form-values \
-	pdf-overlay-page
+	pdf-split-pages
 CBINS_examples = \
 	pdf-c-objects \
 	pdf-linearize
+#include <qpdf/QIntC.hh>
+#include <qpdf/QPDF.hh>
+#include <qpdf/QPDFStreamFilter.hh>
+#include <qpdf/QPDFWriter.hh>
+#include <qpdf/QUtil.hh>
+
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <stdexcept>
+#include <string>
+
+// This example shows you everything you need to know to implement a
+// custom stream filter for encoding and decoding as well as a stream
+// data provider that modifies the stream's dictionary. This example
+// uses the pattern of having the stream data provider class use a
+// second QPDF instance with copies of streams from the original QPDF
+// so that the stream data provider can access the original stream
+// data. This is implemented very efficiently inside the qpdf library as
+// the second QPDF instance knows how to read the stream data from the
+// original input file, so no extra copies of the original stream data
+// are made.
+
+// This example creates an imaginary filter called /XORDecode. There
+// is no such filter in PDF, so the streams created by the example
+// would not be usable by any PDF reader. However, the techniques here
+// would work if you were going to implement support for a filter that
+// qpdf does not support natively. For example, using the techniques
+// shown here, it would be possible to create an application that
+// downsampled or re-encoded images or that re-compressed streams
+// using a more efficient "deflate" implementation than zlib.
+
+// Comments appear throughout the code describing each piece of code
+// and its purpose. You can read the file top to bottom, or you can
+// start with main() and follow the flow.
+
+// Please also see the test suite, qtest/custom-filter.test, which
+// contains additional comments describing how to observe the results
+// of running this example on test files that are specifically crafted
+// for it.
+
+// Program name used as the prefix of usage and diagnostic messages.
+// It is assigned once at the top of main().
+static char const* whoami = nullptr;
+
+
+class Pl_XOR: public Pipeline
+{
+    // Pipeline for the made-up XOR "codec". Every byte that passes
+    // through is XORed with a one-byte key supplied at construction.
+    // XOR with a fixed key is its own inverse, so the same pipeline
+    // serves for both encoding and decoding.
+
+  public:
+    Pl_XOR(char const* identifier, Pipeline* next, unsigned char key);
+    virtual ~Pl_XOR() = default;
+    void write(unsigned char* data, size_t len) override;
+    void finish() override;
+
+  private:
+    // The single-byte key applied to every byte written.
+    unsigned char key;
+};
+
+// Hand the identifier and the downstream pipeline to the Pipeline
+// base class and remember the XOR key.
+Pl_XOR::Pl_XOR(char const* identifier, Pipeline* next, unsigned char key) :
+    Pipeline(identifier, next),
+    key{key}
+{
+}
+
+void
+Pl_XOR::write(unsigned char* data, size_t len)
+{
+    // XOR the input with the key and pass the result downstream.
+    // The transformed bytes are staged in a fixed-size local buffer
+    // and forwarded a chunk at a time, so the next pipeline receives
+    // one write() call per chunk rather than one per byte. The
+    // caller's buffer is never modified. A zero-length write results
+    // in no downstream call.
+    unsigned char chunk[4096];
+    size_t offset = 0;
+    while (offset < len)
+    {
+        size_t n = len - offset;
+        if (n > sizeof(chunk))
+        {
+            n = sizeof(chunk);
+        }
+        for (size_t i = 0; i < n; ++i)
+        {
+            chunk[i] = static_cast<unsigned char>(
+                data[offset + i] ^ this->key);
+        }
+        getNext()->write(chunk, n);
+        offset += n;
+    }
+}
+
+void
+Pl_XOR::finish()
+{
+    // write() forwards everything it receives immediately, so there
+    // is nothing left to flush here; just propagate finish().
+    Pipeline* const downstream = getNext();
+    downstream->finish();
+}
+
+class SF_XORDecode: public QPDFStreamFilter
+{
+    // This class implements a QPDFStreamFilter that knows how to
+    // validate and interpret decode parameters (/DecodeParms) for the
+    // made-up /XORDecode stream filter. Since this is not a real
+    // stream filter, no actual PDF reader would know how to interpret
+    // it. This is just to illustrate how to create a stream filter.
+    // In main(), we call QPDF::registerStreamFilter to tell the
+    // library about the filter. See comments in QPDFStreamFilter.hh
+    // for details on how to implement the methods. For purposes of
+    // example, we are calling this a "specialized" compression
+    // filter, which just means QPDF assumes that it should not
+    // "uncompress" the stream by default.
+  public:
+    virtual ~SF_XORDecode() = default;
+    virtual bool setDecodeParms(QPDFObjectHandle decode_parms) override;
+    virtual Pipeline* getDecodePipeline(Pipeline* next) override;
+    virtual bool isSpecializedCompression() override;
+
+  private:
+    // XOR key extracted by setDecodeParms(). It is given a defined
+    // initial value so that getDecodePipeline() never reads an
+    // indeterminate byte, even if it were called without a successful
+    // setDecodeParms() first.
+    unsigned char key = 0;
+    // It is the responsibility of the QPDFStreamFilter implementation
+    // to ensure that the pipeline returned by getDecodePipeline() is
+    // deleted when the class is deleted. The easiest way to do this
+    // is to stash the pipeline in a std::shared_ptr, which enables us
+    // to use the default destructor implementation.
+    std::shared_ptr<Pl_XOR> pipeline;
+};
+
+bool
+SF_XORDecode::setDecodeParms(QPDFObjectHandle decode_parms)
+{
+    // For purposes of example, the key lives in a separate stream
+    // rather than directly in /DecodeParms. This illustrates how a
+    // filter can use a stream referenced from its decode parameters;
+    // for example, an implementation of /JBIG2Decode would need to
+    // handle the /JBIG2Globals key, which points to a stream. See
+    // comments in StreamReplacer::registerStream for additional
+    // notes on this.
+    //
+    // /DecodeParms is expected to be a dictionary whose /KeyStream
+    // key points to a one-byte stream; that byte is the key. Return
+    // true only when the key was retrieved, indicating that we are
+    // able to process with the given decode parameters. Any other
+    // outcome, including an exception raised while reading the
+    // stream, yields false. For other examples of QPDFStreamFilter
+    // implementations, look at the classes whose names start with
+    // SF_ in the qpdf library implementation.
+    bool accepted = false;
+    try
+    {
+        auto key_stream = decode_parms.getKey("/KeyStream");
+        auto key_data = key_stream.getStreamData();
+        if (key_data->getSize() == 1)
+        {
+            this->key = key_data->getBuffer()[0];
+            accepted = true;
+        }
+    }
+    catch (std::exception& e)
+    {
+        std::cerr << "Error extracting key for /XORDecode: "
+                  << e.what() << std::endl;
+    }
+    return accepted;
+}
+
+Pipeline*
+SF_XORDecode::getDecodePipeline(Pipeline* next)
+{
+    // Build the pipeline that the qpdf library should push the
+    // encoded stream data through; it passes decoded data on to
+    // "next". setDecodeParms() is always called before this method
+    // and is where any parameters needed by the pipeline (here, the
+    // key) get stored. Ownership of the new pipeline stays with this
+    // object by keeping it in the std::shared_ptr<Pl_XOR> member, so
+    // it disappears when the filter does; the caller only receives
+    // the raw pointer.
+    auto decoder = std::make_shared<Pl_XOR>("xor", next, this->key);
+    this->pipeline = decoder;
+    return decoder.get();
+}
+
+bool
+SF_XORDecode::isSpecializedCompression()
+{
+    // Returning true marks /XORDecode as a "specialized" compression
+    // filter. The QPDFStreamFilter default returns false; to declare
+    // a specialized or lossy filter, override the corresponding
+    // method as described in QPDFStreamFilter.hh.
+    return true;
+}
+
+class StreamReplacer: public QPDFObjectHandle::StreamDataProvider
+{
+    // This class implements a StreamDataProvider that, under specific
+    // conditions, replaces the stream data with data encoded with the
+    // made-up /XORDecode filter.
+
+    // The flow for this class is as follows:
+    //
+    // * The main application iterates through streams that should be
+    //   replaced and calls registerStream. registerStream in turn
+    //   calls maybeReplace passing nullptr to pipeline and the
+    //   address of a valid QPDFObjectHandle to dict_updates. The
+    //   stream passed in for this call is the stream for the original
+    //   QPDF object. It has not yet been altered, so we have access
+    //   to its original dictionary and data. As described in the
+    //   method, the method when called in this way makes a
+    //   determination as to whether the stream should be replaced. If
+    //   so, registerStream makes whatever changes are required. We
+    //   have to do this now because we can't modify the stream during
+    //   the writing process.
+    //
+    // * provideStreamData(), which is called by QPDFWriter during the
+    //   write process, actually writes the modified stream data. It
+    //   calls maybeReplace again, but this time it passes a valid
+    //   pipeline and passes nullptr to dict_updates. In this mode,
+    //   the stream dictionary has already been altered, and the
+    //   original stream data is no longer directly accessible. Trying
+    //   to retrieve the stream data would be an infinite loop because
+    //   it would just end up calling provideStreamData again. This is
+    //   why maybeReplace uses a stashed copy of the original stream
+    //   from the "other" QPDF object.
+
+    // Additional explanation can be found in the method
+    // implementations.
+
+  public:
+    // The constructor is explicit so that a QPDF* is never silently
+    // converted into a StreamReplacer.
+    explicit StreamReplacer(QPDF* pdf);
+    virtual ~StreamReplacer() = default;
+    virtual void provideStreamData(int objid, int generation,
+                                   Pipeline* pipeline) override;
+
+    void registerStream(
+        QPDFObjectHandle stream,
+        PointerHolder<QPDFObjectHandle::StreamDataProvider> self);
+
+  private:
+    bool maybeReplace(QPDFObjGen const& og,
+                      QPDFObjectHandle& stream, Pipeline* pipeline,
+                      QPDFObjectHandle* dict_updates);
+
+    // Hang onto a reference to the QPDF object containing the streams
+    // we are replacing. We need this to create a new stream.
+    QPDF* pdf;
+
+    // This second QPDF instance gives us a place to copy streams to
+    // so that we can access the original stream data of the streams
+    // whose data we are replacing.
+    QPDF other;
+
+    // Map the object/generation in original file to the copied stream
+    // in "other". We use this to retrieve the original data.
+    std::map<QPDFObjGen, QPDFObjectHandle> copied_streams;
+
+    // Each stream gets its own "key" for the XOR filter. We use a
+    // single instance of StreamReplacer for all streams, so stash all
+    // the keys here.
+    std::map<QPDFObjGen, unsigned char> keys;
+};
+
+StreamReplacer::StreamReplacer(QPDF* pdf) :
+    pdf(pdf)
+{
+    // The "other" QPDF exists only as a place to park copies of
+    // streams. It is never written out, so it does not need pages or
+    // any other structure of a valid PDF; an empty PDF is enough.
+    other.emptyPDF();
+}
+
+bool
+StreamReplacer::maybeReplace(QPDFObjGen const& og,
+                             QPDFObjectHandle& stream,
+                             Pipeline* pipeline,
+                             QPDFObjectHandle* dict_updates)
+{
+    // As described in the class comments, this method is called
+    // twice. Before writing has started pipeline is nullptr, and
+    // dict_updates is provided. In this mode, we figure out whether
+    // we should replace the stream and, if so, take care of the
+    // necessary setup. When we are actually ready to supply the data,
+    // this method is called again with pipeline populated and
+    // dict_updates as a nullptr. In this mode, we are not allowed to
+    // change anything, since writing is already in progress. We
+    // must simply provide the stream data.
+
+    // The return value indicates whether or not we should replace the
+    // stream. If the first call returns false, there will be no
+    // second call. If the second call returns false, something went
+    // wrong since the method should always make the same decision for
+    // a given stream.
+
+    // For this example, all the determination logic could have
+    // appeared inside the if (dict_updates) block rather than being
+    // duplicated, but in some cases, there may be a reason to
+    // duplicate things. For example, if you wanted to write code that
+    // re-encoded an image if the new encoding was more efficient,
+    // you'd have to actually try it out. Then you would either have
+    // to cache the result somewhere or just repeat the calculations,
+    // depending on space/time constraints, etc.
+
+    // In our contrived example, we are replacing the data for all
+    // streams that have /DoXOR = true in the stream dictionary. If
+    // this were a more realistic application, our criteria would be
+    // more sensible. For example, an image downsampler might choose
+    // to replace a stream that represented an image with a high pixel
+    // density.
+    auto dict = stream.getDict();
+    auto mark = dict.getKey("/DoXOR");
+    if (! (mark.isBool() && mark.getBoolValue()))
+    {
+        return false;
+    }
+
+    // We can't replace the stream data if we can't get the original
+    // stream data for any reason. A more realistic application may
+    // actually look at the data here as well, or it may be able to
+    // make all its decisions from the stream dictionary. However,
+    // it's a good idea to make sure we can retrieve the filtered data
+    // if we are going to need it later.
+    PointerHolder<Buffer> out;
+    try
+    {
+        out = stream.getStreamData();
+    }
+    catch (...)
+    {
+        return false;
+    }
+
+    if (dict_updates)
+    {
+        // It's not safe to make any modifications to any objects
+        // during the writing process since the updated objects may
+        // have already been written. In this mode, when dict_updates
+        // is provided, we have not started writing. Store the
+        // modifications we intend to make to the stream dictionary
+        // here. We're just storing /OrigLength for purposes of
+        // example. Again, a realistic application would make other
+        // changes. For example, an image resampler might change the
+        // dimensions or other properties of the image.
+        dict_updates->replaceKey(
+            "/OrigLength", QPDFObjectHandle::newInteger(
+                QIntC::to_longlong(out->getSize())));
+        // We are also storing the "key" that we will access when
+        // writing the data. The key is the low byte of (object
+        // number * data size). The product is computed in size_t:
+        // multiplying two ints here would overflow (undefined
+        // behavior) for a large stream with a large object number,
+        // whereas unsigned arithmetic wraps and still yields the same
+        // low byte.
+        size_t const product =
+            static_cast<size_t>(og.getObj()) * out->getSize();
+        this->keys[og] = QIntC::to_uchar(product & 0xff);
+    }
+
+    if (pipeline)
+    {
+        // Look the key up without operator[] so that nothing is
+        // inserted into the map while writing is in progress. A
+        // missing key means the stream was never set up by the first
+        // call; report that by returning false rather than encoding
+        // with a made-up key.
+        auto found = this->keys.find(og);
+        if (found == this->keys.end())
+        {
+            return false;
+        }
+        Pl_XOR p("xor", pipeline, found->second);
+        p.write(out->getBuffer(), out->getSize());
+        p.finish();
+    }
+    return true;
+}
+
+void
+StreamReplacer::registerStream(
+    QPDFObjectHandle stream,
+    PointerHolder<QPDFObjectHandle::StreamDataProvider> self)
+{
+    QPDFObjGen const og = stream.getObjGen();
+
+    // Each stream only needs to be handled once. This example just
+    // walks all objects, but code that visited, say, the images on
+    // every page could easily meet the same stream repeatedly.
+    if (this->copied_streams.count(og) != 0)
+    {
+        return;
+    }
+    // Record a placeholder right away so that a stream we decide not
+    // to replace is not examined again either. The placeholder is
+    // overwritten below when the stream is replaced.
+    this->copied_streams[og] = QPDFObjectHandle::newNull();
+
+    // Ask maybeReplace, in its "dict_updates" mode, whether the
+    // stream data should be replaced and which dictionary changes go
+    // along with that.
+    QPDFObjectHandle updates = QPDFObjectHandle::newDictionary();
+    bool replace = false;
+    try
+    {
+        replace = maybeReplace(og, stream, nullptr, &updates);
+    }
+    catch (std::exception& e)
+    {
+        stream.warnIfPossible(
+            std::string("exception while attempting to replace: ") +
+            e.what());
+    }
+    if (! replace)
+    {
+        return;
+    }
+
+    // Copy the stream into the other QPDF object first so that the
+    // stream data provider can still reach the original data later.
+    this->copied_streams[og] = this->other.copyForeignObject(stream);
+
+    // Apply the requested changes to the stream dictionary.
+    auto stream_dict = stream.getDict();
+    for (auto const& name: updates.getKeys())
+    {
+        stream_dict.replaceKey(name, updates.getKey(name));
+    }
+
+    // Create the one-byte key stream that /DecodeParms will refer
+    // to. This has to happen now because objects can't be created or
+    // modified during write.
+    std::string const key_bytes(1, static_cast<char>(this->keys[og]));
+    QPDFObjectHandle key_stream =
+        QPDFObjectHandle::newStream(this->pdf, key_bytes);
+
+    // Build /DecodeParms in the form our fictitious /XORDecode
+    // filter expects and install this object as the data provider.
+    QPDFObjectHandle decode_parms =
+        QPDFObjectHandle::newDictionary({{"/KeyStream", key_stream}});
+    stream.replaceStreamData(
+        self,
+        QPDFObjectHandle::newName("/XORDecode"),
+        decode_parms);
+
+    // When /ProtectXOR = true, turn off filtering on write so that
+    // QPDFWriter leaves the stream encoded even though a stream
+    // filter for /XORDecode has been registered.
+    auto protect = stream_dict.getKey("/ProtectXOR");
+    bool const keep_encoded = protect.isBool() && protect.getBoolValue();
+    if (keep_encoded)
+    {
+        stream.setFilterOnWrite(false);
+    }
+}
+
+void
+StreamReplacer::provideStreamData(int objid, int generation,
+                                  Pipeline* pipeline)
+{
+    QPDFObjGen og(objid, generation);
+    // Find the stashed copy of the original stream. The lookup uses
+    // find() rather than operator[] so that nothing is inserted into
+    // copied_streams while writing is in progress, and so that a
+    // request for a stream that never went through registerStream is
+    // reported explicitly.
+    auto found = this->copied_streams.find(og);
+    if (found == this->copied_streams.end())
+    {
+        throw std::logic_error(
+            "provideStreamData called for a stream that was never"
+            " registered");
+    }
+    QPDFObjectHandle orig = found->second;
+    // Call maybeReplace again, this time with the pipeline and no
+    // dict_updates. In this mode, maybeReplace doesn't make any
+    // changes. We have to hand it the original stream data, which we
+    // get from copied_streams.
+    if (! maybeReplace(og, orig, pipeline, nullptr))
+    {
+        // Since this only gets called for streams we already
+        // determined we are replacing, a false return would indicate
+        // a logic error.
+        throw std::logic_error(
+            "should_replace return false in provideStreamData");
+    }
+}
+
+// Read infilename, register every stream with a StreamReplacer, and
+// write the result to outfilename. When decode_specialized is true,
+// the writer is asked to decode specialized filters as well.
+static void process(char const* infilename, char const* outfilename,
+                    bool decode_specialized)
+{
+    QPDF qpdf;
+    qpdf.processFile(infilename);
+
+    // One StreamReplacer serves all streams. The interface wants a
+    // PointerHolder in various places, so the instance is allocated
+    // and handed to a PointerHolder straight away; the raw pointer is
+    // kept only to call registerStream.
+    auto* replacer = new StreamReplacer(&qpdf);
+    PointerHolder<QPDFObjectHandle::StreamDataProvider> provider(replacer);
+
+    // Offer every stream to registerStream; it replaces only the
+    // ones it decides to replace.
+    auto all_objects = qpdf.getAllObjects();
+    for (auto& obj: all_objects)
+    {
+        if (! obj.isStream())
+        {
+            continue;
+        }
+        replacer->registerStream(obj, provider);
+    }
+
+    QPDFWriter writer(qpdf, outfilename);
+    if (decode_specialized)
+    {
+        writer.setDecodeLevel(qpdf_dl_specialized);
+    }
+    // Static IDs keep the output reproducible for the test suite.
+    writer.setStaticID(true); // for testing only
+    writer.write();
+    std::cout << whoami << ": new file written to " << outfilename
+              << std::endl;
+}
+
+static void usage()
+{
+    std::cerr
+        << "\n"
+        << "Usage: " << whoami << " [ --decode-specialized ] infile outfile\n"
+        << std::endl;
+    exit(2);
+}
+
+// Parse the command line, register the /XORDecode stream filter, and
+// run process(). Exits with status 2 on a usage error or exception.
+int main(int argc, char* argv[])
+{
+    whoami = QUtil::getWhoami(argv[0]);
+
+    // Skip the "lt-" prefix that libtool wrappers put on the name.
+    if (strncmp(whoami, "lt-", 3) == 0)
+    {
+        whoami += 3;
+    }
+
+    // The option may appear anywhere; the first two other arguments
+    // are the input and output file names, and any further argument
+    // is a usage error.
+    char const* infilename = nullptr;
+    char const* outfilename = nullptr;
+    bool decode_specialized = false;
+    for (int i = 1; i < argc; ++i)
+    {
+        char const* arg = argv[i];
+        if (strcmp(arg, "--decode-specialized") == 0)
+        {
+            decode_specialized = true;
+            continue;
+        }
+        if (infilename == nullptr)
+        {
+            infilename = arg;
+            continue;
+        }
+        if (outfilename == nullptr)
+        {
+            outfilename = arg;
+            continue;
+        }
+        usage();
+    }
+    if ((infilename == nullptr) || (outfilename == nullptr))
+    {
+        usage();
+    }
+
+    try
+    {
+        // Register the fictitious filter so that QPDFWriter is able
+        // to decode our streams. It is not a real filter, so no real
+        // PDF reading application could interpret it; it exists for
+        // illustrative purposes only.
+        QPDF::registerStreamFilter(
+            "/XORDecode", []{ return std::make_shared<SF_XORDecode>(); });
+        // Do the actual processing.
+        process(infilename, outfilename, decode_specialized);
+    }
+    catch (std::exception& e)
+    {
+        std::cerr << whoami << ": exception: " << e.what() << std::endl;
+        exit(2);
+    }
+
+    return 0;
+}
+#!/usr/bin/env perl
+require 5.008;
+use warnings;
+use strict;
+
+chdir("custom-filter") or die "chdir testdir failed: $!\n";
+
+require TestDriver;
+
+cleanup();
+
+my $td = new TestDriver('custom-filter');
+
+# The file input.pdf contains two extra streams whose uncompressed
+# contents are explanatory text. Each carries the dictionary keys
+# pdf-custom-filter looks for: /DoXOR requests re-encoding with the
+# fictitious /XORDecode filter, and /ProtectXOR keeps the stream
+# encoded even when --decode-specialized is given. The two runs below
+# compare the output with generalized.pdf and specialized.pdf.
+
+$td->runtest("custom filter, decode generalized",
+             {$td->COMMAND => "pdf-custom-filter input.pdf a.pdf"},
+             {$td->STRING => "pdf-custom-filter: new file written to a.pdf\n",
+                  $td->EXIT_STATUS => 0},
+             $td->NORMALIZE_NEWLINES);
+$td->runtest("check output",
+             {$td->FILE => "a.pdf"},
+             {$td->FILE => "generalized.pdf"});
+
+$td->runtest("custom filter, decode specialized",
+             {$td->COMMAND =>
+                  "pdf-custom-filter --decode-specialized input.pdf a.pdf"},
+             {$td->STRING => "pdf-custom-filter: new file written to a.pdf\n",
+                  $td->EXIT_STATUS => 0},
+             $td->NORMALIZE_NEWLINES);
+$td->runtest("check output",
+             {$td->FILE => "a.pdf"},
+             {$td->FILE => "specialized.pdf"});
+
+cleanup();
+
+$td->report(4);
+
+sub cleanup
+{
+    unlink "a.pdf";
+}
+%PDF-1.3
+%¿÷¢þ
+%QDF-1.0
+
+%% Original object ID: 1 0
+1 0 obj
+<<
+  /Pages 2 0 R
+  /Type /Catalog
+>>
+endobj
+
+%% Original object ID: 2 0
+2 0 obj
+<<
+  /Count 1
+  /Kids [
+    3 0 R
+  ]
+  /Type /Pages
+>>
+endobj
+
+%% Page 1
+%% Original object ID: 3 0
+3 0 obj
+<<
+  /Contents 4 0 R
+  /MediaBox [
+    0
+    0
+    612
+    792
+  ]
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 6 0 R
+    >>
+    /ProcSet 7 0 R
+  >>
+  /Type /Page
+>>
+endobj
+
+%% Contents for page 1
+%% Original object ID: 4 0
+4 0 obj
+<<
+  /Length 5 0 R
+>>
+stream
+BT
+  /F1 24 Tf
+  72 720 Td
+  (Potato) Tj
+ET
+endstream
+endobj
+
+5 0 obj
+44
+endobj
+
+%% Original object ID: 6 0
+6 0 obj
+<<
+  /BaseFont /Helvetica
+  /Encoding /WinAnsiEncoding
+  /Name /F1
+  /Subtype /Type1
+  /Type /Font
+>>
+endobj
+
+%% Original object ID: 5 0
+7 0 obj
+[
+  /PDF
+  /Text
+]
+endobj
+
+8 0 obj
+<<
+  /Length 9 0 R
+  /DoXOR true
+>>
+stream
+
+This stream has /DoXOR true. When processed with pdf-custom-filter
+without the --decode-specialized option, the stream will appear in the
+output encoded with the fictitious /XORDecode filter, and its
+/DecodeParms will contain a reference to the key stream. When
+processed with pdf-custom-filter with the --decode-specialized option,
+it will appear in the output as a regular stream with /FlateDecode,
+but the /OrigLength key will still have been added.
+
+endstream
+endobj
+
+9 0 obj
+455
+endobj
+
+10 0 obj
+<<
+  /Length 11 0 R
+  /DoXOR true
+  /ProtectXOR true
+>>
+stream
+
+This stream has /DoXOR true and /ProtectXOR true. When processed with
+pdf-custom-filter with or without the --decode-specialized option, the
+stream will appear in the output encoded with the fictitious
+/XORDecode filter, and its /DecodeParms will contain a reference to
+the key stream.
+
+endstream
+endobj
+
+11 0 obj
+288
+endobj
+
+xref
+0 12
+0000000000 65535 f 
+0000000052 00000 n 
+0000000133 00000 n 
+0000000242 00000 n 
+0000000484 00000 n 
+0000000583 00000 n 
+0000000629 00000 n 
+0000000774 00000 n 
+0000000809 00000 n 
+0000001333 00000 n 
+0000001353 00000 n 
+0000001731 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 12
+  /Example [ 8 0 R 10 0 R ]
+  /ID [<01f4bb169ae6e6b5f27505733e9abf42><01f4bb169ae6e6b5f27505733e9abf42>]
+>>
+startxref
+1752
+%%EOF
@@ -92,6 +92,15 @@ class QPDFObjectHandle
         //   writing linearized files, if the work done by your stream
         //   data provider is slow or computationally intensive, you
         //   might want to implement your own cache.
+        //
+        // * Once you have called replaceStreamData, the original
+        //   stream data is no longer directly accessible from the
+        //   stream, but this is easy to work around by copying the
+        //   stream to a separate QPDF object. The qpdf library
+        //   implements this very efficiently without actually making
+        //   a copy of the stream data. You can find examples of this
+        //   pattern in some of the examples, including
+        //   pdf-custom-filter.cc and pdf-invert-images.cc.
  
         // Prior to qpdf 10.0.0, it was not possible to handle errors
         // the way pipeStreamData does or to pass back success.