Commit d4d7630cf544dc295202382026658b55bf49f76b
1 parent
ac042d16
Add pdf-custom-filter example
Showing
9 changed files
with
752 additions
and
11 deletions
ChangeLog
| ... | ... | @@ -27,7 +27,8 @@ |
| 27 | 27 | provide code to validate and interpret /DecodeParms for a specific |
| 28 | 28 | /Filter and also to provide a pipeline that will decode. Note that |
| 29 | 29 | it is possible to encode to a filter type that is not supported |
| 30 | - even without this feature. | |
| 30 | + even without this feature. See examples/pdf-custom-filter.cc for | |
| 31 | + an example of using custom stream filters. | |
| 31 | 32 | |
| 32 | 33 | 2020-12-22 Jay Berkenbilt <ejb@ql.org> |
| 33 | 34 | ... | ... |
TODO
| ... | ... | @@ -589,6 +589,8 @@ I find it useful to make reference to them in this list |
| 589 | 589 | a stream data provider is especially expensive, it can implement |
| 590 | 590 | its own cache. |
| 591 | 591 | |
| 592 | - The implementation of pluggable stream filters includes an example | |
| 593 | - that illustrates how a program might handle making decisions about | |
| 594 | - filters and decode parameters based on the input data. | |
| 592 | + The example examples/pdf-custom-filter.cc demonstrates the use of | |
| 593 | + custom stream filters. This includes a custom pipeline, a custom | |
| 594 | + stream filter, as well as modification of a stream's dictionary to | |
| 595 | + include creation of a new stream that is referenced from | |
| 596 | + /DecodeParms. | ... | ... |
examples/build.mk
| 1 | 1 | BINS_examples = \ |
| 2 | 2 | pdf-bookmarks \ |
| 3 | - pdf-mod-info \ | |
| 4 | - pdf-npages \ | |
| 3 | + pdf-count-strings \ | |
| 4 | + pdf-create \ | |
| 5 | + pdf-custom-filter \ | |
| 5 | 6 | pdf-double-page-size \ |
| 7 | + pdf-filter-tokens \ | |
| 6 | 8 | pdf-invert-images \ |
| 7 | - pdf-create \ | |
| 9 | + pdf-mod-info \ | |
| 10 | + pdf-npages \ | |
| 11 | + pdf-overlay-page \ | |
| 8 | 12 | pdf-parse-content \ |
| 9 | - pdf-split-pages \ | |
| 10 | - pdf-filter-tokens \ | |
| 11 | - pdf-count-strings \ | |
| 12 | 13 | pdf-set-form-values \ |
| 13 | - pdf-overlay-page | |
| 14 | + pdf-split-pages | |
| 14 | 15 | CBINS_examples = \ |
| 15 | 16 | pdf-c-objects \ |
| 16 | 17 | pdf-linearize | ... | ... |
examples/pdf-custom-filter.cc
0 → 100644
| 1 | +#include <qpdf/QPDF.hh> | |
| 2 | +#include <qpdf/QUtil.hh> | |
| 3 | +#include <qpdf/QPDFWriter.hh> | |
| 4 | +#include <qpdf/QPDFStreamFilter.hh> | |
| 5 | + | |
| 6 | +#include <cstring> | |
| 7 | +#include <exception> | |
| 8 | +#include <iostream> | |
| 9 | +#include <memory> | |
| 10 | + | |
| 11 | +// This example shows you everything you need to know to implement a | |
| 12 | +// custom stream filter for encoding and decoding as well as a stream | |
| 13 | +// data provider that modifies the stream's dictionary. This example | |
| 14 | +// uses the pattern of having the stream data provider class use a | |
| 15 | +// second QPDF instance with copies of streams from the original QPDF | |
| 16 | +// so that the stream data provider can access the original stream | |
| 17 | +// data. This is implemented very efficiently inside the qpdf library as | |
| 18 | +// the second QPDF instance knows how to read the stream data from the | |
| 19 | +// original input file, so no extra copies of the original stream data | |
| 20 | +// are made. | |
| 21 | + | |
| 22 | +// This example creates an imaginary filter called /XORDecode. There | |
| 23 | +// is no such filter in PDF, so the streams created by the example | |
| 24 | +// would not be usable by any PDF reader. However, the techniques here | |
| 25 | +// would work if you were going to implement support for a filter that | |
| 26 | +// qpdf does not support natively. For example, using the techniques | |
| 27 | +// shown here, it would be possible to create an application that | |
| 28 | +// downsampled or re-encoded images or that re-compressed streams | |
| 29 | +// using a more efficient "deflate" implementation than zlib. | |
| 30 | + | |
| 31 | +// Comments appear throughout the code describing each piece of code | |
| 32 | +// and its purpose. You can read the file top to bottom, or you can | |
| 33 | +// start with main() and follow the flow. | |
| 34 | + | |
| 35 | +// Please also see the test suite, qtest/custom-filter.test, which | |
| 36 | +// contains additional comments describing how to observe the results | |
| 37 | +// of running this example on test files that are specifically crafted | |
| 38 | +// for it. | |
| 39 | + | |
| 40 | +static char const* whoami = 0; | |
| 41 | + | |
| 42 | + | |
| 43 | +class Pl_XOR: public Pipeline | |
| 44 | +{ | |
| 45 | + // This class implements a Pipeline for the made-up XOR decoder. | |
| 46 | + // It is initialized with a single-byte "key" and just XORs each | |
| 47 | + // byte with that key. This makes it reversible, so there is no | |
| 48 | + // distinction between encoding and decoding. | |
| 49 | + | |
| 50 | + public: | |
| 51 | + Pl_XOR(char const* identifier, Pipeline* next, unsigned char key); | |
| 52 | + virtual ~Pl_XOR() = default; | |
| 53 | + virtual void write(unsigned char* data, size_t len) override; | |
| 54 | + virtual void finish() override; | |
| 55 | + | |
| 56 | + private: | |
| 57 | + unsigned char key; | |
| 58 | +}; | |
| 59 | + | |
| 60 | +Pl_XOR::Pl_XOR(char const* identifier, Pipeline* next, unsigned char key) : | |
| 61 | + Pipeline(identifier, next), | |
| 62 | + key(key) | |
| 63 | +{ | |
| 64 | +} | |
| 65 | + | |
| 66 | +void | |
| 67 | +Pl_XOR::write(unsigned char* data, size_t len) | |
| 68 | +{ | |
| 69 | + for (size_t i = 0; i < len; ++i) | |
| 70 | + { | |
| 71 | + unsigned char p = data[i] ^ this->key; | |
| 72 | + getNext()->write(&p, 1); | |
| 73 | + } | |
| 74 | +} | |
| 75 | + | |
| 76 | +void | |
| 77 | +Pl_XOR::finish() | |
| 78 | +{ | |
| 79 | + getNext()->finish(); | |
| 80 | +} | |
| 81 | + | |
| 82 | +class SF_XORDecode: public QPDFStreamFilter | |
| 83 | +{ | |
| 84 | + // This class implements a QPDFStreamFilter that knows how to | |
| 85 | + // validate and interpret decode parameters (/DecodeParms) for the | |
| 86 | + // made-up /XORDecode stream filter. Since this is not a real | |
| 87 | + // stream filter, no actual PDF reader would know how to interpret | |
| 88 | + // it. This is just to illustrate how to create a stream filter. | |
| 89 | + // In main(), we call QPDF::registerStreamFilter to tell the | |
| 90 | + // library about the filter. See comments in QPDFStreamFilter.hh | |
| 91 | + // for details on how to implement the methods. For purposes of | |
| 92 | + // example, we are calling this a "specialized" compression | |
| 93 | + // filter, which just means QPDF assumes that it should not | |
| 94 | + // "uncompress" the stream by default. | |
| 95 | + public: | |
| 96 | + virtual ~SF_XORDecode() = default; | |
| 97 | + virtual bool setDecodeParms(QPDFObjectHandle decode_parms) override; | |
| 98 | + virtual Pipeline* getDecodePipeline(Pipeline* next) override; | |
| 99 | + virtual bool isSpecializedCompression() override; | |
| 100 | + | |
| 101 | + private: | |
| 102 | + unsigned char key; | |
| 103 | + // It is the responsibility of the QPDFStreamFilter implementation | |
| 104 | + // to ensure that the pipeline returned by getDecodePipeline() is | |
| 105 | + // deleted when the class is deleted. The easiest way to do this | |
| 106 | + // is to stash the pipeline in a std::shared_ptr, which enables us | |
| 107 | + // to use the default destructor implementation. | |
| 108 | + std::shared_ptr<Pl_XOR> pipeline; | |
| 109 | +}; | |
| 110 | + | |
| 111 | +bool | |
| 112 | +SF_XORDecode::setDecodeParms(QPDFObjectHandle decode_parms) | |
| 113 | +{ | |
| 114 | + // For purposes of example, we store the key in a separate stream. | |
| 115 | + // We could just as well store the key directly in /DecodeParms, | |
| 116 | + // but this example uses a stream to illustrate how one might do | |
| 117 | + // that. For example, if implementing /JBIG2Decode, one would need | |
| 118 | + // to handle the /JBIG2Globals key, which points to a stream. See | |
| 119 | + // comments in StreamReplacer::registerStream for additional notes | |
| 120 | + // on this. | |
| 121 | + try | |
| 122 | + { | |
| 123 | + // Expect /DecodeParms to be a dictionary with a /KeyStream | |
| 124 | + // key that points to a one-byte stream whose single byte is | |
| 125 | + // the key. If we are successful at retrieving the key, return | |
| 126 | + // true, indicating that we are able to process with the given | |
| 127 | + // decode parameters. Under any other circumstances, return | |
| 128 | + // false. For other examples of QPDFStreamFilter | |
| 129 | + // implementations, look at the classes whose names start with | |
| 130 | + // SF_ in the qpdf library implementation. | |
| 131 | + auto buf = decode_parms.getKey("/KeyStream").getStreamData(); | |
| 132 | + if (buf->getSize() != 1) | |
| 133 | + { | |
| 134 | + return false; | |
| 135 | + } | |
| 136 | + this->key = buf->getBuffer()[0]; | |
| 137 | + return true; | |
| 138 | + } | |
| 139 | + catch (std::exception& e) | |
| 140 | + { | |
| 141 | + std::cerr << "Error extracting key for /XORDecode: " | |
| 142 | + << e.what() << std::endl; | |
| 143 | + } | |
| 144 | + return false; | |
| 145 | +} | |
| 146 | + | |
| 147 | +Pipeline* | |
| 148 | +SF_XORDecode::getDecodePipeline(Pipeline* next) | |
| 149 | +{ | |
| 150 | + // Return a pipeline that the qpdf library should pass the stream | |
| 151 | + // data through. The pipeline should receive encoded data and pass | |
| 152 | + // decoded data to "next". getDecodePipeline() can always count on | |
| 153 | + // setDecodeParms() having been called first. The setDecodeParms() | |
| 154 | + // method should store any parameters needed by the pipeline. To | |
| 155 | + // ensure that the pipeline we return disappears when the class | |
| 156 | + // disappears, stash it in a std::shared_ptr<Pl_XOR> and retrieve | |
| 157 | + // the raw pointer from there. | |
| 158 | + this->pipeline = std::make_shared<Pl_XOR>("xor", next, this->key); | |
| 159 | + return this->pipeline.get(); | |
| 160 | +} | |
| 161 | + | |
| 162 | +bool | |
| 163 | +SF_XORDecode::isSpecializedCompression() | |
| 164 | +{ | |
| 165 | + // The default implementation of QPDFStreamFilter would return | |
| 166 | + // false, so if you want a specialized or lossy compression | |
| 167 | + // filter, override one of the methods as described in | |
| 168 | + // QPDFStreamFilter.hh. | |
| 169 | + return true; | |
| 170 | +} | |
| 171 | + | |
| 172 | +class StreamReplacer: public QPDFObjectHandle::StreamDataProvider | |
| 173 | +{ | |
| 174 | + // This class implements a StreamDataProvider that, under specific | |
| 175 | + // conditions, replaces the stream data with data encoded with the | |
| 176 | + // made-up /XORDecode filter. | |
| 177 | + | |
| 178 | + // The flow for this class is as follows: | |
| 179 | + // | |
| 180 | + // * The main application iterates through streams that should be | |
| 181 | + // replaced and calls registerStream. registerStream in turn | |
| 182 | + // calls maybeReplace passing nullptr to pipeline and the | |
| 183 | + // address of a valid QPDFObjectHandle to dict_updates. The | |
| 184 | + // stream passed in for this call is the stream for the original | |
| 185 | + // QPDF object. It has not yet been altered, so we have access | |
| 186 | + // to its original dictionary and data. As described in the | |
| 187 | + // method, the method when called in this way makes a | |
| 188 | + // determination as to whether the stream should be replaced. If | |
| 189 | + // so, registerStream makes whatever changes are required. We | |
| 190 | + // have to do this now because we can't modify the stream during | |
| 191 | + // the writing process. | |
| 192 | + // | |
| 193 | + // * provideStreamData(), which is called by QPDFWriter during the | |
| 194 | + // write process, actually writes the modified stream data. It | |
| 195 | + // calls maybeReplace again, but this time it passes a valid | |
| 196 | + // pipeline and passes nullptr to dict_updates. In this mode, | |
| 197 | + // the stream dictionary has already been altered, and the | |
| 198 | + // original stream data is no longer directly accessible. Trying | |
| 199 | + // to retrieve the stream data would be an infinite loop because | |
| 200 | + // it would just end up calling provideStreamData again. This is | |
| 201 | + // why maybeReplace uses a stashed copy of the original stream | |
| 202 | + // from the "other" QPDF object. | |
| 203 | + | |
| 204 | + // Additional explanation can be found in the method | |
| 205 | + // implementations. | |
| 206 | + | |
| 207 | + public: | |
| 208 | + StreamReplacer(QPDF* pdf); | |
| 209 | + virtual ~StreamReplacer() = default; | |
| 210 | + virtual void provideStreamData(int objid, int generation, | |
| 211 | + Pipeline* pipeline) override; | |
| 212 | + | |
| 213 | + void registerStream( | |
| 214 | + QPDFObjectHandle stream, | |
| 215 | + PointerHolder<QPDFObjectHandle::StreamDataProvider> self); | |
| 216 | + | |
| 217 | + private: | |
| 218 | + bool maybeReplace(QPDFObjGen const& og, | |
| 219 | + QPDFObjectHandle& stream, Pipeline* pipeline, | |
| 220 | + QPDFObjectHandle* dict_updates); | |
| 221 | + | |
| 222 | + // Hang onto a reference to the QPDF object containing the streams | |
| 223 | + // we are replacing. We need this to create a new stream. | |
| 224 | + QPDF* pdf; | |
| 225 | + | |
| 226 | + // This second QPDF instance gives us a place to copy streams to | |
| 227 | + // so that we can access the original stream data of the streams | |
| 228 | + // whose data we are replacing. | |
| 229 | + QPDF other; | |
| 230 | + | |
| 231 | + // Map the object/generation in original file to the copied stream | |
| 232 | + // in "other". We use this to retrieve the original data. | |
| 233 | + std::map<QPDFObjGen, QPDFObjectHandle> copied_streams; | |
| 234 | + | |
| 235 | + // Each stream gets its own "key" for the XOR filter. We use a | |
| 236 | + // single instance of StreamReplacer for all streams, so stash all | |
| 237 | + // the keys here. | |
| 238 | + std::map<QPDFObjGen, unsigned char> keys; | |
| 239 | +}; | |
| 240 | + | |
| 241 | +StreamReplacer::StreamReplacer(QPDF* pdf) : | |
| 242 | + pdf(pdf) | |
| 243 | +{ | |
| 244 | + // Our "other" QPDF is just a place to stash streams. It doesn't | |
| 245 | + // have to be a valid PDF with pages, etc. We are never going to | |
| 246 | + // write this out. | |
| 247 | + this->other.emptyPDF(); | |
| 248 | +} | |
| 249 | + | |
| 250 | +bool | |
| 251 | +StreamReplacer::maybeReplace(QPDFObjGen const& og, | |
| 252 | + QPDFObjectHandle& stream, | |
| 253 | + Pipeline* pipeline, | |
| 254 | + QPDFObjectHandle* dict_updates) | |
| 255 | +{ | |
| 256 | + // As described in the class comments, this method is called | |
| 257 | + // twice. Before writing has started pipeline is nullptr, and | |
| 258 | + // dict_updates is provided. In this mode, we figure out whether | |
| 259 | + // we should replace the stream and, if so, take care of the | |
| 260 | + // necessary setup. When we are actually ready to supply the data, | |
| 261 | + // this method is called again with pipeline populated and | |
| 262 | + // dict_updates as a nullptr. In this mode, we are not allowed to | |
| 263 | + // change anything, since writing is already in progress. We | |
| 264 | + // must simply provide the stream data. | |
| 265 | + | |
| 266 | + // The return value indicates whether or not we should replace the | |
| 267 | + // stream. If the first call returns false, there will be no | |
| 268 | + // second call. If the second call returns false, something went | |
| 269 | + // wrong since the method should always make the same decision for | |
| 270 | + // a given stream. | |
| 271 | + | |
| 272 | + // For this example, all the determination logic could have | |
| 273 | + // appeared inside the if (dict_updates) block rather than being | |
| 274 | + // duplicated, but in some cases, there may be a reason to | |
| 275 | + // duplicate things. For example, if you wanted to write code that | |
| 276 | + // re-encoded an image if the new encoding was more efficient, | |
| 277 | + // you'd have to actually try it out. Then you would either have | |
| 278 | + // to cache the result somewhere or just repeat the calculations, | |
| 279 | + // depending on space/time constraints, etc. | |
| 280 | + | |
| 281 | + // In our contrived example, we are replacing the data for all | |
| 282 | + // streams that have /DoXOR = true in the stream dictionary. If | |
| 283 | + // this were a more realistic application, our criteria would be | |
| 284 | + // more sensible. For example, an image downsampler might choose | |
| 285 | + // to replace a stream that represented an image with a high pixel | |
| 286 | + // density. | |
| 287 | + auto dict = stream.getDict(); | |
| 288 | + auto mark = dict.getKey("/DoXOR"); | |
| 289 | + if (! (mark.isBool() && mark.getBoolValue())) | |
| 290 | + { | |
| 291 | + return false; | |
| 292 | + } | |
| 293 | + | |
| 294 | + // We can't replace the stream data if we can't get the original | |
| 295 | + // stream data for any reason. A more realistic application may | |
| 296 | + // actually look at the data here as well, or it may be able to | |
| 297 | + // make all its decisions from the stream dictionary. However, | |
| 298 | + // it's a good idea to make sure we can retrieve the filtered data | |
| 299 | + // if we are going to need it later. | |
| 300 | + PointerHolder<Buffer> out; | |
| 301 | + try | |
| 302 | + { | |
| 303 | + out = stream.getStreamData(); | |
| 304 | + } | |
| 305 | + catch (...) | |
| 306 | + { | |
| 307 | + return false; | |
| 308 | + } | |
| 309 | + | |
| 310 | + if (dict_updates) | |
| 311 | + { | |
| 312 | + // It's not safe to make any modifications to any objects | |
| 313 | + // during the writing process since the updated objects may | |
| 314 | + // have already been written. In this mode, when dict_updates | |
| 315 | + // is provided, we have not started writing. Store the | |
| 316 | + // modifications we intend to make to the stream dictionary | |
| 317 | + // here. We're just storing /OrigLength for purposes of | |
| 318 | + // example. Again, a realistic application would make other | |
| 319 | + // changes. For example, an image resampler might change the | |
| 320 | + // dimensions or other properties of the image. | |
| 321 | + dict_updates->replaceKey( | |
| 322 | + "/OrigLength", QPDFObjectHandle::newInteger( | |
| 323 | + QIntC::to_longlong(out->getSize()))); | |
| 324 | + // We are also storing the "key" that we will access when | |
| 325 | + // writing the data. | |
| 326 | + this->keys[og] = QIntC::to_uchar( | |
| 327 | + (og.getObj() * QIntC::to_int(out->getSize())) & 0xff); | |
| 328 | + } | |
| 329 | + | |
| 330 | + if (pipeline) | |
| 331 | + { | |
| 332 | + unsigned char key = this->keys[og]; | |
| 333 | + Pl_XOR p("xor", pipeline, key); | |
| 334 | + p.write(out->getBuffer(), out->getSize()); | |
| 335 | + p.finish(); | |
| 336 | + } | |
| 337 | + return true; | |
| 338 | +} | |
| 339 | + | |
| 340 | +void | |
| 341 | +StreamReplacer::registerStream( | |
| 342 | + QPDFObjectHandle stream, | |
| 343 | + PointerHolder<QPDFObjectHandle::StreamDataProvider> self) | |
| 344 | +{ | |
| 345 | + QPDFObjGen og(stream.getObjGen()); | |
| 346 | + | |
| 347 | + // We don't need to process a stream more than once. In this | |
| 348 | + // example, we are just iterating through objects, but if we were | |
| 349 | + // doing something like iterating through images on pages, we | |
| 350 | + // might realistically encounter the same stream more than once. | |
| 351 | + if (this->copied_streams.count(og) > 0) | |
| 352 | + { | |
| 353 | + return; | |
| 354 | + } | |
| 355 | + // Store something in copied_streams so that we don't | |
| 356 | + // double-process even in the negative case. This gets replaced | |
| 357 | + // later if needed. | |
| 358 | + this->copied_streams[og] = QPDFObjectHandle::newNull(); | |
| 359 | + | |
| 360 | + // Call maybeReplace with dict_updates. In this mode, it | |
| 361 | + // determines whether we should replace the stream data and, if | |
| 362 | + // so, supplies dictionary updates we should make. | |
| 363 | + bool should_replace = false; | |
| 364 | + QPDFObjectHandle dict_updates = QPDFObjectHandle::newDictionary(); | |
| 365 | + try | |
| 366 | + { | |
| 367 | + should_replace = maybeReplace(og, stream, nullptr, &dict_updates); | |
| 368 | + } | |
| 369 | + catch (std::exception& e) | |
| 370 | + { | |
| 371 | + stream.warnIfPossible( | |
| 372 | + std::string("exception while attempting to replace: ") + | |
| 373 | + e.what()); | |
| 374 | + } | |
| 375 | + | |
| 376 | + if (should_replace) | |
| 377 | + { | |
| 378 | + // Copy the stream to another QPDF object so we can get to the | |
| 379 | + // original data from the stream data provider. | |
| 380 | + this->copied_streams[og] = this->other.copyForeignObject(stream); | |
| 381 | + // Update the stream dictionary with any changes. | |
| 382 | + auto dict = stream.getDict(); | |
| 383 | + for (auto const& k: dict_updates.getKeys()) | |
| 384 | + { | |
| 385 | + dict.replaceKey(k, dict_updates.getKey(k)); | |
| 386 | + } | |
| 387 | + // Create the key stream that will be referenced from | |
| 388 | + // /DecodeParms. We have to do this now since you can't modify | |
| 389 | + // or create objects during write. | |
| 390 | + char p[1] = { static_cast<char>(this->keys[og]) }; | |
| 391 | + std::string p_str(p, 1); | |
| 392 | + QPDFObjectHandle dp_stream = | |
| 393 | + QPDFObjectHandle::newStream(this->pdf, p_str); | |
| 394 | + // Create /DecodeParms as expected by our fictitious | |
| 395 | + // /XORDecode filter. | |
| 396 | + QPDFObjectHandle decode_parms = | |
| 397 | + QPDFObjectHandle::newDictionary({{"/KeyStream", dp_stream}}); | |
| 398 | + stream.replaceStreamData( | |
| 399 | + self, | |
| 400 | + QPDFObjectHandle::newName("/XORDecode"), | |
| 401 | + decode_parms); | |
| 402 | + // Further, if /ProtectXOR = true, we disable filtering on write | |
| 403 | + // so that QPDFWriter will not decode the stream even though we | |
| 404 | + // have registered a stream filter for /XORDecode. | |
| 405 | + auto protect = dict.getKey("/ProtectXOR"); | |
| 406 | + if (protect.isBool() && protect.getBoolValue()) | |
| 407 | + { | |
| 408 | + stream.setFilterOnWrite(false); | |
| 409 | + } | |
| 410 | + } | |
| 411 | +} | |
| 412 | + | |
| 413 | +void | |
| 414 | +StreamReplacer::provideStreamData(int objid, int generation, | |
| 415 | + Pipeline* pipeline) | |
| 416 | +{ | |
| 417 | + QPDFObjGen og(objid, generation); | |
| 418 | + QPDFObjectHandle orig = this->copied_streams[og]; | |
| 419 | + // call maybeReplace again, this time with the pipeline and no | |
| 420 | + // dict_updates. In this mode, maybeReplace doesn't make any | |
| 421 | + // changes. We have to hand it the original stream data, which we | |
| 422 | + // get from copied_streams. | |
| 423 | + if (! maybeReplace(og, orig, pipeline, nullptr)) | |
| 424 | + { | |
| 425 | + // Since this only gets called for streams we already | |
| 426 | + // determined we are replacing, a false return would indicate | |
| 427 | + // a logic error. | |
| 428 | + throw std::logic_error( | |
| 429 | + "should_replace return false in provideStreamData"); | |
| 430 | + } | |
| 431 | +} | |
| 432 | + | |
| 433 | +static void process(char const* infilename, char const* outfilename, | |
| 434 | + bool decode_specialized) | |
| 435 | +{ | |
| 436 | + QPDF qpdf; | |
| 437 | + qpdf.processFile(infilename); | |
| 438 | + | |
| 439 | + // Create a single StreamReplacer instance. The interface requires | |
| 440 | + // a PointerHolder in various places, so allocate a StreamReplacer | |
| 441 | + // and stash it in a PointerHolder. | |
| 442 | + StreamReplacer* replacer = new StreamReplacer(&qpdf); | |
| 443 | + PointerHolder<QPDFObjectHandle::StreamDataProvider> p(replacer); | |
| 444 | + | |
| 445 | + for (auto& o: qpdf.getAllObjects()) | |
| 446 | + { | |
| 447 | + if (o.isStream()) | |
| 448 | + { | |
| 449 | + // Call registerStream for every stream. Only ones that | |
| 450 | + // registerStream decides to replace will actually be | |
| 451 | + // replaced. | |
| 452 | + replacer->registerStream(o, p); | |
| 453 | + } | |
| 454 | + } | |
| 455 | + | |
| 456 | + QPDFWriter w(qpdf, outfilename); | |
| 457 | + if (decode_specialized) | |
| 458 | + { | |
| 459 | + w.setDecodeLevel(qpdf_dl_specialized); | |
| 460 | + } | |
| 461 | + // For the test suite, use static IDs. | |
| 462 | + w.setStaticID(true); // for testing only | |
| 463 | + w.write(); | |
| 464 | + std::cout << whoami << ": new file written to " << outfilename | |
| 465 | + << std::endl; | |
| 466 | +} | |
| 467 | + | |
| 468 | +static void usage() | |
| 469 | +{ | |
| 470 | + std::cerr | |
| 471 | + << "\n" | |
| 472 | + << "Usage: " << whoami << " [ --decode-specialized ] infile outfile\n" | |
| 473 | + << std::endl; | |
| 474 | + exit(2); | |
| 475 | +} | |
| 476 | + | |
| 477 | +int main(int argc, char* argv[]) | |
| 478 | +{ | |
| 479 | + whoami = QUtil::getWhoami(argv[0]); | |
| 480 | + | |
| 481 | + // For libtool's sake.... | |
| 482 | + if (strncmp(whoami, "lt-", 3) == 0) | |
| 483 | + { | |
| 484 | + whoami += 3; | |
| 485 | + } | |
| 486 | + | |
| 487 | + char const* infilename = 0; | |
| 488 | + char const* outfilename = 0; | |
| 489 | + bool decode_specialized = false; | |
| 490 | + for (int i = 1; i < argc; ++i) | |
| 491 | + { | |
| 492 | + if (strcmp(argv[i], "--decode-specialized") == 0) | |
| 493 | + { | |
| 494 | + decode_specialized = true; | |
| 495 | + } | |
| 496 | + else if (! infilename) | |
| 497 | + { | |
| 498 | + infilename = argv[i]; | |
| 499 | + } | |
| 500 | + else if (! outfilename) | |
| 501 | + { | |
| 502 | + outfilename = argv[i]; | |
| 503 | + } | |
| 504 | + else | |
| 505 | + { | |
| 506 | + usage(); | |
| 507 | + } | |
| 508 | + } | |
| 509 | + if (! (infilename && outfilename)) | |
| 510 | + { | |
| 511 | + usage(); | |
| 512 | + } | |
| 513 | + | |
| 514 | + try | |
| 515 | + { | |
| 516 | + // Register our fictitious filter. This enables QPDFWriter to | |
| 517 | + // decode our streams. This is not a real filter, so no real | |
| 518 | + // PDF reading application would be able to interpret it. This | |
| 519 | + // is just for illustrative purposes. | |
| 520 | + QPDF::registerStreamFilter( | |
| 521 | + "/XORDecode", []{ return std::make_shared<SF_XORDecode>(); }); | |
| 522 | + // Do the actual processing. | |
| 523 | + process(infilename, outfilename, decode_specialized); | |
| 524 | + } | |
| 525 | + catch (std::exception &e) | |
| 526 | + { | |
| 527 | + std::cerr << whoami << ": exception: " << e.what() << std::endl; | |
| 528 | + exit(2); | |
| 529 | + } | |
| 530 | + | |
| 531 | + return 0; | |
| 532 | +} | ... | ... |
examples/qtest/custom-filter.test
0 → 100644
| 1 | +#!/usr/bin/env perl | |
| 2 | +require 5.008; | |
| 3 | +use warnings; | |
| 4 | +use strict; | |
| 5 | + | |
| 6 | +chdir("custom-filter") or die "chdir testdir failed: $!\n"; | |
| 7 | + | |
| 8 | +require TestDriver; | |
| 9 | + | |
| 10 | +cleanup(); | |
| 11 | + | |
| 12 | +my $td = new TestDriver('custom-filter'); | |
| 13 | + | |
| 14 | +# The file input.pdf contains two streams, whose contents appear | |
| 15 | +# uncompressed with explanatory text. They are marked with the keys | |
| 16 | +# that pdf-custom-filter uses to decide 1) to re-encode using the | |
| 17 | +# fictitious /XORDecode filter, and 2) whether to protect the stream | |
| 18 | +# to prevent decoding using the custom filter even when decoding | |
| 19 | +# specialized filters is requested. | |
| 20 | + | |
| 21 | +$td->runtest("custom filter, decode generalized", | |
| 22 | + {$td->COMMAND => "pdf-custom-filter input.pdf a.pdf"}, | |
| 23 | + {$td->STRING => "pdf-custom-filter: new file written to a.pdf\n", | |
| 24 | + $td->EXIT_STATUS => 0}, | |
| 25 | + $td->NORMALIZE_NEWLINES); | |
| 26 | +$td->runtest("check output", | |
| 27 | + {$td->FILE => "a.pdf"}, | |
| 28 | + {$td->FILE => "generalized.pdf"}); | |
| 29 | + | |
| 30 | +$td->runtest("custom filter, decode specialized", | |
| 31 | + {$td->COMMAND => | |
| 32 | + "pdf-custom-filter --decode-specialized input.pdf a.pdf"}, | |
| 33 | + {$td->STRING => "pdf-custom-filter: new file written to a.pdf\n", | |
| 34 | + $td->EXIT_STATUS => 0}, | |
| 35 | + $td->NORMALIZE_NEWLINES); | |
| 36 | +$td->runtest("check output", | |
| 37 | + {$td->FILE => "a.pdf"}, | |
| 38 | + {$td->FILE => "specialized.pdf"}); | |
| 39 | + | |
| 40 | +cleanup(); | |
| 41 | + | |
| 42 | +$td->report(4); | |
| 43 | + | |
| 44 | +sub cleanup | |
| 45 | +{ | |
| 46 | + unlink "a.pdf"; | |
| 47 | +} | ... | ... |
examples/qtest/custom-filter/generalized.pdf
0 → 100644
No preview for this file type
examples/qtest/custom-filter/input.pdf
0 → 100644
| 1 | +%PDF-1.3 | |
| 2 | +%¿÷¢þ | |
| 3 | +%QDF-1.0 | |
| 4 | + | |
| 5 | +%% Original object ID: 1 0 | |
| 6 | +1 0 obj | |
| 7 | +<< | |
| 8 | + /Pages 2 0 R | |
| 9 | + /Type /Catalog | |
| 10 | +>> | |
| 11 | +endobj | |
| 12 | + | |
| 13 | +%% Original object ID: 2 0 | |
| 14 | +2 0 obj | |
| 15 | +<< | |
| 16 | + /Count 1 | |
| 17 | + /Kids [ | |
| 18 | + 3 0 R | |
| 19 | + ] | |
| 20 | + /Type /Pages | |
| 21 | +>> | |
| 22 | +endobj | |
| 23 | + | |
| 24 | +%% Page 1 | |
| 25 | +%% Original object ID: 3 0 | |
| 26 | +3 0 obj | |
| 27 | +<< | |
| 28 | + /Contents 4 0 R | |
| 29 | + /MediaBox [ | |
| 30 | + 0 | |
| 31 | + 0 | |
| 32 | + 612 | |
| 33 | + 792 | |
| 34 | + ] | |
| 35 | + /Parent 2 0 R | |
| 36 | + /Resources << | |
| 37 | + /Font << | |
| 38 | + /F1 6 0 R | |
| 39 | + >> | |
| 40 | + /ProcSet 7 0 R | |
| 41 | + >> | |
| 42 | + /Type /Page | |
| 43 | +>> | |
| 44 | +endobj | |
| 45 | + | |
| 46 | +%% Contents for page 1 | |
| 47 | +%% Original object ID: 4 0 | |
| 48 | +4 0 obj | |
| 49 | +<< | |
| 50 | + /Length 5 0 R | |
| 51 | +>> | |
| 52 | +stream | |
| 53 | +BT | |
| 54 | + /F1 24 Tf | |
| 55 | + 72 720 Td | |
| 56 | + (Potato) Tj | |
| 57 | +ET | |
| 58 | +endstream | |
| 59 | +endobj | |
| 60 | + | |
| 61 | +5 0 obj | |
| 62 | +44 | |
| 63 | +endobj | |
| 64 | + | |
| 65 | +%% Original object ID: 6 0 | |
| 66 | +6 0 obj | |
| 67 | +<< | |
| 68 | + /BaseFont /Helvetica | |
| 69 | + /Encoding /WinAnsiEncoding | |
| 70 | + /Name /F1 | |
| 71 | + /Subtype /Type1 | |
| 72 | + /Type /Font | |
| 73 | +>> | |
| 74 | +endobj | |
| 75 | + | |
| 76 | +%% Original object ID: 5 0 | |
| 77 | +7 0 obj | |
| 78 | +[ | |
| 79 | + /PDF | |
| 80 | + /Text | |
| 81 | +] | |
| 82 | +endobj | |
| 83 | + | |
| 84 | +8 0 obj | |
| 85 | +<< | |
| 86 | + /Length 9 0 R | |
| 87 | + /DoXOR true | |
| 88 | +>> | |
| 89 | +stream | |
| 90 | + | |
| 91 | +This stream has /DoXOR true. When processed with pdf-custom-filter | |
| 92 | +without the --decode-specialized option, the stream will appear in the | |
| 93 | +output encoded with the fictitious /XORDecode filter, and its | |
| 94 | +/DecodeParms will contain a reference to the key stream. When | |
| 95 | +processed with pdf-custom-filter with the --decode-specialized option, | |
| 96 | +it will appear in the output as a regular stream with /FlateDecode, | |
| 97 | +but the /OrigLength key will still have been added. | |
| 98 | + | |
| 99 | +endstream | |
| 100 | +endobj | |
| 101 | + | |
| 102 | +9 0 obj | |
| 103 | +455 | |
| 104 | +endobj | |
| 105 | + | |
| 106 | +10 0 obj | |
| 107 | +<< | |
| 108 | + /Length 11 0 R | |
| 109 | + /DoXOR true | |
| 110 | + /ProtectXOR true | |
| 111 | +>> | |
| 112 | +stream | |
| 113 | + | |
| 114 | +This stream has /DoXOR true and /ProtectXOR true. When processed with | |
| 115 | +pdf-custom-filter with or without the --decode-specialized option, the | |
| 116 | +stream will appear in the output encoded with the fictitious | |
| 117 | +/XORDecode filter, and its /DecodeParms will contain a reference to | |
| 118 | +the key stream. | |
| 119 | + | |
| 120 | +endstream | |
| 121 | +endobj | |
| 122 | + | |
| 123 | +11 0 obj | |
| 124 | +288 | |
| 125 | +endobj | |
| 126 | + | |
| 127 | +xref | |
| 128 | +0 12 | |
| 129 | +0000000000 65535 f | |
| 130 | +0000000052 00000 n | |
| 131 | +0000000133 00000 n | |
| 132 | +0000000242 00000 n | |
| 133 | +0000000484 00000 n | |
| 134 | +0000000583 00000 n | |
| 135 | +0000000629 00000 n | |
| 136 | +0000000774 00000 n | |
| 137 | +0000000809 00000 n | |
| 138 | +0000001333 00000 n | |
| 139 | +0000001353 00000 n | |
| 140 | +0000001731 00000 n | |
| 141 | +trailer << | |
| 142 | + /Root 1 0 R | |
| 143 | + /Size 12 | |
| 144 | + /Example [ 8 0 R 10 0 R ] | |
| 145 | + /ID [<01f4bb169ae6e6b5f27505733e9abf42><01f4bb169ae6e6b5f27505733e9abf42>] | |
| 146 | +>> | |
| 147 | +startxref | |
| 148 | +1752 | |
| 149 | +%%EOF | ... | ... |
examples/qtest/custom-filter/specialized.pdf
0 → 100644
No preview for this file type
include/qpdf/QPDFObjectHandle.hh
| ... | ... | @@ -92,6 +92,15 @@ class QPDFObjectHandle |
| 92 | 92 | // writing linearized files, if the work done by your stream |
| 93 | 93 | // data provider is slow or computationally intensive, you |
| 94 | 94 | // might want to implement your own cache. |
| 95 | + // | |
| 96 | + // * Once you have called replaceStreamData, the original | |
| 97 | + // stream data is no longer directly accessible from the | |
| 98 | + // stream, but this is easy to work around by copying the | |
| 99 | + // stream to a separate QPDF object. The qpdf library | |
| 100 | + // implements this very efficiently without actually making | |
| 101 | + // a copy of the stream data. You can find examples of this | |
| 102 | + // pattern in some of the examples, including | |
| 103 | + // pdf-custom-filter.cc and pdf-invert-images.cc. | |
| 95 | 104 | |
| 96 | 105 | // Prior to qpdf 10.0.0, it was not possible to handle errors |
| 97 | 106 | // the way pipeStreamData does or to pass back success. | ... | ... |