Commit d4d7630cf544dc295202382026658b55bf49f76b
1 parent
ac042d16
Add pdf-custom-filter example
Showing
9 changed files
with
752 additions
and
11 deletions
ChangeLog
| @@ -27,7 +27,8 @@ | @@ -27,7 +27,8 @@ | ||
| 27 | provide code to validate and interpret /DecodeParms for a specific | 27 | provide code to validate and interpret /DecodeParms for a specific |
| 28 | /Filter and also to provide a pipeline that will decode. Note that | 28 | /Filter and also to provide a pipeline that will decode. Note that |
| 29 | it is possible to encode to a filter type that is not supported | 29 | it is possible to encode to a filter type that is not supported |
| 30 | - even without this feature. | 30 | + even without this feature. See examples/pdf-custom-filter.cc for |
| 31 | + an example of using custom stream filters. | ||
| 31 | 32 | ||
| 32 | 2020-12-22 Jay Berkenbilt <ejb@ql.org> | 33 | 2020-12-22 Jay Berkenbilt <ejb@ql.org> |
| 33 | 34 |
TODO
| @@ -589,6 +589,8 @@ I find it useful to make reference to them in this list | @@ -589,6 +589,8 @@ I find it useful to make reference to them in this list | ||
| 589 | a stream data provider is especially expensive, it can implement | 589 | a stream data provider is especially expensive, it can implement |
| 590 | its own cache. | 590 | its own cache. |
| 591 | 591 | ||
| 592 | - The implementation of pluggable stream filters includes an example | ||
| 593 | - that illustrates how a program might handle making decisions about | ||
| 594 | - filters and decode parameters based on the input data. | 592 | + The example examples/pdf-custom-filter.cc demonstrates the use of |
| 593 | + custom stream filters. This includes a custom pipeline, a custom | ||
| 594 | + stream filter, as well as modification of a stream's dictionary to | ||
| 595 | + include creation of a new stream that is referenced from | ||
| 596 | + /DecodeParms. |
examples/build.mk
| 1 | BINS_examples = \ | 1 | BINS_examples = \ |
| 2 | pdf-bookmarks \ | 2 | pdf-bookmarks \ |
| 3 | - pdf-mod-info \ | ||
| 4 | - pdf-npages \ | 3 | + pdf-count-strings \ |
| 4 | + pdf-create \ | ||
| 5 | + pdf-custom-filter \ | ||
| 5 | pdf-double-page-size \ | 6 | pdf-double-page-size \ |
| 7 | + pdf-filter-tokens \ | ||
| 6 | pdf-invert-images \ | 8 | pdf-invert-images \ |
| 7 | - pdf-create \ | 9 | + pdf-mod-info \ |
| 10 | + pdf-npages \ | ||
| 11 | + pdf-overlay-page \ | ||
| 8 | pdf-parse-content \ | 12 | pdf-parse-content \ |
| 9 | - pdf-split-pages \ | ||
| 10 | - pdf-filter-tokens \ | ||
| 11 | - pdf-count-strings \ | ||
| 12 | pdf-set-form-values \ | 13 | pdf-set-form-values \ |
| 13 | - pdf-overlay-page | 14 | + pdf-split-pages |
| 14 | CBINS_examples = \ | 15 | CBINS_examples = \ |
| 15 | pdf-c-objects \ | 16 | pdf-c-objects \ |
| 16 | pdf-linearize | 17 | pdf-linearize |
examples/pdf-custom-filter.cc
0 → 100644
| 1 | +#include <qpdf/QPDF.hh> | ||
| 2 | +#include <qpdf/QUtil.hh> | ||
| 3 | +#include <qpdf/QPDFWriter.hh> | ||
| 4 | +#include <qpdf/QPDFStreamFilter.hh> | ||
| 5 | + | ||
| 6 | +#include <cstring> | ||
| 7 | +#include <exception> | ||
| 8 | +#include <iostream> | ||
| 9 | +#include <memory> | ||
| 10 | + | ||
| 11 | +// This example shows you everything you need to know to implement a | ||
| 12 | +// custom stream filter for encoding and decoding as well as a stream | ||
| 13 | +// data provider that modifies the stream's dictionary. This example | ||
| 14 | +// uses the pattern of having the stream data provider class use a | ||
| 15 | +// second QPDF instance with copies of streams from the original QPDF | ||
| 16 | +// so that the stream data provider can access the original stream | ||
| 17 | +// data. This is implemented very efficiently inside the qpdf library as | ||
| 18 | +// the second QPDF instance knows how to read the stream data from the | ||
| 19 | +// original input file, so no extra copies of the original stream data | ||
| 20 | +// are made. | ||
| 21 | + | ||
| 22 | +// This example creates an imaginary filter called /XORDecode. There | ||
| 23 | +// is no such filter in PDF, so the streams created by the example | ||
| 24 | +// would not be usable by any PDF reader. However, the techniques here | ||
| 25 | +// would work if you were going to implement support for a filter that | ||
| 26 | +// qpdf does not support natively. For example, using the techniques | ||
| 27 | +// shown here, it would be possible to create an application that | ||
| 28 | +// downsampled or re-encoded images or that re-compressed streams | ||
| 29 | +// using a more efficient "deflate" implementation than zlib. | ||
| 30 | + | ||
| 31 | +// Comments appear throughout the code describing each piece of code | ||
| 32 | +// and its purpose. You can read the file top to bottom, or you can | ||
| 33 | +// start with main() and follow the flow. | ||
| 34 | + | ||
| 35 | +// Please also see the test suite, qtest/custom-filter.test, which | ||
| 36 | +// contains additional comments describing how to observe the results | ||
| 37 | +// of running this example on test files that are specifically crafted | ||
| 38 | +// for it. | ||
| 39 | + | ||
| 40 | +static char const* whoami = 0; | ||
| 41 | + | ||
| 42 | + | ||
| 43 | +class Pl_XOR: public Pipeline | ||
| 44 | +{ | ||
| 45 | + // This class implements a Pipeline for the made-up XOR decoder. | ||
| 46 | + // It is initialized with a single-byte "key" and just XORs each | ||
| 47 | + // byte with that key. This makes it reversible, so there is no | ||
| 48 | + // distinction between encoding and decoding. | ||
| 49 | + | ||
| 50 | + public: | ||
| 51 | + Pl_XOR(char const* identifier, Pipeline* next, unsigned char key); | ||
| 52 | + virtual ~Pl_XOR() = default; | ||
| 53 | + virtual void write(unsigned char* data, size_t len) override; | ||
| 54 | + virtual void finish() override; | ||
| 55 | + | ||
| 56 | + private: | ||
| 57 | + unsigned char key; | ||
| 58 | +}; | ||
| 59 | + | ||
| 60 | +Pl_XOR::Pl_XOR(char const* identifier, Pipeline* next, unsigned char key) : | ||
| 61 | + Pipeline(identifier, next), | ||
| 62 | + key(key) | ||
| 63 | +{ | ||
| 64 | +} | ||
| 65 | + | ||
| 66 | +void | ||
| 67 | +Pl_XOR::write(unsigned char* data, size_t len) | ||
| 68 | +{ | ||
| 69 | + for (size_t i = 0; i < len; ++i) | ||
| 70 | + { | ||
| 71 | + unsigned char p = data[i] ^ this->key; | ||
| 72 | + getNext()->write(&p, 1); | ||
| 73 | + } | ||
| 74 | +} | ||
| 75 | + | ||
| 76 | +void | ||
| 77 | +Pl_XOR::finish() | ||
| 78 | +{ | ||
| 79 | + getNext()->finish(); | ||
| 80 | +} | ||
| 81 | + | ||
| 82 | +class SF_XORDecode: public QPDFStreamFilter | ||
| 83 | +{ | ||
| 84 | + // This class implements a QPDFStreamFilter that knows how to | ||
| 85 | + // validate and interpret decode parameters (/DecodeParms) for the | ||
| 86 | + // made-up /XORDecode stream filter. Since this is not a real | ||
| 87 | + // stream filter, no actual PDF reader would know how to interpret | ||
| 88 | + // it. This is just to illustrate how to create a stream filter. | ||
| 89 | + // In main(), we call QPDF::registerStreamFilter to tell the | ||
| 90 | + // library about the filter. See comments in QPDFStreamFilter.hh | ||
| 91 | + // for details on how to implement the methods. For purposes of | ||
| 92 | + // example, we are calling this a "specialized" compression | ||
| 93 | + // filter, which just means QPDF assumes that it should not | ||
| 94 | + // "uncompress" the stream by default. | ||
| 95 | + public: | ||
| 96 | + virtual ~SF_XORDecode() = default; | ||
| 97 | + virtual bool setDecodeParms(QPDFObjectHandle decode_parms) override; | ||
| 98 | + virtual Pipeline* getDecodePipeline(Pipeline* next) override; | ||
| 99 | + virtual bool isSpecializedCompression() override; | ||
| 100 | + | ||
| 101 | + private: | ||
| 102 | + unsigned char key; | ||
| 103 | + // It is the responsibility of the QPDFStreamFilter implementation | ||
| 104 | + // to ensure that the pipeline returned by getDecodePipeline() is | ||
| 105 | + // deleted when the class is deleted. The easiest way to do this | ||
| 106 | + // is to stash the pipeline in a std::shared_ptr, which enables us | ||
| 107 | + // to use the default destructor implementation. | ||
| 108 | + std::shared_ptr<Pl_XOR> pipeline; | ||
| 109 | +}; | ||
| 110 | + | ||
| 111 | +bool | ||
| 112 | +SF_XORDecode::setDecodeParms(QPDFObjectHandle decode_parms) | ||
| 113 | +{ | ||
| 114 | + // For purposes of example, we store the key in a separate stream. | ||
| 115 | + // We could just as well store the key directly in /DecodeParms, | ||
| 116 | + // but this example uses a stream to illustrate how one might do | ||
| 117 | + // that. For example, if implementing /JBIG2Decode, one would need | ||
| 118 | + // to handle the /JBIG2Globals key, which points to a stream. See | ||
| 119 | + // comments in StreamReplacer::registerStream for additional notes | ||
| 120 | + // on this. | ||
| 121 | + try | ||
| 122 | + { | ||
| 123 | + // Expect /DecodeParms to be a dictionary with a /KeyStream | ||
| 124 | + // key that points to a one-byte stream whose single byte is | ||
| 125 | + // the key. If we are successful at retrieving the key, return | ||
| 126 | + // true, indicating that we are able to process with the given | ||
| 127 | + // decode parameters. Under any other circumstances, return | ||
| 128 | + // false. For other examples of QPDFStreamFilter | ||
| 129 | + // implementations, look at the classes whose names start with | ||
| 130 | + // SF_ in the qpdf library implementation. | ||
| 131 | + auto buf = decode_parms.getKey("/KeyStream").getStreamData(); | ||
| 132 | + if (buf->getSize() != 1) | ||
| 133 | + { | ||
| 134 | + return false; | ||
| 135 | + } | ||
| 136 | + this->key = buf->getBuffer()[0]; | ||
| 137 | + return true; | ||
| 138 | + } | ||
| 139 | + catch (std::exception& e) | ||
| 140 | + { | ||
| 141 | + std::cerr << "Error extracting key for /XORDecode: " | ||
| 142 | + << e.what() << std::endl; | ||
| 143 | + } | ||
| 144 | + return false; | ||
| 145 | +} | ||
| 146 | + | ||
| 147 | +Pipeline* | ||
| 148 | +SF_XORDecode::getDecodePipeline(Pipeline* next) | ||
| 149 | +{ | ||
| 150 | + // Return a pipeline that the qpdf library should pass the stream | ||
| 151 | + // data through. The pipeline should receive encoded data and pass | ||
| 152 | + // decoded data to "next". getDecodePipeline() can always count on | ||
| 153 | + // setDecodeParms() having been called first. The setDecodeParms() | ||
| 154 | + // method should store any parameters needed by the pipeline. To | ||
| 155 | + // ensure that the pipeline we return disappears when the class | ||
| 156 | + // disappears, stash it in a std::shared_ptr<Pl_XOR> and retrieve | ||
| 157 | + // the raw pointer from there. | ||
| 158 | + this->pipeline = std::make_shared<Pl_XOR>("xor", next, this->key); | ||
| 159 | + return this->pipeline.get(); | ||
| 160 | +} | ||
| 161 | + | ||
| 162 | +bool | ||
| 163 | +SF_XORDecode::isSpecializedCompression() | ||
| 164 | +{ | ||
| 165 | + // The default implementation of QPDFStreamFilter would return | ||
| 166 | + // false, so if you want a specialized or lossy compression | ||
| 167 | + // filter, override one of the methods as described in | ||
| 168 | + // QPDFStreamFilter.hh. | ||
| 169 | + return true; | ||
| 170 | +} | ||
| 171 | + | ||
| 172 | +class StreamReplacer: public QPDFObjectHandle::StreamDataProvider | ||
| 173 | +{ | ||
| 174 | + // This class implements a StreamDataProvider that, under specific | ||
| 175 | + // conditions, replaces the stream data with data encoded with the | ||
| 176 | + // made-up /XORDecode filter. | ||
| 177 | + | ||
| 178 | + // The flow for this class is as follows: | ||
| 179 | + // | ||
| 180 | + // * The main application iterates through streams that should be | ||
| 181 | + // replaced and calls registerStream. registerStream in turn | ||
| 182 | + // calls maybeReplace passing nullptr to pipeline and the | ||
| 183 | + // address of a valid QPDFObjectHandle to dict_updates. The | ||
| 184 | + // stream passed in for this call is the stream for the original | ||
| 185 | + // QPDF object. It has not yet been altered, so we have access | ||
| 186 | + // to its original dictionary and data. As described in the | ||
| 187 | + // method, the method when called in this way makes a | ||
| 188 | + // determination as to whether the stream should be replaced. If | ||
| 189 | + // so, registerStream makes whatever changes are required. We | ||
| 190 | + // have to do this now because we can't modify the stream during | ||
| 191 | + // the writing process. | ||
| 192 | + // | ||
| 193 | + // * provideStreamData(), which is called by QPDFWriter during the | ||
| 194 | + // write process, actually writes the modified stream data. It | ||
| 195 | + // calls maybeReplace again, but this time it passes a valid | ||
| 196 | + // pipeline and passes nullptr to dict_updates. In this mode, | ||
| 197 | + // the stream dictionary has already been altered, and the | ||
| 198 | + // original stream data is no longer directly accessible. Trying | ||
| 199 | + // to retrieve the stream data would be an infinite loop because | ||
| 200 | + // it would just end up calling provideStreamData again. This is | ||
| 201 | + // why maybeReplace uses a stashed copy of the original stream | ||
| 202 | + // from the "other" QPDF object. | ||
| 203 | + | ||
| 204 | + // Additional explanation can be found in the method | ||
| 205 | + // implementations. | ||
| 206 | + | ||
| 207 | + public: | ||
| 208 | + StreamReplacer(QPDF* pdf); | ||
| 209 | + virtual ~StreamReplacer() = default; | ||
| 210 | + virtual void provideStreamData(int objid, int generation, | ||
| 211 | + Pipeline* pipeline) override; | ||
| 212 | + | ||
| 213 | + void registerStream( | ||
| 214 | + QPDFObjectHandle stream, | ||
| 215 | + PointerHolder<QPDFObjectHandle::StreamDataProvider> self); | ||
| 216 | + | ||
| 217 | + private: | ||
| 218 | + bool maybeReplace(QPDFObjGen const& og, | ||
| 219 | + QPDFObjectHandle& stream, Pipeline* pipeline, | ||
| 220 | + QPDFObjectHandle* dict_updates); | ||
| 221 | + | ||
| 222 | + // Hang onto a reference to the QPDF object containing the streams | ||
| 223 | + // we are replacing. We need this to create a new stream. | ||
| 224 | + QPDF* pdf; | ||
| 225 | + | ||
| 226 | + // This second QPDF instance gives us a place to copy streams to | ||
| 227 | + // so that we can access the original stream data of the streams | ||
| 228 | + // whose data we are replacing. | ||
| 229 | + QPDF other; | ||
| 230 | + | ||
| 231 | + // Map the object/generation in original file to the copied stream | ||
| 232 | + // in "other". We use this to retrieve the original data. | ||
| 233 | + std::map<QPDFObjGen, QPDFObjectHandle> copied_streams; | ||
| 234 | + | ||
| 235 | + // Each stream gets its own "key" for the XOR filter. We use a | ||
| 236 | + // single instance of StreamReplacer for all streams, so stash all | ||
| 237 | + // the keys here. | ||
| 238 | + std::map<QPDFObjGen, unsigned char> keys; | ||
| 239 | +}; | ||
| 240 | + | ||
| 241 | +StreamReplacer::StreamReplacer(QPDF* pdf) : | ||
| 242 | + pdf(pdf) | ||
| 243 | +{ | ||
| 244 | + // Our "other" QPDF is just a place to stash streams. It doesn't | ||
| 245 | + // have to be a valid PDF with pages, etc. We are never going to | ||
| 246 | + // write this out. | ||
| 247 | + this->other.emptyPDF(); | ||
| 248 | +} | ||
| 249 | + | ||
| 250 | +bool | ||
| 251 | +StreamReplacer::maybeReplace(QPDFObjGen const& og, | ||
| 252 | + QPDFObjectHandle& stream, | ||
| 253 | + Pipeline* pipeline, | ||
| 254 | + QPDFObjectHandle* dict_updates) | ||
| 255 | +{ | ||
| 256 | + // As described in the class comments, this method is called | ||
| 257 | + // twice. Before writing has started, pipeline is nullptr, and | ||
| 258 | + // dict_updates is provided. In this mode, we figure out whether | ||
| 259 | + // we should replace the stream and, if so, take care of the | ||
| 260 | + // necessary setup. When we are actually ready to supply the data, | ||
| 261 | + // this method is called again with pipeline populated and | ||
| 262 | + // dict_updates as a nullptr. In this mode, we are not allowed to | ||
| 263 | + // change anything, since writing is already in progress. We | ||
| 264 | + // must simply provide the stream data. | ||
| 265 | + | ||
| 266 | + // The return value indicates whether or not we should replace the | ||
| 267 | + // stream. If the first call returns false, there will be no | ||
| 268 | + // second call. If the second call returns false, something went | ||
| 269 | + // wrong since the method should always make the same decision for | ||
| 270 | + // a given stream. | ||
| 271 | + | ||
| 272 | + // For this example, all the determination logic could have | ||
| 273 | + // appeared inside the if (dict_updates) block rather than being | ||
| 274 | + // duplicated, but in some cases, there may be a reason to | ||
| 275 | + // duplicate things. For example, if you wanted to write code that | ||
| 276 | + // re-encoded an image if the new encoding was more efficient, | ||
| 277 | + // you'd have to actually try it out. Then you would either have | ||
| 278 | + // to cache the result somewhere or just repeat the calculations, | ||
| 279 | + // depending on space/time constraints, etc. | ||
| 280 | + | ||
| 281 | + // In our contrived example, we are replacing the data for all | ||
| 282 | + // streams that have /DoXOR = true in the stream dictionary. If | ||
| 283 | + // this were a more realistic application, our criteria would be | ||
| 284 | + // more sensible. For example, an image downsampler might choose | ||
| 285 | + // to replace a stream that represented an image with a high pixel | ||
| 286 | + // density. | ||
| 287 | + auto dict = stream.getDict(); | ||
| 288 | + auto mark = dict.getKey("/DoXOR"); | ||
| 289 | + if (! (mark.isBool() && mark.getBoolValue())) | ||
| 290 | + { | ||
| 291 | + return false; | ||
| 292 | + } | ||
| 293 | + | ||
| 294 | + // We can't replace the stream data if we can't get the original | ||
| 295 | + // stream data for any reason. A more realistic application may | ||
| 296 | + // actually look at the data here as well, or it may be able to | ||
| 297 | + // make all its decisions from the stream dictionary. However, | ||
| 298 | + // it's a good idea to make sure we can retrieve the filtered data | ||
| 299 | + // if we are going to need it later. | ||
| 300 | + PointerHolder<Buffer> out; | ||
| 301 | + try | ||
| 302 | + { | ||
| 303 | + out = stream.getStreamData(); | ||
| 304 | + } | ||
| 305 | + catch (...) | ||
| 306 | + { | ||
| 307 | + return false; | ||
| 308 | + } | ||
| 309 | + | ||
| 310 | + if (dict_updates) | ||
| 311 | + { | ||
| 312 | + // It's not safe to make any modifications to any objects | ||
| 313 | + // during the writing process since the updated objects may | ||
| 314 | + // have already been written. In this mode, when dict_updates | ||
| 315 | + // is provided, we have not started writing. Store the | ||
| 316 | + // modifications we intend to make to the stream dictionary | ||
| 317 | + // here. We're just storing /OrigLength for purposes of | ||
| 318 | + // example. Again, a realistic application would make other | ||
| 319 | + // changes. For example, an image resampler might change the | ||
| 320 | + // dimensions or other properties of the image. | ||
| 321 | + dict_updates->replaceKey( | ||
| 322 | + "/OrigLength", QPDFObjectHandle::newInteger( | ||
| 323 | + QIntC::to_longlong(out->getSize()))); | ||
| 324 | + // We are also storing the "key" that we will access when | ||
| 325 | + // writing the data. | ||
| 326 | + this->keys[og] = QIntC::to_uchar( | ||
| 327 | + (og.getObj() * QIntC::to_int(out->getSize())) & 0xff); | ||
| 328 | + } | ||
| 329 | + | ||
| 330 | + if (pipeline) | ||
| 331 | + { | ||
| 332 | + unsigned char key = this->keys[og]; | ||
| 333 | + Pl_XOR p("xor", pipeline, key); | ||
| 334 | + p.write(out->getBuffer(), out->getSize()); | ||
| 335 | + p.finish(); | ||
| 336 | + } | ||
| 337 | + return true; | ||
| 338 | +} | ||
| 339 | + | ||
| 340 | +void | ||
| 341 | +StreamReplacer::registerStream( | ||
| 342 | + QPDFObjectHandle stream, | ||
| 343 | + PointerHolder<QPDFObjectHandle::StreamDataProvider> self) | ||
| 344 | +{ | ||
| 345 | + QPDFObjGen og(stream.getObjGen()); | ||
| 346 | + | ||
| 347 | + // We don't need to process a stream more than once. In this | ||
| 348 | + // example, we are just iterating through objects, but if we were | ||
| 349 | + // doing something like iterating through images on pages, we | ||
| 350 | + // might realistically encounter the same stream more than once. | ||
| 351 | + if (this->copied_streams.count(og) > 0) | ||
| 352 | + { | ||
| 353 | + return; | ||
| 354 | + } | ||
| 355 | + // Store something in copied_streams so that we don't | ||
| 356 | + // double-process even in the negative case. This gets replaced | ||
| 357 | + // later if needed. | ||
| 358 | + this->copied_streams[og] = QPDFObjectHandle::newNull(); | ||
| 359 | + | ||
| 360 | + // Call maybeReplace with dict_updates. In this mode, it | ||
| 361 | + // determines whether we should replace the stream data and, if | ||
| 362 | + // so, supplies dictionary updates we should make. | ||
| 363 | + bool should_replace = false; | ||
| 364 | + QPDFObjectHandle dict_updates = QPDFObjectHandle::newDictionary(); | ||
| 365 | + try | ||
| 366 | + { | ||
| 367 | + should_replace = maybeReplace(og, stream, nullptr, &dict_updates); | ||
| 368 | + } | ||
| 369 | + catch (std::exception& e) | ||
| 370 | + { | ||
| 371 | + stream.warnIfPossible( | ||
| 372 | + std::string("exception while attempting to replace: ") + | ||
| 373 | + e.what()); | ||
| 374 | + } | ||
| 375 | + | ||
| 376 | + if (should_replace) | ||
| 377 | + { | ||
| 378 | + // Copy the stream to another QPDF object so we can get to the | ||
| 379 | + // original data from the stream data provider. | ||
| 380 | + this->copied_streams[og] = this->other.copyForeignObject(stream); | ||
| 381 | + // Update the stream dictionary with any changes. | ||
| 382 | + auto dict = stream.getDict(); | ||
| 383 | + for (auto const& k: dict_updates.getKeys()) | ||
| 384 | + { | ||
| 385 | + dict.replaceKey(k, dict_updates.getKey(k)); | ||
| 386 | + } | ||
| 387 | + // Create the key stream that will be referenced from | ||
| 388 | + // /DecodeParms. We have to do this now since you can't modify | ||
| 389 | + // or create objects during write. | ||
| 390 | + char p[1] = { static_cast<char>(this->keys[og]) }; | ||
| 391 | + std::string p_str(p, 1); | ||
| 392 | + QPDFObjectHandle dp_stream = | ||
| 393 | + QPDFObjectHandle::newStream(this->pdf, p_str); | ||
| 394 | + // Create /DecodeParms as expected by our fictitious | ||
| 395 | + // /XORDecode filter. | ||
| 396 | + QPDFObjectHandle decode_parms = | ||
| 397 | + QPDFObjectHandle::newDictionary({{"/KeyStream", dp_stream}}); | ||
| 398 | + stream.replaceStreamData( | ||
| 399 | + self, | ||
| 400 | + QPDFObjectHandle::newName("/XORDecode"), | ||
| 401 | + decode_parms); | ||
| 402 | + // Further, if /ProtectXOR = true, we disable filtering on write | ||
| 403 | + // so that QPDFWriter will not decode the stream even though we | ||
| 404 | + // have registered a stream filter for /XORDecode. | ||
| 405 | + auto protect = dict.getKey("/ProtectXOR"); | ||
| 406 | + if (protect.isBool() && protect.getBoolValue()) | ||
| 407 | + { | ||
| 408 | + stream.setFilterOnWrite(false); | ||
| 409 | + } | ||
| 410 | + } | ||
| 411 | +} | ||
| 412 | + | ||
| 413 | +void | ||
| 414 | +StreamReplacer::provideStreamData(int objid, int generation, | ||
| 415 | + Pipeline* pipeline) | ||
| 416 | +{ | ||
| 417 | + QPDFObjGen og(objid, generation); | ||
| 418 | + QPDFObjectHandle orig = this->copied_streams[og]; | ||
| 419 | + // Call maybeReplace again, this time with the pipeline and no | ||
| 420 | + // dict_updates. In this mode, maybeReplace doesn't make any | ||
| 421 | + // changes. We have to hand it the original stream data, which we | ||
| 422 | + // get from copied_streams. | ||
| 423 | + if (! maybeReplace(og, orig, pipeline, nullptr)) | ||
| 424 | + { | ||
| 425 | + // Since this only gets called for streams we already | ||
| 426 | + // determined we are replacing, a false return would indicate | ||
| 427 | + // a logic error. | ||
| 428 | + throw std::logic_error( | ||
| 429 | + "should_replace return false in provideStreamData"); | ||
| 430 | + } | ||
| 431 | +} | ||
| 432 | + | ||
| 433 | +static void process(char const* infilename, char const* outfilename, | ||
| 434 | + bool decode_specialized) | ||
| 435 | +{ | ||
| 436 | + QPDF qpdf; | ||
| 437 | + qpdf.processFile(infilename); | ||
| 438 | + | ||
| 439 | + // Create a single StreamReplacer instance. The interface requires | ||
| 440 | + // a PointerHolder in various places, so allocate a StreamReplacer | ||
| 441 | + // and stash it in a PointerHolder. | ||
| 442 | + StreamReplacer* replacer = new StreamReplacer(&qpdf); | ||
| 443 | + PointerHolder<QPDFObjectHandle::StreamDataProvider> p(replacer); | ||
| 444 | + | ||
| 445 | + for (auto& o: qpdf.getAllObjects()) | ||
| 446 | + { | ||
| 447 | + if (o.isStream()) | ||
| 448 | + { | ||
| 449 | + // Call registerStream for every stream. Only ones that | ||
| 450 | + // registerStream decides to replace will actually be | ||
| 451 | + // replaced. | ||
| 452 | + replacer->registerStream(o, p); | ||
| 453 | + } | ||
| 454 | + } | ||
| 455 | + | ||
| 456 | + QPDFWriter w(qpdf, outfilename); | ||
| 457 | + if (decode_specialized) | ||
| 458 | + { | ||
| 459 | + w.setDecodeLevel(qpdf_dl_specialized); | ||
| 460 | + } | ||
| 461 | + // For the test suite, use static IDs. | ||
| 462 | + w.setStaticID(true); // for testing only | ||
| 463 | + w.write(); | ||
| 464 | + std::cout << whoami << ": new file written to " << outfilename | ||
| 465 | + << std::endl; | ||
| 466 | +} | ||
| 467 | + | ||
| 468 | +static void usage() | ||
| 469 | +{ | ||
| 470 | + std::cerr | ||
| 471 | + << "\n" | ||
| 472 | + << "Usage: " << whoami << " [ --decode-specialized ] infile outfile\n" | ||
| 473 | + << std::endl; | ||
| 474 | + exit(2); | ||
| 475 | +} | ||
| 476 | + | ||
| 477 | +int main(int argc, char* argv[]) | ||
| 478 | +{ | ||
| 479 | + whoami = QUtil::getWhoami(argv[0]); | ||
| 480 | + | ||
| 481 | + // For libtool's sake.... | ||
| 482 | + if (strncmp(whoami, "lt-", 3) == 0) | ||
| 483 | + { | ||
| 484 | + whoami += 3; | ||
| 485 | + } | ||
| 486 | + | ||
| 487 | + char const* infilename = 0; | ||
| 488 | + char const* outfilename = 0; | ||
| 489 | + bool decode_specialized = false; | ||
| 490 | + for (int i = 1; i < argc; ++i) | ||
| 491 | + { | ||
| 492 | + if (strcmp(argv[i], "--decode-specialized") == 0) | ||
| 493 | + { | ||
| 494 | + decode_specialized = true; | ||
| 495 | + } | ||
| 496 | + else if (! infilename) | ||
| 497 | + { | ||
| 498 | + infilename = argv[i]; | ||
| 499 | + } | ||
| 500 | + else if (! outfilename) | ||
| 501 | + { | ||
| 502 | + outfilename = argv[i]; | ||
| 503 | + } | ||
| 504 | + else | ||
| 505 | + { | ||
| 506 | + usage(); | ||
| 507 | + } | ||
| 508 | + } | ||
| 509 | + if (! (infilename && outfilename)) | ||
| 510 | + { | ||
| 511 | + usage(); | ||
| 512 | + } | ||
| 513 | + | ||
| 514 | + try | ||
| 515 | + { | ||
| 516 | + // Register our fictitious filter. This enables QPDFWriter to | ||
| 517 | + // decode our streams. This is not a real filter, so no real | ||
| 518 | + // PDF reading application would be able to interpret it. This | ||
| 519 | + // is just for illustrative purposes. | ||
| 520 | + QPDF::registerStreamFilter( | ||
| 521 | + "/XORDecode", []{ return std::make_shared<SF_XORDecode>(); }); | ||
| 522 | + // Do the actual processing. | ||
| 523 | + process(infilename, outfilename, decode_specialized); | ||
| 524 | + } | ||
| 525 | + catch (std::exception &e) | ||
| 526 | + { | ||
| 527 | + std::cerr << whoami << ": exception: " << e.what() << std::endl; | ||
| 528 | + exit(2); | ||
| 529 | + } | ||
| 530 | + | ||
| 531 | + return 0; | ||
| 532 | +} |
examples/qtest/custom-filter.test
0 → 100644
| 1 | +#!/usr/bin/env perl | ||
| 2 | +require 5.008; | ||
| 3 | +use warnings; | ||
| 4 | +use strict; | ||
| 5 | + | ||
| 6 | +chdir("custom-filter") or die "chdir testdir failed: $!\n"; | ||
| 7 | + | ||
| 8 | +require TestDriver; | ||
| 9 | + | ||
| 10 | +cleanup(); | ||
| 11 | + | ||
| 12 | +my $td = new TestDriver('custom-filter'); | ||
| 13 | + | ||
| 14 | +# The file input.pdf contains two streams, whose contents appear | ||
| 15 | +# uncompressed with explanatory text. They are marked with the keys | ||
| 16 | +# that pdf-custom-filter uses to decide 1) whether to re-encode using the | ||
| 17 | +# fictitious /XORDecode filter, and 2) whether to protect the stream | ||
| 18 | +# to prevent decoding using the custom filter even when decoding | ||
| 19 | +# specialized filters is requested. | ||
| 20 | + | ||
| 21 | +$td->runtest("custom filter, decode generalized", | ||
| 22 | + {$td->COMMAND => "pdf-custom-filter input.pdf a.pdf"}, | ||
| 23 | + {$td->STRING => "pdf-custom-filter: new file written to a.pdf\n", | ||
| 24 | + $td->EXIT_STATUS => 0}, | ||
| 25 | + $td->NORMALIZE_NEWLINES); | ||
| 26 | +$td->runtest("check output", | ||
| 27 | + {$td->FILE => "a.pdf"}, | ||
| 28 | + {$td->FILE => "generalized.pdf"}); | ||
| 29 | + | ||
| 30 | +$td->runtest("custom filter, decode specialized", | ||
| 31 | + {$td->COMMAND => | ||
| 32 | + "pdf-custom-filter --decode-specialized input.pdf a.pdf"}, | ||
| 33 | + {$td->STRING => "pdf-custom-filter: new file written to a.pdf\n", | ||
| 34 | + $td->EXIT_STATUS => 0}, | ||
| 35 | + $td->NORMALIZE_NEWLINES); | ||
| 36 | +$td->runtest("check output", | ||
| 37 | + {$td->FILE => "a.pdf"}, | ||
| 38 | + {$td->FILE => "specialized.pdf"}); | ||
| 39 | + | ||
| 40 | +cleanup(); | ||
| 41 | + | ||
| 42 | +$td->report(4); | ||
| 43 | + | ||
| 44 | +sub cleanup | ||
| 45 | +{ | ||
| 46 | + unlink "a.pdf"; | ||
| 47 | +} |
examples/qtest/custom-filter/generalized.pdf
0 → 100644
No preview for this file type
examples/qtest/custom-filter/input.pdf
0 → 100644
| 1 | +%PDF-1.3 | ||
| 2 | +%¿÷¢þ | ||
| 3 | +%QDF-1.0 | ||
| 4 | + | ||
| 5 | +%% Original object ID: 1 0 | ||
| 6 | +1 0 obj | ||
| 7 | +<< | ||
| 8 | + /Pages 2 0 R | ||
| 9 | + /Type /Catalog | ||
| 10 | +>> | ||
| 11 | +endobj | ||
| 12 | + | ||
| 13 | +%% Original object ID: 2 0 | ||
| 14 | +2 0 obj | ||
| 15 | +<< | ||
| 16 | + /Count 1 | ||
| 17 | + /Kids [ | ||
| 18 | + 3 0 R | ||
| 19 | + ] | ||
| 20 | + /Type /Pages | ||
| 21 | +>> | ||
| 22 | +endobj | ||
| 23 | + | ||
| 24 | +%% Page 1 | ||
| 25 | +%% Original object ID: 3 0 | ||
| 26 | +3 0 obj | ||
| 27 | +<< | ||
| 28 | + /Contents 4 0 R | ||
| 29 | + /MediaBox [ | ||
| 30 | + 0 | ||
| 31 | + 0 | ||
| 32 | + 612 | ||
| 33 | + 792 | ||
| 34 | + ] | ||
| 35 | + /Parent 2 0 R | ||
| 36 | + /Resources << | ||
| 37 | + /Font << | ||
| 38 | + /F1 6 0 R | ||
| 39 | + >> | ||
| 40 | + /ProcSet 7 0 R | ||
| 41 | + >> | ||
| 42 | + /Type /Page | ||
| 43 | +>> | ||
| 44 | +endobj | ||
| 45 | + | ||
| 46 | +%% Contents for page 1 | ||
| 47 | +%% Original object ID: 4 0 | ||
| 48 | +4 0 obj | ||
| 49 | +<< | ||
| 50 | + /Length 5 0 R | ||
| 51 | +>> | ||
| 52 | +stream | ||
| 53 | +BT | ||
| 54 | + /F1 24 Tf | ||
| 55 | + 72 720 Td | ||
| 56 | + (Potato) Tj | ||
| 57 | +ET | ||
| 58 | +endstream | ||
| 59 | +endobj | ||
| 60 | + | ||
| 61 | +5 0 obj | ||
| 62 | +44 | ||
| 63 | +endobj | ||
| 64 | + | ||
| 65 | +%% Original object ID: 6 0 | ||
| 66 | +6 0 obj | ||
| 67 | +<< | ||
| 68 | + /BaseFont /Helvetica | ||
| 69 | + /Encoding /WinAnsiEncoding | ||
| 70 | + /Name /F1 | ||
| 71 | + /Subtype /Type1 | ||
| 72 | + /Type /Font | ||
| 73 | +>> | ||
| 74 | +endobj | ||
| 75 | + | ||
| 76 | +%% Original object ID: 5 0 | ||
| 77 | +7 0 obj | ||
| 78 | +[ | ||
| 79 | |||
| 80 | + /Text | ||
| 81 | +] | ||
| 82 | +endobj | ||
| 83 | + | ||
| 84 | +8 0 obj | ||
| 85 | +<< | ||
| 86 | + /Length 9 0 R | ||
| 87 | + /DoXOR true | ||
| 88 | +>> | ||
| 89 | +stream | ||
| 90 | + | ||
| 91 | +This stream has /DoXOR true. When processed with pdf-custom-filter | ||
| 92 | +without the --decode-specialized option, the stream will appear in the | ||
| 93 | +output encoded with the fictitious /XORDecode filter, and its | ||
| 94 | +/DecodeParms will contain a reference to the key stream. When | ||
| 95 | +processed with pdf-custom-filter with the --decode-specialized option, | ||
| 96 | +it will appear in the output as a regular stream with /FlateDecode, | ||
| 97 | +but the /OrigLength key will still have been added. | ||
| 98 | + | ||
| 99 | +endstream | ||
| 100 | +endobj | ||
| 101 | + | ||
| 102 | +9 0 obj | ||
| 103 | +455 | ||
| 104 | +endobj | ||
| 105 | + | ||
| 106 | +10 0 obj | ||
| 107 | +<< | ||
| 108 | + /Length 11 0 R | ||
| 109 | + /DoXOR true | ||
| 110 | + /ProtectXOR true | ||
| 111 | +>> | ||
| 112 | +stream | ||
| 113 | + | ||
| 114 | +This stream has /DoXOR true and /ProtectXOR true. When processed with | ||
| 115 | +pdf-custom-filter with or without the --decode-specialized option, the | ||
| 116 | +stream will appear in the output encoded with the fictitious | ||
| 117 | +/XORDecode filter, and its /DecodeParms will contain a reference to | ||
| 118 | +the key stream. | ||
| 119 | + | ||
| 120 | +endstream | ||
| 121 | +endobj | ||
| 122 | + | ||
| 123 | +11 0 obj | ||
| 124 | +288 | ||
| 125 | +endobj | ||
| 126 | + | ||
| 127 | +xref | ||
| 128 | +0 12 | ||
| 129 | +0000000000 65535 f | ||
| 130 | +0000000052 00000 n | ||
| 131 | +0000000133 00000 n | ||
| 132 | +0000000242 00000 n | ||
| 133 | +0000000484 00000 n | ||
| 134 | +0000000583 00000 n | ||
| 135 | +0000000629 00000 n | ||
| 136 | +0000000774 00000 n | ||
| 137 | +0000000809 00000 n | ||
| 138 | +0000001333 00000 n | ||
| 139 | +0000001353 00000 n | ||
| 140 | +0000001731 00000 n | ||
| 141 | +trailer << | ||
| 142 | + /Root 1 0 R | ||
| 143 | + /Size 12 | ||
| 144 | + /Example [ 8 0 R 10 0 R ] | ||
| 145 | + /ID [<01f4bb169ae6e6b5f27505733e9abf42><01f4bb169ae6e6b5f27505733e9abf42>] | ||
| 146 | +>> | ||
| 147 | +startxref | ||
| 148 | +1752 | ||
| 149 | +%%EOF |
examples/qtest/custom-filter/specialized.pdf
0 → 100644
No preview for this file type
include/qpdf/QPDFObjectHandle.hh
| @@ -92,6 +92,15 @@ class QPDFObjectHandle | @@ -92,6 +92,15 @@ class QPDFObjectHandle | ||
| 92 | // writing linearized files, if the work done by your stream | 92 | // writing linearized files, if the work done by your stream |
| 93 | // data provider is slow or computationally intensive, you | 93 | // data provider is slow or computationally intensive, you |
| 94 | // might want to implement your own cache. | 94 | // might want to implement your own cache. |
| 95 | + // | ||
| 96 | + // * Once you have called replaceStreamData, the original | ||
| 97 | + // stream data is no longer directly accessible from the | ||
| 98 | + // stream, but this is easy to work around by copying the | ||
| 99 | + // stream to a separate QPDF object. The qpdf library | ||
| 100 | + // implements this very efficiently without actually making | ||
| 101 | + // a copy of the stream data. You can find examples of this | ||
| 102 | + // pattern in some of the examples, including | ||
| 103 | + // pdf-custom-filter.cc and pdf-invert-images.cc. | ||
| 95 | 104 | ||
| 96 | // Prior to qpdf 10.0.0, it was not possible to handle errors | 105 | // Prior to qpdf 10.0.0, it was not possible to handle errors |
| 97 | // the way pipeStreamData does or to pass back success. | 106 | // the way pipeStreamData does or to pass back success. |