Commit 99101044429c3c91bd11bdd1b26e5b6c2ceb140b
1 parent
b8723e97
Implement TokenFilter and refactor Pl_QPDFTokenizer
Implement a TokenFilter class and refactor Pl_QPDFTokenizer to use a TokenFilter class called ContentNormalizer. Pl_QPDFTokenizer is now a general filter that passes data through a TokenFilter.
Showing
16 changed files
with
631 additions
and
115 deletions
ChangeLog
| ... | ... | @@ -107,6 +107,49 @@ |
| 107 | 107 | applications that use page-level APIs in QPDFObjectHandle to be |
| 108 | 108 | more tolerant of certain types of damaged files. |
| 109 | 109 | |
| 110 | + * Add QPDFObjectHandle::TokenFilter class and methods to use it to | |
| 111 | + perform lexical filtering on content streams. You can call | |
| 112 | + QPDFObjectHandle::addTokenFilter on a stream object, or you can call | |
| 113 | + the higher level QPDFObjectHandle::addContentTokenFilter on a page | |
| 114 | + object to cause the stream's contents to be passed through a token | |
| 115 | + filter while being retrieved by QPDFWriter or any other consumer. | |
| 116 | + For details on using TokenFilter, please see comments in | |
| 117 | + QPDFObjectHandle.hh. | |
| 118 | + | |
| 119 | + * Enhance the (type, string) QPDFTokenizer::Token constructor to | |
| 120 | + initialize a raw value in addition to a value. Tokens have a | |
| 121 | + value, which is a canonical representation, and a raw value. For | |
| 122 | + all tokens except strings and names, the raw value and the value | |
| 123 | + are the same. For strings, the value excludes the outer delimiters | |
| 124 | + and has non-printing characters normalized. For names, the value | |
| 125 | + resolves non-printing characters. In order to better facilitate | |
| 126 | + token filters that mostly preserve contents and to enable | |
| 127 | + developers to be mostly unconcerned about the nuances of token | |
| 128 | + values and raw values, creating string and name tokens now | |
| 129 | + properly handles this subtlety of values and raw values. When | |
| 130 | + constructing string tokens, take care to avoid passing in the | |
| 131 | + outer delimiters. This has always been the case, but it is now | |
| 132 | + clarified in comments in QPDFObjectHandle.hh::TokenFilter. This | |
| 133 | + has no impact on any existing code unless there's some code | |
| 134 | + somewhere that was relying on Token::getRawValue() returning an | |
| 135 | + empty string for a manually constructed token. The token class's | |
| 136 | + operator== method still only looks at type and value, not raw | |
| 137 | + value. For example, string tokens for <41> and (A) would still be | |
| 138 | + equal because both are representations of the string "A". | |
| 139 | + | |
| 140 | + * Add QPDFObjectHandle::isDataModified method. This method just | |
| 141 | + returns true if addTokenFilter has been called on the stream. It | |
| 142 | + enables a caller to determine whether it is safe to optimize away | |
| 143 | + piping of stream data in cases where the input and output are | |
| 144 | + expected to be the same. QPDFWriter uses this internally to skip | |
| 145 | + the optimization of not re-compressing already compressed streams | |
| 146 | + if addTokenFilter has been called. Most developers will not have | |
| 147 | + to worry about this as it is used internally in the library in the | |
| 148 | + places that need it. If you are manually retrieving stream data | |
| 149 | + with QPDFObjectHandle::getStreamData or | |
| 150 | + QPDFObjectHandle::pipeStreamData, you don't need to worry about | |
| 151 | + this at all. | |
| 152 | + | |
| 110 | 153 | 2018-02-04 Jay Berkenbilt <ejb@ql.org> |
| 111 | 154 | |
| 112 | 155 | * Add QPDFWriter::setLinearizationPass1Filename method and | ... | ... |
include/qpdf/QPDFObjectHandle.hh
| ... | ... | @@ -35,6 +35,7 @@ |
| 35 | 35 | #include <qpdf/PointerHolder.hh> |
| 36 | 36 | #include <qpdf/Buffer.hh> |
| 37 | 37 | #include <qpdf/InputSource.hh> |
| 38 | +#include <qpdf/QPDFTokenizer.hh> | |
| 38 | 39 | |
| 39 | 40 | #include <qpdf/QPDFObject.hh> |
| 40 | 41 | |
| ... | ... | @@ -76,6 +77,66 @@ class QPDFObjectHandle |
| 76 | 77 | Pipeline* pipeline) = 0; |
| 77 | 78 | }; |
| 78 | 79 | |
| 80 | + // The TokenFilter class provides a way to filter content streams | |
| 81 | + // in a lexically aware fashion. TokenFilters can be attached to | |
| 82 | + // streams using the addTokenFilter or addContentTokenFilter | |
| 83 | + // methods. The handleToken method is called for each token, | |
| 84 | + // including the eof token, and then handleEOF is called at the | |
| 85 | + // very end. Handlers may call write (or writeToken) to pass data | |
| 86 | + // downstream. The finish() method must be called exactly one time | |
| 87 | + // to ensure that any written data is flushed out. The default | |
| 88 | + // handleEOF calls finish. If you override handleEOF, you must | |
| 89 | + // ensure that finish() is called either there or in response to | |
| 90 | + // whatever event causes you to terminate creation of output. | |
| 91 | + // Failure to call finish() may result in some of the data you | |
| 92 | + // have written being lost. You should not rely on a destructor | |
| 93 | + // for calling finish() since the destructor call may occur later | |
| 94 | + // than you expect. Please see examples/token-filters.cc for | |
| 95 | + // examples of using TokenFilters. | |
| 96 | + // | |
| 97 | + // Please note that when you call token.getValue() on a token of | |
| 98 | + // type tt_string, you get the string value without any | |
| 99 | + // delimiters. token.getRawValue() will return something suitable | |
| 100 | + // for being written to output, or calling writeToken with a | |
| 101 | + // string token will also work. The correct way to construct a | |
| 102 | + // string token that would write the literal value (str) is | |
| 103 | + // QPDFTokenizer::Token(QPDFTokenizer::tt_string, "str"). | |
| 104 | + class TokenFilter | |
| 105 | + { | |
| 106 | + public: | |
| 107 | + QPDF_DLL | |
| 108 | + TokenFilter() : pipeline(0) | |
| 109 | + { | |
| 110 | + } | |
| 111 | + QPDF_DLL | |
| 112 | + virtual ~TokenFilter() | |
| 113 | + { | |
| 114 | + } | |
| 115 | + virtual void handleToken(QPDFTokenizer::Token const&) = 0; | |
| 116 | + virtual void handleEOF() | |
| 117 | + { | |
| 118 | + // If you override handleEOF, you must be sure to call | |
| 119 | + // finish(). | |
| 120 | + finish(); | |
| 121 | + } | |
| 122 | + | |
| 123 | + // This is called internally by the qpdf library. | |
| 124 | + void setPipeline(Pipeline*); | |
| 125 | + | |
| 126 | + protected: | |
| 127 | + QPDF_DLL | |
| 128 | + void write(char const* data, size_t len); | |
| 129 | + QPDF_DLL | |
| 130 | + void write(std::string const& str); | |
| 131 | + QPDF_DLL | |
| 132 | + void writeToken(QPDFTokenizer::Token const&); | |
| 133 | + QPDF_DLL | |
| 134 | + void finish(); | |
| 135 | + | |
| 136 | + private: | |
| 137 | + Pipeline* pipeline; // 0 until setPipeline() is called | |
| 138 | + }; | |
| 139 | + | |
| 79 | 140 | // This class is used by parse to decrypt strings when reading an |
| 80 | 141 | // object that contains encrypted strings. |
| 81 | 142 | class StringDecrypter |
| ... | ... | @@ -223,6 +284,23 @@ class QPDFObjectHandle |
| 223 | 284 | static void parseContentStream(QPDFObjectHandle stream_or_array, |
| 224 | 285 | ParserCallbacks* callbacks); |
| 225 | 286 | |
| 287 | + // Attach a token filter to a page's contents. If the page's | |
| 288 | + // contents is an array of streams, it is automatically coalesced. | |
| 289 | + // The token filter is applied to the page's contents as a single | |
| 290 | + // stream. | |
| 291 | + QPDF_DLL | |
| 292 | + void addContentTokenFilter(PointerHolder<TokenFilter> token_filter); | |
| 293 | + | |
| 294 | + // As of qpdf 8, it is possible to add custom token filters to a | |
| 295 | + // stream. The tokenized stream data is passed through the token | |
| 296 | + // filter after all original filters but before content stream | |
| 297 | + // normalization if requested. This is a low-level interface to | |
| 298 | + // add it to a stream. You will usually want to call | |
| 299 | + // addContentTokenFilter instead, which can be applied to a page | |
| 300 | + // object, and which will automatically handle the case of pages | |
| 301 | + // whose contents are split across multiple streams. | |
| 302 | + void addTokenFilter(PointerHolder<TokenFilter> token_filter); | |
| 303 | + | |
| 226 | 304 | // Type-specific factories |
| 227 | 305 | QPDF_DLL |
| 228 | 306 | static QPDFObjectHandle newNull(); |
| ... | ... | @@ -414,6 +492,13 @@ class QPDFObjectHandle |
| 414 | 492 | QPDF_DLL |
| 415 | 493 | QPDFObjectHandle getDict(); |
| 416 | 494 | |
| 495 | + // If addTokenFilter has been called for this stream, then the | |
| 496 | + // original data should be considered to be modified. This means we | |
| 497 | + // should avoid optimizations such as not filtering a stream that | |
| 498 | + // is already compressed. | |
| 499 | + QPDF_DLL | |
| 500 | + bool isDataModified(); | |
| 501 | + | |
| 417 | 502 | // Returns filtered (uncompressed) stream data. Throws an |
| 418 | 503 | // exception if the stream is filtered and we can't decode it. |
| 419 | 504 | QPDF_DLL |
| ... | ... | @@ -608,7 +693,7 @@ class QPDFObjectHandle |
| 608 | 693 | // stream or an array of streams. If this page's content is an |
| 609 | 694 | // array, concatenate the streams into a single stream. This can |
| 610 | 695 | // be useful when working with files that split content streams in |
| 611 | - // arbitary spots, such as in the middle of a token, as that can | |
| 696 | + // arbitrary spots, such as in the middle of a token, as that can | |
| 612 | 697 | // confuse some software. You could also call this after calling |
| 613 | 698 | // addPageContents. |
| 614 | 699 | QPDF_DLL | ... | ... |
include/qpdf/QPDFTokenizer.hh
| ... | ... | @@ -62,13 +62,8 @@ class QPDFTokenizer |
| 62 | 62 | { |
| 63 | 63 | public: |
| 64 | 64 | Token() : type(tt_bad) {} |
| 65 | - | |
| 66 | - Token(token_type_e type, std::string const& value) : | |
| 67 | - type(type), | |
| 68 | - value(value) | |
| 69 | - { | |
| 70 | - } | |
| 71 | - | |
| 65 | + QPDF_DLL | |
| 66 | + Token(token_type_e type, std::string const& value); | |
| 72 | 67 | Token(token_type_e type, std::string const& value, |
| 73 | 68 | std::string raw_value, std::string error_message) : |
| 74 | 69 | type(type), |
| ... | ... | @@ -93,7 +88,7 @@ class QPDFTokenizer |
| 93 | 88 | { |
| 94 | 89 | return this->error_message; |
| 95 | 90 | } |
| 96 | - bool operator==(Token const& rhs) | |
| 91 | + bool operator==(Token const& rhs) const | |
| 97 | 92 | { |
| 98 | 93 | // Ignore fields other than type and value |
| 99 | 94 | return ((this->type != tt_bad) && | ... | ... |
libqpdf/ContentNormalizer.cc
0 → 100644
| 1 | +#include <qpdf/ContentNormalizer.hh> | |
| 2 | +#include <qpdf/QUtil.hh> | |
| 3 | + | |
| 4 | +ContentNormalizer::ContentNormalizer() | |
| 5 | +{ | |
| 6 | +} | |
| 7 | + | |
| 8 | +ContentNormalizer::~ContentNormalizer() | |
| 9 | +{ | |
| 10 | +} | |
| 11 | + | |
| 12 | +void | |
| 13 | +ContentNormalizer::handleToken(QPDFTokenizer::Token const& token) | |
| 14 | +{ | |
| 15 | + std::string value = token.getRawValue(); | |
| 16 | + QPDFTokenizer::token_type_e token_type = token.getType(); | |
| 17 | + | |
| 18 | + switch (token_type) | |
| 19 | + { | |
| 20 | + case QPDFTokenizer::tt_space: | |
| 21 | + { | |
| 22 | + size_t len = value.length(); | |
| 23 | + for (size_t i = 0; i < len; ++i) | |
| 24 | + { | |
| 25 | + char ch = value.at(i); | |
| 26 | + if (ch == '\r') | |
| 27 | + { | |
| 28 | + if ((i + 1 < len) && (value.at(i + 1) == '\n')) | |
| 29 | + { | |
| 30 | + // ignore | |
| 31 | + } | |
| 32 | + else | |
| 33 | + { | |
| 34 | + write("\n"); | |
| 35 | + } | |
| 36 | + } | |
| 37 | + else | |
| 38 | + { | |
| 39 | + write(&ch, 1); | |
| 40 | + } | |
| 41 | + } | |
| 42 | + } | |
| 43 | + break; | |
| 44 | + | |
| 45 | + case QPDFTokenizer::tt_string: | |
| 46 | + // Replacing string and name tokens in this way normalizes | |
| 47 | + // their representation as this will automatically handle | |
| 48 | + // quoting of unprintable characters, etc. | |
| 49 | + writeToken(QPDFTokenizer::Token( | |
| 50 | + QPDFTokenizer::tt_string, token.getValue())); | |
| 51 | + break; | |
| 52 | + | |
| 53 | + case QPDFTokenizer::tt_name: | |
| 54 | + writeToken(QPDFTokenizer::Token( | |
| 55 | + QPDFTokenizer::tt_name, token.getValue())); | |
| 56 | + break; | |
| 57 | + | |
| 58 | + default: | |
| 59 | + writeToken(token); | |
| 60 | + break; | |
| 61 | + } | |
| 62 | + | |
| 63 | + value = token.getRawValue(); | |
| 64 | + if (((token_type == QPDFTokenizer::tt_string) || | |
| 65 | + (token_type == QPDFTokenizer::tt_name)) && | |
| 66 | + ((value.find('\r') != std::string::npos) || | |
| 67 | + (value.find('\n') != std::string::npos))) | |
| 68 | + { | |
| 69 | + write("\n"); | |
| 70 | + } | |
| 71 | +} | |
| 72 | + | |
| 73 | +void | |
| 74 | +ContentNormalizer::handleEOF() | |
| 75 | +{ | |
| 76 | + finish(); | |
| 77 | +} | ... | ... |
libqpdf/Pl_QPDFTokenizer.cc
| 1 | 1 | #include <qpdf/Pl_QPDFTokenizer.hh> |
| 2 | -#include <qpdf/QPDF_String.hh> | |
| 3 | -#include <qpdf/QPDF_Name.hh> | |
| 4 | 2 | #include <qpdf/QTC.hh> |
| 5 | -#include <qpdf/QUtil.hh> | |
| 6 | 3 | #include <stdexcept> |
| 7 | 4 | #include <string.h> |
| 8 | 5 | |
| 9 | -Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) : | |
| 10 | - Pipeline(identifier, next), | |
| 11 | - just_wrote_nl(false), | |
| 6 | +Pl_QPDFTokenizer::Members::Members() : | |
| 7 | + filter(0), | |
| 12 | 8 | last_char_was_cr(false), |
| 13 | 9 | unread_char(false), |
| 14 | 10 | char_to_unread('\0') |
| 15 | 11 | { |
| 16 | - tokenizer.allowEOF(); | |
| 17 | - tokenizer.includeIgnorable(); | |
| 18 | 12 | } |
| 19 | 13 | |
| 20 | -Pl_QPDFTokenizer::~Pl_QPDFTokenizer() | |
| 14 | +Pl_QPDFTokenizer::Members::~Members() | |
| 21 | 15 | { |
| 22 | 16 | } |
| 23 | 17 | |
| 24 | -void | |
| 25 | -Pl_QPDFTokenizer::writeNext(char const* buf, size_t len) | |
| 18 | +Pl_QPDFTokenizer::Pl_QPDFTokenizer( | |
| 19 | + char const* identifier, | |
| 20 | + QPDFObjectHandle::TokenFilter* filter) | |
| 21 | + : | |
| 22 | + Pipeline(identifier, 0), | |
| 23 | + m(new Members) | |
| 26 | 24 | { |
| 27 | - if (len) | |
| 28 | - { | |
| 29 | - getNext()->write(QUtil::unsigned_char_pointer(buf), len); | |
| 30 | - this->just_wrote_nl = (buf[len-1] == '\n'); | |
| 31 | - } | |
| 25 | + m->filter = filter; | |
| 26 | + m->tokenizer.allowEOF(); | |
| 27 | + m->tokenizer.includeIgnorable(); | |
| 32 | 28 | } |
| 33 | 29 | |
| 34 | -void | |
| 35 | -Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token) | |
| 30 | +Pl_QPDFTokenizer::~Pl_QPDFTokenizer() | |
| 36 | 31 | { |
| 37 | - std::string value = token.getRawValue(); | |
| 38 | - | |
| 39 | - switch (token.getType()) | |
| 40 | - { | |
| 41 | - case QPDFTokenizer::tt_space: | |
| 42 | - { | |
| 43 | - size_t len = value.length(); | |
| 44 | - for (size_t i = 0; i < len; ++i) | |
| 45 | - { | |
| 46 | - char ch = value.at(i); | |
| 47 | - if (ch == '\r') | |
| 48 | - { | |
| 49 | - if ((i + 1 < len) && (value.at(i + 1) == '\n')) | |
| 50 | - { | |
| 51 | - // ignore | |
| 52 | - } | |
| 53 | - else | |
| 54 | - { | |
| 55 | - writeNext("\n", 1); | |
| 56 | - } | |
| 57 | - } | |
| 58 | - else | |
| 59 | - { | |
| 60 | - writeNext(&ch, 1); | |
| 61 | - } | |
| 62 | - } | |
| 63 | - } | |
| 64 | - value.clear(); | |
| 65 | - break; | |
| 66 | - | |
| 67 | - case QPDFTokenizer::tt_string: | |
| 68 | - value = QPDF_String(token.getValue()).unparse(); | |
| 69 | - | |
| 70 | - break; | |
| 71 | - | |
| 72 | - case QPDFTokenizer::tt_name: | |
| 73 | - value = QPDF_Name(token.getValue()).unparse(); | |
| 74 | - break; | |
| 75 | - | |
| 76 | - default: | |
| 77 | - break; | |
| 78 | - } | |
| 79 | - writeNext(value.c_str(), value.length()); | |
| 80 | 32 | } |
| 81 | 33 | |
| 82 | 34 | void |
| 83 | 35 | Pl_QPDFTokenizer::processChar(char ch) |
| 84 | 36 | { |
| 85 | - tokenizer.presentCharacter(ch); | |
| 37 | + this->m->tokenizer.presentCharacter(ch); | |
| 86 | 38 | QPDFTokenizer::Token token; |
| 87 | - if (tokenizer.getToken(token, this->unread_char, this->char_to_unread)) | |
| 39 | + if (this->m->tokenizer.getToken( | |
| 40 | + token, this->m->unread_char, this->m->char_to_unread)) | |
| 88 | 41 | { |
| 89 | - writeToken(token); | |
| 90 | - std::string value = token.getRawValue(); | |
| 91 | - QPDFTokenizer::token_type_e token_type = token.getType(); | |
| 92 | - if (((token_type == QPDFTokenizer::tt_string) || | |
| 93 | - (token_type == QPDFTokenizer::tt_name)) && | |
| 94 | - ((value.find('\r') != std::string::npos) || | |
| 95 | - (value.find('\n') != std::string::npos))) | |
| 42 | + this->m->filter->handleToken(token); | |
| 43 | + if ((token.getType() == QPDFTokenizer::tt_word) && | |
| 44 | + (token.getValue() == "ID")) | |
| 96 | 45 | { |
| 97 | - writeNext("\n", 1); | |
| 98 | - } | |
| 99 | - if ((token.getType() == QPDFTokenizer::tt_word) && | |
| 100 | - (token.getValue() == "ID")) | |
| 101 | - { | |
| 102 | 46 | QTC::TC("qpdf", "Pl_QPDFTokenizer found ID"); |
| 103 | - tokenizer.expectInlineImage(); | |
| 104 | - } | |
| 47 | + this->m->tokenizer.expectInlineImage(); | |
| 48 | + } | |
| 105 | 49 | } |
| 106 | 50 | } |
| 107 | 51 | |
| ... | ... | @@ -109,10 +53,10 @@ Pl_QPDFTokenizer::processChar(char ch) |
| 109 | 53 | void |
| 110 | 54 | Pl_QPDFTokenizer::checkUnread() |
| 111 | 55 | { |
| 112 | - if (this->unread_char) | |
| 56 | + if (this->m->unread_char) | |
| 113 | 57 | { |
| 114 | - processChar(this->char_to_unread); | |
| 115 | - if (this->unread_char) | |
| 58 | + processChar(this->m->char_to_unread); | |
| 59 | + if (this->m->unread_char) | |
| 116 | 60 | { |
| 117 | 61 | throw std::logic_error( |
| 118 | 62 | "INTERNAL ERROR: unread_char still true after processing " |
| ... | ... | @@ -135,20 +79,13 @@ Pl_QPDFTokenizer::write(unsigned char* buf, size_t len) |
| 135 | 79 | void |
| 136 | 80 | Pl_QPDFTokenizer::finish() |
| 137 | 81 | { |
| 138 | - this->tokenizer.presentEOF(); | |
| 82 | + this->m->tokenizer.presentEOF(); | |
| 139 | 83 | QPDFTokenizer::Token token; |
| 140 | - if (tokenizer.getToken(token, this->unread_char, this->char_to_unread)) | |
| 84 | + if (this->m->tokenizer.getToken( | |
| 85 | + token, this->m->unread_char, this->m->char_to_unread)) | |
| 141 | 86 | { |
| 142 | - writeToken(token); | |
| 143 | - if (unread_char) | |
| 144 | - { | |
| 145 | - if (this->char_to_unread == '\r') | |
| 146 | - { | |
| 147 | - this->char_to_unread = '\n'; | |
| 148 | - } | |
| 149 | - writeNext(&this->char_to_unread, 1); | |
| 150 | - } | |
| 87 | + this->m->filter->handleToken(token); | |
| 151 | 88 | } |
| 152 | 89 | |
| 153 | - getNext()->finish(); | |
| 90 | + this->m->filter->handleEOF(); | |
| 154 | 91 | } | ... | ... |
libqpdf/QPDFObjectHandle.cc
| ... | ... | @@ -63,6 +63,50 @@ CoalesceProvider::provideStreamData(int, int, Pipeline* p) |
| 63 | 63 | } |
| 64 | 64 | |
| 65 | 65 | void |
| 66 | +QPDFObjectHandle::TokenFilter::setPipeline(Pipeline* p) | |
| 67 | +{ | |
| 68 | + this->pipeline = p; | |
| 69 | +} | |
| 70 | + | |
| 71 | +void | |
| 72 | +QPDFObjectHandle::TokenFilter::write(char const* data, size_t len) | |
| 73 | +{ | |
| 74 | + if (! this->pipeline) | |
| 75 | + { | |
| 76 | + throw std::logic_error( | |
| 77 | + "TokenFilter::write called before setPipeline"); | |
| 78 | + } | |
| 79 | + if (len) | |
| 80 | + { | |
| 81 | + this->pipeline->write(QUtil::unsigned_char_pointer(data), len); | |
| 82 | + } | |
| 83 | +} | |
| 84 | + | |
| 85 | +void | |
| 86 | +QPDFObjectHandle::TokenFilter::write(std::string const& str) | |
| 87 | +{ | |
| 88 | + write(str.c_str(), str.length()); | |
| 89 | +} | |
| 90 | + | |
| 91 | +void | |
| 92 | +QPDFObjectHandle::TokenFilter::writeToken(QPDFTokenizer::Token const& token) | |
| 93 | +{ | |
| 94 | + std::string value = token.getRawValue(); | |
| 95 | + write(value.c_str(), value.length()); | |
| 96 | +} | |
| 97 | + | |
| 98 | +void | |
| 99 | +QPDFObjectHandle::TokenFilter::finish() | |
| 100 | +{ | |
| 101 | + if (! this->pipeline) | |
| 102 | + { | |
| 103 | + throw std::logic_error( | |
| 104 | + "TokenFilter::finish called before setPipeline"); | |
| 105 | + } | |
| 106 | + this->pipeline->finish(); | |
| 107 | +} | |
| 108 | + | |
| 109 | +void | |
| 66 | 110 | QPDFObjectHandle::ParserCallbacks::terminateParsing() |
| 67 | 111 | { |
| 68 | 112 | throw TerminateParsing(); |
| ... | ... | @@ -508,6 +552,13 @@ QPDFObjectHandle::getDict() |
| 508 | 552 | return dynamic_cast<QPDF_Stream*>(obj.getPointer())->getDict(); |
| 509 | 553 | } |
| 510 | 554 | |
| 555 | +bool | |
| 556 | +QPDFObjectHandle::isDataModified() | |
| 557 | +{ | |
| 558 | + assertStream(); | |
| 559 | + return dynamic_cast<QPDF_Stream*>(obj.getPointer())->isDataModified(); | |
| 560 | +} | |
| 561 | + | |
| 511 | 562 | void |
| 512 | 563 | QPDFObjectHandle::replaceDict(QPDFObjectHandle new_dict) |
| 513 | 564 | { |
| ... | ... | @@ -1033,6 +1084,21 @@ QPDFObjectHandle::parseContentStream_data( |
| 1033 | 1084 | } |
| 1034 | 1085 | } |
| 1035 | 1086 | |
| 1087 | +void | |
| 1088 | +QPDFObjectHandle::addContentTokenFilter(PointerHolder<TokenFilter> filter) | |
| 1089 | +{ | |
| 1090 | + coalesceContentStreams(); | |
| 1091 | + this->getKey("/Contents").addTokenFilter(filter); | |
| 1092 | +} | |
| 1093 | + | |
| 1094 | +void | |
| 1095 | +QPDFObjectHandle::addTokenFilter(PointerHolder<TokenFilter> filter) | |
| 1096 | +{ | |
| 1097 | + assertStream(); | |
| 1098 | + return dynamic_cast<QPDF_Stream*>( | |
| 1099 | + obj.getPointer())->addTokenFilter(filter); | |
| 1100 | +} | |
| 1101 | + | |
| 1036 | 1102 | QPDFObjectHandle |
| 1037 | 1103 | QPDFObjectHandle::parse(PointerHolder<InputSource> input, |
| 1038 | 1104 | std::string const& object_description, | ... | ... |
libqpdf/QPDFTokenizer.cc
| ... | ... | @@ -7,6 +7,7 @@ |
| 7 | 7 | #include <qpdf/QTC.hh> |
| 8 | 8 | #include <qpdf/QPDFExc.hh> |
| 9 | 9 | #include <qpdf/QUtil.hh> |
| 10 | +#include <qpdf/QPDFObjectHandle.hh> | |
| 10 | 11 | |
| 11 | 12 | #include <stdexcept> |
| 12 | 13 | #include <string.h> |
| ... | ... | @@ -39,6 +40,23 @@ QPDFTokenizer::Members::~Members() |
| 39 | 40 | { |
| 40 | 41 | } |
| 41 | 42 | |
| 43 | +// Build a token from a type and canonical value, deriving the raw | |
| 44 | +// value: strings and names are unparsed; other types use value as is. | |
| 45 | +QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) : | |
| 46 | + type(type), | |
| 47 | + value(value), | |
| 48 | + raw_value(value) | |
| 49 | +{ | |
| 50 | + if (type == tt_string) | |
| 51 | + { | |
| 52 | + raw_value = QPDFObjectHandle::newString(value).unparse(); | |
| 53 | + } | |
| 54 | + else if (type == tt_name) | |
| 55 | + { | |
| 56 | + raw_value = QPDFObjectHandle::newName(value).unparse(); | |
| 57 | + } | |
| 58 | +} | |
| 59 | + | |
| 42 | 60 | QPDFTokenizer::QPDFTokenizer() : |
| 43 | 61 | m(new Members()) |
| 44 | 62 | { | ... | ... |
libqpdf/QPDFWriter.cc
| ... | ... | @@ -1591,7 +1591,8 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level, |
| 1591 | 1591 | { |
| 1592 | 1592 | is_metadata = true; |
| 1593 | 1593 | } |
| 1594 | - bool filter = (this->m->compress_streams || | |
| 1594 | + bool filter = (object.isDataModified() || | |
| 1595 | + this->m->compress_streams || | |
| 1595 | 1596 | this->m->stream_decode_level); |
| 1596 | 1597 | if (this->m->compress_streams) |
| 1597 | 1598 | { |
| ... | ... | @@ -1602,7 +1603,8 @@ QPDFWriter::unparseObject(QPDFObjectHandle object, int level, |
| 1602 | 1603 | // compressed with a lossy compression scheme, but we |
| 1603 | 1604 | // don't support any of those right now. |
| 1604 | 1605 | QPDFObjectHandle filter_obj = stream_dict.getKey("/Filter"); |
| 1605 | - if (filter_obj.isName() && | |
| 1606 | + if ((! object.isDataModified()) && | |
| 1607 | + filter_obj.isName() && | |
| 1606 | 1608 | ((filter_obj.getName() == "/FlateDecode") || |
| 1607 | 1609 | (filter_obj.getName() == "/Fl"))) |
| 1608 | 1610 | { | ... | ... |
libqpdf/QPDF_Stream.cc
| ... | ... | @@ -13,7 +13,7 @@ |
| 13 | 13 | #include <qpdf/Pl_RunLength.hh> |
| 14 | 14 | #include <qpdf/Pl_DCT.hh> |
| 15 | 15 | #include <qpdf/Pl_Count.hh> |
| 16 | - | |
| 16 | +#include <qpdf/ContentNormalizer.hh> | |
| 17 | 17 | #include <qpdf/QTC.hh> |
| 18 | 18 | #include <qpdf/QPDF.hh> |
| 19 | 19 | #include <qpdf/QPDFExc.hh> |
| ... | ... | @@ -91,6 +91,12 @@ QPDF_Stream::getDict() const |
| 91 | 91 | return this->stream_dict; |
| 92 | 92 | } |
| 93 | 93 | |
| 94 | +bool | |
| 95 | +QPDF_Stream::isDataModified() const | |
| 96 | +{ | |
| 97 | + return (! this->token_filters.empty()); | |
| 98 | +} | |
| 99 | + | |
| 94 | 100 | PointerHolder<Buffer> |
| 95 | 101 | QPDF_Stream::getStreamData(qpdf_stream_decode_level_e decode_level) |
| 96 | 102 | { |
| ... | ... | @@ -440,21 +446,36 @@ QPDF_Stream::pipeStreamData(Pipeline* pipeline, |
| 440 | 446 | // create to be deleted when this function finishes. |
| 441 | 447 | std::vector<PointerHolder<Pipeline> > to_delete; |
| 442 | 448 | |
| 449 | + PointerHolder<ContentNormalizer> normalizer; | |
| 443 | 450 | if (filter) |
| 444 | 451 | { |
| 445 | 452 | if (encode_flags & qpdf_ef_compress) |
| 446 | 453 | { |
| 447 | - pipeline = new Pl_Flate("compress object stream", pipeline, | |
| 454 | + pipeline = new Pl_Flate("compress stream", pipeline, | |
| 448 | 455 | Pl_Flate::a_deflate); |
| 449 | 456 | to_delete.push_back(pipeline); |
| 450 | 457 | } |
| 451 | 458 | |
| 452 | 459 | if (encode_flags & qpdf_ef_normalize) |
| 453 | 460 | { |
| 454 | - pipeline = new Pl_QPDFTokenizer("normalizer", pipeline); | |
| 461 | + normalizer = new ContentNormalizer(); | |
| 462 | + normalizer->setPipeline(pipeline); | |
| 463 | + pipeline = new Pl_QPDFTokenizer( | |
| 464 | + "normalizer", normalizer.getPointer()); | |
| 455 | 465 | to_delete.push_back(pipeline); |
| 456 | 466 | } |
| 457 | 467 | |
| 468 | + for (std::vector<PointerHolder< | |
| 469 | + QPDFObjectHandle::TokenFilter> >::reverse_iterator iter = | |
| 470 | + this->token_filters.rbegin(); | |
| 471 | + iter != this->token_filters.rend(); ++iter) | |
| 472 | + { | |
| 473 | + (*iter)->setPipeline(pipeline); | |
| 474 | + pipeline = new Pl_QPDFTokenizer( | |
| 475 | + "token filter", (*iter).getPointer()); | |
| 476 | + to_delete.push_back(pipeline); | |
| 477 | + } | |
| 478 | + | |
| 458 | 479 | for (std::vector<std::string>::reverse_iterator iter = filters.rbegin(); |
| 459 | 480 | iter != filters.rend(); ++iter) |
| 460 | 481 | { |
| ... | ... | @@ -613,6 +634,13 @@ QPDF_Stream::replaceStreamData( |
| 613 | 634 | } |
| 614 | 635 | |
| 615 | 636 | void |
| 637 | +QPDF_Stream::addTokenFilter( | |
| 638 | + PointerHolder<QPDFObjectHandle::TokenFilter> token_filter) | |
| 639 | +{ | |
| 640 | + this->token_filters.push_back(token_filter); | |
| 641 | +} | |
| 642 | + | |
| 643 | +void | |
| 616 | 644 | QPDF_Stream::replaceFilterData(QPDFObjectHandle const& filter, |
| 617 | 645 | QPDFObjectHandle const& decode_parms, |
| 618 | 646 | size_t length) | ... | ... |
libqpdf/build.mk
libqpdf/qpdf/ContentNormalizer.hh
0 → 100644
| 1 | +#ifndef __CONTENTNORMALIZER_HH__ | |
| 2 | +#define __CONTENTNORMALIZER_HH__ | |
| 3 | + | |
| 4 | +#include <qpdf/QPDFObjectHandle.hh> | |
| 5 | + | |
| 6 | +class ContentNormalizer: public QPDFObjectHandle::TokenFilter | |
| 7 | +{ | |
| 8 | + public: | |
| 9 | + ContentNormalizer(); | |
| 10 | + virtual ~ContentNormalizer(); | |
| 11 | + virtual void handleToken(QPDFTokenizer::Token const&); | |
| 12 | + virtual void handleEOF(); | |
| 13 | +}; | |
| 14 | + | |
| 15 | +#endif // __CONTENTNORMALIZER_HH__ | ... | ... |
libqpdf/qpdf/Pl_QPDFTokenizer.hh
| ... | ... | @@ -4,6 +4,8 @@ |
| 4 | 4 | #include <qpdf/Pipeline.hh> |
| 5 | 5 | |
| 6 | 6 | #include <qpdf/QPDFTokenizer.hh> |
| 7 | +#include <qpdf/PointerHolder.hh> | |
| 8 | +#include <qpdf/QPDFObjectHandle.hh> | |
| 7 | 9 | |
| 8 | 10 | // |
| 9 | 11 | // Treat incoming text as a stream consisting of valid PDF tokens, but |
| ... | ... | @@ -16,7 +18,8 @@ |
| 16 | 18 | class Pl_QPDFTokenizer: public Pipeline |
| 17 | 19 | { |
| 18 | 20 | public: |
| 19 | - Pl_QPDFTokenizer(char const* identifier, Pipeline* next); | |
| 21 | + Pl_QPDFTokenizer(char const* identifier, | |
| 22 | + QPDFObjectHandle::TokenFilter* filter); | |
| 20 | 23 | virtual ~Pl_QPDFTokenizer(); |
| 21 | 24 | virtual void write(unsigned char* buf, size_t len); |
| 22 | 25 | virtual void finish(); |
| ... | ... | @@ -24,14 +27,25 @@ class Pl_QPDFTokenizer: public Pipeline |
| 24 | 27 | private: |
| 25 | 28 | void processChar(char ch); |
| 26 | 29 | void checkUnread(); |
| 27 | - void writeNext(char const*, size_t len); | |
| 28 | - void writeToken(QPDFTokenizer::Token&); | |
| 29 | - | |
| 30 | - QPDFTokenizer tokenizer; | |
| 31 | - bool just_wrote_nl; | |
| 32 | - bool last_char_was_cr; | |
| 33 | - bool unread_char; | |
| 34 | - char char_to_unread; | |
| 30 | + | |
| 31 | + class Members | |
| 32 | + { | |
| 33 | + friend class Pl_QPDFTokenizer; | |
| 34 | + | |
| 35 | + public: | |
| 36 | + ~Members(); | |
| 37 | + | |
| 38 | + private: | |
| 39 | + Members(); | |
| 40 | + Members(Members const&); | |
| 41 | + | |
| 42 | + QPDFObjectHandle::TokenFilter* filter; | |
| 43 | + QPDFTokenizer tokenizer; | |
| 44 | + bool last_char_was_cr; | |
| 45 | + bool unread_char; | |
| 46 | + char char_to_unread; | |
| 47 | + }; | |
| 48 | + PointerHolder<Members> m; | |
| 35 | 49 | }; |
| 36 | 50 | |
| 37 | 51 | #endif // __PL_QPDFTOKENIZER_HH__ | ... | ... |
libqpdf/qpdf/QPDF_Stream.hh
| ... | ... | @@ -20,6 +20,7 @@ class QPDF_Stream: public QPDFObject |
| 20 | 20 | virtual QPDFObject::object_type_e getTypeCode() const; |
| 21 | 21 | virtual char const* getTypeName() const; |
| 22 | 22 | QPDFObjectHandle getDict() const; |
| 23 | + bool isDataModified() const; | |
| 23 | 24 | |
| 24 | 25 | // See comments in QPDFObjectHandle.hh for these methods. |
| 25 | 26 | bool pipeStreamData(Pipeline*, |
| ... | ... | @@ -35,6 +36,8 @@ class QPDF_Stream: public QPDFObject |
| 35 | 36 | PointerHolder<QPDFObjectHandle::StreamDataProvider> provider, |
| 36 | 37 | QPDFObjectHandle const& filter, |
| 37 | 38 | QPDFObjectHandle const& decode_parms); |
| 39 | + void addTokenFilter( | |
| 40 | + PointerHolder<QPDFObjectHandle::TokenFilter> token_filter); | |
| 38 | 41 | |
| 39 | 42 | void replaceDict(QPDFObjectHandle new_dict); |
| 40 | 43 | |
| ... | ... | @@ -72,6 +75,8 @@ class QPDF_Stream: public QPDFObject |
| 72 | 75 | size_t length; |
| 73 | 76 | PointerHolder<Buffer> stream_data; |
| 74 | 77 | PointerHolder<QPDFObjectHandle::StreamDataProvider> stream_provider; |
| 78 | + std::vector< | |
| 79 | + PointerHolder<QPDFObjectHandle::TokenFilter> > token_filters; | |
| 75 | 80 | }; |
| 76 | 81 | |
| 77 | 82 | #endif // __QPDF_STREAM_HH__ | ... | ... |
qpdf/qtest/qpdf.test
| ... | ... | @@ -758,6 +758,19 @@ $td->runtest("check output", |
| 758 | 758 | |
| 759 | 759 | show_ntests(); |
| 760 | 760 | # ---------- |
| 761 | +$td->notify("--- Token filters ---"); | |
| 762 | +$n_tests += 2; | |
| 763 | + | |
| 764 | +$td->runtest("token filter", | |
| 765 | + {$td->COMMAND => "test_driver 41 coalesce.pdf"}, | |
| 766 | + {$td->STRING => "test 41 done\n", $td->EXIT_STATUS => 0}, | |
| 767 | + $td->NORMALIZE_NEWLINES); | |
| 768 | +$td->runtest("check output", | |
| 769 | + {$td->FILE => "a.pdf"}, | |
| 770 | + {$td->FILE => "token-filters-out.pdf"}); | |
| 771 | + | |
| 772 | +show_ntests(); | |
| 773 | +# ---------- | |
| 761 | 774 | $td->notify("--- Newline before endstream ---"); |
| 762 | 775 | $n_tests += 10; |
| 763 | 776 | ... | ... |
qpdf/qtest/qpdf/token-filters-out.pdf
0 → 100644
No preview for this file type
qpdf/test_driver.cc
| ... | ... | @@ -97,6 +97,36 @@ ParserCallbacks::handleEOF() |
| 97 | 97 | std::cout << "-EOF-" << std::endl; |
| 98 | 98 | } |
| 99 | 99 | |
| 100 | +class TokenFilter: public QPDFObjectHandle::TokenFilter | |
| 101 | +{ | |
| 102 | + public: | |
| 103 | + TokenFilter() | |
| 104 | + { | |
| 105 | + } | |
| 106 | + virtual ~TokenFilter() | |
| 107 | + { | |
| 108 | + } | |
| 109 | + virtual void handleToken(QPDFTokenizer::Token const& t) | |
| 110 | + { | |
| 111 | + if (t == QPDFTokenizer::Token(QPDFTokenizer::tt_string, "Potato")) | |
| 112 | + { | |
| 113 | + // Exercise unparsing of strings by token constructor | |
| 114 | + writeToken( | |
| 115 | + QPDFTokenizer::Token(QPDFTokenizer::tt_string, "Salad")); | |
| 116 | + } | |
| 117 | + else | |
| 118 | + { | |
| 119 | + writeToken(t); | |
| 120 | + } | |
| 121 | + } | |
| 122 | + virtual void handleEOF() | |
| 123 | + { | |
| 124 | + writeToken(QPDFTokenizer::Token(QPDFTokenizer::tt_name, "/bye")); | |
| 125 | + write("\n"); | |
| 126 | + finish(); | |
| 127 | + } | |
| 128 | +}; | |
| 129 | + | |
| 100 | 130 | static std::string getPageContents(QPDFObjectHandle page) |
| 101 | 131 | { |
| 102 | 132 | PointerHolder<Buffer> b1 = |
| ... | ... | @@ -1345,6 +1375,22 @@ void runtest(int n, char const* filename1, char const* arg2) |
| 1345 | 1375 | w.setStaticID(true); |
| 1346 | 1376 | w.write(); |
| 1347 | 1377 | } |
| 1378 | + else if (n == 41) | |
| 1379 | + { | |
| 1380 | + // Apply a token filter. This test case is crafted to work | |
| 1381 | + // with coalesce.pdf. | |
| 1382 | + std::vector<QPDFObjectHandle> pages = pdf.getAllPages(); | |
| 1383 | + for (std::vector<QPDFObjectHandle>::iterator iter = | |
| 1384 | + pages.begin(); | |
| 1385 | + iter != pages.end(); ++iter) | |
| 1386 | + { | |
| 1387 | + (*iter).addContentTokenFilter(new TokenFilter); | |
| 1388 | + } | |
| 1389 | + QPDFWriter w(pdf, "a.pdf"); | |
| 1390 | + w.setQDFMode(true); | |
| 1391 | + w.setStaticID(true); | |
| 1392 | + w.write(); | |
| 1393 | + } | |
| 1348 | 1394 | else |
| 1349 | 1395 | { |
| 1350 | 1396 | throw std::runtime_error(std::string("invalid test ") + | ... | ... |