Commit 5708b5d0aa9c94ab663509fbb865aa27a134aeb3

Authored by Jay Berkenbilt
1 parent fd02944e

Add additional interface for filtering page contents

ChangeLog
  1 +2018-02-11 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Add QPDFObjectHandle::filterPageContents method to provide a
  4 + different interface for applying token filters to page contents
  5 + without modifying the ultimate output.
  6 +
1 2018-02-04 Jay Berkenbilt <ejb@ql.org> 7 2018-02-04 Jay Berkenbilt <ejb@ql.org>
2 8
3 * Changes listed on today's date are numerous and reflect 9 * Changes listed on today's date are numerous and reflect
examples/build.mk
@@ -7,7 +7,8 @@ BINS_examples = \ @@ -7,7 +7,8 @@ BINS_examples = \
7 pdf-create \ 7 pdf-create \
8 pdf-parse-content \ 8 pdf-parse-content \
9 pdf-split-pages \ 9 pdf-split-pages \
10 - pdf-filter-tokens 10 + pdf-filter-tokens \
  11 + pdf-count-strings
11 CBINS_examples = pdf-linearize 12 CBINS_examples = pdf-linearize
12 13
13 TARGETS_examples = $(foreach B,$(BINS_examples) $(CBINS_examples),examples/$(OUTPUT_DIR)/$(call binname,$(B))) 14 TARGETS_examples = $(foreach B,$(BINS_examples) $(CBINS_examples),examples/$(OUTPUT_DIR)/$(call binname,$(B)))
examples/pdf-count-strings.cc 0 → 100644
  1 +//
  2 +// This example illustrates the use of QPDFObjectHandle::TokenFilter
  3 +// with filterPageContents. See also pdf-filter-tokens.cc for an
  4 +// example that uses QPDFObjectHandle::TokenFilter with
  5 +// addContentTokenFilter.
  6 +//
  7 +
  8 +#include <iostream>
  9 +#include <string.h>
  10 +#include <stdlib.h>
  11 +
  12 +#include <qpdf/QPDF.hh>
  13 +#include <qpdf/QUtil.hh>
  14 +#include <qpdf/QPDFObjectHandle.hh>
  15 +#include <qpdf/Pl_StdioFile.hh>
  16 +
  17 +static char const* whoami = 0;
  18 +
  19 +void usage()
  20 +{
  21 + std::cerr << "Usage: " << whoami << " infile" << std::endl
  22 + << "Applies token filters to infile"
  23 + << std::endl;
  24 + exit(2);
  25 +}
  26 +
  27 +class StringCounter: public QPDFObjectHandle::TokenFilter
  28 +{
  29 + public:
  30 + StringCounter() :
  31 + count(0)
  32 + {
  33 + }
  34 + virtual ~StringCounter()
  35 + {
  36 + }
  37 + virtual void handleToken(QPDFTokenizer::Token const&);
  38 + virtual void handleEOF();
  39 + int getCount() const;
  40 +
  41 + private:
  42 + int count;
  43 +};
  44 +
  45 +void
  46 +StringCounter::handleToken(QPDFTokenizer::Token const& token)
  47 +{
  48 + // Count string tokens
  49 + if (token.getType() == QPDFTokenizer::tt_string)
  50 + {
  51 + ++this->count;
  52 + }
  53 + // Preserve input verbatim by passing each token to any specified
  54 + // downstream filter.
  55 + writeToken(token);
  56 +}
  57 +
  58 +void
  59 +StringCounter::handleEOF()
  60 +{
  61 + // Write a comment at the end of the stream just to show how we
  62 + // can enhance the output if we want.
  63 + write("\n% strings found: ");
  64 + write(QUtil::int_to_string(this->count));
  65 + // If you override handleEOF, you must always remember to call finish().
  66 + finish();
  67 +}
  68 +
  69 +int
  70 +StringCounter::getCount() const
  71 +{
  72 + return this->count;
  73 +}
  74 +
  75 +int main(int argc, char* argv[])
  76 +{
  77 + whoami = QUtil::getWhoami(argv[0]);
  78 +
  79 + // For libtool's sake....
  80 + if (strncmp(whoami, "lt-", 3) == 0)
  81 + {
  82 + whoami += 3;
  83 + }
  84 +
  85 + if (argc != 2)
  86 + {
  87 + usage();
  88 + }
  89 + char const* infilename = argv[1];
  90 +
  91 + try
  92 + {
  93 + QPDF pdf;
  94 + pdf.processFile(infilename);
  95 + std::vector<QPDFObjectHandle> pages = pdf.getAllPages();
  96 + int pageno = 0;
  97 + for (std::vector<QPDFObjectHandle>::iterator iter = pages.begin();
  98 + iter != pages.end(); ++iter)
  99 + {
  100 + QPDFObjectHandle page = *iter;
  101 + ++pageno;
  102 + // Pass the contents of a page through our string counter.
  103 + // If it's an even page, capture the output. This
  104 + // illustrates that you may capture any output generated
  105 + // by the filter, or you may ignore it.
  106 + StringCounter counter;
  107 + if (pageno % 2)
  108 + {
  109 + // Ignore output for odd pages.
  110 + page.filterPageContents(&counter);
  111 + }
  112 + else
  113 + {
  114 + // Write output to stdout for even pages.
  115 + Pl_StdioFile out("stdout", stdout);
  116 + std::cout << "% Contents of page " << pageno << std::endl;
  117 + page.filterPageContents(&counter, &out);
  118 + std::cout << "\n% end " << pageno << std::endl;
  119 + }
  120 + std::cout << "Page " << pageno
  121 + << ": strings = " << counter.getCount() << std::endl;
  122 + }
  123 + }
  124 + catch (std::exception& e)
  125 + {
  126 + std::cerr << whoami << ": " << e.what() << std::endl;
  127 + exit(2);
  128 + }
  129 +
  130 + return 0;
  131 +}
examples/pdf-filter-tokens.cc
1 // 1 //
2 -// This example illustrates the use of QPDFObjectHandle::TokenFilter.  
3 -// Please see comments inline for details. 2 +// This example illustrates the use of QPDFObjectHandle::TokenFilter
  3 +// with addContentTokenFilter. Please see comments inline for details.
  4 +// See also pdf-count-strings.cc for a use of
  5 +// QPDFObjectHandle::TokenFilter with filterPageContents.
4 // 6 //
5 7
6 #include <iostream> 8 #include <iostream>
examples/qtest/count-strings.test 0 → 100644
  1 +#!/usr/bin/env perl
  2 +require 5.008;
  3 +BEGIN { $^W = 1; }
  4 +use strict;
  5 +
  6 +chdir("count-strings");
  7 +
  8 +require TestDriver;
  9 +
  10 +my $td = new TestDriver('pdf-count-strings');
  11 +
  12 +$td->runtest("filter tokens",
  13 + {$td->COMMAND => "pdf-count-strings in.pdf"},
  14 + {$td->FILE => "out", $td->EXIT_STATUS => 0},
  15 + $td->NORMALIZE_NEWLINES);
  16 +
  17 +$td->report(1);
examples/qtest/count-strings/in.pdf 0 → 100644
No preview for this file type
examples/qtest/count-strings/out 0 → 100644
  1 +Page 1: strings = 3
  2 +% Contents of page 2
  3 +BT
  4 + /F1 24 Tf
  5 + 72 720 Td
  6 + (Four ) Tj
  7 + (Five ) Tj
  8 + (Six )
  9 + (beautiful ) Tj
  10 + (strings) Tj
  11 + (!) Tj
  12 +ET
  13 +
  14 +% strings found: 6
  15 +% end 2
  16 +Page 2: strings = 6
include/qpdf/QPDFObjectHandle.hh
@@ -80,9 +80,10 @@ class QPDFObjectHandle @@ -80,9 +80,10 @@ class QPDFObjectHandle
80 // The TokenFilter class provides a way to filter content streams 80 // The TokenFilter class provides a way to filter content streams
81 // in a lexically aware fashion. TokenFilters can be attached to 81 // in a lexically aware fashion. TokenFilters can be attached to
82 // streams using the addTokenFilter or addContentTokenFilter 82 // streams using the addTokenFilter or addContentTokenFilter
83 - // methods. The handleToken method is called for each token,  
84 - // including the eof token, and then handleEOF is called at the  
85 - // very end. Handlers may call write (or writeToken) to pass data 83 + // methods or can be applied on the spot by filterPageContents.
  84 + // The handleToken method is called for each token, including the
  85 + // eof token, and then handleEOF is called at the very end.
  86 + // Handlers may call write (or writeToken) to pass data
86 // downstream. The finish() method must be called exactly one time 87 // downstream. The finish() method must be called exactly one time
87 // to ensure that any written data is flushed out. The default 88 // to ensure that any written data is flushed out. The default
88 // handleEOF calls finish. If you override handleEOF, you must 89 // handleEOF calls finish. If you override handleEOF, you must
@@ -91,8 +92,9 @@ class QPDFObjectHandle @@ -91,8 +92,9 @@ class QPDFObjectHandle
91 // Failure to call finish() may result in some of the data you 92 // Failure to call finish() may result in some of the data you
92 // have written being lost. You should not rely on a destructor 93 // have written being lost. You should not rely on a destructor
93 // for calling finish() since the destructor call may occur later 94 // for calling finish() since the destructor call may occur later
94 - // than you expect. Please see examples/token-filters.cc for  
95 - // examples of using TokenFilters. 95 + // than you expect. Please see examples/pdf-filter-tokens.cc and
  96 + // examples/pdf-count-strings.cc for examples of using
  97 + // TokenFilters.
96 // 98 //
97 // Please note that when you call token.getValue() on a token of 99 // Please note that when you call token.getValue() on a token of
98 // type tt_string, you get the string value without any 100 // type tt_string, you get the string value without any
@@ -255,6 +257,18 @@ class QPDFObjectHandle @@ -255,6 +257,18 @@ class QPDFObjectHandle
255 QPDF_DLL 257 QPDF_DLL
256 void parsePageContents(ParserCallbacks* callbacks); 258 void parsePageContents(ParserCallbacks* callbacks);
257 259
  260 + // Pass a page's contents through the given TokenFilter. If a
  261 + // pipeline is also provided, it will be the target of the write
  262 + // methods from the token filter. If a pipeline is not specified,
  263 + // any output generated by the token filter will be discarded. Use
  264 + // this interface if you need to pass a page's contents through a
  265 + // filter for work purposes without having that filter
  266 + // automatically applied to the page's contents, as happens with
  267 + // addContentTokenFilter. See examples/pdf-count-strings.cc for an
  268 + // example.
  269 + QPDF_DLL
  270 + void filterPageContents(TokenFilter* filter, Pipeline* next = 0);
  271 +
258 // Pipe a page's contents through the given pipeline. This method 272 // Pipe a page's contents through the given pipeline. This method
259 // works whether the contents are a single stream or an array of 273 // works whether the contents are a single stream or an array of
260 // streams. Call on a page object. 274 // streams. Call on a page object.
libqpdf/QPDFObjectHandle.cc
@@ -15,6 +15,8 @@ @@ -15,6 +15,8 @@
15 #include <qpdf/QPDF_Reserved.hh> 15 #include <qpdf/QPDF_Reserved.hh>
16 #include <qpdf/Pl_Buffer.hh> 16 #include <qpdf/Pl_Buffer.hh>
17 #include <qpdf/Pl_Concatenate.hh> 17 #include <qpdf/Pl_Concatenate.hh>
  18 +#include <qpdf/Pl_QPDFTokenizer.hh>
  19 +#include <qpdf/Pl_Discard.hh>
18 #include <qpdf/BufferInputSource.hh> 20 #include <qpdf/BufferInputSource.hh>
19 #include <qpdf/QPDFExc.hh> 21 #include <qpdf/QPDFExc.hh>
20 22
@@ -999,6 +1001,24 @@ QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks) @@ -999,6 +1001,24 @@ QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks)
999 } 1001 }
1000 1002
1001 void 1003 void
  1004 +QPDFObjectHandle::filterPageContents(TokenFilter* filter, Pipeline* next)
  1005 +{
  1006 + assertPageObject();
  1007 + std::string description = "token filter for page object " +
  1008 + QUtil::int_to_string(this->objid) + " " +
  1009 + QUtil::int_to_string(this->generation);
  1010 + Pl_QPDFTokenizer token_pipeline(description.c_str(), filter);
  1011 + PointerHolder<Pipeline> next_p;
  1012 + if (next == 0)
  1013 + {
  1014 + next_p = new Pl_Discard();
  1015 + next = next_p.getPointer();
  1016 + }
  1017 + filter->setPipeline(next);
  1018 + this->pipePageContents(&token_pipeline);
  1019 +}
  1020 +
  1021 +void
1002 QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array, 1022 QPDFObjectHandle::parseContentStream(QPDFObjectHandle stream_or_array,
1003 ParserCallbacks* callbacks) 1023 ParserCallbacks* callbacks)
1004 { 1024 {