Commit 296b679d6e3217cc112b7ed19b363b82356615ef

Authored by Jay Berkenbilt
1 parent ef8ae544

Implement findFirst and findLast in InputSource

Preparing to refactor some pattern searching code to use these instead
of their own memchr loops. This should simplify the code that replaces
PCRE.
include/qpdf/BufferInputSource.hh
@@ -15,17 +15,27 @@ @@ -15,17 +15,27 @@
15 class BufferInputSource: public InputSource 15 class BufferInputSource: public InputSource
16 { 16 {
17 public: 17 public:
  18 + QPDF_DLL
18 BufferInputSource(std::string const& description, Buffer* buf, 19 BufferInputSource(std::string const& description, Buffer* buf,
19 bool own_memory = false); 20 bool own_memory = false);
  21 + QPDF_DLL
20 BufferInputSource(std::string const& description, 22 BufferInputSource(std::string const& description,
21 std::string const& contents); 23 std::string const& contents);
  24 + QPDF_DLL
22 virtual ~BufferInputSource(); 25 virtual ~BufferInputSource();
  26 + QPDF_DLL
23 virtual qpdf_offset_t findAndSkipNextEOL(); 27 virtual qpdf_offset_t findAndSkipNextEOL();
  28 + QPDF_DLL
24 virtual std::string const& getName() const; 29 virtual std::string const& getName() const;
  30 + QPDF_DLL
25 virtual qpdf_offset_t tell(); 31 virtual qpdf_offset_t tell();
  32 + QPDF_DLL
26 virtual void seek(qpdf_offset_t offset, int whence); 33 virtual void seek(qpdf_offset_t offset, int whence);
  34 + QPDF_DLL
27 virtual void rewind(); 35 virtual void rewind();
  36 + QPDF_DLL
28 virtual size_t read(char* buffer, size_t length); 37 virtual size_t read(char* buffer, size_t length);
  38 + QPDF_DLL
29 virtual void unreadCh(char ch); 39 virtual void unreadCh(char ch);
30 40
31 private: 41 private:
include/qpdf/FileInputSource.hh
@@ -14,16 +14,27 @@ @@ -14,16 +14,27 @@
14 class FileInputSource: public InputSource 14 class FileInputSource: public InputSource
15 { 15 {
16 public: 16 public:
  17 + QPDF_DLL
17 FileInputSource(); 18 FileInputSource();
  19 + QPDF_DLL
18 void setFilename(char const* filename); 20 void setFilename(char const* filename);
  21 + QPDF_DLL
19 void setFile(char const* description, FILE* filep, bool close_file); 22 void setFile(char const* description, FILE* filep, bool close_file);
  23 + QPDF_DLL
20 virtual ~FileInputSource(); 24 virtual ~FileInputSource();
  25 + QPDF_DLL
21 virtual qpdf_offset_t findAndSkipNextEOL(); 26 virtual qpdf_offset_t findAndSkipNextEOL();
  27 + QPDF_DLL
22 virtual std::string const& getName() const; 28 virtual std::string const& getName() const;
  29 + QPDF_DLL
23 virtual qpdf_offset_t tell(); 30 virtual qpdf_offset_t tell();
  31 + QPDF_DLL
24 virtual void seek(qpdf_offset_t offset, int whence); 32 virtual void seek(qpdf_offset_t offset, int whence);
  33 + QPDF_DLL
25 virtual void rewind(); 34 virtual void rewind();
  35 + QPDF_DLL
26 virtual size_t read(char* buffer, size_t length); 36 virtual size_t read(char* buffer, size_t length);
  37 + QPDF_DLL
27 virtual void unreadCh(char ch); 38 virtual void unreadCh(char ch);
28 39
29 private: 40 private:
include/qpdf/InputSource.hh
@@ -9,6 +9,7 @@ @@ -9,6 +9,7 @@
9 #ifndef __QPDF_INPUTSOURCE_HH__ 9 #ifndef __QPDF_INPUTSOURCE_HH__
10 #define __QPDF_INPUTSOURCE_HH__ 10 #define __QPDF_INPUTSOURCE_HH__
11 11
  12 +#include <qpdf/DLL.h>
12 #include <qpdf/Types.h> 13 #include <qpdf/Types.h>
13 #include <stdio.h> 14 #include <stdio.h>
14 #include <string> 15 #include <string>
@@ -16,18 +17,52 @@ @@ -16,18 +17,52 @@
16 class InputSource 17 class InputSource
17 { 18 {
18 public: 19 public:
  20 + QPDF_DLL
19 InputSource() : 21 InputSource() :
20 last_offset(0) 22 last_offset(0)
21 { 23 {
22 } 24 }
  25 + QPDF_DLL
23 virtual ~InputSource() 26 virtual ~InputSource()
24 { 27 {
25 } 28 }
26 29
  30 + class Finder
  31 + {
  32 + public:
  33 + Finder()
  34 + {
  35 + }
  36 + virtual ~Finder()
  37 + {
  38 + }
  39 +
  40 + virtual bool check() = 0;
  41 + };
  42 +
  43 + QPDF_DLL
27 void setLastOffset(qpdf_offset_t); 44 void setLastOffset(qpdf_offset_t);
  45 + QPDF_DLL
28 qpdf_offset_t getLastOffset() const; 46 qpdf_offset_t getLastOffset() const;
  47 + QPDF_DLL
29 std::string readLine(size_t max_line_length); 48 std::string readLine(size_t max_line_length);
30 49
  50 + // Find first or last occurrence of a sequence of characters
  51 + // starting within the range defined by offset and len such that,
  52 + // when the input source is positioned at the beginning of that
  53 + // sequence, finder.check() returns true. If len is 0, the search
  54 + // proceeds until EOF. If a qualifying pattern is found, these methods
  55 + // return true and leave the input source positioned wherever
  56 + // check() left it at the end of the matching pattern.
  57 + QPDF_DLL
  58 + bool findFirst(char const* start_chars,
  59 + qpdf_offset_t offset, size_t len,
  60 + Finder& finder);
  61 + QPDF_DLL
  62 + bool findLast(char const* start_chars,
  63 + qpdf_offset_t offset, size_t len,
  64 + Finder& finder);
  65 +
31 virtual qpdf_offset_t findAndSkipNextEOL() = 0; 66 virtual qpdf_offset_t findAndSkipNextEOL() = 0;
32 virtual std::string const& getName() const = 0; 67 virtual std::string const& getName() const = 0;
33 virtual qpdf_offset_t tell() = 0; 68 virtual qpdf_offset_t tell() = 0;
include/qpdf/QPDF.hh
@@ -1006,6 +1006,27 @@ class QPDF @@ -1006,6 +1006,27 @@ class QPDF
1006 std::string key; // if ou_trailer_key or ou_root_key 1006 std::string key; // if ou_trailer_key or ou_root_key
1007 }; 1007 };
1008 1008
  1009 + class PatternFinder: public InputSource::Finder
  1010 + {
  1011 + public:
  1012 + PatternFinder(QPDF& qpdf, bool (QPDF::*checker)()) :
  1013 + qpdf(qpdf),
  1014 + checker(checker)
  1015 + {
  1016 + }
  1017 + virtual ~PatternFinder()
  1018 + {
  1019 + }
  1020 + virtual bool check()
  1021 + {
  1022 + return (this->qpdf.*checker)();
  1023 + }
  1024 +
  1025 + private:
  1026 + QPDF& qpdf;
  1027 + bool (QPDF::*checker)();
  1028 + };
  1029 +
1009 // methods to support linearization checking -- implemented in 1030 // methods to support linearization checking -- implemented in
1010 // QPDF_linearization.cc 1031 // QPDF_linearization.cc
1011 void readLinearizationData(); 1032 void readLinearizationData();
libqpdf/InputSource.cc
1 #include <qpdf/InputSource.hh> 1 #include <qpdf/InputSource.hh>
2 #include <string.h> 2 #include <string.h>
  3 +#include <stdexcept>
  4 +#include <qpdf/QTC.hh>
3 #include <qpdf/PointerHolder.hh> 5 #include <qpdf/PointerHolder.hh>
4 6
  7 +
5 void 8 void
6 InputSource::setLastOffset(qpdf_offset_t offset) 9 InputSource::setLastOffset(qpdf_offset_t offset)
7 { 10 {
@@ -39,3 +42,167 @@ InputSource::readLine(size_t max_line_length) @@ -39,3 +42,167 @@ InputSource::readLine(size_t max_line_length)
39 } 42 }
40 return std::string(buf); 43 return std::string(buf);
41 } 44 }
  45 +
  46 +bool
  47 +InputSource::findFirst(char const* start_chars,
  48 + qpdf_offset_t offset, size_t len,
  49 + Finder& finder)
  50 +{
  51 + // Basic approach: search for the first character of start_chars
  52 + // starting from offset but not going past len (if len != 0). Once
  53 + // the first character is found, see if it is the beginning of a
  54 + // sequence of characters matching start_chars. If so, call
  55 + // finder.check() to do caller-specific additional checks. If not,
  56 + // keep searching.
  57 +
  58 + // This code is tricky and highly subject to off-by-one or other
  59 + // edge case logic errors. See comments throughout that explain
  60 + // how we're not missing any edge cases. There are also tests
  61 + // specifically constructed to make sure we caught the edge cases
  62 + // in testing.
  63 +
  64 + char buf[1025]; // size known to input_source.cc in libtests
  65 + // To enable us to guarantee null-termination, save an extra byte
  66 + // so that buf[size] is valid memory.
  67 + size_t size = sizeof(buf) - 1;
  68 + if ((strlen(start_chars) < 1) || (strlen(start_chars) > size))
  69 + {
  70 + throw std::logic_error(
  71 + "InputSource::findSource called with"
  72 + " too small or too large of a character sequence");
  73 + }
  74 +
  75 + char* p = 0;
  76 + qpdf_offset_t buf_offset = offset;
  77 + size_t bytes_read = 0;
  78 +
  79 + // Guarantee that we return from this loop. Each time through, we
  80 + // either return, advance p, or restart the loop with a condition
  81 + // that will cause return on the next pass. Eventually we will
  82 + // either be out of range or hit EOF, either of which forces us to
  83 + // return.
  84 + while (true)
  85 + {
  86 + // Do we need to read more data? Pretend size = 5, buf starts
  87 + // at 0, and start_chars has 3 characters. buf[5] is valid and
  88 + // null. If p == 2, start_chars could be buf[2] through
  89 + // buf[4], so p + strlen(start_chars) == buf + size is okay.
  90 + // If p points to buf[size], since strlen(start_chars) is
  91 + // always >= 1, this overflow test will be correct for that
  92 + // case regardless of start_chars.
  93 + if ((p == 0) || ((p + strlen(start_chars)) > (buf + bytes_read)))
  94 + {
  95 + if (p)
  96 + {
  97 + QTC::TC("libtests", "InputSource read next block",
  98 + ((p == buf + bytes_read) ? 0 : 1));
  99 + buf_offset += (p - buf);
  100 + }
  101 + this->seek(buf_offset, SEEK_SET);
  102 + // Read into buffer and zero out the rest of the buffer
  103 + // including buf[size]. We allocated an extra byte so that
  104 + // we could guarantee null termination as an extra
  105 + // protection against overrun when using string functions.
  106 + bytes_read = this->read(buf, size);
  107 + if (bytes_read < strlen(start_chars))
  108 + {
  109 + QTC::TC("libtests", "InputSource find EOF",
  110 + bytes_read == 0 ? 0 : 1);
  111 + return false;
  112 + }
  113 + memset(buf + bytes_read, '\0', 1 + (size - bytes_read));
  114 + p = buf;
  115 + }
  116 +
  117 + // Search for the first character.
  118 + if ((p = static_cast<char*>(
  119 + memchr(p, start_chars[0], bytes_read - (p - buf)))) != 0)
  120 + {
  121 + if (p == buf)
  122 + {
  123 + QTC::TC("libtests", "InputSource found match at buf[0]");
  124 + }
  125 + // Found first letter.
  126 + if (len != 0)
  127 + {
  128 + // Make sure it's in range.
  129 + size_t p_relative_offset = (p - buf) + (buf_offset - offset);
  130 + if (p_relative_offset >= len)
  131 + {
  132 + // out of range
  133 + QTC::TC("libtests", "InputSource out of range");
  134 + return false;
  135 + }
  136 + }
  137 + if ((p + strlen(start_chars)) > (buf + bytes_read))
  138 + {
  139 + // If there are not enough bytes left in the file for
  140 + // start_chars, we will detect this on the next pass
  141 + // as EOF and return.
  142 + QTC::TC("libtests", "InputSource not enough bytes");
  143 + continue;
  144 + }
  145 +
  146 + // See if p points to a sequence matching start_chars. We
  147 + // already checked above to make sure we are not going to
  148 + // overrun memory.
  149 + if (strncmp(p, start_chars, strlen(start_chars)) == 0)
  150 + {
  151 + // Call finder.check() with the input source
  152 + // positioned to the point of the match.
  153 + this->seek(buf_offset + (p - buf), SEEK_SET);
  154 + if (finder.check())
  155 + {
  156 + return true;
  157 + }
  158 + else
  159 + {
  160 + QTC::TC("libtests", "InputSource start_chars matched but not check");
  161 + }
  162 + }
  163 + else
  164 + {
  165 + QTC::TC("libtests", "InputSource first char matched but not string");
  166 + }
  167 + // This occurrence of the first character wasn't a match.
  168 + // Skip over it and keep searching.
  169 + ++p;
  170 + }
  171 + else
  172 + {
  173 + // Trigger reading the next block
  174 + p = buf + bytes_read;
  175 + }
  176 + }
  177 + throw std::logic_error("InputSource after while (true)");
  178 +}
  179 +
  180 +bool
  181 +InputSource::findLast(char const* start_chars,
  182 + qpdf_offset_t offset, size_t len,
  183 + Finder& finder)
  184 +{
  185 + bool found = false;
  186 + qpdf_offset_t after_found_offset = 0;
  187 + qpdf_offset_t cur_offset = offset;
  188 + size_t cur_len = len;
  189 + while (this->findFirst(start_chars, cur_offset, cur_len, finder))
  190 + {
  191 + if (found)
  192 + {
  193 + QTC::TC("libtests", "InputSource findLast found more than one");
  194 + }
  195 + else
  196 + {
  197 + found = true;
  198 + }
  199 + after_found_offset = this->tell();
  200 + cur_offset = after_found_offset;
  201 + cur_len = len - (cur_offset - offset);
  202 + }
  203 + if (found)
  204 + {
  205 + this->seek(after_found_offset, SEEK_SET);
  206 + }
  207 + return found;
  208 +}
libqpdf/QPDF_linearization.cc
@@ -118,7 +118,7 @@ QPDF::isLinearized() @@ -118,7 +118,7 @@ QPDF::isLinearized()
118 } 118 }
119 else 119 else
120 { 120 {
121 - p = reinterpret_cast<char*>(memchr(p, '\0', tbuf_size - (p - buf))); 121 + p = static_cast<char*>(memchr(p, '\0', tbuf_size - (p - buf)));
122 assert(p != 0); 122 assert(p != 0);
123 while ((p - buf < tbuf_size) && (*p == 0)) 123 while ((p - buf < tbuf_size) && (*p == 0))
124 { 124 {
libtests/build.mk
@@ -6,6 +6,7 @@ BINS_libtests = \ @@ -6,6 +6,7 @@ BINS_libtests = \
6 concatenate \ 6 concatenate \
7 flate \ 7 flate \
8 hex \ 8 hex \
  9 + input_source \
9 lzw \ 10 lzw \
10 md5 \ 11 md5 \
11 pcre \ 12 pcre \
libtests/input_source.cc 0 → 100644
  1 +#include <iostream>
  2 +#include <qpdf/BufferInputSource.hh>
  3 +#include <qpdf/PointerHolder.hh>
  4 +#include <qpdf/Buffer.hh>
  5 +#include <qpdf/QPDFTokenizer.hh>
  6 +
  7 +static PointerHolder<Buffer>
  8 +get_buffer()
  9 +{
  10 + size_t size = 3172;
  11 + PointerHolder<Buffer> b(new Buffer(size));
  12 + unsigned char* p = b->getBuffer();
  13 + for (size_t i = 0; i < size; ++i)
  14 + {
  15 + p[i] = static_cast<unsigned char>(i & 0xff);
  16 + }
  17 + return b;
  18 +}
  19 +
  20 +class Finder: public InputSource::Finder
  21 +{
  22 + public:
  23 + Finder(PointerHolder<InputSource> is, std::string const& after) :
  24 + is(is),
  25 + after(after)
  26 + {
  27 + }
  28 + virtual ~Finder()
  29 + {
  30 + }
  31 + virtual bool check();
  32 +
  33 + private:
  34 + PointerHolder<InputSource> is;
  35 + std::string after;
  36 +};
  37 +
  38 +bool
  39 +Finder::check()
  40 +{
  41 + QPDFTokenizer tokenizer;
  42 + QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
  43 + if (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "potato"))
  44 + {
  45 + t = tokenizer.readToken(is, "finder", true);
  46 + return (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, after));
  47 + }
  48 + return false;
  49 +}
  50 +
  51 +void check(char const* description, bool expected, bool actual)
  52 +{
  53 + std::cout << description << ": "
  54 + << ((actual == expected) ? "PASS" : "FAIL")
  55 + << std::endl;
  56 +}
  57 +
  58 +int main()
  59 +{
  60 + PointerHolder<Buffer> b1 = get_buffer();
  61 + unsigned char* b = b1->getBuffer();
  62 + // Straddle block boundaries
  63 + memcpy(b + 1022, "potato", 6);
  64 + // Overlap so that the first check() would advance past the start
  65 + // of the next match
  66 + memcpy(b + 2037, "potato potato salad ", 20);
  67 + PointerHolder<InputSource> is =
  68 + new BufferInputSource("test buffer input source", b1.getPointer());
  69 + Finder f1(is, "salad");
  70 + check("find potato salad", true,
  71 + is->findFirst("potato", 0, 0, f1));
  72 + check("barely find potato salad", true,
  73 + is->findFirst("potato", 1100, 945, f1));
  74 + check("barely find potato salad", true,
  75 + is->findFirst("potato", 2000, 45, f1));
  76 + check("potato salad is too late", false,
  77 + is->findFirst("potato", 1100, 944, f1));
  78 + check("potato salad is too late", false,
  79 + is->findFirst("potato", 2000, 44, f1));
  80 + check("potato salad not found", false,
  81 + is->findFirst("potato", 2045, 0, f1));
  82 + check("potato salad not found", false,
  83 + is->findFirst("potato", 0, 1, f1));
  84 +
  85 + // Put one more right at EOF
  86 + memcpy(b + b1->getSize() - 12, "potato salad", 12);
  87 + check("potato salad at EOF", true,
  88 + is->findFirst("potato", 3000, 0, f1));
  89 +
  90 + is->findFirst("potato", 0, 0, f1);
  91 + check("findFirst found first", true,
  92 + is->tell() == 2056);
  93 + check("findLast found potato salad", true,
  94 + is->findLast("potato", 0, 0, f1));
  95 + check("findLast found at EOF", true,
  96 + is->tell() == 3172);
  97 +
  98 + // Make check() bump into EOF
  99 + memcpy(b + b1->getSize() - 6, "potato", 6);
  100 + check("potato but not salad salad at EOF", false,
  101 + is->findFirst("potato", 3000, 0, f1));
  102 + check("findLast found potato salad", true,
  103 + is->findLast("potato", 0, 0, f1));
  104 + check("findLast found first one", true,
  105 + is->tell() == 2056);
  106 +
  107 + return 0;
  108 +}
libtests/libtests.testcov
@@ -16,3 +16,11 @@ bits write zero bits 0 @@ -16,3 +16,11 @@ bits write zero bits 0
16 Pl_ASCIIHexDecoder ignore space 0 16 Pl_ASCIIHexDecoder ignore space 0
17 Pl_ASCIIHexDecoder no-op flush 0 17 Pl_ASCIIHexDecoder no-op flush 0
18 Pl_ASCIIHexDecoder partial flush 1 18 Pl_ASCIIHexDecoder partial flush 1
  19 +InputSource read next block 1
  20 +InputSource find EOF 1
  21 +InputSource out of range 0
  22 +InputSource first char matched but not string 0
  23 +InputSource start_chars matched but not check 0
  24 +InputSource not enough bytes 0
  25 +InputSource findLast found more than one 0
  26 +InputSource found match at buf[0] 0
libtests/qtest/input_source.test 0 → 100644
  1 +#!/usr/bin/env perl
  2 +require 5.008;
  3 +use warnings;
  4 +use strict;
  5 +
  6 +chdir("input_source") or die "chdir testdir failed: $!\n";
  7 +
  8 +require TestDriver;
  9 +
  10 +my $td = new TestDriver('InputSource');
  11 +
  12 +cleanup();
  13 +
  14 +$td->runtest("input source tests",
  15 + {$td->COMMAND => "input_source"},
  16 + {$td->FILE => "input_source.out",
  17 + $td->EXIT_STATUS => 0},
  18 + $td->NORMALIZE_NEWLINES);
  19 +
  20 +cleanup();
  21 +
  22 +$td->report(1);
  23 +
  24 +sub cleanup
  25 +{
  26 +}
libtests/qtest/input_source/input_source.out 0 → 100644
  1 +find potato salad: PASS
  2 +barely find potato salad: PASS
  3 +barely find potato salad: PASS
  4 +potato salad is too late: PASS
  5 +potato salad is too late: PASS
  6 +potato salad not found: PASS
  7 +potato salad not found: PASS
  8 +potato salad at EOF: PASS
  9 +findFirst found first: PASS
  10 +findLast found potato salad: PASS
  11 +findLast found at EOF: PASS
  12 +potato but not salad salad at EOF: PASS
  13 +findLast found potato salad: PASS
  14 +findLast found first one: PASS