Commit 7f84239cad2ec58166245394e56a4647085e025e

Authored by Jay Berkenbilt
1 parent bcfc9847

Find PDF header anywhere in the first 1024 bytes

ChangeLog
  1 +2012-12-25 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Allow PDF header to appear anywhere in the first 1024 bytes of
  4 + the file as recommended in the implementation notes of the Adobe
  5 + version of the PDF spec.
  6 +
1 2012-11-20 Jay Berkenbilt <ejb@ql.org> 7 2012-11-20 Jay Berkenbilt <ejb@ql.org>
2 8
3 * Add zlib and libpcre to Requires.private in the pkg-config file 9 * Add zlib and libpcre to Requires.private in the pkg-config file
1 -Next  
2 -====  
3 -  
4 - * Find PDF header in the first 1024 bytes of the file. Treat the  
5 - location of the PDF header as offset 0 for purposes of resolving  
6 - explicit file locations as this is what other implementations  
7 - appear to do.  
8 -  
9 -  
10 General 1 General
11 ======= 2 =======
12 3
libqpdf/OffsetInputSource.cc 0 → 100644
  1 +#include <qpdf/OffsetInputSource.hh>
  2 +
  3 +OffsetInputSource::OffsetInputSource(PointerHolder<InputSource> proxied,
  4 + qpdf_offset_t global_offset) :
  5 + proxied(proxied),
  6 + global_offset(global_offset)
  7 +{
  8 +}
  9 +
  10 +OffsetInputSource::~OffsetInputSource()
  11 +{
  12 +}
  13 +
  14 +qpdf_offset_t
  15 +OffsetInputSource::findAndSkipNextEOL()
  16 +{
  17 + return this->proxied->findAndSkipNextEOL() - this->global_offset;
  18 +}
  19 +
  20 +std::string const&
  21 +OffsetInputSource::getName() const
  22 +{
  23 + return this->proxied->getName();
  24 +}
  25 +
  26 +qpdf_offset_t
  27 +OffsetInputSource::tell()
  28 +{
  29 + return this->proxied->tell() - this->global_offset;
  30 +}
  31 +
  32 +void
  33 +OffsetInputSource::seek(qpdf_offset_t offset, int whence)
  34 +{
  35 + if (whence == SEEK_SET)
  36 + {
  37 + this->proxied->seek(offset + global_offset, whence);
  38 + }
  39 + else
  40 + {
  41 + this->proxied->seek(offset, whence);
  42 + }
  43 +}
  44 +
  45 +void
  46 +OffsetInputSource::rewind()
  47 +{
  48 + seek(0, SEEK_SET);
  49 +}
  50 +
  51 +size_t
  52 +OffsetInputSource::read(char* buffer, size_t length)
  53 +{
  54 + return this->proxied->read(buffer, length);
  55 +}
  56 +
  57 +void
  58 +OffsetInputSource::unreadCh(char ch)
  59 +{
  60 + this->proxied->unreadCh(ch);
  61 +}
libqpdf/QPDF.cc
@@ -13,6 +13,7 @@ @@ -13,6 +13,7 @@
13 #include <qpdf/Pl_Discard.hh> 13 #include <qpdf/Pl_Discard.hh>
14 #include <qpdf/FileInputSource.hh> 14 #include <qpdf/FileInputSource.hh>
15 #include <qpdf/BufferInputSource.hh> 15 #include <qpdf/BufferInputSource.hh>
  16 +#include <qpdf/OffsetInputSource.hh>
16 17
17 #include <qpdf/QPDFExc.hh> 18 #include <qpdf/QPDFExc.hh>
18 #include <qpdf/QPDF_Null.hh> 19 #include <qpdf/QPDF_Null.hh>
@@ -213,7 +214,7 @@ QPDF::getWarnings() @@ -213,7 +214,7 @@ QPDF::getWarnings()
213 void 214 void
214 QPDF::parse(char const* password) 215 QPDF::parse(char const* password)
215 { 216 {
216 - PCRE header_re("^%PDF-(1.\d+)\b"); 217 + PCRE header_re("\\A((?s).*?)%PDF-(1.\d+)\b");
217 PCRE eof_re("(?s:startxref\\s+(\\d+)\\s+%%EOF\\b)"); 218 PCRE eof_re("(?s:startxref\\s+(\\d+)\\s+%%EOF\\b)");
218 219
219 if (password) 220 if (password)
@@ -221,11 +222,24 @@ QPDF::parse(char const* password) @@ -221,11 +222,24 @@ QPDF::parse(char const* password)
221 this->provided_password = password; 222 this->provided_password = password;
222 } 223 }
223 224
224 - std::string line = this->file->readLine(20); 225 + // Find the header anywhere in the first 1024 bytes of the file.
  226 + char buffer[1044];
  227 + this->file->read(buffer, sizeof(buffer));
  228 + std::string line(buffer);
225 PCRE::Match m1 = header_re.match(line.c_str()); 229 PCRE::Match m1 = header_re.match(line.c_str());
226 if (m1) 230 if (m1)
227 { 231 {
228 - this->pdf_version = m1.getMatch(1); 232 + size_t global_offset = m1.getMatch(1).length();
  233 + if (global_offset != 0)
  234 + {
  235 + // Empirical evidence strongly suggests that when there is
  236 + // leading material prior to the PDF header, all explicit
  237 + // offsets in the file are such that 0 points to the
  238 + // beginning of the header.
  239 + QTC::TC("qpdf", "QPDF global offset");
  240 + this->file = new OffsetInputSource(this->file, global_offset);
  241 + }
  242 + this->pdf_version = m1.getMatch(2);
229 if (atof(this->pdf_version.c_str()) < 1.2) 243 if (atof(this->pdf_version.c_str()) < 1.2)
230 { 244 {
231 this->tokenizer.allowPoundAnywhereInName(); 245 this->tokenizer.allowPoundAnywhereInName();
libqpdf/build.mk
@@ -12,6 +12,7 @@ SRCS_libqpdf = \ @@ -12,6 +12,7 @@ SRCS_libqpdf = \
12 libqpdf/FileInputSource.cc \ 12 libqpdf/FileInputSource.cc \
13 libqpdf/InputSource.cc \ 13 libqpdf/InputSource.cc \
14 libqpdf/MD5.cc \ 14 libqpdf/MD5.cc \
  15 + libqpdf/OffsetInputSource.cc \
15 libqpdf/PCRE.cc \ 16 libqpdf/PCRE.cc \
16 libqpdf/Pipeline.cc \ 17 libqpdf/Pipeline.cc \
17 libqpdf/Pl_AES_PDF.cc \ 18 libqpdf/Pl_AES_PDF.cc \
libqpdf/qpdf/OffsetInputSource.hh 0 → 100644
  1 +#ifndef __QPDF_OFFSETINPUTSOURCE_HH__
  2 +#define __QPDF_OFFSETINPUTSOURCE_HH__
  3 +
  4 +// This class implements an InputSource that proxies for an underlying
  5 +// input source but offset a specific number of bytes.
  6 +
  7 +#include <qpdf/InputSource.hh>
  8 +#include <qpdf/PointerHolder.hh>
  9 +
  10 +class OffsetInputSource: public InputSource
  11 +{
  12 + public:
  13 + OffsetInputSource(PointerHolder<InputSource>, qpdf_offset_t global_offset);
  14 + virtual ~OffsetInputSource();
  15 +
  16 + virtual qpdf_offset_t findAndSkipNextEOL();
  17 + virtual std::string const& getName() const;
  18 + virtual qpdf_offset_t tell();
  19 + virtual void seek(qpdf_offset_t offset, int whence);
  20 + virtual void rewind();
  21 + virtual size_t read(char* buffer, size_t length);
  22 + virtual void unreadCh(char ch);
  23 +
  24 + private:
  25 + PointerHolder<InputSource> proxied;
  26 + qpdf_offset_t global_offset;
  27 +};
  28 +
  29 +#endif // __QPDF_OFFSETINPUTSOURCE_HH__
qpdf/qpdf.testcov
@@ -243,3 +243,4 @@ QPDF_Tokenizer EOF reading appendable token 0 @@ -243,3 +243,4 @@ QPDF_Tokenizer EOF reading appendable token 0
243 QPDFWriter extra header text no newline 0 243 QPDFWriter extra header text no newline 0
244 QPDFWriter extra header text add newline 0 244 QPDFWriter extra header text add newline 0
245 QPDF bogus 0 offset 0 245 QPDF bogus 0 offset 0
  246 +QPDF global offset 0
qpdf/qtest/qpdf.test
@@ -149,7 +149,7 @@ $td-&gt;runtest(&quot;remove page we don&#39;t have&quot;, @@ -149,7 +149,7 @@ $td-&gt;runtest(&quot;remove page we don&#39;t have&quot;,
149 $td->NORMALIZE_NEWLINES); 149 $td->NORMALIZE_NEWLINES);
150 # ---------- 150 # ----------
151 $td->notify("--- Miscellaneous Tests ---"); 151 $td->notify("--- Miscellaneous Tests ---");
152 -$n_tests += 56; 152 +$n_tests += 57;
153 153
154 $td->runtest("qpdf version", 154 $td->runtest("qpdf version",
155 {$td->COMMAND => "qpdf --version"}, 155 {$td->COMMAND => "qpdf --version"},
@@ -414,6 +414,10 @@ $td-&gt;runtest(&quot;object with zero offset&quot;, @@ -414,6 +414,10 @@ $td-&gt;runtest(&quot;object with zero offset&quot;,
414 {$td->COMMAND => "qpdf --check zero-offset.pdf"}, 414 {$td->COMMAND => "qpdf --check zero-offset.pdf"},
415 {$td->FILE => "zero-offset.out", $td->EXIT_STATUS => 3}, 415 {$td->FILE => "zero-offset.out", $td->EXIT_STATUS => 3},
416 $td->NORMALIZE_NEWLINES); 416 $td->NORMALIZE_NEWLINES);
  417 +$td->runtest("check file with leading junk",
  418 + {$td->COMMAND => "qpdf --check leading-junk.pdf"},
  419 + {$td->FILE => "leading-junk.out", $td->EXIT_STATUS => 0},
  420 + $td->NORMALIZE_NEWLINES);
417 421
418 show_ntests(); 422 show_ntests();
419 # ---------- 423 # ----------
qpdf/qtest/qpdf/leading-junk.out 0 → 100644
  1 +checking leading-junk.pdf
  2 +PDF Version: 1.4
  3 +R = 3
  4 +P = -4
  5 +User password =
  6 +extract for accessibility: allowed
  7 +extract for any purpose: allowed
  8 +print low resolution: allowed
  9 +print high resolution: allowed
  10 +modify document assembly: allowed
  11 +modify forms: allowed
  12 +modify annotations: allowed
  13 +modify other: allowed
  14 +modify anything: allowed
  15 +File is linearized
  16 +No syntax or stream encoding errors found; the file may still contain
  17 +errors that qpdf cannot detect
qpdf/qtest/qpdf/leading-junk.pdf 0 → 100644
No preview for this file type