Commit 7f84239cad2ec58166245394e56a4647085e025e
1 parent
bcfc9847
Find PDF header anywhere in the first 1024 bytes
Showing
10 changed files
with
137 additions
and
13 deletions
ChangeLog
| 1 | +2012-12-25 Jay Berkenbilt <ejb@ql.org> | ||
| 2 | + | ||
| 3 | + * Allow PDF header to appear anywhere in the first 1024 bytes of | ||
| 4 | + the file as recommended in the implementation notes of the Adobe | ||
| 5 | + version of the PDF spec. | ||
| 6 | + | ||
| 1 | 2012-11-20 Jay Berkenbilt <ejb@ql.org> | 7 | 2012-11-20 Jay Berkenbilt <ejb@ql.org> |
| 2 | 8 | ||
| 3 | * Add zlib and libpcre to Requires.private in the pkg-config file | 9 | * Add zlib and libpcre to Requires.private in the pkg-config file |
TODO
libqpdf/OffsetInputSource.cc
0 → 100644
#include <qpdf/OffsetInputSource.hh>

#include <limits>
#include <stdexcept>
| 2 | + | ||
| 3 | +OffsetInputSource::OffsetInputSource(PointerHolder<InputSource> proxied, | ||
| 4 | + qpdf_offset_t global_offset) : | ||
| 5 | + proxied(proxied), | ||
| 6 | + global_offset(global_offset) | ||
| 7 | +{ | ||
| 8 | +} | ||
| 9 | + | ||
| 10 | +OffsetInputSource::~OffsetInputSource() | ||
| 11 | +{ | ||
| 12 | +} | ||
| 13 | + | ||
| 14 | +qpdf_offset_t | ||
| 15 | +OffsetInputSource::findAndSkipNextEOL() | ||
| 16 | +{ | ||
| 17 | + return this->proxied->findAndSkipNextEOL() - this->global_offset; | ||
| 18 | +} | ||
| 19 | + | ||
| 20 | +std::string const& | ||
| 21 | +OffsetInputSource::getName() const | ||
| 22 | +{ | ||
| 23 | + return this->proxied->getName(); | ||
| 24 | +} | ||
| 25 | + | ||
| 26 | +qpdf_offset_t | ||
| 27 | +OffsetInputSource::tell() | ||
| 28 | +{ | ||
| 29 | + return this->proxied->tell() - this->global_offset; | ||
| 30 | +} | ||
| 31 | + | ||
| 32 | +void | ||
| 33 | +OffsetInputSource::seek(qpdf_offset_t offset, int whence) | ||
| 34 | +{ | ||
| 35 | + if (whence == SEEK_SET) | ||
| 36 | + { | ||
| 37 | + this->proxied->seek(offset + global_offset, whence); | ||
| 38 | + } | ||
| 39 | + else | ||
| 40 | + { | ||
| 41 | + this->proxied->seek(offset, whence); | ||
| 42 | + } | ||
| 43 | +} | ||
| 44 | + | ||
| 45 | +void | ||
| 46 | +OffsetInputSource::rewind() | ||
| 47 | +{ | ||
| 48 | + seek(0, SEEK_SET); | ||
| 49 | +} | ||
| 50 | + | ||
| 51 | +size_t | ||
| 52 | +OffsetInputSource::read(char* buffer, size_t length) | ||
| 53 | +{ | ||
| 54 | + return this->proxied->read(buffer, length); | ||
| 55 | +} | ||
| 56 | + | ||
| 57 | +void | ||
| 58 | +OffsetInputSource::unreadCh(char ch) | ||
| 59 | +{ | ||
| 60 | + this->proxied->unreadCh(ch); | ||
| 61 | +} |
libqpdf/QPDF.cc
| @@ -13,6 +13,7 @@ | @@ -13,6 +13,7 @@ | ||
| 13 | #include <qpdf/Pl_Discard.hh> | 13 | #include <qpdf/Pl_Discard.hh> |
| 14 | #include <qpdf/FileInputSource.hh> | 14 | #include <qpdf/FileInputSource.hh> |
| 15 | #include <qpdf/BufferInputSource.hh> | 15 | #include <qpdf/BufferInputSource.hh> |
| 16 | +#include <qpdf/OffsetInputSource.hh> | ||
| 16 | 17 | ||
| 17 | #include <qpdf/QPDFExc.hh> | 18 | #include <qpdf/QPDFExc.hh> |
| 18 | #include <qpdf/QPDF_Null.hh> | 19 | #include <qpdf/QPDF_Null.hh> |
| @@ -213,7 +214,7 @@ QPDF::getWarnings() | @@ -213,7 +214,7 @@ QPDF::getWarnings() | ||
| 213 | void | 214 | void |
| 214 | QPDF::parse(char const* password) | 215 | QPDF::parse(char const* password) |
| 215 | { | 216 | { |
| 216 | - PCRE header_re("^%PDF-(1.\d+)\b"); | 217 | + PCRE header_re("\\A((?s).*?)%PDF-(1.\d+)\b"); |
| 217 | PCRE eof_re("(?s:startxref\\s+(\\d+)\\s+%%EOF\\b)"); | 218 | PCRE eof_re("(?s:startxref\\s+(\\d+)\\s+%%EOF\\b)"); |
| 218 | 219 | ||
| 219 | if (password) | 220 | if (password) |
| @@ -221,11 +222,24 @@ QPDF::parse(char const* password) | @@ -221,11 +222,24 @@ QPDF::parse(char const* password) | ||
| 221 | this->provided_password = password; | 222 | this->provided_password = password; |
| 222 | } | 223 | } |
| 223 | 224 | ||
| 224 | - std::string line = this->file->readLine(20); | 225 | + // Find the header anywhere in the first 1024 bytes of the file. |
| 226 | + char buffer[1044]; | ||
| 227 | + this->file->read(buffer, sizeof(buffer)); | ||
| 228 | + std::string line(buffer); | ||
| 225 | PCRE::Match m1 = header_re.match(line.c_str()); | 229 | PCRE::Match m1 = header_re.match(line.c_str()); |
| 226 | if (m1) | 230 | if (m1) |
| 227 | { | 231 | { |
| 228 | - this->pdf_version = m1.getMatch(1); | 232 | + size_t global_offset = m1.getMatch(1).length(); |
| 233 | + if (global_offset != 0) | ||
| 234 | + { | ||
| 235 | + // Empirical evidence strongly suggests that when there is | ||
| 236 | + // leading material prior to the PDF header, all explicit | ||
| 237 | + // offsets in the file are such that 0 points to the | ||
| 238 | + // beginning of the header. | ||
| 239 | + QTC::TC("qpdf", "QPDF global offset"); | ||
| 240 | + this->file = new OffsetInputSource(this->file, global_offset); | ||
| 241 | + } | ||
| 242 | + this->pdf_version = m1.getMatch(2); | ||
| 229 | if (atof(this->pdf_version.c_str()) < 1.2) | 243 | if (atof(this->pdf_version.c_str()) < 1.2) |
| 230 | { | 244 | { |
| 231 | this->tokenizer.allowPoundAnywhereInName(); | 245 | this->tokenizer.allowPoundAnywhereInName(); |
libqpdf/build.mk
| @@ -12,6 +12,7 @@ SRCS_libqpdf = \ | @@ -12,6 +12,7 @@ SRCS_libqpdf = \ | ||
| 12 | libqpdf/FileInputSource.cc \ | 12 | libqpdf/FileInputSource.cc \ |
| 13 | libqpdf/InputSource.cc \ | 13 | libqpdf/InputSource.cc \ |
| 14 | libqpdf/MD5.cc \ | 14 | libqpdf/MD5.cc \ |
| 15 | + libqpdf/OffsetInputSource.cc \ | ||
| 15 | libqpdf/PCRE.cc \ | 16 | libqpdf/PCRE.cc \ |
| 16 | libqpdf/Pipeline.cc \ | 17 | libqpdf/Pipeline.cc \ |
| 17 | libqpdf/Pl_AES_PDF.cc \ | 18 | libqpdf/Pl_AES_PDF.cc \ |
libqpdf/qpdf/OffsetInputSource.hh
0 → 100644
| 1 | +#ifndef __QPDF_OFFSETINPUTSOURCE_HH__ | ||
| 2 | +#define __QPDF_OFFSETINPUTSOURCE_HH__ | ||
| 3 | + | ||
| 4 | +// This class implements an InputSource that proxies for an underlying | ||
| 5 | +// input source but is offset by a specific number of bytes. | ||
| 6 | + | ||
| 7 | +#include <qpdf/InputSource.hh> | ||
| 8 | +#include <qpdf/PointerHolder.hh> | ||
| 9 | + | ||
| 10 | +class OffsetInputSource: public InputSource | ||
| 11 | +{ | ||
| 12 | + public: | ||
| 13 | + OffsetInputSource(PointerHolder<InputSource>, qpdf_offset_t global_offset); | ||
| 14 | + virtual ~OffsetInputSource(); | ||
| 15 | + | ||
| 16 | + virtual qpdf_offset_t findAndSkipNextEOL(); | ||
| 17 | + virtual std::string const& getName() const; | ||
| 18 | + virtual qpdf_offset_t tell(); | ||
| 19 | + virtual void seek(qpdf_offset_t offset, int whence); | ||
| 20 | + virtual void rewind(); | ||
| 21 | + virtual size_t read(char* buffer, size_t length); | ||
| 22 | + virtual void unreadCh(char ch); | ||
| 23 | + | ||
| 24 | + private: | ||
| 25 | + PointerHolder<InputSource> proxied; | ||
| 26 | + qpdf_offset_t global_offset; | ||
| 27 | +}; | ||
| 28 | + | ||
| 29 | +#endif // __QPDF_OFFSETINPUTSOURCE_HH__ |
qpdf/qpdf.testcov
| @@ -243,3 +243,4 @@ QPDF_Tokenizer EOF reading appendable token 0 | @@ -243,3 +243,4 @@ QPDF_Tokenizer EOF reading appendable token 0 | ||
| 243 | QPDFWriter extra header text no newline 0 | 243 | QPDFWriter extra header text no newline 0 |
| 244 | QPDFWriter extra header text add newline 0 | 244 | QPDFWriter extra header text add newline 0 |
| 245 | QPDF bogus 0 offset 0 | 245 | QPDF bogus 0 offset 0 |
| 246 | +QPDF global offset 0 |
qpdf/qtest/qpdf.test
| @@ -149,7 +149,7 @@ $td->runtest("remove page we don't have", | @@ -149,7 +149,7 @@ $td->runtest("remove page we don't have", | ||
| 149 | $td->NORMALIZE_NEWLINES); | 149 | $td->NORMALIZE_NEWLINES); |
| 150 | # ---------- | 150 | # ---------- |
| 151 | $td->notify("--- Miscellaneous Tests ---"); | 151 | $td->notify("--- Miscellaneous Tests ---"); |
| 152 | -$n_tests += 56; | 152 | +$n_tests += 57; |
| 153 | 153 | ||
| 154 | $td->runtest("qpdf version", | 154 | $td->runtest("qpdf version", |
| 155 | {$td->COMMAND => "qpdf --version"}, | 155 | {$td->COMMAND => "qpdf --version"}, |
| @@ -414,6 +414,10 @@ $td->runtest("object with zero offset", | @@ -414,6 +414,10 @@ $td->runtest("object with zero offset", | ||
| 414 | {$td->COMMAND => "qpdf --check zero-offset.pdf"}, | 414 | {$td->COMMAND => "qpdf --check zero-offset.pdf"}, |
| 415 | {$td->FILE => "zero-offset.out", $td->EXIT_STATUS => 3}, | 415 | {$td->FILE => "zero-offset.out", $td->EXIT_STATUS => 3}, |
| 416 | $td->NORMALIZE_NEWLINES); | 416 | $td->NORMALIZE_NEWLINES); |
| 417 | +$td->runtest("check file with leading junk", | ||
| 418 | + {$td->COMMAND => "qpdf --check leading-junk.pdf"}, | ||
| 419 | + {$td->FILE => "leading-junk.out", $td->EXIT_STATUS => 0}, | ||
| 420 | + $td->NORMALIZE_NEWLINES); | ||
| 417 | 421 | ||
| 418 | show_ntests(); | 422 | show_ntests(); |
| 419 | # ---------- | 423 | # ---------- |
qpdf/qtest/qpdf/leading-junk.out
0 → 100644
| 1 | +checking leading-junk.pdf | ||
| 2 | +PDF Version: 1.4 | ||
| 3 | +R = 3 | ||
| 4 | +P = -4 | ||
| 5 | +User password = | ||
| 6 | +extract for accessibility: allowed | ||
| 7 | +extract for any purpose: allowed | ||
| 8 | +print low resolution: allowed | ||
| 9 | +print high resolution: allowed | ||
| 10 | +modify document assembly: allowed | ||
| 11 | +modify forms: allowed | ||
| 12 | +modify annotations: allowed | ||
| 13 | +modify other: allowed | ||
| 14 | +modify anything: allowed | ||
| 15 | +File is linearized | ||
| 16 | +No syntax or stream encoding errors found; the file may still contain | ||
| 17 | +errors that qpdf cannot detect |
qpdf/qtest/qpdf/leading-junk.pdf
0 → 100644
No preview for this file type