Commit 7f84239cad2ec58166245394e56a4647085e025e
1 parent
bcfc9847
Find PDF header anywhere in the first 1024 bytes
Showing
10 changed files
with
137 additions
and
13 deletions
ChangeLog
| 1 | +2012-12-25 Jay Berkenbilt <ejb@ql.org> | |
| 2 | + | |
| 3 | + * Allow PDF header to appear anywhere in the first 1024 bytes of | |
| 4 | + the file as recommended in the implementation notes of the Adobe | |
| 5 | + version of the PDF spec. | |
| 6 | + | |
| 1 | 7 | 2012-11-20 Jay Berkenbilt <ejb@ql.org> |
| 2 | 8 | |
| 3 | 9 | * Add zlib and libpcre to Requires.private in the pkg-config file | ... | ... |
TODO
libqpdf/OffsetInputSource.cc
0 → 100644
| 1 | +#include <qpdf/OffsetInputSource.hh> | |
| 2 | + | |
| 3 | +OffsetInputSource::OffsetInputSource(PointerHolder<InputSource> proxied, | |
| 4 | + qpdf_offset_t global_offset) : | |
| 5 | + proxied(proxied), | |
| 6 | + global_offset(global_offset) | |
| 7 | +{ | |
| 8 | +} | |
| 9 | + | |
| 10 | +OffsetInputSource::~OffsetInputSource() | |
| 11 | +{ | |
| 12 | +} | |
| 13 | + | |
| 14 | +qpdf_offset_t | |
| 15 | +OffsetInputSource::findAndSkipNextEOL() | |
| 16 | +{ | |
| 17 | + return this->proxied->findAndSkipNextEOL() - this->global_offset; | |
| 18 | +} | |
| 19 | + | |
| 20 | +std::string const& | |
| 21 | +OffsetInputSource::getName() const | |
| 22 | +{ | |
| 23 | + return this->proxied->getName(); | |
| 24 | +} | |
| 25 | + | |
| 26 | +qpdf_offset_t | |
| 27 | +OffsetInputSource::tell() | |
| 28 | +{ | |
| 29 | + return this->proxied->tell() - this->global_offset; | |
| 30 | +} | |
| 31 | + | |
| 32 | +void | |
| 33 | +OffsetInputSource::seek(qpdf_offset_t offset, int whence) | |
| 34 | +{ | |
| 35 | + if (whence == SEEK_SET) | |
| 36 | + { | |
| 37 | + this->proxied->seek(offset + global_offset, whence); | |
| 38 | + } | |
| 39 | + else | |
| 40 | + { | |
| 41 | + this->proxied->seek(offset, whence); | |
| 42 | + } | |
| 43 | +} | |
| 44 | + | |
| 45 | +void | |
| 46 | +OffsetInputSource::rewind() | |
| 47 | +{ | |
| 48 | + seek(0, SEEK_SET); | |
| 49 | +} | |
| 50 | + | |
| 51 | +size_t | |
| 52 | +OffsetInputSource::read(char* buffer, size_t length) | |
| 53 | +{ | |
| 54 | + return this->proxied->read(buffer, length); | |
| 55 | +} | |
| 56 | + | |
| 57 | +void | |
| 58 | +OffsetInputSource::unreadCh(char ch) | |
| 59 | +{ | |
| 60 | + this->proxied->unreadCh(ch); | |
| 61 | +} | ... | ... |
libqpdf/QPDF.cc
| ... | ... | @@ -13,6 +13,7 @@ |
| 13 | 13 | #include <qpdf/Pl_Discard.hh> |
| 14 | 14 | #include <qpdf/FileInputSource.hh> |
| 15 | 15 | #include <qpdf/BufferInputSource.hh> |
| 16 | +#include <qpdf/OffsetInputSource.hh> | |
| 16 | 17 | |
| 17 | 18 | #include <qpdf/QPDFExc.hh> |
| 18 | 19 | #include <qpdf/QPDF_Null.hh> |
| ... | ... | @@ -213,7 +214,7 @@ QPDF::getWarnings() |
| 213 | 214 | void |
| 214 | 215 | QPDF::parse(char const* password) |
| 215 | 216 | { |
| 216 | - PCRE header_re("^%PDF-(1.\d+)\b"); | |
| 217 | + PCRE header_re("\\A((?s).*?)%PDF-(1.\d+)\b"); | |
| 217 | 218 | PCRE eof_re("(?s:startxref\\s+(\\d+)\\s+%%EOF\\b)"); |
| 218 | 219 | |
| 219 | 220 | if (password) |
| ... | ... | @@ -221,11 +222,24 @@ QPDF::parse(char const* password) |
| 221 | 222 | this->provided_password = password; |
| 222 | 223 | } |
| 223 | 224 | |
| 224 | - std::string line = this->file->readLine(20); | |
| 225 | + // Find the header anywhere in the first 1024 bytes of the file. | |
| 226 | + char buffer[1044]; | |
| 227 | + this->file->read(buffer, sizeof(buffer)); | |
| 228 | + std::string line(buffer); | |
| 225 | 229 | PCRE::Match m1 = header_re.match(line.c_str()); |
| 226 | 230 | if (m1) |
| 227 | 231 | { |
| 228 | - this->pdf_version = m1.getMatch(1); | |
| 232 | + size_t global_offset = m1.getMatch(1).length(); | |
| 233 | + if (global_offset != 0) | |
| 234 | + { | |
| 235 | + // Empirical evidence strongly suggests that when there is | |
| 236 | + // leading material prior to the PDF header, all explicit | |
| 237 | + // offsets in the file are such that 0 points to the | |
| 238 | + // beginning of the header. | |
| 239 | + QTC::TC("qpdf", "QPDF global offset"); | |
| 240 | + this->file = new OffsetInputSource(this->file, global_offset); | |
| 241 | + } | |
| 242 | + this->pdf_version = m1.getMatch(2); | |
| 229 | 243 | if (atof(this->pdf_version.c_str()) < 1.2) |
| 230 | 244 | { |
| 231 | 245 | this->tokenizer.allowPoundAnywhereInName(); | ... | ... |
libqpdf/build.mk
libqpdf/qpdf/OffsetInputSource.hh
0 → 100644
| 1 | +#ifndef __QPDF_OFFSETINPUTSOURCE_HH__ | |
| 2 | +#define __QPDF_OFFSETINPUTSOURCE_HH__ | |
| 3 | + | |
| 4 | +// This class implements an InputSource that proxies for an underlying | |
| 5 | +// input source but offset by a specific number of bytes. | |
| 6 | + | |
| 7 | +#include <qpdf/InputSource.hh> | |
| 8 | +#include <qpdf/PointerHolder.hh> | |
| 9 | + | |
| 10 | +class OffsetInputSource: public InputSource | |
| 11 | +{ | |
| 12 | + public: | |
| 13 | + OffsetInputSource(PointerHolder<InputSource>, qpdf_offset_t global_offset); | |
| 14 | + virtual ~OffsetInputSource(); | |
| 15 | + | |
| 16 | + virtual qpdf_offset_t findAndSkipNextEOL(); | |
| 17 | + virtual std::string const& getName() const; | |
| 18 | + virtual qpdf_offset_t tell(); | |
| 19 | + virtual void seek(qpdf_offset_t offset, int whence); | |
| 20 | + virtual void rewind(); | |
| 21 | + virtual size_t read(char* buffer, size_t length); | |
| 22 | + virtual void unreadCh(char ch); | |
| 23 | + | |
| 24 | + private: | |
| 25 | + PointerHolder<InputSource> proxied; | |
| 26 | + qpdf_offset_t global_offset; | |
| 27 | +}; | |
| 28 | + | |
| 29 | +#endif // __QPDF_OFFSETINPUTSOURCE_HH__ | ... | ... |
qpdf/qpdf.testcov
qpdf/qtest/qpdf.test
| ... | ... | @@ -149,7 +149,7 @@ $td->runtest("remove page we don't have", |
| 149 | 149 | $td->NORMALIZE_NEWLINES); |
| 150 | 150 | # ---------- |
| 151 | 151 | $td->notify("--- Miscellaneous Tests ---"); |
| 152 | -$n_tests += 56; | |
| 152 | +$n_tests += 57; | |
| 153 | 153 | |
| 154 | 154 | $td->runtest("qpdf version", |
| 155 | 155 | {$td->COMMAND => "qpdf --version"}, |
| ... | ... | @@ -414,6 +414,10 @@ $td->runtest("object with zero offset", |
| 414 | 414 | {$td->COMMAND => "qpdf --check zero-offset.pdf"}, |
| 415 | 415 | {$td->FILE => "zero-offset.out", $td->EXIT_STATUS => 3}, |
| 416 | 416 | $td->NORMALIZE_NEWLINES); |
| 417 | +$td->runtest("check file with leading junk", | |
| 418 | + {$td->COMMAND => "qpdf --check leading-junk.pdf"}, | |
| 419 | + {$td->FILE => "leading-junk.out", $td->EXIT_STATUS => 0}, | |
| 420 | + $td->NORMALIZE_NEWLINES); | |
| 417 | 421 | |
| 418 | 422 | show_ntests(); |
| 419 | 423 | # ---------- | ... | ... |
qpdf/qtest/qpdf/leading-junk.out
0 → 100644
| 1 | +checking leading-junk.pdf | |
| 2 | +PDF Version: 1.4 | |
| 3 | +R = 3 | |
| 4 | +P = -4 | |
| 5 | +User password = | |
| 6 | +extract for accessibility: allowed | |
| 7 | +extract for any purpose: allowed | |
| 8 | +print low resolution: allowed | |
| 9 | +print high resolution: allowed | |
| 10 | +modify document assembly: allowed | |
| 11 | +modify forms: allowed | |
| 12 | +modify annotations: allowed | |
| 13 | +modify other: allowed | |
| 14 | +modify anything: allowed | |
| 15 | +File is linearized | |
| 16 | +No syntax or stream encoding errors found; the file may still contain | |
| 17 | +errors that qpdf cannot detect | ... | ... |
qpdf/qtest/qpdf/leading-junk.pdf
0 → 100644
No preview for this file type