diff --git a/libqpdf/QPDFParser.cc b/libqpdf/QPDFParser.cc index 5407284..7e823cd 100644 --- a/libqpdf/QPDFParser.cc +++ b/libqpdf/QPDFParser.cc @@ -51,7 +51,6 @@ QPDFObjectHandle QPDFParser::parse(InputSource& input, std::string const& object_description, QPDF* context) { qpdf::Tokenizer tokenizer; - bool empty = false; if (auto result = QPDFParser( input, make_description(input.getName(), object_description), @@ -60,7 +59,7 @@ QPDFParser::parse(InputSource& input, std::string const& object_description, QPD nullptr, context, false) - .parse(empty, false)) { + .parse()) { return result; } return {QPDFObject::create()}; @@ -73,23 +72,24 @@ QPDFParser::parse_content( qpdf::Tokenizer& tokenizer, QPDF* context) { - bool empty = false; - if (auto result = QPDFParser( - input, - std::move(sp_description), - "content", - tokenizer, - nullptr, - context, - true, - 0, - 0, - context && context->doc().reconstructed_xref()) - .parse(empty, true)) { + static const std::string content("content"); // GCC12 - make constexpr + auto p = QPDFParser( + input, + std::move(sp_description), + content, + tokenizer, + nullptr, + context, + true, + 0, + 0, + context && context->doc().reconstructed_xref()); + auto result = p.parse(true); + if (result || p.empty_) { + // In content stream mode, leave object uninitialized to indicate EOF return result; } - // In content stream mode, leave object uninitialized to indicate EOF - return {empty ? nullptr : QPDFObject::create()}; + return {QPDFObject::create()}; } QPDFObjectHandle @@ -101,21 +101,25 @@ QPDFParser::parse( QPDFObjectHandle::StringDecrypter* decrypter, QPDF* context) { - if (auto result = QPDFParser( - input, - make_description(input.getName(), object_description), - object_description, - *tokenizer.m, - decrypter, - context, - false) - .parse(empty, false)) { + // ABI: This parse overload is only used by the deprecated QPDFObjectHandle::parse. It is the + // only user of the 'empty' member. When removing this overload also remove 'empty'. + auto p = QPDFParser( + input, + make_description(input.getName(), object_description), + object_description, + *tokenizer.m, + decrypter, + context, + false); + auto result = p.parse(); + empty = p.empty_; + if (result) { return result; } return {QPDFObject::create()}; } -std::pair +QPDFObjectHandle QPDFParser::parse( InputSource& input, std::string const& object_description, @@ -124,55 +128,44 @@ QPDFParser::parse( QPDF& context, bool sanity_checks) { - bool empty{false}; - auto result = QPDFParser( - input, - make_description(input.getName(), object_description), - object_description, - tokenizer, - decrypter, - &context, - true, - 0, - 0, - sanity_checks) - .parse(empty, false); - if (result) { - return {result, empty}; - } - return {QPDFObject::create(), empty}; + return QPDFParser( + input, + make_description(input.getName(), object_description), + object_description, + tokenizer, + decrypter, + &context, + true, + 0, + 0, + sanity_checks) + .parse(); } -std::pair +QPDFObjectHandle QPDFParser::parse( is::OffsetBuffer& input, int stream_id, int obj_id, qpdf::Tokenizer& tokenizer, QPDF& context) { - bool empty{false}; - auto result = QPDFParser( - input, - std::make_shared( - QPDFObject::ObjStreamDescr(stream_id, obj_id)), - "", - tokenizer, - nullptr, - &context, - true, - stream_id, - obj_id) - .parse(empty, false); - - if (result) { - return {result, empty}; - } - return {QPDFObject::create(), empty}; + return QPDFParser( + input, + std::make_shared( + QPDFObject::ObjStreamDescr(stream_id, obj_id)), + "", + tokenizer, + nullptr, + &context, + true, + stream_id, + obj_id) + .parse(); } QPDFObjectHandle -QPDFParser::parse(bool& empty, bool content_stream) +QPDFParser::parse(bool content_stream) { try { - return parse_first(empty, content_stream); - } catch (Error& e) { + return parse_first(content_stream); + } catch (Error&) { return {}; } catch (QPDFExc& e) { throw e; @@ -185,15 +178,14 @@ QPDFParser::parse(bool& empty, bool content_stream) } QPDFObjectHandle -QPDFParser::parse_first(bool& empty, bool content_stream) +QPDFParser::parse_first(bool content_stream) { // This method must take care not to resolve any objects. Don't check the type of any object // without first ensuring that it is a direct object. Otherwise, doing so may have the side // effect of reading the object and changing the file pointer. If you do this, it will cause a // logic error to be thrown from QPDF::inParse(). - ParseGuard pg(context); - empty = false; + QPDF::Doc::ParseGuard pg(context); start = input.tell(); if (!tokenizer.nextToken(input, object_description)) { warn(tokenizer.getErrorMessage()); @@ -203,7 +195,7 @@ QPDFParser::parse_first(bool& empty, bool content_stream) case QPDFTokenizer::tt_eof: if (content_stream) { // In content stream mode, leave object uninitialized to indicate EOF - empty = true; + empty_ = true; return {}; } warn("unexpected EOF"); @@ -255,10 +247,15 @@ QPDFParser::parse_first(bool& empty, bool content_stream) if (content_stream) { return withDescription(value); } else if (value == "endobj") { - // We just saw endobj without having read anything. Treat this as a null and do - // not move the input source's offset. + // We just saw endobj without having read anything. Nothing in the PDF spec appears + // to allow empty objects, but they have been encountered in actual PDF files and + // Adobe Reader appears to ignore them. Treat this as a null and do not move the + // input source's offset. + empty_ = true; input.seek(input.getLastOffset(), SEEK_SET); - empty = true; + if (!content_stream) { + warn("empty object treated as null"); + } return {}; } else { warn("unknown token while reading object; treating as string"); diff --git a/libqpdf/QPDF_objects.cc b/libqpdf/QPDF_objects.cc index fd96a0d..14f6cd0 100644 --- a/libqpdf/QPDF_objects.cc +++ b/libqpdf/QPDF_objects.cc @@ -1233,13 +1233,9 @@ QPDFObjectHandle Objects::readTrailer() { qpdf_offset_t offset = m->file->tell(); - auto [object, empty] = + auto object = QPDFParser::parse(*m->file, "trailer", m->tokenizer, nullptr, qpdf, m->reconstructed_xref); - if (empty) { - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in - // actual PDF files and Adobe Reader appears to ignore them. - warn(damagedPDF("trailer", "empty object treated as null")); - } else if (object.isDictionary() && m->objects.readToken(*m->file).isWord("stream")) { + if (object.isDictionary() && m->objects.readToken(*m->file).isWord("stream")) { warn(damagedPDF("trailer", m->file->tell(), "stream keyword found in trailer")); } // Override last_offset so that it points to the beginning of the object we just read @@ -1255,19 +1251,15 @@ Objects::readObject(std::string const& description, QPDFObjGen og) StringDecrypter decrypter{&qpdf, og}; StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr; - auto [object, empty] = QPDFParser::parse( + auto object = QPDFParser::parse( *m->file, m->last_object_description, m->tokenizer, decrypter_ptr, qpdf, m->reconstructed_xref || m->in_read_xref_stream); - ; - if (empty) { - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in - // actual PDF files and Adobe Reader appears to ignore them. - warn(damagedPDF(*m->file, m->file->getLastOffset(), "empty object treated as null")); - return object; + if (!object) { + return {}; } auto token = readToken(*m->file); if (object.isDictionary() && token.isWord("stream")) { @@ -1366,24 +1358,6 @@ Objects::validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_off } } -QPDFObjectHandle -Objects::readObjectInStream(is::OffsetBuffer& input, int stream_id, int obj_id) -{ - auto [object, empty] = QPDFParser::parse(input, stream_id, obj_id, m->tokenizer, qpdf); - if (empty) { - // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in - // actual PDF files and Adobe Reader appears to ignore them. - warn(QPDFExc( - qpdf_e_damaged_pdf, - m->file->getName() + " object stream " + std::to_string(stream_id), - +"object " + std::to_string(obj_id) + " 0, offset " + - std::to_string(input.getLastOffset()), - 0, - "empty object treated as null")); - } - return object; -} - bool Objects ::findEndstream() { @@ -1536,25 +1510,25 @@ Objects::readObjectAtOffset( return; } - QPDFObjectHandle oh = readObject(description, og); + if (auto oh = readObject(description, og)) { + // Determine the end offset of this object before and after white space. We use these + // numbers to validate linearization hint tables. Offsets and lengths of objects may imply + // the end of an object to be anywhere between these values. + qpdf_offset_t end_before_space = m->file->tell(); - // Determine the end offset of this object before and after white space. We use these - // numbers to validate linearization hint tables. Offsets and lengths of objects may imply - // the end of an object to be anywhere between these values. - qpdf_offset_t end_before_space = m->file->tell(); - - // skip over spaces - while (true) { - char ch; - if (!m->file->read(&ch, 1)) { - throw damagedPDF(m->file->tell(), "EOF after endobj"); - } - if (!isspace(static_cast(ch))) { - m->file->seek(-1, SEEK_CUR); - break; + // skip over spaces + while (true) { + char ch; + if (!m->file->read(&ch, 1)) { + throw damagedPDF(m->file->tell(), "EOF after endobj"); + } + if (!isspace(static_cast(ch))) { + m->file->seek(-1, SEEK_CUR); + break; + } } + m->objects.updateCache(og, oh.obj_sp(), end_before_space, m->file->tell()); } - m->objects.updateCache(og, oh.obj_sp(), end_before_space, m->file->tell()); } QPDFObjectHandle @@ -1564,7 +1538,7 @@ Objects::readObjectAtOffset( auto og = read_object_start(offset); auto oh = readObject(description, og); - if (!m->objects.isUnresolved(og)) { + if (!oh || !m->objects.isUnresolved(og)) { return oh; } @@ -1804,8 +1778,9 @@ Objects::resolveObjectsInStream(int obj_stream_number) if (entry != m->xref_table.end() && entry->second.getType() == 2 && entry->second.getObjStreamNumber() == obj_stream_number) { is::OffsetBuffer in("", {b_start + obj_offset, obj_size}, obj_offset); - auto oh = readObjectInStream(in, obj_stream_number, obj_id); - updateCache(og, oh.obj_sp(), end_before_space, end_after_space); + if (auto oh = QPDFParser::parse(in, obj_stream_number, obj_id, m->tokenizer, qpdf)) { + updateCache(og, oh.obj_sp(), end_before_space, end_after_space); + } } else { QTC::TC("qpdf", "QPDF not caching overridden objstm object"); } diff --git a/libqpdf/qpdf/QPDFParser.hh b/libqpdf/qpdf/QPDFParser.hh index 195a9a7..062b315 100644 --- a/libqpdf/qpdf/QPDFParser.hh +++ b/libqpdf/qpdf/QPDFParser.hh @@ -41,8 +41,8 @@ class QPDFParser QPDFObjectHandle::StringDecrypter* decrypter, QPDF* context); - // For use by QPDF. Return parsed object and whether it is empty. - static std::pair parse( + // For use by QPDF. + static QPDFObjectHandle parse( InputSource& input, std::string const& object_description, qpdf::Tokenizer& tokenizer, @@ -50,7 +50,7 @@ class QPDFParser QPDF& context, bool sanity_checks); - static std::pair parse( + static QPDFObjectHandle parse( qpdf::is::OffsetBuffer& input, int stream_id, int obj_id, @@ -112,8 +112,8 @@ class QPDFParser int null_count{0}; }; - QPDFObjectHandle parse(bool& empty, bool content_stream); - QPDFObjectHandle parse_first(bool& empty, bool content_stream); + QPDFObjectHandle parse(bool content_stream = false); + QPDFObjectHandle parse_first(bool content_stream); QPDFObjectHandle parseRemainder(bool content_stream); void add(std::shared_ptr&& obj); void addNull(); @@ -158,6 +158,7 @@ class QPDFParser int int_count{0}; long long int_buffer[2]{0, 0}; qpdf_offset_t last_offset_buffer[2]{0, 0}; + bool empty_{false}; }; #endif // QPDFPARSER_HH diff --git a/libqpdf/qpdf/QPDF_private.hh b/libqpdf/qpdf/QPDF_private.hh index a5738eb..6191836 100644 --- a/libqpdf/qpdf/QPDF_private.hh +++ b/libqpdf/qpdf/QPDF_private.hh @@ -1039,7 +1039,6 @@ class QPDF::Doc::Objects: Common QPDFObjectHandle readObject(std::string const& description, QPDFObjGen og); void readStream(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset); void validateStreamLineEnd(QPDFObjectHandle& object, QPDFObjGen og, qpdf_offset_t offset); - QPDFObjectHandle readObjectInStream(qpdf::is::OffsetBuffer& input, int stream_id, int obj_id); size_t recoverStreamLength( std::shared_ptr input, QPDFObjGen og, qpdf_offset_t stream_offset); diff --git a/qpdf/qtest/qpdf/issue-100.out b/qpdf/qtest/qpdf/issue-100.out index 8ff6730..ef71e91 100644 --- a/qpdf/qtest/qpdf/issue-100.out +++ b/qpdf/qtest/qpdf/issue-100.out @@ -7,7 +7,6 @@ WARNING: issue-100.pdf (trailer, offset 950): recovered trailer has no /Root ent WARNING: issue-100.pdf (trailer, offset 488): stream keyword found in trailer WARNING: issue-100.pdf (trailer, offset 418): recovered trailer has no /Root entry WARNING: issue-100.pdf (object 1 0, offset 83): unexpected dictionary close token -WARNING: issue-100.pdf (object 1 0, offset 87): expected endobj WARNING: issue-100.pdf (object 5 0, offset 268): unknown token while reading object; treating as null WARNING: issue-100.pdf (object 5 0, offset 286): unknown token while reading object; treating as null WARNING: issue-100.pdf (object 5 0, offset 289): unknown token while reading object; treating as null @@ -15,9 +14,7 @@ WARNING: issue-100.pdf (object 5 0, offset 294): unknown token while reading obj WARNING: issue-100.pdf (object 5 0, offset 297): unknown token while reading object; treating as null WARNING: issue-100.pdf (object 5 0, offset 304): unknown token while reading object; treating as null WARNING: issue-100.pdf (object 5 0, offset 304): too many errors; giving up on reading object -WARNING: issue-100.pdf (object 5 0, offset 308): expected endobj WARNING: issue-100.pdf (object 8 0, offset 107): invalid character ()) in hexstring -WARNING: issue-100.pdf (object 8 0, offset 109): expected endobj WARNING: issue-100.pdf (object 9 0, offset 527): unknown token while reading object; treating as string WARNING: issue-100.pdf (object 9 0, offset 529): expected endobj WARNING: issue-100.pdf (object 10 0, offset 573): expected endobj diff --git a/qpdf/qtest/qpdf/issue-101.out b/qpdf/qtest/qpdf/issue-101.out index d3fb8f2..f9567b7 100644 --- a/qpdf/qtest/qpdf/issue-101.out +++ b/qpdf/qtest/qpdf/issue-101.out @@ -8,7 +8,6 @@ WARNING: issue-101.pdf (object 11 0, offset 637): unknown token while reading ob WARNING: issue-101.pdf (object 11 0, offset 639): unknown token while reading object; treating as null WARNING: issue-101.pdf (object 11 0, offset 644): unknown token while reading object; treating as null WARNING: issue-101.pdf (object 11 0, offset 644): too many errors; giving up on reading object -WARNING: issue-101.pdf (object 11 0, offset 647): expected endobj WARNING: issue-101.pdf (trailer, offset 4433): recovered trailer has no /Root entry WARNING: issue-101.pdf (trailer, offset 4183): stream keyword found in trailer WARNING: issue-101.pdf (trailer, offset 4113): recovered trailer has no /Root entry @@ -40,9 +39,7 @@ WARNING: issue-101.pdf (object 7 0, offset 3866): unknown token while reading ob WARNING: issue-101.pdf (object 7 0, offset 3873): unknown token while reading object; treating as null WARNING: issue-101.pdf (object 7 0, offset 3879): unknown token while reading object; treating as null WARNING: issue-101.pdf (object 7 0, offset 3879): too many errors; giving up on reading object -WARNING: issue-101.pdf (object 7 0, offset 3888): expected endobj WARNING: issue-101.pdf (object 8 0, offset 4067): invalid character ()) in hexstring -WARNING: issue-101.pdf (object 8 0, offset 4069): expected endobj WARNING: issue-101.pdf (object 9 0, offset 2832): unknown token while reading object; treating as string WARNING: issue-101.pdf (object 9 0, offset 2834): expected endobj qpdf: issue-101.pdf: unable to find trailer dictionary while recovering damaged file diff --git a/qpdf/qtest/qpdf/issue-150.out b/qpdf/qtest/qpdf/issue-150.out index 0dc8338..765f70d 100644 --- a/qpdf/qtest/qpdf/issue-150.out +++ b/qpdf/qtest/qpdf/issue-150.out @@ -1,7 +1,7 @@ WARNING: issue-150.pdf: can't find PDF header WARNING: issue-150.pdf (xref stream: object 8 0, offset 56): treating object as null because of error during parsing : overflow/underflow converting 9900000000000000000 to 64-bit integer -WARNING: issue-150.pdf (xref stream: object 8 0, offset 75): expected endobj WARNING: issue-150.pdf: file is damaged WARNING: issue-150.pdf (offset 4): xref not found WARNING: issue-150.pdf: Attempting to reconstruct cross-reference table +WARNING: issue-150.pdf (object 8 0): object has offset 0 qpdf: issue-150.pdf: unable to find trailer dictionary while recovering damaged file diff --git a/qpdf/qtest/qpdf/issue-1503.out b/qpdf/qtest/qpdf/issue-1503.out index a57585c..e33a852 100644 --- a/qpdf/qtest/qpdf/issue-1503.out +++ b/qpdf/qtest/qpdf/issue-1503.out @@ -6,11 +6,8 @@ WARNING: issue-1503.pdf (object 31 0, offset 813): unknown token while reading o WARNING: issue-1503.pdf (object 31 0, offset 851): unknown token while reading object; treating as null WARNING: issue-1503.pdf (object 31 0, offset 856): unknown token while reading object; treating as null WARNING: issue-1503.pdf (object 31 0, offset 861): unexpected 'endobj' or 'endstream' while reading object; giving up on reading object -WARNING: issue-1503.pdf (object 31 0, offset 871): expected endobj WARNING: issue-1503.pdf (object 38 0, offset 1126): unexpected 'endobj' or 'endstream' while reading object; giving up on reading object -WARNING: issue-1503.pdf (object 38 0, offset 1133): expected endobj WARNING: issue-1503.pdf (object 40 0, offset 1195): unexpected array close token; giving up on reading object -WARNING: issue-1503.pdf (object 40 0, offset 1198): expected endobj WARNING: issue-1503.pdf (object 41 0, offset 1359): stream dictionary lacks /Length key WARNING: issue-1503.pdf (object 41 0, offset 1411): attempting to recover stream length WARNING: issue-1503.pdf (object 41 0, offset 1411): recovered stream length: 54 @@ -22,9 +19,7 @@ WARNING: issue-1503.pdf (object 44 0, offset 1814): name with stray # will not w WARNING: issue-1503.pdf (object 44 0, offset 1821): unknown token while reading object; treating as null WARNING: issue-1503.pdf (object 44 0, offset 1826): unknown token while reading object; treating as null WARNING: issue-1503.pdf (object 44 0, offset 1826): too many errors; giving up on reading object -WARNING: issue-1503.pdf (object 44 0, offset 1829): expected endobj WARNING: issue-1503.pdf (object 46 0, offset 1923): unexpected array close token; giving up on reading object -WARNING: issue-1503.pdf (object 46 0, offset 1926): expected endobj WARNING: issue-1503.pdf (object 47 0, offset 2087): stream dictionary lacks /Length key WARNING: issue-1503.pdf (object 47 0, offset 2139): attempting to recover stream length WARNING: issue-1503.pdf (object 47 0, offset 2139): recovered stream length: 54 @@ -59,8 +54,6 @@ WARNING: issue-1503.pdf (object 151 0, offset 3836): unknown token while reading WARNING: issue-1503.pdf (object 151 0, offset 3958): unknown token while reading object; treating as null WARNING: issue-1503.pdf (object 152 0, offset 4088): parse error while reading object WARNING: issue-1503.pdf (object 152 0, offset 4088): unexpected EOF -WARNING: issue-1503.pdf (object 152 0, offset 4088): expected endobj -WARNING: issue-1503.pdf (object 152 0, offset 4088): EOF after endobj WARNING: issue-1503.pdf (object 155 0, offset 162): unknown token while reading object; treating as null WARNING: issue-1503.pdf (object 155 0, offset 342): unknown token while reading object; treating as null WARNING: issue-1503.pdf (object 155 0, offset 345): unknown token while reading object; treating as null diff --git a/qpdf/qtest/qpdf/issue-99.out b/qpdf/qtest/qpdf/issue-99.out index 5cc37c7..d800d79 100644 --- a/qpdf/qtest/qpdf/issue-99.out +++ b/qpdf/qtest/qpdf/issue-99.out @@ -8,15 +8,12 @@ WARNING: issue-99.pdf (object 1 0, offset 815): unknown token while reading obje WARNING: issue-99.pdf (object 1 0, offset 835): unknown token while reading object; treating as null WARNING: issue-99.pdf (object 1 0, offset 855): unknown token while reading object; treating as null WARNING: issue-99.pdf (object 1 0, offset 855): too many errors; giving up on reading object -WARNING: issue-99.pdf (object 1 0, offset 858): expected endobj WARNING: issue-99.pdf (object 2 0, offset 64): expected endobj WARNING: issue-99.pdf (object 5 0, offset 2452): unknown token while reading object; treating as string WARNING: issue-99.pdf (object 6 0, offset 2506): unexpected array close token; giving up on reading object -WARNING: issue-99.pdf (object 6 0, offset 2507): expected endobj WARNING: issue-99.pdf (object 10 0, offset 3708): expected dictionary keys but found non-name objects; ignoring WARNING: issue-99.pdf (object 11 0, offset 4485): unknown token while reading object; treating as null WARNING: issue-99.pdf (object 11 0, offset 4497): unexpected array close token; giving up on reading object -WARNING: issue-99.pdf (object 11 0, offset 4499): expected endobj WARNING: issue-99.pdf: unable to find trailer dictionary while recovering damaged file WARNING: object 1 0: Pages tree includes non-dictionary object; ignoring WARNING: object 1 0: operation for dictionary attempted on object of type null: ignoring key replacement request diff --git a/qpdf/qtest/qpdf/parse-object.out b/qpdf/qtest/qpdf/parse-object.out index 47843fd..71e0bf5 100644 --- a/qpdf/qtest/qpdf/parse-object.out +++ b/qpdf/qtest/qpdf/parse-object.out @@ -6,6 +6,4 @@ WARNING: parsed object: treating unexpected brace token as null WARNING: parsed object: treating unexpected brace token as null WARNING: parsed object: unexpected dictionary close token WARNING: bad-parse.qdf (object 7 0, offset 1121): unexpected EOF -WARNING: bad-parse.qdf (object 7 0, offset 1121): expected endobj -WARNING: bad-parse.qdf (object 7 0, offset 1121): EOF after endobj test 31 done