During xref reconstruction reject unreasonably large objects

Reject objects containing arrays or dictionaries with more than 5000 elements. We are by definition dealing with damaged files, and such objects are extremely likely to be invalid or malicious.

During xref reconstruction reject unreasonably large objects
Reject objects containing arrays or dictionaries with more than 5000 elements. We are by definition dealing with damaged files, and such objects are extremely likely to be invalid or malicious.
m-holger
1 parent 8ef5cfad
Showing 7 changed files with 40 additions and 15 deletions
fuzz/CMakeLists.txt
fuzz/qpdf_extra/5109284021272576.fuzz
fuzz/qtest/fuzz.test
libqpdf/QPDFParser.cc
libqpdf/QPDF_objects.cc
libqpdf/qpdf/QPDFParser.hh
libqpdf/qpdf/QPDF_private.hh
@@ -158,6 +158,7 @@ set(CORPUS_OTHER
   398060137.fuzz
   409905355.fuzz
   411312393.fuzz
+  5109284021272576.fuzz
 )
  
 set(CORPUS_DIR ${CMAKE_CURRENT_BINARY_DIR}/qpdf_corpus)
@@ -11,7 +11,7 @@ my $td = new TestDriver('fuzz');
  
 my $qpdf_corpus = $ENV{'QPDF_FUZZ_CORPUS'} || die "must set QPDF_FUZZ_CORPUS";
  
-my $n_qpdf_files = 95;       # increment when adding new files
+my $n_qpdf_files = 96;       # increment when adding new files
  
 my @fuzzers = (
     ['ascii85' => 1],
@@ -71,7 +71,8 @@ QPDFParser::parse(
     std::string const& object_description,
     qpdf::Tokenizer& tokenizer,
     QPDFObjectHandle::StringDecrypter* decrypter,
-    QPDF& context)
+    QPDF& context,
+    bool sanity_checks)
 {
     bool empty{false};
     auto result = QPDFParser(
@@ -81,7 +82,10 @@ QPDFParser::parse(
                       tokenizer,
                       decrypter,
                       &context,
-                      true)
+                      true,
+                      0,
+                      0,
+                      sanity_checks)
                       .parse(empty, false);
     return {result, empty};
 }
@@ -298,7 +302,7 @@ QPDFParser::parseRemainder(bool content_stream)
             continue;
  
         case QPDFTokenizer::tt_array_close:
-            if (bad_count && !max_bad_count) {
+            if ((bad_count || sanity_checks) && !max_bad_count) {
                 // Trigger warning.
                 (void)tooManyBadTokens();
                 return {QPDFObject::create<QPDF_Null>()};
@@ -329,7 +333,7 @@ QPDFParser::parseRemainder(bool content_stream)
             continue;
  
         case QPDFTokenizer::tt_dict_close:
-            if (bad_count && !max_bad_count) {
+            if ((bad_count || sanity_checks) && !max_bad_count) {
                 // Trigger warning.
                 (void)tooManyBadTokens();
                 return {QPDFObject::create<QPDF_Null>()};
@@ -514,7 +518,8 @@ template <typename T, typename... Args>
 void
 QPDFParser::addScalar(Args&&... args)
 {
-    if (bad_count && (frame->olist.size() > 5'000 || frame->dict.size() > 5'000)) {
+    if ((bad_count || sanity_checks) &&
+        (frame->olist.size() > 5'000 || frame->dict.size() > 5'000)) {
         // Stop adding scalars. We are going to abort when the close token or a bad token is
         // encountered.
         max_bad_count = 0;
@@ -572,10 +577,15 @@ bool
 QPDFParser::tooManyBadTokens()
 {
     if (frame->olist.size() > 5'000 || frame->dict.size() > 5'000) {
+        if (bad_count) {
+            warn(
+                "encountered errors while parsing an array or dictionary with more than 5000 "
+                "elements; giving up on reading object");
+            return true;
+        }
         warn(
-            "encountered errors while parsing an array or dictionary with more than 5000 "
-            "elements; giving up on reading object");
-        return true;
+            "encountered an array or dictionary with more than 5000 elements during xref recovery; "
+            "giving up on reading object");
     }
     if (--max_bad_count > 0 && good_count > 4) {
         good_count = 0;
@@ -200,6 +200,7 @@ QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref)
     };
  
     m->reconstructed_xref = true;
+    m->in_xref_reconstruction = true;
     // We may find more objects, which may contain dangling references.
     m->fixed_dangling_refs = false;
  
@@ -377,6 +378,8 @@ QPDF::reconstruct_xref(QPDFExc& e, bool found_startxref)
             throw damagedPDF("", -1, "unable to find any pages while recovering damaged file");
         }
     }
+
+    m->in_xref_reconstruction = false;
     // We could iterate through the objects looking for streams and try to find objects inside of
     // them, but it's probably not worth the trouble.  Acrobat can't recover files with any errors
     // in an xref stream, and this would be a real long shot anyway.  If we wanted to do anything
@@ -1154,7 +1157,8 @@ QPDFObjectHandle
 QPDF::readTrailer()
 {
     qpdf_offset_t offset = m->file->tell();
-    auto [object, empty] = QPDFParser::parse(*m->file, "trailer", m->tokenizer, nullptr, *this);
+    auto [object, empty] = QPDFParser::parse(
+        *m->file, "trailer", m->tokenizer, nullptr, *this, m->in_xref_reconstruction);
     if (empty) {
         // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
         // actual PDF files and Adobe Reader appears to ignore them.
@@ -1175,8 +1179,13 @@ QPDF::readObject(std::string const& description, QPDFObjGen og)
  
     StringDecrypter decrypter{this, og};
     StringDecrypter* decrypter_ptr = m->encp->encrypted ? &decrypter : nullptr;
-    auto [object, empty] =
-        QPDFParser::parse(*m->file, m->last_object_description, m->tokenizer, decrypter_ptr, *this);
+    auto [object, empty] = QPDFParser::parse(
+        *m->file,
+        m->last_object_description,
+        m->tokenizer,
+        decrypter_ptr,
+        *this,
+        m->in_xref_reconstruction);
     if (empty) {
         // Nothing in the PDF spec appears to allow empty objects, but they have been encountered in
         // actual PDF files and Adobe Reader appears to ignore them.
@@ -36,7 +36,8 @@ class QPDFParser
         std::string const& object_description,
         qpdf::Tokenizer& tokenizer,
         QPDFObjectHandle::StringDecrypter* decrypter,
-        QPDF& context);
+        QPDF& context,
+        bool sanity_checks);
  
     static std::pair<QPDFObjectHandle, bool> parse(
         qpdf::is::OffsetBuffer& input,
@@ -63,7 +64,8 @@ class QPDFParser
         QPDF* context,
         bool parse_pdf,
         int stream_id = 0,
-        int obj_id = 0) :
+        int obj_id = 0,
+        bool sanity_checks = false) :
         input(input),
         object_description(object_description),
         tokenizer(tokenizer),
@@ -72,7 +74,8 @@ class QPDFParser
         description(std::move(sp_description)),
         parse_pdf(parse_pdf),
         stream_id(stream_id),
-        obj_id(obj_id)
+        obj_id(obj_id),
+        sanity_checks(sanity_checks)
     {
     }
  
@@ -125,6 +128,7 @@ class QPDFParser
     bool parse_pdf{false};
     int stream_id{0};
     int obj_id{0};
+    bool sanity_checks{false};
  
     std::vector<StackFrame> stack;
     StackFrame* frame{nullptr};
@@ -490,6 +490,7 @@ class QPDF::Members
     // copied_stream_data_provider is owned by copied_streams
     CopiedStreamDataProvider* copied_stream_data_provider{nullptr};
     bool reconstructed_xref{false};
+    bool in_xref_reconstruction{false};
     bool fixed_dangling_refs{false};
     bool immediate_copy_from{false};
     bool in_parse{false};