Merge branch 'parse_ref' into work

Jay Berkenbilt
2 parents 64c840b1 1285f976
Showing 12 changed files with 555 additions and 305 deletions
libqpdf/QPDFParser.cc
libqpdf/qpdf/QPDFParser.hh
qpdf/qpdf.testcov
qpdf/qtest/parsing.test
qpdf/qtest/qpdf/bad16-recover.out
qpdf/qtest/qpdf/bad16.out
qpdf/qtest/qpdf/bad36-recover.out
qpdf/qtest/qpdf/bad36.out
qpdf/qtest/qpdf/bad39.qdf
qpdf/qtest/qpdf/issue-335a.out
qpdf/qtest/qpdf/parse-object.out
qpdf/test_driver.cc
@@ -21,22 +21,7 @@
 #include <memory>
-namespace
-{
-    struct StackFrame
-    {
-        StackFrame(std::shared_ptr<InputSource> input) :
-            offset(input->tell())
-        {
-        }
-
-        std::vector<std::shared_ptr<QPDFObject>> olist;
-        qpdf_offset_t offset;
-        std::string contents_string{""};
-        qpdf_offset_t contents_offset{-1};
-        int null_count{0};
-    };
-} // namespace
+using ObjectPtr = std::shared_ptr<QPDFObject>;
 QPDFObjectHandle
 QPDFParser::parse(bool& empty, bool content_stream)
@@ -46,371 +31,457 @@ QPDFParser::parse(bool& empty, bool content_stream)
     // effect of reading the object and changing the file pointer. If you do this, it will cause a
     // logic error to be thrown from QPDF::inParse().
-    const static std::shared_ptr<QPDFObject> null_oh = QPDF_Null::create();
     QPDF::ParseGuard pg(context);
-
     empty = false;
+    start = input->tell();
-    std::shared_ptr<QPDFObject> object;
-    bool set_offset = false;
-
-    std::vector<StackFrame> stack;
-    stack.emplace_back(input);
-    std::vector<parser_state_e> state_stack;
-    state_stack.push_back(st_top);
-    qpdf_offset_t offset;
-    bool done = false;
-    int bad_count = 0;
-    int good_count = 0;
-    bool b_contents = false;
-    bool is_null = false;
+    if (!tokenizer.nextToken(*input, object_description)) {
+        warn(tokenizer.getErrorMessage());
+    }
+
+    switch (tokenizer.getType()) {
+    case QPDFTokenizer::tt_eof:
+        if (content_stream) {
+            // In content stream mode, leave object uninitialized to indicate EOF
+            return {};
+        }
+        QTC::TC("qpdf", "QPDFParser eof in parse");
+        warn("unexpected EOF");
+        return {QPDF_Null::create()};
+
+    case QPDFTokenizer::tt_bad:
+        QTC::TC("qpdf", "QPDFParser bad token in parse");
+        return {QPDF_Null::create()};
+
+    case QPDFTokenizer::tt_brace_open:
+    case QPDFTokenizer::tt_brace_close:
+        QTC::TC("qpdf", "QPDFParser bad brace");
+        warn("treating unexpected brace token as null");
+        return {QPDF_Null::create()};
+
+    case QPDFTokenizer::tt_array_close:
+        QTC::TC("qpdf", "QPDFParser bad array close");
+        warn("treating unexpected array close token as null");
+        return {QPDF_Null::create()};
+
+    case QPDFTokenizer::tt_dict_close:
+        QTC::TC("qpdf", "QPDFParser bad dictionary close");
+        warn("unexpected dictionary close token");
+        return {QPDF_Null::create()};
+
+    case QPDFTokenizer::tt_array_open:
+    case QPDFTokenizer::tt_dict_open:
+        stack.clear();
+        stack.emplace_back(
+            input,
+            (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array : st_dictionary_key);
+        frame = &stack.back();
+        return parseRemainder(content_stream);
+
+    case QPDFTokenizer::tt_bool:
+        return withDescription<QPDF_Bool>(tokenizer.getValue() == "true");
+
+    case QPDFTokenizer::tt_null:
+        return {QPDF_Null::create()};
+
+    case QPDFTokenizer::tt_integer:
+        return withDescription<QPDF_Integer>(QUtil::string_to_ll(tokenizer.getValue().c_str()));
+
+    case QPDFTokenizer::tt_real:
+        return withDescription<QPDF_Real>(tokenizer.getValue());
+
+    case QPDFTokenizer::tt_name:
+        return withDescription<QPDF_Name>(tokenizer.getValue());
+
+    case QPDFTokenizer::tt_word:
+        {
+            auto const& value = tokenizer.getValue();
+            if (content_stream) {
+                return withDescription<QPDF_Operator>(value);
+            } else if (value == "endobj") {
+                // We just saw endobj without having read anything.  Treat this as a null and do
+                // not move the input source's offset.
+                input->seek(input->getLastOffset(), SEEK_SET);
+                empty = true;
+                return {QPDF_Null::create()};
+            } else {
+                QTC::TC("qpdf", "QPDFParser treat word as string");
+                warn("unknown token while reading object; treating as string");
+                return withDescription<QPDF_String>(value);
+            }
+        }
+
+    case QPDFTokenizer::tt_string:
+        if (decrypter) {
+            std::string s{tokenizer.getValue()};
+            decrypter->decryptString(s);
+            return withDescription<QPDF_String>(s);
+        } else {
+            return withDescription<QPDF_String>(tokenizer.getValue());
+        }
+
+    default:
+        warn("treating unknown token type as null while reading object");
+        return {QPDF_Null::create()};
+    }
+}
-    while (!done) {
-        bool bad = false;
-        bool indirect_ref = false;
-        is_null = false;
-        auto& frame = stack.back();
-        auto& olist = frame.olist;
-        parser_state_e state = state_stack.back();
-        offset = frame.offset;
+QPDFObjectHandle
+QPDFParser::parseRemainder(bool content_stream)
+{
+    // This method must take care not to resolve any objects. Don't check the type of any object
+    // without first ensuring that it is a direct object. Otherwise, doing so may have the side
+    // effect of reading the object and changing the file pointer. If you do this, it will cause a
+    // logic error to be thrown from QPDF::inParse().
-        object = nullptr;
-        set_offset = false;
+    bad_count = 0;
+    bool b_contents = false;
+    while (true) {
         if (!tokenizer.nextToken(*input, object_description)) {
             warn(tokenizer.getErrorMessage());
         }
+        ++good_count; // optimistically
+
+        if (int_count != 0) {
+            // Special handling of indirect references. Treat integer tokens as part of an indirect
+            // reference until proven otherwise.
+            if (tokenizer.getType() == QPDFTokenizer::tt_integer) {
+                if (++int_count > 2) {
+                    // Process the oldest buffered integer.
+                    addInt(int_count);
+                }
+                last_offset_buffer[int_count % 2] = input->getLastOffset();
+                int_buffer[int_count % 2] = QUtil::string_to_ll(tokenizer.getValue().c_str());
+                continue;
+
+            } else if (
+                int_count >= 2 && tokenizer.getType() == QPDFTokenizer::tt_word &&
+                tokenizer.getValue() == "R") {
+                if (context == nullptr) {
+                    QTC::TC("qpdf", "QPDFParser indirect without context");
+                    throw std::logic_error("QPDFParser::parse called without context on an object "
+                                           "with indirect references");
+                }
+                auto ref_og = QPDFObjGen(
+                    QIntC::to_int(int_buffer[(int_count - 1) % 2]),
+                    QIntC::to_int(int_buffer[(int_count) % 2]));
+                if (ref_og.isIndirect()) {
+                    // This action has the desirable side effect of causing dangling references
+                    // (references to indirect objects that don't appear in the PDF) in any parsed
+                    // object to appear in the object cache.
+                    add(std::move(context->getObject(ref_og).obj));
+                } else {
+                    QTC::TC("qpdf", "QPDFParser indirect with 0 objid");
+                    addNull();
+                }
+                int_count = 0;
+                continue;
+
+            } else if (int_count > 0) {
+                // Process the buffered integers before processing the current token.
+                if (int_count > 1) {
+                    addInt(int_count - 1);
+                }
+                addInt(int_count);
+                int_count = 0;
+            }
+        }
         switch (tokenizer.getType()) {
         case QPDFTokenizer::tt_eof:
-            if (!content_stream) {
-                QTC::TC("qpdf", "QPDFParser eof in parse");
-                warn("unexpected EOF");
+            warn("parse error while reading object");
+            if (content_stream) {
+                // In content stream mode, leave object uninitialized to indicate EOF
+                return {};
             }
-            bad = true;
-            state = st_eof;
-            break;
+            QTC::TC("qpdf", "QPDFParser eof in parseRemainder");
+            warn("unexpected EOF");
+            return {QPDF_Null::create()};
         case QPDFTokenizer::tt_bad:
-            QTC::TC("qpdf", "QPDFParser bad token in parse");
-            bad = true;
-            is_null = true;
-            break;
+            QTC::TC("qpdf", "QPDFParser bad token in parseRemainder");
+            if (tooManyBadTokens()) {
+                return {QPDF_Null::create()};
+            }
+            addNull();
+            continue;
         case QPDFTokenizer::tt_brace_open:
         case QPDFTokenizer::tt_brace_close:
-            QTC::TC("qpdf", "QPDFParser bad brace");
+            QTC::TC("qpdf", "QPDFParser bad brace in parseRemainder");
             warn("treating unexpected brace token as null");
-            bad = true;
-            is_null = true;
-            break;
+            if (tooManyBadTokens()) {
+                return {QPDF_Null::create()};
+            }
+            addNull();
+            continue;
         case QPDFTokenizer::tt_array_close:
-            if (state == st_array) {
-                state = st_stop;
+            if (frame->state == st_array) {
+                auto object = QPDF_Array::create(std::move(frame->olist), frame->null_count > 100);
+                setDescription(object, frame->offset - 1);
+                // `frame->offset` points just past "[", so `frame->offset - 1` above is the offset
+                // of the "[" itself. This has been explicitly tested with whitespace surrounding
+                // the array start delimiter. getLastOffset points to the array end token and
+                // therefore can't be used here.
+                if (stack.size() <= 1) {
+                    return object;
+                }
+                stack.pop_back();
+                frame = &stack.back();
+                add(std::move(object));
             } else {
-                QTC::TC("qpdf", "QPDFParser bad array close");
+                QTC::TC("qpdf", "QPDFParser bad array close in parseRemainder");
                 warn("treating unexpected array close token as null");
-                bad = true;
-                is_null = true;
+                if (tooManyBadTokens()) {
+                    return {QPDF_Null::create()};
+                }
+                addNull();
             }
-            break;
+            continue;
         case QPDFTokenizer::tt_dict_close:
-            if (state == st_dictionary) {
-                state = st_stop;
+            if (frame->state <= st_dictionary_value) {
+                // Attempt to recover more or less gracefully from invalid dictionaries.
+                auto& dict = frame->dict;
+
+                if (frame->state == st_dictionary_value) {
+                    QTC::TC("qpdf", "QPDFParser no val for last key");
+                    warn(
+                        frame->offset,
+                        "dictionary ended prematurely; using null as value for last key");
+                    dict[frame->key] = QPDF_Null::create();
+                }
+
+                if (!frame->olist.empty())
+                    fixMissingKeys();
+
+                if (!frame->contents_string.empty() && dict.count("/Type") &&
+                    dict["/Type"].isNameAndEquals("/Sig") && dict.count("/ByteRange") &&
+                    dict.count("/Contents") && dict["/Contents"].isString()) {
+                    dict["/Contents"] = QPDFObjectHandle::newString(frame->contents_string);
+                    dict["/Contents"].setParsedOffset(frame->contents_offset);
+                }
+                auto object = QPDF_Dictionary::create(std::move(dict));
+                setDescription(object, frame->offset - 2);
+                // `frame->offset` points just past "<<", so `frame->offset - 2` above is the
+                // offset of the "<<" itself. This has been explicitly tested with whitespace
+                // surrounding the dictionary start delimiter. getLastOffset points to the
+                // dictionary end token and therefore can't be used here.
+                if (stack.size() <= 1) {
+                    return object;
+                }
+                stack.pop_back();
+                frame = &stack.back();
+                add(std::move(object));
             } else {
-                QTC::TC("qpdf", "QPDFParser bad dictionary close");
+                QTC::TC("qpdf", "QPDFParser bad dictionary close in parseRemainder");
                 warn("unexpected dictionary close token");
-                bad = true;
-                is_null = true;
+                if (tooManyBadTokens()) {
+                    return {QPDF_Null::create()};
+                }
+                addNull();
             }
-            break;
+            continue;
         case QPDFTokenizer::tt_array_open:
         case QPDFTokenizer::tt_dict_open:
-            if (stack.size() > 500) {
+            if (stack.size() > 499) {
                 QTC::TC("qpdf", "QPDFParser too deep");
                 warn("ignoring excessively deeply nested data structure");
-                bad = true;
-                is_null = true;
-                state = st_top;
+                return {QPDF_Null::create()};
             } else {
-                state = st_start;
-                state_stack.push_back(
-                    (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array
-                                                                          : st_dictionary);
                 b_contents = false;
-                stack.emplace_back(input);
+                stack.emplace_back(
+                    input,
+                    (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array
+                                                                          : st_dictionary_key);
+                frame = &stack.back();
+                continue;
             }
-            break;
         case QPDFTokenizer::tt_bool:
-            object = QPDF_Bool::create((tokenizer.getValue() == "true"));
-            break;
+            addScalar<QPDF_Bool>(tokenizer.getValue() == "true");
+            continue;
         case QPDFTokenizer::tt_null:
-            is_null = true;
-            ++frame.null_count;
-
-            break;
+            addNull();
+            continue;
         case QPDFTokenizer::tt_integer:
-            object = QPDF_Integer::create(
-                QUtil::string_to_ll(std::string(tokenizer.getValue()).c_str()));
-            break;
+            if (!content_stream) {
+                // Buffer token in case it is part of an indirect reference.
+                last_offset_buffer[1] = input->getLastOffset();
+                int_buffer[1] = QUtil::string_to_ll(tokenizer.getValue().c_str());
+                int_count = 1;
+            } else {
+                addScalar<QPDF_Integer>(QUtil::string_to_ll(tokenizer.getValue().c_str()));
+            }
+            continue;
         case QPDFTokenizer::tt_real:
-            object = QPDF_Real::create(tokenizer.getValue());
-            break;
+            addScalar<QPDF_Real>(tokenizer.getValue());
+            continue;
         case QPDFTokenizer::tt_name:
-            {
-                auto name = tokenizer.getValue();
-                object = QPDF_Name::create(name);
-
-                if (name == "/Contents") {
-                    b_contents = true;
-                } else {
-                    b_contents = false;
-                }
+            if (frame->state == st_dictionary_key) {
+                frame->key = tokenizer.getValue();
+                frame->state = st_dictionary_value;
+                b_contents = decrypter && frame->key == "/Contents";
+                continue;
+            } else {
+                addScalar<QPDF_Name>(tokenizer.getValue());
             }
-            break;
+            continue;
         case QPDFTokenizer::tt_word:
-            {
-                auto value = tokenizer.getValue();
-                auto size = olist.size();
-                if (content_stream) {
-                    object = QPDF_Operator::create(value);
-                } else if (
-                    value == "R" && state != st_top && size >= 2 && olist.back() &&
-                    olist.back()->getTypeCode() == ::ot_integer &&
-                    !olist.back()->getObjGen().isIndirect() && olist.at(size - 2) &&
-                    olist.at(size - 2)->getTypeCode() == ::ot_integer &&
-                    !olist.at(size - 2)->getObjGen().isIndirect()) {
-                    if (context == nullptr) {
-                        QTC::TC("qpdf", "QPDFParser indirect without context");
-                        throw std::logic_error("QPDFObjectHandle::parse called without context on "
-                                               "an object with indirect references");
-                    }
-                    auto ref_og = QPDFObjGen(
-                        QPDFObjectHandle(olist.at(size - 2)).getIntValueAsInt(),
-                        QPDFObjectHandle(olist.back()).getIntValueAsInt());
-                    if (ref_og.isIndirect()) {
-                        // This action has the desirable side effect of causing dangling references
-                        // (references to indirect objects that don't appear in the PDF) in any
-                        // parsed object to appear in the object cache.
-                        object = context->getObject(ref_og).obj;
-                        indirect_ref = true;
-                    } else {
-                        QTC::TC("qpdf", "QPDFParser indirect with 0 objid");
-                        is_null = true;
-                    }
-                    olist.pop_back();
-                    olist.pop_back();
-                } else if ((value == "endobj") && (state == st_top)) {
-                    // We just saw endobj without having read anything.  Treat this as a null and do
-                    // not move the input source's offset.
-                    is_null = true;
-                    input->seek(input->getLastOffset(), SEEK_SET);
-                    empty = true;
-                } else {
-                    QTC::TC("qpdf", "QPDFParser treat word as string");
-                    warn("unknown token while reading object; treating as string");
-                    bad = true;
-                    object = QPDF_String::create(value);
+            if (content_stream) {
+                addScalar<QPDF_Operator>(tokenizer.getValue());
+            } else {
+                QTC::TC("qpdf", "QPDFParser treat word as string in parseRemainder");
+                warn("unknown token while reading object; treating as string");
+                if (tooManyBadTokens()) {
+                    return {QPDF_Null::create()};
                 }
+                addScalar<QPDF_String>(tokenizer.getValue());
             }
-            break;
+            continue;
         case QPDFTokenizer::tt_string:
             {
-                auto val = tokenizer.getValue();
+                auto const& val = tokenizer.getValue();
                 if (decrypter) {
                     if (b_contents) {
-                        frame.contents_string = val;
-                        frame.contents_offset = input->getLastOffset();
+                        frame->contents_string = val;
+                        frame->contents_offset = input->getLastOffset();
                         b_contents = false;
                     }
                     std::string s{val};
                     decrypter->decryptString(s);
-                    object = QPDF_String::create(s);
+                    addScalar<QPDF_String>(s);
                 } else {
-                    object = QPDF_String::create(val);
+                    addScalar<QPDF_String>(val);
                 }
             }
-
-            break;
+            continue;
         default:
             warn("treating unknown token type as null while reading object");
-            bad = true;
-            is_null = true;
-            break;
-        }
-
-        if (object == nullptr && !is_null &&
-            (!((state == st_start) || (state == st_stop) || (state == st_eof)))) {
-            throw std::logic_error("QPDFParser:parseInternal: unexpected uninitialized object");
-            is_null = true;
-        }
-
-        if (bad) {
-            ++bad_count;
-            good_count = 0;
-        } else {
-            ++good_count;
-            if (good_count > 3) {
-                bad_count = 0;
+            if (tooManyBadTokens()) {
+                return {QPDF_Null::create()};
             }
+            addNull();
         }
-        if (bad_count > 5) {
-            // We had too many consecutive errors without enough intervening successful objects.
-            // Give up.
-            warn("too many errors; giving up on reading object");
-            state = st_top;
-            is_null = true;
-        }
+    }
+}
-        switch (state) {
-        case st_eof:
-            if (state_stack.size() > 1) {
-                warn("parse error while reading object");
-            }
-            done = true;
-            // In content stream mode, leave object uninitialized to indicate EOF
-            if (!content_stream) {
-                is_null = true;
-            }
-            break;
-
-        case st_dictionary:
-        case st_array:
-            if (is_null) {
-                object = null_oh;
-                // No need to set description for direct nulls - they probably will become implicit.
-            } else if (!indirect_ref) {
-                setDescription(object, input->getLastOffset());
-            }
-            set_offset = true;
-            olist.push_back(object);
-            break;
+void
+QPDFParser::add(std::shared_ptr<QPDFObject>&& obj)
+{
+    if (frame->state != st_dictionary_value) {
+        // If state is st_dictionary_key then there is a missing key. Push onto olist for
+        // processing once the tt_dict_close token has been found.
+        frame->olist.emplace_back(std::move(obj));
+    } else {
+        if (auto res = frame->dict.insert_or_assign(frame->key, std::move(obj)); !res.second) {
+            warnDuplicateKey();
+        }
+        frame->state = st_dictionary_key;
+    }
+}
-        case st_top:
-            done = true;
-            break;
+void
+QPDFParser::addNull()
+{
+    const static ObjectPtr null_obj = QPDF_Null::create();
-        case st_start:
-            break;
+    if (frame->state != st_dictionary_value) {
+        // If state is st_dictionary_key then there is a missing key. Push onto olist for
+        // processing once the tt_dict_close token has been found.
+        frame->olist.emplace_back(null_obj);
+    } else {
+        if (auto res = frame->dict.insert_or_assign(frame->key, null_obj); !res.second) {
+            warnDuplicateKey();
+        }
+        frame->state = st_dictionary_key;
+    }
+    ++frame->null_count;
+}
-        case st_stop:
-            if ((state_stack.size() < 2) || (stack.size() < 2)) {
-                throw std::logic_error("QPDFParser::parseInternal: st_stop encountered with "
-                                       "insufficient elements in stack");
-            }
-            parser_state_e old_state = state_stack.back();
-            state_stack.pop_back();
-            if (old_state == st_array) {
-                object = QPDF_Array::create(std::move(olist), frame.null_count > 100);
-                setDescription(object, offset - 1);
-                // The `offset` points to the next of "[".  Set the rewind offset to point to the
-                // beginning of "[". This has been explicitly tested with whitespace surrounding the
-                // array start delimiter. getLastOffset points to the array end token and therefore
-                // can't be used here.
-                set_offset = true;
-            } else if (old_state == st_dictionary) {
-                // Convert list to map. Alternating elements are keys.  Attempt to recover more or
-                // less gracefully from invalid dictionaries.
-                std::set<std::string> names;
-                for (auto& obj: olist) {
-                    if (obj) {
-                        if (obj->getTypeCode() == ::ot_name) {
-                            names.insert(obj->getStringValue());
-                        }
-                    }
-                }
+void
+QPDFParser::addInt(int count)
+{
+    auto obj = QPDF_Integer::create(int_buffer[count % 2]);
+    obj->setDescription(context, description, last_offset_buffer[count % 2]);
+    add(std::move(obj));
+}
-                std::map<std::string, QPDFObjectHandle> dict;
-                int next_fake_key = 1;
-                for (auto iter = olist.begin(); iter != olist.end();) {
-                    // Calculate key.
-                    std::string key;
-                    if (*iter && (*iter)->getTypeCode() == ::ot_name) {
-                        key = (*iter)->getStringValue();
-                        ++iter;
-                    } else {
-                        for (bool found_fake = false; !found_fake;) {
-                            key = "/QPDFFake" + std::to_string(next_fake_key++);
-                            found_fake = (names.count(key) == 0);
-                            QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1));
-                        }
-                        warn(
-                            offset,
-                            "expected dictionary key but found non-name object; inserting key " +
-                                key);
-                    }
-                    if (dict.count(key) > 0) {
-                        QTC::TC("qpdf", "QPDFParser duplicate dict key");
-                        warn(
-                            offset,
-                            "dictionary has duplicated key " + key +
-                                "; last occurrence overrides earlier ones");
-                    }
+template <typename T, typename... Args>
+void
+QPDFParser::addScalar(Args&&... args)
+{
+    auto obj = T::create(args...);
+    obj->setDescription(context, description, input->getLastOffset());
+    add(std::move(obj));
+}
-                    // Calculate value.
-                    std::shared_ptr<QPDFObject> val;
-                    if (iter != olist.end()) {
-                        val = *iter;
-                        ++iter;
-                    } else {
-                        QTC::TC("qpdf", "QPDFParser no val for last key");
-                        warn(
-                            offset,
-                            "dictionary ended prematurely; using null as value for last key");
-                        val = QPDF_Null::create();
-                    }
+template <typename T, typename... Args>
+QPDFObjectHandle
+QPDFParser::withDescription(Args&&... args)
+{
+    auto obj = T::create(args...);
+    obj->setDescription(context, description, start);
+    return {obj};
+}
-                    dict[std::move(key)] = std::move(val);
-                }
-                if (!frame.contents_string.empty() && dict.count("/Type") &&
-                    dict["/Type"].isNameAndEquals("/Sig") && dict.count("/ByteRange") &&
-                    dict.count("/Contents") && dict["/Contents"].isString()) {
-                    dict["/Contents"] = QPDFObjectHandle::newString(frame.contents_string);
-                    dict["/Contents"].setParsedOffset(frame.contents_offset);
-                }
-                object = QPDF_Dictionary::create(std::move(dict));
-                setDescription(object, offset - 2);
-                // The `offset` points to the next of "<<". Set the rewind offset to point to the
-                // beginning of "<<". This has been explicitly tested with whitespace surrounding
-                // the dictionary start delimiter. getLastOffset points to the dictionary end token
-                // and therefore can't be used here.
-                set_offset = true;
-            }
-            stack.pop_back();
-            if (state_stack.back() == st_top) {
-                done = true;
-            } else {
-                stack.back().olist.push_back(object);
-            }
-        }
+void
+QPDFParser::setDescription(ObjectPtr& obj, qpdf_offset_t parsed_offset)
+{
+    if (obj) {
+        obj->setDescription(context, description, parsed_offset);
     }
+}
-    if (is_null) {
-        object = QPDF_Null::create();
+void
+QPDFParser::fixMissingKeys()
+{
+    std::set<std::string> names;
+    for (auto& obj: frame->olist) {
+        if (obj->getTypeCode() == ::ot_name) {
+            names.insert(obj->getStringValue());
+        }
     }
-    if (!set_offset) {
-        setDescription(object, offset);
+    int next_fake_key = 1;
+    for (auto const& item: frame->olist) {
+        while (true) {
+            const std::string key = "/QPDFFake" + std::to_string(next_fake_key++);
+            const bool found_fake = frame->dict.count(key) == 0 && names.count(key) == 0;
+            QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1));
+            if (found_fake) {
+                warn(
+                    frame->offset,
+                    "expected dictionary key but found non-name object; inserting key " + key);
+                frame->dict[key] = item;
+                break;
+            }
+        }
     }
-    return object;
 }
-void
-QPDFParser::setDescription(std::shared_ptr<QPDFObject>& obj, qpdf_offset_t parsed_offset)
+bool
+QPDFParser::tooManyBadTokens()
 {
-    if (obj) {
-        obj->setDescription(context, description, parsed_offset);
+    if (good_count <= 4) {
+        if (++bad_count > 5) {
+            warn("too many errors; giving up on reading object");
+            return true;
+        }
+    } else {
+        bad_count = 1;
     }
+    good_count = 0;
+    return false;
 }
 void
@@ -427,6 +498,15 @@ QPDFParser::warn(QPDFExc const& e) const
 }
 void
+QPDFParser::warnDuplicateKey()
+{
+    QTC::TC("qpdf", "QPDFParser duplicate dict key");
+    warn(
+        frame->offset,
+        "dictionary has duplicated key " + frame->key + "; last occurrence overrides earlier ones");
+}
+
+void
 QPDFParser::warn(qpdf_offset_t offset, std::string const& msg) const
 {
     warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(), object_description, offset, msg));
@@ -31,11 +31,44 @@ class QPDFParser
     QPDFObjectHandle parse(bool& empty, bool content_stream);
   private:
-    enum parser_state_e { st_top, st_start, st_stop, st_eof, st_dictionary, st_array };
+    // Parser state.  Note:
+    // state <= st_dictionary_value == (state == st_dictionary_key || state == st_dictionary_value)
+    enum parser_state_e { st_dictionary_key, st_dictionary_value, st_array };
+    struct StackFrame
+    {
+        StackFrame(std::shared_ptr<InputSource> const& input, parser_state_e state) :
+            state(state),
+            offset(input->tell())
+        {
+        }
+
+        std::vector<std::shared_ptr<QPDFObject>> olist;
+        std::map<std::string, QPDFObjectHandle> dict;
+        parser_state_e state;
+        std::string key;
+        qpdf_offset_t offset;
+        std::string contents_string;
+        qpdf_offset_t contents_offset{-1};
+        int null_count{0};
+    };
+
+    QPDFObjectHandle parseRemainder(bool content_stream);
+    void add(std::shared_ptr<QPDFObject>&& obj);
+    void addNull();
+    void addInt(int count);
+    template <typename T, typename... Args>
+    void addScalar(Args&&... args);
+    bool tooManyBadTokens();
+    void warnDuplicateKey();
+    void fixMissingKeys();
     void warn(qpdf_offset_t offset, std::string const& msg) const;
     void warn(std::string const& msg) const;
     void warn(QPDFExc const&) const;
+    template <typename T, typename... Args>
+    // Create a new scalar object complete with parsed offset and description.
+    // NB the offset includes any leading whitespace.
+    QPDFObjectHandle withDescription(Args&&... args);
     void setDescription(std::shared_ptr<QPDFObject>& obj, qpdf_offset_t parsed_offset);
     std::shared_ptr<InputSource> input;
     std::string const& object_description;
@@ -43,6 +76,18 @@ class QPDFParser
     QPDFObjectHandle::StringDecrypter* decrypter;
     QPDF* context;
     std::shared_ptr<QPDFValue::Description> description;
+    std::vector<StackFrame> stack;
+    StackFrame* frame;
+    // Number of recent bad tokens.
+    int bad_count = 0;
+    // Number of good tokens since last bad token. Irrelevant if bad_count == 0.
+    int good_count = 0;
+    // Start offset including any leading whitespace.
+    qpdf_offset_t start;
+    // Number of successive integer tokens.
+    int int_count = 0;
+    long long int_buffer[2]{0, 0};
+    qpdf_offset_t last_offset_buffer[2]{0, 0};
 };
 #endif // QPDFPARSER_HH
@@ -57,11 +57,14 @@ QPDF trailer lacks size 0
 QPDF trailer size not integer 0
 QPDF trailer prev not integer 0
 QPDFParser bad brace 0
+QPDFParser bad brace in parseRemainder 0
 QPDFParser bad array close 0
+QPDFParser bad array close in parseRemainder 0
 QPDF stream without length 0
 QPDF stream length not integer 0
 QPDF missing endstream 0
 QPDFParser bad dictionary close 0
+QPDFParser bad dictionary close in parseRemainder 0
 QPDF can't find xref 0
 QPDFTokenizer bad ) 0
 QPDFTokenizer bad > 0
@@ -258,6 +261,7 @@ QPDFParser indirect with 0 objid 0
 QPDF object id 0 0
 QPDF recursion loop in resolve 0
 QPDFParser treat word as string 0
+QPDFParser treat word as string in parseRemainder 0
 QPDFParser found fake 1
 QPDFParser no val for last key 0
 QPDF resolve failure to null 0
@@ -289,7 +293,9 @@ QPDFObjectHandle coalesce called on stream 0
 QPDFObjectHandle coalesce provide stream data 0
 QPDF_Stream bad token at end during normalize 0
 QPDFParser bad token in parse 0
+QPDFParser bad token in parseRemainder 0
 QPDFParser eof in parse 0
+QPDFParser eof in parseRemainder 0
 QPDFObjectHandle array bounds 0
 QPDFObjectHandle boolean returning false 0
 QPDFObjectHandle integer returning 0 0
@@ -17,7 +17,7 @@ my $td = new TestDriver(&#39;parsing&#39;);
 my $n_tests = 17;
 $td->runtest("parse objects from string",
-             {$td->COMMAND => "test_driver 31 good1.qdf"},
+             {$td->COMMAND => "test_driver 31 bad39.qdf"},
              {$td->FILE => "parse-object.out", $td->EXIT_STATUS => 0},
              $td->NORMALIZE_NEWLINES);
 $td->runtest("EOF terminating literal tokens",
 WARNING: bad16.pdf (trailer, offset 753): unexpected dictionary close token
 WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token
 WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string
-WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
 WARNING: bad16.pdf (trailer, offset 779): parse error while reading object
+WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
 WARNING: bad16.pdf: file is damaged
 WARNING: bad16.pdf (offset 712): expected trailer dictionary
 WARNING: bad16.pdf: Attempting to reconstruct cross-reference table
 WARNING: bad16.pdf (trailer, offset 753): unexpected dictionary close token
 WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token
 WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string
-WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
 WARNING: bad16.pdf (trailer, offset 779): parse error while reading object
+WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
 bad16.pdf: unable to find trailer dictionary while recovering damaged file
 WARNING: bad16.pdf (trailer, offset 753): unexpected dictionary close token
 WARNING: bad16.pdf (trailer, offset 756): unexpected dictionary close token
 WARNING: bad16.pdf (trailer, offset 759): unknown token while reading object; treating as string
-WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
 WARNING: bad16.pdf (trailer, offset 779): parse error while reading object
+WARNING: bad16.pdf (trailer, offset 779): unexpected EOF
 bad16.pdf (offset 712): expected trailer dictionary
 WARNING: bad36.pdf (trailer, offset 764): unknown token while reading object; treating as string
-WARNING: bad36.pdf (trailer, offset 715): expected dictionary key but found non-name object; inserting key /QPDFFake2
 WARNING: bad36.pdf (trailer, offset 715): dictionary ended prematurely; using null as value for last key
+WARNING: bad36.pdf (trailer, offset 715): expected dictionary key but found non-name object; inserting key /QPDFFake2
 /QTest is implicit
 /QTest is direct and has type null (2)
 /QTest is null
 WARNING: bad36.pdf (trailer, offset 764): unknown token while reading object; treating as string
-WARNING: bad36.pdf (trailer, offset 715): expected dictionary key but found non-name object; inserting key /QPDFFake2
 WARNING: bad36.pdf (trailer, offset 715): dictionary ended prematurely; using null as value for last key
+WARNING: bad36.pdf (trailer, offset 715): expected dictionary key but found non-name object; inserting key /QPDFFake2
 /QTest is implicit
 /QTest is direct and has type null (2)
 /QTest is null
+%PDF-1.3
+%¿÷¢þ
+%QDF-1.0
+
+%% Original object ID: 1 0
+1 0 obj
+<<
+  /Pages 2 0 R
+  /Type /Catalog
+>>
+endobj
+
+%% Original object ID: 2 0
+2 0 obj
+<<
+  /Count 1
+  /Kids [
+    3 0 R
+  ]
+  /Type /Pages
+>>
+endobj
+
+%% Page 1
+%% Original object ID: 3 0
+3 0 obj
+<<
+  /Contents 4 0 R
+  /MediaBox [
+    0
+    0
+    612
+    792
+  ]
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 6 0 R
+    >>
+    /ProcSet 7 0 R
+  >>
+  /Type /Page
+>>
+endobj
+
+%% Contents for page 1
+%% Original object ID: 4 0
+4 0 obj
+<<
+  /Length 5 0 R
+>>
+stream
+BT
+  /F1 24 Tf
+  72 720 Td
+  (Potato) Tj
+ET
+endstream
+endobj
+
+5 0 obj
+44
+endobj
+
+%% Original object ID: 6 0
+6 0 obj
+<<
+  /BaseFont /Helvetica
+  /Encoding /WinAnsiEncoding
+  /Name /F1
+  /Subtype /Type1
+  /Type /Font
+>>
+endobj
+
+%% Original object ID: 5 0
+7 0 obj
+[
+  /PDF
+  /Text
+]
+endobj
+
+xref
+0 8
+0000000000 65535 f 
+0000000052 00000 n 
+0000000133 00000 n 
+0000000242 00000 n 
+0000000484 00000 n 
+0000000583 00000 n 
+0000000629 00000 n 
+0000001113 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 8
+  /ID [<31415926535897932384626433832795><31415926535897932384626433832795>]
+>>
+startxref
+809
+%%EOF
+7 0 obj
@@ -51,6 +51,7 @@ WARNING: issue-335a.pdf (trailer, offset 563): unexpected )
 WARNING: issue-335a.pdf (trailer, offset 596): unexpected )
 WARNING: issue-335a.pdf (trailer, offset 597): name with stray # will not work with PDF >= 1.2
 WARNING: issue-335a.pdf (trailer, offset 600): unexpected )
+WARNING: issue-335a.pdf (trailer, offset 134): dictionary has duplicated key /L; last occurrence overrides earlier ones
 WARNING: issue-335a.pdf (trailer, offset 601): unexpected )
 WARNING: issue-335a.pdf (trailer, offset 648): unexpected )
 WARNING: issue-335a.pdf (trailer, offset 649): name with stray # will not work with PDF >= 1.2
@@ -74,6 +75,7 @@ WARNING: issue-335a.pdf (trailer, offset 563): unexpected )
 WARNING: issue-335a.pdf (trailer, offset 596): unexpected )
 WARNING: issue-335a.pdf (trailer, offset 597): name with stray # will not work with PDF >= 1.2
 WARNING: issue-335a.pdf (trailer, offset 600): unexpected )
+WARNING: issue-335a.pdf (trailer, offset 164): dictionary has duplicated key /L; last occurrence overrides earlier ones
 WARNING: issue-335a.pdf (trailer, offset 601): unexpected )
 WARNING: issue-335a.pdf (trailer, offset 648): unexpected )
 WARNING: issue-335a.pdf (trailer, offset 649): name with stray # will not work with PDF >= 1.2
@@ -97,6 +99,7 @@ WARNING: issue-335a.pdf (trailer, offset 563): unexpected )
 WARNING: issue-335a.pdf (trailer, offset 596): unexpected )
 WARNING: issue-335a.pdf (trailer, offset 597): name with stray # will not work with PDF >= 1.2
 WARNING: issue-335a.pdf (trailer, offset 600): unexpected )
+WARNING: issue-335a.pdf (trailer, offset 231): dictionary has duplicated key /L; last occurrence overrides earlier ones
 WARNING: issue-335a.pdf (trailer, offset 601): unexpected )
 WARNING: issue-335a.pdf (trailer, offset 648): unexpected )
 WARNING: issue-335a.pdf (trailer, offset 649): name with stray # will not work with PDF >= 1.2
@@ -448,6 +451,7 @@ WARNING: issue-335a.pdf (trailer, offset 1168): unexpected )
 WARNING: issue-335a.pdf (trailer, offset 1328): unexpected )
 WARNING: issue-335a.pdf (trailer, offset 1329): name with stray # will not work with PDF >= 1.2
 WARNING: issue-335a.pdf (trailer, offset 1332): unexpected )
+WARNING: issue-335a.pdf (trailer, offset 1033): dictionary has duplicated key /L; last occurrence overrides earlier ones
 WARNING: issue-335a.pdf (trailer, offset 1333): unexpected )
 WARNING: issue-335a.pdf (trailer, offset 1344): unexpected )
 WARNING: issue-335a.pdf (trailer, offset 1428): unexpected )
 [ /name 16059 3.14159 false << /key true /other [ (string1) (string2) ] >> null ]
-logic error parsing indirect: QPDFObjectHandle::parse called without context on an object with indirect references
+logic error parsing indirect: QPDFParser::parse called without context on an object with indirect references
 trailing data: parsed object (trailing test): trailing data found parsing object from string
 WARNING: parsed object (offset 9): unknown token while reading object; treating as string
+WARNING: parsed object: treating unexpected brace token as null
+WARNING: parsed object: treating unexpected brace token as null
+WARNING: parsed object: unexpected dictionary close token
+WARNING: bad39.qdf (object 7 0, offset 1121): unexpected EOF
+WARNING: bad39.qdf (object 7 0, offset 1121): expected endobj
+WARNING: bad39.qdf (object 7 0, offset 1121): EOF after endobj
 test 31 done
@@ -1195,6 +1195,13 @@ test_31(QPDF&amp; pdf, char const* arg2)
     // mistakenly parsed as an indirect object.
     assert(QPDFObjectHandle::parse(&pdf, "[5 0 R 0 R /X]").unparse() == "[ 5 0 R 0 (R) /X ]");
     assert(QPDFObjectHandle::parse(&pdf, "[1 0 R]", "indirect test").unparse() == "[ 1 0 R ]");
+    // TC:QPDFParser bad brace
+    assert(QPDFObjectHandle::parse(&pdf, "}").unparse() == "null");
+    assert(QPDFObjectHandle::parse(&pdf, "{").unparse() == "null");
+    // TC:QPDFParser bad dictionary close
+    assert(QPDFObjectHandle::parse(&pdf, ">>").unparse() == "null");
+    // TC:QPDFParser eof in parse
+    assert(QPDFObjectHandle::parse(&pdf, "[7 0 R]").getArrayItem(0).isNull());
 }
 static void