Commit 5a1bf035f91156d8fdc351fb18b34177ea5822e0

Authored by m-holger
1 parent db6ab9cb

Add new method QPDFParser::parseRemainder

For now, the new method is an (almost) complete copy of parse, and parse
itself is left (almost) unchanged; both states are temporary.
libqpdf/QPDFParser.cc
@@ -38,11 +38,343 @@ QPDFParser::parse(bool& empty, bool content_stream) @@ -38,11 +38,343 @@ QPDFParser::parse(bool& empty, bool content_stream)
38 std::shared_ptr<QPDFObject> object; 38 std::shared_ptr<QPDFObject> object;
39 bool set_offset = false; 39 bool set_offset = false;
40 40
41 - std::vector<StackFrame> stack{{input, st_top}}; 41 +// std::vector<StackFrame> stack{{input, st_top}};
  42 + stack.clear(); // NEW
  43 + stack.emplace_back(input, st_top); // NEW
42 bool done = false; 44 bool done = false;
43 bool b_contents = false; 45 bool b_contents = false;
44 bool is_null = false; 46 bool is_null = false;
45 - auto* frame = &stack.back(); 47 + frame = &stack.back(); // CHANGED
  48 +
  49 + while (!done) {
  50 + bool indirect_ref = false;
  51 + is_null = false;
  52 + object = nullptr;
  53 + set_offset = false;
  54 +
  55 + if (!tokenizer.nextToken(*input, object_description)) {
  56 + warn(tokenizer.getErrorMessage());
  57 + }
  58 + ++good_count; // optimistically
  59 +
  60 + switch (tokenizer.getType()) {
  61 + case QPDFTokenizer::tt_eof:
  62 + if (stack.size() > 1) {
  63 + warn("parse error while reading object");
  64 + }
  65 + if (content_stream) {
  66 + // In content stream mode, leave object uninitialized to indicate EOF
  67 + return {};
  68 + }
  69 +// QTC::TC("qpdf", "QPDFParser eof in parse");
  70 + warn("unexpected EOF");
  71 + return {QPDF_Null::create()};
  72 +
  73 + case QPDFTokenizer::tt_bad:
  74 +// QTC::TC("qpdf", "QPDFParser bad token in parse");
  75 + if (tooManyBadTokens()) {
  76 + return {QPDF_Null::create()};
  77 + }
  78 + is_null = true;
  79 + break;
  80 +
  81 + case QPDFTokenizer::tt_brace_open:
  82 + case QPDFTokenizer::tt_brace_close:
  83 +// QTC::TC("qpdf", "QPDFParser bad brace");
  84 + warn("treating unexpected brace token as null");
  85 + if (tooManyBadTokens()) {
  86 + return {QPDF_Null::create()};
  87 + }
  88 + is_null = true;
  89 + break;
  90 +
  91 + case QPDFTokenizer::tt_array_close:
  92 + if (frame->state == st_array) {
  93 + if (stack.size() < 2) {
  94 + throw std::logic_error("QPDFParser::parseInternal: st_stop encountered with "
  95 + "insufficient elements in stack");
  96 + }
  97 + object = QPDF_Array::create(std::move(frame->olist), frame->null_count > 100);
  98 + setDescription(object, frame->offset - 1);
  99 + // The `offset` points to the next of "[". Set the rewind offset to point to the
  100 + // beginning of "[". This has been explicitly tested with whitespace surrounding the
  101 + // array start delimiter. getLastOffset points to the array end token and therefore
  102 + // can't be used here.
  103 + set_offset = true;
  104 + stack.pop_back();
  105 + frame = &stack.back();
  106 + } else {
  107 +// QTC::TC("qpdf", "QPDFParser bad array close");
  108 + warn("treating unexpected array close token as null");
  109 + if (tooManyBadTokens()) {
  110 + return {QPDF_Null::create()};
  111 + }
  112 + is_null = true;
  113 + }
  114 + break;
  115 +
  116 + case QPDFTokenizer::tt_dict_close:
  117 + if (frame->state == st_dictionary) {
  118 + if (stack.size() < 2) {
  119 + throw std::logic_error("QPDFParser::parseInternal: st_stop encountered with "
  120 + "insufficient elements in stack");
  121 + }
  122 +
  123 + // Convert list to map. Alternating elements are keys. Attempt to recover more or
  124 + // less gracefully from invalid dictionaries.
  125 + std::set<std::string> names;
  126 + for (auto& obj: frame->olist) {
  127 + if (obj) {
  128 + if (obj->getTypeCode() == ::ot_name) {
  129 + names.insert(obj->getStringValue());
  130 + }
  131 + }
  132 + }
  133 +
  134 + std::map<std::string, QPDFObjectHandle> dict;
  135 + int next_fake_key = 1;
  136 + for (auto iter = frame->olist.begin(); iter != frame->olist.end();) {
  137 + // Calculate key.
  138 + std::string key;
  139 + if (*iter && (*iter)->getTypeCode() == ::ot_name) {
  140 + key = (*iter)->getStringValue();
  141 + ++iter;
  142 + } else {
  143 + for (bool found_fake = false; !found_fake;) {
  144 + key = "/QPDFFake" + std::to_string(next_fake_key++);
  145 + found_fake = (names.count(key) == 0);
  146 +// QTC::TC("qpdf", "QPDFParser found fake", (found_fake ? 0 : 1));
  147 + }
  148 + warn(
  149 + frame->offset,
  150 + "expected dictionary key but found non-name object; inserting key " +
  151 + key);
  152 + }
  153 + if (dict.count(key) > 0) {
  154 +// QTC::TC("qpdf", "QPDFParser duplicate dict key");
  155 + warn(
  156 + frame->offset,
  157 + "dictionary has duplicated key " + key +
  158 + "; last occurrence overrides earlier ones");
  159 + }
  160 +
  161 + // Calculate value.
  162 + std::shared_ptr<QPDFObject> val;
  163 + if (iter != frame->olist.end()) {
  164 + val = *iter;
  165 + ++iter;
  166 + } else {
  167 +// QTC::TC("qpdf", "QPDFParser no val for last key");
  168 + warn(
  169 + frame->offset,
  170 + "dictionary ended prematurely; using null as value for last key");
  171 + val = QPDF_Null::create();
  172 + }
  173 +
  174 + dict[std::move(key)] = std::move(val);
  175 + }
  176 + if (!frame->contents_string.empty() && dict.count("/Type") &&
  177 + dict["/Type"].isNameAndEquals("/Sig") && dict.count("/ByteRange") &&
  178 + dict.count("/Contents") && dict["/Contents"].isString()) {
  179 + dict["/Contents"] = QPDFObjectHandle::newString(frame->contents_string);
  180 + dict["/Contents"].setParsedOffset(frame->contents_offset);
  181 + }
  182 + object = QPDF_Dictionary::create(std::move(dict));
  183 + setDescription(object, frame->offset - 2);
  184 + // The `offset` points to the next of "<<". Set the rewind offset to point to the
  185 + // beginning of "<<". This has been explicitly tested with whitespace surrounding
  186 + // the dictionary start delimiter. getLastOffset points to the dictionary end token
  187 + // and therefore can't be used here.
  188 + set_offset = true;
  189 + stack.pop_back();
  190 + frame = &stack.back();
  191 + } else {
  192 +// QTC::TC("qpdf", "QPDFParser bad dictionary close");
  193 + warn("unexpected dictionary close token");
  194 + if (tooManyBadTokens()) {
  195 + return {QPDF_Null::create()};
  196 + }
  197 + is_null = true;
  198 + }
  199 + break;
  200 +
  201 + case QPDFTokenizer::tt_array_open:
  202 + case QPDFTokenizer::tt_dict_open:
  203 + if (stack.size() > 500) {
  204 +// QTC::TC("qpdf", "QPDFParser too deep");
  205 + warn("ignoring excessively deeply nested data structure");
  206 + return {QPDF_Null::create()};
  207 + } else {
  208 + b_contents = false;
  209 + stack.emplace_back(
  210 + input,
  211 + (tokenizer.getType() == QPDFTokenizer::tt_array_open) ? st_array
  212 + : st_dictionary);
  213 + frame = &stack.back();
  214 + return parseRemainder(content_stream); // NEW
  215 + continue;
  216 + }
  217 +
  218 + case QPDFTokenizer::tt_bool:
  219 + object = QPDF_Bool::create((tokenizer.getValue() == "true"));
  220 + break;
  221 +
  222 + case QPDFTokenizer::tt_null:
  223 + is_null = true;
  224 + ++frame->null_count;
  225 +
  226 + break;
  227 +
  228 + case QPDFTokenizer::tt_integer:
  229 + object = QPDF_Integer::create(QUtil::string_to_ll(tokenizer.getValue().c_str()));
  230 + break;
  231 +
  232 + case QPDFTokenizer::tt_real:
  233 + object = QPDF_Real::create(tokenizer.getValue());
  234 + break;
  235 +
  236 + case QPDFTokenizer::tt_name:
  237 + {
  238 + auto const& name = tokenizer.getValue();
  239 + object = QPDF_Name::create(name);
  240 +
  241 + if (name == "/Contents") {
  242 + b_contents = true;
  243 + } else {
  244 + b_contents = false;
  245 + }
  246 + }
  247 + break;
  248 +
  249 + case QPDFTokenizer::tt_word:
  250 + {
  251 + auto const& value = tokenizer.getValue();
  252 + auto size = frame->olist.size();
  253 + if (content_stream) {
  254 + object = QPDF_Operator::create(value);
  255 + } else if (
  256 + value == "R" && frame->state != st_top && size >= 2 && frame->olist.back() &&
  257 + frame->olist.back()->getTypeCode() == ::ot_integer &&
  258 + !frame->olist.back()->getObjGen().isIndirect() && frame->olist.at(size - 2) &&
  259 + frame->olist.at(size - 2)->getTypeCode() == ::ot_integer &&
  260 + !frame->olist.at(size - 2)->getObjGen().isIndirect()) {
  261 + if (context == nullptr) {
  262 +// QTC::TC("qpdf", "QPDFParser indirect without context");
  263 + throw std::logic_error("QPDFObjectHandle::parse called without context on "
  264 + "an object with indirect references");
  265 + }
  266 + auto ref_og = QPDFObjGen(
  267 + QPDFObjectHandle(frame->olist.at(size - 2)).getIntValueAsInt(),
  268 + QPDFObjectHandle(frame->olist.back()).getIntValueAsInt());
  269 + if (ref_og.isIndirect()) {
  270 + // This action has the desirable side effect of causing dangling references
  271 + // (references to indirect objects that don't appear in the PDF) in any
  272 + // parsed object to appear in the object cache.
  273 + object = context->getObject(ref_og).obj;
  274 + indirect_ref = true;
  275 + } else {
  276 +// QTC::TC("qpdf", "QPDFParser indirect with 0 objid");
  277 + is_null = true;
  278 + }
  279 + frame->olist.pop_back();
  280 + frame->olist.pop_back();
  281 + } else if ((value == "endobj") && (frame->state == st_top)) {
  282 + // We just saw endobj without having read anything. Treat this as a null and do
  283 + // not move the input source's offset.
  284 + is_null = true;
  285 + input->seek(input->getLastOffset(), SEEK_SET);
  286 + empty = true;
  287 + } else {
  288 +// QTC::TC("qpdf", "QPDFParser treat word as string");
  289 + warn("unknown token while reading object; treating as string");
  290 + if (tooManyBadTokens()) {
  291 + return {QPDF_Null::create()};
  292 + }
  293 + object = QPDF_String::create(value);
  294 + }
  295 + }
  296 + break;
  297 +
  298 + case QPDFTokenizer::tt_string:
  299 + {
  300 + auto const& val = tokenizer.getValue();
  301 + if (decrypter) {
  302 + if (b_contents) {
  303 + frame->contents_string = val;
  304 + frame->contents_offset = input->getLastOffset();
  305 + b_contents = false;
  306 + }
  307 + std::string s{val};
  308 + decrypter->decryptString(s);
  309 + object = QPDF_String::create(s);
  310 + } else {
  311 + object = QPDF_String::create(val);
  312 + }
  313 + }
  314 + break;
  315 +
  316 + default:
  317 + warn("treating unknown token type as null while reading object");
  318 + if (tooManyBadTokens()) {
  319 + return {QPDF_Null::create()};
  320 + }
  321 + is_null = true;
  322 + break;
  323 + }
  324 +
  325 + if (object == nullptr && !is_null) {
  326 + throw std::logic_error("QPDFParser:parseInternal: unexpected uninitialized object");
  327 + }
  328 +
  329 + switch (frame->state) {
  330 + case st_dictionary:
  331 + case st_array:
  332 + if (is_null) {
  333 + object = null_oh;
  334 + // No need to set description for direct nulls - they probably will become implicit.
  335 + } else if (!indirect_ref && !set_offset) {
  336 + setDescription(object, input->getLastOffset());
  337 + }
  338 + set_offset = true;
  339 + frame->olist.push_back(object);
  340 + break;
  341 +
  342 + case st_top:
  343 + done = true;
  344 + break;
  345 + }
  346 + }
  347 +
  348 + if (is_null) {
  349 + object = QPDF_Null::create();
  350 + }
  351 + if (!set_offset) {
  352 + setDescription(object, frame->offset);
  353 + }
  354 + return object;
  355 +}
  356 +
  357 +QPDFObjectHandle
  358 +QPDFParser::parseRemainder(bool content_stream)
  359 +{
  360 + // This method must take care not to resolve any objects. Don't check the type of any object
  361 + // without first ensuring that it is a direct object. Otherwise, doing so may have the side
  362 + // effect of reading the object and changing the file pointer. If you do this, it will cause a
  363 + // logic error to be thrown from QPDF::inParse().
  364 +
  365 + const static std::shared_ptr<QPDFObject> null_oh = QPDF_Null::create();
  366 +// QPDF::ParseGuard pg(context);
  367 +
  368 +// empty = false;
  369 +
  370 + std::shared_ptr<QPDFObject> object;
  371 + bool set_offset = false;
  372 +
  373 +// std::vector<StackFrame> stack{{input, st_top},};
  374 + bool done = false;
  375 + bool b_contents = false;
  376 + bool is_null = false;
  377 + frame = &stack.back(); // CHANGED
46 378
47 while (!done) { 379 while (!done) {
48 bool indirect_ref = false; 380 bool indirect_ref = false;
@@ -280,7 +612,7 @@ QPDFParser::parse(bool& empty, bool content_stream) @@ -280,7 +612,7 @@ QPDFParser::parse(bool& empty, bool content_stream)
280 // not move the input source's offset. 612 // not move the input source's offset.
281 is_null = true; 613 is_null = true;
282 input->seek(input->getLastOffset(), SEEK_SET); 614 input->seek(input->getLastOffset(), SEEK_SET);
283 - empty = true; 615 +// empty = true;
284 } else { 616 } else {
285 QTC::TC("qpdf", "QPDFParser treat word as string"); 617 QTC::TC("qpdf", "QPDFParser treat word as string");
286 warn("unknown token while reading object; treating as string"); 618 warn("unknown token while reading object; treating as string");
libqpdf/qpdf/QPDFParser.hh
@@ -50,6 +50,9 @@ class QPDFParser @@ -50,6 +50,9 @@ class QPDFParser
50 int null_count{0}; 50 int null_count{0};
51 }; 51 };
52 52
  53 +
  54 + QPDFObjectHandle
  55 + parseRemainder(bool content_stream);
53 bool tooManyBadTokens(); 56 bool tooManyBadTokens();
54 void warn(qpdf_offset_t offset, std::string const& msg) const; 57 void warn(qpdf_offset_t offset, std::string const& msg) const;
55 void warn(std::string const& msg) const; 58 void warn(std::string const& msg) const;
@@ -61,6 +64,8 @@ class QPDFParser @@ -61,6 +64,8 @@ class QPDFParser
61 QPDFObjectHandle::StringDecrypter* decrypter; 64 QPDFObjectHandle::StringDecrypter* decrypter;
62 QPDF* context; 65 QPDF* context;
63 std::shared_ptr<QPDFValue::Description> description; 66 std::shared_ptr<QPDFValue::Description> description;
  67 + std::vector<StackFrame> stack;
  68 + StackFrame* frame;
64 // Number of recent bad tokens. 69 // Number of recent bad tokens.
65 int bad_count = 0; 70 int bad_count = 0;
66 // Number of good tokens since last bad token. Irrelevant if bad_count == 0. 71 // Number of good tokens since last bad token. Irrelevant if bad_count == 0.