Commit 6670c685ab9f929121c5498115b278c95574e461

Authored by m-holger
1 parent 0adfd74f

Move QPDFObjectHandle::parseInternal to new class QPDFParser

Part of #729
include/qpdf/QPDF.hh
... ... @@ -49,6 +49,7 @@ class QPDF_Stream;
49 49 class BitStream;
50 50 class BitWriter;
51 51 class QPDFLogger;
  52 +class QPDFParser;
52 53  
53 54 class QPDF
54 55 {
... ... @@ -881,7 +882,7 @@ class QPDF
881 882 // resolution
882 883 class ParseGuard
883 884 {
884   - friend class QPDFObjectHandle;
  885 + friend class QPDFParser;
885 886  
886 887 private:
887 888 ParseGuard(QPDF* qpdf) :
... ...
include/qpdf/QPDFObjectHandle.hh
... ... @@ -49,9 +49,12 @@ class QPDFTokenizer;
49 49 class QPDFExc;
50 50 class Pl_QPDFTokenizer;
51 51 class QPDFMatrix;
  52 +class QPDFParser;
52 53  
53 54 class QPDFObjectHandle
54 55 {
  56 + friend class QPDFParser;
  57 +
55 58 public:
56 59 // This class is used by replaceStreamData. It provides an
57 60 // alternative way of associating stream data with a stream. See
... ... @@ -1563,15 +1566,6 @@ class QPDFObjectHandle
1563 1566 QPDFObjectHandle(QPDF*, QPDFObjGen const& og);
1564 1567 QPDFObjectHandle(std::shared_ptr<QPDFObject> const&);
1565 1568  
1566   - enum parser_state_e {
1567   - st_top,
1568   - st_start,
1569   - st_stop,
1570   - st_eof,
1571   - st_dictionary,
1572   - st_array
1573   - };
1574   -
1575 1569 // Private object factory methods
1576 1570 static QPDFObjectHandle newIndirect(QPDF*, QPDFObjGen const& og);
1577 1571 static QPDFObjectHandle newStream(
... ... @@ -1599,14 +1593,7 @@ class QPDFObjectHandle
1599 1593 std::string const&,
1600 1594 std::shared_ptr<InputSource>,
1601 1595 qpdf_offset_t);
1602   - static QPDFObjectHandle parseInternal(
1603   - std::shared_ptr<InputSource> input,
1604   - std::string const& object_description,
1605   - QPDFTokenizer& tokenizer,
1606   - bool& empty,
1607   - StringDecrypter* decrypter,
1608   - QPDF* context,
1609   - bool content_stream);
  1596 +
1610 1597 void setParsedOffset(qpdf_offset_t offset);
1611 1598 void parseContentStream_internal(
1612 1599 std::string const& description, ParserCallbacks* callbacks);
... ...
libqpdf/CMakeLists.txt
... ... @@ -80,6 +80,7 @@ set(libqpdf_SOURCES
80 80 QPDFPageDocumentHelper.cc
81 81 QPDFPageLabelDocumentHelper.cc
82 82 QPDFPageObjectHelper.cc
  83 + QPDFParser.cc
83 84 QPDFStreamFilter.cc
84 85 QPDFSystemError.cc
85 86 QPDFTokenizer.cc
... ...
libqpdf/QPDFObjectHandle.cc
... ... @@ -8,6 +8,7 @@
8 8 #include <qpdf/QPDFLogger.hh>
9 9 #include <qpdf/QPDFMatrix.hh>
10 10 #include <qpdf/QPDFPageObjectHelper.hh>
  11 +#include <qpdf/QPDFParser.hh>
11 12 #include <qpdf/QPDF_Array.hh>
12 13 #include <qpdf/QPDF_Bool.hh>
13 14 #include <qpdf/QPDF_Dictionary.hh>
... ... @@ -1879,8 +1880,8 @@ QPDFObjectHandle::parseContentStream_data(
1879 1880 tokenizer.readToken(input, "content", true);
1880 1881 qpdf_offset_t offset = input->getLastOffset();
1881 1882 input->seek(offset, SEEK_SET);
1882   - QPDFObjectHandle obj = parseInternal(
1883   - input, "content", tokenizer, empty, nullptr, context, true);
  1883 + auto obj = QPDFParser(input, "content", tokenizer, nullptr, context)
  1884 + .parse(empty, true);
1884 1885 if (!obj.isInitialized()) {
1885 1886 // EOF
1886 1887 break;
... ... @@ -1943,497 +1944,8 @@ QPDFObjectHandle::parse(
1943 1944 StringDecrypter* decrypter,
1944 1945 QPDF* context)
1945 1946 {
1946   - return parseInternal(
1947   - input, object_description, tokenizer, empty, decrypter, context, false);
1948   -}
1949   -
1950   -QPDFObjectHandle
1951   -QPDFObjectHandle::parseInternal(
1952   - std::shared_ptr<InputSource> input,
1953   - std::string const& object_description,
1954   - QPDFTokenizer& tokenizer,
1955   - bool& empty,
1956   - StringDecrypter* decrypter,
1957   - QPDF* context,
1958   - bool content_stream)
1959   -{
1960   - // This method must take care not to resolve any objects. Don't
1961   - // check the type of any object without first ensuring that it is
1962   - // a direct object. Otherwise, doing so may have the side effect
1963   - // of reading the object and changing the file pointer. If you do
1964   - // this, it will cause a logic error to be thrown from
1965   - // QPDF::inParse().
1966   -
1967   - QPDF::ParseGuard pg(context);
1968   -
1969   - empty = false;
1970   -
1971   - QPDFObjectHandle object;
1972   - bool set_offset = false;
1973   -
1974   - std::vector<SparseOHArray> olist_stack;
1975   - olist_stack.push_back(SparseOHArray());
1976   - std::vector<parser_state_e> state_stack;
1977   - state_stack.push_back(st_top);
1978   - std::vector<qpdf_offset_t> offset_stack;
1979   - qpdf_offset_t offset = input->tell();
1980   - offset_stack.push_back(offset);
1981   - bool done = false;
1982   - int bad_count = 0;
1983   - int good_count = 0;
1984   - bool b_contents = false;
1985   - std::vector<std::string> contents_string_stack;
1986   - contents_string_stack.push_back("");
1987   - std::vector<qpdf_offset_t> contents_offset_stack;
1988   - contents_offset_stack.push_back(-1);
1989   - while (!done) {
1990   - bool bad = false;
1991   - SparseOHArray& olist = olist_stack.back();
1992   - parser_state_e state = state_stack.back();
1993   - offset = offset_stack.back();
1994   - std::string& contents_string = contents_string_stack.back();
1995   - qpdf_offset_t& contents_offset = contents_offset_stack.back();
1996   -
1997   - object = QPDFObjectHandle();
1998   - set_offset = false;
1999   -
2000   - QPDFTokenizer::Token token =
2001   - tokenizer.readToken(input, object_description, true);
2002   - std::string const& token_error_message = token.getErrorMessage();
2003   - if (!token_error_message.empty()) {
2004   - // Tokens other than tt_bad can still generate warnings.
2005   - warn(
2006   - context,
2007   - QPDFExc(
2008   - qpdf_e_damaged_pdf,
2009   - input->getName(),
2010   - object_description,
2011   - input->getLastOffset(),
2012   - token_error_message));
2013   - }
2014   -
2015   - switch (token.getType()) {
2016   - case QPDFTokenizer::tt_eof:
2017   - if (!content_stream) {
2018   - QTC::TC("qpdf", "QPDFObjectHandle eof in parseInternal");
2019   - warn(
2020   - context,
2021   - QPDFExc(
2022   - qpdf_e_damaged_pdf,
2023   - input->getName(),
2024   - object_description,
2025   - input->getLastOffset(),
2026   - "unexpected EOF"));
2027   - }
2028   - bad = true;
2029   - state = st_eof;
2030   - break;
2031   -
2032   - case QPDFTokenizer::tt_bad:
2033   - QTC::TC("qpdf", "QPDFObjectHandle bad token in parse");
2034   - bad = true;
2035   - object = newNull();
2036   - break;
2037   -
2038   - case QPDFTokenizer::tt_brace_open:
2039   - case QPDFTokenizer::tt_brace_close:
2040   - QTC::TC("qpdf", "QPDFObjectHandle bad brace");
2041   - warn(
2042   - context,
2043   - QPDFExc(
2044   - qpdf_e_damaged_pdf,
2045   - input->getName(),
2046   - object_description,
2047   - input->getLastOffset(),
2048   - "treating unexpected brace token as null"));
2049   - bad = true;
2050   - object = newNull();
2051   - break;
2052   -
2053   - case QPDFTokenizer::tt_array_close:
2054   - if (state == st_array) {
2055   - state = st_stop;
2056   - } else {
2057   - QTC::TC("qpdf", "QPDFObjectHandle bad array close");
2058   - warn(
2059   - context,
2060   - QPDFExc(
2061   - qpdf_e_damaged_pdf,
2062   - input->getName(),
2063   - object_description,
2064   - input->getLastOffset(),
2065   - "treating unexpected array close token as null"));
2066   - bad = true;
2067   - object = newNull();
2068   - }
2069   - break;
2070   -
2071   - case QPDFTokenizer::tt_dict_close:
2072   - if (state == st_dictionary) {
2073   - state = st_stop;
2074   - } else {
2075   - QTC::TC("qpdf", "QPDFObjectHandle bad dictionary close");
2076   - warn(
2077   - context,
2078   - QPDFExc(
2079   - qpdf_e_damaged_pdf,
2080   - input->getName(),
2081   - object_description,
2082   - input->getLastOffset(),
2083   - "unexpected dictionary close token"));
2084   - bad = true;
2085   - object = newNull();
2086   - }
2087   - break;
2088   -
2089   - case QPDFTokenizer::tt_array_open:
2090   - case QPDFTokenizer::tt_dict_open:
2091   - if (olist_stack.size() > 500) {
2092   - QTC::TC("qpdf", "QPDFObjectHandle too deep");
2093   - warn(
2094   - context,
2095   - QPDFExc(
2096   - qpdf_e_damaged_pdf,
2097   - input->getName(),
2098   - object_description,
2099   - input->getLastOffset(),
2100   - "ignoring excessively deeply nested data structure"));
2101   - bad = true;
2102   - object = newNull();
2103   - state = st_top;
2104   - } else {
2105   - olist_stack.push_back(SparseOHArray());
2106   - state = st_start;
2107   - offset_stack.push_back(input->tell());
2108   - state_stack.push_back(
2109   - (token.getType() == QPDFTokenizer::tt_array_open)
2110   - ? st_array
2111   - : st_dictionary);
2112   - b_contents = false;
2113   - contents_string_stack.push_back("");
2114   - contents_offset_stack.push_back(-1);
2115   - }
2116   - break;
2117   -
2118   - case QPDFTokenizer::tt_bool:
2119   - object = newBool((token.getValue() == "true"));
2120   - break;
2121   -
2122   - case QPDFTokenizer::tt_null:
2123   - object = newNull();
2124   - break;
2125   -
2126   - case QPDFTokenizer::tt_integer:
2127   - object = newInteger(QUtil::string_to_ll(token.getValue().c_str()));
2128   - break;
2129   -
2130   - case QPDFTokenizer::tt_real:
2131   - object = newReal(token.getValue());
2132   - break;
2133   -
2134   - case QPDFTokenizer::tt_name:
2135   - {
2136   - std::string name = token.getValue();
2137   - object = newName(name);
2138   -
2139   - if (name == "/Contents") {
2140   - b_contents = true;
2141   - } else {
2142   - b_contents = false;
2143   - }
2144   - }
2145   - break;
2146   -
2147   - case QPDFTokenizer::tt_word:
2148   - {
2149   - std::string const& value = token.getValue();
2150   - if (content_stream) {
2151   - object = QPDFObjectHandle::newOperator(value);
2152   - } else if (
2153   - (value == "R") && (state != st_top) &&
2154   - (olist.size() >= 2) &&
2155   - (!olist.at(olist.size() - 1).isIndirect()) &&
2156   - (olist.at(olist.size() - 1).isInteger()) &&
2157   - (!olist.at(olist.size() - 2).isIndirect()) &&
2158   - (olist.at(olist.size() - 2).isInteger())) {
2159   - if (context == nullptr) {
2160   - QTC::TC(
2161   - "qpdf",
2162   - "QPDFObjectHandle indirect without context");
2163   - throw std::logic_error(
2164   - "QPDFObjectHandle::parse called without context"
2165   - " on an object with indirect references");
2166   - }
2167   - // Try to resolve indirect objects
2168   - object = newIndirect(
2169   - context,
2170   - QPDFObjGen(
2171   - olist.at(olist.size() - 2).getIntValueAsInt(),
2172   - olist.at(olist.size() - 1).getIntValueAsInt()));
2173   - olist.remove_last();
2174   - olist.remove_last();
2175   - } else if ((value == "endobj") && (state == st_top)) {
2176   - // We just saw endobj without having read
2177   - // anything. Treat this as a null and do not move
2178   - // the input source's offset.
2179   - object = newNull();
2180   - input->seek(input->getLastOffset(), SEEK_SET);
2181   - empty = true;
2182   - } else {
2183   - QTC::TC("qpdf", "QPDFObjectHandle treat word as string");
2184   - warn(
2185   - context,
2186   - QPDFExc(
2187   - qpdf_e_damaged_pdf,
2188   - input->getName(),
2189   - object_description,
2190   - input->getLastOffset(),
2191   - "unknown token while reading object;"
2192   - " treating as string"));
2193   - bad = true;
2194   - object = newString(value);
2195   - }
2196   - }
2197   - break;
2198   -
2199   - case QPDFTokenizer::tt_string:
2200   - {
2201   - std::string val = token.getValue();
2202   - if (decrypter) {
2203   - if (b_contents) {
2204   - contents_string = val;
2205   - contents_offset = input->getLastOffset();
2206   - b_contents = false;
2207   - }
2208   - decrypter->decryptString(val);
2209   - }
2210   - object = QPDFObjectHandle::newString(val);
2211   - }
2212   -
2213   - break;
2214   -
2215   - default:
2216   - warn(
2217   - context,
2218   - QPDFExc(
2219   - qpdf_e_damaged_pdf,
2220   - input->getName(),
2221   - object_description,
2222   - input->getLastOffset(),
2223   - "treating unknown token type as null while "
2224   - "reading object"));
2225   - bad = true;
2226   - object = newNull();
2227   - break;
2228   - }
2229   -
2230   - if ((!object.isInitialized()) &&
2231   - (!((state == st_start) || (state == st_stop) ||
2232   - (state == st_eof)))) {
2233   - throw std::logic_error("QPDFObjectHandle::parseInternal: "
2234   - "unexpected uninitialized object");
2235   - object = newNull();
2236   - }
2237   -
2238   - if (bad) {
2239   - ++bad_count;
2240   - good_count = 0;
2241   - } else {
2242   - ++good_count;
2243   - if (good_count > 3) {
2244   - bad_count = 0;
2245   - }
2246   - }
2247   - if (bad_count > 5) {
2248   - // We had too many consecutive errors without enough
2249   - // intervening successful objects. Give up.
2250   - warn(
2251   - context,
2252   - QPDFExc(
2253   - qpdf_e_damaged_pdf,
2254   - input->getName(),
2255   - object_description,
2256   - input->getLastOffset(),
2257   - "too many errors; giving up on reading object"));
2258   - state = st_top;
2259   - object = newNull();
2260   - }
2261   -
2262   - switch (state) {
2263   - case st_eof:
2264   - if (state_stack.size() > 1) {
2265   - warn(
2266   - context,
2267   - QPDFExc(
2268   - qpdf_e_damaged_pdf,
2269   - input->getName(),
2270   - object_description,
2271   - input->getLastOffset(),
2272   - "parse error while reading object"));
2273   - }
2274   - done = true;
2275   - // In content stream mode, leave object uninitialized to
2276   - // indicate EOF
2277   - if (!content_stream) {
2278   - object = newNull();
2279   - }
2280   - break;
2281   -
2282   - case st_dictionary:
2283   - case st_array:
2284   - setObjectDescriptionFromInput(
2285   - object,
2286   - context,
2287   - object_description,
2288   - input,
2289   - input->getLastOffset());
2290   - object.setParsedOffset(input->getLastOffset());
2291   - set_offset = true;
2292   - olist.append(object);
2293   - break;
2294   -
2295   - case st_top:
2296   - done = true;
2297   - break;
2298   -
2299   - case st_start:
2300   - break;
2301   -
2302   - case st_stop:
2303   - if ((state_stack.size() < 2) || (olist_stack.size() < 2)) {
2304   - throw std::logic_error(
2305   - "QPDFObjectHandle::parseInternal: st_stop encountered"
2306   - " with insufficient elements in stack");
2307   - }
2308   - parser_state_e old_state = state_stack.back();
2309   - state_stack.pop_back();
2310   - if (old_state == st_array) {
2311   - // There's no newArray(SparseOHArray) since
2312   - // SparseOHArray is not part of the public API.
2313   - object = QPDFObjectHandle(QPDF_Array::create(olist));
2314   - setObjectDescriptionFromInput(
2315   - object, context, object_description, input, offset);
2316   - // The `offset` points to the next of "[". Set the
2317   - // rewind offset to point to the beginning of "[".
2318   - // This has been explicitly tested with whitespace
2319   - // surrounding the array start delimiter.
2320   - // getLastOffset points to the array end token and
2321   - // therefore can't be used here.
2322   - object.setParsedOffset(offset - 1);
2323   - set_offset = true;
2324   - } else if (old_state == st_dictionary) {
2325   - // Convert list to map. Alternating elements are keys.
2326   - // Attempt to recover more or less gracefully from
2327   - // invalid dictionaries.
2328   - std::set<std::string> names;
2329   - size_t n_elements = olist.size();
2330   - for (size_t i = 0; i < n_elements; ++i) {
2331   - QPDFObjectHandle oh = olist.at(i);
2332   - if ((!oh.isIndirect()) && oh.isName()) {
2333   - names.insert(oh.getName());
2334   - }
2335   - }
2336   -
2337   - std::map<std::string, QPDFObjectHandle> dict;
2338   - int next_fake_key = 1;
2339   - for (unsigned int i = 0; i < olist.size(); ++i) {
2340   - QPDFObjectHandle key_obj = olist.at(i);
2341   - QPDFObjectHandle val;
2342   - if (key_obj.isIndirect() || (!key_obj.isName())) {
2343   - bool found_fake = false;
2344   - std::string candidate;
2345   - while (!found_fake) {
2346   - candidate = "/QPDFFake" +
2347   - QUtil::int_to_string(next_fake_key++);
2348   - found_fake = (names.count(candidate) == 0);
2349   - QTC::TC(
2350   - "qpdf",
2351   - "QPDFObjectHandle found fake",
2352   - (found_fake ? 0 : 1));
2353   - }
2354   - warn(
2355   - context,
2356   - QPDFExc(
2357   - qpdf_e_damaged_pdf,
2358   - input->getName(),
2359   - object_description,
2360   - offset,
2361   - "expected dictionary key but found"
2362   - " non-name object; inserting key " +
2363   - candidate));
2364   - val = key_obj;
2365   - key_obj = newName(candidate);
2366   - } else if (i + 1 >= olist.size()) {
2367   - QTC::TC("qpdf", "QPDFObjectHandle no val for last key");
2368   - warn(
2369   - context,
2370   - QPDFExc(
2371   - qpdf_e_damaged_pdf,
2372   - input->getName(),
2373   - object_description,
2374   - offset,
2375   - "dictionary ended prematurely; "
2376   - "using null as value for last key"));
2377   - val = newNull();
2378   - setObjectDescriptionFromInput(
2379   - val, context, object_description, input, offset);
2380   - } else {
2381   - val = olist.at(++i);
2382   - }
2383   - std::string key = key_obj.getName();
2384   - if (dict.count(key) > 0) {
2385   - QTC::TC("qpdf", "QPDFObjectHandle duplicate dict key");
2386   - warn(
2387   - context,
2388   - QPDFExc(
2389   - qpdf_e_damaged_pdf,
2390   - input->getName(),
2391   - object_description,
2392   - offset,
2393   - "dictionary has duplicated key " + key +
2394   - "; last occurrence overrides earlier "
2395   - "ones"));
2396   - }
2397   - dict[key] = val;
2398   - }
2399   - if (!contents_string.empty() && dict.count("/Type") &&
2400   - dict["/Type"].isNameAndEquals("/Sig") &&
2401   - dict.count("/ByteRange") && dict.count("/Contents") &&
2402   - dict["/Contents"].isString()) {
2403   - dict["/Contents"] =
2404   - QPDFObjectHandle::newString(contents_string);
2405   - dict["/Contents"].setParsedOffset(contents_offset);
2406   - }
2407   - object = newDictionary(dict);
2408   - setObjectDescriptionFromInput(
2409   - object, context, object_description, input, offset);
2410   - // The `offset` points to the next of "<<". Set the
2411   - // rewind offset to point to the beginning of "<<".
2412   - // This has been explicitly tested with whitespace
2413   - // surrounding the dictionary start delimiter.
2414   - // getLastOffset points to the dictionary end token
2415   - // and therefore can't be used here.
2416   - object.setParsedOffset(offset - 2);
2417   - set_offset = true;
2418   - }
2419   - olist_stack.pop_back();
2420   - offset_stack.pop_back();
2421   - if (state_stack.back() == st_top) {
2422   - done = true;
2423   - } else {
2424   - olist_stack.back().append(object);
2425   - }
2426   - contents_string_stack.pop_back();
2427   - contents_offset_stack.pop_back();
2428   - }
2429   - }
2430   -
2431   - if (!set_offset) {
2432   - setObjectDescriptionFromInput(
2433   - object, context, object_description, input, offset);
2434   - object.setParsedOffset(offset);
2435   - }
2436   - return object;
  1947 + return QPDFParser(input, object_description, tokenizer, decrypter, context)
  1948 + .parse(empty, false);
2437 1949 }
2438 1950  
2439 1951 qpdf_offset_t
... ...
libqpdf/QPDFParser.cc 0 → 100644
  1 +#include <qpdf/QPDFParser.hh>
  2 +
  3 +#include <qpdf/QPDF.hh>
  4 +#include <qpdf/QPDFObjectHandle.hh>
  5 +#include <qpdf/QPDF_Array.hh>
  6 +#include <qpdf/QTC.hh>
  7 +#include <qpdf/QUtil.hh>
  8 +#include <qpdf/SparseOHArray.hh>
  9 +
  10 +QPDFObjectHandle
  11 +QPDFParser::parse(bool& empty, bool content_stream)
  12 +{
  13 + // This method must take care not to resolve any objects. Don't
  14 + // check the type of any object without first ensuring that it is
  15 + // a direct object. Otherwise, doing so may have the side effect
  16 + // of reading the object and changing the file pointer. If you do
  17 + // this, it will cause a logic error to be thrown from
  18 + // QPDF::inParse().
  19 +
  20 + QPDF::ParseGuard pg(context);
  21 +
  22 + empty = false;
  23 +
  24 + QPDFObjectHandle object;
  25 + bool set_offset = false;
  26 +
  27 + std::vector<SparseOHArray> olist_stack;
  28 + olist_stack.push_back(SparseOHArray());
  29 + std::vector<parser_state_e> state_stack;
  30 + state_stack.push_back(st_top);
  31 + std::vector<qpdf_offset_t> offset_stack;
  32 + qpdf_offset_t offset = input->tell();
  33 + offset_stack.push_back(offset);
  34 + bool done = false;
  35 + int bad_count = 0;
  36 + int good_count = 0;
  37 + bool b_contents = false;
  38 + std::vector<std::string> contents_string_stack;
  39 + contents_string_stack.push_back("");
  40 + std::vector<qpdf_offset_t> contents_offset_stack;
  41 + contents_offset_stack.push_back(-1);
  42 + while (!done) {
  43 + bool bad = false;
  44 + SparseOHArray& olist = olist_stack.back();
  45 + parser_state_e state = state_stack.back();
  46 + offset = offset_stack.back();
  47 + std::string& contents_string = contents_string_stack.back();
  48 + qpdf_offset_t& contents_offset = contents_offset_stack.back();
  49 +
  50 + object = QPDFObjectHandle();
  51 + set_offset = false;
  52 +
  53 + QPDFTokenizer::Token token =
  54 + tokenizer.readToken(input, object_description, true);
  55 + std::string const& token_error_message = token.getErrorMessage();
  56 + if (!token_error_message.empty()) {
  57 + // Tokens other than tt_bad can still generate warnings.
  58 + warn(
  59 + context,
  60 + QPDFExc(
  61 + qpdf_e_damaged_pdf,
  62 + input->getName(),
  63 + object_description,
  64 + input->getLastOffset(),
  65 + token_error_message));
  66 + }
  67 +
  68 + switch (token.getType()) {
  69 + case QPDFTokenizer::tt_eof:
  70 + if (!content_stream) {
  71 + QTC::TC("qpdf", "QPDFParser eof in parse");
  72 + warn(
  73 + context,
  74 + QPDFExc(
  75 + qpdf_e_damaged_pdf,
  76 + input->getName(),
  77 + object_description,
  78 + input->getLastOffset(),
  79 + "unexpected EOF"));
  80 + }
  81 + bad = true;
  82 + state = st_eof;
  83 + break;
  84 +
  85 + case QPDFTokenizer::tt_bad:
  86 + QTC::TC("qpdf", "QPDFParser bad token in parse");
  87 + bad = true;
  88 + object = QPDFObjectHandle::newNull();
  89 + break;
  90 +
  91 + case QPDFTokenizer::tt_brace_open:
  92 + case QPDFTokenizer::tt_brace_close:
  93 + QTC::TC("qpdf", "QPDFParser bad brace");
  94 + warn(
  95 + context,
  96 + QPDFExc(
  97 + qpdf_e_damaged_pdf,
  98 + input->getName(),
  99 + object_description,
  100 + input->getLastOffset(),
  101 + "treating unexpected brace token as null"));
  102 + bad = true;
  103 + object = QPDFObjectHandle::newNull();
  104 + break;
  105 +
  106 + case QPDFTokenizer::tt_array_close:
  107 + if (state == st_array) {
  108 + state = st_stop;
  109 + } else {
  110 + QTC::TC("qpdf", "QPDFParser bad array close");
  111 + warn(
  112 + context,
  113 + QPDFExc(
  114 + qpdf_e_damaged_pdf,
  115 + input->getName(),
  116 + object_description,
  117 + input->getLastOffset(),
  118 + "treating unexpected array close token as null"));
  119 + bad = true;
  120 + object = QPDFObjectHandle::newNull();
  121 + }
  122 + break;
  123 +
  124 + case QPDFTokenizer::tt_dict_close:
  125 + if (state == st_dictionary) {
  126 + state = st_stop;
  127 + } else {
  128 + QTC::TC("qpdf", "QPDFParser bad dictionary close");
  129 + warn(
  130 + context,
  131 + QPDFExc(
  132 + qpdf_e_damaged_pdf,
  133 + input->getName(),
  134 + object_description,
  135 + input->getLastOffset(),
  136 + "unexpected dictionary close token"));
  137 + bad = true;
  138 + object = QPDFObjectHandle::newNull();
  139 + }
  140 + break;
  141 +
  142 + case QPDFTokenizer::tt_array_open:
  143 + case QPDFTokenizer::tt_dict_open:
  144 + if (olist_stack.size() > 500) {
  145 + QTC::TC("qpdf", "QPDFParser too deep");
  146 + warn(
  147 + context,
  148 + QPDFExc(
  149 + qpdf_e_damaged_pdf,
  150 + input->getName(),
  151 + object_description,
  152 + input->getLastOffset(),
  153 + "ignoring excessively deeply nested data structure"));
  154 + bad = true;
  155 + object = QPDFObjectHandle::newNull();
  156 + state = st_top;
  157 + } else {
  158 + olist_stack.push_back(SparseOHArray());
  159 + state = st_start;
  160 + offset_stack.push_back(input->tell());
  161 + state_stack.push_back(
  162 + (token.getType() == QPDFTokenizer::tt_array_open)
  163 + ? st_array
  164 + : st_dictionary);
  165 + b_contents = false;
  166 + contents_string_stack.push_back("");
  167 + contents_offset_stack.push_back(-1);
  168 + }
  169 + break;
  170 +
  171 + case QPDFTokenizer::tt_bool:
  172 + object = QPDFObjectHandle::newBool((token.getValue() == "true"));
  173 + break;
  174 +
  175 + case QPDFTokenizer::tt_null:
  176 + object = QPDFObjectHandle::newNull();
  177 + break;
  178 +
  179 + case QPDFTokenizer::tt_integer:
  180 + object = QPDFObjectHandle::newInteger(
  181 + QUtil::string_to_ll(token.getValue().c_str()));
  182 + break;
  183 +
  184 + case QPDFTokenizer::tt_real:
  185 + object = QPDFObjectHandle::newReal(token.getValue());
  186 + break;
  187 +
  188 + case QPDFTokenizer::tt_name:
  189 + {
  190 + std::string name = token.getValue();
  191 + object = QPDFObjectHandle::newName(name);
  192 +
  193 + if (name == "/Contents") {
  194 + b_contents = true;
  195 + } else {
  196 + b_contents = false;
  197 + }
  198 + }
  199 + break;
  200 +
  201 + case QPDFTokenizer::tt_word:
  202 + {
  203 + std::string const& value = token.getValue();
  204 + if (content_stream) {
  205 + object = QPDFObjectHandle::newOperator(value);
  206 + } else if (
  207 + (value == "R") && (state != st_top) &&
  208 + (olist.size() >= 2) &&
  209 + (!olist.at(olist.size() - 1).isIndirect()) &&
  210 + (olist.at(olist.size() - 1).isInteger()) &&
  211 + (!olist.at(olist.size() - 2).isIndirect()) &&
  212 + (olist.at(olist.size() - 2).isInteger())) {
  213 + if (context == nullptr) {
  214 + QTC::TC("qpdf", "QPDFParser indirect without context");
  215 + throw std::logic_error(
  216 + "QPDFObjectHandle::parse called without context"
  217 + " on an object with indirect references");
  218 + }
  219 + // Try to resolve indirect objects
  220 + object = QPDFObjectHandle::newIndirect(
  221 + context,
  222 + QPDFObjGen(
  223 + olist.at(olist.size() - 2).getIntValueAsInt(),
  224 + olist.at(olist.size() - 1).getIntValueAsInt()));
  225 + olist.remove_last();
  226 + olist.remove_last();
  227 + } else if ((value == "endobj") && (state == st_top)) {
  228 + // We just saw endobj without having read
  229 + // anything. Treat this as a null and do not move
  230 + // the input source's offset.
  231 + object = QPDFObjectHandle::newNull();
  232 + input->seek(input->getLastOffset(), SEEK_SET);
  233 + empty = true;
  234 + } else {
  235 + QTC::TC("qpdf", "QPDFParser treat word as string");
  236 + warn(
  237 + context,
  238 + QPDFExc(
  239 + qpdf_e_damaged_pdf,
  240 + input->getName(),
  241 + object_description,
  242 + input->getLastOffset(),
  243 + "unknown token while reading object;"
  244 + " treating as string"));
  245 + bad = true;
  246 + object = QPDFObjectHandle::newString(value);
  247 + }
  248 + }
  249 + break;
  250 +
  251 + case QPDFTokenizer::tt_string:
  252 + {
  253 + std::string val = token.getValue();
  254 + if (decrypter) {
  255 + if (b_contents) {
  256 + contents_string = val;
  257 + contents_offset = input->getLastOffset();
  258 + b_contents = false;
  259 + }
  260 + decrypter->decryptString(val);
  261 + }
  262 + object = QPDFObjectHandle::newString(val);
  263 + }
  264 +
  265 + break;
  266 +
  267 + default:
  268 + warn(
  269 + context,
  270 + QPDFExc(
  271 + qpdf_e_damaged_pdf,
  272 + input->getName(),
  273 + object_description,
  274 + input->getLastOffset(),
  275 + "treating unknown token type as null while "
  276 + "reading object"));
  277 + bad = true;
  278 + object = QPDFObjectHandle::newNull();
  279 + break;
  280 + }
  281 +
  282 + if ((!object.isInitialized()) &&
  283 + (!((state == st_start) || (state == st_stop) ||
  284 + (state == st_eof)))) {
  285 + throw std::logic_error("QPDFObjectHandle::parseInternal: "
  286 + "unexpected uninitialized object");
  287 + object = QPDFObjectHandle::newNull();
  288 + }
  289 +
  290 + if (bad) {
  291 + ++bad_count;
  292 + good_count = 0;
  293 + } else {
  294 + ++good_count;
  295 + if (good_count > 3) {
  296 + bad_count = 0;
  297 + }
  298 + }
  299 + if (bad_count > 5) {
  300 + // We had too many consecutive errors without enough
  301 + // intervening successful objects. Give up.
  302 + warn(
  303 + context,
  304 + QPDFExc(
  305 + qpdf_e_damaged_pdf,
  306 + input->getName(),
  307 + object_description,
  308 + input->getLastOffset(),
  309 + "too many errors; giving up on reading object"));
  310 + state = st_top;
  311 + object = QPDFObjectHandle::newNull();
  312 + }
  313 +
  314 + switch (state) {
  315 + case st_eof:
  316 + if (state_stack.size() > 1) {
  317 + warn(
  318 + context,
  319 + QPDFExc(
  320 + qpdf_e_damaged_pdf,
  321 + input->getName(),
  322 + object_description,
  323 + input->getLastOffset(),
  324 + "parse error while reading object"));
  325 + }
  326 + done = true;
  327 + // In content stream mode, leave object uninitialized to
  328 + // indicate EOF
  329 + if (!content_stream) {
  330 + object = QPDFObjectHandle::newNull();
  331 + }
  332 + break;
  333 +
  334 + case st_dictionary:
  335 + case st_array:
  336 + QPDFObjectHandle::setObjectDescriptionFromInput(
  337 + object,
  338 + context,
  339 + object_description,
  340 + input,
  341 + input->getLastOffset());
  342 + object.setParsedOffset(input->getLastOffset());
  343 + set_offset = true;
  344 + olist.append(object);
  345 + break;
  346 +
  347 + case st_top:
  348 + done = true;
  349 + break;
  350 +
  351 + case st_start:
  352 + break;
  353 +
  354 + case st_stop:
  355 + if ((state_stack.size() < 2) || (olist_stack.size() < 2)) {
  356 + throw std::logic_error(
  357 + "QPDFObjectHandle::parseInternal: st_stop encountered"
  358 + " with insufficient elements in stack");
  359 + }
  360 + parser_state_e old_state = state_stack.back();
  361 + state_stack.pop_back();
  362 + if (old_state == st_array) {
  363 + // There's no newArray(SparseOHArray) since
  364 + // SparseOHArray is not part of the public API.
  365 + object = QPDFObjectHandle(QPDF_Array::create(olist));
  366 + QPDFObjectHandle::setObjectDescriptionFromInput(
  367 + object, context, object_description, input, offset);
  368 + // The `offset` points to the next of "[". Set the
  369 + // rewind offset to point to the beginning of "[".
  370 + // This has been explicitly tested with whitespace
  371 + // surrounding the array start delimiter.
  372 + // getLastOffset points to the array end token and
  373 + // therefore can't be used here.
  374 + object.setParsedOffset(offset - 1);
  375 + set_offset = true;
  376 + } else if (old_state == st_dictionary) {
  377 + // Convert list to map. Alternating elements are keys.
  378 + // Attempt to recover more or less gracefully from
  379 + // invalid dictionaries.
  380 + std::set<std::string> names;
  381 + size_t n_elements = olist.size();
  382 + for (size_t i = 0; i < n_elements; ++i) {
  383 + QPDFObjectHandle oh = olist.at(i);
  384 + if ((!oh.isIndirect()) && oh.isName()) {
  385 + names.insert(oh.getName());
  386 + }
  387 + }
  388 +
  389 + std::map<std::string, QPDFObjectHandle> dict;
  390 + int next_fake_key = 1;
  391 + for (unsigned int i = 0; i < olist.size(); ++i) {
  392 + QPDFObjectHandle key_obj = olist.at(i);
  393 + QPDFObjectHandle val;
  394 + if (key_obj.isIndirect() || (!key_obj.isName())) {
  395 + bool found_fake = false;
  396 + std::string candidate;
  397 + while (!found_fake) {
  398 + candidate = "/QPDFFake" +
  399 + QUtil::int_to_string(next_fake_key++);
  400 + found_fake = (names.count(candidate) == 0);
  401 + QTC::TC(
  402 + "qpdf",
  403 + "QPDFParser found fake",
  404 + (found_fake ? 0 : 1));
  405 + }
  406 + warn(
  407 + context,
  408 + QPDFExc(
  409 + qpdf_e_damaged_pdf,
  410 + input->getName(),
  411 + object_description,
  412 + offset,
  413 + "expected dictionary key but found"
  414 + " non-name object; inserting key " +
  415 + candidate));
  416 + val = key_obj;
  417 + key_obj = QPDFObjectHandle::newName(candidate);
  418 + } else if (i + 1 >= olist.size()) {
  419 + QTC::TC("qpdf", "QPDFParser no val for last key");
  420 + warn(
  421 + context,
  422 + QPDFExc(
  423 + qpdf_e_damaged_pdf,
  424 + input->getName(),
  425 + object_description,
  426 + offset,
  427 + "dictionary ended prematurely; "
  428 + "using null as value for last key"));
  429 + val = QPDFObjectHandle::newNull();
  430 + QPDFObjectHandle::setObjectDescriptionFromInput(
  431 + val, context, object_description, input, offset);
  432 + } else {
  433 + val = olist.at(++i);
  434 + }
  435 + std::string key = key_obj.getName();
  436 + if (dict.count(key) > 0) {
  437 + QTC::TC("qpdf", "QPDFParser duplicate dict key");
  438 + warn(
  439 + context,
  440 + QPDFExc(
  441 + qpdf_e_damaged_pdf,
  442 + input->getName(),
  443 + object_description,
  444 + offset,
  445 + "dictionary has duplicated key " + key +
  446 + "; last occurrence overrides earlier "
  447 + "ones"));
  448 + }
  449 + dict[key] = val;
  450 + }
  451 + if (!contents_string.empty() && dict.count("/Type") &&
  452 + dict["/Type"].isNameAndEquals("/Sig") &&
  453 + dict.count("/ByteRange") && dict.count("/Contents") &&
  454 + dict["/Contents"].isString()) {
  455 + dict["/Contents"] =
  456 + QPDFObjectHandle::newString(contents_string);
  457 + dict["/Contents"].setParsedOffset(contents_offset);
  458 + }
  459 + object = QPDFObjectHandle::newDictionary(dict);
  460 + QPDFObjectHandle::setObjectDescriptionFromInput(
  461 + object, context, object_description, input, offset);
  462 + // The `offset` points to the next of "<<". Set the
  463 + // rewind offset to point to the beginning of "<<".
  464 + // This has been explicitly tested with whitespace
  465 + // surrounding the dictionary start delimiter.
  466 + // getLastOffset points to the dictionary end token
  467 + // and therefore can't be used here.
  468 + object.setParsedOffset(offset - 2);
  469 + set_offset = true;
  470 + }
  471 + olist_stack.pop_back();
  472 + offset_stack.pop_back();
  473 + if (state_stack.back() == st_top) {
  474 + done = true;
  475 + } else {
  476 + olist_stack.back().append(object);
  477 + }
  478 + contents_string_stack.pop_back();
  479 + contents_offset_stack.pop_back();
  480 + }
  481 + }
  482 +
  483 + if (!set_offset) {
  484 + QPDFObjectHandle::setObjectDescriptionFromInput(
  485 + object, context, object_description, input, offset);
  486 + object.setParsedOffset(offset);
  487 + }
  488 + return object;
  489 +}
  490 +
  491 +void
  492 +QPDFParser::warn(QPDF* qpdf, QPDFExc const& e)
  493 +{
  494 + // Report the parse problem described by e. When a QPDF is supplied
  495 + // (qpdf is non-null), the problem is passed to qpdf->warn(e). When
  496 + // there is no QPDF (qpdf is null), such as an explicit creation of
  497 + // an object from a string, e is thrown to the caller instead.
  498 + if (qpdf) {
  499 + qpdf->warn(e);
  500 + } else {
  501 + throw e; // no QPDF to warn through: raise e as an exception
  502 + }
  503 +}
... ...
libqpdf/qpdf/QPDFParser.hh 0 → 100644
  1 +#ifndef QPDFPARSER_HH
  2 +#define QPDFPARSER_HH
  3 +
  4 +#include <qpdf/QPDFObjectHandle.hh>
  5 +
  6 +#include <memory>
  7 +#include <string>
  8 +
  9 +class QPDFParser // Bundles the input source, description, tokenizer, string decrypter and QPDF used by parse().
  10 +{
  11 + public:
  12 + QPDFParser() = delete; // a parser cannot be built without the five arguments below
  13 + QPDFParser(
  14 + std::shared_ptr<InputSource> input,
  15 + std::string const& object_description,
  16 + QPDFTokenizer& tokenizer,
  17 + QPDFObjectHandle::StringDecrypter* decrypter,
  18 + QPDF* context) :
  19 + input(input),
  20 + object_description(object_description),
  21 + tokenizer(tokenizer),
  22 + decrypter(decrypter),
  23 + context(context)
  24 + {
  25 + }
  26 + virtual ~QPDFParser() = default; // NOTE(review): no other virtual member is declared here; confirm virtual is intended
  27 +
  28 + QPDFObjectHandle parse(bool& empty, bool content_stream); // declared here, defined out of line
  29 +
  30 + private:
  31 + enum parser_state_e {
  32 + st_top, // bottom of the state stack: parse() finishes when a container closes back to this state
  33 + st_start,
  34 + st_stop, // a container was closed: parse() builds the array or dictionary from the collected objects
  35 + st_eof,
  36 + st_dictionary, // the container being closed is a dictionary
  37 + st_array // the container being closed is an array
  38 + };
  39 +
  40 + static void warn(QPDF*, QPDFExc const&); // warns through the QPDF when it is non-null, otherwise throws the exception
  41 + void setParsedOffset(qpdf_offset_t offset); // NOTE(review): no definition visible in this chunk; confirm it is defined and used
  42 +
  43 + std::shared_ptr<InputSource> input; // shared ownership of the input being parsed
  44 + std::string const& object_description; // held by reference: the caller's string must outlive this parser
  45 + QPDFTokenizer& tokenizer; // held by reference: the caller's tokenizer must outlive this parser
  46 + QPDFObjectHandle::StringDecrypter* decrypter; // raw pointer, not deleted by this class; presumably may be null (confirm against callers)
  47 + QPDF* context; // passed to warn(); when null, warn() throws instead of warning
  48 +};
  49 +
  50 +#endif // QPDFPARSER_HH
... ...
qpdf/qpdf.testcov
... ... @@ -56,12 +56,12 @@ QPDF missing trailer 0
56 56 QPDF trailer lacks size 0
57 57 QPDF trailer size not integer 0
58 58 QPDF trailer prev not integer 0
59   -QPDFObjectHandle bad brace 0
60   -QPDFObjectHandle bad array close 0
  59 +QPDFParser bad brace 0
  60 +QPDFParser bad array close 0
61 61 QPDF stream without length 0
62 62 QPDF stream length not integer 0
63 63 QPDF missing endstream 0
64   -QPDFObjectHandle bad dictionary close 0
  64 +QPDFParser bad dictionary close 0
65 65 QPDF can't find xref 0
66 66 QPDFTokenizer bad ) 0
67 67 QPDFTokenizer bad > 0
... ... @@ -215,7 +215,7 @@ QPDF not copying pages object 0
215 215 QPDF insert foreign page 0
216 216 QPDFWriter foreign object 0
217 217 QPDFWriter copy use_aes 1
218   -QPDFObjectHandle indirect without context 0
  218 +QPDFParser indirect without context 0
219 219 QPDFObjectHandle trailing data in parse 0
220 220 QPDFJob pages encryption password 0
221 221 QPDFTokenizer EOF reading token 0
... ... @@ -257,9 +257,9 @@ qpdf-c called qpdf_set_deterministic_ID 0
257 257 QPDFObjectHandle indirect with 0 objid 0
258 258 QPDF object id 0 0
259 259 QPDF recursion loop in resolve 0
260   -QPDFObjectHandle treat word as string 0
261   -QPDFObjectHandle found fake 1
262   -QPDFObjectHandle no val for last key 0
  260 +QPDFParser treat word as string 0
  261 +QPDFParser found fake 1
  262 +QPDFParser no val for last key 0
263 263 QPDF resolve failure to null 0
264 264 QPDFWriter preserve unreferenced standard 0
265 265 QPDFObjectHandle errors in parsecontent 0
... ... @@ -288,8 +288,8 @@ QPDFObjectHandle non-stream in stream array 0
288 288 QPDFObjectHandle coalesce called on stream 0
289 289 QPDFObjectHandle coalesce provide stream data 0
290 290 QPDF_Stream bad token at end during normalize 0
291   -QPDFObjectHandle bad token in parse 0
292   -QPDFObjectHandle eof in parseInternal 0
  291 +QPDFParser bad token in parse 0
  292 +QPDFParser eof in parse 0
293 293 QPDFObjectHandle array bounds 0
294 294 QPDFObjectHandle boolean returning false 0
295 295 QPDFObjectHandle integer returning 0 0
... ... @@ -317,7 +317,7 @@ QPDFObjectHandle numeric non-numeric 0
317 317 QPDFObjectHandle erase array bounds 0
318 318 qpdf-c called qpdf_check_pdf 0
319 319 QPDF xref loop 0
320   -QPDFObjectHandle too deep 0
  320 +QPDFParser too deep 0
321 321 QPDFFormFieldObjectHelper non-trivial inheritance 0
322 322 QPDFFormFieldObjectHelper non-trivial qualified name 0
323 323 QPDFFormFieldObjectHelper TU present 0
... ... @@ -428,7 +428,7 @@ QPDF eof skipping spaces before xref 1
428 428 QPDF_encryption user matches owner V < 5 0
429 429 QPDF_encryption same password 1
430 430 QPDFWriter stream in ostream 0
431   -QPDFObjectHandle duplicate dict key 0
  431 +QPDFParser duplicate dict key 0
432 432 QPDFWriter no encryption sig contents 0
433 433 QPDFPageObjectHelper colorspace lookup 0
434 434 QPDFWriter ignore XRef in qdf mode 0
... ...