Commit 8d2a0eda5a76a341ae6b597f58e874d9e3bd571c
1 parent
f5dd6381
Add reactors to the JSON parser
Showing
17 changed files
with
401 additions
and
14 deletions
ChangeLog
| 1 | +2022-05-01 Jay Berkenbilt <ejb@ql.org> | |
| 2 | + | |
| 3 | + * JSON: add reactors to the JSON parser, making it possible to | |
| 4 | + react to JSON parsing events as they occur and to block the | |
| 5 | + results from being stored. This makes it possible to incrementally | |
| 6 | + parse arbitrarily large JSON inputs. | |
| 7 | + | |
| 1 | 8 | 2022-04-30 Jay Berkenbilt <ejb@ql.org> |
| 2 | 9 | |
| 3 | 10 | * QPDFWriter: change encryption API calls | ... | ... |
include/qpdf/JSON.hh
| ... | ... | @@ -141,9 +141,86 @@ class JSON |
| 141 | 141 | QPDF_DLL |
| 142 | 142 | bool checkSchema(JSON schema, std::list<std::string>& errors); |
| 143 | 143 | |
| 144 | - // Create a JSON object from a string. | |
| 144 | + // A pointer to a Reactor class can be passed to parse, which | |
| 145 | + // will enable the caller to react to incremental events in the | |
| 146 | + // construction of the JSON object. This makes it possible to | |
| 147 | + // implement SAX-like handling of very large JSON objects. | |
| 148 | + class QPDF_DLL_CLASS Reactor | |
| 149 | + { | |
| 150 | + public: | |
| 151 | + QPDF_DLL | |
| 152 | + virtual ~Reactor() = default; | |
| 153 | + | |
| 154 | + // The start/end methods are called when parsing of a | |
| 155 | + // dictionary or array is started or ended. The item methods | |
| 156 | + // are called when an item is added to a dictionary or array. | |
| 157 | + // See important notes in "Item methods" below. | |
| 158 | + | |
| 159 | + // During parsing of a JSON string, the parser is operating on | |
| 160 | + // a single object at a time. When a dictionary or array is | |
| 161 | + // started, a new context begins, and when that dictionary or | |
| 162 | + // array is ended, the previous context is resumed. So, for | |
| 163 | + // example, if you have `{"a": [1]}`, you will receive the | |
| 164 | + // following method calls: | |
| 165 | + // | |
| 166 | + // dictionaryStart -- current object is the top-level dictionary | |
| 167 | + // arrayStart -- current object is the array | |
| 168 | + // dictionaryItem -- called with "a" and the still-empty array | |
| 169 | + // arrayItem -- called with the "1" object | |
| 170 | + // containerEnd -- now current object is the dictionary again | |
| 171 | + // containerEnd -- current object is undefined | |
| 172 | + // | |
| 173 | + // If the top-level item in a JSON string is a scalar, the | |
| 174 | + // topLevelScalar() method will be called. No argument is | |
| 175 | + // passed since the object is the same as what is returned by | |
| 176 | + // parse(). | |
| 177 | + | |
| 178 | + QPDF_DLL | |
| 179 | + virtual void dictionaryStart() = 0; | |
| 180 | + QPDF_DLL | |
| 181 | + virtual void arrayStart() = 0; | |
| 182 | + QPDF_DLL | |
| 183 | + virtual void containerEnd(JSON const& value) = 0; | |
| 184 | + QPDF_DLL | |
| 185 | + virtual void topLevelScalar() = 0; | |
| 186 | + | |
| 187 | + // Item methods: | |
| 188 | + // | |
| 189 | + // The return value of the item methods indicates whether the | |
| 190 | + // item has been "consumed". If the item method returns true, | |
| 191 | + // then the item will not be added to the containing JSON | |
| 192 | + // object. This is what allows arbitrarily large JSON objects | |
| 193 | + // to be parsed and not have to be kept in memory. | |
| 194 | + // | |
| 195 | + // NOTE: When a dictionary or an array is added to a | |
| 196 | + // container, the dictionaryItem or arrayItem method is called | |
| 197 | + // when the child item's start delimiter is encountered, so | |
| 198 | + // the JSON object passed in at that time will always be | |
| 199 | + // in its initial, empty state. | |
| 200 | + | |
| 201 | + QPDF_DLL | |
| 202 | + virtual bool | |
| 203 | + dictionaryItem(std::string const& key, JSON const& value) = 0; | |
| 204 | + QPDF_DLL | |
| 205 | + virtual bool arrayItem(JSON const& value) = 0; | |
| 206 | + }; | |
| 207 | + | |
| 208 | + // Create a JSON object from a string. See above for information | |
| 209 | + // about how to use the Reactor. | |
| 210 | + QPDF_DLL | |
| 211 | + static JSON parse(std::string const&, Reactor* reactor = nullptr); | |
| 212 | + | |
| 213 | + // parse calls setStart and setEnd to set the inclusive start | |
| 214 | + // and non-inclusive end offsets of an object relative to its | |
| 215 | + // input string. Otherwise, both values are 0. | |
| 216 | + QPDF_DLL | |
| 217 | + void setStart(size_t); | |
| 218 | + QPDF_DLL | |
| 219 | + void setEnd(size_t); | |
| 220 | + QPDF_DLL | |
| 221 | + size_t getStart() const; | |
| 145 | 222 | QPDF_DLL |
| 146 | - static JSON parse(std::string const&); | |
| 223 | + size_t getEnd() const; | |
| 147 | 224 | |
| 148 | 225 | private: |
| 149 | 226 | static std::string encode_string(std::string const& utf8); |
| ... | ... | @@ -217,6 +294,9 @@ class JSON |
| 217 | 294 | Members(Members const&) = delete; |
| 218 | 295 | |
| 219 | 296 | std::shared_ptr<JSON_value> value; |
| 297 | + // start and end are only populated for objects created by parse | |
| 298 | + size_t start; | |
| 299 | + size_t end; | |
| 220 | 300 | }; |
| 221 | 301 | |
| 222 | 302 | std::shared_ptr<Members> m; | ... | ... |
libqpdf/JSON.cc
| 1 | 1 | #include <qpdf/JSON.hh> |
| 2 | 2 | |
| 3 | +#include <qpdf/QIntC.hh> | |
| 3 | 4 | #include <qpdf/QTC.hh> |
| 4 | 5 | #include <qpdf/QUtil.hh> |
| 5 | 6 | #include <cstring> |
| 6 | 7 | #include <stdexcept> |
| 7 | 8 | |
| 8 | 9 | JSON::Members::Members(std::shared_ptr<JSON_value> value) : |
| 9 | - value(value) | |
| 10 | + value(value), | |
| 11 | + start(0), | |
| 12 | + end(0) | |
| 10 | 13 | { |
| 11 | 14 | } |
| 12 | 15 | |
| ... | ... | @@ -455,7 +458,8 @@ namespace |
| 455 | 458 | class JSONParser |
| 456 | 459 | { |
| 457 | 460 | public: |
| 458 | - JSONParser() : | |
| 461 | + JSONParser(JSON::Reactor* reactor) : | |
| 462 | + reactor(reactor), | |
| 459 | 463 | lex_state(ls_top), |
| 460 | 464 | number_before_point(0), |
| 461 | 465 | number_after_point(0), |
| ... | ... | @@ -499,6 +503,7 @@ namespace |
| 499 | 503 | ls_backslash, |
| 500 | 504 | }; |
| 501 | 505 | |
| 506 | + JSON::Reactor* reactor; | |
| 502 | 507 | lex_state_e lex_state; |
| 503 | 508 | size_t number_before_point; |
| 504 | 509 | size_t number_after_point; |
| ... | ... | @@ -828,10 +833,18 @@ JSONParser::handleToken() |
| 828 | 833 | switch (*tok_start) { |
| 829 | 834 | case '{': |
| 830 | 835 | item = std::make_shared<JSON>(JSON::makeDictionary()); |
| 836 | + item->setStart(QIntC::to_size(tok_start - cstr)); | |
| 837 | + if (reactor) { | |
| 838 | + reactor->dictionaryStart(); | |
| 839 | + } | |
| 831 | 840 | break; |
| 832 | 841 | |
| 833 | 842 | case '[': |
| 834 | 843 | item = std::make_shared<JSON>(JSON::makeArray()); |
| 844 | + item->setStart(QIntC::to_size(tok_start - cstr)); | |
| 845 | + if (reactor) { | |
| 846 | + reactor->arrayStart(); | |
| 847 | + } | |
| 835 | 848 | break; |
| 836 | 849 | |
| 837 | 850 | default: |
| ... | ... | @@ -997,6 +1010,11 @@ JSONParser::handleToken() |
| 997 | 1010 | } else if ((delimiter == '}') || (delimiter == ']')) { |
| 998 | 1011 | next_state = ps_stack.back(); |
| 999 | 1012 | ps_stack.pop_back(); |
| 1013 | + auto tos = stack.back(); | |
| 1014 | + tos->setEnd(QIntC::to_size(tok_end - cstr)); | |
| 1015 | + if (reactor) { | |
| 1016 | + reactor->containerEnd(*tos); | |
| 1017 | + } | |
| 1000 | 1018 | if (next_state != ps_done) { |
| 1001 | 1019 | stack.pop_back(); |
| 1002 | 1020 | } |
| ... | ... | @@ -1004,6 +1022,11 @@ JSONParser::handleToken() |
| 1004 | 1022 | throw std::logic_error( |
| 1005 | 1023 | "JSONParser::handleToken: unexpected delimiter in transition"); |
| 1006 | 1024 | } else if (item.get()) { |
| 1025 | + if (!(item->isArray() || item->isDictionary())) { | |
| 1026 | + item->setStart(QIntC::to_size(tok_start - cstr)); | |
| 1027 | + item->setEnd(QIntC::to_size(tok_end - cstr)); | |
| 1028 | + } | |
| 1029 | + | |
| 1007 | 1030 | std::shared_ptr<JSON> tos; |
| 1008 | 1031 | if (!stack.empty()) { |
| 1009 | 1032 | tos = stack.back(); |
| ... | ... | @@ -1017,14 +1040,18 @@ JSONParser::handleToken() |
| 1017 | 1040 | break; |
| 1018 | 1041 | |
| 1019 | 1042 | case ps_dict_after_colon: |
| 1020 | - tos->addDictionaryMember(dict_key, *item); | |
| 1043 | + if (!reactor || !reactor->dictionaryItem(dict_key, *item)) { | |
| 1044 | + tos->addDictionaryMember(dict_key, *item); | |
| 1045 | + } | |
| 1021 | 1046 | next_state = ps_dict_after_item; |
| 1022 | 1047 | break; |
| 1023 | 1048 | |
| 1024 | 1049 | case ps_array_begin: |
| 1025 | 1050 | case ps_array_after_comma: |
| 1051 | + if (!reactor || !reactor->arrayItem(*item)) { | |
| 1052 | + tos->addArrayElement(*item); | |
| 1053 | + } | |
| 1026 | 1054 | next_state = ps_array_after_item; |
| 1027 | - tos->addArrayElement(*item); | |
| 1028 | 1055 | break; |
| 1029 | 1056 | |
| 1030 | 1057 | case ps_top: |
| ... | ... | @@ -1083,12 +1110,40 @@ JSONParser::parse(std::string const& s) |
| 1083 | 1110 | QTC::TC("libtests", "JSON parse premature EOF"); |
| 1084 | 1111 | throw std::runtime_error("JSON: premature end of input"); |
| 1085 | 1112 | } |
| 1086 | - return stack.back(); | |
| 1113 | + auto const& tos = stack.back(); | |
| 1114 | + if (reactor && tos.get() && !(tos->isArray() || tos->isDictionary())) { | |
| 1115 | + reactor->topLevelScalar(); | |
| 1116 | + } | |
| 1117 | + return tos; | |
| 1087 | 1118 | } |
| 1088 | 1119 | |
| 1089 | 1120 | JSON |
| 1090 | -JSON::parse(std::string const& s) | |
| 1121 | +JSON::parse(std::string const& s, Reactor* reactor) | |
| 1091 | 1122 | { |
| 1092 | - JSONParser jp; | |
| 1123 | + JSONParser jp(reactor); | |
| 1093 | 1124 | return *jp.parse(s); |
| 1094 | 1125 | } |
| 1126 | + | |
| 1127 | +void | |
| 1128 | +JSON::setStart(size_t start) | |
| 1129 | +{ | |
| 1130 | + this->m->start = start; | |
| 1131 | +} | |
| 1132 | + | |
| 1133 | +void | |
| 1134 | +JSON::setEnd(size_t end) | |
| 1135 | +{ | |
| 1136 | + this->m->end = end; | |
| 1137 | +} | |
| 1138 | + | |
| 1139 | +size_t | |
| 1140 | +JSON::getStart() const | |
| 1141 | +{ | |
| 1142 | + return this->m->start; | |
| 1143 | +} | |
| 1144 | + | |
| 1145 | +size_t | |
| 1146 | +JSON::getEnd() const | |
| 1147 | +{ | |
| 1148 | + return this->m->end; | |
| 1149 | +} | ... | ... |
libtests/json_parse.cc
| 1 | 1 | #include <qpdf/JSON.hh> |
| 2 | 2 | #include <qpdf/QUtil.hh> |
| 3 | +#include <cstdlib> | |
| 4 | +#include <cstring> | |
| 3 | 5 | #include <iostream> |
| 6 | +#include <memory> | |
| 7 | + | |
| 8 | +namespace | |
| 9 | +{ | |
| 10 | + class Reactor: public JSON::Reactor | |
| 11 | + { | |
| 12 | + public: | |
| 13 | + virtual ~Reactor() = default; | |
| 14 | + virtual void dictionaryStart() override; | |
| 15 | + virtual void arrayStart() override; | |
| 16 | + virtual void containerEnd(JSON const& value) override; | |
| 17 | + virtual void topLevelScalar() override; | |
| 18 | + virtual bool | |
| 19 | + dictionaryItem(std::string const& key, JSON const& value) override; | |
| 20 | + virtual bool arrayItem(JSON const& value) override; | |
| 21 | + | |
| 22 | + private: | |
| 23 | + void printItem(JSON const&); | |
| 24 | + }; | |
| 25 | +} // namespace | |
| 26 | + | |
| 27 | +void | |
| 28 | +Reactor::dictionaryStart() | |
| 29 | +{ | |
| 30 | + std::cout << "dictionary start" << std::endl; | |
| 31 | +} | |
| 32 | + | |
| 33 | +void | |
| 34 | +Reactor::arrayStart() | |
| 35 | +{ | |
| 36 | + std::cout << "array start" << std::endl; | |
| 37 | +} | |
| 38 | + | |
| 39 | +void | |
| 40 | +Reactor::containerEnd(JSON const& value) | |
| 41 | +{ | |
| 42 | + std::cout << "container end: "; | |
| 43 | + printItem(value); | |
| 44 | +} | |
| 45 | + | |
| 46 | +void | |
| 47 | +Reactor::topLevelScalar() | |
| 48 | +{ | |
| 49 | + std::cout << "top-level scalar" << std::endl; | |
| 50 | +} | |
| 51 | + | |
| 52 | +bool | |
| 53 | +Reactor::dictionaryItem(std::string const& key, JSON const& value) | |
| 54 | +{ | |
| 55 | + std::cout << "dictionary item: " << key << " -> "; | |
| 56 | + printItem(value); | |
| 57 | + if (key == "keep") { | |
| 58 | + return false; | |
| 59 | + } | |
| 60 | + return true; | |
| 61 | +} | |
| 62 | + | |
| 63 | +bool | |
| 64 | +Reactor::arrayItem(JSON const& value) | |
| 65 | +{ | |
| 66 | + std::cout << "array item: "; | |
| 67 | + printItem(value); | |
| 68 | + std::string n; | |
| 69 | + if (value.getString(n) && n == "keep") { | |
| 70 | + return false; | |
| 71 | + } | |
| 72 | + return true; | |
| 73 | +} | |
| 74 | + | |
| 75 | +void | |
| 76 | +Reactor::printItem(JSON const& j) | |
| 77 | +{ | |
| 78 | + std::cout << "[" << j.getStart() << ", " << j.getEnd() | |
| 79 | + << "): " << j.unparse() << std::endl; | |
| 80 | +} | |
| 81 | + | |
| 82 | +static void | |
| 83 | +usage() | |
| 84 | +{ | |
| 85 | + std::cerr << "Usage: json_parse file [--react]" << std::endl; | |
| 86 | + exit(2); | |
| 87 | +} | |
| 4 | 88 | |
| 5 | 89 | int |
| 6 | 90 | main(int argc, char* argv[]) |
| 7 | 91 | { |
| 8 | - if (argc != 2) { | |
| 9 | - std::cerr << "Usage: json_parse file" << std::endl; | |
| 92 | + if ((argc < 2) || (argc > 3)) { | |
| 93 | + usage(); | |
| 10 | 94 | return 2; |
| 11 | 95 | } |
| 12 | 96 | char const* filename = argv[1]; |
| 97 | + std::shared_ptr<Reactor> reactor; | |
| 98 | + if (argc == 3) { | |
| 99 | + if (strcmp(argv[2], "--react") == 0) { | |
| 100 | + reactor = std::make_shared<Reactor>(); | |
| 101 | + } else { | |
| 102 | + usage(); | |
| 103 | + } | |
| 104 | + } | |
| 13 | 105 | try { |
| 14 | 106 | std::shared_ptr<char> buf; |
| 15 | 107 | size_t size; |
| 16 | 108 | QUtil::read_file_into_memory(filename, buf, size); |
| 17 | 109 | std::string s(buf.get(), size); |
| 18 | - std::cout << JSON::parse(s).unparse() << std::endl; | |
| 110 | + std::cout << JSON::parse(s, reactor.get()).unparse() << std::endl; | |
| 19 | 111 | } catch (std::exception& e) { |
| 20 | 112 | std::cerr << "exception: " << filename << ": " << e.what() << std::endl; |
| 21 | 113 | return 2; | ... | ... |
libtests/qtest/json_parse.test
| ... | ... | @@ -32,7 +32,7 @@ if ($^O ne 'msys') |
| 32 | 32 | |
| 33 | 33 | cleanup(); |
| 34 | 34 | |
| 35 | -my $good = 9; | |
| 35 | +my $good = 10; | |
| 36 | 36 | |
| 37 | 37 | for (my $i = 1; $i <= $good; ++$i) |
| 38 | 38 | { |
| ... | ... | @@ -73,6 +73,11 @@ for (my $i = 1; $i <= $good; ++$i) |
| 73 | 73 | {$td->FILE => "out.json"}, |
| 74 | 74 | {$td->STRING => ""}); |
| 75 | 75 | } |
| 76 | + | |
| 77 | + $td->runtest("good $n reactor", | |
| 78 | + {$td->COMMAND => "json_parse good-$n.json --react"}, | |
| 79 | + {$td->FILE => "good-$n-react.out", $td->EXIT_STATUS => 0}, | |
| 80 | + $td->NORMALIZE_NEWLINES); | |
| 76 | 81 | } |
| 77 | 82 | |
| 78 | 83 | my @bad = ( |
| ... | ... | @@ -127,7 +132,7 @@ foreach my $d (@bad) |
| 127 | 132 | |
| 128 | 133 | cleanup(); |
| 129 | 134 | |
| 130 | -$td->report((2 * $good) + scalar(@bad)); | |
| 135 | +$td->report((3 * $good) + scalar(@bad)); | |
| 131 | 136 | |
| 132 | 137 | sub cleanup |
| 133 | 138 | { | ... | ... |
libtests/qtest/json_parse/good-01-react.out
0 → 100644
| 1 | +dictionary start | |
| 2 | +dictionary item: a -> [6, 11): "bcd" | |
| 3 | +array start | |
| 4 | +dictionary item: e -> [18, 0): [] | |
| 5 | +array item: [19, 20): 1 | |
| 6 | +array item: [41, 42): 2 | |
| 7 | +array item: [44, 45): 3 | |
| 8 | +array item: [46, 47): 4 | |
| 9 | +array item: [48, 54): "five" | |
| 10 | +dictionary start | |
| 11 | +array item: [56, 0): {} | |
| 12 | +dictionary item: six -> [64, 65): 7 | |
| 13 | +dictionary item: 8 -> [72, 73): 9 | |
| 14 | +container end: [56, 74): {} | |
| 15 | +array item: [76, 80): null | |
| 16 | +array item: [82, 86): true | |
| 17 | +array item: [107, 112): false | |
| 18 | +array item: [114, 134): "a\b\f\n\r\t\\\"/z" | |
| 19 | +container end: [18, 135): [] | |
| 20 | +container end: [0, 136): {} | |
| 21 | +{} | ... | ... |
libtests/qtest/json_parse/good-02-react.out
0 → 100644
libtests/qtest/json_parse/good-03-react.out
0 → 100644
libtests/qtest/json_parse/good-04-react.out
0 → 100644
| 1 | +array start | |
| 2 | +array start | |
| 3 | +array item: [1, 0): [] | |
| 4 | +array start | |
| 5 | +array item: [2, 0): [] | |
| 6 | +dictionary start | |
| 7 | +array item: [3, 0): {} | |
| 8 | +container end: [3, 5): {} | |
| 9 | +container end: [2, 6): [] | |
| 10 | +dictionary start | |
| 11 | +array item: [8, 0): {} | |
| 12 | +dictionary start | |
| 13 | +dictionary item: -> [13, 0): {} | |
| 14 | +container end: [13, 15): {} | |
| 15 | +container end: [8, 16): {} | |
| 16 | +container end: [1, 17): [] | |
| 17 | +container end: [0, 18): [] | |
| 18 | +[] | ... | ... |
libtests/qtest/json_parse/good-05-react.out
0 → 100644
libtests/qtest/json_parse/good-06-react.out
0 → 100644
libtests/qtest/json_parse/good-07-react.out
0 → 100644
libtests/qtest/json_parse/good-08-react.out
0 → 100644
libtests/qtest/json_parse/good-09-react.out
0 → 100644
libtests/qtest/json_parse/good-10-react.out
0 → 100644
| 1 | +dictionary start | |
| 2 | +array start | |
| 3 | +dictionary item: a -> [9, 0): [] | |
| 4 | +array item: [10, 11): 1 | |
| 5 | +array item: [13, 14): 2 | |
| 6 | +dictionary start | |
| 7 | +array item: [16, 0): {} | |
| 8 | +dictionary item: x -> [22, 25): "y" | |
| 9 | +container end: [16, 26): {} | |
| 10 | +array item: [28, 29): 3 | |
| 11 | +dictionary start | |
| 12 | +array item: [31, 0): {} | |
| 13 | +dictionary item: keep -> [40, 61): "not in final output" | |
| 14 | +container end: [31, 62): { | |
| 15 | + "keep": "not in final output" | |
| 16 | +} | |
| 17 | +container end: [9, 63): [] | |
| 18 | +array start | |
| 19 | +dictionary item: keep -> [75, 0): [] | |
| 20 | +array item: [76, 77): 1 | |
| 21 | +array item: [79, 83): null | |
| 22 | +array item: [85, 86): 2 | |
| 23 | +array item: [88, 93): false | |
| 24 | +array item: [95, 101): "keep" | |
| 25 | +array item: [103, 104): 3 | |
| 26 | +array start | |
| 27 | +array item: [106, 0): [] | |
| 28 | +array item: [107, 113): "this" | |
| 29 | +array item: [115, 121): "keep" | |
| 30 | +array item: [123, 128): "not" | |
| 31 | +array item: [130, 137): "final" | |
| 32 | +container end: [106, 138): [ | |
| 33 | + "keep" | |
| 34 | +] | |
| 35 | +container end: [75, 139): [ | |
| 36 | + "keep" | |
| 37 | +] | |
| 38 | +container end: [0, 141): { | |
| 39 | + "keep": [ | |
| 40 | + "keep" | |
| 41 | + ] | |
| 42 | +} | |
| 43 | +{ | |
| 44 | + "keep": [ | |
| 45 | + "keep" | |
| 46 | + ] | |
| 47 | +} | ... | ... |
libtests/qtest/json_parse/good-10.json
0 → 100644
libtests/qtest/json_parse/save-10.json
0 → 100644