Commit 8d2a0eda5a76a341ae6b597f58e874d9e3bd571c

Authored by Jay Berkenbilt
1 parent f5dd6381

Add reactors to the JSON parser

ChangeLog
  1 +2022-05-01 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * JSON: add reactors to the JSON parser, making it possible to
  4 + react to JSON parsing events as they occur and to block the
  5 + results from being stored. This makes it possible to incrementally
  6 + parse arbitrarily large JSON inputs.
  7 +
1 8 2022-04-30 Jay Berkenbilt <ejb@ql.org>
2 9  
3 10 * QPDFWriter: change encryption API calls
... ...
include/qpdf/JSON.hh
... ... @@ -141,9 +141,86 @@ class JSON
141 141 QPDF_DLL
142 142 bool checkSchema(JSON schema, std::list<std::string>& errors);
143 143  
144   - // Create a JSON object from a string.
  144 + // A pointer to a Reactor class can be passed to parse, which
  145 + // will enable the caller to react to incremental events in the
  146 + // construction of the JSON object. This makes it possible to
  147 + // implement SAX-like handling of very large JSON objects.
  148 + class QPDF_DLL_CLASS Reactor
  149 + {
  150 + public:
  151 + QPDF_DLL
  152 + virtual ~Reactor() = default;
  153 +
  154 + // The start/end methods are called when parsing of a
  155 + // dictionary or array is started or ended. The item methods
  156 + // are called when an item is added to a dictionary or array.
  157 + // See important notes in "Item methods" below.
  158 +
  159 + // During parsing of a JSON string, the parser is operating on
  160 + // a single object at a time. When a dictionary or array is
  161 + // started, a new context begins, and when that dictionary or
  162 + // array is ended, the previous context is resumed. So, for
  163 + // example, if you have `{"a": [1]}`, you will receive the
  164 + // following method calls
  165 + //
  166 + // dictionaryStart -- current object is the top-level dictionary
  167 + // arrayStart -- current object is the array
  168 + // dictionaryItem -- called with "a" and the still-empty array
  169 + // arrayItem -- called with the "1" object
  170 + // containerEnd -- now current object is the dictionary again
  171 + // containerEnd -- current object is undefined
  172 + //
  173 + // If the top-level item in a JSON string is a scalar, the
  174 + // topLevelScalar() method will be called. No argument is
  175 + // passed since the object is the same as what is returned by
  176 + // parse().
  177 +
  178 + QPDF_DLL
  179 + virtual void dictionaryStart() = 0;
  180 + QPDF_DLL
  181 + virtual void arrayStart() = 0;
  182 + QPDF_DLL
  183 + virtual void containerEnd(JSON const& value) = 0;
  184 + QPDF_DLL
  185 + virtual void topLevelScalar() = 0;
  186 +
  187 + // Item methods:
  188 + //
  189 + // The return value of the item methods indicates whether the
  190 + // item has been "consumed". If the item method returns true,
  191 + // then the item will not be added to the containing JSON
  192 + // object. This is what allows arbitrarily large JSON objects
  193 + // to be parsed and not have to be kept in memory.
  194 + //
  195 + // NOTE: When a dictionary or an array is added to a
  196 + // container, the dictionaryItem or arrayItem method is called
  197 + // when the child item's start delimiter is encountered, so
  198 + // the JSON object passed in at that time will always be
  199 + // in its initial, empty state.
  200 +
  201 + QPDF_DLL
  202 + virtual bool
  203 + dictionaryItem(std::string const& key, JSON const& value) = 0;
  204 + QPDF_DLL
  205 + virtual bool arrayItem(JSON const& value) = 0;
  206 + };
  207 +
  208 + // Create a JSON object from a string. See above for information
  209 + // about how to use the Reactor.
  210 + QPDF_DLL
  211 + static JSON parse(std::string const&, Reactor* reactor = nullptr);
  212 +
  213 + // parse calls setStart and setEnd to set the inclusive start and
  214 + // non-inclusive end offsets of an object relative to its input
  215 + // string. Otherwise, both values are 0.
  216 + QPDF_DLL
  217 + void setStart(size_t);
  218 + QPDF_DLL
  219 + void setEnd(size_t);
  220 + QPDF_DLL
  221 + size_t getStart() const;
145 222 QPDF_DLL
146   - static JSON parse(std::string const&);
  223 + size_t getEnd() const;
147 224  
148 225 private:
149 226 static std::string encode_string(std::string const& utf8);
... ... @@ -217,6 +294,9 @@ class JSON
217 294 Members(Members const&) = delete;
218 295  
219 296 std::shared_ptr<JSON_value> value;
  297 + // start and end are only populated for objects created by parse
  298 + size_t start;
  299 + size_t end;
220 300 };
221 301  
222 302 std::shared_ptr<Members> m;
... ...
libqpdf/JSON.cc
1 1 #include <qpdf/JSON.hh>
2 2  
  3 +#include <qpdf/QIntC.hh>
3 4 #include <qpdf/QTC.hh>
4 5 #include <qpdf/QUtil.hh>
5 6 #include <cstring>
6 7 #include <stdexcept>
7 8  
8 9 JSON::Members::Members(std::shared_ptr<JSON_value> value) :
9   - value(value)
  10 + value(value),
  11 + start(0),
  12 + end(0)
10 13 {
11 14 }
12 15  
... ... @@ -455,7 +458,8 @@ namespace
455 458 class JSONParser
456 459 {
457 460 public:
458   - JSONParser() :
  461 + JSONParser(JSON::Reactor* reactor) :
  462 + reactor(reactor),
459 463 lex_state(ls_top),
460 464 number_before_point(0),
461 465 number_after_point(0),
... ... @@ -499,6 +503,7 @@ namespace
499 503 ls_backslash,
500 504 };
501 505  
  506 + JSON::Reactor* reactor;
502 507 lex_state_e lex_state;
503 508 size_t number_before_point;
504 509 size_t number_after_point;
... ... @@ -828,10 +833,18 @@ JSONParser::handleToken()
828 833 switch (*tok_start) {
829 834 case '{':
830 835 item = std::make_shared<JSON>(JSON::makeDictionary());
  836 + item->setStart(QIntC::to_size(tok_start - cstr));
  837 + if (reactor) {
  838 + reactor->dictionaryStart();
  839 + }
831 840 break;
832 841  
833 842 case '[':
834 843 item = std::make_shared<JSON>(JSON::makeArray());
  844 + item->setStart(QIntC::to_size(tok_start - cstr));
  845 + if (reactor) {
  846 + reactor->arrayStart();
  847 + }
835 848 break;
836 849  
837 850 default:
... ... @@ -997,6 +1010,11 @@ JSONParser::handleToken()
997 1010 } else if ((delimiter == '}') || (delimiter == ']')) {
998 1011 next_state = ps_stack.back();
999 1012 ps_stack.pop_back();
  1013 + auto tos = stack.back();
  1014 + tos->setEnd(QIntC::to_size(tok_end - cstr));
  1015 + if (reactor) {
  1016 + reactor->containerEnd(*tos);
  1017 + }
1000 1018 if (next_state != ps_done) {
1001 1019 stack.pop_back();
1002 1020 }
... ... @@ -1004,6 +1022,11 @@ JSONParser::handleToken()
1004 1022 throw std::logic_error(
1005 1023 "JSONParser::handleToken: unexpected delimiter in transition");
1006 1024 } else if (item.get()) {
  1025 + if (!(item->isArray() || item->isDictionary())) {
  1026 + item->setStart(QIntC::to_size(tok_start - cstr));
  1027 + item->setEnd(QIntC::to_size(tok_end - cstr));
  1028 + }
  1029 +
1007 1030 std::shared_ptr<JSON> tos;
1008 1031 if (!stack.empty()) {
1009 1032 tos = stack.back();
... ... @@ -1017,14 +1040,18 @@ JSONParser::handleToken()
1017 1040 break;
1018 1041  
1019 1042 case ps_dict_after_colon:
1020   - tos->addDictionaryMember(dict_key, *item);
  1043 + if (!reactor || !reactor->dictionaryItem(dict_key, *item)) {
  1044 + tos->addDictionaryMember(dict_key, *item);
  1045 + }
1021 1046 next_state = ps_dict_after_item;
1022 1047 break;
1023 1048  
1024 1049 case ps_array_begin:
1025 1050 case ps_array_after_comma:
  1051 + if (!reactor || !reactor->arrayItem(*item)) {
  1052 + tos->addArrayElement(*item);
  1053 + }
1026 1054 next_state = ps_array_after_item;
1027   - tos->addArrayElement(*item);
1028 1055 break;
1029 1056  
1030 1057 case ps_top:
... ... @@ -1083,12 +1110,40 @@ JSONParser::parse(std::string const& s)
1083 1110 QTC::TC("libtests", "JSON parse premature EOF");
1084 1111 throw std::runtime_error("JSON: premature end of input");
1085 1112 }
1086   - return stack.back();
  1113 + auto const& tos = stack.back();
  1114 + if (reactor && tos.get() && !(tos->isArray() || tos->isDictionary())) {
  1115 + reactor->topLevelScalar();
  1116 + }
  1117 + return tos;
1087 1118 }
1088 1119  
1089 1120 JSON
1090   -JSON::parse(std::string const& s)
  1121 +JSON::parse(std::string const& s, Reactor* reactor)
1091 1122 {
1092   - JSONParser jp;
  1123 + JSONParser jp(reactor);
1093 1124 return *jp.parse(s);
1094 1125 }
  1126 +
  1127 +void
  1128 +JSON::setStart(size_t start)
  1129 +{
  1130 + this->m->start = start;
  1131 +}
  1132 +
  1133 +void
  1134 +JSON::setEnd(size_t end)
  1135 +{
  1136 + this->m->end = end;
  1137 +}
  1138 +
  1139 +size_t
  1140 +JSON::getStart() const
  1141 +{
  1142 + return this->m->start;
  1143 +}
  1144 +
  1145 +size_t
  1146 +JSON::getEnd() const
  1147 +{
  1148 + return this->m->end;
  1149 +}
... ...
libtests/json_parse.cc
1 1 #include <qpdf/JSON.hh>
2 2 #include <qpdf/QUtil.hh>
  3 +#include <cstdlib>
  4 +#include <cstring>
3 5 #include <iostream>
  6 +#include <memory>
  7 +
  8 +namespace
  9 +{
  10 + class Reactor: public JSON::Reactor
  11 + {
  12 + public:
  13 + virtual ~Reactor() = default;
  14 + virtual void dictionaryStart() override;
  15 + virtual void arrayStart() override;
  16 + virtual void containerEnd(JSON const& value) override;
  17 + virtual void topLevelScalar() override;
  18 + virtual bool
  19 + dictionaryItem(std::string const& key, JSON const& value) override;
  20 + virtual bool arrayItem(JSON const& value) override;
  21 +
  22 + private:
  23 + void printItem(JSON const&);
  24 + };
  25 +} // namespace
  26 +
  27 +void
  28 +Reactor::dictionaryStart()
  29 +{
  30 + std::cout << "dictionary start" << std::endl;
  31 +}
  32 +
  33 +void
  34 +Reactor::arrayStart()
  35 +{
  36 + std::cout << "array start" << std::endl;
  37 +}
  38 +
  39 +void
  40 +Reactor::containerEnd(JSON const& value)
  41 +{
  42 + std::cout << "container end: ";
  43 + printItem(value);
  44 +}
  45 +
  46 +void
  47 +Reactor::topLevelScalar()
  48 +{
  49 + std::cout << "top-level scalar" << std::endl;
  50 +}
  51 +
  52 +bool
  53 +Reactor::dictionaryItem(std::string const& key, JSON const& value)
  54 +{
  55 + std::cout << "dictionary item: " << key << " -> ";
  56 + printItem(value);
  57 + if (key == "keep") {
  58 + return false;
  59 + }
  60 + return true;
  61 +}
  62 +
  63 +bool
  64 +Reactor::arrayItem(JSON const& value)
  65 +{
  66 + std::cout << "array item: ";
  67 + printItem(value);
  68 + std::string n;
  69 + if (value.getString(n) && n == "keep") {
  70 + return false;
  71 + }
  72 + return true;
  73 +}
  74 +
  75 +void
  76 +Reactor::printItem(JSON const& j)
  77 +{
  78 + std::cout << "[" << j.getStart() << ", " << j.getEnd()
  79 + << "): " << j.unparse() << std::endl;
  80 +}
  81 +
  82 +static void
  83 +usage()
  84 +{
  85 + std::cerr << "Usage: json_parse file [--react]" << std::endl;
  86 + exit(2);
  87 +}
4 88  
5 89 int
6 90 main(int argc, char* argv[])
7 91 {
8   - if (argc != 2) {
9   - std::cerr << "Usage: json_parse file" << std::endl;
  92 + if ((argc < 2) || (argc > 3)) {
  93 + usage();
10 94 return 2;
11 95 }
12 96 char const* filename = argv[1];
  97 + std::shared_ptr<Reactor> reactor;
  98 + if (argc == 3) {
  99 + if (strcmp(argv[2], "--react") == 0) {
  100 + reactor = std::make_shared<Reactor>();
  101 + } else {
  102 + usage();
  103 + }
  104 + }
13 105 try {
14 106 std::shared_ptr<char> buf;
15 107 size_t size;
16 108 QUtil::read_file_into_memory(filename, buf, size);
17 109 std::string s(buf.get(), size);
18   - std::cout << JSON::parse(s).unparse() << std::endl;
  110 + std::cout << JSON::parse(s, reactor.get()).unparse() << std::endl;
19 111 } catch (std::exception& e) {
20 112 std::cerr << "exception: " << filename << ": " << e.what() << std::endl;
21 113 return 2;
... ...
libtests/qtest/json_parse.test
... ... @@ -32,7 +32,7 @@ if ($^O ne 'msys')
32 32  
33 33 cleanup();
34 34  
35   -my $good = 9;
  35 +my $good = 10;
36 36  
37 37 for (my $i = 1; $i <= $good; ++$i)
38 38 {
... ... @@ -73,6 +73,11 @@ for (my $i = 1; $i <= $good; ++$i)
73 73 {$td->FILE => "out.json"},
74 74 {$td->STRING => ""});
75 75 }
  76 +
  77 + $td->runtest("good $n reactor",
  78 + {$td->COMMAND => "json_parse good-$n.json --react"},
  79 + {$td->FILE => "good-$n-react.out", $td->EXIT_STATUS => 0},
  80 + $td->NORMALIZE_NEWLINES);
76 81 }
77 82  
78 83 my @bad = (
... ... @@ -127,7 +132,7 @@ foreach my $d (@bad)
127 132  
128 133 cleanup();
129 134  
130   -$td->report((2 * $good) + scalar(@bad));
  135 +$td->report((3 * $good) + scalar(@bad));
131 136  
132 137 sub cleanup
133 138 {
... ...
libtests/qtest/json_parse/good-01-react.out 0 → 100644
  1 +dictionary start
  2 +dictionary item: a -> [6, 11): "bcd"
  3 +array start
  4 +dictionary item: e -> [18, 0): []
  5 +array item: [19, 20): 1
  6 +array item: [41, 42): 2
  7 +array item: [44, 45): 3
  8 +array item: [46, 47): 4
  9 +array item: [48, 54): "five"
  10 +dictionary start
  11 +array item: [56, 0): {}
  12 +dictionary item: six -> [64, 65): 7
  13 +dictionary item: 8 -> [72, 73): 9
  14 +container end: [56, 74): {}
  15 +array item: [76, 80): null
  16 +array item: [82, 86): true
  17 +array item: [107, 112): false
  18 +array item: [114, 134): "a\b\f\n\r\t\\\"/z"
  19 +container end: [18, 135): []
  20 +container end: [0, 136): {}
  21 +{}
... ...
libtests/qtest/json_parse/good-02-react.out 0 → 100644
  1 +dictionary start
  2 +container end: [0, 2): {}
  3 +{}
... ...
libtests/qtest/json_parse/good-03-react.out 0 → 100644
  1 +array start
  2 +container end: [0, 2): []
  3 +[]
... ...
libtests/qtest/json_parse/good-04-react.out 0 → 100644
  1 +array start
  2 +array start
  3 +array item: [1, 0): []
  4 +array start
  5 +array item: [2, 0): []
  6 +dictionary start
  7 +array item: [3, 0): {}
  8 +container end: [3, 5): {}
  9 +container end: [2, 6): []
  10 +dictionary start
  11 +array item: [8, 0): {}
  12 +dictionary start
  13 +dictionary item: -> [13, 0): {}
  14 +container end: [13, 15): {}
  15 +container end: [8, 16): {}
  16 +container end: [1, 17): []
  17 +container end: [0, 18): []
  18 +[]
... ...
libtests/qtest/json_parse/good-05-react.out 0 → 100644
  1 +top-level scalar
  2 +"x"
... ...
libtests/qtest/json_parse/good-06-react.out 0 → 100644
  1 +top-level scalar
  2 +123
... ...
libtests/qtest/json_parse/good-07-react.out 0 → 100644
  1 +top-level scalar
  2 +-123
... ...
libtests/qtest/json_parse/good-08-react.out 0 → 100644
  1 +array start
  2 +array item: [1, 2): 1
  3 +array item: [4, 6): -2
  4 +array item: [8, 11): 3.4
  5 +array item: [13, 17): -5.6
  6 +array item: [19, 23): -9e1
  7 +array item: [25, 29): 10e2
  8 +array item: [31, 37): 12.3e5
  9 +array item: [39, 46): 12.6e-7
  10 +container end: [0, 47): []
  11 +[]
... ...
libtests/qtest/json_parse/good-09-react.out 0 → 100644
  1 +array start
  2 +array item: [1, 7): "aπb"
  3 +array item: [9, 23): "a\b\f\n\r\tc"
  4 +array item: [25, 42): "aπbπc"
  5 +array item: [44, 52): "π"
  6 +array item: [54, 71): "a\u0018bʬc"
  7 +container end: [0, 72): []
  8 +[]
... ...
libtests/qtest/json_parse/good-10-react.out 0 → 100644
  1 +dictionary start
  2 +array start
  3 +dictionary item: a -> [9, 0): []
  4 +array item: [10, 11): 1
  5 +array item: [13, 14): 2
  6 +dictionary start
  7 +array item: [16, 0): {}
  8 +dictionary item: x -> [22, 25): "y"
  9 +container end: [16, 26): {}
  10 +array item: [28, 29): 3
  11 +dictionary start
  12 +array item: [31, 0): {}
  13 +dictionary item: keep -> [40, 61): "not in final output"
  14 +container end: [31, 62): {
  15 + "keep": "not in final output"
  16 +}
  17 +container end: [9, 63): []
  18 +array start
  19 +dictionary item: keep -> [75, 0): []
  20 +array item: [76, 77): 1
  21 +array item: [79, 83): null
  22 +array item: [85, 86): 2
  23 +array item: [88, 93): false
  24 +array item: [95, 101): "keep"
  25 +array item: [103, 104): 3
  26 +array start
  27 +array item: [106, 0): []
  28 +array item: [107, 113): "this"
  29 +array item: [115, 121): "keep"
  30 +array item: [123, 128): "not"
  31 +array item: [130, 137): "final"
  32 +container end: [106, 138): [
  33 + "keep"
  34 +]
  35 +container end: [75, 139): [
  36 + "keep"
  37 +]
  38 +container end: [0, 141): {
  39 + "keep": [
  40 + "keep"
  41 + ]
  42 +}
  43 +{
  44 + "keep": [
  45 + "keep"
  46 + ]
  47 +}
... ...
libtests/qtest/json_parse/good-10.json 0 → 100644
  1 +{
  2 + "a": [1, 2, {"x": "y"}, 3, {"keep": "not in final output"}],
  3 + "keep": [1, null, 2, false, "keep", 3, ["this", "keep", "not", "final"]]
  4 +}
... ...
libtests/qtest/json_parse/save-10.json 0 → 100644
  1 +{
  2 + "a": [
  3 + 1,
  4 + 2,
  5 + {
  6 + "x": "y"
  7 + },
  8 + 3,
  9 + {
  10 + "keep": "not in final output"
  11 + }
  12 + ],
  13 + "keep": [
  14 + 1,
  15 + null,
  16 + 2,
  17 + false,
  18 + "keep",
  19 + 3,
  20 + [
  21 + "this",
  22 + "keep",
  23 + "not",
  24 + "final"
  25 + ]
  26 + ]
  27 +}
... ...