Commit 8d2a0eda5a76a341ae6b597f58e874d9e3bd571c

Authored by Jay Berkenbilt
1 parent f5dd6381

Add reactors to the JSON parser

ChangeLog
  1 +2022-05-01 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * JSON: add reactors to the JSON parser, making it possible to
  4 + react to JSON parsing events as they occur and to block the
  5 + results from being stored. This makes it possible to incrementally
  6 + parse arbitrarily large JSON inputs.
  7 +
1 2022-04-30 Jay Berkenbilt <ejb@ql.org> 8 2022-04-30 Jay Berkenbilt <ejb@ql.org>
2 9
3 * QPDFWriter: change encryption API calls 10 * QPDFWriter: change encryption API calls
include/qpdf/JSON.hh
@@ -141,9 +141,86 @@ class JSON @@ -141,9 +141,86 @@ class JSON
141 QPDF_DLL 141 QPDF_DLL
142 bool checkSchema(JSON schema, std::list<std::string>& errors); 142 bool checkSchema(JSON schema, std::list<std::string>& errors);
143 143
144 - // Create a JSON object from a string. 144 + // A pointer to a Reactor class can be passed to parse, which
  145 + // will enable the caller to react to incremental events in the
  146 + // construction of the JSON object. This makes it possible to
  147 + // implement SAX-like handling of very large JSON objects.
  148 + class QPDF_DLL_CLASS Reactor
  149 + {
  150 + public:
  151 + QPDF_DLL
  152 + virtual ~Reactor() = default;
  153 +
  154 + // The start/end methods are called when parsing of a
  155 + // dictionary or array is started or ended. The item methods
  156 + // are called when an item is added to a dictionary or array.
  157 + // See important notes in "Item methods" below.
  158 +
  159 + // During parsing of a JSON string, the parser is operating on
  160 + // a single object at a time. When a dictionary or array is
  161 + // started, a new context begins, and when that dictionary or
  162 + // array is ended, the previous context is resumed. So, for
  163 + // example, if you have `{"a": [1]}`, you will receive the
  164 + // following method calls
  165 + //
  166 + // dictionaryStart -- current object is the top-level dictionary
  167 + // arrayStart -- current object is the array
  168 + // dictionaryItem -- called with "a" and the just-started, still-empty array
  169 + // arrayItem -- called with the "1" object
  170 + // containerEnd -- now current object is the dictionary again
  171 + // containerEnd -- current object is undefined
  172 + //
  173 + // If the top-level item in a JSON string is a scalar, the
  174 + // topLevelScalar() method will be called. No argument is
  175 + // passed since the object is the same as what is returned by
  176 + // parse().
  177 +
  178 + QPDF_DLL
  179 + virtual void dictionaryStart() = 0;
  180 + QPDF_DLL
  181 + virtual void arrayStart() = 0;
  182 + QPDF_DLL
  183 + virtual void containerEnd(JSON const& value) = 0;
  184 + QPDF_DLL
  185 + virtual void topLevelScalar() = 0;
  186 +
  187 + // Item methods:
  188 + //
  189 + // The return value of the item methods indicates whether the
  190 + // item has been "consumed". If the item method returns true,
  191 + // then the item will not be added to the containing JSON
  192 + // object. This is what allows arbitrarily large JSON objects
  193 + // to be parsed and not have to be kept in memory.
  194 + //
  195 + // NOTE: When a dictionary or an array is added to a
  196 + // container, the dictionaryItem or arrayItem method is called
  197 + // when the child item's start delimiter is encountered, so
  198 + // the JSON object passed in at that time will always be
  199 + // in its initial, empty state.
  200 +
  201 + QPDF_DLL
  202 + virtual bool
  203 + dictionaryItem(std::string const& key, JSON const& value) = 0;
  204 + QPDF_DLL
  205 + virtual bool arrayItem(JSON const& value) = 0;
  206 + };
  207 +
  208 + // Create a JSON object from a string. See above for information
  209 + // about how to use the Reactor.
  210 + QPDF_DLL
  211 + static JSON parse(std::string const&, Reactor* reactor = nullptr);
  212 +
  213 + // parse calls setStart and setEnd to set the inclusive start and
  214 + // non-inclusive end offsets of an object relative to its input
  215 + // string. Otherwise, both values are 0.
  216 + QPDF_DLL
  217 + void setStart(size_t);
  218 + QPDF_DLL
  219 + void setEnd(size_t);
  220 + QPDF_DLL
  221 + size_t getStart() const;
145 QPDF_DLL 222 QPDF_DLL
146 - static JSON parse(std::string const&); 223 + size_t getEnd() const;
147 224
148 private: 225 private:
149 static std::string encode_string(std::string const& utf8); 226 static std::string encode_string(std::string const& utf8);
@@ -217,6 +294,9 @@ class JSON @@ -217,6 +294,9 @@ class JSON
217 Members(Members const&) = delete; 294 Members(Members const&) = delete;
218 295
219 std::shared_ptr<JSON_value> value; 296 std::shared_ptr<JSON_value> value;
  297 + // start and end are only populated for objects created by parse
  298 + size_t start;
  299 + size_t end;
220 }; 300 };
221 301
222 std::shared_ptr<Members> m; 302 std::shared_ptr<Members> m;
libqpdf/JSON.cc
1 #include <qpdf/JSON.hh> 1 #include <qpdf/JSON.hh>
2 2
  3 +#include <qpdf/QIntC.hh>
3 #include <qpdf/QTC.hh> 4 #include <qpdf/QTC.hh>
4 #include <qpdf/QUtil.hh> 5 #include <qpdf/QUtil.hh>
5 #include <cstring> 6 #include <cstring>
6 #include <stdexcept> 7 #include <stdexcept>
7 8
8 JSON::Members::Members(std::shared_ptr<JSON_value> value) : 9 JSON::Members::Members(std::shared_ptr<JSON_value> value) :
9 - value(value) 10 + value(value),
  11 + start(0),
  12 + end(0)
10 { 13 {
11 } 14 }
12 15
@@ -455,7 +458,8 @@ namespace @@ -455,7 +458,8 @@ namespace
455 class JSONParser 458 class JSONParser
456 { 459 {
457 public: 460 public:
458 - JSONParser() : 461 + JSONParser(JSON::Reactor* reactor) :
  462 + reactor(reactor),
459 lex_state(ls_top), 463 lex_state(ls_top),
460 number_before_point(0), 464 number_before_point(0),
461 number_after_point(0), 465 number_after_point(0),
@@ -499,6 +503,7 @@ namespace @@ -499,6 +503,7 @@ namespace
499 ls_backslash, 503 ls_backslash,
500 }; 504 };
501 505
  506 + JSON::Reactor* reactor;
502 lex_state_e lex_state; 507 lex_state_e lex_state;
503 size_t number_before_point; 508 size_t number_before_point;
504 size_t number_after_point; 509 size_t number_after_point;
@@ -828,10 +833,18 @@ JSONParser::handleToken() @@ -828,10 +833,18 @@ JSONParser::handleToken()
828 switch (*tok_start) { 833 switch (*tok_start) {
829 case '{': 834 case '{':
830 item = std::make_shared<JSON>(JSON::makeDictionary()); 835 item = std::make_shared<JSON>(JSON::makeDictionary());
  836 + item->setStart(QIntC::to_size(tok_start - cstr));
  837 + if (reactor) {
  838 + reactor->dictionaryStart();
  839 + }
831 break; 840 break;
832 841
833 case '[': 842 case '[':
834 item = std::make_shared<JSON>(JSON::makeArray()); 843 item = std::make_shared<JSON>(JSON::makeArray());
  844 + item->setStart(QIntC::to_size(tok_start - cstr));
  845 + if (reactor) {
  846 + reactor->arrayStart();
  847 + }
835 break; 848 break;
836 849
837 default: 850 default:
@@ -997,6 +1010,11 @@ JSONParser::handleToken() @@ -997,6 +1010,11 @@ JSONParser::handleToken()
997 } else if ((delimiter == '}') || (delimiter == ']')) { 1010 } else if ((delimiter == '}') || (delimiter == ']')) {
998 next_state = ps_stack.back(); 1011 next_state = ps_stack.back();
999 ps_stack.pop_back(); 1012 ps_stack.pop_back();
  1013 + auto tos = stack.back();
  1014 + tos->setEnd(QIntC::to_size(tok_end - cstr));
  1015 + if (reactor) {
  1016 + reactor->containerEnd(*tos);
  1017 + }
1000 if (next_state != ps_done) { 1018 if (next_state != ps_done) {
1001 stack.pop_back(); 1019 stack.pop_back();
1002 } 1020 }
@@ -1004,6 +1022,11 @@ JSONParser::handleToken() @@ -1004,6 +1022,11 @@ JSONParser::handleToken()
1004 throw std::logic_error( 1022 throw std::logic_error(
1005 "JSONParser::handleToken: unexpected delimiter in transition"); 1023 "JSONParser::handleToken: unexpected delimiter in transition");
1006 } else if (item.get()) { 1024 } else if (item.get()) {
  1025 + if (!(item->isArray() || item->isDictionary())) {
  1026 + item->setStart(QIntC::to_size(tok_start - cstr));
  1027 + item->setEnd(QIntC::to_size(tok_end - cstr));
  1028 + }
  1029 +
1007 std::shared_ptr<JSON> tos; 1030 std::shared_ptr<JSON> tos;
1008 if (!stack.empty()) { 1031 if (!stack.empty()) {
1009 tos = stack.back(); 1032 tos = stack.back();
@@ -1017,14 +1040,18 @@ JSONParser::handleToken() @@ -1017,14 +1040,18 @@ JSONParser::handleToken()
1017 break; 1040 break;
1018 1041
1019 case ps_dict_after_colon: 1042 case ps_dict_after_colon:
1020 - tos->addDictionaryMember(dict_key, *item); 1043 + if (!reactor || !reactor->dictionaryItem(dict_key, *item)) {
  1044 + tos->addDictionaryMember(dict_key, *item);
  1045 + }
1021 next_state = ps_dict_after_item; 1046 next_state = ps_dict_after_item;
1022 break; 1047 break;
1023 1048
1024 case ps_array_begin: 1049 case ps_array_begin:
1025 case ps_array_after_comma: 1050 case ps_array_after_comma:
  1051 + if (!reactor || !reactor->arrayItem(*item)) {
  1052 + tos->addArrayElement(*item);
  1053 + }
1026 next_state = ps_array_after_item; 1054 next_state = ps_array_after_item;
1027 - tos->addArrayElement(*item);  
1028 break; 1055 break;
1029 1056
1030 case ps_top: 1057 case ps_top:
@@ -1083,12 +1110,40 @@ JSONParser::parse(std::string const& s) @@ -1083,12 +1110,40 @@ JSONParser::parse(std::string const& s)
1083 QTC::TC("libtests", "JSON parse premature EOF"); 1110 QTC::TC("libtests", "JSON parse premature EOF");
1084 throw std::runtime_error("JSON: premature end of input"); 1111 throw std::runtime_error("JSON: premature end of input");
1085 } 1112 }
1086 - return stack.back(); 1113 + auto const& tos = stack.back();
  1114 + if (reactor && tos.get() && !(tos->isArray() || tos->isDictionary())) {
  1115 + reactor->topLevelScalar();
  1116 + }
  1117 + return tos;
1087 } 1118 }
1088 1119
1089 JSON 1120 JSON
1090 -JSON::parse(std::string const& s) 1121 +JSON::parse(std::string const& s, Reactor* reactor)
1091 { 1122 {
1092 - JSONParser jp; 1123 + JSONParser jp(reactor);
1093 return *jp.parse(s); 1124 return *jp.parse(s);
1094 } 1125 }
  1126 +
  1127 +void
  1128 +JSON::setStart(size_t start)
  1129 +{
  1130 + this->m->start = start;
  1131 +}
  1132 +
  1133 +void
  1134 +JSON::setEnd(size_t end)
  1135 +{
  1136 + this->m->end = end;
  1137 +}
  1138 +
  1139 +size_t
  1140 +JSON::getStart() const
  1141 +{
  1142 + return this->m->start;
  1143 +}
  1144 +
  1145 +size_t
  1146 +JSON::getEnd() const
  1147 +{
  1148 + return this->m->end;
  1149 +}
libtests/json_parse.cc
1 #include <qpdf/JSON.hh> 1 #include <qpdf/JSON.hh>
2 #include <qpdf/QUtil.hh> 2 #include <qpdf/QUtil.hh>
  3 +#include <cstdlib>
  4 +#include <cstring>
3 #include <iostream> 5 #include <iostream>
  6 +#include <memory>
  7 +
  8 +namespace
  9 +{
  10 + class Reactor: public JSON::Reactor
  11 + {
  12 + public:
  13 + virtual ~Reactor() = default;
  14 + virtual void dictionaryStart() override;
  15 + virtual void arrayStart() override;
  16 + virtual void containerEnd(JSON const& value) override;
  17 + virtual void topLevelScalar() override;
  18 + virtual bool
  19 + dictionaryItem(std::string const& key, JSON const& value) override;
  20 + virtual bool arrayItem(JSON const& value) override;
  21 +
  22 + private:
  23 + void printItem(JSON const&);
  24 + };
  25 +} // namespace
  26 +
  27 +void
  28 +Reactor::dictionaryStart()
  29 +{
  30 + std::cout << "dictionary start" << std::endl;
  31 +}
  32 +
  33 +void
  34 +Reactor::arrayStart()
  35 +{
  36 + std::cout << "array start" << std::endl;
  37 +}
  38 +
  39 +void
  40 +Reactor::containerEnd(JSON const& value)
  41 +{
  42 + std::cout << "container end: ";
  43 + printItem(value);
  44 +}
  45 +
  46 +void
  47 +Reactor::topLevelScalar()
  48 +{
  49 + std::cout << "top-level scalar" << std::endl;
  50 +}
  51 +
  52 +bool
  53 +Reactor::dictionaryItem(std::string const& key, JSON const& value)
  54 +{
  55 + std::cout << "dictionary item: " << key << " -> ";
  56 + printItem(value);
  57 + if (key == "keep") {
  58 + return false;
  59 + }
  60 + return true;
  61 +}
  62 +
  63 +bool
  64 +Reactor::arrayItem(JSON const& value)
  65 +{
  66 + std::cout << "array item: ";
  67 + printItem(value);
  68 + std::string n;
  69 + if (value.getString(n) && n == "keep") {
  70 + return false;
  71 + }
  72 + return true;
  73 +}
  74 +
  75 +void
  76 +Reactor::printItem(JSON const& j)
  77 +{
  78 + std::cout << "[" << j.getStart() << ", " << j.getEnd()
  79 + << "): " << j.unparse() << std::endl;
  80 +}
  81 +
  82 +static void
  83 +usage()
  84 +{
  85 + std::cerr << "Usage: json_parse file [--react]" << std::endl;
  86 + exit(2);
  87 +}
4 88
5 int 89 int
6 main(int argc, char* argv[]) 90 main(int argc, char* argv[])
7 { 91 {
8 - if (argc != 2) {  
9 - std::cerr << "Usage: json_parse file" << std::endl; 92 + if ((argc < 2) || (argc > 3)) {
  93 + usage();
10 return 2; 94 return 2;
11 } 95 }
12 char const* filename = argv[1]; 96 char const* filename = argv[1];
  97 + std::shared_ptr<Reactor> reactor;
  98 + if (argc == 3) {
  99 + if (strcmp(argv[2], "--react") == 0) {
  100 + reactor = std::make_shared<Reactor>();
  101 + } else {
  102 + usage();
  103 + }
  104 + }
13 try { 105 try {
14 std::shared_ptr<char> buf; 106 std::shared_ptr<char> buf;
15 size_t size; 107 size_t size;
16 QUtil::read_file_into_memory(filename, buf, size); 108 QUtil::read_file_into_memory(filename, buf, size);
17 std::string s(buf.get(), size); 109 std::string s(buf.get(), size);
18 - std::cout << JSON::parse(s).unparse() << std::endl; 110 + std::cout << JSON::parse(s, reactor.get()).unparse() << std::endl;
19 } catch (std::exception& e) { 111 } catch (std::exception& e) {
20 std::cerr << "exception: " << filename << ": " << e.what() << std::endl; 112 std::cerr << "exception: " << filename << ": " << e.what() << std::endl;
21 return 2; 113 return 2;
libtests/qtest/json_parse.test
@@ -32,7 +32,7 @@ if ($^O ne 'msys') @@ -32,7 +32,7 @@ if ($^O ne 'msys')
32 32
33 cleanup(); 33 cleanup();
34 34
35 -my $good = 9; 35 +my $good = 10;
36 36
37 for (my $i = 1; $i <= $good; ++$i) 37 for (my $i = 1; $i <= $good; ++$i)
38 { 38 {
@@ -73,6 +73,11 @@ for (my $i = 1; $i <= $good; ++$i) @@ -73,6 +73,11 @@ for (my $i = 1; $i <= $good; ++$i)
73 {$td->FILE => "out.json"}, 73 {$td->FILE => "out.json"},
74 {$td->STRING => ""}); 74 {$td->STRING => ""});
75 } 75 }
  76 +
  77 + $td->runtest("good $n reactor",
  78 + {$td->COMMAND => "json_parse good-$n.json --react"},
  79 + {$td->FILE => "good-$n-react.out", $td->EXIT_STATUS => 0},
  80 + $td->NORMALIZE_NEWLINES);
76 } 81 }
77 82
78 my @bad = ( 83 my @bad = (
@@ -127,7 +132,7 @@ foreach my $d (@bad) @@ -127,7 +132,7 @@ foreach my $d (@bad)
127 132
128 cleanup(); 133 cleanup();
129 134
130 -$td->report((2 * $good) + scalar(@bad)); 135 +$td->report((3 * $good) + scalar(@bad));
131 136
132 sub cleanup 137 sub cleanup
133 { 138 {
libtests/qtest/json_parse/good-01-react.out 0 → 100644
  1 +dictionary start
  2 +dictionary item: a -> [6, 11): "bcd"
  3 +array start
  4 +dictionary item: e -> [18, 0): []
  5 +array item: [19, 20): 1
  6 +array item: [41, 42): 2
  7 +array item: [44, 45): 3
  8 +array item: [46, 47): 4
  9 +array item: [48, 54): "five"
  10 +dictionary start
  11 +array item: [56, 0): {}
  12 +dictionary item: six -> [64, 65): 7
  13 +dictionary item: 8 -> [72, 73): 9
  14 +container end: [56, 74): {}
  15 +array item: [76, 80): null
  16 +array item: [82, 86): true
  17 +array item: [107, 112): false
  18 +array item: [114, 134): "a\b\f\n\r\t\\\"/z"
  19 +container end: [18, 135): []
  20 +container end: [0, 136): {}
  21 +{}
libtests/qtest/json_parse/good-02-react.out 0 → 100644
  1 +dictionary start
  2 +container end: [0, 2): {}
  3 +{}
libtests/qtest/json_parse/good-03-react.out 0 → 100644
  1 +array start
  2 +container end: [0, 2): []
  3 +[]
libtests/qtest/json_parse/good-04-react.out 0 → 100644
  1 +array start
  2 +array start
  3 +array item: [1, 0): []
  4 +array start
  5 +array item: [2, 0): []
  6 +dictionary start
  7 +array item: [3, 0): {}
  8 +container end: [3, 5): {}
  9 +container end: [2, 6): []
  10 +dictionary start
  11 +array item: [8, 0): {}
  12 +dictionary start
  13 +dictionary item: -> [13, 0): {}
  14 +container end: [13, 15): {}
  15 +container end: [8, 16): {}
  16 +container end: [1, 17): []
  17 +container end: [0, 18): []
  18 +[]
libtests/qtest/json_parse/good-05-react.out 0 → 100644
  1 +top-level scalar
  2 +"x"
libtests/qtest/json_parse/good-06-react.out 0 → 100644
  1 +top-level scalar
  2 +123
libtests/qtest/json_parse/good-07-react.out 0 → 100644
  1 +top-level scalar
  2 +-123
libtests/qtest/json_parse/good-08-react.out 0 → 100644
  1 +array start
  2 +array item: [1, 2): 1
  3 +array item: [4, 6): -2
  4 +array item: [8, 11): 3.4
  5 +array item: [13, 17): -5.6
  6 +array item: [19, 23): -9e1
  7 +array item: [25, 29): 10e2
  8 +array item: [31, 37): 12.3e5
  9 +array item: [39, 46): 12.6e-7
  10 +container end: [0, 47): []
  11 +[]
libtests/qtest/json_parse/good-09-react.out 0 → 100644
  1 +array start
  2 +array item: [1, 7): "aπb"
  3 +array item: [9, 23): "a\b\f\n\r\tc"
  4 +array item: [25, 42): "aπbπc"
  5 +array item: [44, 52): "π"
  6 +array item: [54, 71): "a\u0018bʬc"
  7 +container end: [0, 72): []
  8 +[]
libtests/qtest/json_parse/good-10-react.out 0 → 100644
  1 +dictionary start
  2 +array start
  3 +dictionary item: a -> [9, 0): []
  4 +array item: [10, 11): 1
  5 +array item: [13, 14): 2
  6 +dictionary start
  7 +array item: [16, 0): {}
  8 +dictionary item: x -> [22, 25): "y"
  9 +container end: [16, 26): {}
  10 +array item: [28, 29): 3
  11 +dictionary start
  12 +array item: [31, 0): {}
  13 +dictionary item: keep -> [40, 61): "not in final output"
  14 +container end: [31, 62): {
  15 + "keep": "not in final output"
  16 +}
  17 +container end: [9, 63): []
  18 +array start
  19 +dictionary item: keep -> [75, 0): []
  20 +array item: [76, 77): 1
  21 +array item: [79, 83): null
  22 +array item: [85, 86): 2
  23 +array item: [88, 93): false
  24 +array item: [95, 101): "keep"
  25 +array item: [103, 104): 3
  26 +array start
  27 +array item: [106, 0): []
  28 +array item: [107, 113): "this"
  29 +array item: [115, 121): "keep"
  30 +array item: [123, 128): "not"
  31 +array item: [130, 137): "final"
  32 +container end: [106, 138): [
  33 + "keep"
  34 +]
  35 +container end: [75, 139): [
  36 + "keep"
  37 +]
  38 +container end: [0, 141): {
  39 + "keep": [
  40 + "keep"
  41 + ]
  42 +}
  43 +{
  44 + "keep": [
  45 + "keep"
  46 + ]
  47 +}
libtests/qtest/json_parse/good-10.json 0 → 100644
  1 +{
  2 + "a": [1, 2, {"x": "y"}, 3, {"keep": "not in final output"}],
  3 + "keep": [1, null, 2, false, "keep", 3, ["this", "keep", "not", "final"]]
  4 +}
libtests/qtest/json_parse/save-10.json 0 → 100644
  1 +{
  2 + "a": [
  3 + 1,
  4 + 2,
  5 + {
  6 + "x": "y"
  7 + },
  8 + 3,
  9 + {
  10 + "keep": "not in final output"
  11 + }
  12 + ],
  13 + "keep": [
  14 + 1,
  15 + null,
  16 + 2,
  17 + false,
  18 + "keep",
  19 + 3,
  20 + [
  21 + "this",
  22 + "keep",
  23 + "not",
  24 + "final"
  25 + ]
  26 + ]
  27 +}