Commit 47c093c48b7ac3eb97c33b8edfafdf89685cffc7

Authored by Jay Berkenbilt
1 parent 9b2eb01e

Replace std::regex with validators for better performance

include/qpdf/QPDF.hh
@@ -867,6 +867,9 @@ class QPDF @@ -867,6 +867,9 @@ class QPDF
867 }; 867 };
868 friend class Pipe; 868 friend class Pipe;
869 869
  870 + // For testing only -- do not add to DLL
  871 + static bool test_json_validators();
  872 +
870 private: 873 private:
871 static std::string const qpdf_version; 874 static std::string const qpdf_version;
872 875
@@ -1045,7 +1048,7 @@ class QPDF @@ -1045,7 +1048,7 @@ class QPDF
1045 QPDFObjectHandle makeObject(JSON const& value); 1048 QPDFObjectHandle makeObject(JSON const& value);
1046 void error(size_t offset, std::string const& message); 1049 void error(size_t offset, std::string const& message);
1047 QPDFObjectHandle 1050 QPDFObjectHandle
1048 - reserveObject(std::string const& obj, std::string const& gen); 1051 + reserveObject(int obj, int gen);
1049 void replaceObject( 1052 void replaceObject(
1050 QPDFObjectHandle to_replace, 1053 QPDFObjectHandle to_replace,
1051 QPDFObjectHandle replacement, 1054 QPDFObjectHandle replacement,
@@ -1500,6 +1503,7 @@ class QPDF @@ -1500,6 +1503,7 @@ class QPDF
1500 }; 1503 };
1501 1504
1502 // Methods to support pattern finding 1505 // Methods to support pattern finding
  1506 + static bool validatePDFVersion(char const*&, std::string& version);
1503 bool findHeader(); 1507 bool findHeader();
1504 bool findStartxref(); 1508 bool findStartxref();
1505 bool findEndstream(); 1509 bool findEndstream();
libqpdf/QPDF.cc
@@ -385,20 +385,8 @@ QPDF::numWarnings() const @@ -385,20 +385,8 @@ QPDF::numWarnings() const
385 } 385 }
386 386
387 bool 387 bool
388 -QPDF::findHeader() 388 +QPDF::validatePDFVersion(char const*& p, std::string& version)
389 { 389 {
390 - qpdf_offset_t global_offset = this->m->file->tell();  
391 - std::string line = this->m->file->readLine(1024);  
392 - char const* p = line.c_str();  
393 - if (strncmp(p, "%PDF-", 5) != 0) {  
394 - throw std::logic_error("findHeader is not looking at %PDF-");  
395 - }  
396 - p += 5;  
397 - std::string version;  
398 - // Note: The string returned by line.c_str() is always  
399 - // null-terminated. The code below never overruns the buffer  
400 - // because a null character always short-circuits further  
401 - // advancement.  
402 bool valid = QUtil::is_digit(*p); 390 bool valid = QUtil::is_digit(*p);
403 if (valid) { 391 if (valid) {
404 while (QUtil::is_digit(*p)) { 392 while (QUtil::is_digit(*p)) {
@@ -413,6 +401,25 @@ QPDF::findHeader() @@ -413,6 +401,25 @@ QPDF::findHeader()
413 valid = false; 401 valid = false;
414 } 402 }
415 } 403 }
  404 + return valid;
  405 +}
  406 +
  407 +bool
  408 +QPDF::findHeader()
  409 +{
  410 + qpdf_offset_t global_offset = this->m->file->tell();
  411 + std::string line = this->m->file->readLine(1024);
  412 + char const* p = line.c_str();
  413 + if (strncmp(p, "%PDF-", 5) != 0) {
  414 + throw std::logic_error("findHeader is not looking at %PDF-");
  415 + }
  416 + p += 5;
  417 + std::string version;
  418 + // Note: The string returned by line.c_str() is always
  419 + // null-terminated. The code below never overruns the buffer
  420 + // because a null character always short-circuits further
  421 + // advancement.
  422 + bool valid = validatePDFVersion(p, version);
416 if (valid) { 423 if (valid) {
417 this->m->pdf_version = version; 424 this->m->pdf_version = version;
418 if (global_offset != 0) { 425 if (global_offset != 0) {
libqpdf/QPDF_json.cc
@@ -7,7 +7,7 @@ @@ -7,7 +7,7 @@
7 #include <qpdf/QTC.hh> 7 #include <qpdf/QTC.hh>
8 #include <qpdf/QUtil.hh> 8 #include <qpdf/QUtil.hh>
9 #include <algorithm> 9 #include <algorithm>
10 -#include <regex> 10 +#include <cstring>
11 11
12 // This chart shows an example of the state transitions that would 12 // This chart shows an example of the state transitions that would
13 // occur in parsing a minimal file. 13 // occur in parsing a minimal file.
@@ -55,14 +55,146 @@ static char const* JSON_PDF = ( @@ -55,14 +55,146 @@ static char const* JSON_PDF = (
55 "9\n" 55 "9\n"
56 "%%EOF\n"); 56 "%%EOF\n");
57 57
58 -// Note use of [\\s\\S] rather than . to match any character since .  
59 -// doesn't match newlines.  
60 -static std::regex PDF_VERSION_RE("^\\d+\\.\\d+$");  
61 -static std::regex OBJ_KEY_RE("^obj:(\\d+) (\\d+) R$");  
62 -static std::regex INDIRECT_OBJ_RE("^(\\d+) (\\d+) R$");  
63 -static std::regex UNICODE_RE("^u:([\\s\\S]*)$");  
64 -static std::regex BINARY_RE("^b:((?:[0-9a-fA-F]{2})*)$");  
65 -static std::regex NAME_RE("^/[\\s\\S]*$"); 58 +// Validator methods -- these are much more performant than std::regex.
  59 +static bool
  60 +is_indirect_object(std::string const& v, int& obj, int& gen)
  61 +{
  62 + char const* p = v.c_str();
  63 + std::string o_str;
  64 + std::string g_str;
  65 + if (!QUtil::is_digit(*p)) {
  66 + return false;
  67 + }
  68 + while (QUtil::is_digit(*p)) {
  69 + o_str.append(1, *p++);
  70 + }
  71 + if (*p != ' ') {
  72 + return false;
  73 + }
  74 + while (*p == ' ') {
  75 + ++p;
  76 + }
  77 + if (!QUtil::is_digit(*p)) {
  78 + return false;
  79 + }
  80 + while (QUtil::is_digit(*p)) {
  81 + g_str.append(1, *p++);
  82 + }
  83 + if (*p != ' ') {
  84 + return false;
  85 + }
  86 + while (*p == ' ') {
  87 + ++p;
  88 + }
  89 + if (*p++ != 'R') {
  90 + return false;
  91 + }
  92 + if (*p) {
  93 + return false;
  94 + }
  95 + obj = QUtil::string_to_int(o_str.c_str());
  96 + gen = QUtil::string_to_int(g_str.c_str());
  97 + return true;
  98 +}
  99 +
  100 +static bool
  101 +is_obj_key(std::string const& v, int& obj, int& gen)
  102 +{
  103 + if (v.substr(0, 4) != "obj:") {
  104 + return false;
  105 + }
  106 + return is_indirect_object(v.substr(4), obj, gen);
  107 +}
  108 +
  109 +static bool
  110 +is_unicode_string(std::string const& v, std::string& str)
  111 +{
  112 + if (v.substr(0, 2) == "u:") {
  113 + str = v.substr(2);
  114 + return true;
  115 + }
  116 + return false;
  117 +}
  118 +
  119 +static bool
  120 +is_binary_string(std::string const& v, std::string& str)
  121 +{
  122 + if (v.substr(0, 2) == "b:") {
  123 + str = v.substr(2);
  124 + int count = 0;
  125 + for (char c: str) {
  126 + if (!QUtil::is_hex_digit(c)) {
  127 + return false;
  128 + }
  129 + ++count;
  130 + }
  131 + return ((count > 0) && (count % 2 == 0));
  132 + }
  133 + return false;
  134 +}
  135 +
  136 +static bool
  137 +is_name(std::string const& v)
  138 +{
  139 + return ((v.length() > 1) && (v.at(0) == '/'));
  140 +}
  141 +
  142 +bool
  143 +QPDF::test_json_validators()
  144 +{
  145 + bool passed = true;
  146 + auto check_fn = [&passed](char const* msg, bool expr) {
  147 + if (!expr) {
  148 + passed = false;
  149 + std::cerr << msg << std::endl;
  150 + }
  151 + };
  152 +#define check(expr) check_fn(#expr, expr)
  153 +
  154 + int obj = 0;
  155 + int gen = 0;
  156 + check(!is_indirect_object("", obj, gen));
  157 + check(!is_indirect_object("12", obj, gen));
  158 + check(!is_indirect_object("x12 0 R", obj, gen));
  159 + check(!is_indirect_object("12 0 Rx", obj, gen));
  160 + check(!is_indirect_object("12 0R", obj, gen));
  161 + check(is_indirect_object("52 1 R", obj, gen));
  162 + check(obj == 52);
  163 + check(gen == 1);
  164 + check(is_indirect_object("53 20 R", obj, gen));
  165 + check(obj == 53);
  166 + check(gen == 20);
  167 + check(!is_obj_key("", obj, gen));
  168 + check(!is_obj_key("obj:x", obj, gen));
  169 + check(!is_obj_key("obj:x", obj, gen));
  170 + check(is_obj_key("obj:12 13 R", obj, gen));
  171 + check(obj == 12);
  172 + check(gen == 13);
  173 + std::string str;
  174 + check(!is_unicode_string("", str));
  175 + check(!is_unicode_string("xyz", str));
  176 + check(!is_unicode_string("x:", str));
  177 + check(is_unicode_string("u:potato", str));
  178 + check(str == "potato");
  179 + check(is_unicode_string("u:", str));
  180 + check(str == "");
  181 + check(!is_binary_string("", str));
  182 + check(!is_binary_string("x:", str));
  183 + check(!is_binary_string("b:", str));
  184 + check(!is_binary_string("b:1", str));
  185 + check(!is_binary_string("b:123", str));
  186 + check(!is_binary_string("b:gh", str));
  187 + check(is_binary_string("b:12", str));
  188 + check(is_binary_string("b:123aBC", str));
  189 + check(!is_name(""));
  190 + check(!is_name("/"));
  191 + check(!is_name("xyz"));
  192 + check(is_name("/Potato"));
  193 + check(is_name("/Potato Salad"));
  194 +
  195 + return passed;
  196 +#undef check_arg
  197 +}
66 198
67 static std::function<void(Pipeline*)> 199 static std::function<void(Pipeline*)>
68 provide_data(std::shared_ptr<InputSource> is, size_t start, size_t end) 200 provide_data(std::shared_ptr<InputSource> is, size_t start, size_t end)
@@ -236,13 +368,11 @@ QPDF::JSONReactor::containerEnd(JSON const& value) @@ -236,13 +368,11 @@ QPDF::JSONReactor::containerEnd(JSON const& value)
236 } 368 }
237 369
238 QPDFObjectHandle 370 QPDFObjectHandle
239 -QPDF::JSONReactor::reserveObject(std::string const& obj, std::string const& gen) 371 +QPDF::JSONReactor::reserveObject(int obj, int gen)
240 { 372 {
241 - int o = QUtil::string_to_int(obj.c_str());  
242 - int g = QUtil::string_to_int(gen.c_str());  
243 - auto oh = pdf.reserveObjectIfNotExists(o, g); 373 + auto oh = pdf.reserveObjectIfNotExists(obj, gen);
244 if (oh.isReserved()) { 374 if (oh.isReserved()) {
245 - this->reserved.insert(QPDFObjGen(o, g)); 375 + this->reserved.insert(QPDFObjGen(obj, gen));
246 } 376 }
247 return oh; 377 return oh;
248 } 378 }
@@ -304,10 +434,11 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value) @@ -304,10 +434,11 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value)
304 bool version_okay = false; 434 bool version_okay = false;
305 std::string v; 435 std::string v;
306 if (value.getString(v)) { 436 if (value.getString(v)) {
307 - std::smatch m;  
308 - if (std::regex_match(v, m, PDF_VERSION_RE)) { 437 + std::string version;
  438 + char const* p = v.c_str();
  439 + if (QPDF::validatePDFVersion(p, version) && (*p == '\0')) {
309 version_okay = true; 440 version_okay = true;
310 - this->pdf.m->pdf_version = v; 441 + this->pdf.m->pdf_version = version;
311 } 442 }
312 } 443 }
313 if (!version_okay) { 444 if (!version_okay) {
@@ -324,14 +455,15 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value) @@ -324,14 +455,15 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value)
324 next_state = st_ignore; 455 next_state = st_ignore;
325 } 456 }
326 } else if (state == st_objects) { 457 } else if (state == st_objects) {
327 - std::smatch m; 458 + int obj = 0;
  459 + int gen = 0;
328 if (key == "trailer") { 460 if (key == "trailer") {
329 this->saw_trailer = true; 461 this->saw_trailer = true;
330 nestedState(key, value, st_trailer); 462 nestedState(key, value, st_trailer);
331 this->cur_object = "trailer"; 463 this->cur_object = "trailer";
332 - } else if (std::regex_match(key, m, OBJ_KEY_RE)) { 464 + } else if (is_obj_key(key, obj, gen)) {
333 this->cur_object = key; 465 this->cur_object = key;
334 - auto oh = reserveObject(m[1].str(), m[2].str()); 466 + auto oh = reserveObject(obj, gen);
335 object_stack.push_back(oh); 467 object_stack.push_back(oh);
336 nestedState(key, value, st_object_top); 468 nestedState(key, value, st_object_top);
337 } else { 469 } else {
@@ -494,7 +626,6 @@ QPDF::JSONReactor::makeObject(JSON const& value) @@ -494,7 +626,6 @@ QPDF::JSONReactor::makeObject(JSON const& value)
494 QPDFObjectHandle result; 626 QPDFObjectHandle result;
495 std::string str_v; 627 std::string str_v;
496 bool bool_v = false; 628 bool bool_v = false;
497 - std::smatch m;  
498 if (value.isDictionary()) { 629 if (value.isDictionary()) {
499 result = QPDFObjectHandle::newDictionary(); 630 result = QPDFObjectHandle::newDictionary();
500 object_stack.push_back(result); 631 object_stack.push_back(result);
@@ -513,13 +644,16 @@ QPDF::JSONReactor::makeObject(JSON const& value) @@ -513,13 +644,16 @@ QPDF::JSONReactor::makeObject(JSON const& value)
513 result = QPDFObjectHandle::newReal(str_v); 644 result = QPDFObjectHandle::newReal(str_v);
514 } 645 }
515 } else if (value.getString(str_v)) { 646 } else if (value.getString(str_v)) {
516 - if (std::regex_match(str_v, m, INDIRECT_OBJ_RE)) {  
517 - result = reserveObject(m[1].str(), m[2].str());  
518 - } else if (std::regex_match(str_v, m, UNICODE_RE)) {  
519 - result = QPDFObjectHandle::newUnicodeString(m[1].str());  
520 - } else if (std::regex_match(str_v, m, BINARY_RE)) {  
521 - result = QPDFObjectHandle::newString(QUtil::hex_decode(m[1].str()));  
522 - } else if (std::regex_match(str_v, m, NAME_RE)) { 647 + int obj = 0;
  648 + int gen = 0;
  649 + std::string str;
  650 + if (is_indirect_object(str_v, obj, gen)) {
  651 + result = reserveObject(obj, gen);
  652 + } else if (is_unicode_string(str_v, str)) {
  653 + result = QPDFObjectHandle::newUnicodeString(str);
  654 + } else if (is_binary_string(str_v, str)) {
  655 + result = QPDFObjectHandle::newString(QUtil::hex_decode(str));
  656 + } else if (is_name(str_v)) {
523 result = QPDFObjectHandle::newName(str_v); 657 result = QPDFObjectHandle::newName(str_v);
524 } else { 658 } else {
525 QTC::TC("qpdf", "QPDF_json unrecognized string value"); 659 QTC::TC("qpdf", "QPDF_json unrecognized string value");
libtests/json.cc
@@ -3,6 +3,7 @@ @@ -3,6 +3,7 @@
3 #include <qpdf/JSON.hh> 3 #include <qpdf/JSON.hh>
4 #include <qpdf/Pipeline.hh> 4 #include <qpdf/Pipeline.hh>
5 #include <qpdf/QPDFObjectHandle.hh> 5 #include <qpdf/QPDFObjectHandle.hh>
  6 +#include <qpdf/QPDF.hh>
6 #include <iostream> 7 #include <iostream>
7 8
8 static void 9 static void
@@ -271,6 +272,7 @@ main() @@ -271,6 +272,7 @@ main()
271 { 272 {
272 test_main(); 273 test_main();
273 test_schema(); 274 test_schema();
  275 + assert(QPDF::test_json_validators());
274 276
275 std::cout << "end of json tests\n"; 277 std::cout << "end of json tests\n";
276 return 0; 278 return 0;