Commit 47c093c48b7ac3eb97c33b8edfafdf89685cffc7
1 parent
9b2eb01e
Replace std::regex with validators for better performance
Showing 4 changed files with 189 additions and 42 deletions
include/qpdf/QPDF.hh
| @@ -867,6 +867,9 @@ class QPDF | @@ -867,6 +867,9 @@ class QPDF | ||
| 867 | }; | 867 | }; |
| 868 | friend class Pipe; | 868 | friend class Pipe; |
| 869 | 869 | ||
| 870 | + // For testing only -- do not add to DLL | ||
| 871 | + static bool test_json_validators(); | ||
| 872 | + | ||
| 870 | private: | 873 | private: |
| 871 | static std::string const qpdf_version; | 874 | static std::string const qpdf_version; |
| 872 | 875 | ||
| @@ -1045,7 +1048,7 @@ class QPDF | @@ -1045,7 +1048,7 @@ class QPDF | ||
| 1045 | QPDFObjectHandle makeObject(JSON const& value); | 1048 | QPDFObjectHandle makeObject(JSON const& value); |
| 1046 | void error(size_t offset, std::string const& message); | 1049 | void error(size_t offset, std::string const& message); |
| 1047 | QPDFObjectHandle | 1050 | QPDFObjectHandle |
| 1048 | - reserveObject(std::string const& obj, std::string const& gen); | 1051 | + reserveObject(int obj, int gen); |
| 1049 | void replaceObject( | 1052 | void replaceObject( |
| 1050 | QPDFObjectHandle to_replace, | 1053 | QPDFObjectHandle to_replace, |
| 1051 | QPDFObjectHandle replacement, | 1054 | QPDFObjectHandle replacement, |
| @@ -1500,6 +1503,7 @@ class QPDF | @@ -1500,6 +1503,7 @@ class QPDF | ||
| 1500 | }; | 1503 | }; |
| 1501 | 1504 | ||
| 1502 | // Methods to support pattern finding | 1505 | // Methods to support pattern finding |
| 1506 | + static bool validatePDFVersion(char const*&, std::string& version); | ||
| 1503 | bool findHeader(); | 1507 | bool findHeader(); |
| 1504 | bool findStartxref(); | 1508 | bool findStartxref(); |
| 1505 | bool findEndstream(); | 1509 | bool findEndstream(); |
libqpdf/QPDF.cc
| @@ -385,20 +385,8 @@ QPDF::numWarnings() const | @@ -385,20 +385,8 @@ QPDF::numWarnings() const | ||
| 385 | } | 385 | } |
| 386 | 386 | ||
| 387 | bool | 387 | bool |
| 388 | -QPDF::findHeader() | 388 | +QPDF::validatePDFVersion(char const*& p, std::string& version) |
| 389 | { | 389 | { |
| 390 | - qpdf_offset_t global_offset = this->m->file->tell(); | ||
| 391 | - std::string line = this->m->file->readLine(1024); | ||
| 392 | - char const* p = line.c_str(); | ||
| 393 | - if (strncmp(p, "%PDF-", 5) != 0) { | ||
| 394 | - throw std::logic_error("findHeader is not looking at %PDF-"); | ||
| 395 | - } | ||
| 396 | - p += 5; | ||
| 397 | - std::string version; | ||
| 398 | - // Note: The string returned by line.c_str() is always | ||
| 399 | - // null-terminated. The code below never overruns the buffer | ||
| 400 | - // because a null character always short-circuits further | ||
| 401 | - // advancement. | ||
| 402 | bool valid = QUtil::is_digit(*p); | 390 | bool valid = QUtil::is_digit(*p); |
| 403 | if (valid) { | 391 | if (valid) { |
| 404 | while (QUtil::is_digit(*p)) { | 392 | while (QUtil::is_digit(*p)) { |
| @@ -413,6 +401,25 @@ QPDF::findHeader() | @@ -413,6 +401,25 @@ QPDF::findHeader() | ||
| 413 | valid = false; | 401 | valid = false; |
| 414 | } | 402 | } |
| 415 | } | 403 | } |
| 404 | + return valid; | ||
| 405 | +} | ||
| 406 | + | ||
| 407 | +bool | ||
| 408 | +QPDF::findHeader() | ||
| 409 | +{ | ||
| 410 | + qpdf_offset_t global_offset = this->m->file->tell(); | ||
| 411 | + std::string line = this->m->file->readLine(1024); | ||
| 412 | + char const* p = line.c_str(); | ||
| 413 | + if (strncmp(p, "%PDF-", 5) != 0) { | ||
| 414 | + throw std::logic_error("findHeader is not looking at %PDF-"); | ||
| 415 | + } | ||
| 416 | + p += 5; | ||
| 417 | + std::string version; | ||
| 418 | + // Note: The string returned by line.c_str() is always | ||
| 419 | + // null-terminated. The code below never overruns the buffer | ||
| 420 | + // because a null character always short-circuits further | ||
| 421 | + // advancement. | ||
| 422 | + bool valid = validatePDFVersion(p, version); | ||
| 416 | if (valid) { | 423 | if (valid) { |
| 417 | this->m->pdf_version = version; | 424 | this->m->pdf_version = version; |
| 418 | if (global_offset != 0) { | 425 | if (global_offset != 0) { |
libqpdf/QPDF_json.cc
| @@ -7,7 +7,7 @@ | @@ -7,7 +7,7 @@ | ||
| 7 | #include <qpdf/QTC.hh> | 7 | #include <qpdf/QTC.hh> |
| 8 | #include <qpdf/QUtil.hh> | 8 | #include <qpdf/QUtil.hh> |
| 9 | #include <algorithm> | 9 | #include <algorithm> |
| 10 | -#include <regex> | 10 | +#include <cstring> |
| 11 | 11 | ||
| 12 | // This chart shows an example of the state transitions that would | 12 | // This chart shows an example of the state transitions that would |
| 13 | // occur in parsing a minimal file. | 13 | // occur in parsing a minimal file. |
| @@ -55,14 +55,146 @@ static char const* JSON_PDF = ( | @@ -55,14 +55,146 @@ static char const* JSON_PDF = ( | ||
| 55 | "9\n" | 55 | "9\n" |
| 56 | "%%EOF\n"); | 56 | "%%EOF\n"); |
| 57 | 57 | ||
| 58 | -// Note use of [\\s\\S] rather than . to match any character since . | ||
| 59 | -// doesn't match newlines. | ||
| 60 | -static std::regex PDF_VERSION_RE("^\\d+\\.\\d+$"); | ||
| 61 | -static std::regex OBJ_KEY_RE("^obj:(\\d+) (\\d+) R$"); | ||
| 62 | -static std::regex INDIRECT_OBJ_RE("^(\\d+) (\\d+) R$"); | ||
| 63 | -static std::regex UNICODE_RE("^u:([\\s\\S]*)$"); | ||
| 64 | -static std::regex BINARY_RE("^b:((?:[0-9a-fA-F]{2})*)$"); | ||
| 65 | -static std::regex NAME_RE("^/[\\s\\S]*$"); | 58 | +// Validator methods -- these are much more performant than std::regex. |
| 59 | +static bool | ||
| 60 | +is_indirect_object(std::string const& v, int& obj, int& gen) | ||
| 61 | +{ | ||
| 62 | + char const* p = v.c_str(); | ||
| 63 | + std::string o_str; | ||
| 64 | + std::string g_str; | ||
| 65 | + if (!QUtil::is_digit(*p)) { | ||
| 66 | + return false; | ||
| 67 | + } | ||
| 68 | + while (QUtil::is_digit(*p)) { | ||
| 69 | + o_str.append(1, *p++); | ||
| 70 | + } | ||
| 71 | + if (*p != ' ') { | ||
| 72 | + return false; | ||
| 73 | + } | ||
| 74 | + while (*p == ' ') { | ||
| 75 | + ++p; | ||
| 76 | + } | ||
| 77 | + if (!QUtil::is_digit(*p)) { | ||
| 78 | + return false; | ||
| 79 | + } | ||
| 80 | + while (QUtil::is_digit(*p)) { | ||
| 81 | + g_str.append(1, *p++); | ||
| 82 | + } | ||
| 83 | + if (*p != ' ') { | ||
| 84 | + return false; | ||
| 85 | + } | ||
| 86 | + while (*p == ' ') { | ||
| 87 | + ++p; | ||
| 88 | + } | ||
| 89 | + if (*p++ != 'R') { | ||
| 90 | + return false; | ||
| 91 | + } | ||
| 92 | + if (*p) { | ||
| 93 | + return false; | ||
| 94 | + } | ||
| 95 | + obj = QUtil::string_to_int(o_str.c_str()); | ||
| 96 | + gen = QUtil::string_to_int(g_str.c_str()); | ||
| 97 | + return true; | ||
| 98 | +} | ||
| 99 | + | ||
| 100 | +static bool | ||
| 101 | +is_obj_key(std::string const& v, int& obj, int& gen) | ||
| 102 | +{ | ||
| 103 | + if (v.substr(0, 4) != "obj:") { | ||
| 104 | + return false; | ||
| 105 | + } | ||
| 106 | + return is_indirect_object(v.substr(4), obj, gen); | ||
| 107 | +} | ||
| 108 | + | ||
// Recognize a string tagged with the "u:" prefix. When the prefix is
// present, everything after it (possibly nothing) is copied into str
// and true is returned. Otherwise false is returned and str is left
// unchanged.
static bool
is_unicode_string(std::string const& v, std::string& str)
{
    // compare() clamps the inspected range to the length of v, so
    // inputs shorter than the prefix are rejected without throwing.
    if (v.compare(0, 2, "u:") != 0) {
        return false;
    }
    str = v.substr(2);
    return true;
}
| 118 | + | ||
| 119 | +static bool | ||
| 120 | +is_binary_string(std::string const& v, std::string& str) | ||
| 121 | +{ | ||
| 122 | + if (v.substr(0, 2) == "b:") { | ||
| 123 | + str = v.substr(2); | ||
| 124 | + int count = 0; | ||
| 125 | + for (char c: str) { | ||
| 126 | + if (!QUtil::is_hex_digit(c)) { | ||
| 127 | + return false; | ||
| 128 | + } | ||
| 129 | + ++count; | ||
| 130 | + } | ||
| 131 | + return ((count > 0) && (count % 2 == 0)); | ||
| 132 | + } | ||
| 133 | + return false; | ||
| 134 | +} | ||
| 135 | + | ||
// Recognize a name: a string of at least two characters whose first
// character is '/'. A lone "/" and the empty string are rejected.
static bool
is_name(std::string const& v)
{
    if (v.size() < 2) {
        return false;
    }
    return v[0] == '/';
}
| 141 | + | ||
| 142 | +bool | ||
| 143 | +QPDF::test_json_validators() | ||
| 144 | +{ | ||
| 145 | + bool passed = true; | ||
| 146 | + auto check_fn = [&passed](char const* msg, bool expr) { | ||
| 147 | + if (!expr) { | ||
| 148 | + passed = false; | ||
| 149 | + std::cerr << msg << std::endl; | ||
| 150 | + } | ||
| 151 | + }; | ||
| 152 | +#define check(expr) check_fn(#expr, expr) | ||
| 153 | + | ||
| 154 | + int obj = 0; | ||
| 155 | + int gen = 0; | ||
| 156 | + check(!is_indirect_object("", obj, gen)); | ||
| 157 | + check(!is_indirect_object("12", obj, gen)); | ||
| 158 | + check(!is_indirect_object("x12 0 R", obj, gen)); | ||
| 159 | + check(!is_indirect_object("12 0 Rx", obj, gen)); | ||
| 160 | + check(!is_indirect_object("12 0R", obj, gen)); | ||
| 161 | + check(is_indirect_object("52 1 R", obj, gen)); | ||
| 162 | + check(obj == 52); | ||
| 163 | + check(gen == 1); | ||
| 164 | + check(is_indirect_object("53 20 R", obj, gen)); | ||
| 165 | + check(obj == 53); | ||
| 166 | + check(gen == 20); | ||
| 167 | + check(!is_obj_key("", obj, gen)); | ||
| 168 | + check(!is_obj_key("obj:x", obj, gen)); | ||
| 169 | + check(!is_obj_key("obj:x", obj, gen)); | ||
| 170 | + check(is_obj_key("obj:12 13 R", obj, gen)); | ||
| 171 | + check(obj == 12); | ||
| 172 | + check(gen == 13); | ||
| 173 | + std::string str; | ||
| 174 | + check(!is_unicode_string("", str)); | ||
| 175 | + check(!is_unicode_string("xyz", str)); | ||
| 176 | + check(!is_unicode_string("x:", str)); | ||
| 177 | + check(is_unicode_string("u:potato", str)); | ||
| 178 | + check(str == "potato"); | ||
| 179 | + check(is_unicode_string("u:", str)); | ||
| 180 | + check(str == ""); | ||
| 181 | + check(!is_binary_string("", str)); | ||
| 182 | + check(!is_binary_string("x:", str)); | ||
| 183 | + check(!is_binary_string("b:", str)); | ||
| 184 | + check(!is_binary_string("b:1", str)); | ||
| 185 | + check(!is_binary_string("b:123", str)); | ||
| 186 | + check(!is_binary_string("b:gh", str)); | ||
| 187 | + check(is_binary_string("b:12", str)); | ||
| 188 | + check(is_binary_string("b:123aBC", str)); | ||
| 189 | + check(!is_name("")); | ||
| 190 | + check(!is_name("/")); | ||
| 191 | + check(!is_name("xyz")); | ||
| 192 | + check(is_name("/Potato")); | ||
| 193 | + check(is_name("/Potato Salad")); | ||
| 194 | + | ||
| 195 | + return passed; | ||
| 196 | +#undef check_arg | ||
| 197 | +} | ||
| 66 | 198 | ||
| 67 | static std::function<void(Pipeline*)> | 199 | static std::function<void(Pipeline*)> |
| 68 | provide_data(std::shared_ptr<InputSource> is, size_t start, size_t end) | 200 | provide_data(std::shared_ptr<InputSource> is, size_t start, size_t end) |
| @@ -236,13 +368,11 @@ QPDF::JSONReactor::containerEnd(JSON const& value) | @@ -236,13 +368,11 @@ QPDF::JSONReactor::containerEnd(JSON const& value) | ||
| 236 | } | 368 | } |
| 237 | 369 | ||
| 238 | QPDFObjectHandle | 370 | QPDFObjectHandle |
| 239 | -QPDF::JSONReactor::reserveObject(std::string const& obj, std::string const& gen) | 371 | +QPDF::JSONReactor::reserveObject(int obj, int gen) |
| 240 | { | 372 | { |
| 241 | - int o = QUtil::string_to_int(obj.c_str()); | ||
| 242 | - int g = QUtil::string_to_int(gen.c_str()); | ||
| 243 | - auto oh = pdf.reserveObjectIfNotExists(o, g); | 373 | + auto oh = pdf.reserveObjectIfNotExists(obj, gen); |
| 244 | if (oh.isReserved()) { | 374 | if (oh.isReserved()) { |
| 245 | - this->reserved.insert(QPDFObjGen(o, g)); | 375 | + this->reserved.insert(QPDFObjGen(obj, gen)); |
| 246 | } | 376 | } |
| 247 | return oh; | 377 | return oh; |
| 248 | } | 378 | } |
| @@ -304,10 +434,11 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value) | @@ -304,10 +434,11 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value) | ||
| 304 | bool version_okay = false; | 434 | bool version_okay = false; |
| 305 | std::string v; | 435 | std::string v; |
| 306 | if (value.getString(v)) { | 436 | if (value.getString(v)) { |
| 307 | - std::smatch m; | ||
| 308 | - if (std::regex_match(v, m, PDF_VERSION_RE)) { | 437 | + std::string version; |
| 438 | + char const* p = v.c_str(); | ||
| 439 | + if (QPDF::validatePDFVersion(p, version) && (*p == '\0')) { | ||
| 309 | version_okay = true; | 440 | version_okay = true; |
| 310 | - this->pdf.m->pdf_version = v; | 441 | + this->pdf.m->pdf_version = version; |
| 311 | } | 442 | } |
| 312 | } | 443 | } |
| 313 | if (!version_okay) { | 444 | if (!version_okay) { |
| @@ -324,14 +455,15 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value) | @@ -324,14 +455,15 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value) | ||
| 324 | next_state = st_ignore; | 455 | next_state = st_ignore; |
| 325 | } | 456 | } |
| 326 | } else if (state == st_objects) { | 457 | } else if (state == st_objects) { |
| 327 | - std::smatch m; | 458 | + int obj = 0; |
| 459 | + int gen = 0; | ||
| 328 | if (key == "trailer") { | 460 | if (key == "trailer") { |
| 329 | this->saw_trailer = true; | 461 | this->saw_trailer = true; |
| 330 | nestedState(key, value, st_trailer); | 462 | nestedState(key, value, st_trailer); |
| 331 | this->cur_object = "trailer"; | 463 | this->cur_object = "trailer"; |
| 332 | - } else if (std::regex_match(key, m, OBJ_KEY_RE)) { | 464 | + } else if (is_obj_key(key, obj, gen)) { |
| 333 | this->cur_object = key; | 465 | this->cur_object = key; |
| 334 | - auto oh = reserveObject(m[1].str(), m[2].str()); | 466 | + auto oh = reserveObject(obj, gen); |
| 335 | object_stack.push_back(oh); | 467 | object_stack.push_back(oh); |
| 336 | nestedState(key, value, st_object_top); | 468 | nestedState(key, value, st_object_top); |
| 337 | } else { | 469 | } else { |
| @@ -494,7 +626,6 @@ QPDF::JSONReactor::makeObject(JSON const& value) | @@ -494,7 +626,6 @@ QPDF::JSONReactor::makeObject(JSON const& value) | ||
| 494 | QPDFObjectHandle result; | 626 | QPDFObjectHandle result; |
| 495 | std::string str_v; | 627 | std::string str_v; |
| 496 | bool bool_v = false; | 628 | bool bool_v = false; |
| 497 | - std::smatch m; | ||
| 498 | if (value.isDictionary()) { | 629 | if (value.isDictionary()) { |
| 499 | result = QPDFObjectHandle::newDictionary(); | 630 | result = QPDFObjectHandle::newDictionary(); |
| 500 | object_stack.push_back(result); | 631 | object_stack.push_back(result); |
| @@ -513,13 +644,16 @@ QPDF::JSONReactor::makeObject(JSON const& value) | @@ -513,13 +644,16 @@ QPDF::JSONReactor::makeObject(JSON const& value) | ||
| 513 | result = QPDFObjectHandle::newReal(str_v); | 644 | result = QPDFObjectHandle::newReal(str_v); |
| 514 | } | 645 | } |
| 515 | } else if (value.getString(str_v)) { | 646 | } else if (value.getString(str_v)) { |
| 516 | - if (std::regex_match(str_v, m, INDIRECT_OBJ_RE)) { | ||
| 517 | - result = reserveObject(m[1].str(), m[2].str()); | ||
| 518 | - } else if (std::regex_match(str_v, m, UNICODE_RE)) { | ||
| 519 | - result = QPDFObjectHandle::newUnicodeString(m[1].str()); | ||
| 520 | - } else if (std::regex_match(str_v, m, BINARY_RE)) { | ||
| 521 | - result = QPDFObjectHandle::newString(QUtil::hex_decode(m[1].str())); | ||
| 522 | - } else if (std::regex_match(str_v, m, NAME_RE)) { | 647 | + int obj = 0; |
| 648 | + int gen = 0; | ||
| 649 | + std::string str; | ||
| 650 | + if (is_indirect_object(str_v, obj, gen)) { | ||
| 651 | + result = reserveObject(obj, gen); | ||
| 652 | + } else if (is_unicode_string(str_v, str)) { | ||
| 653 | + result = QPDFObjectHandle::newUnicodeString(str); | ||
| 654 | + } else if (is_binary_string(str_v, str)) { | ||
| 655 | + result = QPDFObjectHandle::newString(QUtil::hex_decode(str)); | ||
| 656 | + } else if (is_name(str_v)) { | ||
| 523 | result = QPDFObjectHandle::newName(str_v); | 657 | result = QPDFObjectHandle::newName(str_v); |
| 524 | } else { | 658 | } else { |
| 525 | QTC::TC("qpdf", "QPDF_json unrecognized string value"); | 659 | QTC::TC("qpdf", "QPDF_json unrecognized string value"); |
libtests/json.cc
| @@ -3,6 +3,7 @@ | @@ -3,6 +3,7 @@ | ||
| 3 | #include <qpdf/JSON.hh> | 3 | #include <qpdf/JSON.hh> |
| 4 | #include <qpdf/Pipeline.hh> | 4 | #include <qpdf/Pipeline.hh> |
| 5 | #include <qpdf/QPDFObjectHandle.hh> | 5 | #include <qpdf/QPDFObjectHandle.hh> |
| 6 | +#include <qpdf/QPDF.hh> | ||
| 6 | #include <iostream> | 7 | #include <iostream> |
| 7 | 8 | ||
| 8 | static void | 9 | static void |
| @@ -271,6 +272,7 @@ main() | @@ -271,6 +272,7 @@ main() | ||
| 271 | { | 272 | { |
| 272 | test_main(); | 273 | test_main(); |
| 273 | test_schema(); | 274 | test_schema(); |
| 275 | + assert(QPDF::test_json_validators()); | ||
| 274 | 276 | ||
| 275 | std::cout << "end of json tests\n"; | 277 | std::cout << "end of json tests\n"; |
| 276 | return 0; | 278 | return 0; |