Commit 47c093c48b7ac3eb97c33b8edfafdf89685cffc7
1 parent
9b2eb01e
Replace std::regex with validators for better performance
Showing
4 changed files
with
189 additions
and
42 deletions
include/qpdf/QPDF.hh
| ... | ... | @@ -867,6 +867,9 @@ class QPDF |
| 867 | 867 | }; |
| 868 | 868 | friend class Pipe; |
| 869 | 869 | |
| 870 | + // For testing only -- do not add to DLL | |
| 871 | + static bool test_json_validators(); | |
| 872 | + | |
| 870 | 873 | private: |
| 871 | 874 | static std::string const qpdf_version; |
| 872 | 875 | |
| ... | ... | @@ -1045,7 +1048,7 @@ class QPDF |
| 1045 | 1048 | QPDFObjectHandle makeObject(JSON const& value); |
| 1046 | 1049 | void error(size_t offset, std::string const& message); |
| 1047 | 1050 | QPDFObjectHandle |
| 1048 | - reserveObject(std::string const& obj, std::string const& gen); | |
| 1051 | + reserveObject(int obj, int gen); | |
| 1049 | 1052 | void replaceObject( |
| 1050 | 1053 | QPDFObjectHandle to_replace, |
| 1051 | 1054 | QPDFObjectHandle replacement, |
| ... | ... | @@ -1500,6 +1503,7 @@ class QPDF |
| 1500 | 1503 | }; |
| 1501 | 1504 | |
| 1502 | 1505 | // Methods to support pattern finding |
| 1506 | + static bool validatePDFVersion(char const*&, std::string& version); | |
| 1503 | 1507 | bool findHeader(); |
| 1504 | 1508 | bool findStartxref(); |
| 1505 | 1509 | bool findEndstream(); | ... | ... |
libqpdf/QPDF.cc
| ... | ... | @@ -385,20 +385,8 @@ QPDF::numWarnings() const |
| 385 | 385 | } |
| 386 | 386 | |
| 387 | 387 | bool |
| 388 | -QPDF::findHeader() | |
| 388 | +QPDF::validatePDFVersion(char const*& p, std::string& version) | |
| 389 | 389 | { |
| 390 | - qpdf_offset_t global_offset = this->m->file->tell(); | |
| 391 | - std::string line = this->m->file->readLine(1024); | |
| 392 | - char const* p = line.c_str(); | |
| 393 | - if (strncmp(p, "%PDF-", 5) != 0) { | |
| 394 | - throw std::logic_error("findHeader is not looking at %PDF-"); | |
| 395 | - } | |
| 396 | - p += 5; | |
| 397 | - std::string version; | |
| 398 | - // Note: The string returned by line.c_str() is always | |
| 399 | - // null-terminated. The code below never overruns the buffer | |
| 400 | - // because a null character always short-circuits further | |
| 401 | - // advancement. | |
| 402 | 390 | bool valid = QUtil::is_digit(*p); |
| 403 | 391 | if (valid) { |
| 404 | 392 | while (QUtil::is_digit(*p)) { |
| ... | ... | @@ -413,6 +401,25 @@ QPDF::findHeader() |
| 413 | 401 | valid = false; |
| 414 | 402 | } |
| 415 | 403 | } |
| 404 | + return valid; | |
| 405 | +} | |
| 406 | + | |
| 407 | +bool | |
| 408 | +QPDF::findHeader() | |
| 409 | +{ | |
| 410 | + qpdf_offset_t global_offset = this->m->file->tell(); | |
| 411 | + std::string line = this->m->file->readLine(1024); | |
| 412 | + char const* p = line.c_str(); | |
| 413 | + if (strncmp(p, "%PDF-", 5) != 0) { | |
| 414 | + throw std::logic_error("findHeader is not looking at %PDF-"); | |
| 415 | + } | |
| 416 | + p += 5; | |
| 417 | + std::string version; | |
| 418 | + // Note: The string returned by line.c_str() is always | |
| 419 | + // null-terminated. The code below never overruns the buffer | |
| 420 | + // because a null character always short-circuits further | |
| 421 | + // advancement. | |
| 422 | + bool valid = validatePDFVersion(p, version); | |
| 416 | 423 | if (valid) { |
| 417 | 424 | this->m->pdf_version = version; |
| 418 | 425 | if (global_offset != 0) { | ... | ... |
libqpdf/QPDF_json.cc
| ... | ... | @@ -7,7 +7,7 @@ |
| 7 | 7 | #include <qpdf/QTC.hh> |
| 8 | 8 | #include <qpdf/QUtil.hh> |
| 9 | 9 | #include <algorithm> |
| 10 | -#include <regex> | |
| 10 | +#include <cstring> | |
| 11 | 11 | |
| 12 | 12 | // This chart shows an example of the state transitions that would |
| 13 | 13 | // occur in parsing a minimal file. |
| ... | ... | @@ -55,14 +55,146 @@ static char const* JSON_PDF = ( |
| 55 | 55 | "9\n" |
| 56 | 56 | "%%EOF\n"); |
| 57 | 57 | |
| 58 | -// Note use of [\\s\\S] rather than . to match any character since . | |
| 59 | -// doesn't match newlines. | |
| 60 | -static std::regex PDF_VERSION_RE("^\\d+\\.\\d+$"); | |
| 61 | -static std::regex OBJ_KEY_RE("^obj:(\\d+) (\\d+) R$"); | |
| 62 | -static std::regex INDIRECT_OBJ_RE("^(\\d+) (\\d+) R$"); | |
| 63 | -static std::regex UNICODE_RE("^u:([\\s\\S]*)$"); | |
| 64 | -static std::regex BINARY_RE("^b:((?:[0-9a-fA-F]{2})*)$"); | |
| 65 | -static std::regex NAME_RE("^/[\\s\\S]*$"); | |
| 58 | +// Validator methods -- these are much more performant than std::regex. is_indirect_object accepts "<digits> <digits> R" (one or more spaces at each gap, nothing after the R) and, only on success, stores the two numbers in obj and gen. | |
| 59 | +static bool | |
| 60 | +is_indirect_object(std::string const& v, int& obj, int& gen) | |
| 61 | +{ | |
| 62 | + char const* p = v.c_str(); /* c_str() is null-terminated, so every scan below stops at the end of v */ | |
| 63 | + std::string o_str; | |
| 64 | + std::string g_str; | |
| 65 | + if (!QUtil::is_digit(*p)) { | |
| 66 | + return false; | |
| 67 | + } | |
| 68 | + while (QUtil::is_digit(*p)) { | |
| 69 | + o_str.append(1, *p++); | |
| 70 | + } | |
| 71 | + if (*p != ' ') { | |
| 72 | + return false; | |
| 73 | + } | |
| 74 | + while (*p == ' ') { /* a run of spaces is accepted here, not just a single space */ | |
| 75 | + ++p; | |
| 76 | + } | |
| 77 | + if (!QUtil::is_digit(*p)) { | |
| 78 | + return false; | |
| 79 | + } | |
| 80 | + while (QUtil::is_digit(*p)) { | |
| 81 | + g_str.append(1, *p++); | |
| 82 | + } | |
| 83 | + if (*p != ' ') { | |
| 84 | + return false; | |
| 85 | + } | |
| 86 | + while (*p == ' ') { | |
| 87 | + ++p; | |
| 88 | + } | |
| 89 | + if (*p++ != 'R') { | |
| 90 | + return false; | |
| 91 | + } | |
| 92 | + if (*p) { /* anything following the R makes the string invalid */ | |
| 93 | + return false; | |
| 94 | + } | |
| 95 | + obj = QUtil::string_to_int(o_str.c_str()); /* NOTE(review): behaviour for digit runs that do not fit in int is not visible here -- confirm */ | |
| 96 | + gen = QUtil::string_to_int(g_str.c_str()); | |
| 97 | + return true; | |
| 98 | +} | |
| 99 | + | |
| 100 | +static bool | |
| 101 | +is_obj_key(std::string const& v, int& obj, int& gen) | |
| 102 | +{ /* True when v is "obj:" followed by text that is_indirect_object accepts; obj and gen are filled in by that call. */ | |
| 103 | + if (v.substr(0, 4) != "obj:") { /* substr(0, 4) simply yields a shorter string when v has fewer than four characters */ | |
| 104 | + return false; | |
| 105 | + } | |
| 106 | + return is_indirect_object(v.substr(4), obj, gen); | |
| 107 | +} | |
| 108 | + | |
| 109 | +static bool | |
| 110 | +is_unicode_string(std::string const& v, std::string& str) | |
| 111 | +{ /* True when v starts with "u:"; everything after the prefix (possibly nothing) is copied to str. str is not modified when the prefix is absent. */ | |
| 112 | + if (v.substr(0, 2) == "u:") { | |
| 113 | + str = v.substr(2); | |
| 114 | + return true; | |
| 115 | + } | |
| 116 | + return false; | |
| 117 | +} | |
| 118 | + | |
| 119 | +static bool | |
| 120 | +is_binary_string(std::string const& v, std::string& str) | |
| 121 | +{ // True when v is "b:" followed by an even number of hex digits (possibly none). str receives the text after the prefix whenever the prefix matches, even if validation then fails. | |
| 122 | + if (v.substr(0, 2) == "b:") { | |
| 123 | + str = v.substr(2); | |
| 124 | + int count = 0; | |
| 125 | + for (char c: str) { | |
| 126 | + if (!QUtil::is_hex_digit(c)) { | |
| 127 | + return false; | |
| 128 | + } | |
| 129 | + ++count; | |
| 130 | + } | |
| 131 | + return (count % 2 == 0); // zero digits is valid: the replaced BINARY_RE used (...)* and matched a bare "b:" | |
| 132 | + } | |
| 133 | + return false; | |
| 134 | +} | |
| 135 | + | |
| 136 | +static bool | |
| 137 | +is_name(std::string const& v) | |
| 138 | +{ // A name is "/" followed by at least one more character; a bare "/" is rejected. | |
| 139 | + return ((v.length() > 1) && (v.at(0) == '/')); | |
| 140 | +} | |
| 141 | + | |
| 142 | +bool | |
| 143 | +QPDF::test_json_validators() | |
| 144 | +{ // Self-test for the validators above: prints each failing expression to std::cerr and returns false if any check failed. | |
| 145 | + bool passed = true; | |
| 146 | + auto check_fn = [&passed](char const* msg, bool expr) { | |
| 147 | + if (!expr) { | |
| 148 | + passed = false; | |
| 149 | + std::cerr << msg << std::endl; | |
| 150 | + } | |
| 151 | + }; | |
| 152 | +#define check(expr) check_fn(#expr, expr) | |
| 153 | + | |
| 154 | + int obj = 0; | |
| 155 | + int gen = 0; | |
| 156 | + check(!is_indirect_object("", obj, gen)); | |
| 157 | + check(!is_indirect_object("12", obj, gen)); | |
| 158 | + check(!is_indirect_object("x12 0 R", obj, gen)); | |
| 159 | + check(!is_indirect_object("12 0 Rx", obj, gen)); | |
| 160 | + check(!is_indirect_object("12 0R", obj, gen)); | |
| 161 | + check(is_indirect_object("52 1 R", obj, gen)); | |
| 162 | + check(obj == 52); | |
| 163 | + check(gen == 1); | |
| 164 | + check(is_indirect_object("53 20 R", obj, gen)); | |
| 165 | + check(obj == 53); | |
| 166 | + check(gen == 20); | |
| 167 | + check(!is_obj_key("", obj, gen)); | |
| 168 | + check(!is_obj_key("obj:x", obj, gen)); | |
| 169 | + check(!is_obj_key("obj:12 13", obj, gen)); // valid prefix but missing the trailing R | |
| 170 | + check(is_obj_key("obj:12 13 R", obj, gen)); | |
| 171 | + check(obj == 12); | |
| 172 | + check(gen == 13); | |
| 173 | + std::string str; | |
| 174 | + check(!is_unicode_string("", str)); | |
| 175 | + check(!is_unicode_string("xyz", str)); | |
| 176 | + check(!is_unicode_string("x:", str)); | |
| 177 | + check(is_unicode_string("u:potato", str)); | |
| 178 | + check(str == "potato"); | |
| 179 | + check(is_unicode_string("u:", str)); | |
| 180 | + check(str == ""); | |
| 181 | + check(!is_binary_string("", str)); | |
| 182 | + check(!is_binary_string("x:", str)); | |
| 183 | + check(is_binary_string("b:", str)); // an empty payload is accepted | |
| 184 | + check(!is_binary_string("b:1", str)); | |
| 185 | + check(!is_binary_string("b:123", str)); | |
| 186 | + check(!is_binary_string("b:gh", str)); | |
| 187 | + check(is_binary_string("b:12", str)); | |
| 188 | + check(is_binary_string("b:123aBC", str)); | |
| 189 | + check(!is_name("")); | |
| 190 | + check(!is_name("/")); | |
| 191 | + check(!is_name("xyz")); | |
| 192 | + check(is_name("/Potato")); | |
| 193 | + check(is_name("/Potato Salad")); | |
| 194 | + | |
| 195 | + return passed; | |
| 196 | +#undef check | |
| 197 | +} | |
| 66 | 198 | |
| 67 | 199 | static std::function<void(Pipeline*)> |
| 68 | 200 | provide_data(std::shared_ptr<InputSource> is, size_t start, size_t end) |
| ... | ... | @@ -236,13 +368,11 @@ QPDF::JSONReactor::containerEnd(JSON const& value) |
| 236 | 368 | } |
| 237 | 369 | |
| 238 | 370 | QPDFObjectHandle |
| 239 | -QPDF::JSONReactor::reserveObject(std::string const& obj, std::string const& gen) | |
| 371 | +QPDF::JSONReactor::reserveObject(int obj, int gen) | |
| 240 | 372 | { /* Asks pdf for object (obj, gen) via reserveObjectIfNotExists; when the returned handle reports isReserved(), the pair is recorded in this->reserved. The handle is returned either way. */ |
| 241 | - int o = QUtil::string_to_int(obj.c_str()); | |
| 242 | - int g = QUtil::string_to_int(gen.c_str()); | |
| 243 | - auto oh = pdf.reserveObjectIfNotExists(o, g); | |
| 373 | + auto oh = pdf.reserveObjectIfNotExists(obj, gen); | |
| 244 | 374 | if (oh.isReserved()) { |
| 245 | - this->reserved.insert(QPDFObjGen(o, g)); | |
| 375 | + this->reserved.insert(QPDFObjGen(obj, gen)); | |
| 246 | 376 | } |
| 247 | 377 | return oh; |
| 248 | 378 | } |
| ... | ... | @@ -304,10 +434,11 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value) |
| 304 | 434 | bool version_okay = false; |
| 305 | 435 | std::string v; |
| 306 | 436 | if (value.getString(v)) { |
| 307 | - std::smatch m; | |
| 308 | - if (std::regex_match(v, m, PDF_VERSION_RE)) { | |
| 437 | + std::string version; | |
| 438 | + char const* p = v.c_str(); | |
| 439 | + if (QPDF::validatePDFVersion(p, version) && (*p == '\0')) { | |
| 309 | 440 | version_okay = true; |
| 310 | - this->pdf.m->pdf_version = v; | |
| 441 | + this->pdf.m->pdf_version = version; | |
| 311 | 442 | } |
| 312 | 443 | } |
| 313 | 444 | if (!version_okay) { |
| ... | ... | @@ -324,14 +455,15 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value) |
| 324 | 455 | next_state = st_ignore; |
| 325 | 456 | } |
| 326 | 457 | } else if (state == st_objects) { |
| 327 | - std::smatch m; | |
| 458 | + int obj = 0; | |
| 459 | + int gen = 0; | |
| 328 | 460 | if (key == "trailer") { |
| 329 | 461 | this->saw_trailer = true; |
| 330 | 462 | nestedState(key, value, st_trailer); |
| 331 | 463 | this->cur_object = "trailer"; |
| 332 | - } else if (std::regex_match(key, m, OBJ_KEY_RE)) { | |
| 464 | + } else if (is_obj_key(key, obj, gen)) { | |
| 333 | 465 | this->cur_object = key; |
| 334 | - auto oh = reserveObject(m[1].str(), m[2].str()); | |
| 466 | + auto oh = reserveObject(obj, gen); | |
| 335 | 467 | object_stack.push_back(oh); |
| 336 | 468 | nestedState(key, value, st_object_top); |
| 337 | 469 | } else { |
| ... | ... | @@ -494,7 +626,6 @@ QPDF::JSONReactor::makeObject(JSON const& value) |
| 494 | 626 | QPDFObjectHandle result; |
| 495 | 627 | std::string str_v; |
| 496 | 628 | bool bool_v = false; |
| 497 | - std::smatch m; | |
| 498 | 629 | if (value.isDictionary()) { |
| 499 | 630 | result = QPDFObjectHandle::newDictionary(); |
| 500 | 631 | object_stack.push_back(result); |
| ... | ... | @@ -513,13 +644,16 @@ QPDF::JSONReactor::makeObject(JSON const& value) |
| 513 | 644 | result = QPDFObjectHandle::newReal(str_v); |
| 514 | 645 | } |
| 515 | 646 | } else if (value.getString(str_v)) { |
| 516 | - if (std::regex_match(str_v, m, INDIRECT_OBJ_RE)) { | |
| 517 | - result = reserveObject(m[1].str(), m[2].str()); | |
| 518 | - } else if (std::regex_match(str_v, m, UNICODE_RE)) { | |
| 519 | - result = QPDFObjectHandle::newUnicodeString(m[1].str()); | |
| 520 | - } else if (std::regex_match(str_v, m, BINARY_RE)) { | |
| 521 | - result = QPDFObjectHandle::newString(QUtil::hex_decode(m[1].str())); | |
| 522 | - } else if (std::regex_match(str_v, m, NAME_RE)) { | |
| 647 | + int obj = 0; | |
| 648 | + int gen = 0; | |
| 649 | + std::string str; | |
| 650 | + if (is_indirect_object(str_v, obj, gen)) { | |
| 651 | + result = reserveObject(obj, gen); | |
| 652 | + } else if (is_unicode_string(str_v, str)) { | |
| 653 | + result = QPDFObjectHandle::newUnicodeString(str); | |
| 654 | + } else if (is_binary_string(str_v, str)) { | |
| 655 | + result = QPDFObjectHandle::newString(QUtil::hex_decode(str)); | |
| 656 | + } else if (is_name(str_v)) { | |
| 523 | 657 | result = QPDFObjectHandle::newName(str_v); |
| 524 | 658 | } else { |
| 525 | 659 | QTC::TC("qpdf", "QPDF_json unrecognized string value"); | ... | ... |
libtests/json.cc
| ... | ... | @@ -3,6 +3,7 @@ |
| 3 | 3 | #include <qpdf/JSON.hh> |
| 4 | 4 | #include <qpdf/Pipeline.hh> |
| 5 | 5 | #include <qpdf/QPDFObjectHandle.hh> |
| 6 | +#include <qpdf/QPDF.hh> | |
| 6 | 7 | #include <iostream> |
| 7 | 8 | |
| 8 | 9 | static void |
| ... | ... | @@ -271,6 +272,7 @@ main() |
| 271 | 272 | { |
| 272 | 273 | test_main(); |
| 273 | 274 | test_schema(); |
| 275 | + assert(QPDF::test_json_validators()); | |
| 274 | 276 | |
| 275 | 277 | std::cout << "end of json tests\n"; |
| 276 | 278 | return 0; | ... | ... |