Commit 00ad6f555bf9244a918420da5baad2a8cbf724e6
Committed by GitHub
Merge pull request #1379 from m-holger/tokenizer
Refactor QPDFTokenizer
Showing 5 changed files with 312 additions and 193 deletions
include/qpdf/QPDFTokenizer.hh
| ... | ... | @@ -29,6 +29,11 @@ |
| 29 | 29 | #include <memory> |
| 30 | 30 | #include <string> |
| 31 | 31 | |
| 32 | +namespace qpdf | |
| 33 | +{ | |
| 34 | + class Tokenizer; | |
| 35 | +} // namespace qpdf | |
| 36 | + | |
| 32 | 37 | class QPDFTokenizer |
| 33 | 38 | { |
| 34 | 39 | public: |
| ... | ... | @@ -129,6 +134,9 @@ class QPDFTokenizer |
| 129 | 134 | QPDF_DLL |
| 130 | 135 | QPDFTokenizer(); |
| 131 | 136 | |
| 137 | + QPDF_DLL | |
| 138 | + ~QPDFTokenizer(); | |
| 139 | + | |
| 132 | 140 | // If called, treat EOF as a separate token type instead of an error. This was introduced in |
| 133 | 141 | // QPDF 4.1 to facilitate tokenizing content streams. |
| 134 | 142 | QPDF_DLL |
| ... | ... | @@ -198,123 +206,10 @@ class QPDFTokenizer |
| 198 | 206 | private: |
| 199 | 207 | friend class QPDFParser; |
| 200 | 208 | |
| 201 | - // Read a token from an input source. Context describes the context in which the token is being | |
| 202 | - // read and is used in the exception thrown if there is an error. After a token is read, the | |
| 203 | - // position of the input source returned by input->tell() points to just after the token, and | |
| 204 | - // the input source's "last offset" as returned by input->getLastOffset() points to the | |
| 205 | - // beginning of the token. Returns false if the token is bad or if scanning produced an error | |
| 206 | - // message for any reason. | |
| 207 | - | |
| 208 | - bool nextToken(InputSource& input, std::string const& context, size_t max_len = 0); | |
| 209 | - | |
| 210 | - // The following methods are only valid after nextToken has been called and until another | |
| 211 | - // QPDFTokenizer method is called. They allow the results of calling nextToken to be accessed | |
| 212 | - // without creating a Token, thus avoiding copying information that may not be needed. | |
| 213 | - inline token_type_e getType() const noexcept; | |
| 214 | - inline std::string const& getValue() const noexcept; | |
| 215 | - inline std::string const& getRawValue() const noexcept; | |
| 216 | - inline std::string const& getErrorMessage() const noexcept; | |
| 217 | - | |
| 218 | 209 | QPDFTokenizer(QPDFTokenizer const&) = delete; |
| 219 | 210 | QPDFTokenizer& operator=(QPDFTokenizer const&) = delete; |
| 220 | 211 | |
| 221 | - bool isSpace(char); | |
| 222 | - bool isDelimiter(char); | |
| 223 | - void findEI(InputSource& input); | |
| 224 | - | |
| 225 | - enum state_e { | |
| 226 | - st_top, | |
| 227 | - st_in_hexstring, | |
| 228 | - st_in_string, | |
| 229 | - st_in_hexstring_2nd, | |
| 230 | - st_name, | |
| 231 | - st_literal, | |
| 232 | - st_in_space, | |
| 233 | - st_in_comment, | |
| 234 | - st_string_escape, | |
| 235 | - st_char_code, | |
| 236 | - st_string_after_cr, | |
| 237 | - st_lt, | |
| 238 | - st_gt, | |
| 239 | - st_inline_image, | |
| 240 | - st_sign, | |
| 241 | - st_number, | |
| 242 | - st_real, | |
| 243 | - st_decimal, | |
| 244 | - st_name_hex1, | |
| 245 | - st_name_hex2, | |
| 246 | - st_before_token, | |
| 247 | - st_token_ready | |
| 248 | - }; | |
| 249 | - | |
| 250 | - void handleCharacter(char); | |
| 251 | - void inBeforeToken(char); | |
| 252 | - void inTop(char); | |
| 253 | - void inSpace(char); | |
| 254 | - void inComment(char); | |
| 255 | - void inString(char); | |
| 256 | - void inName(char); | |
| 257 | - void inLt(char); | |
| 258 | - void inGt(char); | |
| 259 | - void inStringAfterCR(char); | |
| 260 | - void inStringEscape(char); | |
| 261 | - void inLiteral(char); | |
| 262 | - void inCharCode(char); | |
| 263 | - void inHexstring(char); | |
| 264 | - void inHexstring2nd(char); | |
| 265 | - void inInlineImage(char); | |
| 266 | - void inTokenReady(char); | |
| 267 | - void inNameHex1(char); | |
| 268 | - void inNameHex2(char); | |
| 269 | - void inSign(char); | |
| 270 | - void inDecimal(char); | |
| 271 | - void inNumber(char); | |
| 272 | - void inReal(char); | |
| 273 | - void reset(); | |
| 274 | - | |
| 275 | - // Lexer state | |
| 276 | - state_e state; | |
| 277 | - | |
| 278 | - bool allow_eof; | |
| 279 | - bool include_ignorable; | |
| 280 | - | |
| 281 | - // Current token accumulation | |
| 282 | - token_type_e type; | |
| 283 | - std::string val; | |
| 284 | - std::string raw_val; | |
| 285 | - std::string error_message; | |
| 286 | - bool before_token; | |
| 287 | - bool in_token; | |
| 288 | - char char_to_unread; | |
| 289 | - size_t inline_image_bytes; | |
| 290 | - bool bad; | |
| 291 | - | |
| 292 | - // State for strings | |
| 293 | - int string_depth; | |
| 294 | - int char_code; | |
| 295 | - char hex_char; | |
| 296 | - int digit_count; | |
| 212 | + std::unique_ptr<qpdf::Tokenizer> m; | |
| 297 | 213 | }; |
| 298 | 214 | |
| 299 | -inline QPDFTokenizer::token_type_e | |
| 300 | -QPDFTokenizer::getType() const noexcept | |
| 301 | -{ | |
| 302 | - return this->type; | |
| 303 | -} | |
| 304 | -inline std::string const& | |
| 305 | -QPDFTokenizer::getValue() const noexcept | |
| 306 | -{ | |
| 307 | - return (this->type == tt_name || this->type == tt_string) ? this->val : this->raw_val; | |
| 308 | -} | |
| 309 | -inline std::string const& | |
| 310 | -QPDFTokenizer::getRawValue() const noexcept | |
| 311 | -{ | |
| 312 | - return this->raw_val; | |
| 313 | -} | |
| 314 | -inline std::string const& | |
| 315 | -QPDFTokenizer::getErrorMessage() const noexcept | |
| 316 | -{ | |
| 317 | - return this->error_message; | |
| 318 | -} | |
| 319 | - | |
| 320 | 215 | #endif // QPDFTOKENIZER_HH | ... | ... |
libqpdf/QPDFParser.cc
libqpdf/QPDFTokenizer.cc
| 1 | -#include <qpdf/QPDFTokenizer.hh> | |
| 1 | +#include <qpdf/QPDFTokenizer_private.hh> | |
| 2 | 2 | |
| 3 | 3 | // DO NOT USE ctype -- it is locale dependent for some things, and it's not worth the risk of |
| 4 | 4 | // including it in case it may accidentally be used. |
| ... | ... | @@ -16,6 +16,9 @@ |
| 16 | 16 | |
| 17 | 17 | using namespace qpdf; |
| 18 | 18 | |
| 19 | +using Token = QPDFTokenizer::Token; | |
| 20 | +using tt = QPDFTokenizer::token_type_e; | |
| 21 | + | |
| 19 | 22 | static inline bool |
| 20 | 23 | is_delimiter(char ch) |
| 21 | 24 | { |
| ... | ... | @@ -77,10 +80,10 @@ QPDFWordTokenFinder::check() |
| 77 | 80 | } |
| 78 | 81 | |
| 79 | 82 | void |
| 80 | -QPDFTokenizer::reset() | |
| 83 | +Tokenizer::reset() | |
| 81 | 84 | { |
| 82 | 85 | state = st_before_token; |
| 83 | - type = tt_bad; | |
| 86 | + type = tt::tt_bad; | |
| 84 | 87 | val.clear(); |
| 85 | 88 | raw_val.clear(); |
| 86 | 89 | error_message = ""; |
| ... | ... | @@ -105,8 +108,13 @@ QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) : |
| 105 | 108 | } |
| 106 | 109 | |
| 107 | 110 | QPDFTokenizer::QPDFTokenizer() : |
| 108 | - allow_eof(false), | |
| 109 | - include_ignorable(false) | |
| 111 | + m(std::make_unique<qpdf::Tokenizer>()) | |
| 112 | +{ | |
| 113 | +} | |
| 114 | + | |
| 115 | +QPDFTokenizer::~QPDFTokenizer() = default; | |
| 116 | + | |
| 117 | +Tokenizer::Tokenizer() | |
| 110 | 118 | { |
| 111 | 119 | reset(); |
| 112 | 120 | } |
| ... | ... | @@ -114,23 +122,35 @@ QPDFTokenizer::QPDFTokenizer() : |
| 114 | 122 | void |
| 115 | 123 | QPDFTokenizer::allowEOF() |
| 116 | 124 | { |
| 117 | - this->allow_eof = true; | |
| 125 | + m->allowEOF(); | |
| 126 | +} | |
| 127 | + | |
| 128 | +void | |
| 129 | +Tokenizer::allowEOF() | |
| 130 | +{ | |
| 131 | + allow_eof = true; | |
| 118 | 132 | } |
| 119 | 133 | |
| 120 | 134 | void |
| 121 | 135 | QPDFTokenizer::includeIgnorable() |
| 122 | 136 | { |
| 123 | - this->include_ignorable = true; | |
| 137 | + m->includeIgnorable(); | |
| 138 | +} | |
| 139 | + | |
| 140 | +void | |
| 141 | +Tokenizer::includeIgnorable() | |
| 142 | +{ | |
| 143 | + include_ignorable = true; | |
| 124 | 144 | } |
| 125 | 145 | |
| 126 | 146 | bool |
| 127 | -QPDFTokenizer::isSpace(char ch) | |
| 147 | +Tokenizer::isSpace(char ch) | |
| 128 | 148 | { |
| 129 | 149 | return (ch == '\0' || util::is_space(ch)); |
| 130 | 150 | } |
| 131 | 151 | |
| 132 | 152 | bool |
| 133 | -QPDFTokenizer::isDelimiter(char ch) | |
| 153 | +Tokenizer::isDelimiter(char ch) | |
| 134 | 154 | { |
| 135 | 155 | return is_delimiter(ch); |
| 136 | 156 | } |
| ... | ... | @@ -138,6 +158,12 @@ QPDFTokenizer::isDelimiter(char ch) |
| 138 | 158 | void |
| 139 | 159 | QPDFTokenizer::presentCharacter(char ch) |
| 140 | 160 | { |
| 161 | + m->presentCharacter(ch); | |
| 162 | +} | |
| 163 | + | |
| 164 | +void | |
| 165 | +Tokenizer::presentCharacter(char ch) | |
| 166 | +{ | |
| 141 | 167 | handleCharacter(ch); |
| 142 | 168 | |
| 143 | 169 | if (this->in_token) { |
| ... | ... | @@ -146,7 +172,7 @@ QPDFTokenizer::presentCharacter(char ch) |
| 146 | 172 | } |
| 147 | 173 | |
| 148 | 174 | void |
| 149 | -QPDFTokenizer::handleCharacter(char ch) | |
| 175 | +Tokenizer::handleCharacter(char ch) | |
| 150 | 176 | { |
| 151 | 177 | // In some cases, functions called below may call a second handler. This happens whenever you |
| 152 | 178 | // have to use a character from the next token to detect the end of the current token. |
| ... | ... | @@ -246,14 +272,14 @@ QPDFTokenizer::handleCharacter(char ch) |
| 246 | 272 | } |
| 247 | 273 | |
| 248 | 274 | void |
| 249 | -QPDFTokenizer::inTokenReady(char ch) | |
| 275 | +Tokenizer::inTokenReady(char ch) | |
| 250 | 276 | { |
| 251 | 277 | throw std::logic_error( |
| 252 | 278 | "INTERNAL ERROR: QPDF tokenizer presented character while token is waiting"); |
| 253 | 279 | } |
| 254 | 280 | |
| 255 | 281 | void |
| 256 | -QPDFTokenizer::inBeforeToken(char ch) | |
| 282 | +Tokenizer::inBeforeToken(char ch) | |
| 257 | 283 | { |
| 258 | 284 | // Note: we specifically do not use ctype here. It is locale-dependent. |
| 259 | 285 | if (isSpace(ch)) { |
| ... | ... | @@ -274,7 +300,7 @@ QPDFTokenizer::inBeforeToken(char ch) |
| 274 | 300 | } |
| 275 | 301 | |
| 276 | 302 | void |
| 277 | -QPDFTokenizer::inTop(char ch) | |
| 303 | +Tokenizer::inTop(char ch) | |
| 278 | 304 | { |
| 279 | 305 | switch (ch) { |
| 280 | 306 | case '(': |
| ... | ... | @@ -291,29 +317,29 @@ QPDFTokenizer::inTop(char ch) |
| 291 | 317 | return; |
| 292 | 318 | |
| 293 | 319 | case (')'): |
| 294 | - this->type = tt_bad; | |
| 320 | + this->type = tt::tt_bad; | |
| 295 | 321 | QTC::TC("qpdf", "QPDFTokenizer bad )"); |
| 296 | 322 | this->error_message = "unexpected )"; |
| 297 | 323 | this->state = st_token_ready; |
| 298 | 324 | return; |
| 299 | 325 | |
| 300 | 326 | case '[': |
| 301 | - this->type = tt_array_open; | |
| 327 | + this->type = tt::tt_array_open; | |
| 302 | 328 | this->state = st_token_ready; |
| 303 | 329 | return; |
| 304 | 330 | |
| 305 | 331 | case ']': |
| 306 | - this->type = tt_array_close; | |
| 332 | + this->type = tt::tt_array_close; | |
| 307 | 333 | this->state = st_token_ready; |
| 308 | 334 | return; |
| 309 | 335 | |
| 310 | 336 | case '{': |
| 311 | - this->type = tt_brace_open; | |
| 337 | + this->type = tt::tt_brace_open; | |
| 312 | 338 | this->state = st_token_ready; |
| 313 | 339 | return; |
| 314 | 340 | |
| 315 | 341 | case '}': |
| 316 | - this->type = tt_brace_close; | |
| 342 | + this->type = tt::tt_brace_close; | |
| 317 | 343 | this->state = st_token_ready; |
| 318 | 344 | return; |
| 319 | 345 | |
| ... | ... | @@ -351,11 +377,11 @@ QPDFTokenizer::inTop(char ch) |
| 351 | 377 | } |
| 352 | 378 | |
| 353 | 379 | void |
| 354 | -QPDFTokenizer::inSpace(char ch) | |
| 380 | +Tokenizer::inSpace(char ch) | |
| 355 | 381 | { |
| 356 | 382 | // We only enter this state if include_ignorable is true. |
| 357 | 383 | if (!isSpace(ch)) { |
| 358 | - this->type = tt_space; | |
| 384 | + this->type = tt::tt_space; | |
| 359 | 385 | this->in_token = false; |
| 360 | 386 | this->char_to_unread = ch; |
| 361 | 387 | this->state = st_token_ready; |
| ... | ... | @@ -363,11 +389,11 @@ QPDFTokenizer::inSpace(char ch) |
| 363 | 389 | } |
| 364 | 390 | |
| 365 | 391 | void |
| 366 | -QPDFTokenizer::inComment(char ch) | |
| 392 | +Tokenizer::inComment(char ch) | |
| 367 | 393 | { |
| 368 | 394 | if ((ch == '\r') || (ch == '\n')) { |
| 369 | 395 | if (this->include_ignorable) { |
| 370 | - this->type = tt_comment; | |
| 396 | + this->type = tt::tt_comment; | |
| 371 | 397 | this->in_token = false; |
| 372 | 398 | this->char_to_unread = ch; |
| 373 | 399 | this->state = st_token_ready; |
| ... | ... | @@ -378,7 +404,7 @@ QPDFTokenizer::inComment(char ch) |
| 378 | 404 | } |
| 379 | 405 | |
| 380 | 406 | void |
| 381 | -QPDFTokenizer::inString(char ch) | |
| 407 | +Tokenizer::inString(char ch) | |
| 382 | 408 | { |
| 383 | 409 | switch (ch) { |
| 384 | 410 | case '\\': |
| ... | ... | @@ -392,7 +418,7 @@ QPDFTokenizer::inString(char ch) |
| 392 | 418 | |
| 393 | 419 | case ')': |
| 394 | 420 | if (--this->string_depth == 0) { |
| 395 | - this->type = tt_string; | |
| 421 | + this->type = tt::tt_string; | |
| 396 | 422 | this->state = st_token_ready; |
| 397 | 423 | return; |
| 398 | 424 | } |
| ... | ... | @@ -417,7 +443,7 @@ QPDFTokenizer::inString(char ch) |
| 417 | 443 | } |
| 418 | 444 | |
| 419 | 445 | void |
| 420 | -QPDFTokenizer::inName(char ch) | |
| 446 | +Tokenizer::inName(char ch) | |
| 421 | 447 | { |
| 422 | 448 | if (isDelimiter(ch)) { |
| 423 | 449 | // A C-locale whitespace character or delimiter terminates token. It is important to unread |
| ... | ... | @@ -426,7 +452,7 @@ QPDFTokenizer::inName(char ch) |
| 426 | 452 | // though not on any files in the test suite as of this |
| 427 | 453 | // writing. |
| 428 | 454 | |
| 429 | - this->type = this->bad ? tt_bad : tt_name; | |
| 455 | + this->type = this->bad ? tt::tt_bad : tt::tt_name; | |
| 430 | 456 | this->in_token = false; |
| 431 | 457 | this->char_to_unread = ch; |
| 432 | 458 | this->state = st_token_ready; |
| ... | ... | @@ -439,7 +465,7 @@ QPDFTokenizer::inName(char ch) |
| 439 | 465 | } |
| 440 | 466 | |
| 441 | 467 | void |
| 442 | -QPDFTokenizer::inNameHex1(char ch) | |
| 468 | +Tokenizer::inNameHex1(char ch) | |
| 443 | 469 | { |
| 444 | 470 | this->hex_char = ch; |
| 445 | 471 | |
| ... | ... | @@ -457,7 +483,7 @@ QPDFTokenizer::inNameHex1(char ch) |
| 457 | 483 | } |
| 458 | 484 | |
| 459 | 485 | void |
| 460 | -QPDFTokenizer::inNameHex2(char ch) | |
| 486 | +Tokenizer::inNameHex2(char ch) | |
| 461 | 487 | { |
| 462 | 488 | if (char hval = util::hex_decode_char(ch); hval < '\20') { |
| 463 | 489 | this->char_code |= int(hval); |
| ... | ... | @@ -484,7 +510,7 @@ QPDFTokenizer::inNameHex2(char ch) |
| 484 | 510 | } |
| 485 | 511 | |
| 486 | 512 | void |
| 487 | -QPDFTokenizer::inSign(char ch) | |
| 513 | +Tokenizer::inSign(char ch) | |
| 488 | 514 | { |
| 489 | 515 | if (util::is_digit(ch)) { |
| 490 | 516 | this->state = st_number; |
| ... | ... | @@ -497,7 +523,7 @@ QPDFTokenizer::inSign(char ch) |
| 497 | 523 | } |
| 498 | 524 | |
| 499 | 525 | void |
| 500 | -QPDFTokenizer::inDecimal(char ch) | |
| 526 | +Tokenizer::inDecimal(char ch) | |
| 501 | 527 | { |
| 502 | 528 | if (util::is_digit(ch)) { |
| 503 | 529 | this->state = st_real; |
| ... | ... | @@ -508,13 +534,13 @@ QPDFTokenizer::inDecimal(char ch) |
| 508 | 534 | } |
| 509 | 535 | |
| 510 | 536 | void |
| 511 | -QPDFTokenizer::inNumber(char ch) | |
| 537 | +Tokenizer::inNumber(char ch) | |
| 512 | 538 | { |
| 513 | 539 | if (util::is_digit(ch)) { |
| 514 | 540 | } else if (ch == '.') { |
| 515 | 541 | this->state = st_real; |
| 516 | 542 | } else if (isDelimiter(ch)) { |
| 517 | - this->type = tt_integer; | |
| 543 | + this->type = tt::tt_integer; | |
| 518 | 544 | this->state = st_token_ready; |
| 519 | 545 | this->in_token = false; |
| 520 | 546 | this->char_to_unread = ch; |
| ... | ... | @@ -524,11 +550,11 @@ QPDFTokenizer::inNumber(char ch) |
| 524 | 550 | } |
| 525 | 551 | |
| 526 | 552 | void |
| 527 | -QPDFTokenizer::inReal(char ch) | |
| 553 | +Tokenizer::inReal(char ch) | |
| 528 | 554 | { |
| 529 | 555 | if (util::is_digit(ch)) { |
| 530 | 556 | } else if (isDelimiter(ch)) { |
| 531 | - this->type = tt_real; | |
| 557 | + this->type = tt::tt_real; | |
| 532 | 558 | this->state = st_token_ready; |
| 533 | 559 | this->in_token = false; |
| 534 | 560 | this->char_to_unread = ch; |
| ... | ... | @@ -537,7 +563,7 @@ QPDFTokenizer::inReal(char ch) |
| 537 | 563 | } |
| 538 | 564 | } |
| 539 | 565 | void |
| 540 | -QPDFTokenizer::inStringEscape(char ch) | |
| 566 | +Tokenizer::inStringEscape(char ch) | |
| 541 | 567 | { |
| 542 | 568 | this->state = st_in_string; |
| 543 | 569 | switch (ch) { |
| ... | ... | @@ -590,7 +616,7 @@ QPDFTokenizer::inStringEscape(char ch) |
| 590 | 616 | } |
| 591 | 617 | |
| 592 | 618 | void |
| 593 | -QPDFTokenizer::inStringAfterCR(char ch) | |
| 619 | +Tokenizer::inStringAfterCR(char ch) | |
| 594 | 620 | { |
| 595 | 621 | this->state = st_in_string; |
| 596 | 622 | if (ch != '\n') { |
| ... | ... | @@ -599,10 +625,10 @@ QPDFTokenizer::inStringAfterCR(char ch) |
| 599 | 625 | } |
| 600 | 626 | |
| 601 | 627 | void |
| 602 | -QPDFTokenizer::inLt(char ch) | |
| 628 | +Tokenizer::inLt(char ch) | |
| 603 | 629 | { |
| 604 | 630 | if (ch == '<') { |
| 605 | - this->type = tt_dict_open; | |
| 631 | + this->type = tt::tt_dict_open; | |
| 606 | 632 | this->state = st_token_ready; |
| 607 | 633 | return; |
| 608 | 634 | } |
| ... | ... | @@ -612,13 +638,13 @@ QPDFTokenizer::inLt(char ch) |
| 612 | 638 | } |
| 613 | 639 | |
| 614 | 640 | void |
| 615 | -QPDFTokenizer::inGt(char ch) | |
| 641 | +Tokenizer::inGt(char ch) | |
| 616 | 642 | { |
| 617 | 643 | if (ch == '>') { |
| 618 | - this->type = tt_dict_close; | |
| 644 | + this->type = tt::tt_dict_close; | |
| 619 | 645 | this->state = st_token_ready; |
| 620 | 646 | } else { |
| 621 | - this->type = tt_bad; | |
| 647 | + this->type = tt::tt_bad; | |
| 622 | 648 | QTC::TC("qpdf", "QPDFTokenizer bad >"); |
| 623 | 649 | this->error_message = "unexpected >"; |
| 624 | 650 | this->in_token = false; |
| ... | ... | @@ -628,7 +654,7 @@ QPDFTokenizer::inGt(char ch) |
| 628 | 654 | } |
| 629 | 655 | |
| 630 | 656 | void |
| 631 | -QPDFTokenizer::inLiteral(char ch) | |
| 657 | +Tokenizer::inLiteral(char ch) | |
| 632 | 658 | { |
| 633 | 659 | if (isDelimiter(ch)) { |
| 634 | 660 | // A C-locale whitespace character or delimiter terminates token. It is important to unread |
| ... | ... | @@ -640,27 +666,27 @@ QPDFTokenizer::inLiteral(char ch) |
| 640 | 666 | this->char_to_unread = ch; |
| 641 | 667 | this->state = st_token_ready; |
| 642 | 668 | this->type = (this->raw_val == "true") || (this->raw_val == "false") |
| 643 | - ? tt_bool | |
| 644 | - : (this->raw_val == "null" ? tt_null : tt_word); | |
| 669 | + ? tt::tt_bool | |
| 670 | + : (this->raw_val == "null" ? tt::tt_null : tt::tt_word); | |
| 645 | 671 | } |
| 646 | 672 | } |
| 647 | 673 | |
| 648 | 674 | void |
| 649 | -QPDFTokenizer::inHexstring(char ch) | |
| 675 | +Tokenizer::inHexstring(char ch) | |
| 650 | 676 | { |
| 651 | 677 | if (char hval = util::hex_decode_char(ch); hval < '\20') { |
| 652 | 678 | this->char_code = int(hval) << 4; |
| 653 | 679 | this->state = st_in_hexstring_2nd; |
| 654 | 680 | |
| 655 | 681 | } else if (ch == '>') { |
| 656 | - this->type = tt_string; | |
| 682 | + this->type = tt::tt_string; | |
| 657 | 683 | this->state = st_token_ready; |
| 658 | 684 | |
| 659 | 685 | } else if (isSpace(ch)) { |
| 660 | 686 | // ignore |
| 661 | 687 | |
| 662 | 688 | } else { |
| 663 | - this->type = tt_bad; | |
| 689 | + this->type = tt::tt_bad; | |
| 664 | 690 | QTC::TC("qpdf", "QPDFTokenizer bad hexstring character"); |
| 665 | 691 | this->error_message = std::string("invalid character (") + ch + ") in hexstring"; |
| 666 | 692 | this->state = st_token_ready; |
| ... | ... | @@ -668,7 +694,7 @@ QPDFTokenizer::inHexstring(char ch) |
| 668 | 694 | } |
| 669 | 695 | |
| 670 | 696 | void |
| 671 | -QPDFTokenizer::inHexstring2nd(char ch) | |
| 697 | +Tokenizer::inHexstring2nd(char ch) | |
| 672 | 698 | { |
| 673 | 699 | if (char hval = util::hex_decode_char(ch); hval < '\20') { |
| 674 | 700 | this->val += char(this->char_code) | hval; |
| ... | ... | @@ -677,14 +703,14 @@ QPDFTokenizer::inHexstring2nd(char ch) |
| 677 | 703 | } else if (ch == '>') { |
| 678 | 704 | // PDF spec says odd hexstrings have implicit trailing 0. |
| 679 | 705 | this->val += char(this->char_code); |
| 680 | - this->type = tt_string; | |
| 706 | + this->type = tt::tt_string; | |
| 681 | 707 | this->state = st_token_ready; |
| 682 | 708 | |
| 683 | 709 | } else if (isSpace(ch)) { |
| 684 | 710 | // ignore |
| 685 | 711 | |
| 686 | 712 | } else { |
| 687 | - this->type = tt_bad; | |
| 713 | + this->type = tt::tt_bad; | |
| 688 | 714 | QTC::TC("qpdf", "QPDFTokenizer bad hexstring 2nd character"); |
| 689 | 715 | this->error_message = std::string("invalid character (") + ch + ") in hexstring"; |
| 690 | 716 | this->state = st_token_ready; |
| ... | ... | @@ -692,7 +718,7 @@ QPDFTokenizer::inHexstring2nd(char ch) |
| 692 | 718 | } |
| 693 | 719 | |
| 694 | 720 | void |
| 695 | -QPDFTokenizer::inCharCode(char ch) | |
| 721 | +Tokenizer::inCharCode(char ch) | |
| 696 | 722 | { |
| 697 | 723 | bool handled = false; |
| 698 | 724 | if (('0' <= ch) && (ch <= '7')) { |
| ... | ... | @@ -712,11 +738,11 @@ QPDFTokenizer::inCharCode(char ch) |
| 712 | 738 | } |
| 713 | 739 | |
| 714 | 740 | void |
| 715 | -QPDFTokenizer::inInlineImage(char ch) | |
| 741 | +Tokenizer::inInlineImage(char ch) | |
| 716 | 742 | { |
| 717 | 743 | if ((this->raw_val.length() + 1) == this->inline_image_bytes) { |
| 718 | 744 | QTC::TC("qpdf", "QPDFTokenizer found EI by byte count"); |
| 719 | - this->type = tt_inline_image; | |
| 745 | + this->type = tt::tt_inline_image; | |
| 720 | 746 | this->inline_image_bytes = 0; |
| 721 | 747 | this->state = st_token_ready; |
| 722 | 748 | } |
| ... | ... | @@ -725,6 +751,12 @@ QPDFTokenizer::inInlineImage(char ch) |
| 725 | 751 | void |
| 726 | 752 | QPDFTokenizer::presentEOF() |
| 727 | 753 | { |
| 754 | + m->presentEOF(); | |
| 755 | +} | |
| 756 | + | |
| 757 | +void | |
| 758 | +Tokenizer::presentEOF() | |
| 759 | +{ | |
| 728 | 760 | switch (this->state) { |
| 729 | 761 | case st_name: |
| 730 | 762 | case st_name_hex1: |
| ... | ... | @@ -742,15 +774,15 @@ QPDFTokenizer::presentEOF() |
| 742 | 774 | |
| 743 | 775 | case st_top: |
| 744 | 776 | case st_before_token: |
| 745 | - this->type = tt_eof; | |
| 777 | + this->type = tt::tt_eof; | |
| 746 | 778 | break; |
| 747 | 779 | |
| 748 | 780 | case st_in_space: |
| 749 | - this->type = this->include_ignorable ? tt_space : tt_eof; | |
| 781 | + this->type = this->include_ignorable ? tt::tt_space : tt::tt_eof; | |
| 750 | 782 | break; |
| 751 | 783 | |
| 752 | 784 | case st_in_comment: |
| 753 | - this->type = this->include_ignorable ? tt_comment : tt_bad; | |
| 785 | + this->type = this->include_ignorable ? tt::tt_comment : tt::tt_bad; | |
| 754 | 786 | break; |
| 755 | 787 | |
| 756 | 788 | case st_token_ready: |
| ... | ... | @@ -758,7 +790,7 @@ QPDFTokenizer::presentEOF() |
| 758 | 790 | |
| 759 | 791 | default: |
| 760 | 792 | QTC::TC("qpdf", "QPDFTokenizer EOF reading token"); |
| 761 | - this->type = tt_bad; | |
| 793 | + this->type = tt::tt_bad; | |
| 762 | 794 | this->error_message = "EOF while reading token"; |
| 763 | 795 | } |
| 764 | 796 | this->state = st_token_ready; |
| ... | ... | @@ -767,12 +799,18 @@ QPDFTokenizer::presentEOF() |
| 767 | 799 | void |
| 768 | 800 | QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input) |
| 769 | 801 | { |
| 770 | - expectInlineImage(*input); | |
| 802 | + m->expectInlineImage(*input); | |
| 771 | 803 | } |
| 772 | 804 | |
| 773 | 805 | void |
| 774 | 806 | QPDFTokenizer::expectInlineImage(InputSource& input) |
| 775 | 807 | { |
| 808 | + m->expectInlineImage(input); | |
| 809 | +} | |
| 810 | + | |
| 811 | +void | |
| 812 | +Tokenizer::expectInlineImage(InputSource& input) | |
| 813 | +{ | |
| 776 | 814 | if (this->state == st_token_ready) { |
| 777 | 815 | reset(); |
| 778 | 816 | } else if (this->state != st_before_token) { |
| ... | ... | @@ -786,7 +824,7 @@ QPDFTokenizer::expectInlineImage(InputSource& input) |
| 786 | 824 | } |
| 787 | 825 | |
| 788 | 826 | void |
| 789 | -QPDFTokenizer::findEI(InputSource& input) | |
| 827 | +Tokenizer::findEI(InputSource& input) | |
| 790 | 828 | { |
| 791 | 829 | qpdf_offset_t last_offset = input.getLastOffset(); |
| 792 | 830 | qpdf_offset_t pos = input.tell(); |
| ... | ... | @@ -816,10 +854,10 @@ QPDFTokenizer::findEI(InputSource& input) |
| 816 | 854 | // be pretty sure we've found the actual EI. |
| 817 | 855 | for (int i = 0; i < 10; ++i) { |
| 818 | 856 | QPDFTokenizer::Token t = check.readToken(input, "checker", true); |
| 819 | - token_type_e type = t.getType(); | |
| 820 | - if (type == tt_eof) { | |
| 857 | + QPDFTokenizer::token_type_e type = t.getType(); | |
| 858 | + if (type == tt::tt_eof) { | |
| 821 | 859 | okay = true; |
| 822 | - } else if (type == tt_bad) { | |
| 860 | + } else if (type == tt::tt_bad) { | |
| 823 | 861 | found_bad = true; |
| 824 | 862 | } else if (t.isWord()) { |
| 825 | 863 | // The qpdf tokenizer lumps alphabetic and otherwise uncategorized characters into |
| ... | ... | @@ -870,11 +908,17 @@ QPDFTokenizer::findEI(InputSource& input) |
| 870 | 908 | bool |
| 871 | 909 | QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch) |
| 872 | 910 | { |
| 911 | + return m->getToken(token, unread_char, ch); | |
| 912 | +} | |
| 913 | + | |
| 914 | +bool | |
| 915 | +Tokenizer::getToken(Token& token, bool& unread_char, char& ch) | |
| 916 | +{ | |
| 873 | 917 | bool ready = (this->state == st_token_ready); |
| 874 | 918 | unread_char = !this->in_token && !this->before_token; |
| 875 | 919 | ch = this->char_to_unread; |
| 876 | 920 | if (ready) { |
| 877 | - token = (!(this->type == tt_name || this->type == tt_string)) | |
| 921 | + token = (!(this->type == tt::tt_name || this->type == tt::tt_string)) | |
| 878 | 922 | ? Token(this->type, this->raw_val, this->raw_val, this->error_message) |
| 879 | 923 | : Token(this->type, this->val, this->raw_val, this->error_message); |
| 880 | 924 | |
| ... | ... | @@ -886,13 +930,32 @@ QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch) |
| 886 | 930 | bool |
| 887 | 931 | QPDFTokenizer::betweenTokens() |
| 888 | 932 | { |
| 889 | - return this->before_token; | |
| 933 | + return m->betweenTokens(); | |
| 934 | +} | |
| 935 | + | |
| 936 | +bool | |
| 937 | +Tokenizer::betweenTokens() | |
| 938 | +{ | |
| 939 | + return before_token; | |
| 890 | 940 | } |
| 891 | 941 | |
| 892 | 942 | QPDFTokenizer::Token |
| 893 | 943 | QPDFTokenizer::readToken( |
| 894 | 944 | InputSource& input, std::string const& context, bool allow_bad, size_t max_len) |
| 895 | 945 | { |
| 946 | + return m->readToken(input, context, allow_bad, max_len); | |
| 947 | +} | |
| 948 | + | |
| 949 | +QPDFTokenizer::Token | |
| 950 | +QPDFTokenizer::readToken( | |
| 951 | + std::shared_ptr<InputSource> input, std::string const& context, bool allow_bad, size_t max_len) | |
| 952 | +{ | |
| 953 | + return m->readToken(*input, context, allow_bad, max_len); | |
| 954 | +} | |
| 955 | + | |
| 956 | +QPDFTokenizer::Token | |
| 957 | +Tokenizer::readToken(InputSource& input, std::string const& context, bool allow_bad, size_t max_len) | |
| 958 | +{ | |
| 896 | 959 | nextToken(input, context, max_len); |
| 897 | 960 | |
| 898 | 961 | Token token; |
| ... | ... | @@ -900,7 +963,7 @@ QPDFTokenizer::readToken( |
| 900 | 963 | char char_to_unread; |
| 901 | 964 | getToken(token, unread_char, char_to_unread); |
| 902 | 965 | |
| 903 | - if (token.getType() == tt_bad) { | |
| 966 | + if (token.getType() == tt::tt_bad) { | |
| 904 | 967 | if (allow_bad) { |
| 905 | 968 | QTC::TC("qpdf", "QPDFTokenizer allowing bad token"); |
| 906 | 969 | } else { |
| ... | ... | @@ -915,15 +978,8 @@ QPDFTokenizer::readToken( |
| 915 | 978 | return token; |
| 916 | 979 | } |
| 917 | 980 | |
| 918 | -QPDFTokenizer::Token | |
| 919 | -QPDFTokenizer::readToken( | |
| 920 | - std::shared_ptr<InputSource> input, std::string const& context, bool allow_bad, size_t max_len) | |
| 921 | -{ | |
| 922 | - return readToken(*input, context, allow_bad, max_len); | |
| 923 | -} | |
| 924 | - | |
| 925 | 981 | bool |
| 926 | -QPDFTokenizer::nextToken(InputSource& input, std::string const& context, size_t max_len) | |
| 982 | +Tokenizer::nextToken(InputSource& input, std::string const& context, size_t max_len) | |
| 927 | 983 | { |
| 928 | 984 | if (this->state != st_inline_image) { |
| 929 | 985 | reset(); |
| ... | ... | @@ -935,10 +991,10 @@ QPDFTokenizer::nextToken(InputSource& input, std::string const& context, size_t |
| 935 | 991 | if (!input.fastRead(ch)) { |
| 936 | 992 | presentEOF(); |
| 937 | 993 | |
| 938 | - if ((this->type == tt_eof) && (!this->allow_eof)) { | |
| 994 | + if ((this->type == tt::tt_eof) && (!this->allow_eof)) { | |
| 939 | 995 | // Nothing in the qpdf library calls readToken without allowEOF anymore, so this |
| 940 | 996 | // case is not exercised. |
| 941 | - this->type = tt_bad; | |
| 997 | + this->type = tt::tt_bad; | |
| 942 | 998 | this->error_message = "unexpected EOF"; |
| 943 | 999 | offset = input.getLastOffset(); |
| 944 | 1000 | } |
| ... | ... | @@ -953,7 +1009,7 @@ QPDFTokenizer::nextToken(InputSource& input, std::string const& context, size_t |
| 953 | 1009 | if (max_len && (this->raw_val.length() >= max_len) && (this->state != st_token_ready)) { |
| 954 | 1010 | // terminate this token now |
| 955 | 1011 | QTC::TC("qpdf", "QPDFTokenizer block long token"); |
| 956 | - this->type = tt_bad; | |
| 1012 | + this->type = tt::tt_bad; | |
| 957 | 1013 | this->state = st_token_ready; |
| 958 | 1014 | this->error_message = "exceeded allowable length while reading token"; |
| 959 | 1015 | } |
| ... | ... | @@ -962,7 +1018,7 @@ QPDFTokenizer::nextToken(InputSource& input, std::string const& context, size_t |
| 962 | 1018 | |
| 963 | 1019 | input.fastUnread(!this->in_token && !this->before_token); |
| 964 | 1020 | |
| 965 | - if (this->type != tt_eof) { | |
| 1021 | + if (this->type != tt::tt_eof) { | |
| 966 | 1022 | input.setLastOffset(offset); |
| 967 | 1023 | } |
| 968 | 1024 | ... | ... |
libqpdf/qpdf/QPDFParser.hh
| ... | ... | @@ -3,6 +3,7 @@ |
| 3 | 3 | |
| 4 | 4 | #include <qpdf/QPDFObjectHandle_private.hh> |
| 5 | 5 | #include <qpdf/QPDFObject_private.hh> |
| 6 | +#include <qpdf/QPDFTokenizer_private.hh> | |
| 6 | 7 | |
| 7 | 8 | #include <memory> |
| 8 | 9 | #include <string> |
| ... | ... | @@ -20,7 +21,7 @@ class QPDFParser |
| 20 | 21 | bool parse_pdf) : |
| 21 | 22 | input(input), |
| 22 | 23 | object_description(object_description), |
| 23 | - tokenizer(tokenizer), | |
| 24 | + tokenizer(*tokenizer.m), | |
| 24 | 25 | decrypter(decrypter), |
| 25 | 26 | context(context), |
| 26 | 27 | description( |
| ... | ... | @@ -75,7 +76,7 @@ class QPDFParser |
| 75 | 76 | void setDescription(std::shared_ptr<QPDFObject>& obj, qpdf_offset_t parsed_offset); |
| 76 | 77 | InputSource& input; |
| 77 | 78 | std::string const& object_description; |
| 78 | - QPDFTokenizer& tokenizer; | |
| 79 | + qpdf::Tokenizer& tokenizer; | |
| 79 | 80 | QPDFObjectHandle::StringDecrypter* decrypter; |
| 80 | 81 | QPDF* context; |
| 81 | 82 | std::shared_ptr<QPDFObject::Description> description; | ... | ... |
libqpdf/qpdf/QPDFTokenizer_private.hh
0 → 100644
| 1 | +#ifndef QPDFTOKENIZER_PRIVATE_HH | |
| 2 | +#define QPDFTOKENIZER_PRIVATE_HH | |
| 3 | + | |
| 4 | +#include <qpdf/QPDFTokenizer.hh> | |
| 5 | + | |
| 6 | +namespace qpdf | |
| 7 | +{ | |
| 8 | + | |
| 9 | + class Tokenizer | |
| 10 | + { | |
| 11 | + public: | |
| 12 | + Tokenizer(); | |
| 13 | + Tokenizer(Tokenizer const&) = delete; | |
| 14 | + Tokenizer& operator=(Tokenizer const&) = delete; | |
| 15 | + | |
| 16 | + // Methods to support QPDFTokenizer. See QPDFTokenizer.hh for detail. Some of these are used | |
| 17 | + // by Tokenizer internally but are not accessed directly by the rest of qpdf. | |
| 18 | + | |
| 19 | + void allowEOF(); | |
| 20 | + void includeIgnorable(); | |
| 21 | + void presentCharacter(char ch); | |
| 22 | + void presentEOF(); | |
| 23 | + bool betweenTokens(); | |
| 24 | + | |
| 25 | + // If a token is available, return true and initialize token with the token, unread_char | |
| 26 | + // with whether or not we have to unread the last character, and if unread_char, ch with the | |
| 27 | + // character to unread. | |
| 28 | + bool getToken(QPDFTokenizer::Token& token, bool& unread_char, char& ch); | |
| 29 | + | |
| 30 | + // Read a token from an input source. Context describes the context in which the token is | |
| 31 | + // being read and is used in the exception thrown if there is an error. After a token is | |
| 32 | + // read, the position of the input source returned by input->tell() points to just after the | |
| 33 | + // token, and the input source's "last offset" as returned by input->getLastOffset() points | |
| 34 | + // to the beginning of the token. | |
| 35 | + QPDFTokenizer::Token readToken( | |
| 36 | + InputSource& input, | |
| 37 | + std::string const& context, | |
| 38 | + bool allow_bad = false, | |
| 39 | + size_t max_len = 0); | |
| 40 | + | |
| 41 | + // Calling this method puts the tokenizer in a state for reading inline images. You should | |
| 42 | + // call this method after reading the character following the ID operator. In that state, it | |
| 43 | + // will return all data up to BUT NOT INCLUDING the next EI token. After you call this | |
| 44 | + // method, the next call to readToken (or the token created next time getToken returns true) | |
| 45 | + // will either be tt_inline_image or tt_bad. This is the only way readToken returns a | |
| 46 | + // tt_inline_image token. | |
| 47 | + void expectInlineImage(InputSource& input); | |
| 48 | + | |
| 49 | + // Read a token from an input source. Context describes the context in which the token is | |
| 50 | + // being read and is used in the exception thrown if there is an error. After a token is | |
| 51 | + // read, the position of the input source returned by input->tell() points to just after the | |
| 52 | + // token, and the input source's "last offset" as returned by input->getLastOffset() points | |
| 53 | + // to the beginning of the token. Returns false if the token is bad or if scanning produced | |
| 54 | + // an error message for any reason. | |
| 55 | + bool nextToken(InputSource& input, std::string const& context, size_t max_len = 0); | |
| 56 | + | |
| 57 | + // The following methods are only valid after nextToken has been called and until another | |
| 58 | + // Tokenizer method is called. They allow the results of calling nextToken to be | |
| 59 | + // accessed without creating a Token, thus avoiding copying information that may not be | |
| 60 | + // needed. | |
| 61 | + | |
| 62 | + inline QPDFTokenizer::token_type_e | |
| 63 | + getType() const | |
| 64 | + { | |
| 65 | + return this->type; | |
| 66 | + } | |
| 67 | + inline std::string const& | |
| 68 | + getValue() const | |
| 69 | + { | |
| 70 | + return (this->type == QPDFTokenizer::tt_name || this->type == QPDFTokenizer::tt_string) | |
| 71 | + ? this->val | |
| 72 | + : this->raw_val; | |
| 73 | + } | |
| 74 | + inline std::string const& | |
| 75 | + getRawValue() const | |
| 76 | + { | |
| 77 | + return this->raw_val; | |
| 78 | + } | |
| 79 | + inline std::string const& | |
| 80 | + getErrorMessage() const | |
| 81 | + { | |
| 82 | + return this->error_message; | |
| 83 | + } | |
| 84 | + | |
| 85 | + private: | |
| 86 | + bool isSpace(char); | |
| 87 | + bool isDelimiter(char); | |
| 88 | + void findEI(InputSource& input); | |
| 89 | + | |
| 90 | + enum state_e { | |
| 91 | + st_top, | |
| 92 | + st_in_hexstring, | |
| 93 | + st_in_string, | |
| 94 | + st_in_hexstring_2nd, | |
| 95 | + st_name, | |
| 96 | + st_literal, | |
| 97 | + st_in_space, | |
| 98 | + st_in_comment, | |
| 99 | + st_string_escape, | |
| 100 | + st_char_code, | |
| 101 | + st_string_after_cr, | |
| 102 | + st_lt, | |
| 103 | + st_gt, | |
| 104 | + st_inline_image, | |
| 105 | + st_sign, | |
| 106 | + st_number, | |
| 107 | + st_real, | |
| 108 | + st_decimal, | |
| 109 | + st_name_hex1, | |
| 110 | + st_name_hex2, | |
| 111 | + st_before_token, | |
| 112 | + st_token_ready | |
| 113 | + }; | |
| 114 | + | |
| 115 | + void handleCharacter(char); | |
| 116 | + void inBeforeToken(char); | |
| 117 | + void inTop(char); | |
| 118 | + void inSpace(char); | |
| 119 | + void inComment(char); | |
| 120 | + void inString(char); | |
| 121 | + void inName(char); | |
| 122 | + void inLt(char); | |
| 123 | + void inGt(char); | |
| 124 | + void inStringAfterCR(char); | |
| 125 | + void inStringEscape(char); | |
| 126 | + void inLiteral(char); | |
| 127 | + void inCharCode(char); | |
| 128 | + void inHexstring(char); | |
| 129 | + void inHexstring2nd(char); | |
| 130 | + void inInlineImage(char); | |
| 131 | + void inTokenReady(char); | |
| 132 | + void inNameHex1(char); | |
| 133 | + void inNameHex2(char); | |
| 134 | + void inSign(char); | |
| 135 | + void inDecimal(char); | |
| 136 | + void inNumber(char); | |
| 137 | + void inReal(char); | |
| 138 | + void reset(); | |
| 139 | + | |
| 140 | + // Lexer state | |
| 141 | + state_e state; | |
| 142 | + | |
| 143 | + bool allow_eof{false}; | |
| 144 | + bool include_ignorable{false}; | |
| 145 | + | |
| 146 | + // Current token accumulation | |
| 147 | + QPDFTokenizer::token_type_e type; | |
| 148 | + std::string val; | |
| 149 | + std::string raw_val; | |
| 150 | + std::string error_message; | |
| 151 | + bool before_token; | |
| 152 | + bool in_token; | |
| 153 | + char char_to_unread; | |
| 154 | + size_t inline_image_bytes; | |
| 155 | + bool bad; | |
| 156 | + | |
| 157 | + // State for strings | |
| 158 | + int string_depth; | |
| 159 | + int char_code; | |
| 160 | + char hex_char; | |
| 161 | + int digit_count; | |
| 162 | + }; | |
| 163 | + | |
| 164 | +} // namespace qpdf | |
| 165 | + | |
| 166 | +#endif // QPDFTOKENIZER_PRIVATE_HH | ... | ... |