Commit 39bc2eb4d9c41ef9707db224ffacc1ec008fb441

Authored by m-holger
1 parent a4b7907e

For QPDFTokenizer add private implementation class qpdf::Tokenizer

include/qpdf/QPDFTokenizer.hh
... ... @@ -29,6 +29,11 @@
29 29 #include <memory>
30 30 #include <string>
31 31  
  32 +namespace qpdf
  33 +{
  34 + class Tokenizer;
  35 +} // namespace qpdf
  36 +
32 37 class QPDFTokenizer
33 38 {
34 39 public:
... ... @@ -129,6 +134,9 @@ class QPDFTokenizer
129 134 QPDF_DLL
130 135 QPDFTokenizer();
131 136  
  137 + QPDF_DLL
  138 + ~QPDFTokenizer();
  139 +
132 140 // If called, treat EOF as a separate token type instead of an error. This was introduced in
133 141 // QPDF 4.1 to facilitate tokenizing content streams.
134 142 QPDF_DLL
... ... @@ -218,103 +226,7 @@ class QPDFTokenizer
218 226 QPDFTokenizer(QPDFTokenizer const&) = delete;
219 227 QPDFTokenizer& operator=(QPDFTokenizer const&) = delete;
220 228  
221   - bool isSpace(char);
222   - bool isDelimiter(char);
223   - void findEI(InputSource& input);
224   -
225   - enum state_e {
226   - st_top,
227   - st_in_hexstring,
228   - st_in_string,
229   - st_in_hexstring_2nd,
230   - st_name,
231   - st_literal,
232   - st_in_space,
233   - st_in_comment,
234   - st_string_escape,
235   - st_char_code,
236   - st_string_after_cr,
237   - st_lt,
238   - st_gt,
239   - st_inline_image,
240   - st_sign,
241   - st_number,
242   - st_real,
243   - st_decimal,
244   - st_name_hex1,
245   - st_name_hex2,
246   - st_before_token,
247   - st_token_ready
248   - };
249   -
250   - void handleCharacter(char);
251   - void inBeforeToken(char);
252   - void inTop(char);
253   - void inSpace(char);
254   - void inComment(char);
255   - void inString(char);
256   - void inName(char);
257   - void inLt(char);
258   - void inGt(char);
259   - void inStringAfterCR(char);
260   - void inStringEscape(char);
261   - void inLiteral(char);
262   - void inCharCode(char);
263   - void inHexstring(char);
264   - void inHexstring2nd(char);
265   - void inInlineImage(char);
266   - void inTokenReady(char);
267   - void inNameHex1(char);
268   - void inNameHex2(char);
269   - void inSign(char);
270   - void inDecimal(char);
271   - void inNumber(char);
272   - void inReal(char);
273   - void reset();
274   -
275   - // Lexer state
276   - state_e state;
277   -
278   - bool allow_eof;
279   - bool include_ignorable;
280   -
281   - // Current token accumulation
282   - token_type_e type;
283   - std::string val;
284   - std::string raw_val;
285   - std::string error_message;
286   - bool before_token;
287   - bool in_token;
288   - char char_to_unread;
289   - size_t inline_image_bytes;
290   - bool bad;
291   -
292   - // State for strings
293   - int string_depth;
294   - int char_code;
295   - char hex_char;
296   - int digit_count;
  229 + std::unique_ptr<qpdf::Tokenizer> m;
297 230 };
298 231  
299   -inline QPDFTokenizer::token_type_e
300   -QPDFTokenizer::getType() const noexcept
301   -{
302   - return this->type;
303   -}
304   -inline std::string const&
305   -QPDFTokenizer::getValue() const noexcept
306   -{
307   - return (this->type == tt_name || this->type == tt_string) ? this->val : this->raw_val;
308   -}
309   -inline std::string const&
310   -QPDFTokenizer::getRawValue() const noexcept
311   -{
312   - return this->raw_val;
313   -}
314   -inline std::string const&
315   -QPDFTokenizer::getErrorMessage() const noexcept
316   -{
317   - return this->error_message;
318   -}
319   -
320 232 #endif // QPDFTOKENIZER_HH
... ...
libqpdf/QPDFParser.cc
... ... @@ -4,6 +4,7 @@
4 4 #include <qpdf/QPDFObjGen.hh>
5 5 #include <qpdf/QPDFObjectHandle.hh>
6 6 #include <qpdf/QPDFObject_private.hh>
  7 +#include <qpdf/QPDFTokenizer_private.hh>
7 8 #include <qpdf/QTC.hh>
8 9 #include <qpdf/QUtil.hh>
9 10  
... ...
libqpdf/QPDFTokenizer.cc
1   -#include <qpdf/QPDFTokenizer.hh>
  1 +#include <qpdf/QPDFTokenizer_private.hh>
2 2  
3 3 // DO NOT USE ctype -- it is locale dependent for some things, and it's not worth the risk of
4 4 // including it in case it may accidentally be used.
... ... @@ -16,6 +16,9 @@
16 16  
17 17 using namespace qpdf;
18 18  
  19 +using Token = QPDFTokenizer::Token;
  20 +using tt = QPDFTokenizer::token_type_e;
  21 +
19 22 static inline bool
20 23 is_delimiter(char ch)
21 24 {
... ... @@ -77,10 +80,10 @@ QPDFWordTokenFinder::check()
77 80 }
78 81  
79 82 void
80   -QPDFTokenizer::reset()
  83 +Tokenizer::reset()
81 84 {
82 85 state = st_before_token;
83   - type = tt_bad;
  86 + type = tt::tt_bad;
84 87 val.clear();
85 88 raw_val.clear();
86 89 error_message = "";
... ... @@ -105,8 +108,13 @@ QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) :
105 108 }
106 109  
107 110 QPDFTokenizer::QPDFTokenizer() :
108   - allow_eof(false),
109   - include_ignorable(false)
  111 + m(std::make_unique<qpdf::Tokenizer>())
  112 +{
  113 +}
  114 +
  115 +QPDFTokenizer::~QPDFTokenizer() = default;
  116 +
  117 +Tokenizer::Tokenizer()
110 118 {
111 119 reset();
112 120 }
... ... @@ -114,23 +122,35 @@ QPDFTokenizer::QPDFTokenizer() :
114 122 void
115 123 QPDFTokenizer::allowEOF()
116 124 {
117   - this->allow_eof = true;
  125 + m->allowEOF();
  126 +}
  127 +
  128 +void
  129 +Tokenizer::allowEOF()
  130 +{
  131 + allow_eof = true;
118 132 }
119 133  
120 134 void
121 135 QPDFTokenizer::includeIgnorable()
122 136 {
123   - this->include_ignorable = true;
  137 + m->includeIgnorable();
  138 +}
  139 +
  140 +void
  141 +Tokenizer::includeIgnorable()
  142 +{
  143 + include_ignorable = true;
124 144 }
125 145  
126 146 bool
127   -QPDFTokenizer::isSpace(char ch)
  147 +Tokenizer::isSpace(char ch)
128 148 {
129 149 return (ch == '\0' || util::is_space(ch));
130 150 }
131 151  
132 152 bool
133   -QPDFTokenizer::isDelimiter(char ch)
  153 +Tokenizer::isDelimiter(char ch)
134 154 {
135 155 return is_delimiter(ch);
136 156 }
... ... @@ -138,6 +158,12 @@ QPDFTokenizer::isDelimiter(char ch)
138 158 void
139 159 QPDFTokenizer::presentCharacter(char ch)
140 160 {
  161 + m->presentCharacter(ch);
  162 +}
  163 +
  164 +void
  165 +Tokenizer::presentCharacter(char ch)
  166 +{
141 167 handleCharacter(ch);
142 168  
143 169 if (this->in_token) {
... ... @@ -146,7 +172,7 @@ QPDFTokenizer::presentCharacter(char ch)
146 172 }
147 173  
148 174 void
149   -QPDFTokenizer::handleCharacter(char ch)
  175 +Tokenizer::handleCharacter(char ch)
150 176 {
151 177 // In some cases, functions called below may call a second handler. This happens whenever you
152 178 // have to use a character from the next token to detect the end of the current token.
... ... @@ -246,14 +272,14 @@ QPDFTokenizer::handleCharacter(char ch)
246 272 }
247 273  
248 274 void
249   -QPDFTokenizer::inTokenReady(char ch)
  275 +Tokenizer::inTokenReady(char ch)
250 276 {
251 277 throw std::logic_error(
252 278 "INTERNAL ERROR: QPDF tokenizer presented character while token is waiting");
253 279 }
254 280  
255 281 void
256   -QPDFTokenizer::inBeforeToken(char ch)
  282 +Tokenizer::inBeforeToken(char ch)
257 283 {
258 284 // Note: we specifically do not use ctype here. It is locale-dependent.
259 285 if (isSpace(ch)) {
... ... @@ -274,7 +300,7 @@ QPDFTokenizer::inBeforeToken(char ch)
274 300 }
275 301  
276 302 void
277   -QPDFTokenizer::inTop(char ch)
  303 +Tokenizer::inTop(char ch)
278 304 {
279 305 switch (ch) {
280 306 case '(':
... ... @@ -291,29 +317,29 @@ QPDFTokenizer::inTop(char ch)
291 317 return;
292 318  
293 319 case (')'):
294   - this->type = tt_bad;
  320 + this->type = tt::tt_bad;
295 321 QTC::TC("qpdf", "QPDFTokenizer bad )");
296 322 this->error_message = "unexpected )";
297 323 this->state = st_token_ready;
298 324 return;
299 325  
300 326 case '[':
301   - this->type = tt_array_open;
  327 + this->type = tt::tt_array_open;
302 328 this->state = st_token_ready;
303 329 return;
304 330  
305 331 case ']':
306   - this->type = tt_array_close;
  332 + this->type = tt::tt_array_close;
307 333 this->state = st_token_ready;
308 334 return;
309 335  
310 336 case '{':
311   - this->type = tt_brace_open;
  337 + this->type = tt::tt_brace_open;
312 338 this->state = st_token_ready;
313 339 return;
314 340  
315 341 case '}':
316   - this->type = tt_brace_close;
  342 + this->type = tt::tt_brace_close;
317 343 this->state = st_token_ready;
318 344 return;
319 345  
... ... @@ -351,11 +377,11 @@ QPDFTokenizer::inTop(char ch)
351 377 }
352 378  
353 379 void
354   -QPDFTokenizer::inSpace(char ch)
  380 +Tokenizer::inSpace(char ch)
355 381 {
356 382 // We only enter this state if include_ignorable is true.
357 383 if (!isSpace(ch)) {
358   - this->type = tt_space;
  384 + this->type = tt::tt_space;
359 385 this->in_token = false;
360 386 this->char_to_unread = ch;
361 387 this->state = st_token_ready;
... ... @@ -363,11 +389,11 @@ QPDFTokenizer::inSpace(char ch)
363 389 }
364 390  
365 391 void
366   -QPDFTokenizer::inComment(char ch)
  392 +Tokenizer::inComment(char ch)
367 393 {
368 394 if ((ch == '\r') || (ch == '\n')) {
369 395 if (this->include_ignorable) {
370   - this->type = tt_comment;
  396 + this->type = tt::tt_comment;
371 397 this->in_token = false;
372 398 this->char_to_unread = ch;
373 399 this->state = st_token_ready;
... ... @@ -378,7 +404,7 @@ QPDFTokenizer::inComment(char ch)
378 404 }
379 405  
380 406 void
381   -QPDFTokenizer::inString(char ch)
  407 +Tokenizer::inString(char ch)
382 408 {
383 409 switch (ch) {
384 410 case '\\':
... ... @@ -392,7 +418,7 @@ QPDFTokenizer::inString(char ch)
392 418  
393 419 case ')':
394 420 if (--this->string_depth == 0) {
395   - this->type = tt_string;
  421 + this->type = tt::tt_string;
396 422 this->state = st_token_ready;
397 423 return;
398 424 }
... ... @@ -417,7 +443,7 @@ QPDFTokenizer::inString(char ch)
417 443 }
418 444  
419 445 void
420   -QPDFTokenizer::inName(char ch)
  446 +Tokenizer::inName(char ch)
421 447 {
422 448 if (isDelimiter(ch)) {
423 449 // A C-locale whitespace character or delimiter terminates token. It is important to unread
... ... @@ -426,7 +452,7 @@ QPDFTokenizer::inName(char ch)
426 452 // though not on any files in the test suite as of this
427 453 // writing.
428 454  
429   - this->type = this->bad ? tt_bad : tt_name;
  455 + this->type = this->bad ? tt::tt_bad : tt::tt_name;
430 456 this->in_token = false;
431 457 this->char_to_unread = ch;
432 458 this->state = st_token_ready;
... ... @@ -439,7 +465,7 @@ QPDFTokenizer::inName(char ch)
439 465 }
440 466  
441 467 void
442   -QPDFTokenizer::inNameHex1(char ch)
  468 +Tokenizer::inNameHex1(char ch)
443 469 {
444 470 this->hex_char = ch;
445 471  
... ... @@ -457,7 +483,7 @@ QPDFTokenizer::inNameHex1(char ch)
457 483 }
458 484  
459 485 void
460   -QPDFTokenizer::inNameHex2(char ch)
  486 +Tokenizer::inNameHex2(char ch)
461 487 {
462 488 if (char hval = util::hex_decode_char(ch); hval < '\20') {
463 489 this->char_code |= int(hval);
... ... @@ -484,7 +510,7 @@ QPDFTokenizer::inNameHex2(char ch)
484 510 }
485 511  
486 512 void
487   -QPDFTokenizer::inSign(char ch)
  513 +Tokenizer::inSign(char ch)
488 514 {
489 515 if (util::is_digit(ch)) {
490 516 this->state = st_number;
... ... @@ -497,7 +523,7 @@ QPDFTokenizer::inSign(char ch)
497 523 }
498 524  
499 525 void
500   -QPDFTokenizer::inDecimal(char ch)
  526 +Tokenizer::inDecimal(char ch)
501 527 {
502 528 if (util::is_digit(ch)) {
503 529 this->state = st_real;
... ... @@ -508,13 +534,13 @@ QPDFTokenizer::inDecimal(char ch)
508 534 }
509 535  
510 536 void
511   -QPDFTokenizer::inNumber(char ch)
  537 +Tokenizer::inNumber(char ch)
512 538 {
513 539 if (util::is_digit(ch)) {
514 540 } else if (ch == '.') {
515 541 this->state = st_real;
516 542 } else if (isDelimiter(ch)) {
517   - this->type = tt_integer;
  543 + this->type = tt::tt_integer;
518 544 this->state = st_token_ready;
519 545 this->in_token = false;
520 546 this->char_to_unread = ch;
... ... @@ -524,11 +550,11 @@ QPDFTokenizer::inNumber(char ch)
524 550 }
525 551  
526 552 void
527   -QPDFTokenizer::inReal(char ch)
  553 +Tokenizer::inReal(char ch)
528 554 {
529 555 if (util::is_digit(ch)) {
530 556 } else if (isDelimiter(ch)) {
531   - this->type = tt_real;
  557 + this->type = tt::tt_real;
532 558 this->state = st_token_ready;
533 559 this->in_token = false;
534 560 this->char_to_unread = ch;
... ... @@ -537,7 +563,7 @@ QPDFTokenizer::inReal(char ch)
537 563 }
538 564 }
539 565 void
540   -QPDFTokenizer::inStringEscape(char ch)
  566 +Tokenizer::inStringEscape(char ch)
541 567 {
542 568 this->state = st_in_string;
543 569 switch (ch) {
... ... @@ -590,7 +616,7 @@ QPDFTokenizer::inStringEscape(char ch)
590 616 }
591 617  
592 618 void
593   -QPDFTokenizer::inStringAfterCR(char ch)
  619 +Tokenizer::inStringAfterCR(char ch)
594 620 {
595 621 this->state = st_in_string;
596 622 if (ch != '\n') {
... ... @@ -599,10 +625,10 @@ QPDFTokenizer::inStringAfterCR(char ch)
599 625 }
600 626  
601 627 void
602   -QPDFTokenizer::inLt(char ch)
  628 +Tokenizer::inLt(char ch)
603 629 {
604 630 if (ch == '<') {
605   - this->type = tt_dict_open;
  631 + this->type = tt::tt_dict_open;
606 632 this->state = st_token_ready;
607 633 return;
608 634 }
... ... @@ -612,13 +638,13 @@ QPDFTokenizer::inLt(char ch)
612 638 }
613 639  
614 640 void
615   -QPDFTokenizer::inGt(char ch)
  641 +Tokenizer::inGt(char ch)
616 642 {
617 643 if (ch == '>') {
618   - this->type = tt_dict_close;
  644 + this->type = tt::tt_dict_close;
619 645 this->state = st_token_ready;
620 646 } else {
621   - this->type = tt_bad;
  647 + this->type = tt::tt_bad;
622 648 QTC::TC("qpdf", "QPDFTokenizer bad >");
623 649 this->error_message = "unexpected >";
624 650 this->in_token = false;
... ... @@ -628,7 +654,7 @@ QPDFTokenizer::inGt(char ch)
628 654 }
629 655  
630 656 void
631   -QPDFTokenizer::inLiteral(char ch)
  657 +Tokenizer::inLiteral(char ch)
632 658 {
633 659 if (isDelimiter(ch)) {
634 660 // A C-locale whitespace character or delimiter terminates token. It is important to unread
... ... @@ -640,27 +666,27 @@ QPDFTokenizer::inLiteral(char ch)
640 666 this->char_to_unread = ch;
641 667 this->state = st_token_ready;
642 668 this->type = (this->raw_val == "true") || (this->raw_val == "false")
643   - ? tt_bool
644   - : (this->raw_val == "null" ? tt_null : tt_word);
  669 + ? tt::tt_bool
  670 + : (this->raw_val == "null" ? tt::tt_null : tt::tt_word);
645 671 }
646 672 }
647 673  
648 674 void
649   -QPDFTokenizer::inHexstring(char ch)
  675 +Tokenizer::inHexstring(char ch)
650 676 {
651 677 if (char hval = util::hex_decode_char(ch); hval < '\20') {
652 678 this->char_code = int(hval) << 4;
653 679 this->state = st_in_hexstring_2nd;
654 680  
655 681 } else if (ch == '>') {
656   - this->type = tt_string;
  682 + this->type = tt::tt_string;
657 683 this->state = st_token_ready;
658 684  
659 685 } else if (isSpace(ch)) {
660 686 // ignore
661 687  
662 688 } else {
663   - this->type = tt_bad;
  689 + this->type = tt::tt_bad;
664 690 QTC::TC("qpdf", "QPDFTokenizer bad hexstring character");
665 691 this->error_message = std::string("invalid character (") + ch + ") in hexstring";
666 692 this->state = st_token_ready;
... ... @@ -668,7 +694,7 @@ QPDFTokenizer::inHexstring(char ch)
668 694 }
669 695  
670 696 void
671   -QPDFTokenizer::inHexstring2nd(char ch)
  697 +Tokenizer::inHexstring2nd(char ch)
672 698 {
673 699 if (char hval = util::hex_decode_char(ch); hval < '\20') {
674 700 this->val += char(this->char_code) | hval;
... ... @@ -677,14 +703,14 @@ QPDFTokenizer::inHexstring2nd(char ch)
677 703 } else if (ch == '>') {
678 704 // PDF spec says odd hexstrings have implicit trailing 0.
679 705 this->val += char(this->char_code);
680   - this->type = tt_string;
  706 + this->type = tt::tt_string;
681 707 this->state = st_token_ready;
682 708  
683 709 } else if (isSpace(ch)) {
684 710 // ignore
685 711  
686 712 } else {
687   - this->type = tt_bad;
  713 + this->type = tt::tt_bad;
688 714 QTC::TC("qpdf", "QPDFTokenizer bad hexstring 2nd character");
689 715 this->error_message = std::string("invalid character (") + ch + ") in hexstring";
690 716 this->state = st_token_ready;
... ... @@ -692,7 +718,7 @@ QPDFTokenizer::inHexstring2nd(char ch)
692 718 }
693 719  
694 720 void
695   -QPDFTokenizer::inCharCode(char ch)
  721 +Tokenizer::inCharCode(char ch)
696 722 {
697 723 bool handled = false;
698 724 if (('0' <= ch) && (ch <= '7')) {
... ... @@ -712,11 +738,11 @@ QPDFTokenizer::inCharCode(char ch)
712 738 }
713 739  
714 740 void
715   -QPDFTokenizer::inInlineImage(char ch)
  741 +Tokenizer::inInlineImage(char ch)
716 742 {
717 743 if ((this->raw_val.length() + 1) == this->inline_image_bytes) {
718 744 QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
719   - this->type = tt_inline_image;
  745 + this->type = tt::tt_inline_image;
720 746 this->inline_image_bytes = 0;
721 747 this->state = st_token_ready;
722 748 }
... ... @@ -725,6 +751,12 @@ QPDFTokenizer::inInlineImage(char ch)
725 751 void
726 752 QPDFTokenizer::presentEOF()
727 753 {
  754 + m->presentEOF();
  755 +}
  756 +
  757 +void
  758 +Tokenizer::presentEOF()
  759 +{
728 760 switch (this->state) {
729 761 case st_name:
730 762 case st_name_hex1:
... ... @@ -742,15 +774,15 @@ QPDFTokenizer::presentEOF()
742 774  
743 775 case st_top:
744 776 case st_before_token:
745   - this->type = tt_eof;
  777 + this->type = tt::tt_eof;
746 778 break;
747 779  
748 780 case st_in_space:
749   - this->type = this->include_ignorable ? tt_space : tt_eof;
  781 + this->type = this->include_ignorable ? tt::tt_space : tt::tt_eof;
750 782 break;
751 783  
752 784 case st_in_comment:
753   - this->type = this->include_ignorable ? tt_comment : tt_bad;
  785 + this->type = this->include_ignorable ? tt::tt_comment : tt::tt_bad;
754 786 break;
755 787  
756 788 case st_token_ready:
... ... @@ -758,7 +790,7 @@ QPDFTokenizer::presentEOF()
758 790  
759 791 default:
760 792 QTC::TC("qpdf", "QPDFTokenizer EOF reading token");
761   - this->type = tt_bad;
  793 + this->type = tt::tt_bad;
762 794 this->error_message = "EOF while reading token";
763 795 }
764 796 this->state = st_token_ready;
... ... @@ -767,12 +799,24 @@ QPDFTokenizer::presentEOF()
767 799 void
768 800 QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
769 801 {
  802 + m->expectInlineImage(input);
  803 +}
  804 +
  805 +void
  806 +Tokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
  807 +{
770 808 expectInlineImage(*input);
771 809 }
772 810  
773 811 void
774 812 QPDFTokenizer::expectInlineImage(InputSource& input)
775 813 {
  814 + m->expectInlineImage(input);
  815 +}
  816 +
  817 +void
  818 +Tokenizer::expectInlineImage(InputSource& input)
  819 +{
776 820 if (this->state == st_token_ready) {
777 821 reset();
778 822 } else if (this->state != st_before_token) {
... ... @@ -786,7 +830,7 @@ QPDFTokenizer::expectInlineImage(InputSource& input)
786 830 }
787 831  
788 832 void
789   -QPDFTokenizer::findEI(InputSource& input)
  833 +Tokenizer::findEI(InputSource& input)
790 834 {
791 835 qpdf_offset_t last_offset = input.getLastOffset();
792 836 qpdf_offset_t pos = input.tell();
... ... @@ -816,10 +860,10 @@ QPDFTokenizer::findEI(InputSource& input)
816 860 // be pretty sure we've found the actual EI.
817 861 for (int i = 0; i < 10; ++i) {
818 862 QPDFTokenizer::Token t = check.readToken(input, "checker", true);
819   - token_type_e type = t.getType();
820   - if (type == tt_eof) {
  863 + QPDFTokenizer::token_type_e type = t.getType();
  864 + if (type == tt::tt_eof) {
821 865 okay = true;
822   - } else if (type == tt_bad) {
  866 + } else if (type == tt::tt_bad) {
823 867 found_bad = true;
824 868 } else if (t.isWord()) {
825 869 // The qpdf tokenizer lumps alphabetic and otherwise uncategorized characters into
... ... @@ -870,11 +914,17 @@ QPDFTokenizer::findEI(InputSource& input)
870 914 bool
871 915 QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
872 916 {
  917 + return m->getToken(token, unread_char, ch);
  918 +}
  919 +
  920 +bool
  921 +Tokenizer::getToken(Token& token, bool& unread_char, char& ch)
  922 +{
873 923 bool ready = (this->state == st_token_ready);
874 924 unread_char = !this->in_token && !this->before_token;
875 925 ch = this->char_to_unread;
876 926 if (ready) {
877   - token = (!(this->type == tt_name || this->type == tt_string))
  927 + token = (!(this->type == tt::tt_name || this->type == tt::tt_string))
878 928 ? Token(this->type, this->raw_val, this->raw_val, this->error_message)
879 929 : Token(this->type, this->val, this->raw_val, this->error_message);
880 930  
... ... @@ -886,13 +936,19 @@ QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
886 936 bool
887 937 QPDFTokenizer::betweenTokens()
888 938 {
889   - return this->before_token;
  939 + return m->before_token;
890 940 }
891 941  
892 942 QPDFTokenizer::Token
893 943 QPDFTokenizer::readToken(
894 944 InputSource& input, std::string const& context, bool allow_bad, size_t max_len)
895 945 {
  946 + return m->readToken(input, context, allow_bad, max_len);
  947 +}
  948 +
  949 +QPDFTokenizer::Token
  950 +Tokenizer::readToken(InputSource& input, std::string const& context, bool allow_bad, size_t max_len)
  951 +{
896 952 nextToken(input, context, max_len);
897 953  
898 954 Token token;
... ... @@ -900,7 +956,7 @@ QPDFTokenizer::readToken(
900 956 char char_to_unread;
901 957 getToken(token, unread_char, char_to_unread);
902 958  
903   - if (token.getType() == tt_bad) {
  959 + if (token.getType() == tt::tt_bad) {
904 960 if (allow_bad) {
905 961 QTC::TC("qpdf", "QPDFTokenizer allowing bad token");
906 962 } else {
... ... @@ -919,12 +975,25 @@ QPDFTokenizer::Token
919 975 QPDFTokenizer::readToken(
920 976 std::shared_ptr<InputSource> input, std::string const& context, bool allow_bad, size_t max_len)
921 977 {
  978 + return m->readToken(*input, context, allow_bad, max_len);
  979 +}
  980 +
  981 +QPDFTokenizer::Token
  982 +Tokenizer::readToken(
  983 + std::shared_ptr<InputSource> input, std::string const& context, bool allow_bad, size_t max_len)
  984 +{
922 985 return readToken(*input, context, allow_bad, max_len);
923 986 }
924 987  
925 988 bool
926 989 QPDFTokenizer::nextToken(InputSource& input, std::string const& context, size_t max_len)
927 990 {
  991 + return m->nextToken(input, context, max_len);
  992 +}
  993 +
  994 +bool
  995 +Tokenizer::nextToken(InputSource& input, std::string const& context, size_t max_len)
  996 +{
928 997 if (this->state != st_inline_image) {
929 998 reset();
930 999 }
... ... @@ -935,10 +1004,10 @@ QPDFTokenizer::nextToken(InputSource& input, std::string const& context, size_t
935 1004 if (!input.fastRead(ch)) {
936 1005 presentEOF();
937 1006  
938   - if ((this->type == tt_eof) && (!this->allow_eof)) {
  1007 + if ((this->type == tt::tt_eof) && (!this->allow_eof)) {
939 1008 // Nothing in the qpdf library calls readToken without allowEOF anymore, so this
940 1009 // case is not exercised.
941   - this->type = tt_bad;
  1010 + this->type = tt::tt_bad;
942 1011 this->error_message = "unexpected EOF";
943 1012 offset = input.getLastOffset();
944 1013 }
... ... @@ -953,7 +1022,7 @@ QPDFTokenizer::nextToken(InputSource& input, std::string const& context, size_t
953 1022 if (max_len && (this->raw_val.length() >= max_len) && (this->state != st_token_ready)) {
954 1023 // terminate this token now
955 1024 QTC::TC("qpdf", "QPDFTokenizer block long token");
956   - this->type = tt_bad;
  1025 + this->type = tt::tt_bad;
957 1026 this->state = st_token_ready;
958 1027 this->error_message = "exceeded allowable length while reading token";
959 1028 }
... ... @@ -962,7 +1031,7 @@ QPDFTokenizer::nextToken(InputSource& input, std::string const& context, size_t
962 1031  
963 1032 input.fastUnread(!this->in_token && !this->before_token);
964 1033  
965   - if (this->type != tt_eof) {
  1034 + if (this->type != tt::tt_eof) {
966 1035 input.setLastOffset(offset);
967 1036 }
968 1037  
... ...
libqpdf/qpdf/QPDFTokenizer_private.hh 0 → 100644
  1 +#ifndef QPDFTOKENIZER_PRIVATE_HH
  2 +#define QPDFTOKENIZER_PRIVATE_HH
  3 +
  4 +#include <qpdf/QPDFTokenizer.hh>
  5 +
  6 +namespace qpdf
  7 +{
  8 +
  9 + class Tokenizer
  10 + {
  11 + friend class ::QPDFTokenizer;
  12 +
  13 + public:
  14 + Tokenizer();
  15 +
  16 + // Methods to support QPDFTokenizer. See QPDFTokenizer.hh for detail. Some of these are used
  17 + // by Tokenizer internally but are not accessed directly by the rest of qpdf.
  18 +
  19 + void allowEOF();
  20 + void includeIgnorable();
  21 + void presentCharacter(char ch);
  22 + void presentEOF();
  23 +
  24 + // If a token is available, return true and initialize token with the token, unread_char
  25 + // with whether or not we have to unread the last character, and if unread_char, ch with the
  26 + // character to unread.
  27 + bool getToken(QPDFTokenizer::Token& token, bool& unread_char, char& ch);
  28 +
  29 + // Pull mode:
  30 +
  31 + // Read a token from an input source. Context describes the context in which the token is
  32 + // being read and is used in the exception thrown if there is an error. After a token is
  33 + // read, the position of the input source returned by input->tell() points to just after the
  34 + // token, and the input source's "last offset" as returned by input->getLastOffset() points
  35 + // to the beginning of the token.
  36 + QPDFTokenizer::Token readToken(
  37 + InputSource& input,
  38 + std::string const& context,
  39 + bool allow_bad = false,
  40 + size_t max_len = 0);
  41 +
  42 + QPDFTokenizer::Token readToken(
  43 + std::shared_ptr<InputSource> input,
  44 + std::string const& context,
  45 + bool allow_bad = false,
  46 + size_t max_len = 0);
  47 +
  48 + // Calling this method puts the tokenizer in a state for reading inline images. You should
  49 + // call this method after reading the character following the ID operator. In that state, it
  50 + // will return all data up to BUT NOT INCLUDING the next EI token. After you call this
  51 + // method, the next call to readToken (or the token created next time getToken returns true)
  52 + // will either be tt_inline_image or tt_bad. This is the only way readToken returns a
  53 + // tt_inline_image token.
  54 + void expectInlineImage(std::shared_ptr<InputSource> input);
  55 +
  56 + void expectInlineImage(InputSource& input);
  57 +
  58 + private:
  59 + // Read a token from an input source. Context describes the context in which the token is
  60 + // being read and is used in the exception thrown if there is an error. After a token is
  61 + // read, the position of the input source returned by input->tell() points to just after the
  62 + // token, and the input source's "last offset" as returned by input->getLastOffset() points
  63 + // to the beginning of the token. Returns false if the token is bad or if scanning produced
  64 + // an error message for any reason.
  65 + bool nextToken(InputSource& input, std::string const& context, size_t max_len = 0);
  66 +
  67 + // The following methods are only valid after nextToken has been called and until another
  68 + // QPDFTokenizer method is called. They allow the results of calling nextToken to be
  69 + // accessed without creating a Token, thus avoiding copying information that may not be
  70 + // needed.
  71 + inline QPDFTokenizer::token_type_e getType() const;
  72 + inline std::string const& getValue() const;
  73 + inline std::string const& getRawValue() const;
  74 + inline std::string const& getErrorMessage() const;
  75 +
  76 + Tokenizer(Tokenizer const&) = delete;
  77 + Tokenizer& operator=(Tokenizer const&) = delete;
  78 +
  79 + bool isSpace(char);
  80 + bool isDelimiter(char);
  81 + void findEI(InputSource& input);
  82 +
  83 + enum state_e {
  84 + st_top,
  85 + st_in_hexstring,
  86 + st_in_string,
  87 + st_in_hexstring_2nd,
  88 + st_name,
  89 + st_literal,
  90 + st_in_space,
  91 + st_in_comment,
  92 + st_string_escape,
  93 + st_char_code,
  94 + st_string_after_cr,
  95 + st_lt,
  96 + st_gt,
  97 + st_inline_image,
  98 + st_sign,
  99 + st_number,
  100 + st_real,
  101 + st_decimal,
  102 + st_name_hex1,
  103 + st_name_hex2,
  104 + st_before_token,
  105 + st_token_ready
  106 + };
  107 +
  108 + void handleCharacter(char);
  109 + void inBeforeToken(char);
  110 + void inTop(char);
  111 + void inSpace(char);
  112 + void inComment(char);
  113 + void inString(char);
  114 + void inName(char);
  115 + void inLt(char);
  116 + void inGt(char);
  117 + void inStringAfterCR(char);
  118 + void inStringEscape(char);
  119 + void inLiteral(char);
  120 + void inCharCode(char);
  121 + void inHexstring(char);
  122 + void inHexstring2nd(char);
  123 + void inInlineImage(char);
  124 + void inTokenReady(char);
  125 + void inNameHex1(char);
  126 + void inNameHex2(char);
  127 + void inSign(char);
  128 + void inDecimal(char);
  129 + void inNumber(char);
  130 + void inReal(char);
  131 + void reset();
  132 +
  133 + // Lexer state
  134 + state_e state;
  135 +
  136 + bool allow_eof{false};
  137 + bool include_ignorable{false};
  138 +
  139 + // Current token accumulation
  140 + QPDFTokenizer::token_type_e type;
  141 + std::string val;
  142 + std::string raw_val;
  143 + std::string error_message;
  144 + bool before_token;
  145 + bool in_token;
  146 + char char_to_unread;
  147 + size_t inline_image_bytes;
  148 + bool bad;
  149 +
  150 + // State for strings
  151 + int string_depth;
  152 + int char_code;
  153 + char hex_char;
  154 + int digit_count;
  155 + };
  156 +
  157 + inline QPDFTokenizer::token_type_e
  158 + Tokenizer::getType() const
  159 + {
  160 + return this->type;
  161 + }
  162 + inline std::string const&
  163 + Tokenizer::getValue() const
  164 + {
  165 + return (this->type == QPDFTokenizer::tt_name || this->type == QPDFTokenizer::tt_string)
  166 + ? this->val
  167 + : this->raw_val;
  168 + }
  169 + inline std::string const&
  170 + Tokenizer::getRawValue() const
  171 + {
  172 + return this->raw_val;
  173 + }
  174 + inline std::string const&
  175 + Tokenizer::getErrorMessage() const
  176 + {
  177 + return this->error_message;
  178 + }
  179 +
  180 +} // namespace qpdf
  181 +
  182 +inline QPDFTokenizer::token_type_e
  183 +QPDFTokenizer::getType() const noexcept
  184 +{
  185 + return m->type;
  186 +}
  187 +inline std::string const&
  188 +QPDFTokenizer::getValue() const noexcept
  189 +{
  190 + return (m->type == tt_name || m->type == tt_string) ? m->val : m->raw_val;
  191 +}
  192 +inline std::string const&
  193 +QPDFTokenizer::getRawValue() const noexcept
  194 +{
  195 + return m->raw_val;
  196 +}
  197 +inline std::string const&
  198 +QPDFTokenizer::getErrorMessage() const noexcept
  199 +{
  200 + return m->error_message;
  201 +}
  202 +
  203 +#endif // QPDFTOKENIZER_PRIVATE_HH
... ...