Commit 39bc2eb4d9c41ef9707db224ffacc1ec008fb441

Authored by m-holger
1 parent a4b7907e

For QPDFTokenizer add private implementation class qpdf::Tokenizer

include/qpdf/QPDFTokenizer.hh
@@ -29,6 +29,11 @@ @@ -29,6 +29,11 @@
29 #include <memory> 29 #include <memory>
30 #include <string> 30 #include <string>
31 31
  32 +namespace qpdf
  33 +{
  34 + class Tokenizer;
  35 +} // namespace qpdf
  36 +
32 class QPDFTokenizer 37 class QPDFTokenizer
33 { 38 {
34 public: 39 public:
@@ -129,6 +134,9 @@ class QPDFTokenizer @@ -129,6 +134,9 @@ class QPDFTokenizer
129 QPDF_DLL 134 QPDF_DLL
130 QPDFTokenizer(); 135 QPDFTokenizer();
131 136
  137 + QPDF_DLL
  138 + ~QPDFTokenizer();
  139 +
132 // If called, treat EOF as a separate token type instead of an error. This was introduced in 140 // If called, treat EOF as a separate token type instead of an error. This was introduced in
133 // QPDF 4.1 to facilitate tokenizing content streams. 141 // QPDF 4.1 to facilitate tokenizing content streams.
134 QPDF_DLL 142 QPDF_DLL
@@ -218,103 +226,7 @@ class QPDFTokenizer @@ -218,103 +226,7 @@ class QPDFTokenizer
218 QPDFTokenizer(QPDFTokenizer const&) = delete; 226 QPDFTokenizer(QPDFTokenizer const&) = delete;
219 QPDFTokenizer& operator=(QPDFTokenizer const&) = delete; 227 QPDFTokenizer& operator=(QPDFTokenizer const&) = delete;
220 228
221 - bool isSpace(char);  
222 - bool isDelimiter(char);  
223 - void findEI(InputSource& input);  
224 -  
225 - enum state_e {  
226 - st_top,  
227 - st_in_hexstring,  
228 - st_in_string,  
229 - st_in_hexstring_2nd,  
230 - st_name,  
231 - st_literal,  
232 - st_in_space,  
233 - st_in_comment,  
234 - st_string_escape,  
235 - st_char_code,  
236 - st_string_after_cr,  
237 - st_lt,  
238 - st_gt,  
239 - st_inline_image,  
240 - st_sign,  
241 - st_number,  
242 - st_real,  
243 - st_decimal,  
244 - st_name_hex1,  
245 - st_name_hex2,  
246 - st_before_token,  
247 - st_token_ready  
248 - };  
249 -  
250 - void handleCharacter(char);  
251 - void inBeforeToken(char);  
252 - void inTop(char);  
253 - void inSpace(char);  
254 - void inComment(char);  
255 - void inString(char);  
256 - void inName(char);  
257 - void inLt(char);  
258 - void inGt(char);  
259 - void inStringAfterCR(char);  
260 - void inStringEscape(char);  
261 - void inLiteral(char);  
262 - void inCharCode(char);  
263 - void inHexstring(char);  
264 - void inHexstring2nd(char);  
265 - void inInlineImage(char);  
266 - void inTokenReady(char);  
267 - void inNameHex1(char);  
268 - void inNameHex2(char);  
269 - void inSign(char);  
270 - void inDecimal(char);  
271 - void inNumber(char);  
272 - void inReal(char);  
273 - void reset();  
274 -  
275 - // Lexer state  
276 - state_e state;  
277 -  
278 - bool allow_eof;  
279 - bool include_ignorable;  
280 -  
281 - // Current token accumulation  
282 - token_type_e type;  
283 - std::string val;  
284 - std::string raw_val;  
285 - std::string error_message;  
286 - bool before_token;  
287 - bool in_token;  
288 - char char_to_unread;  
289 - size_t inline_image_bytes;  
290 - bool bad;  
291 -  
292 - // State for strings  
293 - int string_depth;  
294 - int char_code;  
295 - char hex_char;  
296 - int digit_count; 229 + std::unique_ptr<qpdf::Tokenizer> m;
297 }; 230 };
298 231
299 -inline QPDFTokenizer::token_type_e  
300 -QPDFTokenizer::getType() const noexcept  
301 -{  
302 - return this->type;  
303 -}  
304 -inline std::string const&  
305 -QPDFTokenizer::getValue() const noexcept  
306 -{  
307 - return (this->type == tt_name || this->type == tt_string) ? this->val : this->raw_val;  
308 -}  
309 -inline std::string const&  
310 -QPDFTokenizer::getRawValue() const noexcept  
311 -{  
312 - return this->raw_val;  
313 -}  
314 -inline std::string const&  
315 -QPDFTokenizer::getErrorMessage() const noexcept  
316 -{  
317 - return this->error_message;  
318 -}  
319 -  
320 #endif // QPDFTOKENIZER_HH 232 #endif // QPDFTOKENIZER_HH
libqpdf/QPDFParser.cc
@@ -4,6 +4,7 @@ @@ -4,6 +4,7 @@
4 #include <qpdf/QPDFObjGen.hh> 4 #include <qpdf/QPDFObjGen.hh>
5 #include <qpdf/QPDFObjectHandle.hh> 5 #include <qpdf/QPDFObjectHandle.hh>
6 #include <qpdf/QPDFObject_private.hh> 6 #include <qpdf/QPDFObject_private.hh>
  7 +#include <qpdf/QPDFTokenizer_private.hh>
7 #include <qpdf/QTC.hh> 8 #include <qpdf/QTC.hh>
8 #include <qpdf/QUtil.hh> 9 #include <qpdf/QUtil.hh>
9 10
libqpdf/QPDFTokenizer.cc
1 -#include <qpdf/QPDFTokenizer.hh> 1 +#include <qpdf/QPDFTokenizer_private.hh>
2 2
3 // DO NOT USE ctype -- it is locale dependent for some things, and it's not worth the risk of 3 // DO NOT USE ctype -- it is locale dependent for some things, and it's not worth the risk of
4 // including it in case it may accidentally be used. 4 // including it in case it may accidentally be used.
@@ -16,6 +16,9 @@ @@ -16,6 +16,9 @@
16 16
17 using namespace qpdf; 17 using namespace qpdf;
18 18
  19 +using Token = QPDFTokenizer::Token;
  20 +using tt = QPDFTokenizer::token_type_e;
  21 +
19 static inline bool 22 static inline bool
20 is_delimiter(char ch) 23 is_delimiter(char ch)
21 { 24 {
@@ -77,10 +80,10 @@ QPDFWordTokenFinder::check() @@ -77,10 +80,10 @@ QPDFWordTokenFinder::check()
77 } 80 }
78 81
79 void 82 void
80 -QPDFTokenizer::reset() 83 +Tokenizer::reset()
81 { 84 {
82 state = st_before_token; 85 state = st_before_token;
83 - type = tt_bad; 86 + type = tt::tt_bad;
84 val.clear(); 87 val.clear();
85 raw_val.clear(); 88 raw_val.clear();
86 error_message = ""; 89 error_message = "";
@@ -105,8 +108,13 @@ QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) : @@ -105,8 +108,13 @@ QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) :
105 } 108 }
106 109
107 QPDFTokenizer::QPDFTokenizer() : 110 QPDFTokenizer::QPDFTokenizer() :
108 - allow_eof(false),  
109 - include_ignorable(false) 111 + m(std::make_unique<qpdf::Tokenizer>())
  112 +{
  113 +}
  114 +
  115 +QPDFTokenizer::~QPDFTokenizer() = default;
  116 +
  117 +Tokenizer::Tokenizer()
110 { 118 {
111 reset(); 119 reset();
112 } 120 }
@@ -114,23 +122,35 @@ QPDFTokenizer::QPDFTokenizer() : @@ -114,23 +122,35 @@ QPDFTokenizer::QPDFTokenizer() :
114 void 122 void
115 QPDFTokenizer::allowEOF() 123 QPDFTokenizer::allowEOF()
116 { 124 {
117 - this->allow_eof = true; 125 + m->allowEOF();
  126 +}
  127 +
  128 +void
  129 +Tokenizer::allowEOF()
  130 +{
  131 + allow_eof = true;
118 } 132 }
119 133
120 void 134 void
121 QPDFTokenizer::includeIgnorable() 135 QPDFTokenizer::includeIgnorable()
122 { 136 {
123 - this->include_ignorable = true; 137 + m->includeIgnorable();
  138 +}
  139 +
  140 +void
  141 +Tokenizer::includeIgnorable()
  142 +{
  143 + include_ignorable = true;
124 } 144 }
125 145
126 bool 146 bool
127 -QPDFTokenizer::isSpace(char ch) 147 +Tokenizer::isSpace(char ch)
128 { 148 {
129 return (ch == '\0' || util::is_space(ch)); 149 return (ch == '\0' || util::is_space(ch));
130 } 150 }
131 151
132 bool 152 bool
133 -QPDFTokenizer::isDelimiter(char ch) 153 +Tokenizer::isDelimiter(char ch)
134 { 154 {
135 return is_delimiter(ch); 155 return is_delimiter(ch);
136 } 156 }
@@ -138,6 +158,12 @@ QPDFTokenizer::isDelimiter(char ch) @@ -138,6 +158,12 @@ QPDFTokenizer::isDelimiter(char ch)
138 void 158 void
139 QPDFTokenizer::presentCharacter(char ch) 159 QPDFTokenizer::presentCharacter(char ch)
140 { 160 {
  161 + m->presentCharacter(ch);
  162 +}
  163 +
  164 +void
  165 +Tokenizer::presentCharacter(char ch)
  166 +{
141 handleCharacter(ch); 167 handleCharacter(ch);
142 168
143 if (this->in_token) { 169 if (this->in_token) {
@@ -146,7 +172,7 @@ QPDFTokenizer::presentCharacter(char ch) @@ -146,7 +172,7 @@ QPDFTokenizer::presentCharacter(char ch)
146 } 172 }
147 173
148 void 174 void
149 -QPDFTokenizer::handleCharacter(char ch) 175 +Tokenizer::handleCharacter(char ch)
150 { 176 {
151 // In some cases, functions called below may call a second handler. This happens whenever you 177 // In some cases, functions called below may call a second handler. This happens whenever you
152 // have to use a character from the next token to detect the end of the current token. 178 // have to use a character from the next token to detect the end of the current token.
@@ -246,14 +272,14 @@ QPDFTokenizer::handleCharacter(char ch) @@ -246,14 +272,14 @@ QPDFTokenizer::handleCharacter(char ch)
246 } 272 }
247 273
248 void 274 void
249 -QPDFTokenizer::inTokenReady(char ch) 275 +Tokenizer::inTokenReady(char ch)
250 { 276 {
251 throw std::logic_error( 277 throw std::logic_error(
252 "INTERNAL ERROR: QPDF tokenizer presented character while token is waiting"); 278 "INTERNAL ERROR: QPDF tokenizer presented character while token is waiting");
253 } 279 }
254 280
255 void 281 void
256 -QPDFTokenizer::inBeforeToken(char ch) 282 +Tokenizer::inBeforeToken(char ch)
257 { 283 {
258 // Note: we specifically do not use ctype here. It is locale-dependent. 284 // Note: we specifically do not use ctype here. It is locale-dependent.
259 if (isSpace(ch)) { 285 if (isSpace(ch)) {
@@ -274,7 +300,7 @@ QPDFTokenizer::inBeforeToken(char ch) @@ -274,7 +300,7 @@ QPDFTokenizer::inBeforeToken(char ch)
274 } 300 }
275 301
276 void 302 void
277 -QPDFTokenizer::inTop(char ch) 303 +Tokenizer::inTop(char ch)
278 { 304 {
279 switch (ch) { 305 switch (ch) {
280 case '(': 306 case '(':
@@ -291,29 +317,29 @@ QPDFTokenizer::inTop(char ch) @@ -291,29 +317,29 @@ QPDFTokenizer::inTop(char ch)
291 return; 317 return;
292 318
293 case (')'): 319 case (')'):
294 - this->type = tt_bad; 320 + this->type = tt::tt_bad;
295 QTC::TC("qpdf", "QPDFTokenizer bad )"); 321 QTC::TC("qpdf", "QPDFTokenizer bad )");
296 this->error_message = "unexpected )"; 322 this->error_message = "unexpected )";
297 this->state = st_token_ready; 323 this->state = st_token_ready;
298 return; 324 return;
299 325
300 case '[': 326 case '[':
301 - this->type = tt_array_open; 327 + this->type = tt::tt_array_open;
302 this->state = st_token_ready; 328 this->state = st_token_ready;
303 return; 329 return;
304 330
305 case ']': 331 case ']':
306 - this->type = tt_array_close; 332 + this->type = tt::tt_array_close;
307 this->state = st_token_ready; 333 this->state = st_token_ready;
308 return; 334 return;
309 335
310 case '{': 336 case '{':
311 - this->type = tt_brace_open; 337 + this->type = tt::tt_brace_open;
312 this->state = st_token_ready; 338 this->state = st_token_ready;
313 return; 339 return;
314 340
315 case '}': 341 case '}':
316 - this->type = tt_brace_close; 342 + this->type = tt::tt_brace_close;
317 this->state = st_token_ready; 343 this->state = st_token_ready;
318 return; 344 return;
319 345
@@ -351,11 +377,11 @@ QPDFTokenizer::inTop(char ch) @@ -351,11 +377,11 @@ QPDFTokenizer::inTop(char ch)
351 } 377 }
352 378
353 void 379 void
354 -QPDFTokenizer::inSpace(char ch) 380 +Tokenizer::inSpace(char ch)
355 { 381 {
356 // We only enter this state if include_ignorable is true. 382 // We only enter this state if include_ignorable is true.
357 if (!isSpace(ch)) { 383 if (!isSpace(ch)) {
358 - this->type = tt_space; 384 + this->type = tt::tt_space;
359 this->in_token = false; 385 this->in_token = false;
360 this->char_to_unread = ch; 386 this->char_to_unread = ch;
361 this->state = st_token_ready; 387 this->state = st_token_ready;
@@ -363,11 +389,11 @@ QPDFTokenizer::inSpace(char ch) @@ -363,11 +389,11 @@ QPDFTokenizer::inSpace(char ch)
363 } 389 }
364 390
365 void 391 void
366 -QPDFTokenizer::inComment(char ch) 392 +Tokenizer::inComment(char ch)
367 { 393 {
368 if ((ch == '\r') || (ch == '\n')) { 394 if ((ch == '\r') || (ch == '\n')) {
369 if (this->include_ignorable) { 395 if (this->include_ignorable) {
370 - this->type = tt_comment; 396 + this->type = tt::tt_comment;
371 this->in_token = false; 397 this->in_token = false;
372 this->char_to_unread = ch; 398 this->char_to_unread = ch;
373 this->state = st_token_ready; 399 this->state = st_token_ready;
@@ -378,7 +404,7 @@ QPDFTokenizer::inComment(char ch) @@ -378,7 +404,7 @@ QPDFTokenizer::inComment(char ch)
378 } 404 }
379 405
380 void 406 void
381 -QPDFTokenizer::inString(char ch) 407 +Tokenizer::inString(char ch)
382 { 408 {
383 switch (ch) { 409 switch (ch) {
384 case '\\': 410 case '\\':
@@ -392,7 +418,7 @@ QPDFTokenizer::inString(char ch) @@ -392,7 +418,7 @@ QPDFTokenizer::inString(char ch)
392 418
393 case ')': 419 case ')':
394 if (--this->string_depth == 0) { 420 if (--this->string_depth == 0) {
395 - this->type = tt_string; 421 + this->type = tt::tt_string;
396 this->state = st_token_ready; 422 this->state = st_token_ready;
397 return; 423 return;
398 } 424 }
@@ -417,7 +443,7 @@ QPDFTokenizer::inString(char ch) @@ -417,7 +443,7 @@ QPDFTokenizer::inString(char ch)
417 } 443 }
418 444
419 void 445 void
420 -QPDFTokenizer::inName(char ch) 446 +Tokenizer::inName(char ch)
421 { 447 {
422 if (isDelimiter(ch)) { 448 if (isDelimiter(ch)) {
423 // A C-locale whitespace character or delimiter terminates token. It is important to unread 449 // A C-locale whitespace character or delimiter terminates token. It is important to unread
@@ -426,7 +452,7 @@ QPDFTokenizer::inName(char ch) @@ -426,7 +452,7 @@ QPDFTokenizer::inName(char ch)
426 // though not on any files in the test suite as of this 452 // though not on any files in the test suite as of this
427 // writing. 453 // writing.
428 454
429 - this->type = this->bad ? tt_bad : tt_name; 455 + this->type = this->bad ? tt::tt_bad : tt::tt_name;
430 this->in_token = false; 456 this->in_token = false;
431 this->char_to_unread = ch; 457 this->char_to_unread = ch;
432 this->state = st_token_ready; 458 this->state = st_token_ready;
@@ -439,7 +465,7 @@ QPDFTokenizer::inName(char ch) @@ -439,7 +465,7 @@ QPDFTokenizer::inName(char ch)
439 } 465 }
440 466
441 void 467 void
442 -QPDFTokenizer::inNameHex1(char ch) 468 +Tokenizer::inNameHex1(char ch)
443 { 469 {
444 this->hex_char = ch; 470 this->hex_char = ch;
445 471
@@ -457,7 +483,7 @@ QPDFTokenizer::inNameHex1(char ch) @@ -457,7 +483,7 @@ QPDFTokenizer::inNameHex1(char ch)
457 } 483 }
458 484
459 void 485 void
460 -QPDFTokenizer::inNameHex2(char ch) 486 +Tokenizer::inNameHex2(char ch)
461 { 487 {
462 if (char hval = util::hex_decode_char(ch); hval < '\20') { 488 if (char hval = util::hex_decode_char(ch); hval < '\20') {
463 this->char_code |= int(hval); 489 this->char_code |= int(hval);
@@ -484,7 +510,7 @@ QPDFTokenizer::inNameHex2(char ch) @@ -484,7 +510,7 @@ QPDFTokenizer::inNameHex2(char ch)
484 } 510 }
485 511
486 void 512 void
487 -QPDFTokenizer::inSign(char ch) 513 +Tokenizer::inSign(char ch)
488 { 514 {
489 if (util::is_digit(ch)) { 515 if (util::is_digit(ch)) {
490 this->state = st_number; 516 this->state = st_number;
@@ -497,7 +523,7 @@ QPDFTokenizer::inSign(char ch) @@ -497,7 +523,7 @@ QPDFTokenizer::inSign(char ch)
497 } 523 }
498 524
499 void 525 void
500 -QPDFTokenizer::inDecimal(char ch) 526 +Tokenizer::inDecimal(char ch)
501 { 527 {
502 if (util::is_digit(ch)) { 528 if (util::is_digit(ch)) {
503 this->state = st_real; 529 this->state = st_real;
@@ -508,13 +534,13 @@ QPDFTokenizer::inDecimal(char ch) @@ -508,13 +534,13 @@ QPDFTokenizer::inDecimal(char ch)
508 } 534 }
509 535
510 void 536 void
511 -QPDFTokenizer::inNumber(char ch) 537 +Tokenizer::inNumber(char ch)
512 { 538 {
513 if (util::is_digit(ch)) { 539 if (util::is_digit(ch)) {
514 } else if (ch == '.') { 540 } else if (ch == '.') {
515 this->state = st_real; 541 this->state = st_real;
516 } else if (isDelimiter(ch)) { 542 } else if (isDelimiter(ch)) {
517 - this->type = tt_integer; 543 + this->type = tt::tt_integer;
518 this->state = st_token_ready; 544 this->state = st_token_ready;
519 this->in_token = false; 545 this->in_token = false;
520 this->char_to_unread = ch; 546 this->char_to_unread = ch;
@@ -524,11 +550,11 @@ QPDFTokenizer::inNumber(char ch) @@ -524,11 +550,11 @@ QPDFTokenizer::inNumber(char ch)
524 } 550 }
525 551
526 void 552 void
527 -QPDFTokenizer::inReal(char ch) 553 +Tokenizer::inReal(char ch)
528 { 554 {
529 if (util::is_digit(ch)) { 555 if (util::is_digit(ch)) {
530 } else if (isDelimiter(ch)) { 556 } else if (isDelimiter(ch)) {
531 - this->type = tt_real; 557 + this->type = tt::tt_real;
532 this->state = st_token_ready; 558 this->state = st_token_ready;
533 this->in_token = false; 559 this->in_token = false;
534 this->char_to_unread = ch; 560 this->char_to_unread = ch;
@@ -537,7 +563,7 @@ QPDFTokenizer::inReal(char ch) @@ -537,7 +563,7 @@ QPDFTokenizer::inReal(char ch)
537 } 563 }
538 } 564 }
539 void 565 void
540 -QPDFTokenizer::inStringEscape(char ch) 566 +Tokenizer::inStringEscape(char ch)
541 { 567 {
542 this->state = st_in_string; 568 this->state = st_in_string;
543 switch (ch) { 569 switch (ch) {
@@ -590,7 +616,7 @@ QPDFTokenizer::inStringEscape(char ch) @@ -590,7 +616,7 @@ QPDFTokenizer::inStringEscape(char ch)
590 } 616 }
591 617
592 void 618 void
593 -QPDFTokenizer::inStringAfterCR(char ch) 619 +Tokenizer::inStringAfterCR(char ch)
594 { 620 {
595 this->state = st_in_string; 621 this->state = st_in_string;
596 if (ch != '\n') { 622 if (ch != '\n') {
@@ -599,10 +625,10 @@ QPDFTokenizer::inStringAfterCR(char ch) @@ -599,10 +625,10 @@ QPDFTokenizer::inStringAfterCR(char ch)
599 } 625 }
600 626
601 void 627 void
602 -QPDFTokenizer::inLt(char ch) 628 +Tokenizer::inLt(char ch)
603 { 629 {
604 if (ch == '<') { 630 if (ch == '<') {
605 - this->type = tt_dict_open; 631 + this->type = tt::tt_dict_open;
606 this->state = st_token_ready; 632 this->state = st_token_ready;
607 return; 633 return;
608 } 634 }
@@ -612,13 +638,13 @@ QPDFTokenizer::inLt(char ch) @@ -612,13 +638,13 @@ QPDFTokenizer::inLt(char ch)
612 } 638 }
613 639
614 void 640 void
615 -QPDFTokenizer::inGt(char ch) 641 +Tokenizer::inGt(char ch)
616 { 642 {
617 if (ch == '>') { 643 if (ch == '>') {
618 - this->type = tt_dict_close; 644 + this->type = tt::tt_dict_close;
619 this->state = st_token_ready; 645 this->state = st_token_ready;
620 } else { 646 } else {
621 - this->type = tt_bad; 647 + this->type = tt::tt_bad;
622 QTC::TC("qpdf", "QPDFTokenizer bad >"); 648 QTC::TC("qpdf", "QPDFTokenizer bad >");
623 this->error_message = "unexpected >"; 649 this->error_message = "unexpected >";
624 this->in_token = false; 650 this->in_token = false;
@@ -628,7 +654,7 @@ QPDFTokenizer::inGt(char ch) @@ -628,7 +654,7 @@ QPDFTokenizer::inGt(char ch)
628 } 654 }
629 655
630 void 656 void
631 -QPDFTokenizer::inLiteral(char ch) 657 +Tokenizer::inLiteral(char ch)
632 { 658 {
633 if (isDelimiter(ch)) { 659 if (isDelimiter(ch)) {
634 // A C-locale whitespace character or delimiter terminates token. It is important to unread 660 // A C-locale whitespace character or delimiter terminates token. It is important to unread
@@ -640,27 +666,27 @@ QPDFTokenizer::inLiteral(char ch) @@ -640,27 +666,27 @@ QPDFTokenizer::inLiteral(char ch)
640 this->char_to_unread = ch; 666 this->char_to_unread = ch;
641 this->state = st_token_ready; 667 this->state = st_token_ready;
642 this->type = (this->raw_val == "true") || (this->raw_val == "false") 668 this->type = (this->raw_val == "true") || (this->raw_val == "false")
643 - ? tt_bool  
644 - : (this->raw_val == "null" ? tt_null : tt_word); 669 + ? tt::tt_bool
  670 + : (this->raw_val == "null" ? tt::tt_null : tt::tt_word);
645 } 671 }
646 } 672 }
647 673
648 void 674 void
649 -QPDFTokenizer::inHexstring(char ch) 675 +Tokenizer::inHexstring(char ch)
650 { 676 {
651 if (char hval = util::hex_decode_char(ch); hval < '\20') { 677 if (char hval = util::hex_decode_char(ch); hval < '\20') {
652 this->char_code = int(hval) << 4; 678 this->char_code = int(hval) << 4;
653 this->state = st_in_hexstring_2nd; 679 this->state = st_in_hexstring_2nd;
654 680
655 } else if (ch == '>') { 681 } else if (ch == '>') {
656 - this->type = tt_string; 682 + this->type = tt::tt_string;
657 this->state = st_token_ready; 683 this->state = st_token_ready;
658 684
659 } else if (isSpace(ch)) { 685 } else if (isSpace(ch)) {
660 // ignore 686 // ignore
661 687
662 } else { 688 } else {
663 - this->type = tt_bad; 689 + this->type = tt::tt_bad;
664 QTC::TC("qpdf", "QPDFTokenizer bad hexstring character"); 690 QTC::TC("qpdf", "QPDFTokenizer bad hexstring character");
665 this->error_message = std::string("invalid character (") + ch + ") in hexstring"; 691 this->error_message = std::string("invalid character (") + ch + ") in hexstring";
666 this->state = st_token_ready; 692 this->state = st_token_ready;
@@ -668,7 +694,7 @@ QPDFTokenizer::inHexstring(char ch) @@ -668,7 +694,7 @@ QPDFTokenizer::inHexstring(char ch)
668 } 694 }
669 695
670 void 696 void
671 -QPDFTokenizer::inHexstring2nd(char ch) 697 +Tokenizer::inHexstring2nd(char ch)
672 { 698 {
673 if (char hval = util::hex_decode_char(ch); hval < '\20') { 699 if (char hval = util::hex_decode_char(ch); hval < '\20') {
674 this->val += char(this->char_code) | hval; 700 this->val += char(this->char_code) | hval;
@@ -677,14 +703,14 @@ QPDFTokenizer::inHexstring2nd(char ch) @@ -677,14 +703,14 @@ QPDFTokenizer::inHexstring2nd(char ch)
677 } else if (ch == '>') { 703 } else if (ch == '>') {
678 // PDF spec says odd hexstrings have implicit trailing 0. 704 // PDF spec says odd hexstrings have implicit trailing 0.
679 this->val += char(this->char_code); 705 this->val += char(this->char_code);
680 - this->type = tt_string; 706 + this->type = tt::tt_string;
681 this->state = st_token_ready; 707 this->state = st_token_ready;
682 708
683 } else if (isSpace(ch)) { 709 } else if (isSpace(ch)) {
684 // ignore 710 // ignore
685 711
686 } else { 712 } else {
687 - this->type = tt_bad; 713 + this->type = tt::tt_bad;
688 QTC::TC("qpdf", "QPDFTokenizer bad hexstring 2nd character"); 714 QTC::TC("qpdf", "QPDFTokenizer bad hexstring 2nd character");
689 this->error_message = std::string("invalid character (") + ch + ") in hexstring"; 715 this->error_message = std::string("invalid character (") + ch + ") in hexstring";
690 this->state = st_token_ready; 716 this->state = st_token_ready;
@@ -692,7 +718,7 @@ QPDFTokenizer::inHexstring2nd(char ch) @@ -692,7 +718,7 @@ QPDFTokenizer::inHexstring2nd(char ch)
692 } 718 }
693 719
694 void 720 void
695 -QPDFTokenizer::inCharCode(char ch) 721 +Tokenizer::inCharCode(char ch)
696 { 722 {
697 bool handled = false; 723 bool handled = false;
698 if (('0' <= ch) && (ch <= '7')) { 724 if (('0' <= ch) && (ch <= '7')) {
@@ -712,11 +738,11 @@ QPDFTokenizer::inCharCode(char ch) @@ -712,11 +738,11 @@ QPDFTokenizer::inCharCode(char ch)
712 } 738 }
713 739
714 void 740 void
715 -QPDFTokenizer::inInlineImage(char ch) 741 +Tokenizer::inInlineImage(char ch)
716 { 742 {
717 if ((this->raw_val.length() + 1) == this->inline_image_bytes) { 743 if ((this->raw_val.length() + 1) == this->inline_image_bytes) {
718 QTC::TC("qpdf", "QPDFTokenizer found EI by byte count"); 744 QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
719 - this->type = tt_inline_image; 745 + this->type = tt::tt_inline_image;
720 this->inline_image_bytes = 0; 746 this->inline_image_bytes = 0;
721 this->state = st_token_ready; 747 this->state = st_token_ready;
722 } 748 }
@@ -725,6 +751,12 @@ QPDFTokenizer::inInlineImage(char ch) @@ -725,6 +751,12 @@ QPDFTokenizer::inInlineImage(char ch)
725 void 751 void
726 QPDFTokenizer::presentEOF() 752 QPDFTokenizer::presentEOF()
727 { 753 {
  754 + m->presentEOF();
  755 +}
  756 +
  757 +void
  758 +Tokenizer::presentEOF()
  759 +{
728 switch (this->state) { 760 switch (this->state) {
729 case st_name: 761 case st_name:
730 case st_name_hex1: 762 case st_name_hex1:
@@ -742,15 +774,15 @@ QPDFTokenizer::presentEOF() @@ -742,15 +774,15 @@ QPDFTokenizer::presentEOF()
742 774
743 case st_top: 775 case st_top:
744 case st_before_token: 776 case st_before_token:
745 - this->type = tt_eof; 777 + this->type = tt::tt_eof;
746 break; 778 break;
747 779
748 case st_in_space: 780 case st_in_space:
749 - this->type = this->include_ignorable ? tt_space : tt_eof; 781 + this->type = this->include_ignorable ? tt::tt_space : tt::tt_eof;
750 break; 782 break;
751 783
752 case st_in_comment: 784 case st_in_comment:
753 - this->type = this->include_ignorable ? tt_comment : tt_bad; 785 + this->type = this->include_ignorable ? tt::tt_comment : tt::tt_bad;
754 break; 786 break;
755 787
756 case st_token_ready: 788 case st_token_ready:
@@ -758,7 +790,7 @@ QPDFTokenizer::presentEOF() @@ -758,7 +790,7 @@ QPDFTokenizer::presentEOF()
758 790
759 default: 791 default:
760 QTC::TC("qpdf", "QPDFTokenizer EOF reading token"); 792 QTC::TC("qpdf", "QPDFTokenizer EOF reading token");
761 - this->type = tt_bad; 793 + this->type = tt::tt_bad;
762 this->error_message = "EOF while reading token"; 794 this->error_message = "EOF while reading token";
763 } 795 }
764 this->state = st_token_ready; 796 this->state = st_token_ready;
@@ -767,12 +799,24 @@ QPDFTokenizer::presentEOF() @@ -767,12 +799,24 @@ QPDFTokenizer::presentEOF()
767 void 799 void
768 QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input) 800 QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
769 { 801 {
  802 + m->expectInlineImage(input);
  803 +}
  804 +
  805 +void
  806 +Tokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
  807 +{
770 expectInlineImage(*input); 808 expectInlineImage(*input);
771 } 809 }
772 810
773 void 811 void
774 QPDFTokenizer::expectInlineImage(InputSource& input) 812 QPDFTokenizer::expectInlineImage(InputSource& input)
775 { 813 {
  814 + m->expectInlineImage(input);
  815 +}
  816 +
  817 +void
  818 +Tokenizer::expectInlineImage(InputSource& input)
  819 +{
776 if (this->state == st_token_ready) { 820 if (this->state == st_token_ready) {
777 reset(); 821 reset();
778 } else if (this->state != st_before_token) { 822 } else if (this->state != st_before_token) {
@@ -786,7 +830,7 @@ QPDFTokenizer::expectInlineImage(InputSource& input) @@ -786,7 +830,7 @@ QPDFTokenizer::expectInlineImage(InputSource& input)
786 } 830 }
787 831
788 void 832 void
789 -QPDFTokenizer::findEI(InputSource& input) 833 +Tokenizer::findEI(InputSource& input)
790 { 834 {
791 qpdf_offset_t last_offset = input.getLastOffset(); 835 qpdf_offset_t last_offset = input.getLastOffset();
792 qpdf_offset_t pos = input.tell(); 836 qpdf_offset_t pos = input.tell();
@@ -816,10 +860,10 @@ QPDFTokenizer::findEI(InputSource& input) @@ -816,10 +860,10 @@ QPDFTokenizer::findEI(InputSource& input)
816 // be pretty sure we've found the actual EI. 860 // be pretty sure we've found the actual EI.
817 for (int i = 0; i < 10; ++i) { 861 for (int i = 0; i < 10; ++i) {
818 QPDFTokenizer::Token t = check.readToken(input, "checker", true); 862 QPDFTokenizer::Token t = check.readToken(input, "checker", true);
819 - token_type_e type = t.getType();  
820 - if (type == tt_eof) { 863 + QPDFTokenizer::token_type_e type = t.getType();
  864 + if (type == tt::tt_eof) {
821 okay = true; 865 okay = true;
822 - } else if (type == tt_bad) { 866 + } else if (type == tt::tt_bad) {
823 found_bad = true; 867 found_bad = true;
824 } else if (t.isWord()) { 868 } else if (t.isWord()) {
825 // The qpdf tokenizer lumps alphabetic and otherwise uncategorized characters into 869 // The qpdf tokenizer lumps alphabetic and otherwise uncategorized characters into
@@ -870,11 +914,17 @@ QPDFTokenizer::findEI(InputSource& input) @@ -870,11 +914,17 @@ QPDFTokenizer::findEI(InputSource& input)
870 bool 914 bool
871 QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch) 915 QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
872 { 916 {
  917 + return m->getToken(token, unread_char, ch);
  918 +}
  919 +
  920 +bool
  921 +Tokenizer::getToken(Token& token, bool& unread_char, char& ch)
  922 +{
873 bool ready = (this->state == st_token_ready); 923 bool ready = (this->state == st_token_ready);
874 unread_char = !this->in_token && !this->before_token; 924 unread_char = !this->in_token && !this->before_token;
875 ch = this->char_to_unread; 925 ch = this->char_to_unread;
876 if (ready) { 926 if (ready) {
877 - token = (!(this->type == tt_name || this->type == tt_string)) 927 + token = (!(this->type == tt::tt_name || this->type == tt::tt_string))
878 ? Token(this->type, this->raw_val, this->raw_val, this->error_message) 928 ? Token(this->type, this->raw_val, this->raw_val, this->error_message)
879 : Token(this->type, this->val, this->raw_val, this->error_message); 929 : Token(this->type, this->val, this->raw_val, this->error_message);
880 930
@@ -886,13 +936,19 @@ QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch) @@ -886,13 +936,19 @@ QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
886 bool 936 bool
887 QPDFTokenizer::betweenTokens() 937 QPDFTokenizer::betweenTokens()
888 { 938 {
889 - return this->before_token; 939 + return m->before_token;
890 } 940 }
891 941
892 QPDFTokenizer::Token 942 QPDFTokenizer::Token
893 QPDFTokenizer::readToken( 943 QPDFTokenizer::readToken(
894 InputSource& input, std::string const& context, bool allow_bad, size_t max_len) 944 InputSource& input, std::string const& context, bool allow_bad, size_t max_len)
895 { 945 {
  946 + return m->readToken(input, context, allow_bad, max_len);
  947 +}
  948 +
  949 +QPDFTokenizer::Token
  950 +Tokenizer::readToken(InputSource& input, std::string const& context, bool allow_bad, size_t max_len)
  951 +{
896 nextToken(input, context, max_len); 952 nextToken(input, context, max_len);
897 953
898 Token token; 954 Token token;
@@ -900,7 +956,7 @@ QPDFTokenizer::readToken( @@ -900,7 +956,7 @@ QPDFTokenizer::readToken(
900 char char_to_unread; 956 char char_to_unread;
901 getToken(token, unread_char, char_to_unread); 957 getToken(token, unread_char, char_to_unread);
902 958
903 - if (token.getType() == tt_bad) { 959 + if (token.getType() == tt::tt_bad) {
904 if (allow_bad) { 960 if (allow_bad) {
905 QTC::TC("qpdf", "QPDFTokenizer allowing bad token"); 961 QTC::TC("qpdf", "QPDFTokenizer allowing bad token");
906 } else { 962 } else {
@@ -919,12 +975,25 @@ QPDFTokenizer::Token @@ -919,12 +975,25 @@ QPDFTokenizer::Token
919 QPDFTokenizer::readToken( 975 QPDFTokenizer::readToken(
920 std::shared_ptr<InputSource> input, std::string const& context, bool allow_bad, size_t max_len) 976 std::shared_ptr<InputSource> input, std::string const& context, bool allow_bad, size_t max_len)
921 { 977 {
  978 + return m->readToken(*input, context, allow_bad, max_len);
  979 +}
  980 +
  981 +QPDFTokenizer::Token
  982 +Tokenizer::readToken(
  983 + std::shared_ptr<InputSource> input, std::string const& context, bool allow_bad, size_t max_len)
  984 +{
922 return readToken(*input, context, allow_bad, max_len); 985 return readToken(*input, context, allow_bad, max_len);
923 } 986 }
924 987
925 bool 988 bool
926 QPDFTokenizer::nextToken(InputSource& input, std::string const& context, size_t max_len) 989 QPDFTokenizer::nextToken(InputSource& input, std::string const& context, size_t max_len)
927 { 990 {
  991 + return m->nextToken(input, context, max_len);
  992 +}
  993 +
  994 +bool
  995 +Tokenizer::nextToken(InputSource& input, std::string const& context, size_t max_len)
  996 +{
928 if (this->state != st_inline_image) { 997 if (this->state != st_inline_image) {
929 reset(); 998 reset();
930 } 999 }
@@ -935,10 +1004,10 @@ QPDFTokenizer::nextToken(InputSource& input, std::string const& context, size_t @@ -935,10 +1004,10 @@ QPDFTokenizer::nextToken(InputSource& input, std::string const& context, size_t
935 if (!input.fastRead(ch)) { 1004 if (!input.fastRead(ch)) {
936 presentEOF(); 1005 presentEOF();
937 1006
938 - if ((this->type == tt_eof) && (!this->allow_eof)) { 1007 + if ((this->type == tt::tt_eof) && (!this->allow_eof)) {
939 // Nothing in the qpdf library calls readToken without allowEOF anymore, so this 1008 // Nothing in the qpdf library calls readToken without allowEOF anymore, so this
940 // case is not exercised. 1009 // case is not exercised.
941 - this->type = tt_bad; 1010 + this->type = tt::tt_bad;
942 this->error_message = "unexpected EOF"; 1011 this->error_message = "unexpected EOF";
943 offset = input.getLastOffset(); 1012 offset = input.getLastOffset();
944 } 1013 }
@@ -953,7 +1022,7 @@ QPDFTokenizer::nextToken(InputSource& input, std::string const& context, size_t @@ -953,7 +1022,7 @@ QPDFTokenizer::nextToken(InputSource& input, std::string const& context, size_t
953 if (max_len && (this->raw_val.length() >= max_len) && (this->state != st_token_ready)) { 1022 if (max_len && (this->raw_val.length() >= max_len) && (this->state != st_token_ready)) {
954 // terminate this token now 1023 // terminate this token now
955 QTC::TC("qpdf", "QPDFTokenizer block long token"); 1024 QTC::TC("qpdf", "QPDFTokenizer block long token");
956 - this->type = tt_bad; 1025 + this->type = tt::tt_bad;
957 this->state = st_token_ready; 1026 this->state = st_token_ready;
958 this->error_message = "exceeded allowable length while reading token"; 1027 this->error_message = "exceeded allowable length while reading token";
959 } 1028 }
@@ -962,7 +1031,7 @@ QPDFTokenizer::nextToken(InputSource& input, std::string const& context, size_t @@ -962,7 +1031,7 @@ QPDFTokenizer::nextToken(InputSource& input, std::string const& context, size_t
962 1031
963 input.fastUnread(!this->in_token && !this->before_token); 1032 input.fastUnread(!this->in_token && !this->before_token);
964 1033
965 - if (this->type != tt_eof) { 1034 + if (this->type != tt::tt_eof) {
966 input.setLastOffset(offset); 1035 input.setLastOffset(offset);
967 } 1036 }
968 1037
libqpdf/qpdf/QPDFTokenizer_private.hh 0 → 100644
  1 +#ifndef QPDFTOKENIZER_PRIVATE_HH
  2 +#define QPDFTOKENIZER_PRIVATE_HH
  3 +
  4 +#include <qpdf/QPDFTokenizer.hh>
  5 +
  6 +namespace qpdf
  7 +{
  8 +
  9 + class Tokenizer
  10 + {
  11 + friend class ::QPDFTokenizer;
  12 +
  13 + public:
  14 + Tokenizer();
  15 +
  16 + // Methods to support QPDFTokenizer. See QPDFTokenizer.hh for detail. Some of these are used
  17 + // by Tokenizer internally but are not accessed directly by the rest of qpdf.
  18 +
  19 + void allowEOF();
  20 + void includeIgnorable();
  21 + void presentCharacter(char ch);
  22 + void presentEOF();
  23 +
  24 + // If a token is available, return true and initialize token with the token, unread_char
  25 + // with whether or not we have to unread the last character, and if unread_char, ch with the
  26 + // character to unread.
  27 + bool getToken(QPDFTokenizer::Token& token, bool& unread_char, char& ch);
  28 +
  29 + // Pull mode:
  30 +
  31 + // Read a token from an input source. Context describes the context in which the token is
  32 + // being read and is used in the exception thrown if there is an error. After a token is
  33 + // read, the position of the input source returned by input->tell() points to just after the
  34 + // token, and the input source's "last offset" as returned by input->getLastOffset() points
  35 + // to the beginning of the token.
  36 + QPDFTokenizer::Token readToken(
  37 + InputSource& input,
  38 + std::string const& context,
  39 + bool allow_bad = false,
  40 + size_t max_len = 0);
  41 +
  42 + QPDFTokenizer::Token readToken(
  43 + std::shared_ptr<InputSource> input,
  44 + std::string const& context,
  45 + bool allow_bad = false,
  46 + size_t max_len = 0);
  47 +
  48 + // Calling this method puts the tokenizer in a state for reading inline images. You should
  49 + // call this method after reading the character following the ID operator. In that state, it
  50 + // will return all data up to BUT NOT INCLUDING the next EI token. After you call this
  51 + // method, the next call to readToken (or the token created next time getToken returns true)
  52 + // will either be tt_inline_image or tt_bad. This is the only way readToken returns a
  53 + // tt_inline_image token.
  54 + void expectInlineImage(std::shared_ptr<InputSource> input);
  55 +
  56 + void expectInlineImage(InputSource& input);
  57 +
  58 + private:
  59 + // Read a token from an input source. Context describes the context in which the token is
  60 + // being read and is used in the exception thrown if there is an error. After a token is
  61 + // read, the position of the input source returned by input->tell() points to just after the
  62 + // token, and the input source's "last offset" as returned by input->getLastOffset() points
  63 + // to the beginning of the token. Returns false if the token is bad or if scanning produced
  64 + // an error message for any reason.
  65 + bool nextToken(InputSource& input, std::string const& context, size_t max_len = 0);
  66 +
  67 + // The following methods are only valid after nextToken has been called and until another
  68 + // QPDFTokenizer method is called. They allow the results of calling nextToken to be
  69 + // accessed without creating a Token, thus avoiding copying information that may not be
  70 + // needed.
  71 + inline QPDFTokenizer::token_type_e getType() const;
  72 + inline std::string const& getValue() const;
  73 + inline std::string const& getRawValue() const;
  74 + inline std::string const& getErrorMessage() const;
  75 +
  76 + Tokenizer(Tokenizer const&) = delete;
  77 + Tokenizer& operator=(Tokenizer const&) = delete;
  78 +
  79 + bool isSpace(char);
  80 + bool isDelimiter(char);
  81 + void findEI(InputSource& input);
  82 +
  83 + enum state_e {
  84 + st_top,
  85 + st_in_hexstring,
  86 + st_in_string,
  87 + st_in_hexstring_2nd,
  88 + st_name,
  89 + st_literal,
  90 + st_in_space,
  91 + st_in_comment,
  92 + st_string_escape,
  93 + st_char_code,
  94 + st_string_after_cr,
  95 + st_lt,
  96 + st_gt,
  97 + st_inline_image,
  98 + st_sign,
  99 + st_number,
  100 + st_real,
  101 + st_decimal,
  102 + st_name_hex1,
  103 + st_name_hex2,
  104 + st_before_token,
  105 + st_token_ready
  106 + };
  107 +
  108 + void handleCharacter(char);
  109 + void inBeforeToken(char);
  110 + void inTop(char);
  111 + void inSpace(char);
  112 + void inComment(char);
  113 + void inString(char);
  114 + void inName(char);
  115 + void inLt(char);
  116 + void inGt(char);
  117 + void inStringAfterCR(char);
  118 + void inStringEscape(char);
  119 + void inLiteral(char);
  120 + void inCharCode(char);
  121 + void inHexstring(char);
  122 + void inHexstring2nd(char);
  123 + void inInlineImage(char);
  124 + void inTokenReady(char);
  125 + void inNameHex1(char);
  126 + void inNameHex2(char);
  127 + void inSign(char);
  128 + void inDecimal(char);
  129 + void inNumber(char);
  130 + void inReal(char);
  131 + void reset();
  132 +
  133 + // Lexer state
  134 + state_e state;
  135 +
  136 + bool allow_eof{false};
  137 + bool include_ignorable{false};
  138 +
  139 + // Current token accumulation
  140 + QPDFTokenizer::token_type_e type;
  141 + std::string val;
  142 + std::string raw_val;
  143 + std::string error_message;
  144 + bool before_token;
  145 + bool in_token;
  146 + char char_to_unread;
  147 + size_t inline_image_bytes;
  148 + bool bad;
  149 +
  150 + // State for strings
  151 + int string_depth;
  152 + int char_code;
  153 + char hex_char;
  154 + int digit_count;
  155 + };
  156 +
  157 + inline QPDFTokenizer::token_type_e
  158 + Tokenizer::getType() const
  159 + {
  160 + return this->type;
  161 + }
  162 + inline std::string const&
  163 + Tokenizer::getValue() const
  164 + {
  165 + return (this->type == QPDFTokenizer::tt_name || this->type == QPDFTokenizer::tt_string)
  166 + ? this->val
  167 + : this->raw_val;
  168 + }
  169 + inline std::string const&
  170 + Tokenizer::getRawValue() const
  171 + {
  172 + return this->raw_val;
  173 + }
  174 + inline std::string const&
  175 + Tokenizer::getErrorMessage() const
  176 + {
  177 + return this->error_message;
  178 + }
  179 +
  180 +} // namespace qpdf
  181 +
  182 +inline QPDFTokenizer::token_type_e
  183 +QPDFTokenizer::getType() const noexcept
  184 +{
  185 + return m->type;
  186 +}
  187 +inline std::string const&
  188 +QPDFTokenizer::getValue() const noexcept
  189 +{
  190 + return (m->type == tt_name || m->type == tt_string) ? m->val : m->raw_val;
  191 +}
  192 +inline std::string const&
  193 +QPDFTokenizer::getRawValue() const noexcept
  194 +{
  195 + return m->raw_val;
  196 +}
  197 +inline std::string const&
  198 +QPDFTokenizer::getErrorMessage() const noexcept
  199 +{
  200 + return m->error_message;
  201 +}
  202 +
  203 +#endif // QPDFTOKENIZER_PRIVATE_HH