Commit a3f3238f371f07cd2b2e1a96753cde6590712dc5

Authored by m-holger
1 parent 6111a6a4

Split QPDFTokenizer::handleCharacter into individual methods

include/qpdf/QPDFTokenizer.hh
@@ -200,26 +200,36 @@ class QPDFTokenizer @@ -200,26 +200,36 @@ class QPDFTokenizer
200 200
201 enum state_e { 201 enum state_e {
202 st_top, 202 st_top,
  203 + st_in_hexstring,
  204 + st_in_string,
  205 + st_in_hexstring_2nd,
  206 + st_literal,
203 st_in_space, 207 st_in_space,
204 st_in_comment, 208 st_in_comment,
205 - st_in_string,  
206 st_string_escape, 209 st_string_escape,
207 st_char_code, 210 st_char_code,
208 st_string_after_cr, 211 st_string_after_cr,
209 st_lt, 212 st_lt,
210 st_gt, 213 st_gt,
211 - st_literal,  
212 - st_in_hexstring,  
213 - st_in_hexstring_2nd,  
214 st_inline_image, 214 st_inline_image,
215 st_token_ready 215 st_token_ready
216 }; 216 };
217 217
218 void handleCharacter(char); 218 void handleCharacter(char);
  219 + void inTop(char);
  220 + void inSpace(char);
  221 + void inComment(char);
  222 + void inString(char);
  223 + void inLt(char);
  224 + void inGt(char);
  225 + void inStringAfterCR(char);
  226 + void inStringEscape(char);
  227 + void inLiteral(char);
219 void inCharCode(char); 228 void inCharCode(char);
220 void inHexstring(char); 229 void inHexstring(char);
221 void inHexstring2nd(char); 230 void inHexstring2nd(char);
222 - void inString(char); 231 + void inInlineImage(char);
  232 + void inTokenReady(char);
223 233
224 void reset(); 234 void reset();
225 235
libqpdf/QPDFTokenizer.cc
@@ -217,134 +217,24 @@ QPDFTokenizer::handleCharacter(char ch) @@ -217,134 +217,24 @@ QPDFTokenizer::handleCharacter(char ch)
217 // the character that caused a state change in the new state. 217 // the character that caused a state change in the new state.
218 218
219 switch (this->state) { 219 switch (this->state) {
220 - case (st_token_ready):  
221 - throw std::logic_error(  
222 - "INTERNAL ERROR: QPDF tokenizer presented character "  
223 - "while token is waiting");  
224 -  
225 case st_top: 220 case st_top:
226 - // Note: we specifically do not use ctype here. It is  
227 - // locale-dependent.  
228 - if (isSpace(ch)) {  
229 - if (this->include_ignorable) {  
230 - this->state = st_in_space;  
231 - this->val += ch;  
232 - }  
233 - return;  
234 - }  
235 - switch (ch) {  
236 - case '%':  
237 - this->state = st_in_comment;  
238 - if (this->include_ignorable) {  
239 - this->val += ch;  
240 - }  
241 - return;  
242 -  
243 - case '(':  
244 - this->string_depth = 1;  
245 - this->state = st_in_string;  
246 - return;  
247 -  
248 - case '<':  
249 - this->state = st_lt;  
250 - return;  
251 -  
252 - case '>':  
253 - this->state = st_gt;  
254 - return;  
255 -  
256 - case (')'):  
257 - this->type = tt_bad;  
258 - QTC::TC("qpdf", "QPDFTokenizer bad )");  
259 - this->error_message = "unexpected )";  
260 - this->val += ch;  
261 - this->state = st_token_ready;  
262 - return;  
263 -  
264 - case '[':  
265 - this->type = tt_array_open;  
266 - this->state = st_token_ready;  
267 - this->val += ch;  
268 - return;  
269 -  
270 - case ']':  
271 - this->type = tt_array_close;  
272 - this->val += ch;  
273 - this->state = st_token_ready;  
274 - return;  
275 -  
276 - case '{':  
277 - this->type = tt_brace_open;  
278 - this->state = st_token_ready;  
279 - this->val += ch;  
280 - return;  
281 -  
282 - case '}':  
283 - this->type = tt_brace_close;  
284 - this->state = st_token_ready;  
285 - this->val += ch;  
286 - return;  
287 -  
288 - default:  
289 - this->state = st_literal;  
290 - this->val += ch;  
291 - return;  
292 - } 221 + inTop(ch);
  222 + return;
293 223
294 case st_in_space: 224 case st_in_space:
295 - // We only enter this state if include_ignorable is true.  
296 - if (!isSpace(ch)) {  
297 - this->type = tt_space;  
298 - this->unread_char = true;  
299 - this->char_to_unread = ch;  
300 - this->state = st_token_ready;  
301 - return;  
302 - } else {  
303 - this->val += ch;  
304 - return;  
305 - } 225 + inSpace(ch);
  226 + return;
306 227
307 case st_in_comment: 228 case st_in_comment:
308 - if ((ch == '\r') || (ch == '\n')) {  
309 - if (this->include_ignorable) {  
310 - this->type = tt_comment;  
311 - this->unread_char = true;  
312 - this->char_to_unread = ch;  
313 - this->state = st_token_ready;  
314 - } else {  
315 - this->state = st_top;  
316 - }  
317 - } else if (this->include_ignorable) {  
318 - this->val += ch;  
319 - } 229 + inComment(ch);
320 return; 230 return;
321 231
322 case st_lt: 232 case st_lt:
323 - if (ch == '<') {  
324 - this->val += "<<";  
325 - this->type = tt_dict_open;  
326 - this->state = st_token_ready;  
327 - return;  
328 - }  
329 -  
330 - this->state = st_in_hexstring;  
331 - inHexstring(ch); 233 + inLt(ch);
332 return; 234 return;
333 235
334 case st_gt: 236 case st_gt:
335 - if (ch == '>') {  
336 - this->val += ">>";  
337 - this->type = tt_dict_close;  
338 - this->state = st_token_ready;  
339 - } else {  
340 - this->val += ">";  
341 - this->type = tt_bad;  
342 - QTC::TC("qpdf", "QPDFTokenizer bad >");  
343 - this->error_message = "unexpected >";  
344 - this->unread_char = true;  
345 - this->char_to_unread = ch;  
346 - this->state = st_token_ready;  
347 - } 237 + inGt(ch);
348 return; 238 return;
349 239
350 case st_in_string: 240 case st_in_string:
@@ -352,107 +242,308 @@ QPDFTokenizer::handleCharacter(char ch) @@ -352,107 +242,308 @@ QPDFTokenizer::handleCharacter(char ch)
352 return; 242 return;
353 243
354 case st_string_after_cr: 244 case st_string_after_cr:
355 - // CR LF in strings are either ignored or normalized to CR  
356 - this->state = st_in_string;  
357 - if (ch != '\n') {  
358 - inString(ch);  
359 - } 245 + inStringAfterCR(ch);
360 return; 246 return;
361 247
362 case st_string_escape: 248 case st_string_escape:
363 - this->state = st_in_string;  
364 - switch (ch) {  
365 - case '0':  
366 - case '1':  
367 - case '2':  
368 - case '3':  
369 - case '4':  
370 - case '5':  
371 - case '6':  
372 - case '7':  
373 - this->state = st_char_code;  
374 - this->char_code = 0;  
375 - this->digit_count = 0;  
376 - inCharCode(ch);  
377 - return; 249 + inStringEscape(ch);
  250 + return;
378 251
379 - case 'n':  
380 - this->val += '\n';  
381 - return; 252 + case st_char_code:
  253 + inCharCode(ch);
  254 + return;
382 255
383 - case 'r':  
384 - this->val += '\r';  
385 - return; 256 + case st_literal:
  257 + inLiteral(ch);
  258 + return;
386 259
387 - case 't':  
388 - this->val += '\t';  
389 - return; 260 + case st_inline_image:
  261 + inInlineImage(ch);
  262 + return;
  263 + this->val += ch;
390 264
391 - case 'b':  
392 - this->val += '\b';  
393 - return; 265 + case st_in_hexstring:
  266 + inHexstring(ch);
  267 + return;
394 268
395 - case 'f':  
396 - this->val += '\f';  
397 - return; 269 + case st_in_hexstring_2nd:
  270 + inHexstring2nd(ch);
  271 + return;
398 272
399 - case '\n':  
400 - return; 273 + case (st_token_ready):
  274 + inTokenReady(ch);
  275 + return;
401 276
402 - case '\r':  
403 - this->state = st_string_after_cr;  
404 - return; 277 + default:
  278 + throw std::logic_error(
  279 + "INTERNAL ERROR: invalid state while reading token");
  280 + }
  281 +}
  282 +
  283 +void
  284 +QPDFTokenizer::inTokenReady(char ch)
  285 +{
  286 + throw std::logic_error("INTERNAL ERROR: QPDF tokenizer presented character "
  287 + "while token is waiting");
  288 +}
405 289
406 - default:  
407 - // PDF spec says backslash is ignored before anything else 290 +void
  291 +QPDFTokenizer::inTop(char ch)
  292 +{
  293 + // Note: we specifically do not use ctype here. It is
  294 + // locale-dependent.
  295 + if (isSpace(ch)) {
  296 + if (this->include_ignorable) {
  297 + this->state = st_in_space;
408 this->val += ch; 298 this->val += ch;
409 return; 299 return;
410 } 300 }
  301 + return;
  302 + }
  303 + switch (ch) {
  304 + case '%':
  305 + this->state = st_in_comment;
  306 + if (this->include_ignorable) {
  307 + this->val += ch;
  308 + }
  309 + return;
411 310
412 - case st_char_code:  
413 - inCharCode(ch); 311 + case '(':
  312 + this->string_depth = 1;
  313 + this->state = st_in_string;
414 return; 314 return;
415 315
416 - case st_literal:  
417 - if (isDelimiter(ch)) {  
418 - // A C-locale whitespace character or delimiter terminates  
419 - // token. It is important to unread the whitespace  
420 - // character even though it is ignored since it may be the  
421 - // newline after a stream keyword. Removing it here could  
422 - // make the stream-reading code break on some files,  
423 - // though not on any files in the test suite as of this  
424 - // writing.  
425 -  
426 - this->type = tt_word; 316 + case '<':
  317 + this->state = st_lt;
  318 + return;
  319 +
  320 + case '>':
  321 + this->state = st_gt;
  322 + return;
  323 +
  324 + case (')'):
  325 + this->type = tt_bad;
  326 + QTC::TC("qpdf", "QPDFTokenizer bad )");
  327 + this->error_message = "unexpected )";
  328 + this->val += ch;
  329 + this->state = st_token_ready;
  330 + return;
  331 +
  332 + case '[':
  333 + this->type = tt_array_open;
  334 + this->state = st_token_ready;
  335 + this->val += ch;
  336 + return;
  337 +
  338 + case ']':
  339 + this->type = tt_array_close;
  340 + this->val += ch;
  341 + this->state = st_token_ready;
  342 + return;
  343 +
  344 + case '{':
  345 + this->type = tt_brace_open;
  346 + this->state = st_token_ready;
  347 + this->val += ch;
  348 + return;
  349 +
  350 + case '}':
  351 + this->type = tt_brace_close;
  352 + this->state = st_token_ready;
  353 + this->val += ch;
  354 + return;
  355 +
  356 + default:
  357 + this->state = st_literal;
  358 + this->val += ch;
  359 + return;
  360 + }
  361 +}
  362 +
  363 +void
  364 +QPDFTokenizer::inSpace(char ch)
  365 +{
  366 + // We only enter this state if include_ignorable is true.
  367 + if (!isSpace(ch)) {
  368 + this->type = tt_space;
  369 + this->unread_char = true;
  370 + this->char_to_unread = ch;
  371 + this->state = st_token_ready;
  372 + return;
  373 + } else {
  374 + this->val += ch;
  375 + return;
  376 + }
  377 +}
  378 +
  379 +void
  380 +QPDFTokenizer::inComment(char ch)
  381 +{
  382 + if ((ch == '\r') || (ch == '\n')) {
  383 + if (this->include_ignorable) {
  384 + this->type = tt_comment;
427 this->unread_char = true; 385 this->unread_char = true;
428 this->char_to_unread = ch; 386 this->char_to_unread = ch;
429 this->state = st_token_ready; 387 this->state = st_token_ready;
430 } else { 388 } else {
431 - this->val += ch; 389 + this->state = st_top;
432 } 390 }
  391 + } else if (this->include_ignorable) {
  392 + this->val += ch;
  393 + }
  394 +}
  395 +
  396 +void
  397 +QPDFTokenizer::inString(char ch)
  398 +{
  399 + switch (ch) {
  400 + case '\\':
  401 + this->state = st_string_escape;
433 return; 402 return;
434 403
435 - case st_inline_image: 404 + case '(':
436 this->val += ch; 405 this->val += ch;
437 - if (this->val.length() == this->inline_image_bytes) {  
438 - QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");  
439 - this->type = tt_inline_image;  
440 - this->inline_image_bytes = 0; 406 + ++this->string_depth;
  407 + return;
  408 +
  409 + case ')':
  410 + if (--this->string_depth == 0) {
  411 + this->type = tt_string;
441 this->state = st_token_ready; 412 this->state = st_token_ready;
  413 + return;
442 } 414 }
  415 +
  416 + this->val += ch;
443 return; 417 return;
444 418
445 - case st_in_hexstring:  
446 - inHexstring(ch); 419 + case '\r':
  420 + // CR by itself is converted to LF
  421 + this->val += '\n';
  422 + this->state = st_string_after_cr;
447 return; 423 return;
448 424
449 - case st_in_hexstring_2nd:  
450 - inHexstring2nd(ch); 425 + case '\n':
  426 + this->val += ch;
451 return; 427 return;
452 428
453 default: 429 default:
454 - throw std::logic_error(  
455 - "INTERNAL ERROR: invalid state while reading token"); 430 + this->val += ch;
  431 + return;
  432 + }
  433 +}
  434 +
  435 +void
  436 +QPDFTokenizer::inStringEscape(char ch)
  437 +{
  438 + this->state = st_in_string;
  439 + switch (ch) {
  440 + case '0':
  441 + case '1':
  442 + case '2':
  443 + case '3':
  444 + case '4':
  445 + case '5':
  446 + case '6':
  447 + case '7':
  448 + this->state = st_char_code;
  449 + this->char_code = 0;
  450 + this->digit_count = 0;
  451 + inCharCode(ch);
  452 + return;
  453 +
  454 + case 'n':
  455 + this->val += '\n';
  456 + return;
  457 +
  458 + case 'r':
  459 + this->val += '\r';
  460 + return;
  461 +
  462 + case 't':
  463 + this->val += '\t';
  464 + return;
  465 +
  466 + case 'b':
  467 + this->val += '\b';
  468 + return;
  469 +
  470 + case 'f':
  471 + this->val += '\f';
  472 + return;
  473 +
  474 + case '\n':
  475 + return;
  476 +
  477 + case '\r':
  478 + this->state = st_string_after_cr;
  479 + return;
  480 +
  481 + default:
  482 + // PDF spec says backslash is ignored before anything else
  483 + this->val += ch;
  484 + return;
  485 + }
  486 +}
  487 +
  488 +void
  489 +QPDFTokenizer::inStringAfterCR(char ch)
  490 +{
  491 + this->state = st_in_string;
  492 + if (ch != '\n') {
  493 + inString(ch);
  494 + }
  495 +}
  496 +
  497 +void
  498 +QPDFTokenizer::inLt(char ch)
  499 +{
  500 + if (ch == '<') {
  501 + this->val += "<<";
  502 + this->type = tt_dict_open;
  503 + this->state = st_token_ready;
  504 + return;
  505 + }
  506 +
  507 + this->state = st_in_hexstring;
  508 + inHexstring(ch);
  509 +}
  510 +
  511 +void
  512 +QPDFTokenizer::inGt(char ch)
  513 +{
  514 + if (ch == '>') {
  515 + this->val += ">>";
  516 + this->type = tt_dict_close;
  517 + this->state = st_token_ready;
  518 + } else {
  519 + this->val += ">";
  520 + this->type = tt_bad;
  521 + QTC::TC("qpdf", "QPDFTokenizer bad >");
  522 + this->error_message = "unexpected >";
  523 + this->unread_char = true;
  524 + this->char_to_unread = ch;
  525 + this->state = st_token_ready;
  526 + }
  527 +}
  528 +
  529 +void
  530 +QPDFTokenizer::inLiteral(char ch)
  531 +{
  532 + if (isDelimiter(ch)) {
  533 + // A C-locale whitespace character or delimiter terminates
  534 + // token. It is important to unread the whitespace
  535 + // character even though it is ignored since it may be the
  536 + // newline after a stream keyword. Removing it here could
  537 + // make the stream-reading code break on some files,
  538 + // though not on any files in the test suite as of this
  539 + // writing.
  540 +
  541 + this->type = tt_word;
  542 + this->unread_char = true;
  543 + this->char_to_unread = ch;
  544 + this->state = st_token_ready;
  545 + } else {
  546 + this->val += ch;
456 } 547 }
457 } 548 }
458 549
@@ -521,45 +612,6 @@ QPDFTokenizer::inHexstring2nd(char ch) @@ -521,45 +612,6 @@ QPDFTokenizer::inHexstring2nd(char ch)
521 } 612 }
522 613
523 void 614 void
524 -QPDFTokenizer::inString(char ch)  
525 -{  
526 - switch (ch) {  
527 - case '\\':  
528 - this->state = st_string_escape;  
529 - return;  
530 -  
531 - case '(':  
532 - this->val += ch;  
533 - ++this->string_depth;  
534 - return;  
535 -  
536 - case ')':  
537 - if (--this->string_depth == 0) {  
538 - this->type = tt_string;  
539 - this->state = st_token_ready;  
540 - return;  
541 - }  
542 -  
543 - this->val += ch;  
544 - return;  
545 -  
546 - case '\r':  
547 - // CR by itself is converted to LF  
548 - this->val += '\n';  
549 - this->state = st_string_after_cr;  
550 - return;  
551 -  
552 - case '\n':  
553 - this->val += ch;  
554 - return;  
555 -  
556 - default:  
557 - this->val += ch;  
558 - return;  
559 - }  
560 -}  
561 -  
562 -void  
563 QPDFTokenizer::inCharCode(char ch) 615 QPDFTokenizer::inCharCode(char ch)
564 { 616 {
565 if (('0' <= ch) && (ch <= '7')) { 617 if (('0' <= ch) && (ch <= '7')) {
@@ -576,6 +628,18 @@ QPDFTokenizer::inCharCode(char ch) @@ -576,6 +628,18 @@ QPDFTokenizer::inCharCode(char ch)
576 } 628 }
577 629
578 void 630 void
  631 +QPDFTokenizer::inInlineImage(char ch)
  632 +{
  633 + this->val += ch;
  634 + if (this->val.length() == this->inline_image_bytes) {
  635 + QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
  636 + this->type = tt_inline_image;
  637 + this->inline_image_bytes = 0;
  638 + this->state = st_token_ready;
  639 + }
  640 +}
  641 +
  642 +void
579 QPDFTokenizer::presentEOF() 643 QPDFTokenizer::presentEOF()
580 { 644 {
581 if (this->state == st_literal) { 645 if (this->state == st_literal) {