Commit a3f3238f371f07cd2b2e1a96753cde6590712dc5

Authored by m-holger
1 parent 6111a6a4

Split QPDFTokenizer::handleCharacter into individual methods

include/qpdf/QPDFTokenizer.hh
... ... @@ -200,26 +200,36 @@ class QPDFTokenizer
200 200  
201 201 enum state_e {
202 202 st_top,
  203 + st_in_hexstring,
  204 + st_in_string,
  205 + st_in_hexstring_2nd,
  206 + st_literal,
203 207 st_in_space,
204 208 st_in_comment,
205   - st_in_string,
206 209 st_string_escape,
207 210 st_char_code,
208 211 st_string_after_cr,
209 212 st_lt,
210 213 st_gt,
211   - st_literal,
212   - st_in_hexstring,
213   - st_in_hexstring_2nd,
214 214 st_inline_image,
215 215 st_token_ready
216 216 };
217 217  
218 218 void handleCharacter(char);
  219 + void inTop(char);
  220 + void inSpace(char);
  221 + void inComment(char);
  222 + void inString(char);
  223 + void inLt(char);
  224 + void inGt(char);
  225 + void inStringAfterCR(char);
  226 + void inStringEscape(char);
  227 + void inLiteral(char);
219 228 void inCharCode(char);
220 229 void inHexstring(char);
221 230 void inHexstring2nd(char);
222   - void inString(char);
  231 + void inInlineImage(char);
  232 + void inTokenReady(char);
223 233  
224 234 void reset();
225 235  
... ...
libqpdf/QPDFTokenizer.cc
... ... @@ -217,134 +217,24 @@ QPDFTokenizer::handleCharacter(char ch)
217 217 // the character that caused a state change in the new state.
218 218  
219 219 switch (this->state) {
220   - case (st_token_ready):
221   - throw std::logic_error(
222   - "INTERNAL ERROR: QPDF tokenizer presented character "
223   - "while token is waiting");
224   -
225 220 case st_top:
226   - // Note: we specifically do not use ctype here. It is
227   - // locale-dependent.
228   - if (isSpace(ch)) {
229   - if (this->include_ignorable) {
230   - this->state = st_in_space;
231   - this->val += ch;
232   - }
233   - return;
234   - }
235   - switch (ch) {
236   - case '%':
237   - this->state = st_in_comment;
238   - if (this->include_ignorable) {
239   - this->val += ch;
240   - }
241   - return;
242   -
243   - case '(':
244   - this->string_depth = 1;
245   - this->state = st_in_string;
246   - return;
247   -
248   - case '<':
249   - this->state = st_lt;
250   - return;
251   -
252   - case '>':
253   - this->state = st_gt;
254   - return;
255   -
256   - case (')'):
257   - this->type = tt_bad;
258   - QTC::TC("qpdf", "QPDFTokenizer bad )");
259   - this->error_message = "unexpected )";
260   - this->val += ch;
261   - this->state = st_token_ready;
262   - return;
263   -
264   - case '[':
265   - this->type = tt_array_open;
266   - this->state = st_token_ready;
267   - this->val += ch;
268   - return;
269   -
270   - case ']':
271   - this->type = tt_array_close;
272   - this->val += ch;
273   - this->state = st_token_ready;
274   - return;
275   -
276   - case '{':
277   - this->type = tt_brace_open;
278   - this->state = st_token_ready;
279   - this->val += ch;
280   - return;
281   -
282   - case '}':
283   - this->type = tt_brace_close;
284   - this->state = st_token_ready;
285   - this->val += ch;
286   - return;
287   -
288   - default:
289   - this->state = st_literal;
290   - this->val += ch;
291   - return;
292   - }
  221 + inTop(ch);
  222 + return;
293 223  
294 224 case st_in_space:
295   - // We only enter this state if include_ignorable is true.
296   - if (!isSpace(ch)) {
297   - this->type = tt_space;
298   - this->unread_char = true;
299   - this->char_to_unread = ch;
300   - this->state = st_token_ready;
301   - return;
302   - } else {
303   - this->val += ch;
304   - return;
305   - }
  225 + inSpace(ch);
  226 + return;
306 227  
307 228 case st_in_comment:
308   - if ((ch == '\r') || (ch == '\n')) {
309   - if (this->include_ignorable) {
310   - this->type = tt_comment;
311   - this->unread_char = true;
312   - this->char_to_unread = ch;
313   - this->state = st_token_ready;
314   - } else {
315   - this->state = st_top;
316   - }
317   - } else if (this->include_ignorable) {
318   - this->val += ch;
319   - }
  229 + inComment(ch);
320 230 return;
321 231  
322 232 case st_lt:
323   - if (ch == '<') {
324   - this->val += "<<";
325   - this->type = tt_dict_open;
326   - this->state = st_token_ready;
327   - return;
328   - }
329   -
330   - this->state = st_in_hexstring;
331   - inHexstring(ch);
  233 + inLt(ch);
332 234 return;
333 235  
334 236 case st_gt:
335   - if (ch == '>') {
336   - this->val += ">>";
337   - this->type = tt_dict_close;
338   - this->state = st_token_ready;
339   - } else {
340   - this->val += ">";
341   - this->type = tt_bad;
342   - QTC::TC("qpdf", "QPDFTokenizer bad >");
343   - this->error_message = "unexpected >";
344   - this->unread_char = true;
345   - this->char_to_unread = ch;
346   - this->state = st_token_ready;
347   - }
  237 + inGt(ch);
348 238 return;
349 239  
350 240 case st_in_string:
... ... @@ -352,107 +242,308 @@ QPDFTokenizer::handleCharacter(char ch)
352 242 return;
353 243  
354 244 case st_string_after_cr:
355   - // CR LF in strings are either ignored or normalized to CR
356   - this->state = st_in_string;
357   - if (ch != '\n') {
358   - inString(ch);
359   - }
  245 + inStringAfterCR(ch);
360 246 return;
361 247  
362 248 case st_string_escape:
363   - this->state = st_in_string;
364   - switch (ch) {
365   - case '0':
366   - case '1':
367   - case '2':
368   - case '3':
369   - case '4':
370   - case '5':
371   - case '6':
372   - case '7':
373   - this->state = st_char_code;
374   - this->char_code = 0;
375   - this->digit_count = 0;
376   - inCharCode(ch);
377   - return;
  249 + inStringEscape(ch);
  250 + return;
378 251  
379   - case 'n':
380   - this->val += '\n';
381   - return;
  252 + case st_char_code:
  253 + inCharCode(ch);
  254 + return;
382 255  
383   - case 'r':
384   - this->val += '\r';
385   - return;
  256 + case st_literal:
  257 + inLiteral(ch);
  258 + return;
386 259  
387   - case 't':
388   - this->val += '\t';
389   - return;
  260 + case st_inline_image:
  261 + inInlineImage(ch);
  262 + return;
  263 + this->val += ch;
390 264  
391   - case 'b':
392   - this->val += '\b';
393   - return;
  265 + case st_in_hexstring:
  266 + inHexstring(ch);
  267 + return;
394 268  
395   - case 'f':
396   - this->val += '\f';
397   - return;
  269 + case st_in_hexstring_2nd:
  270 + inHexstring2nd(ch);
  271 + return;
398 272  
399   - case '\n':
400   - return;
  273 + case (st_token_ready):
  274 + inTokenReady(ch);
  275 + return;
401 276  
402   - case '\r':
403   - this->state = st_string_after_cr;
404   - return;
  277 + default:
  278 + throw std::logic_error(
  279 + "INTERNAL ERROR: invalid state while reading token");
  280 + }
  281 +}
  282 +
  283 +void
  284 +QPDFTokenizer::inTokenReady(char ch)
  285 +{
  286 + throw std::logic_error("INTERNAL ERROR: QPDF tokenizer presented character "
  287 + "while token is waiting");
  288 +}
405 289  
406   - default:
407   - // PDF spec says backslash is ignored before anything else
  290 +void
  291 +QPDFTokenizer::inTop(char ch)
  292 +{
  293 + // Note: we specifically do not use ctype here. It is
  294 + // locale-dependent.
  295 + if (isSpace(ch)) {
  296 + if (this->include_ignorable) {
  297 + this->state = st_in_space;
408 298 this->val += ch;
409 299 return;
410 300 }
  301 + return;
  302 + }
  303 + switch (ch) {
  304 + case '%':
  305 + this->state = st_in_comment;
  306 + if (this->include_ignorable) {
  307 + this->val += ch;
  308 + }
  309 + return;
411 310  
412   - case st_char_code:
413   - inCharCode(ch);
  311 + case '(':
  312 + this->string_depth = 1;
  313 + this->state = st_in_string;
414 314 return;
415 315  
416   - case st_literal:
417   - if (isDelimiter(ch)) {
418   - // A C-locale whitespace character or delimiter terminates
419   - // token. It is important to unread the whitespace
420   - // character even though it is ignored since it may be the
421   - // newline after a stream keyword. Removing it here could
422   - // make the stream-reading code break on some files,
423   - // though not on any files in the test suite as of this
424   - // writing.
425   -
426   - this->type = tt_word;
  316 + case '<':
  317 + this->state = st_lt;
  318 + return;
  319 +
  320 + case '>':
  321 + this->state = st_gt;
  322 + return;
  323 +
  324 + case (')'):
  325 + this->type = tt_bad;
  326 + QTC::TC("qpdf", "QPDFTokenizer bad )");
  327 + this->error_message = "unexpected )";
  328 + this->val += ch;
  329 + this->state = st_token_ready;
  330 + return;
  331 +
  332 + case '[':
  333 + this->type = tt_array_open;
  334 + this->state = st_token_ready;
  335 + this->val += ch;
  336 + return;
  337 +
  338 + case ']':
  339 + this->type = tt_array_close;
  340 + this->val += ch;
  341 + this->state = st_token_ready;
  342 + return;
  343 +
  344 + case '{':
  345 + this->type = tt_brace_open;
  346 + this->state = st_token_ready;
  347 + this->val += ch;
  348 + return;
  349 +
  350 + case '}':
  351 + this->type = tt_brace_close;
  352 + this->state = st_token_ready;
  353 + this->val += ch;
  354 + return;
  355 +
  356 + default:
  357 + this->state = st_literal;
  358 + this->val += ch;
  359 + return;
  360 + }
  361 +}
  362 +
  363 +void
  364 +QPDFTokenizer::inSpace(char ch)
  365 +{
  366 + // We only enter this state if include_ignorable is true.
  367 + if (!isSpace(ch)) {
  368 + this->type = tt_space;
  369 + this->unread_char = true;
  370 + this->char_to_unread = ch;
  371 + this->state = st_token_ready;
  372 + return;
  373 + } else {
  374 + this->val += ch;
  375 + return;
  376 + }
  377 +}
  378 +
  379 +void
  380 +QPDFTokenizer::inComment(char ch)
  381 +{
  382 + if ((ch == '\r') || (ch == '\n')) {
  383 + if (this->include_ignorable) {
  384 + this->type = tt_comment;
427 385 this->unread_char = true;
428 386 this->char_to_unread = ch;
429 387 this->state = st_token_ready;
430 388 } else {
431   - this->val += ch;
  389 + this->state = st_top;
432 390 }
  391 + } else if (this->include_ignorable) {
  392 + this->val += ch;
  393 + }
  394 +}
  395 +
  396 +void
  397 +QPDFTokenizer::inString(char ch)
  398 +{
  399 + switch (ch) {
  400 + case '\\':
  401 + this->state = st_string_escape;
433 402 return;
434 403  
435   - case st_inline_image:
  404 + case '(':
436 405 this->val += ch;
437   - if (this->val.length() == this->inline_image_bytes) {
438   - QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
439   - this->type = tt_inline_image;
440   - this->inline_image_bytes = 0;
  406 + ++this->string_depth;
  407 + return;
  408 +
  409 + case ')':
  410 + if (--this->string_depth == 0) {
  411 + this->type = tt_string;
441 412 this->state = st_token_ready;
  413 + return;
442 414 }
  415 +
  416 + this->val += ch;
443 417 return;
444 418  
445   - case st_in_hexstring:
446   - inHexstring(ch);
  419 + case '\r':
  420 + // CR by itself is converted to LF
  421 + this->val += '\n';
  422 + this->state = st_string_after_cr;
447 423 return;
448 424  
449   - case st_in_hexstring_2nd:
450   - inHexstring2nd(ch);
  425 + case '\n':
  426 + this->val += ch;
451 427 return;
452 428  
453 429 default:
454   - throw std::logic_error(
455   - "INTERNAL ERROR: invalid state while reading token");
  430 + this->val += ch;
  431 + return;
  432 + }
  433 +}
  434 +
  435 +void
  436 +QPDFTokenizer::inStringEscape(char ch)
  437 +{
  438 + this->state = st_in_string;
  439 + switch (ch) {
  440 + case '0':
  441 + case '1':
  442 + case '2':
  443 + case '3':
  444 + case '4':
  445 + case '5':
  446 + case '6':
  447 + case '7':
  448 + this->state = st_char_code;
  449 + this->char_code = 0;
  450 + this->digit_count = 0;
  451 + inCharCode(ch);
  452 + return;
  453 +
  454 + case 'n':
  455 + this->val += '\n';
  456 + return;
  457 +
  458 + case 'r':
  459 + this->val += '\r';
  460 + return;
  461 +
  462 + case 't':
  463 + this->val += '\t';
  464 + return;
  465 +
  466 + case 'b':
  467 + this->val += '\b';
  468 + return;
  469 +
  470 + case 'f':
  471 + this->val += '\f';
  472 + return;
  473 +
  474 + case '\n':
  475 + return;
  476 +
  477 + case '\r':
  478 + this->state = st_string_after_cr;
  479 + return;
  480 +
  481 + default:
  482 + // PDF spec says backslash is ignored before anything else
  483 + this->val += ch;
  484 + return;
  485 + }
  486 +}
  487 +
  488 +void
  489 +QPDFTokenizer::inStringAfterCR(char ch)
  490 +{
  491 + this->state = st_in_string;
  492 + if (ch != '\n') {
  493 + inString(ch);
  494 + }
  495 +}
  496 +
  497 +void
  498 +QPDFTokenizer::inLt(char ch)
  499 +{
  500 + if (ch == '<') {
  501 + this->val += "<<";
  502 + this->type = tt_dict_open;
  503 + this->state = st_token_ready;
  504 + return;
  505 + }
  506 +
  507 + this->state = st_in_hexstring;
  508 + inHexstring(ch);
  509 +}
  510 +
  511 +void
  512 +QPDFTokenizer::inGt(char ch)
  513 +{
  514 + if (ch == '>') {
  515 + this->val += ">>";
  516 + this->type = tt_dict_close;
  517 + this->state = st_token_ready;
  518 + } else {
  519 + this->val += ">";
  520 + this->type = tt_bad;
  521 + QTC::TC("qpdf", "QPDFTokenizer bad >");
  522 + this->error_message = "unexpected >";
  523 + this->unread_char = true;
  524 + this->char_to_unread = ch;
  525 + this->state = st_token_ready;
  526 + }
  527 +}
  528 +
  529 +void
  530 +QPDFTokenizer::inLiteral(char ch)
  531 +{
  532 + if (isDelimiter(ch)) {
  533 + // A C-locale whitespace character or delimiter terminates
  534 + // token. It is important to unread the whitespace
  535 + // character even though it is ignored since it may be the
  536 + // newline after a stream keyword. Removing it here could
  537 + // make the stream-reading code break on some files,
  538 + // though not on any files in the test suite as of this
  539 + // writing.
  540 +
  541 + this->type = tt_word;
  542 + this->unread_char = true;
  543 + this->char_to_unread = ch;
  544 + this->state = st_token_ready;
  545 + } else {
  546 + this->val += ch;
456 547 }
457 548 }
458 549  
... ... @@ -521,45 +612,6 @@ QPDFTokenizer::inHexstring2nd(char ch)
521 612 }
522 613  
523 614 void
524   -QPDFTokenizer::inString(char ch)
525   -{
526   - switch (ch) {
527   - case '\\':
528   - this->state = st_string_escape;
529   - return;
530   -
531   - case '(':
532   - this->val += ch;
533   - ++this->string_depth;
534   - return;
535   -
536   - case ')':
537   - if (--this->string_depth == 0) {
538   - this->type = tt_string;
539   - this->state = st_token_ready;
540   - return;
541   - }
542   -
543   - this->val += ch;
544   - return;
545   -
546   - case '\r':
547   - // CR by itself is converted to LF
548   - this->val += '\n';
549   - this->state = st_string_after_cr;
550   - return;
551   -
552   - case '\n':
553   - this->val += ch;
554   - return;
555   -
556   - default:
557   - this->val += ch;
558   - return;
559   - }
560   -}
561   -
562   -void
563 615 QPDFTokenizer::inCharCode(char ch)
564 616 {
565 617 if (('0' <= ch) && (ch <= '7')) {
... ... @@ -576,6 +628,18 @@ QPDFTokenizer::inCharCode(char ch)
576 628 }
577 629  
578 630 void
  631 +QPDFTokenizer::inInlineImage(char ch)
  632 +{
  633 + this->val += ch;
  634 + if (this->val.length() == this->inline_image_bytes) {
  635 + QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
  636 + this->type = tt_inline_image;
  637 + this->inline_image_bytes = 0;
  638 + this->state = st_token_ready;
  639 + }
  640 +}
  641 +
  642 +void
579 643 QPDFTokenizer::presentEOF()
580 644 {
581 645 if (this->state == st_literal) {
... ...