Commit a3f3238f371f07cd2b2e1a96753cde6590712dc5
1 parent
6111a6a4
Split QPDFTokenizer::handleCharacter into individual methods
Showing
2 changed files
with
302 additions
and
228 deletions
include/qpdf/QPDFTokenizer.hh
| ... | ... | @@ -200,26 +200,36 @@ class QPDFTokenizer |
| 200 | 200 | |
| 201 | 201 | enum state_e { |
| 202 | 202 | st_top, |
| 203 | + st_in_hexstring, | |
| 204 | + st_in_string, | |
| 205 | + st_in_hexstring_2nd, | |
| 206 | + st_literal, | |
| 203 | 207 | st_in_space, |
| 204 | 208 | st_in_comment, |
| 205 | - st_in_string, | |
| 206 | 209 | st_string_escape, |
| 207 | 210 | st_char_code, |
| 208 | 211 | st_string_after_cr, |
| 209 | 212 | st_lt, |
| 210 | 213 | st_gt, |
| 211 | - st_literal, | |
| 212 | - st_in_hexstring, | |
| 213 | - st_in_hexstring_2nd, | |
| 214 | 214 | st_inline_image, |
| 215 | 215 | st_token_ready |
| 216 | 216 | }; |
| 217 | 217 | |
| 218 | 218 | void handleCharacter(char); |
| 219 | + void inTop(char); | |
| 220 | + void inSpace(char); | |
| 221 | + void inComment(char); | |
| 222 | + void inString(char); | |
| 223 | + void inLt(char); | |
| 224 | + void inGt(char); | |
| 225 | + void inStringAfterCR(char); | |
| 226 | + void inStringEscape(char); | |
| 227 | + void inLiteral(char); | |
| 219 | 228 | void inCharCode(char); |
| 220 | 229 | void inHexstring(char); |
| 221 | 230 | void inHexstring2nd(char); |
| 222 | - void inString(char); | |
| 231 | + void inInlineImage(char); | |
| 232 | + void inTokenReady(char); | |
| 223 | 233 | |
| 224 | 234 | void reset(); |
| 225 | 235 | ... | ... |
libqpdf/QPDFTokenizer.cc
| ... | ... | @@ -217,134 +217,24 @@ QPDFTokenizer::handleCharacter(char ch) |
| 217 | 217 | // the character that caused a state change in the new state. |
| 218 | 218 | |
| 219 | 219 | switch (this->state) { |
| 220 | - case (st_token_ready): | |
| 221 | - throw std::logic_error( | |
| 222 | - "INTERNAL ERROR: QPDF tokenizer presented character " | |
| 223 | - "while token is waiting"); | |
| 224 | - | |
| 225 | 220 | case st_top: |
| 226 | - // Note: we specifically do not use ctype here. It is | |
| 227 | - // locale-dependent. | |
| 228 | - if (isSpace(ch)) { | |
| 229 | - if (this->include_ignorable) { | |
| 230 | - this->state = st_in_space; | |
| 231 | - this->val += ch; | |
| 232 | - } | |
| 233 | - return; | |
| 234 | - } | |
| 235 | - switch (ch) { | |
| 236 | - case '%': | |
| 237 | - this->state = st_in_comment; | |
| 238 | - if (this->include_ignorable) { | |
| 239 | - this->val += ch; | |
| 240 | - } | |
| 241 | - return; | |
| 242 | - | |
| 243 | - case '(': | |
| 244 | - this->string_depth = 1; | |
| 245 | - this->state = st_in_string; | |
| 246 | - return; | |
| 247 | - | |
| 248 | - case '<': | |
| 249 | - this->state = st_lt; | |
| 250 | - return; | |
| 251 | - | |
| 252 | - case '>': | |
| 253 | - this->state = st_gt; | |
| 254 | - return; | |
| 255 | - | |
| 256 | - case (')'): | |
| 257 | - this->type = tt_bad; | |
| 258 | - QTC::TC("qpdf", "QPDFTokenizer bad )"); | |
| 259 | - this->error_message = "unexpected )"; | |
| 260 | - this->val += ch; | |
| 261 | - this->state = st_token_ready; | |
| 262 | - return; | |
| 263 | - | |
| 264 | - case '[': | |
| 265 | - this->type = tt_array_open; | |
| 266 | - this->state = st_token_ready; | |
| 267 | - this->val += ch; | |
| 268 | - return; | |
| 269 | - | |
| 270 | - case ']': | |
| 271 | - this->type = tt_array_close; | |
| 272 | - this->val += ch; | |
| 273 | - this->state = st_token_ready; | |
| 274 | - return; | |
| 275 | - | |
| 276 | - case '{': | |
| 277 | - this->type = tt_brace_open; | |
| 278 | - this->state = st_token_ready; | |
| 279 | - this->val += ch; | |
| 280 | - return; | |
| 281 | - | |
| 282 | - case '}': | |
| 283 | - this->type = tt_brace_close; | |
| 284 | - this->state = st_token_ready; | |
| 285 | - this->val += ch; | |
| 286 | - return; | |
| 287 | - | |
| 288 | - default: | |
| 289 | - this->state = st_literal; | |
| 290 | - this->val += ch; | |
| 291 | - return; | |
| 292 | - } | |
| 221 | + inTop(ch); | |
| 222 | + return; | |
| 293 | 223 | |
| 294 | 224 | case st_in_space: |
| 295 | - // We only enter this state if include_ignorable is true. | |
| 296 | - if (!isSpace(ch)) { | |
| 297 | - this->type = tt_space; | |
| 298 | - this->unread_char = true; | |
| 299 | - this->char_to_unread = ch; | |
| 300 | - this->state = st_token_ready; | |
| 301 | - return; | |
| 302 | - } else { | |
| 303 | - this->val += ch; | |
| 304 | - return; | |
| 305 | - } | |
| 225 | + inSpace(ch); | |
| 226 | + return; | |
| 306 | 227 | |
| 307 | 228 | case st_in_comment: |
| 308 | - if ((ch == '\r') || (ch == '\n')) { | |
| 309 | - if (this->include_ignorable) { | |
| 310 | - this->type = tt_comment; | |
| 311 | - this->unread_char = true; | |
| 312 | - this->char_to_unread = ch; | |
| 313 | - this->state = st_token_ready; | |
| 314 | - } else { | |
| 315 | - this->state = st_top; | |
| 316 | - } | |
| 317 | - } else if (this->include_ignorable) { | |
| 318 | - this->val += ch; | |
| 319 | - } | |
| 229 | + inComment(ch); | |
| 320 | 230 | return; |
| 321 | 231 | |
| 322 | 232 | case st_lt: |
| 323 | - if (ch == '<') { | |
| 324 | - this->val += "<<"; | |
| 325 | - this->type = tt_dict_open; | |
| 326 | - this->state = st_token_ready; | |
| 327 | - return; | |
| 328 | - } | |
| 329 | - | |
| 330 | - this->state = st_in_hexstring; | |
| 331 | - inHexstring(ch); | |
| 233 | + inLt(ch); | |
| 332 | 234 | return; |
| 333 | 235 | |
| 334 | 236 | case st_gt: |
| 335 | - if (ch == '>') { | |
| 336 | - this->val += ">>"; | |
| 337 | - this->type = tt_dict_close; | |
| 338 | - this->state = st_token_ready; | |
| 339 | - } else { | |
| 340 | - this->val += ">"; | |
| 341 | - this->type = tt_bad; | |
| 342 | - QTC::TC("qpdf", "QPDFTokenizer bad >"); | |
| 343 | - this->error_message = "unexpected >"; | |
| 344 | - this->unread_char = true; | |
| 345 | - this->char_to_unread = ch; | |
| 346 | - this->state = st_token_ready; | |
| 347 | - } | |
| 237 | + inGt(ch); | |
| 348 | 238 | return; |
| 349 | 239 | |
| 350 | 240 | case st_in_string: |
| ... | ... | @@ -352,107 +242,308 @@ QPDFTokenizer::handleCharacter(char ch) |
| 352 | 242 | return; |
| 353 | 243 | |
| 354 | 244 | case st_string_after_cr: |
| 355 | - // CR LF in strings are either ignored or normalized to CR | |
| 356 | - this->state = st_in_string; | |
| 357 | - if (ch != '\n') { | |
| 358 | - inString(ch); | |
| 359 | - } | |
| 245 | + inStringAfterCR(ch); | |
| 360 | 246 | return; |
| 361 | 247 | |
| 362 | 248 | case st_string_escape: |
| 363 | - this->state = st_in_string; | |
| 364 | - switch (ch) { | |
| 365 | - case '0': | |
| 366 | - case '1': | |
| 367 | - case '2': | |
| 368 | - case '3': | |
| 369 | - case '4': | |
| 370 | - case '5': | |
| 371 | - case '6': | |
| 372 | - case '7': | |
| 373 | - this->state = st_char_code; | |
| 374 | - this->char_code = 0; | |
| 375 | - this->digit_count = 0; | |
| 376 | - inCharCode(ch); | |
| 377 | - return; | |
| 249 | + inStringEscape(ch); | |
| 250 | + return; | |
| 378 | 251 | |
| 379 | - case 'n': | |
| 380 | - this->val += '\n'; | |
| 381 | - return; | |
| 252 | + case st_char_code: | |
| 253 | + inCharCode(ch); | |
| 254 | + return; | |
| 382 | 255 | |
| 383 | - case 'r': | |
| 384 | - this->val += '\r'; | |
| 385 | - return; | |
| 256 | + case st_literal: | |
| 257 | + inLiteral(ch); | |
| 258 | + return; | |
| 386 | 259 | |
| 387 | - case 't': | |
| 388 | - this->val += '\t'; | |
| 389 | - return; | |
| 260 | + case st_inline_image: | |
| 261 | + inInlineImage(ch); | |
| 262 | + return; | |
| 263 | + this->val += ch; | |
| 390 | 264 | |
| 391 | - case 'b': | |
| 392 | - this->val += '\b'; | |
| 393 | - return; | |
| 265 | + case st_in_hexstring: | |
| 266 | + inHexstring(ch); | |
| 267 | + return; | |
| 394 | 268 | |
| 395 | - case 'f': | |
| 396 | - this->val += '\f'; | |
| 397 | - return; | |
| 269 | + case st_in_hexstring_2nd: | |
| 270 | + inHexstring2nd(ch); | |
| 271 | + return; | |
| 398 | 272 | |
| 399 | - case '\n': | |
| 400 | - return; | |
| 273 | + case (st_token_ready): | |
| 274 | + inTokenReady(ch); | |
| 275 | + return; | |
| 401 | 276 | |
| 402 | - case '\r': | |
| 403 | - this->state = st_string_after_cr; | |
| 404 | - return; | |
| 277 | + default: | |
| 278 | + throw std::logic_error( | |
| 279 | + "INTERNAL ERROR: invalid state while reading token"); | |
| 280 | + } | |
| 281 | +} | |
| 282 | + | |
| 283 | +void | |
| 284 | +QPDFTokenizer::inTokenReady(char ch) | |
| 285 | +{ | |
| 286 | + throw std::logic_error("INTERNAL ERROR: QPDF tokenizer presented character " | |
| 287 | + "while token is waiting"); | |
| 288 | +} | |
| 405 | 289 | |
| 406 | - default: | |
| 407 | - // PDF spec says backslash is ignored before anything else | |
| 290 | +void | |
| 291 | +QPDFTokenizer::inTop(char ch) | |
| 292 | +{ | |
| 293 | + // Note: we specifically do not use ctype here. It is | |
| 294 | + // locale-dependent. | |
| 295 | + if (isSpace(ch)) { | |
| 296 | + if (this->include_ignorable) { | |
| 297 | + this->state = st_in_space; | |
| 408 | 298 | this->val += ch; |
| 409 | 299 | return; |
| 410 | 300 | } |
| 301 | + return; | |
| 302 | + } | |
| 303 | + switch (ch) { | |
| 304 | + case '%': | |
| 305 | + this->state = st_in_comment; | |
| 306 | + if (this->include_ignorable) { | |
| 307 | + this->val += ch; | |
| 308 | + } | |
| 309 | + return; | |
| 411 | 310 | |
| 412 | - case st_char_code: | |
| 413 | - inCharCode(ch); | |
| 311 | + case '(': | |
| 312 | + this->string_depth = 1; | |
| 313 | + this->state = st_in_string; | |
| 414 | 314 | return; |
| 415 | 315 | |
| 416 | - case st_literal: | |
| 417 | - if (isDelimiter(ch)) { | |
| 418 | - // A C-locale whitespace character or delimiter terminates | |
| 419 | - // token. It is important to unread the whitespace | |
| 420 | - // character even though it is ignored since it may be the | |
| 421 | - // newline after a stream keyword. Removing it here could | |
| 422 | - // make the stream-reading code break on some files, | |
| 423 | - // though not on any files in the test suite as of this | |
| 424 | - // writing. | |
| 425 | - | |
| 426 | - this->type = tt_word; | |
| 316 | + case '<': | |
| 317 | + this->state = st_lt; | |
| 318 | + return; | |
| 319 | + | |
| 320 | + case '>': | |
| 321 | + this->state = st_gt; | |
| 322 | + return; | |
| 323 | + | |
| 324 | + case (')'): | |
| 325 | + this->type = tt_bad; | |
| 326 | + QTC::TC("qpdf", "QPDFTokenizer bad )"); | |
| 327 | + this->error_message = "unexpected )"; | |
| 328 | + this->val += ch; | |
| 329 | + this->state = st_token_ready; | |
| 330 | + return; | |
| 331 | + | |
| 332 | + case '[': | |
| 333 | + this->type = tt_array_open; | |
| 334 | + this->state = st_token_ready; | |
| 335 | + this->val += ch; | |
| 336 | + return; | |
| 337 | + | |
| 338 | + case ']': | |
| 339 | + this->type = tt_array_close; | |
| 340 | + this->val += ch; | |
| 341 | + this->state = st_token_ready; | |
| 342 | + return; | |
| 343 | + | |
| 344 | + case '{': | |
| 345 | + this->type = tt_brace_open; | |
| 346 | + this->state = st_token_ready; | |
| 347 | + this->val += ch; | |
| 348 | + return; | |
| 349 | + | |
| 350 | + case '}': | |
| 351 | + this->type = tt_brace_close; | |
| 352 | + this->state = st_token_ready; | |
| 353 | + this->val += ch; | |
| 354 | + return; | |
| 355 | + | |
| 356 | + default: | |
| 357 | + this->state = st_literal; | |
| 358 | + this->val += ch; | |
| 359 | + return; | |
| 360 | + } | |
| 361 | +} | |
| 362 | + | |
| 363 | +void | |
| 364 | +QPDFTokenizer::inSpace(char ch) | |
| 365 | +{ | |
| 366 | + // We only enter this state if include_ignorable is true. | |
| 367 | + if (!isSpace(ch)) { | |
| 368 | + this->type = tt_space; | |
| 369 | + this->unread_char = true; | |
| 370 | + this->char_to_unread = ch; | |
| 371 | + this->state = st_token_ready; | |
| 372 | + return; | |
| 373 | + } else { | |
| 374 | + this->val += ch; | |
| 375 | + return; | |
| 376 | + } | |
| 377 | +} | |
| 378 | + | |
| 379 | +void | |
| 380 | +QPDFTokenizer::inComment(char ch) | |
| 381 | +{ | |
| 382 | + if ((ch == '\r') || (ch == '\n')) { | |
| 383 | + if (this->include_ignorable) { | |
| 384 | + this->type = tt_comment; | |
| 427 | 385 | this->unread_char = true; |
| 428 | 386 | this->char_to_unread = ch; |
| 429 | 387 | this->state = st_token_ready; |
| 430 | 388 | } else { |
| 431 | - this->val += ch; | |
| 389 | + this->state = st_top; | |
| 432 | 390 | } |
| 391 | + } else if (this->include_ignorable) { | |
| 392 | + this->val += ch; | |
| 393 | + } | |
| 394 | +} | |
| 395 | + | |
| 396 | +void | |
| 397 | +QPDFTokenizer::inString(char ch) | |
| 398 | +{ | |
| 399 | + switch (ch) { | |
| 400 | + case '\\': | |
| 401 | + this->state = st_string_escape; | |
| 433 | 402 | return; |
| 434 | 403 | |
| 435 | - case st_inline_image: | |
| 404 | + case '(': | |
| 436 | 405 | this->val += ch; |
| 437 | - if (this->val.length() == this->inline_image_bytes) { | |
| 438 | - QTC::TC("qpdf", "QPDFTokenizer found EI by byte count"); | |
| 439 | - this->type = tt_inline_image; | |
| 440 | - this->inline_image_bytes = 0; | |
| 406 | + ++this->string_depth; | |
| 407 | + return; | |
| 408 | + | |
| 409 | + case ')': | |
| 410 | + if (--this->string_depth == 0) { | |
| 411 | + this->type = tt_string; | |
| 441 | 412 | this->state = st_token_ready; |
| 413 | + return; | |
| 442 | 414 | } |
| 415 | + | |
| 416 | + this->val += ch; | |
| 443 | 417 | return; |
| 444 | 418 | |
| 445 | - case st_in_hexstring: | |
| 446 | - inHexstring(ch); | |
| 419 | + case '\r': | |
| 420 | + // CR by itself is converted to LF | |
| 421 | + this->val += '\n'; | |
| 422 | + this->state = st_string_after_cr; | |
| 447 | 423 | return; |
| 448 | 424 | |
| 449 | - case st_in_hexstring_2nd: | |
| 450 | - inHexstring2nd(ch); | |
| 425 | + case '\n': | |
| 426 | + this->val += ch; | |
| 451 | 427 | return; |
| 452 | 428 | |
| 453 | 429 | default: |
| 454 | - throw std::logic_error( | |
| 455 | - "INTERNAL ERROR: invalid state while reading token"); | |
| 430 | + this->val += ch; | |
| 431 | + return; | |
| 432 | + } | |
| 433 | +} | |
| 434 | + | |
| 435 | +void | |
| 436 | +QPDFTokenizer::inStringEscape(char ch) | |
| 437 | +{ | |
| 438 | + this->state = st_in_string; | |
| 439 | + switch (ch) { | |
| 440 | + case '0': | |
| 441 | + case '1': | |
| 442 | + case '2': | |
| 443 | + case '3': | |
| 444 | + case '4': | |
| 445 | + case '5': | |
| 446 | + case '6': | |
| 447 | + case '7': | |
| 448 | + this->state = st_char_code; | |
| 449 | + this->char_code = 0; | |
| 450 | + this->digit_count = 0; | |
| 451 | + inCharCode(ch); | |
| 452 | + return; | |
| 453 | + | |
| 454 | + case 'n': | |
| 455 | + this->val += '\n'; | |
| 456 | + return; | |
| 457 | + | |
| 458 | + case 'r': | |
| 459 | + this->val += '\r'; | |
| 460 | + return; | |
| 461 | + | |
| 462 | + case 't': | |
| 463 | + this->val += '\t'; | |
| 464 | + return; | |
| 465 | + | |
| 466 | + case 'b': | |
| 467 | + this->val += '\b'; | |
| 468 | + return; | |
| 469 | + | |
| 470 | + case 'f': | |
| 471 | + this->val += '\f'; | |
| 472 | + return; | |
| 473 | + | |
| 474 | + case '\n': | |
| 475 | + return; | |
| 476 | + | |
| 477 | + case '\r': | |
| 478 | + this->state = st_string_after_cr; | |
| 479 | + return; | |
| 480 | + | |
| 481 | + default: | |
| 482 | + // PDF spec says backslash is ignored before anything else | |
| 483 | + this->val += ch; | |
| 484 | + return; | |
| 485 | + } | |
| 486 | +} | |
| 487 | + | |
| 488 | +void | |
| 489 | +QPDFTokenizer::inStringAfterCR(char ch) | |
| 490 | +{ | |
| 491 | + this->state = st_in_string; | |
| 492 | + if (ch != '\n') { | |
| 493 | + inString(ch); | |
| 494 | + } | |
| 495 | +} | |
| 496 | + | |
| 497 | +void | |
| 498 | +QPDFTokenizer::inLt(char ch) | |
| 499 | +{ | |
| 500 | + if (ch == '<') { | |
| 501 | + this->val += "<<"; | |
| 502 | + this->type = tt_dict_open; | |
| 503 | + this->state = st_token_ready; | |
| 504 | + return; | |
| 505 | + } | |
| 506 | + | |
| 507 | + this->state = st_in_hexstring; | |
| 508 | + inHexstring(ch); | |
| 509 | +} | |
| 510 | + | |
| 511 | +void | |
| 512 | +QPDFTokenizer::inGt(char ch) | |
| 513 | +{ | |
| 514 | + if (ch == '>') { | |
| 515 | + this->val += ">>"; | |
| 516 | + this->type = tt_dict_close; | |
| 517 | + this->state = st_token_ready; | |
| 518 | + } else { | |
| 519 | + this->val += ">"; | |
| 520 | + this->type = tt_bad; | |
| 521 | + QTC::TC("qpdf", "QPDFTokenizer bad >"); | |
| 522 | + this->error_message = "unexpected >"; | |
| 523 | + this->unread_char = true; | |
| 524 | + this->char_to_unread = ch; | |
| 525 | + this->state = st_token_ready; | |
| 526 | + } | |
| 527 | +} | |
| 528 | + | |
| 529 | +void | |
| 530 | +QPDFTokenizer::inLiteral(char ch) | |
| 531 | +{ | |
| 532 | + if (isDelimiter(ch)) { | |
| 533 | + // A C-locale whitespace character or delimiter terminates | |
| 534 | + // token. It is important to unread the whitespace | |
| 535 | + // character even though it is ignored since it may be the | |
| 536 | + // newline after a stream keyword. Removing it here could | |
| 537 | + // make the stream-reading code break on some files, | |
| 538 | + // though not on any files in the test suite as of this | |
| 539 | + // writing. | |
| 540 | + | |
| 541 | + this->type = tt_word; | |
| 542 | + this->unread_char = true; | |
| 543 | + this->char_to_unread = ch; | |
| 544 | + this->state = st_token_ready; | |
| 545 | + } else { | |
| 546 | + this->val += ch; | |
| 456 | 547 | } |
| 457 | 548 | } |
| 458 | 549 | |
| ... | ... | @@ -521,45 +612,6 @@ QPDFTokenizer::inHexstring2nd(char ch) |
| 521 | 612 | } |
| 522 | 613 | |
| 523 | 614 | void |
| 524 | -QPDFTokenizer::inString(char ch) | |
| 525 | -{ | |
| 526 | - switch (ch) { | |
| 527 | - case '\\': | |
| 528 | - this->state = st_string_escape; | |
| 529 | - return; | |
| 530 | - | |
| 531 | - case '(': | |
| 532 | - this->val += ch; | |
| 533 | - ++this->string_depth; | |
| 534 | - return; | |
| 535 | - | |
| 536 | - case ')': | |
| 537 | - if (--this->string_depth == 0) { | |
| 538 | - this->type = tt_string; | |
| 539 | - this->state = st_token_ready; | |
| 540 | - return; | |
| 541 | - } | |
| 542 | - | |
| 543 | - this->val += ch; | |
| 544 | - return; | |
| 545 | - | |
| 546 | - case '\r': | |
| 547 | - // CR by itself is converted to LF | |
| 548 | - this->val += '\n'; | |
| 549 | - this->state = st_string_after_cr; | |
| 550 | - return; | |
| 551 | - | |
| 552 | - case '\n': | |
| 553 | - this->val += ch; | |
| 554 | - return; | |
| 555 | - | |
| 556 | - default: | |
| 557 | - this->val += ch; | |
| 558 | - return; | |
| 559 | - } | |
| 560 | -} | |
| 561 | - | |
| 562 | -void | |
| 563 | 615 | QPDFTokenizer::inCharCode(char ch) |
| 564 | 616 | { |
| 565 | 617 | if (('0' <= ch) && (ch <= '7')) { |
| ... | ... | @@ -576,6 +628,18 @@ QPDFTokenizer::inCharCode(char ch) |
| 576 | 628 | } |
| 577 | 629 | |
| 578 | 630 | void |
| 631 | +QPDFTokenizer::inInlineImage(char ch) | |
| 632 | +{ | |
| 633 | + this->val += ch; | |
| 634 | + if (this->val.length() == this->inline_image_bytes) { | |
| 635 | + QTC::TC("qpdf", "QPDFTokenizer found EI by byte count"); | |
| 636 | + this->type = tt_inline_image; | |
| 637 | + this->inline_image_bytes = 0; | |
| 638 | + this->state = st_token_ready; | |
| 639 | + } | |
| 640 | +} | |
| 641 | + | |
| 642 | +void | |
| 579 | 643 | QPDFTokenizer::presentEOF() |
| 580 | 644 | { |
| 581 | 645 | if (this->state == st_literal) { | ... | ... |