Commit 931fbb615623f00de0942f12e3e5b2b6e141b09f
1 parent a3f3238f
Integrate names into state machine in QPDFTokenizer
Showing 3 changed files with 124 additions and 45 deletions
include/qpdf/QPDFTokenizer.hh
| ... | ... | @@ -203,6 +203,7 @@ class QPDFTokenizer |
| 203 | 203 | st_in_hexstring, |
| 204 | 204 | st_in_string, |
| 205 | 205 | st_in_hexstring_2nd, |
| 206 | + st_name, | |
| 206 | 207 | st_literal, |
| 207 | 208 | st_in_space, |
| 208 | 209 | st_in_comment, |
| ... | ... | @@ -212,6 +213,8 @@ class QPDFTokenizer |
| 212 | 213 | st_lt, |
| 213 | 214 | st_gt, |
| 214 | 215 | st_inline_image, |
| 216 | + st_name_hex1, | |
| 217 | + st_name_hex2, | |
| 215 | 218 | st_token_ready |
| 216 | 219 | }; |
| 217 | 220 | |
| ... | ... | @@ -220,6 +223,7 @@ class QPDFTokenizer |
| 220 | 223 | void inSpace(char); |
| 221 | 224 | void inComment(char); |
| 222 | 225 | void inString(char); |
| 226 | + void inName(char); | |
| 223 | 227 | void inLt(char); |
| 224 | 228 | void inGt(char); |
| 225 | 229 | void inStringAfterCR(char); |
| ... | ... | @@ -230,7 +234,8 @@ class QPDFTokenizer |
| 230 | 234 | void inHexstring2nd(char); |
| 231 | 235 | void inInlineImage(char); |
| 232 | 236 | void inTokenReady(char); |
| 233 | - | |
| 237 | + void inNameHex1(char); | |
| 238 | + void inNameHex2(char); | |
| 234 | 239 | void reset(); |
| 235 | 240 | |
| 236 | 241 | // Lexer state |
| ... | ... | @@ -247,10 +252,12 @@ class QPDFTokenizer |
| 247 | 252 | bool unread_char; |
| 248 | 253 | char char_to_unread; |
| 249 | 254 | size_t inline_image_bytes; |
| 255 | + bool bad; | |
| 250 | 256 | |
| 251 | 257 | // State for strings |
| 252 | 258 | int string_depth; |
| 253 | 259 | int char_code; |
| 260 | + char hex_char; | |
| 254 | 261 | int digit_count; |
| 255 | 262 | }; |
| 256 | 263 | ... | ... |
libqpdf/QPDFTokenizer.cc
| ... | ... | @@ -85,6 +85,7 @@ QPDFTokenizer::reset() |
| 85 | 85 | char_to_unread = '\0'; |
| 86 | 86 | inline_image_bytes = 0; |
| 87 | 87 | string_depth = 0; |
| 88 | + bad = false; | |
| 88 | 89 | } |
| 89 | 90 | |
| 90 | 91 | QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) : |
| ... | ... | @@ -133,48 +134,7 @@ QPDFTokenizer::isDelimiter(char ch) |
| 133 | 134 | void |
| 134 | 135 | QPDFTokenizer::resolveLiteral() |
| 135 | 136 | { |
| 136 | - if ((this->val.length() > 0) && (this->val.at(0) == '/')) { | |
| 137 | - this->type = tt_name; | |
| 138 | - // Deal with # in name token. Note: '/' by itself is a | |
| 139 | - // valid name, so don't strip leading /. That way we | |
| 140 | - // don't have to deal with the empty string as a name. | |
| 141 | - std::string nval = "/"; | |
| 142 | - size_t len = this->val.length(); | |
| 143 | - for (size_t i = 1; i < len; ++i) { | |
| 144 | - char ch = this->val.at(i); | |
| 145 | - if (ch == '#') { | |
| 146 | - if ((i + 2 < len) && QUtil::is_hex_digit(this->val.at(i + 1)) && | |
| 147 | - QUtil::is_hex_digit(this->val.at(i + 2))) { | |
| 148 | - char num[3]; | |
| 149 | - num[0] = this->val.at(i + 1); | |
| 150 | - num[1] = this->val.at(i + 2); | |
| 151 | - num[2] = '\0'; | |
| 152 | - char ch2 = static_cast<char>(strtol(num, nullptr, 16)); | |
| 153 | - if (ch2 == '\0') { | |
| 154 | - this->type = tt_bad; | |
| 155 | - QTC::TC("qpdf", "QPDFTokenizer null in name"); | |
| 156 | - this->error_message = | |
| 157 | - "null character not allowed in name token"; | |
| 158 | - nval += "#00"; | |
| 159 | - } else { | |
| 160 | - nval.append(1, ch2); | |
| 161 | - } | |
| 162 | - i += 2; | |
| 163 | - } else { | |
| 164 | - QTC::TC("qpdf", "QPDFTokenizer bad name"); | |
| 165 | - this->error_message = | |
| 166 | - "name with stray # will not work with PDF >= 1.2"; | |
| 167 | - // Use null to encode a bad # -- this is reversed | |
| 168 | - // in QPDF_Name::normalizeName. | |
| 169 | - nval += '\0'; | |
| 170 | - } | |
| 171 | - } else { | |
| 172 | - nval.append(1, ch); | |
| 173 | - } | |
| 174 | - } | |
| 175 | - this->val.clear(); | |
| 176 | - this->val += nval; | |
| 177 | - } else if (QUtil::is_number(this->val.c_str())) { | |
| 137 | + if (QUtil::is_number(this->val.c_str())) { | |
| 178 | 138 | if (this->val.find('.') != std::string::npos) { |
| 179 | 139 | this->type = tt_real; |
| 180 | 140 | } else { |
| ... | ... | @@ -241,6 +201,10 @@ QPDFTokenizer::handleCharacter(char ch) |
| 241 | 201 | inString(ch); |
| 242 | 202 | return; |
| 243 | 203 | |
| 204 | + case st_name: | |
| 205 | + inName(ch); | |
| 206 | + return; | |
| 207 | + | |
| 244 | 208 | case st_string_after_cr: |
| 245 | 209 | inStringAfterCR(ch); |
| 246 | 210 | return; |
| ... | ... | @@ -270,6 +234,14 @@ QPDFTokenizer::handleCharacter(char ch) |
| 270 | 234 | inHexstring2nd(ch); |
| 271 | 235 | return; |
| 272 | 236 | |
| 237 | + case st_name_hex1: | |
| 238 | + inNameHex1(ch); | |
| 239 | + return; | |
| 240 | + | |
| 241 | + case st_name_hex2: | |
| 242 | + inNameHex2(ch); | |
| 243 | + return; | |
| 244 | + | |
| 273 | 245 | case (st_token_ready): |
| 274 | 246 | inTokenReady(ch); |
| 275 | 247 | return; |
| ... | ... | @@ -353,6 +325,11 @@ QPDFTokenizer::inTop(char ch) |
| 353 | 325 | this->val += ch; |
| 354 | 326 | return; |
| 355 | 327 | |
| 328 | + case '/': | |
| 329 | + this->state = st_name; | |
| 330 | + this->val += ch; | |
| 331 | + return; | |
| 332 | + | |
| 356 | 333 | default: |
| 357 | 334 | this->state = st_literal; |
| 358 | 335 | this->val += ch; |
| ... | ... | @@ -433,6 +410,93 @@ QPDFTokenizer::inString(char ch) |
| 433 | 410 | } |
| 434 | 411 | |
| 435 | 412 | void |
| 413 | +QPDFTokenizer::inName(char ch) | |
| 414 | +{ | |
| 415 | + if (isDelimiter(ch)) { | |
| 416 | + // A whitespace character or delimiter terminates the name | |
| 417 | + // token. The terminating character is unread (not consumed) | |
| 418 | + // even when it is whitespace that will be ignored; the | |
| 419 | + // original rationale is that it may be the newline after a | |
| 420 | + // stream keyword, which the stream-reading code could | |
| 421 | + // depend on. A "#00" escape seen earlier (this->bad) turns | |
| 422 | + // the token into tt_bad instead of tt_name. | |
| 423 | + | |
| 424 | + this->type = this->bad ? tt_bad : tt_name; | |
| 425 | + this->unread_char = true; | |
| 426 | + this->char_to_unread = ch; | |
| 427 | + this->state = st_token_ready; | |
| 428 | + } else if (ch == '#') { | |
| 429 | + this->char_code = 0; | |
| 430 | + this->state = st_name_hex1; | |
| 431 | + } else { | |
| 432 | + this->val += ch; | |
| 433 | + } | |
| 434 | +} | |
| 435 | + | |
| 436 | +void | |
| 437 | +QPDFTokenizer::inNameHex1(char ch) | |
| 438 | +{ | |
| 439 | + this->hex_char = ch; | |
| 440 | + | |
| 441 | + if ('0' <= ch && ch <= '9') { | |
| 442 | + this->char_code = 16 * (int(ch) - int('0')); | |
| 443 | + this->state = st_name_hex2; | |
| 444 | + | |
| 445 | + } else if ('A' <= ch && ch <= 'F') { | |
| 446 | + this->char_code = 16 * (10 + int(ch) - int('A')); | |
| 447 | + this->state = st_name_hex2; | |
| 448 | + | |
| 449 | + } else if ('a' <= ch && ch <= 'f') { | |
| 450 | + this->char_code = 16 * (10 + int(ch) - int('a')); | |
| 451 | + this->state = st_name_hex2; | |
| 452 | + | |
| 453 | + } else { | |
| 454 | + QTC::TC("qpdf", "QPDFTokenizer bad name 1"); | |
| 455 | + this->error_message = "name with stray # will not work with PDF >= 1.2"; | |
| 456 | + // ch is not a hex digit, so the '#' was stray: use null to encode the bad # (this is | |
| 457 | + // reversed in QPDF_Name::normalizeName), then reprocess ch as an ordinary name character. | |
| 458 | + this->val += '\0'; | |
| 459 | + this->state = st_name; | |
| 460 | + inName(ch); | |
| 461 | + } | |
| 462 | +} | |
| 463 | + | |
| 464 | +void | |
| 465 | +QPDFTokenizer::inNameHex2(char ch) | |
| 466 | +{ | |
| 467 | + if ('0' <= ch && ch <= '9') { | |
| 468 | + this->char_code += int(ch) - int('0'); | |
| 469 | + | |
| 470 | + } else if ('A' <= ch && ch <= 'F') { | |
| 471 | + this->char_code += 10 + int(ch) - int('A'); | |
| 472 | + | |
| 473 | + } else if ('a' <= ch && ch <= 'f') { | |
| 474 | + this->char_code += 10 + int(ch) - int('a'); | |
| 475 | + | |
| 476 | + } else { | |
| 477 | + QTC::TC("qpdf", "QPDFTokenizer bad name 2"); | |
| 478 | + this->error_message = "name with stray # will not work with PDF >= 1.2"; | |
| 479 | + // Second character is not a hex digit, so the '#' was stray: use null to encode the bad # | |
| 480 | + // (reversed in QPDF_Name::normalizeName), re-emit the saved first digit, and reprocess ch. | |
| 481 | + this->val += '\0'; | |
| 482 | + this->val += this->hex_char; | |
| 483 | + this->state = st_name; | |
| 484 | + inName(ch); | |
| 485 | + return; | |
| 486 | + } | |
| 487 | + if (this->char_code == 0) { | |
| 488 | + QTC::TC("qpdf", "QPDFTokenizer null in name"); | |
| 489 | + this->error_message = "null character not allowed in name token"; | |
| 490 | + this->val += "#00"; | |
| 491 | + this->state = st_name; | |
| 492 | + this->bad = true; | |
| 493 | + } else { | |
| 494 | + this->val += char(this->char_code); | |
| 495 | + this->state = st_name; | |
| 496 | + } | |
| 497 | +} | |
| 498 | + | |
| 499 | +void | |
| 436 | 500 | QPDFTokenizer::inStringEscape(char ch) |
| 437 | 501 | { |
| 438 | 502 | this->state = st_in_string; |
| ... | ... | @@ -642,9 +706,16 @@ QPDFTokenizer::inInlineImage(char ch) |
| 642 | 706 | void |
| 643 | 707 | QPDFTokenizer::presentEOF() |
| 644 | 708 | { |
| 645 | - if (this->state == st_literal) { | |
| 709 | + if (this->state == st_name || this->state == st_name_hex1 || | |
| 710 | + this->state == st_name_hex2) { | |
| 711 | + // Push any delimiter to the state machine to finish off the final | |
| 712 | + // token. | |
| 713 | + presentCharacter('\f'); | |
| 714 | + this->unread_char = false; | |
| 715 | + } else if (this->state == st_literal) { | |
| 646 | 716 | QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token"); |
| 647 | 717 | resolveLiteral(); |
| 718 | + | |
| 648 | 719 | } else if ((this->include_ignorable) && (this->state == st_in_space)) { |
| 649 | 720 | this->type = tt_space; |
| 650 | 721 | } else if ((this->include_ignorable) && (this->state == st_in_comment)) { | ... | ... |
qpdf/qpdf.testcov
| ... | ... | @@ -68,7 +68,8 @@ QPDFTokenizer bad > 0 |
| 68 | 68 | QPDFTokenizer bad hexstring character 0 |
| 69 | 69 | QPDFTokenizer bad hexstring 2nd character 0 |
| 70 | 70 | QPDFTokenizer null in name 0 |
| 71 | -QPDFTokenizer bad name 0 | |
| 71 | +QPDFTokenizer bad name 1 0 | |
| 72 | +QPDFTokenizer bad name 2 0 | |
| 72 | 73 | QPDF_Stream invalid filter 0 |
| 73 | 74 | QPDF UseOutlines but no Outlines 0 |
| 74 | 75 | QPDFObjectHandle makeDirect loop 0 | ... | ... |