Commit 931fbb615623f00de0942f12e3e5b2b6e141b09f
1 parent: a3f3238f
Integrate names into state machine in QPDFTokenizer
Showing 3 changed files with 124 additions and 45 deletions
include/qpdf/QPDFTokenizer.hh
| @@ -203,6 +203,7 @@ class QPDFTokenizer | @@ -203,6 +203,7 @@ class QPDFTokenizer | ||
| 203 | st_in_hexstring, | 203 | st_in_hexstring, |
| 204 | st_in_string, | 204 | st_in_string, |
| 205 | st_in_hexstring_2nd, | 205 | st_in_hexstring_2nd, |
| 206 | + st_name, | ||
| 206 | st_literal, | 207 | st_literal, |
| 207 | st_in_space, | 208 | st_in_space, |
| 208 | st_in_comment, | 209 | st_in_comment, |
| @@ -212,6 +213,8 @@ class QPDFTokenizer | @@ -212,6 +213,8 @@ class QPDFTokenizer | ||
| 212 | st_lt, | 213 | st_lt, |
| 213 | st_gt, | 214 | st_gt, |
| 214 | st_inline_image, | 215 | st_inline_image, |
| 216 | + st_name_hex1, | ||
| 217 | + st_name_hex2, | ||
| 215 | st_token_ready | 218 | st_token_ready |
| 216 | }; | 219 | }; |
| 217 | 220 | ||
| @@ -220,6 +223,7 @@ class QPDFTokenizer | @@ -220,6 +223,7 @@ class QPDFTokenizer | ||
| 220 | void inSpace(char); | 223 | void inSpace(char); |
| 221 | void inComment(char); | 224 | void inComment(char); |
| 222 | void inString(char); | 225 | void inString(char); |
| 226 | + void inName(char); | ||
| 223 | void inLt(char); | 227 | void inLt(char); |
| 224 | void inGt(char); | 228 | void inGt(char); |
| 225 | void inStringAfterCR(char); | 229 | void inStringAfterCR(char); |
| @@ -230,7 +234,8 @@ class QPDFTokenizer | @@ -230,7 +234,8 @@ class QPDFTokenizer | ||
| 230 | void inHexstring2nd(char); | 234 | void inHexstring2nd(char); |
| 231 | void inInlineImage(char); | 235 | void inInlineImage(char); |
| 232 | void inTokenReady(char); | 236 | void inTokenReady(char); |
| 233 | - | 237 | + void inNameHex1(char); |
| 238 | + void inNameHex2(char); | ||
| 234 | void reset(); | 239 | void reset(); |
| 235 | 240 | ||
| 236 | // Lexer state | 241 | // Lexer state |
| @@ -247,10 +252,12 @@ class QPDFTokenizer | @@ -247,10 +252,12 @@ class QPDFTokenizer | ||
| 247 | bool unread_char; | 252 | bool unread_char; |
| 248 | char char_to_unread; | 253 | char char_to_unread; |
| 249 | size_t inline_image_bytes; | 254 | size_t inline_image_bytes; |
| 255 | + bool bad; | ||
| 250 | 256 | ||
| 251 | // State for strings | 257 | // State for strings |
| 252 | int string_depth; | 258 | int string_depth; |
| 253 | int char_code; | 259 | int char_code; |
| 260 | + char hex_char; | ||
| 254 | int digit_count; | 261 | int digit_count; |
| 255 | }; | 262 | }; |
| 256 | 263 |
libqpdf/QPDFTokenizer.cc
| @@ -85,6 +85,7 @@ QPDFTokenizer::reset() | @@ -85,6 +85,7 @@ QPDFTokenizer::reset() | ||
| 85 | char_to_unread = '\0'; | 85 | char_to_unread = '\0'; |
| 86 | inline_image_bytes = 0; | 86 | inline_image_bytes = 0; |
| 87 | string_depth = 0; | 87 | string_depth = 0; |
| 88 | + bad = false; | ||
| 88 | } | 89 | } |
| 89 | 90 | ||
| 90 | QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) : | 91 | QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) : |
| @@ -133,48 +134,7 @@ QPDFTokenizer::isDelimiter(char ch) | @@ -133,48 +134,7 @@ QPDFTokenizer::isDelimiter(char ch) | ||
| 133 | void | 134 | void |
| 134 | QPDFTokenizer::resolveLiteral() | 135 | QPDFTokenizer::resolveLiteral() |
| 135 | { | 136 | { |
| 136 | - if ((this->val.length() > 0) && (this->val.at(0) == '/')) { | ||
| 137 | - this->type = tt_name; | ||
| 138 | - // Deal with # in name token. Note: '/' by itself is a | ||
| 139 | - // valid name, so don't strip leading /. That way we | ||
| 140 | - // don't have to deal with the empty string as a name. | ||
| 141 | - std::string nval = "/"; | ||
| 142 | - size_t len = this->val.length(); | ||
| 143 | - for (size_t i = 1; i < len; ++i) { | ||
| 144 | - char ch = this->val.at(i); | ||
| 145 | - if (ch == '#') { | ||
| 146 | - if ((i + 2 < len) && QUtil::is_hex_digit(this->val.at(i + 1)) && | ||
| 147 | - QUtil::is_hex_digit(this->val.at(i + 2))) { | ||
| 148 | - char num[3]; | ||
| 149 | - num[0] = this->val.at(i + 1); | ||
| 150 | - num[1] = this->val.at(i + 2); | ||
| 151 | - num[2] = '\0'; | ||
| 152 | - char ch2 = static_cast<char>(strtol(num, nullptr, 16)); | ||
| 153 | - if (ch2 == '\0') { | ||
| 154 | - this->type = tt_bad; | ||
| 155 | - QTC::TC("qpdf", "QPDFTokenizer null in name"); | ||
| 156 | - this->error_message = | ||
| 157 | - "null character not allowed in name token"; | ||
| 158 | - nval += "#00"; | ||
| 159 | - } else { | ||
| 160 | - nval.append(1, ch2); | ||
| 161 | - } | ||
| 162 | - i += 2; | ||
| 163 | - } else { | ||
| 164 | - QTC::TC("qpdf", "QPDFTokenizer bad name"); | ||
| 165 | - this->error_message = | ||
| 166 | - "name with stray # will not work with PDF >= 1.2"; | ||
| 167 | - // Use null to encode a bad # -- this is reversed | ||
| 168 | - // in QPDF_Name::normalizeName. | ||
| 169 | - nval += '\0'; | ||
| 170 | - } | ||
| 171 | - } else { | ||
| 172 | - nval.append(1, ch); | ||
| 173 | - } | ||
| 174 | - } | ||
| 175 | - this->val.clear(); | ||
| 176 | - this->val += nval; | ||
| 177 | - } else if (QUtil::is_number(this->val.c_str())) { | 137 | + if (QUtil::is_number(this->val.c_str())) { |
| 178 | if (this->val.find('.') != std::string::npos) { | 138 | if (this->val.find('.') != std::string::npos) { |
| 179 | this->type = tt_real; | 139 | this->type = tt_real; |
| 180 | } else { | 140 | } else { |
| @@ -241,6 +201,10 @@ QPDFTokenizer::handleCharacter(char ch) | @@ -241,6 +201,10 @@ QPDFTokenizer::handleCharacter(char ch) | ||
| 241 | inString(ch); | 201 | inString(ch); |
| 242 | return; | 202 | return; |
| 243 | 203 | ||
| 204 | + case st_name: | ||
| 205 | + inName(ch); | ||
| 206 | + return; | ||
| 207 | + | ||
| 244 | case st_string_after_cr: | 208 | case st_string_after_cr: |
| 245 | inStringAfterCR(ch); | 209 | inStringAfterCR(ch); |
| 246 | return; | 210 | return; |
| @@ -270,6 +234,14 @@ QPDFTokenizer::handleCharacter(char ch) | @@ -270,6 +234,14 @@ QPDFTokenizer::handleCharacter(char ch) | ||
| 270 | inHexstring2nd(ch); | 234 | inHexstring2nd(ch); |
| 271 | return; | 235 | return; |
| 272 | 236 | ||
| 237 | + case st_name_hex1: | ||
| 238 | + inNameHex1(ch); | ||
| 239 | + return; | ||
| 240 | + | ||
| 241 | + case st_name_hex2: | ||
| 242 | + inNameHex2(ch); | ||
| 243 | + return; | ||
| 244 | + | ||
| 273 | case (st_token_ready): | 245 | case (st_token_ready): |
| 274 | inTokenReady(ch); | 246 | inTokenReady(ch); |
| 275 | return; | 247 | return; |
| @@ -353,6 +325,11 @@ QPDFTokenizer::inTop(char ch) | @@ -353,6 +325,11 @@ QPDFTokenizer::inTop(char ch) | ||
| 353 | this->val += ch; | 325 | this->val += ch; |
| 354 | return; | 326 | return; |
| 355 | 327 | ||
| 328 | + case '/': | ||
| 329 | + this->state = st_name; | ||
| 330 | + this->val += ch; | ||
| 331 | + return; | ||
| 332 | + | ||
| 356 | default: | 333 | default: |
| 357 | this->state = st_literal; | 334 | this->state = st_literal; |
| 358 | this->val += ch; | 335 | this->val += ch; |
| @@ -433,6 +410,93 @@ QPDFTokenizer::inString(char ch) | @@ -433,6 +410,93 @@ QPDFTokenizer::inString(char ch) | ||
| 433 | } | 410 | } |
| 434 | 411 | ||
| 435 | void | 412 | void |
| 413 | +QPDFTokenizer::inName(char ch) | ||
| 414 | +{ | ||
| 415 | + if (isDelimiter(ch)) { | ||
| 416 | + // A C-locale whitespace character or delimiter terminates | ||
| 417 | + // token. It is important to unread the whitespace | ||
| 418 | + // character even though it is ignored since it may be the | ||
| 419 | + // newline after a stream keyword. Removing it here could | ||
| 420 | + // make the stream-reading code break on some files, | ||
| 421 | + // though not on any files in the test suite as of this | ||
| 422 | + // writing. | ||
| 423 | + | ||
| 424 | + this->type = this->bad ? tt_bad : tt_name; | ||
| 425 | + this->unread_char = true; | ||
| 426 | + this->char_to_unread = ch; | ||
| 427 | + this->state = st_token_ready; | ||
| 428 | + } else if (ch == '#') { | ||
| 429 | + this->char_code = 0; | ||
| 430 | + this->state = st_name_hex1; | ||
| 431 | + } else { | ||
| 432 | + this->val += ch; | ||
| 433 | + } | ||
| 434 | +} | ||
| 435 | + | ||
| 436 | +void | ||
| 437 | +QPDFTokenizer::inNameHex1(char ch) | ||
| 438 | +{ | ||
| 439 | + this->hex_char = ch; | ||
| 440 | + | ||
| 441 | + if ('0' <= ch && ch <= '9') { | ||
| 442 | + this->char_code = 16 * (int(ch) - int('0')); | ||
| 443 | + this->state = st_name_hex2; | ||
| 444 | + | ||
| 445 | + } else if ('A' <= ch && ch <= 'F') { | ||
| 446 | + this->char_code = 16 * (10 + int(ch) - int('A')); | ||
| 447 | + this->state = st_name_hex2; | ||
| 448 | + | ||
| 449 | + } else if ('a' <= ch && ch <= 'f') { | ||
| 450 | + this->char_code = 16 * (10 + int(ch) - int('a')); | ||
| 451 | + this->state = st_name_hex2; | ||
| 452 | + | ||
| 453 | + } else { | ||
| 454 | + QTC::TC("qpdf", "QPDFTokenizer bad name 1"); | ||
| 455 | + this->error_message = "name with stray # will not work with PDF >= 1.2"; | ||
| 456 | + // Use null to encode a bad # -- this is reversed | ||
| 457 | + // in QPDF_Name::normalizeName. | ||
| 458 | + this->val += '\0'; | ||
| 459 | + this->state = st_name; | ||
| 460 | + inName(ch); | ||
| 461 | + } | ||
| 462 | +} | ||
| 463 | + | ||
| 464 | +void | ||
| 465 | +QPDFTokenizer::inNameHex2(char ch) | ||
| 466 | +{ | ||
| 467 | + if ('0' <= ch && ch <= '9') { | ||
| 468 | + this->char_code += int(ch) - int('0'); | ||
| 469 | + | ||
| 470 | + } else if ('A' <= ch && ch <= 'F') { | ||
| 471 | + this->char_code += 10 + int(ch) - int('A'); | ||
| 472 | + | ||
| 473 | + } else if ('a' <= ch && ch <= 'f') { | ||
| 474 | + this->char_code += 10 + int(ch) - int('a'); | ||
| 475 | + | ||
| 476 | + } else { | ||
| 477 | + QTC::TC("qpdf", "QPDFTokenizer bad name 2"); | ||
| 478 | + this->error_message = "name with stray # will not work with PDF >= 1.2"; | ||
| 479 | + // Use null to encode a bad # -- this is reversed | ||
| 480 | + // in QPDF_Name::normalizeName. | ||
| 481 | + this->val += '\0'; | ||
| 482 | + this->val += this->hex_char; | ||
| 483 | + this->state = st_name; | ||
| 484 | + inName(ch); | ||
| 485 | + return; | ||
| 486 | + } | ||
| 487 | + if (this->char_code == 0) { | ||
| 488 | + QTC::TC("qpdf", "QPDFTokenizer null in name"); | ||
| 489 | + this->error_message = "null character not allowed in name token"; | ||
| 490 | + this->val += "#00"; | ||
| 491 | + this->state = st_name; | ||
| 492 | + this->bad = true; | ||
| 493 | + } else { | ||
| 494 | + this->val += char(this->char_code); | ||
| 495 | + this->state = st_name; | ||
| 496 | + } | ||
| 497 | +} | ||
| 498 | + | ||
| 499 | +void | ||
| 436 | QPDFTokenizer::inStringEscape(char ch) | 500 | QPDFTokenizer::inStringEscape(char ch) |
| 437 | { | 501 | { |
| 438 | this->state = st_in_string; | 502 | this->state = st_in_string; |
| @@ -642,9 +706,16 @@ QPDFTokenizer::inInlineImage(char ch) | @@ -642,9 +706,16 @@ QPDFTokenizer::inInlineImage(char ch) | ||
| 642 | void | 706 | void |
| 643 | QPDFTokenizer::presentEOF() | 707 | QPDFTokenizer::presentEOF() |
| 644 | { | 708 | { |
| 645 | - if (this->state == st_literal) { | 709 | + if (this->state == st_name || this->state == st_name_hex1 || |
| 710 | + this->state == st_name_hex2) { | ||
| 711 | + // Push any delimiter to the state machine to finish off the final | ||
| 712 | + // token. | ||
| 713 | + presentCharacter('\f'); | ||
| 714 | + this->unread_char = false; | ||
| 715 | + } else if (this->state == st_literal) { | ||
| 646 | QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token"); | 716 | QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token"); |
| 647 | resolveLiteral(); | 717 | resolveLiteral(); |
| 718 | + | ||
| 648 | } else if ((this->include_ignorable) && (this->state == st_in_space)) { | 719 | } else if ((this->include_ignorable) && (this->state == st_in_space)) { |
| 649 | this->type = tt_space; | 720 | this->type = tt_space; |
| 650 | } else if ((this->include_ignorable) && (this->state == st_in_comment)) { | 721 | } else if ((this->include_ignorable) && (this->state == st_in_comment)) { |
qpdf/qpdf.testcov
| @@ -68,7 +68,8 @@ QPDFTokenizer bad > 0 | @@ -68,7 +68,8 @@ QPDFTokenizer bad > 0 | ||
| 68 | QPDFTokenizer bad hexstring character 0 | 68 | QPDFTokenizer bad hexstring character 0 |
| 69 | QPDFTokenizer bad hexstring 2nd character 0 | 69 | QPDFTokenizer bad hexstring 2nd character 0 |
| 70 | QPDFTokenizer null in name 0 | 70 | QPDFTokenizer null in name 0 |
| 71 | -QPDFTokenizer bad name 0 | 71 | +QPDFTokenizer bad name 1 0 |
| 72 | +QPDFTokenizer bad name 2 0 | ||
| 72 | QPDF_Stream invalid filter 0 | 73 | QPDF_Stream invalid filter 0 |
| 73 | QPDF UseOutlines but no Outlines 0 | 74 | QPDF UseOutlines but no Outlines 0 |
| 74 | QPDFObjectHandle makeDirect loop 0 | 75 | QPDFObjectHandle makeDirect loop 0 |