Commit 931fbb615623f00de0942f12e3e5b2b6e141b09f

Authored by m-holger
1 parent a3f3238f

Integrate names into state machine in QPDFTokenizer

include/qpdf/QPDFTokenizer.hh
... ... @@ -203,6 +203,7 @@ class QPDFTokenizer
203 203 st_in_hexstring,
204 204 st_in_string,
205 205 st_in_hexstring_2nd,
  206 + st_name,
206 207 st_literal,
207 208 st_in_space,
208 209 st_in_comment,
... ... @@ -212,6 +213,8 @@ class QPDFTokenizer
212 213 st_lt,
213 214 st_gt,
214 215 st_inline_image,
  216 + st_name_hex1,
  217 + st_name_hex2,
215 218 st_token_ready
216 219 };
217 220  
... ... @@ -220,6 +223,7 @@ class QPDFTokenizer
220 223 void inSpace(char);
221 224 void inComment(char);
222 225 void inString(char);
  226 + void inName(char);
223 227 void inLt(char);
224 228 void inGt(char);
225 229 void inStringAfterCR(char);
... ... @@ -230,7 +234,8 @@ class QPDFTokenizer
230 234 void inHexstring2nd(char);
231 235 void inInlineImage(char);
232 236 void inTokenReady(char);
233   -
  237 + void inNameHex1(char);
  238 + void inNameHex2(char);
234 239 void reset();
235 240  
236 241 // Lexer state
... ... @@ -247,10 +252,12 @@ class QPDFTokenizer
247 252 bool unread_char;
248 253 char char_to_unread;
249 254 size_t inline_image_bytes;
  255 + bool bad;
250 256  
251 257 // State for strings
252 258 int string_depth;
253 259 int char_code;
  260 + char hex_char;
254 261 int digit_count;
255 262 };
256 263  
... ...
libqpdf/QPDFTokenizer.cc
... ... @@ -85,6 +85,7 @@ QPDFTokenizer::reset()
85 85 char_to_unread = '\0';
86 86 inline_image_bytes = 0;
87 87 string_depth = 0;
  88 + bad = false;
88 89 }
89 90  
90 91 QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) :
... ... @@ -133,48 +134,7 @@ QPDFTokenizer::isDelimiter(char ch)
133 134 void
134 135 QPDFTokenizer::resolveLiteral()
135 136 {
136   - if ((this->val.length() > 0) && (this->val.at(0) == '/')) {
137   - this->type = tt_name;
138   - // Deal with # in name token. Note: '/' by itself is a
139   - // valid name, so don't strip leading /. That way we
140   - // don't have to deal with the empty string as a name.
141   - std::string nval = "/";
142   - size_t len = this->val.length();
143   - for (size_t i = 1; i < len; ++i) {
144   - char ch = this->val.at(i);
145   - if (ch == '#') {
146   - if ((i + 2 < len) && QUtil::is_hex_digit(this->val.at(i + 1)) &&
147   - QUtil::is_hex_digit(this->val.at(i + 2))) {
148   - char num[3];
149   - num[0] = this->val.at(i + 1);
150   - num[1] = this->val.at(i + 2);
151   - num[2] = '\0';
152   - char ch2 = static_cast<char>(strtol(num, nullptr, 16));
153   - if (ch2 == '\0') {
154   - this->type = tt_bad;
155   - QTC::TC("qpdf", "QPDFTokenizer null in name");
156   - this->error_message =
157   - "null character not allowed in name token";
158   - nval += "#00";
159   - } else {
160   - nval.append(1, ch2);
161   - }
162   - i += 2;
163   - } else {
164   - QTC::TC("qpdf", "QPDFTokenizer bad name");
165   - this->error_message =
166   - "name with stray # will not work with PDF >= 1.2";
167   - // Use null to encode a bad # -- this is reversed
168   - // in QPDF_Name::normalizeName.
169   - nval += '\0';
170   - }
171   - } else {
172   - nval.append(1, ch);
173   - }
174   - }
175   - this->val.clear();
176   - this->val += nval;
177   - } else if (QUtil::is_number(this->val.c_str())) {
  137 + if (QUtil::is_number(this->val.c_str())) {
178 138 if (this->val.find('.') != std::string::npos) {
179 139 this->type = tt_real;
180 140 } else {
... ... @@ -241,6 +201,10 @@ QPDFTokenizer::handleCharacter(char ch)
241 201 inString(ch);
242 202 return;
243 203  
  204 + case st_name:
  205 + inName(ch);
  206 + return;
  207 +
244 208 case st_string_after_cr:
245 209 inStringAfterCR(ch);
246 210 return;
... ... @@ -270,6 +234,14 @@ QPDFTokenizer::handleCharacter(char ch)
270 234 inHexstring2nd(ch);
271 235 return;
272 236  
  237 + case st_name_hex1:
  238 + inNameHex1(ch);
  239 + return;
  240 +
  241 + case st_name_hex2:
  242 + inNameHex2(ch);
  243 + return;
  244 +
273 245 case (st_token_ready):
274 246 inTokenReady(ch);
275 247 return;
... ... @@ -353,6 +325,11 @@ QPDFTokenizer::inTop(char ch)
353 325 this->val += ch;
354 326 return;
355 327  
  328 + case '/':
  329 + this->state = st_name;
  330 + this->val += ch;
  331 + return;
  332 +
356 333 default:
357 334 this->state = st_literal;
358 335 this->val += ch;
... ... @@ -433,6 +410,93 @@ QPDFTokenizer::inString(char ch)
433 410 }
434 411  
435 412 void
  413 +QPDFTokenizer::inName(char ch)
  414 +{
  415 + if (isDelimiter(ch)) {
  416 + // A C-locale whitespace character or delimiter terminates
  417 + // token. It is important to unread the whitespace
  418 + // character even though it is ignored since it may be the
  419 + // newline after a stream keyword. Removing it here could
  420 + // make the stream-reading code break on some files,
  421 + // though not on any files in the test suite as of this
  422 + // writing.
  423 +
  424 + this->type = this->bad ? tt_bad : tt_name;
  425 + this->unread_char = true;
  426 + this->char_to_unread = ch;
  427 + this->state = st_token_ready;
  428 + } else if (ch == '#') {
  429 + this->char_code = 0;
  430 + this->state = st_name_hex1;
  431 + } else {
  432 + this->val += ch;
  433 + }
  434 +}
  435 +
  436 +void
  437 +QPDFTokenizer::inNameHex1(char ch)
  438 +{
  439 + this->hex_char = ch;
  440 +
  441 + if ('0' <= ch && ch <= '9') {
  442 + this->char_code = 16 * (int(ch) - int('0'));
  443 + this->state = st_name_hex2;
  444 +
  445 + } else if ('A' <= ch && ch <= 'F') {
  446 + this->char_code = 16 * (10 + int(ch) - int('A'));
  447 + this->state = st_name_hex2;
  448 +
  449 + } else if ('a' <= ch && ch <= 'f') {
  450 + this->char_code = 16 * (10 + int(ch) - int('a'));
  451 + this->state = st_name_hex2;
  452 +
  453 + } else {
  454 + QTC::TC("qpdf", "QPDFTokenizer bad name 1");
  455 + this->error_message = "name with stray # will not work with PDF >= 1.2";
  456 + // Use null to encode a bad # -- this is reversed
  457 + // in QPDF_Name::normalizeName.
  458 + this->val += '\0';
  459 + this->state = st_name;
  460 + inName(ch);
  461 + }
  462 +}
  463 +
  464 +void
  465 +QPDFTokenizer::inNameHex2(char ch)
  466 +{
  467 + if ('0' <= ch && ch <= '9') {
  468 + this->char_code += int(ch) - int('0');
  469 +
  470 + } else if ('A' <= ch && ch <= 'F') {
  471 + this->char_code += 10 + int(ch) - int('A');
  472 +
  473 + } else if ('a' <= ch && ch <= 'f') {
  474 + this->char_code += 10 + int(ch) - int('a');
  475 +
  476 + } else {
  477 + QTC::TC("qpdf", "QPDFTokenizer bad name 2");
  478 + this->error_message = "name with stray # will not work with PDF >= 1.2";
  479 + // Use null to encode a bad # -- this is reversed
  480 + // in QPDF_Name::normalizeName.
  481 + this->val += '\0';
  482 + this->val += this->hex_char;
  483 + this->state = st_name;
  484 + inName(ch);
  485 + return;
  486 + }
  487 + if (this->char_code == 0) {
  488 + QTC::TC("qpdf", "QPDFTokenizer null in name");
  489 + this->error_message = "null character not allowed in name token";
  490 + this->val += "#00";
  491 + this->state = st_name;
  492 + this->bad = true;
  493 + } else {
  494 + this->val += char(this->char_code);
  495 + this->state = st_name;
  496 + }
  497 +}
  498 +
  499 +void
436 500 QPDFTokenizer::inStringEscape(char ch)
437 501 {
438 502 this->state = st_in_string;
... ... @@ -642,9 +706,16 @@ QPDFTokenizer::inInlineImage(char ch)
642 706 void
643 707 QPDFTokenizer::presentEOF()
644 708 {
645   - if (this->state == st_literal) {
  709 + if (this->state == st_name || this->state == st_name_hex1 ||
  710 + this->state == st_name_hex2) {
  711 + // Push any delimiter to the state machine to finish off the final
  712 + // token.
  713 + presentCharacter('\f');
  714 + this->unread_char = false;
  715 + } else if (this->state == st_literal) {
646 716 QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token");
647 717 resolveLiteral();
  718 +
648 719 } else if ((this->include_ignorable) && (this->state == st_in_space)) {
649 720 this->type = tt_space;
650 721 } else if ((this->include_ignorable) && (this->state == st_in_comment)) {
... ...
qpdf/qpdf.testcov
... ... @@ -68,7 +68,8 @@ QPDFTokenizer bad > 0
68 68 QPDFTokenizer bad hexstring character 0
69 69 QPDFTokenizer bad hexstring 2nd character 0
70 70 QPDFTokenizer null in name 0
71   -QPDFTokenizer bad name 0
  71 +QPDFTokenizer bad name 1 0
  72 +QPDFTokenizer bad name 2 0
72 73 QPDF_Stream invalid filter 0
73 74 QPDF UseOutlines but no Outlines 0
74 75 QPDFObjectHandle makeDirect loop 0
... ...