Commit 931fbb615623f00de0942f12e3e5b2b6e141b09f

Authored by m-holger
1 parent a3f3238f

Integrate names into state machine in QPDFTokenizer

include/qpdf/QPDFTokenizer.hh
@@ -203,6 +203,7 @@ class QPDFTokenizer @@ -203,6 +203,7 @@ class QPDFTokenizer
203 st_in_hexstring, 203 st_in_hexstring,
204 st_in_string, 204 st_in_string,
205 st_in_hexstring_2nd, 205 st_in_hexstring_2nd,
  206 + st_name,
206 st_literal, 207 st_literal,
207 st_in_space, 208 st_in_space,
208 st_in_comment, 209 st_in_comment,
@@ -212,6 +213,8 @@ class QPDFTokenizer @@ -212,6 +213,8 @@ class QPDFTokenizer
212 st_lt, 213 st_lt,
213 st_gt, 214 st_gt,
214 st_inline_image, 215 st_inline_image,
  216 + st_name_hex1,
  217 + st_name_hex2,
215 st_token_ready 218 st_token_ready
216 }; 219 };
217 220
@@ -220,6 +223,7 @@ class QPDFTokenizer @@ -220,6 +223,7 @@ class QPDFTokenizer
220 void inSpace(char); 223 void inSpace(char);
221 void inComment(char); 224 void inComment(char);
222 void inString(char); 225 void inString(char);
  226 + void inName(char);
223 void inLt(char); 227 void inLt(char);
224 void inGt(char); 228 void inGt(char);
225 void inStringAfterCR(char); 229 void inStringAfterCR(char);
@@ -230,7 +234,8 @@ class QPDFTokenizer @@ -230,7 +234,8 @@ class QPDFTokenizer
230 void inHexstring2nd(char); 234 void inHexstring2nd(char);
231 void inInlineImage(char); 235 void inInlineImage(char);
232 void inTokenReady(char); 236 void inTokenReady(char);
233 - 237 + void inNameHex1(char);
  238 + void inNameHex2(char);
234 void reset(); 239 void reset();
235 240
236 // Lexer state 241 // Lexer state
@@ -247,10 +252,12 @@ class QPDFTokenizer @@ -247,10 +252,12 @@ class QPDFTokenizer
247 bool unread_char; 252 bool unread_char;
248 char char_to_unread; 253 char char_to_unread;
249 size_t inline_image_bytes; 254 size_t inline_image_bytes;
  255 + bool bad;
250 256
251 // State for strings 257 // State for strings
252 int string_depth; 258 int string_depth;
253 int char_code; 259 int char_code;
  260 + char hex_char;
254 int digit_count; 261 int digit_count;
255 }; 262 };
256 263
libqpdf/QPDFTokenizer.cc
@@ -85,6 +85,7 @@ QPDFTokenizer::reset() @@ -85,6 +85,7 @@ QPDFTokenizer::reset()
85 char_to_unread = '\0'; 85 char_to_unread = '\0';
86 inline_image_bytes = 0; 86 inline_image_bytes = 0;
87 string_depth = 0; 87 string_depth = 0;
  88 + bad = false;
88 } 89 }
89 90
90 QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) : 91 QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) :
@@ -133,48 +134,7 @@ QPDFTokenizer::isDelimiter(char ch) @@ -133,48 +134,7 @@ QPDFTokenizer::isDelimiter(char ch)
133 void 134 void
134 QPDFTokenizer::resolveLiteral() 135 QPDFTokenizer::resolveLiteral()
135 { 136 {
136 - if ((this->val.length() > 0) && (this->val.at(0) == '/')) {  
137 - this->type = tt_name;  
138 - // Deal with # in name token. Note: '/' by itself is a  
139 - // valid name, so don't strip leading /. That way we  
140 - // don't have to deal with the empty string as a name.  
141 - std::string nval = "/";  
142 - size_t len = this->val.length();  
143 - for (size_t i = 1; i < len; ++i) {  
144 - char ch = this->val.at(i);  
145 - if (ch == '#') {  
146 - if ((i + 2 < len) && QUtil::is_hex_digit(this->val.at(i + 1)) &&  
147 - QUtil::is_hex_digit(this->val.at(i + 2))) {  
148 - char num[3];  
149 - num[0] = this->val.at(i + 1);  
150 - num[1] = this->val.at(i + 2);  
151 - num[2] = '\0';  
152 - char ch2 = static_cast<char>(strtol(num, nullptr, 16));  
153 - if (ch2 == '\0') {  
154 - this->type = tt_bad;  
155 - QTC::TC("qpdf", "QPDFTokenizer null in name");  
156 - this->error_message =  
157 - "null character not allowed in name token";  
158 - nval += "#00";  
159 - } else {  
160 - nval.append(1, ch2);  
161 - }  
162 - i += 2;  
163 - } else {  
164 - QTC::TC("qpdf", "QPDFTokenizer bad name");  
165 - this->error_message =  
166 - "name with stray # will not work with PDF >= 1.2";  
167 - // Use null to encode a bad # -- this is reversed  
168 - // in QPDF_Name::normalizeName.  
169 - nval += '\0';  
170 - }  
171 - } else {  
172 - nval.append(1, ch);  
173 - }  
174 - }  
175 - this->val.clear();  
176 - this->val += nval;  
177 - } else if (QUtil::is_number(this->val.c_str())) { 137 + if (QUtil::is_number(this->val.c_str())) {
178 if (this->val.find('.') != std::string::npos) { 138 if (this->val.find('.') != std::string::npos) {
179 this->type = tt_real; 139 this->type = tt_real;
180 } else { 140 } else {
@@ -241,6 +201,10 @@ QPDFTokenizer::handleCharacter(char ch) @@ -241,6 +201,10 @@ QPDFTokenizer::handleCharacter(char ch)
241 inString(ch); 201 inString(ch);
242 return; 202 return;
243 203
  204 + case st_name:
  205 + inName(ch);
  206 + return;
  207 +
244 case st_string_after_cr: 208 case st_string_after_cr:
245 inStringAfterCR(ch); 209 inStringAfterCR(ch);
246 return; 210 return;
@@ -270,6 +234,14 @@ QPDFTokenizer::handleCharacter(char ch) @@ -270,6 +234,14 @@ QPDFTokenizer::handleCharacter(char ch)
270 inHexstring2nd(ch); 234 inHexstring2nd(ch);
271 return; 235 return;
272 236
  237 + case st_name_hex1:
  238 + inNameHex1(ch);
  239 + return;
  240 +
  241 + case st_name_hex2:
  242 + inNameHex2(ch);
  243 + return;
  244 +
273 case (st_token_ready): 245 case (st_token_ready):
274 inTokenReady(ch); 246 inTokenReady(ch);
275 return; 247 return;
@@ -353,6 +325,11 @@ QPDFTokenizer::inTop(char ch) @@ -353,6 +325,11 @@ QPDFTokenizer::inTop(char ch)
353 this->val += ch; 325 this->val += ch;
354 return; 326 return;
355 327
  328 + case '/':
  329 + this->state = st_name;
  330 + this->val += ch;
  331 + return;
  332 +
356 default: 333 default:
357 this->state = st_literal; 334 this->state = st_literal;
358 this->val += ch; 335 this->val += ch;
@@ -433,6 +410,93 @@ QPDFTokenizer::inString(char ch) @@ -433,6 +410,93 @@ QPDFTokenizer::inString(char ch)
433 } 410 }
434 411
435 void 412 void
  413 +QPDFTokenizer::inName(char ch)
  414 +{
  415 + if (isDelimiter(ch)) {
  416 + // A C-locale whitespace character or delimiter terminates
  417 + // token. It is important to unread the whitespace
  418 + // character even though it is ignored since it may be the
  419 + // newline after a stream keyword. Removing it here could
  420 + // make the stream-reading code break on some files,
  421 + // though not on any files in the test suite as of this
  422 + // writing.
  423 +
  424 + this->type = this->bad ? tt_bad : tt_name;
  425 + this->unread_char = true;
  426 + this->char_to_unread = ch;
  427 + this->state = st_token_ready;
  428 + } else if (ch == '#') {
  429 + this->char_code = 0;
  430 + this->state = st_name_hex1;
  431 + } else {
  432 + this->val += ch;
  433 + }
  434 +}
  435 +
  436 +void
  437 +QPDFTokenizer::inNameHex1(char ch)
  438 +{
  439 + this->hex_char = ch;
  440 +
  441 + if ('0' <= ch && ch <= '9') {
  442 + this->char_code = 16 * (int(ch) - int('0'));
  443 + this->state = st_name_hex2;
  444 +
  445 + } else if ('A' <= ch && ch <= 'F') {
  446 + this->char_code = 16 * (10 + int(ch) - int('A'));
  447 + this->state = st_name_hex2;
  448 +
  449 + } else if ('a' <= ch && ch <= 'f') {
  450 + this->char_code = 16 * (10 + int(ch) - int('a'));
  451 + this->state = st_name_hex2;
  452 +
  453 + } else {
  454 + QTC::TC("qpdf", "QPDFTokenizer bad name 1");
  455 + this->error_message = "name with stray # will not work with PDF >= 1.2";
  456 + // Use null to encode a bad # -- this is reversed
  457 + // in QPDF_Name::normalizeName.
  458 + this->val += '\0';
  459 + this->state = st_name;
  460 + inName(ch);
  461 + }
  462 +}
  463 +
  464 +void
  465 +QPDFTokenizer::inNameHex2(char ch)
  466 +{
  467 + if ('0' <= ch && ch <= '9') {
  468 + this->char_code += int(ch) - int('0');
  469 +
  470 + } else if ('A' <= ch && ch <= 'F') {
  471 + this->char_code += 10 + int(ch) - int('A');
  472 +
  473 + } else if ('a' <= ch && ch <= 'f') {
  474 + this->char_code += 10 + int(ch) - int('a');
  475 +
  476 + } else {
  477 + QTC::TC("qpdf", "QPDFTokenizer bad name 2");
  478 + this->error_message = "name with stray # will not work with PDF >= 1.2";
  479 + // Use null to encode a bad # -- this is reversed
  480 + // in QPDF_Name::normalizeName.
  481 + this->val += '\0';
  482 + this->val += this->hex_char;
  483 + this->state = st_name;
  484 + inName(ch);
  485 + return;
  486 + }
  487 + if (this->char_code == 0) {
  488 + QTC::TC("qpdf", "QPDFTokenizer null in name");
  489 + this->error_message = "null character not allowed in name token";
  490 + this->val += "#00";
  491 + this->state = st_name;
  492 + this->bad = true;
  493 + } else {
  494 + this->val += char(this->char_code);
  495 + this->state = st_name;
  496 + }
  497 +}
  498 +
  499 +void
436 QPDFTokenizer::inStringEscape(char ch) 500 QPDFTokenizer::inStringEscape(char ch)
437 { 501 {
438 this->state = st_in_string; 502 this->state = st_in_string;
@@ -642,9 +706,16 @@ QPDFTokenizer::inInlineImage(char ch) @@ -642,9 +706,16 @@ QPDFTokenizer::inInlineImage(char ch)
642 void 706 void
643 QPDFTokenizer::presentEOF() 707 QPDFTokenizer::presentEOF()
644 { 708 {
645 - if (this->state == st_literal) { 709 + if (this->state == st_name || this->state == st_name_hex1 ||
  710 + this->state == st_name_hex2) {
  711 + // Push any delimiter to the state machine to finish off the final
  712 + // token.
  713 + presentCharacter('\f');
  714 + this->unread_char = false;
  715 + } else if (this->state == st_literal) {
646 QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token"); 716 QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token");
647 resolveLiteral(); 717 resolveLiteral();
  718 +
648 } else if ((this->include_ignorable) && (this->state == st_in_space)) { 719 } else if ((this->include_ignorable) && (this->state == st_in_space)) {
649 this->type = tt_space; 720 this->type = tt_space;
650 } else if ((this->include_ignorable) && (this->state == st_in_comment)) { 721 } else if ((this->include_ignorable) && (this->state == st_in_comment)) {
qpdf/qpdf.testcov
@@ -68,7 +68,8 @@ QPDFTokenizer bad > 0 @@ -68,7 +68,8 @@ QPDFTokenizer bad > 0
68 QPDFTokenizer bad hexstring character 0 68 QPDFTokenizer bad hexstring character 0
69 QPDFTokenizer bad hexstring 2nd character 0 69 QPDFTokenizer bad hexstring 2nd character 0
70 QPDFTokenizer null in name 0 70 QPDFTokenizer null in name 0
71 -QPDFTokenizer bad name 0 71 +QPDFTokenizer bad name 1 0
  72 +QPDFTokenizer bad name 2 0
72 QPDF_Stream invalid filter 0 73 QPDF_Stream invalid filter 0
73 QPDF UseOutlines but no Outlines 0 74 QPDF UseOutlines but no Outlines 0
74 QPDFObjectHandle makeDirect loop 0 75 QPDFObjectHandle makeDirect loop 0