Commit f0a52502017d7bba4bdad4034d6c30f64cd3cb5b
Committed by
Philippe Lagadec
1 parent
303f0de1
xls_parser: parse binary parts of xlsb files
Showing
1 changed file
with
184 additions
and
19 deletions
oletools/xls_parser.py
| ... | ... | @@ -97,6 +97,30 @@ def is_xls(filename): |
| 97 | 97 | return False |
| 98 | 98 | |
| 99 | 99 | |
def read_unicode(data, start_idx, n_chars):
    """ read a unicode string from a XLUnicodeStringNoCch structure

    The structure starts with one flag byte, followed by the characters.

    :param data: byte string containing the structure
    :param int start_idx: index of the flag byte within `data`
    :param int n_chars: number of characters (not bytes) in the string
    :returns: tuple (decoded unicode string, index of first byte behind the
              string in `data`)
    :raises IndexError: if `start_idx` is beyond the end of `data`
    """
    # Only the lowest bit of the flag byte (fHighByte) carries information:
    # bit 0x0 --> only low-bytes are saved, all high bytes are 0
    # bit 0x1 --> 2 bytes per character
    # The remaining 7 bits are reserved and have to be ignored.
    # (bytearray makes indexing return an int on python 2 and python 3)
    flags = bytearray(data[start_idx:start_idx+1])[0]
    if flags & 0x01:
        return read_unicode_2byte(data, start_idx+1, n_chars)

    end_idx = start_idx + 1 + n_chars
    # with all high bytes being 0, every stored byte is a code point in range
    # 0..255, which is exactly latin-1 (ascii would fail for bytes >= 0x80)
    return data[start_idx+1:end_idx].decode('latin-1'), end_idx
| 110 | + | |
| 111 | + | |
def read_unicode_2byte(data, start_idx, n_chars):
    """ read a unicode string with characters encoded by 2 bytes

    Each character is stored as an unsigned 16bit little-endian integer.

    :param data: byte string containing the characters
    :param int start_idx: index of the first character byte within `data`
    :param int n_chars: number of characters (not bytes) to read
    :returns: tuple (decoded unicode string, index of first byte behind the
              string in `data`)
    :raises struct.error: if `data` holds less than `n_chars` characters
                          behind `start_idx`
    """
    # conversion code point --> unicode character has different names in
    # python 2 and python 3
    try:
        to_char = unichr   # python 2
    except NameError:
        to_char = chr      # python 3

    end_idx = start_idx + n_chars * 2

    def iter_code_units():
        """ unpack one character at a time, avoids big intermediate tuple """
        idx = start_idx
        while idx < end_idx:
            yield unpack('<H', data[idx:idx+2])[0]
            idx += 2

    if n_chars < 256:  # faster version, single unpack for whole string
        code_units = unpack('<' + 'H'*n_chars, data[start_idx:end_idx])
    else:              # slower version but less memory-extensive
        code_units = iter_code_units()
    return u''.join(to_char(unit) for unit in code_units), end_idx
| 122 | + | |
| 123 | + | |
| 100 | 124 | ############################################################################### |
| 101 | 125 | # File, Storage, Stream |
| 102 | 126 | ############################################################################### |
| ... | ... | @@ -133,6 +157,8 @@ class XlsStream(object): |
| 133 | 157 | |
| 134 | 158 | Currently not much use, but may be interesting for further sub-classing |
| 135 | 159 | when extending this code. |
| 160 | + | |
| 160 | + stream argument can be olefile.OleStream or ooxml.ZipSubFile | |
| 136 | 162 | """ |
| 137 | 163 | |
| 138 | 164 | def __init__(self, stream, name): |
| ... | ... | @@ -149,11 +175,10 @@ class WorkbookStream(XlsStream): |
| 149 | 175 | """ the workbook stream which contains records """ |
| 150 | 176 | |
| 151 | 177 | def iter_records(self, fill_data=False): |
| 152 | - """ iterate over records in streams""" | |
| 153 | - if self.stream.tell() != 0: | |
| 154 | - logging.debug('have to jump to start') | |
| 155 | - self.stream.seek(0) | |
| 178 | + """ iterate over records in streams | |
| 156 | 179 | |
| 180 | + Stream must be positioned at start of records (e.g. start of stream). | |
| 181 | + """ | |
| 157 | 182 | while True: |
| 158 | 183 | # unpacking as in olevba._extract_vba |
| 159 | 184 | pos = self.stream.tell() |
| ... | ... | @@ -183,6 +208,52 @@ class WorkbookStream(XlsStream): |
| 183 | 208 | return '[Workbook Stream (size {0})'.format(self.size) |
| 184 | 209 | |
| 185 | 210 | |
class XlsbStream(XlsStream):
    """ binary stream of an xlsb file, usually has a record structure """

    #: bit that signals "another byte follows" in record type and size
    HIGH_BIT_MASK = 0b10000000
    #: the 7 bits of each byte that carry the actual value
    LOW7_BIT_MASK = 0b01111111

    def _read_byte(self):
        """ read a single byte of a record header, return it as int

        :raises ValueError: if the stream ends before the byte could be read
        """
        byte = self.stream.read(1)
        if len(byte) != 1:
            raise ValueError('Stream does not seem to have record '
                             'structure or is incomplete (stream ends '
                             'within record header)')
        return ord(byte)

    def iter_records(self):
        """ iterate over records in stream

        Record type and size are encoded differently than in xls streams.
        (c.f. [MS-XLSB], Paragraph 2.1.4: Record)

        Reading starts at the current position of the stream and ends when
        the end of the stream (self.size) is reached.

        :returns: generator of :py:class:`XlsbRecord` (or of a subclass for
                  record types that have their own class)
        :raises ValueError: if a record header or the record data announced
                            in the header does not fit into the stream
        """
        while True:
            pos = self.stream.tell()
            if pos >= self.size:
                break

            # record type: 1 or 2 bytes, 7 bits of each contribute to value
            low = self._read_byte()
            if low & self.HIGH_BIT_MASK:    # high bit set --> need 2nd byte
                high = self._read_byte()
                rec_type = (low & self.LOW7_BIT_MASK) + \
                           ((high & self.LOW7_BIT_MASK) << 7)
            else:
                rec_type = low

            # record size: 1 to 4 bytes, again 7 bits of each contribute
            size = 0
            for shift in (0, 7, 14, 21):
                val = self._read_byte()
                size += (val & self.LOW7_BIT_MASK) << shift
                if (val & self.HIGH_BIT_MASK) == 0:  # high-bit is 0 --> done
                    break

            # data starts behind the header, so compare with the current
            # position and not with the start of the record
            if self.stream.tell() + size > self.size:
                raise ValueError('Stream does not seem to have record '
                                 'structure or is incomplete (record size {0})'
                                 .format(size))
            data = self.stream.read(size)

            clz = XlsbRecord
            if rec_type == XlsbBeginSupBook.TYPE:
                clz = XlsbBeginSupBook
            yield clz(rec_type, size, pos, data)
| 255 | + | |
| 256 | + | |
| 186 | 257 | ############################################################################### |
| 187 | 258 | # RECORDS |
| 188 | 259 | ############################################################################### |
| ... | ... | @@ -231,11 +302,33 @@ FREQUENT_RECORDS = dict([ |
| 231 | 302 | (2194, 'StyleExt') # pylint: disable=bad-whitespace |
| 232 | 303 | ]) |
| 233 | 304 | |
#: names of record types found in binary parts of xlsb files, by type number
FREQUENT_RECORDS_XLSB = {
    360: 'BrtBeginSupBook',
    588: 'BrtEndSupBook',
    667: 'BrtSupAddin',
    355: 'BrtSupBookSrc',
    586: 'BrtSupNameBits',
    584: 'BrtSupNameBool',
    587: 'BrtSupNameEnd',
    581: 'BrtSupNameErr',
    585: 'BrtSupNameFmla',
    583: 'BrtSupNameNil',
    580: 'BrtSupNameNum',
    582: 'BrtSupNameSt',
    577: 'BrtSupNameStart',
    579: 'BrtSupNameValueEnd',
    578: 'BrtSupNameValueStart',
    358: 'BrtSupSame',
    357: 'BrtSupSelf',
    359: 'BrtSupTabs',
}
| 326 | + | |
| 234 | 327 | |
| 235 | 328 | class XlsRecord(object): |
| 236 | 329 | """ basic building block of data in workbook stream """ |
| 237 | 330 | |
| 238 | - #: max size of a record | |
| 331 | + #: max size of a record in xls stream (does not apply to xlsb) | |
| 239 | 332 | MAX_SIZE = 8224 |
| 240 | 333 | |
| 241 | 334 | # to be overwritten in subclasses that have fixed type/size |
| ... | ... | @@ -245,8 +338,9 @@ class XlsRecord(object): |
| 245 | 338 | def __init__(self, type, size, pos, data=None): |
| 246 | 339 | """ create a record """ |
| 247 | 340 | self.type = type |
| 248 | - if size > self.MAX_SIZE: | |
| 249 | - raise ValueError('size {0} exceeds max size'.format(size)) | |
| 341 | + if self.MAX_SIZE is not None and size > self.MAX_SIZE: | |
| 342 | + logging.warning('record size {0} exceeds max size' | |
| 343 | + .format(size)) | |
| 250 | 344 | elif self.SIZE is not None and size != self.SIZE: |
| 251 | 345 | raise ValueError('size {0} is not as expected for this type' |
| 252 | 346 | .format(size)) |
| ... | ... | @@ -362,18 +456,89 @@ class XlsRecordSupBook(XlsRecord): |
| 362 | 456 | return 'SupBook Record ({0})'.format(self.support_link_type) |
| 363 | 457 | |
| 364 | 458 | |
| 365 | -def read_unicode(data, start_idx, n_chars): | |
| 366 | - """ read a unicode string from a XLUnicodeStringNoCch structure """ | |
| 367 | - # first bit 0x0 --> only low-bytes are saved, all high bytes are 0 | |
| 368 | - # first bit 0x1 --> 2 bytes per character | |
| 369 | - low_bytes_only = (ord(data[start_idx]) == 0) | |
| 370 | - if low_bytes_only: | |
| 371 | - end_idx = start_idx + 1 + n_chars | |
| 372 | - return data[start_idx+1:end_idx].decode('ascii'), end_idx | |
| 373 | - end_idx = start_idx + 1 + n_chars * 2 | |
| 374 | - return u''.join(unichr(val) for val in | |
| 375 | - unpack('<' + 'H'*n_chars, data[start_idx+1:end_idx])), \ | |
| 376 | - end_idx | |
class XlsbRecord(XlsRecord):
    """ record from a binary part of an xlsb file

    Works like an xls record, but there is no limit on the record size and
    the type numbers are looked up in FREQUENT_RECORDS_XLSB.
    """

    # None switches off the size limit
    MAX_SIZE = None

    def _type_str(self):
        """ name of the record type if known, generic text otherwise """
        if self.type in FREQUENT_RECORDS_XLSB:
            return FREQUENT_RECORDS_XLSB[self.type]
        return 'XlsbRecord type {0}'.format(self.type)
| 473 | + | |
| 474 | + | |
class XlsbBeginSupBook(XlsbRecord):
    """ Record beginning an external link in xlsb file

    contains information about the link itself (e.g. for DDE the link is
    string1 + ' ' + string2)

    Record data consists of a 2-byte link type (sbt) followed by two strings,
    each stored as 4-byte character count plus 2 bytes per character.
    """

    TYPE = 360
    LINK_TYPE_WORKBOOK = 'workbook'
    LINK_TYPE_DDE = 'DDE'
    LINK_TYPE_OLE = 'OLE'
    LINK_TYPE_UNEXPECTED = 'unexpected'
    LINK_TYPE_UNKNOWN = 'unknown'

    #: known values of field sbt and the link types they stand for
    _SBT_TO_LINK_TYPE = {
        0: LINK_TYPE_WORKBOOK,
        1: LINK_TYPE_DDE,
        2: LINK_TYPE_OLE,
    }

    def __init__(self, *args, **kwargs):
        """ create record and parse link type and both strings from data

        Arguments are forwarded unchanged to the super class constructor.
        Problems within the data are logged as warnings; fields that could
        not be parsed keep their defaults (link type "unknown", sbt None,
        empty strings).
        """
        super(XlsbBeginSupBook, self).__init__(*args, **kwargs)
        self.link_type = self.LINK_TYPE_UNKNOWN
        self.string1 = ''
        self.string2 = ''
        self.sbt = None
        if self.data is None:
            return
        if len(self.data) < 2:
            logging.warning('Record data too short ({0} bytes) for link type'
                            .format(len(self.data)))
            return

        self.sbt = unpack('<H', self.data[0:2])[0]
        try:
            self.link_type = self._SBT_TO_LINK_TYPE[self.sbt]
        except KeyError:
            logging.warning('Unexpected link type {0} encountered'
                            .format(self.sbt))
            self.link_type = self.LINK_TYPE_UNEXPECTED

        string1, next_idx = self._read_string(2, '')
        if string1 is None:
            # start of string2 depends on length of string1 --> have to stop
            return
        self.string1 = string1

        string2, _ = self._read_string(next_idx, ' for string2')
        if string2 is not None:
            self.string2 = string2

    def _read_string(self, start_idx, msg_suffix):
        """ read a string (4-byte char count + 2-byte chars) from self.data

        :param int start_idx: index of the character count in self.data
        :param str msg_suffix: text appended to the "impossible length"
                               warning to tell which string is affected
        :returns: tuple (string, index of first byte behind string) or
                  (None, start_idx) if string could not be read; a warning
                  is logged in that case
        """
        data_len = len(self.data)
        chars_idx = start_idx + 4
        if data_len < chars_idx:
            logging.warning('Record data too short ({0} bytes) for string '
                            'length at index {1}'
                            .format(data_len, start_idx))
            return None, start_idx

        n_chars = unpack('<I', self.data[start_idx:chars_idx])[0]
        if n_chars == 0xFFFFFFFF:
            logging.warning('Max string length 0xFFFFFFFF is not allowed')
            return None, start_idx
        if data_len < n_chars*2 + chars_idx:
            logging.warning('Impossible string length {0} for data length {1}'
                            .format(n_chars, data_len) + msg_suffix)
            return None, start_idx
        return read_unicode_2byte(self.data, chars_idx, n_chars)

    def _type_str(self):
        """ textual description including link type and both strings """
        return 'XlsbBeginSupBook Record ({0}, "{1}", "{2}")' \
               .format(self.link_type, self.string1, self.string2)
| 532 | + | |
| 533 | + | |
| 534 | +############################################################################### | |
| 535 | +# XLSB Binary Parts | |
| 536 | +############################################################################### | |
| 537 | + | |
def parse_xlsb_part(stream, _, filename):
    """ yield all records found in a binary part of an xlsb file

    Wraps the given stream in an XlsbStream (named after `filename`) and
    iterates over its records. The second argument is ignored.
    """
    xlsb_stream = XlsbStream(stream, filename)
    for xlsb_record in xlsb_stream.iter_records():
        yield xlsb_record
| 377 | 542 | |
| 378 | 543 | |
| 379 | 544 | ############################################################################### | ... | ... |