Commit f0a52502017d7bba4bdad4034d6c30f64cd3cb5b
Committed by
Philippe Lagadec
1 parent
303f0de1
xls_parser: parse binary parts of xlsb files
Showing
1 changed file
with
184 additions
and
19 deletions
oletools/xls_parser.py
| @@ -97,6 +97,30 @@ def is_xls(filename): | @@ -97,6 +97,30 @@ def is_xls(filename): | ||
| 97 | return False | 97 | return False |
| 98 | 98 | ||
| 99 | 99 | ||
| 100 | +def read_unicode(data, start_idx, n_chars): | ||
| 101 | + """ read a unicode string from a XLUnicodeStringNoCch structure """ | ||
| 102 | + # first bit 0x0 --> only low-bytes are saved, all high bytes are 0 | ||
| 103 | + # first bit 0x1 --> 2 bytes per character | ||
| 104 | + low_bytes_only = (ord(data[start_idx]) == 0) | ||
| 105 | + if low_bytes_only: | ||
| 106 | + end_idx = start_idx + 1 + n_chars | ||
| 107 | + return data[start_idx+1:end_idx].decode('ascii'), end_idx | ||
| 108 | + else: | ||
| 109 | + return read_unicode_2byte(data, start_idx+1, n_chars) | ||
| 110 | + | ||
| 111 | + | ||
| 112 | +def read_unicode_2byte(data, start_idx, n_chars): | ||
| 113 | + """ read a unicode string with characters encoded by 2 bytes """ | ||
| 114 | + end_idx = start_idx + n_chars * 2 | ||
| 115 | + if n_chars < 256: # faster version, long format string for unpack | ||
| 116 | + unichars = (unichr(val) for val in | ||
| 117 | + unpack('<' + 'H'*n_chars, data[start_idx:end_idx])) | ||
| 118 | + else: # slower version but less memory-intensive | ||
| 119 | + unichars = (unichr(unpack('<H', data[data_idx:data_idx+2])[0]) | ||
| 120 | + for data_idx in xrange(start_idx, end_idx, 2)) | ||
| 121 | + return u''.join(unichars), end_idx | ||
| 122 | + | ||
| 123 | + | ||
| 100 | ############################################################################### | 124 | ############################################################################### |
| 101 | # File, Storage, Stream | 125 | # File, Storage, Stream |
| 102 | ############################################################################### | 126 | ############################################################################### |
| @@ -133,6 +157,8 @@ class XlsStream(object): | @@ -133,6 +157,8 @@ class XlsStream(object): | ||
| 133 | 157 | ||
| 134 | Currently not much use, but may be interesting for further sub-classing | 158 | Currently not much use, but may be interesting for further sub-classing |
| 135 | when extending this code. | 159 | when extending this code. |
| 160 | + | ||
| 161 | + stream argument can be olefile.OleStream or ooxml.ZipSubFile | ||
| 136 | """ | 162 | """ |
| 137 | 163 | ||
| 138 | def __init__(self, stream, name): | 164 | def __init__(self, stream, name): |
| @@ -149,11 +175,10 @@ class WorkbookStream(XlsStream): | @@ -149,11 +175,10 @@ class WorkbookStream(XlsStream): | ||
| 149 | """ the workbook stream which contains records """ | 175 | """ the workbook stream which contains records """ |
| 150 | 176 | ||
| 151 | def iter_records(self, fill_data=False): | 177 | def iter_records(self, fill_data=False): |
| 152 | - """ iterate over records in streams""" | ||
| 153 | - if self.stream.tell() != 0: | ||
| 154 | - logging.debug('have to jump to start') | ||
| 155 | - self.stream.seek(0) | 178 | + """ iterate over records in streams |
| 156 | 179 | ||
| 180 | + Stream must be positioned at start of records (e.g. start of stream). | ||
| 181 | + """ | ||
| 157 | while True: | 182 | while True: |
| 158 | # unpacking as in olevba._extract_vba | 183 | # unpacking as in olevba._extract_vba |
| 159 | pos = self.stream.tell() | 184 | pos = self.stream.tell() |
| @@ -183,6 +208,52 @@ class WorkbookStream(XlsStream): | @@ -183,6 +208,52 @@ class WorkbookStream(XlsStream): | ||
| 183 | return '[Workbook Stream (size {0})'.format(self.size) | 208 | return '[Workbook Stream (size {0})'.format(self.size) |
| 184 | 209 | ||
| 185 | 210 | ||
| 211 | +class XlsbStream(XlsStream): | ||
| 212 | + """ binary stream of an xlsb file, usually has a record structure """ | ||
| 213 | + | ||
| 214 | + HIGH_BIT_MASK = 0b10000000 | ||
| 215 | + LOW7_BIT_MASK = 0b01111111 | ||
| 216 | + | ||
| 217 | + def iter_records(self): | ||
| 218 | + """ iterate over records in stream | ||
| 219 | + | ||
| 220 | + Record type and size are encoded differently than in xls streams. | ||
| 221 | + (cf. [MS-XLSB], Paragraph 2.1.4: Record) | ||
| 222 | + """ | ||
| 223 | + while True: | ||
| 224 | + pos = self.stream.tell() | ||
| 225 | + if pos >= self.size: | ||
| 226 | + break | ||
| 227 | + val = ord(self.stream.read(1)) | ||
| 228 | + if val & self.HIGH_BIT_MASK: # high bit of the low byte is 1 | ||
| 229 | + val2 = ord(self.stream.read(1)) # need another byte | ||
| 230 | + # combine 7 low bits of each byte | ||
| 231 | + type = (val & self.LOW7_BIT_MASK) + \ | ||
| 232 | + ((val2 & self.LOW7_BIT_MASK) << 7) | ||
| 233 | + else: | ||
| 234 | + type = val | ||
| 235 | + | ||
| 236 | + size = 0 | ||
| 237 | + shift = 0 | ||
| 238 | + for _ in range(4): # size needs up to 4 byte | ||
| 239 | + val = ord(self.stream.read(1)) | ||
| 240 | + size += (val & self.LOW7_BIT_MASK) << shift | ||
| 241 | + shift += 7 | ||
| 242 | + if (val & self.HIGH_BIT_MASK) == 0: # high-bit is 0 --> done | ||
| 243 | + break | ||
| 244 | + | ||
| 245 | + if pos + size > self.size: | ||
| 246 | + raise ValueError('Stream does not seem to have record ' | ||
| 247 | + 'structure or is incomplete (record size {0})' | ||
| 248 | + .format(size)) | ||
| 249 | + data = self.stream.read(size) | ||
| 250 | + | ||
| 251 | + clz = XlsbRecord | ||
| 252 | + if type == XlsbBeginSupBook.TYPE: | ||
| 253 | + clz = XlsbBeginSupBook | ||
| 254 | + yield clz(type, size, pos, data) | ||
| 255 | + | ||
| 256 | + | ||
| 186 | ############################################################################### | 257 | ############################################################################### |
| 187 | # RECORDS | 258 | # RECORDS |
| 188 | ############################################################################### | 259 | ############################################################################### |
| @@ -231,11 +302,33 @@ FREQUENT_RECORDS = dict([ | @@ -231,11 +302,33 @@ FREQUENT_RECORDS = dict([ | ||
| 231 | (2194, 'StyleExt') # pylint: disable=bad-whitespace | 302 | (2194, 'StyleExt') # pylint: disable=bad-whitespace |
| 232 | ]) | 303 | ]) |
| 233 | 304 | ||
| 305 | +#: records found in xlsb binary parts | ||
| 306 | +FREQUENT_RECORDS_XLSB = dict([ | ||
| 307 | + (360, 'BrtBeginSupBook'), | ||
| 308 | + (588, 'BrtEndSupBook'), | ||
| 309 | + (667, 'BrtSupAddin'), | ||
| 310 | + (355, 'BrtSupBookSrc'), | ||
| 311 | + (586, 'BrtSupNameBits'), | ||
| 312 | + (584, 'BrtSupNameBool'), | ||
| 313 | + (587, 'BrtSupNameEnd'), | ||
| 314 | + (581, 'BrtSupNameErr'), | ||
| 315 | + (585, 'BrtSupNameFmla'), | ||
| 316 | + (583, 'BrtSupNameNil'), | ||
| 317 | + (580, 'BrtSupNameNum'), | ||
| 318 | + (582, 'BrtSupNameSt'), | ||
| 319 | + (577, 'BrtSupNameStart'), | ||
| 320 | + (579, 'BrtSupNameValueEnd'), | ||
| 321 | + (578, 'BrtSupNameValueStart'), | ||
| 322 | + (358, 'BrtSupSame'), | ||
| 323 | + (357, 'BrtSupSelf'), | ||
| 324 | + (359, 'BrtSupTabs'), | ||
| 325 | +]) | ||
| 326 | + | ||
| 234 | 327 | ||
| 235 | class XlsRecord(object): | 328 | class XlsRecord(object): |
| 236 | """ basic building block of data in workbook stream """ | 329 | """ basic building block of data in workbook stream """ |
| 237 | 330 | ||
| 238 | - #: max size of a record | 331 | + #: max size of a record in xls stream (does not apply to xlsb) |
| 239 | MAX_SIZE = 8224 | 332 | MAX_SIZE = 8224 |
| 240 | 333 | ||
| 241 | # to be overwritten in subclasses that have fixed type/size | 334 | # to be overwritten in subclasses that have fixed type/size |
| @@ -245,8 +338,9 @@ class XlsRecord(object): | @@ -245,8 +338,9 @@ class XlsRecord(object): | ||
| 245 | def __init__(self, type, size, pos, data=None): | 338 | def __init__(self, type, size, pos, data=None): |
| 246 | """ create a record """ | 339 | """ create a record """ |
| 247 | self.type = type | 340 | self.type = type |
| 248 | - if size > self.MAX_SIZE: | ||
| 249 | - raise ValueError('size {0} exceeds max size'.format(size)) | 341 | + if self.MAX_SIZE is not None and size > self.MAX_SIZE: |
| 342 | + logging.warning('record size {0} exceeds max size' | ||
| 343 | + .format(size)) | ||
| 250 | elif self.SIZE is not None and size != self.SIZE: | 344 | elif self.SIZE is not None and size != self.SIZE: |
| 251 | raise ValueError('size {0} is not as expected for this type' | 345 | raise ValueError('size {0} is not as expected for this type' |
| 252 | .format(size)) | 346 | .format(size)) |
| @@ -362,18 +456,89 @@ class XlsRecordSupBook(XlsRecord): | @@ -362,18 +456,89 @@ class XlsRecordSupBook(XlsRecord): | ||
| 362 | return 'SupBook Record ({0})'.format(self.support_link_type) | 456 | return 'SupBook Record ({0})'.format(self.support_link_type) |
| 363 | 457 | ||
| 364 | 458 | ||
| 365 | -def read_unicode(data, start_idx, n_chars): | ||
| 366 | - """ read a unicode string from a XLUnicodeStringNoCch structure """ | ||
| 367 | - # first bit 0x0 --> only low-bytes are saved, all high bytes are 0 | ||
| 368 | - # first bit 0x1 --> 2 bytes per character | ||
| 369 | - low_bytes_only = (ord(data[start_idx]) == 0) | ||
| 370 | - if low_bytes_only: | ||
| 371 | - end_idx = start_idx + 1 + n_chars | ||
| 372 | - return data[start_idx+1:end_idx].decode('ascii'), end_idx | ||
| 373 | - end_idx = start_idx + 1 + n_chars * 2 | ||
| 374 | - return u''.join(unichr(val) for val in | ||
| 375 | - unpack('<' + 'H'*n_chars, data[start_idx+1:end_idx])), \ | ||
| 376 | - end_idx | 459 | +class XlsbRecord(XlsRecord): |
| 460 | + """ like an xls record, but from binary part of xlsb file | ||
| 461 | + | ||
| 462 | + has no MAX_SIZE and types have different meanings | ||
| 463 | + """ | ||
| 464 | + | ||
| 465 | + MAX_SIZE = None | ||
| 466 | + | ||
| 467 | + def _type_str(self): | ||
| 468 | + """ simplification for subclasses to create their own __str__ """ | ||
| 469 | + try: | ||
| 470 | + return FREQUENT_RECORDS_XLSB[self.type] | ||
| 471 | + except KeyError: | ||
| 472 | + return 'XlsbRecord type {0}'.format(self.type) | ||
| 473 | + | ||
| 474 | + | ||
| 475 | +class XlsbBeginSupBook(XlsbRecord): | ||
| 476 | + """ Record beginning an external link in xlsb file | ||
| 477 | + | ||
| 478 | + contains information about the link itself (e.g. for DDE the link is | ||
| 479 | + string1 + ' ' + string2) | ||
| 480 | + """ | ||
| 481 | + | ||
| 482 | + TYPE = 360 | ||
| 483 | + LINK_TYPE_WORKBOOK = 'workbook' | ||
| 484 | + LINK_TYPE_DDE = 'DDE' | ||
| 485 | + LINK_TYPE_OLE = 'OLE' | ||
| 486 | + LINK_TYPE_UNEXPECTED = 'unexpected' | ||
| 487 | + LINK_TYPE_UNKNOWN = 'unknown' | ||
| 488 | + | ||
| 489 | + def __init__(self, *args, **kwargs): | ||
| 490 | + super(XlsbBeginSupBook, self).__init__(*args, **kwargs) | ||
| 491 | + self.link_type = self.LINK_TYPE_UNKNOWN | ||
| 492 | + self.string1 = '' | ||
| 493 | + self.string2 = '' | ||
| 494 | + if self.data is None: | ||
| 495 | + return | ||
| 496 | + self.sbt = unpack('<H', self.data[0:2])[0] | ||
| 497 | + if self.sbt == 0: | ||
| 498 | + self.link_type = self.LINK_TYPE_WORKBOOK | ||
| 499 | + elif self.sbt == 1: | ||
| 500 | + self.link_type = self.LINK_TYPE_DDE | ||
| 501 | + elif self.sbt == 2: | ||
| 502 | + self.link_type = self.LINK_TYPE_OLE | ||
| 503 | + else: | ||
| 504 | + logging.warning('Unexpected link type {0} encountered' | ||
| 505 | + .format(self.data[0])) | ||
| 506 | + self.link_type = self.LINK_TYPE_UNEXPECTED | ||
| 507 | + | ||
| 508 | + start_idx = 2 | ||
| 509 | + n_chars = unpack('<I', self.data[start_idx:start_idx+4])[0] | ||
| 510 | + if n_chars == 0xFFFFFFFF: | ||
| 511 | + logging.warning('Max string length 0xFFFFFFF is not allowed') | ||
| 512 | + elif self.size < n_chars*2 + start_idx+4: | ||
| 513 | + logging.warning('Impossible string length {0} for data length {1}' | ||
| 514 | + .format(n_chars, self.size)) | ||
| 515 | + else: | ||
| 516 | + self.string1, start_idx = read_unicode_2byte(self.data, | ||
| 517 | + start_idx+4, n_chars) | ||
| 518 | + | ||
| 519 | + n_chars = unpack('<I', self.data[start_idx:start_idx+4])[0] | ||
| 520 | + if n_chars == 0xFFFFFFFF: | ||
| 521 | + logging.warning('Max string length 0xFFFFFFF is not allowed') | ||
| 522 | + elif self.size < n_chars*2 + start_idx+4: | ||
| 523 | + logging.warning('Impossible string length {0} for data length {1}' | ||
| 524 | + .format(n_chars, self.size) + ' for string2') | ||
| 525 | + else: | ||
| 526 | + self.string2, _ = read_unicode_2byte(self.data, start_idx+4, | ||
| 527 | + n_chars) | ||
| 528 | + | ||
| 529 | + def _type_str(self): | ||
| 530 | + return 'XlsbBeginSupBook Record ({0}, "{1}", "{2}")' \ | ||
| 531 | + .format(self.link_type, self.string1, self.string2) | ||
| 532 | + | ||
| 533 | + | ||
| 534 | +############################################################################### | ||
| 535 | +# XLSB Binary Parts | ||
| 536 | +############################################################################### | ||
| 537 | + | ||
| 538 | +def parse_xlsb_part(stream, _, filename): | ||
| 539 | + """ Excel xlsb files also have a record structure. iter records """ | ||
| 540 | + for record in XlsbStream(stream, filename).iter_records(): | ||
| 541 | + yield record | ||
| 377 | 542 | ||
| 378 | 543 | ||
| 379 | ############################################################################### | 544 | ############################################################################### |