From f0a52502017d7bba4bdad4034d6c30f64cd3cb5b Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Wed, 29 Nov 2017 15:53:58 +0100 Subject: [PATCH] xls_parser: parse binary parts of xlsb files --- oletools/xls_parser.py | 203 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 184 insertions(+), 19 deletions(-) diff --git a/oletools/xls_parser.py b/oletools/xls_parser.py index 30f7840..018924b 100644 --- a/oletools/xls_parser.py +++ b/oletools/xls_parser.py @@ -97,6 +97,30 @@ def is_xls(filename): return False +def read_unicode(data, start_idx, n_chars): + """ read a unicode string from a XLUnicodeStringNoCch structure """ + # first bit 0x0 --> only low-bytes are saved, all high bytes are 0 + # first bit 0x1 --> 2 bytes per character + low_bytes_only = (ord(data[start_idx]) == 0) + if low_bytes_only: + end_idx = start_idx + 1 + n_chars + return data[start_idx+1:end_idx].decode('ascii'), end_idx + else: + return read_unicode_2byte(data, start_idx+1, n_chars) + + +def read_unicode_2byte(data, start_idx, n_chars): + """ read a unicode string with characters encoded by 2 bytes """ + end_idx = start_idx + n_chars * 2 + if n_chars < 256: # faster version, long format string for unpack + unichars = (unichr(val) for val in + unpack('<' + 'H'*n_chars, data[start_idx:end_idx])) + else: # slower version but less memory-extensive + unichars = (unichr(unpack('= self.size: + break + val = ord(self.stream.read(1)) + if val & self.HIGH_BIT_MASK: # high bit of the low byte is 1 + val2 = ord(self.stream.read(1)) # need another byte + # combine 7 low bits of each byte + type = (val & self.LOW7_BIT_MASK) + \ + ((val2 & self.LOW7_BIT_MASK) << 7) + else: + type = val + + size = 0 + shift = 0 + for _ in range(4): # size needs up to 4 byte + val = ord(self.stream.read(1)) + size += (val & self.LOW7_BIT_MASK) << shift + shift += 7 + if (val & self.HIGH_BIT_MASK) == 0: # high-bit is 0 --> done + break + + if pos + size > self.size: + raise ValueError('Stream does not seem to have record ' + 'structure or is incomplete (record size {0})' + .format(size)) + data = self.stream.read(size) + + clz = XlsbRecord + if type == XlsbBeginSupBook.TYPE: + clz = XlsbBeginSupBook + yield clz(type, size, pos, data) + + ############################################################################### # RECORDS ############################################################################### @@ -231,11 +302,33 @@ FREQUENT_RECORDS = dict([ (2194, 'StyleExt') # pylint: disable=bad-whitespace ]) +#: records found in xlsb binary parts +FREQUENT_RECORDS_XLSB = dict([ + (360, 'BrtBeginSupBook'), + (588, 'BrtEndSupBook'), + (667, 'BrtSupAddin'), + (355, 'BrtSupBookSrc'), + (586, 'BrtSupNameBits'), + (584, 'BrtSupNameBool'), + (587, 'BrtSupNameEnd'), + (581, 'BrtSupNameErr'), + (585, 'BrtSupNameFmla'), + (583, 'BrtSupNameNil'), + (580, 'BrtSupNameNum'), + (582, 'BrtSupNameSt'), + (577, 'BrtSupNameStart'), + (579, 'BrtSupNameValueEnd'), + (578, 'BrtSupNameValueStart'), + (358, 'BrtSupSame'), + (357, 'BrtSupSelf'), + (359, 'BrtSupTabs'), +]) + class XlsRecord(object): """ basic building block of data in workbook stream """ - #: max size of a record + #: max size of a record in xls stream (does not apply to xlsb) MAX_SIZE = 8224 # to be overwritten in subclasses that have fixed type/size @@ -245,8 +338,9 @@ class XlsRecord(object): def __init__(self, type, size, pos, data=None): """ create a record """ self.type = type - if size > self.MAX_SIZE: - raise ValueError('size {0} exceeds max size'.format(size)) + if self.MAX_SIZE is not None and size > self.MAX_SIZE: + logging.warning('record size {0} exceeds max size' + .format(size)) elif self.SIZE is not None and size != self.SIZE: raise ValueError('size {0} is not as expected for this type' .format(size)) @@ -362,18 +456,89 @@ class XlsRecordSupBook(XlsRecord): return 'SupBook Record ({0})'.format(self.support_link_type) -def read_unicode(data, start_idx, n_chars): - """ read a unicode string from a XLUnicodeStringNoCch structure """ - # first bit 0x0 --> only low-bytes are saved, all high bytes are 0 - # first bit 0x1 --> 2 bytes per character - low_bytes_only = (ord(data[start_idx]) == 0) - if low_bytes_only: - end_idx = start_idx + 1 + n_chars - return data[start_idx+1:end_idx].decode('ascii'), end_idx - end_idx = start_idx + 1 + n_chars * 2 - return u''.join(unichr(val) for val in - unpack('<' + 'H'*n_chars, data[start_idx+1:end_idx])), \ - end_idx +class XlsbRecord(XlsRecord): + """ like an xls record, but from binary part of xlsb file + + has no MAX_SIZE and types have different meanings + """ + + MAX_SIZE = None + + def _type_str(self): + """ simplification for subclasses to create their own __str__ """ + try: + return FREQUENT_RECORDS_XLSB[self.type] + except KeyError: + return 'XlsbRecord type {0}'.format(self.type) + + +class XlsbBeginSupBook(XlsbRecord): + """ Record beginning an external link in xlsb file + + contains information about the link itself (e.g. for DDE the link is + string1 + ' ' + string2) + """ + + TYPE = 360 + LINK_TYPE_WORKBOOK = 'workbook' + LINK_TYPE_DDE = 'DDE' + LINK_TYPE_OLE = 'OLE' + LINK_TYPE_UNEXPECTED = 'unexpected' + LINK_TYPE_UNKNOWN = 'unknown' + + def __init__(self, *args, **kwargs): + super(XlsbBeginSupBook, self).__init__(*args, **kwargs) + self.link_type = self.LINK_TYPE_UNKNOWN + self.string1 = '' + self.string2 = '' + if self.data is None: + return + self.sbt = unpack('