Commit f0a52502017d7bba4bdad4034d6c30f64cd3cb5b

Authored by Christian Herdtweck
Committed by Philippe Lagadec
1 parent 303f0de1

xls_parser: parse binary parts of xlsb files

Showing 1 changed file with 184 additions and 19 deletions
oletools/xls_parser.py
... ... @@ -97,6 +97,30 @@ def is_xls(filename):
97 97 return False
98 98  
99 99  
def read_unicode(data, start_idx, n_chars):
    """ read a unicode string from a XLUnicodeStringNoCch structure

    The structure starts with one flag byte followed by the characters:
    bit 0 of the flag byte == 0 --> 1 byte per character, high bytes are 0
    bit 0 of the flag byte == 1 --> 2 bytes per character (little endian)
    The other 7 bits of the flag byte are reserved and are ignored here.

    :param data: raw record data (str in python 2, bytes in python 3)
    :param int start_idx: index of the flag byte within data
    :param int n_chars: number of characters (not bytes) in the string
    :returns: tuple (text, end_idx) where end_idx is the index of the first
              byte in data after the string
    """
    # slice instead of index: gives a length-1 str/bytes on python 2 and 3,
    # which is what ord() needs (indexing bytes on python 3 yields an int)
    flags = ord(data[start_idx:start_idx+1])
    if flags & 0x01:
        return read_unicode_2byte(data, start_idx+1, n_chars)

    # only low bytes stored; since all high bytes are 0 each byte is the code
    # point of its character, i.e. the text is latin-1 (ascii would fail for
    # valid bytes 0x80-0xFF)
    end_idx = start_idx + 1 + n_chars
    return data[start_idx+1:end_idx].decode('latin-1'), end_idx
  110 +
  111 +
def read_unicode_2byte(data, start_idx, n_chars):
    """ read a unicode string with characters encoded by 2 bytes

    Each character is read as one unsigned little-endian 16bit value and
    converted to the unicode character with that code point.

    :param data: raw record data (str in python 2, bytes in python 3)
    :param int start_idx: index of the first character byte within data
    :param int n_chars: number of characters (not bytes) to read
    :returns: tuple (text, end_idx) where end_idx is the index of the first
              byte in data after the string
    :raises struct.error: if data holds less than n_chars*2 bytes after
                          start_idx
    """
    try:
        to_char = unichr     # python 2
    except NameError:
        to_char = chr        # python 3: chr already returns unicode

    end_idx = start_idx + n_chars * 2
    # a repeat count keeps the format string short for any n_chars, so no
    # special case for long strings is required
    code_units = unpack('<{0}H'.format(n_chars), data[start_idx:end_idx])
    return u''.join(to_char(unit) for unit in code_units), end_idx
  122 +
  123 +
100 124 ###############################################################################
101 125 # File, Storage, Stream
102 126 ###############################################################################
... ... @@ -133,6 +157,8 @@ class XlsStream(object):
133 157  
134 158 Currently not much use, but may be interesting for further sub-classing
135 159 when extending this code.
  160 +
  161 + stream argument can be olefile.OleStream or ooxml.ZipSubFile
136 162 """
137 163  
138 164 def __init__(self, stream, name):
... ... @@ -149,11 +175,10 @@ class WorkbookStream(XlsStream):
149 175 """ the workbook stream which contains records """
150 176  
151 177 def iter_records(self, fill_data=False):
152   - """ iterate over records in streams"""
153   - if self.stream.tell() != 0:
154   - logging.debug('have to jump to start')
155   - self.stream.seek(0)
  178 + """ iterate over records in streams
156 179  
  180 + Stream must be positioned at start of records (e.g. start of stream).
  181 + """
157 182 while True:
158 183 # unpacking as in olevba._extract_vba
159 184 pos = self.stream.tell()
... ... @@ -183,6 +208,52 @@ class WorkbookStream(XlsStream):
183 208 return '[Workbook Stream (size {0})'.format(self.size)
184 209  
185 210  
class XlsbStream(XlsStream):
    """ binary stream of an xlsb file, usually has a record structure """

    HIGH_BIT_MASK = 0b10000000
    LOW7_BIT_MASK = 0b01111111

    def _read_byte(self):
        """ read a single byte from the stream and return its int value

        raises ValueError if the stream ends before the byte could be read
        """
        byte = self.stream.read(1)
        if len(byte) != 1:
            raise ValueError('Stream does not seem to have record '
                             'structure or is incomplete (unexpected end '
                             'of stream)')
        return ord(byte)

    def iter_records(self):
        """ iterate over records in stream

        Record type and size are encoded differently than in xls streams
        (c.f. [MS-XLSB], Paragraph 2.1.4: Record): both are stored with 7
        bits per byte, the high bit of a byte tells whether another byte
        follows. The type uses 1-2 bytes, the size 1-4 bytes.

        Yields one :py:class:`XlsbRecord` (or subclass) per record, starting
        at the current position of the stream.

        Raises ValueError if a record header is cut off or if a record
        claims more data than is left in the stream.
        """
        while True:
            pos = self.stream.tell()
            if pos >= self.size:
                break

            # record type: 1 or 2 bytes, 7 bits each, low bits first
            val = self._read_byte()
            if val & self.HIGH_BIT_MASK:    # high bit set -> need 2nd byte
                val2 = self._read_byte()
                rec_type = (val & self.LOW7_BIT_MASK) | \
                           ((val2 & self.LOW7_BIT_MASK) << 7)
            else:
                rec_type = val

            # record size: up to 4 bytes, 7 bits each, low bits first
            size = 0
            for shift in (0, 7, 14, 21):
                val = self._read_byte()
                size |= (val & self.LOW7_BIT_MASK) << shift
                if not val & self.HIGH_BIT_MASK:   # high bit is 0 --> done
                    break

            # data starts here, after the header, so compare from the
            # current position and not from the start of the record
            if self.stream.tell() + size > self.size:
                raise ValueError('Stream does not seem to have record '
                                 'structure or is incomplete (record size {0})'
                                 .format(size))
            data = self.stream.read(size)

            clz = XlsbRecord
            if rec_type == XlsbBeginSupBook.TYPE:
                clz = XlsbBeginSupBook
            yield clz(rec_type, size, pos, data)
  255 +
  256 +
186 257 ###############################################################################
187 258 # RECORDS
188 259 ###############################################################################
... ... @@ -231,11 +302,33 @@ FREQUENT_RECORDS = dict([
231 302 (2194, 'StyleExt') # pylint: disable=bad-whitespace
232 303 ])
233 304  
#: names of record types found in xlsb binary parts, indexed by type number
FREQUENT_RECORDS_XLSB = {
    360: 'BrtBeginSupBook',
    588: 'BrtEndSupBook',
    667: 'BrtSupAddin',
    355: 'BrtSupBookSrc',
    586: 'BrtSupNameBits',
    584: 'BrtSupNameBool',
    587: 'BrtSupNameEnd',
    581: 'BrtSupNameErr',
    585: 'BrtSupNameFmla',
    583: 'BrtSupNameNil',
    580: 'BrtSupNameNum',
    582: 'BrtSupNameSt',
    577: 'BrtSupNameStart',
    579: 'BrtSupNameValueEnd',
    578: 'BrtSupNameValueStart',
    358: 'BrtSupSame',
    357: 'BrtSupSelf',
    359: 'BrtSupTabs',
}
  326 +
234 327  
235 328 class XlsRecord(object):
236 329 """ basic building block of data in workbook stream """
237 330  
238   - #: max size of a record
  331 + #: max size of a record in xls stream (does not apply to xlsb)
239 332 MAX_SIZE = 8224
240 333  
241 334 # to be overwritten in subclasses that have fixed type/size
... ... @@ -245,8 +338,9 @@ class XlsRecord(object):
245 338 def __init__(self, type, size, pos, data=None):
246 339 """ create a record """
247 340 self.type = type
248   - if size > self.MAX_SIZE:
249   - raise ValueError('size {0} exceeds max size'.format(size))
  341 + if self.MAX_SIZE is not None and size > self.MAX_SIZE:
  342 + logging.warning('record size {0} exceeds max size'
  343 + .format(size))
250 344 elif self.SIZE is not None and size != self.SIZE:
251 345 raise ValueError('size {0} is not as expected for this type'
252 346 .format(size))
... ... @@ -362,18 +456,89 @@ class XlsRecordSupBook(XlsRecord):
362 456 return 'SupBook Record ({0})'.format(self.support_link_type)
363 457  
364 458  
365   -def read_unicode(data, start_idx, n_chars):
366   - """ read a unicode string from a XLUnicodeStringNoCch structure """
367   - # first bit 0x0 --> only low-bytes are saved, all high bytes are 0
368   - # first bit 0x1 --> 2 bytes per character
369   - low_bytes_only = (ord(data[start_idx]) == 0)
370   - if low_bytes_only:
371   - end_idx = start_idx + 1 + n_chars
372   - return data[start_idx+1:end_idx].decode('ascii'), end_idx
373   - end_idx = start_idx + 1 + n_chars * 2
374   - return u''.join(unichr(val) for val in
375   - unpack('<' + 'H'*n_chars, data[start_idx+1:end_idx])), \
376   - end_idx
class XlsbRecord(XlsRecord):
    """ a record from a binary part of an xlsb file

    Works like an xls record, except that there is no upper limit on the
    record size and the type numbers mean something else.
    """

    # None disables the size limit check
    MAX_SIZE = None

    def _type_str(self):
        """ name of the record type; used when building the record's string """
        if self.type in FREQUENT_RECORDS_XLSB:
            return FREQUENT_RECORDS_XLSB[self.type]
        return 'XlsbRecord type {0}'.format(self.type)
  473 +
  474 +
class XlsbBeginSupBook(XlsbRecord):
    """ Record beginning an external link in xlsb file

    contains information about the link itself (e.g. for DDE the link is
    string1 + ' ' + string2)

    Data layout as parsed here: 2 bytes link type (sbt), followed by two
    strings that each consist of a 4-byte character count and 2 bytes per
    character.
    """

    TYPE = 360
    LINK_TYPE_WORKBOOK = 'workbook'
    LINK_TYPE_DDE = 'DDE'
    LINK_TYPE_OLE = 'OLE'
    LINK_TYPE_UNEXPECTED = 'unexpected'
    LINK_TYPE_UNKNOWN = 'unknown'

    def __init__(self, *args, **kwargs):
        """ create record and, if data is given, parse link type and strings

        Takes the same arguments as the base class. Problems in the data
        are logged as warnings; fields that cannot be parsed keep their
        defaults (link type "unknown", empty strings).
        """
        super(XlsbBeginSupBook, self).__init__(*args, **kwargs)
        self.sbt = None
        self.link_type = self.LINK_TYPE_UNKNOWN
        self.string1 = ''
        self.string2 = ''
        if self.data is None:
            return
        if len(self.data) < 2:
            logging.warning('Record data too short ({0} bytes) to contain '
                            'a link type'.format(len(self.data)))
            return

        self.sbt = unpack('<H', self.data[0:2])[0]
        if self.sbt == 0:
            self.link_type = self.LINK_TYPE_WORKBOOK
        elif self.sbt == 1:
            self.link_type = self.LINK_TYPE_DDE
        elif self.sbt == 2:
            self.link_type = self.LINK_TYPE_OLE
        else:
            logging.warning('Unexpected link type {0} encountered'
                            .format(self.sbt))
            self.link_type = self.LINK_TYPE_UNEXPECTED

        self.string1, start_idx = self._read_wide_string(2)
        if start_idx is None:
            # end of string1 is unknown, so string2 cannot be located
            return
        self.string2, _ = self._read_wide_string(start_idx, ' for string2')

    def _read_wide_string(self, start_idx, msg_suffix=''):
        """ read a 4-byte char count followed by that many 2-byte chars

        returns (text, next_idx); next_idx is the index after the string or
        None if the position of following data could not be determined
        """
        chars_idx = start_idx + 4
        if len(self.data) < chars_idx:
            logging.warning('Record data too short ({0} bytes) to contain '
                            'a string length'.format(len(self.data))
                            + msg_suffix)
            return '', None

        n_chars = unpack('<I', self.data[start_idx:chars_idx])[0]
        if n_chars == 0xFFFFFFFF:
            logging.warning('Max string length 0xFFFFFFFF is not allowed')
            # no characters can follow this length; skip the length field so
            # the next string is not parsed from these same 4 bytes
            return '', chars_idx
        if len(self.data) < n_chars*2 + chars_idx:
            logging.warning('Impossible string length {0} for data length {1}'
                            .format(n_chars, len(self.data)) + msg_suffix)
            return '', None
        return read_unicode_2byte(self.data, chars_idx, n_chars)

    def _type_str(self):
        """ description of this record including link type and strings """
        return 'XlsbBeginSupBook Record ({0}, "{1}", "{2}")' \
            .format(self.link_type, self.string1, self.string2)
  532 +
  533 +
  534 +###############################################################################
  535 +# XLSB Binary Parts
  536 +###############################################################################
  537 +
def parse_xlsb_part(stream, _, filename):
    """ yield all records of a binary part of an Excel xlsb file

    Wraps stream in an XlsbStream named filename and iterates over its
    records; the second argument is not used.
    """
    xlsb_stream = XlsbStream(stream, filename)
    for xlsb_record in xlsb_stream.iter_records():
        yield xlsb_record
377 542  
378 543  
379 544 ###############################################################################
... ...