Commit 75259a45f4e9b375b7c22f830ddf725b648b8917

Authored by Philippe Lagadec
1 parent 4795c8b9

olevba: improved MSO files parsing, taking into account

various data offsets (fixed issue #12) - improved detection of MSO files, avoiding incorrect parsing errors (fixed issue #7)
Showing 1 changed file with 76 additions and 17 deletions
oletools/olevba.py
@@ -11,7 +11,7 @@ Supported formats: @@ -11,7 +11,7 @@ Supported formats:
11 - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) 11 - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
12 - PowerPoint 2007+ (.pptm, .ppsm) 12 - PowerPoint 2007+ (.pptm, .ppsm)
13 - Word 2003 XML (.xml) 13 - Word 2003 XML (.xml)
14 -- Word Single File Web Page / MHTML (.mht) 14 +- Word/Excel Single File Web Page / MHTML (.mht)
15 15
16 Author: Philippe Lagadec - http://www.decalage.info 16 Author: Philippe Lagadec - http://www.decalage.info
17 License: BSD, see source code or documentation 17 License: BSD, see source code or documentation
@@ -132,7 +132,10 @@ https://github.com/unixfreak0037/officeparser @@ -132,7 +132,10 @@ https://github.com/unixfreak0037/officeparser
132 # (issue #10 reported by Greg from SpamStopsHere) 132 # (issue #10 reported by Greg from SpamStopsHere)
133 # 2015-05-24 v0.28 PL: - improved support for MHTML files with modified header 133 # 2015-05-24 v0.28 PL: - improved support for MHTML files with modified header
134 # (issue #11 reported by Thomas Chopitea) 134 # (issue #11 reported by Thomas Chopitea)
135 -# 2015-05-26 v0.29 PL: - improved MSO files parsing (issue #12) 135 +# 2015-05-26 v0.29 PL: - improved MSO files parsing, taking into account
  136 +# various data offsets (issue #12)
  137 +# - improved detection of MSO files, avoiding incorrect
  138 +# parsing errors (issue #7)
136 139
137 __version__ = '0.29' 140 __version__ = '0.29'
138 141
@@ -416,11 +419,63 @@ def is_mso_file(data): @@ -416,11 +419,63 @@ def is_mso_file(data):
416 the ones created by Outlook in some cases, or Word/Excel when saving a 419 the ones created by Outlook in some cases, or Word/Excel when saving a
417 file with the MHTML format or the Word 2003 XML format. 420 file with the MHTML format or the Word 2003 XML format.
418 This function only checks the ActiveMime magic at the beginning of data. 421 This function only checks the ActiveMime magic at the beginning of data.
419 - :param data: bytes string 422 + :param data: bytes string, MSO/ActiveMime file content
420 :return: bool, True if the file is MSO, False otherwise 423 :return: bool, True if the file is MSO, False otherwise
421 """ 424 """
422 return data.startswith(MSO_ACTIVEMIME_HEADER) 425 return data.startswith(MSO_ACTIVEMIME_HEADER)
423 426
  427 +
  428 +# regex to find zlib block headers, starting with byte 0x78 = 'x'
  429 +re_zlib_header = re.compile(r'x')
  430 +
  431 +
def _mso_zlib_decompress(data, start):
    """
    Attempt to zlib-decompress data[start:].

    :param data: bytes string, MSO/ActiveMime file content
    :param start: int, offset at which the zlib stream is assumed to begin
    :return: bytes string with the decompressed data, or None if zlib
        rejected the data at that offset
    """
    logging.debug('Attempting zlib decompression from MSO file offset 0x%X', start)
    try:
        return zlib.decompress(data[start:])
    except zlib.error as exc:
        # A failure here is expected while probing candidate offsets, so it
        # is reported at debug level without a traceback.
        logging.debug('zlib decompression failed at offset 0x%X: %s', start, exc)
        return None


def mso_file_extract(data):
    """
    Extract the data stored into a MSO/ActiveMime file, such as
    the ones created by Outlook in some cases, or Word/Excel when saving a
    file with the MHTML format or the Word 2003 XML format.

    The compressed data offset is first read from the header. If zlib
    decompression fails at that offset, the offsets 0x32 and 0x22A are tried,
    and as a last resort every position matched by re_zlib_header (byte 0x78)
    is tried in order. Each offset is attempted at most once.

    :param data: bytes string, MSO/ActiveMime file content
    :return: bytes string, extracted data (uncompressed)

    raise a RuntimeError if the data cannot be extracted
    """
    # check the magic: an explicit test is used instead of an assert, because
    # asserts are removed when Python runs with -O and the documented error
    # for this function is RuntimeError.
    if not is_mso_file(data):
        raise RuntimeError('Not a MSO/ActiveMime file (ActiveMime header not found)')
    # First, attempt to get the compressed data offset from the header.
    # According to the original author's tests, it is an unsigned 16-bit
    # little-endian integer stored at offset 0x1E, to which 46 must be added:
    try:
        offset = struct.unpack_from('<H', data, offset=0x1E)[0] + 46
    except struct.error:
        # raised when data is too short to contain the offset field
        logging.exception('Unable to parse MSO/ActiveMime file header')
        raise RuntimeError('Unable to parse MSO/ActiveMime file header')
    logging.debug('Parsing MSO file: data offset = 0x%X', offset)
    # In all the samples seen so far, Word always uses an offset of 0x32,
    # and Excel 0x22A. But we read the offset from the header to be more
    # generic.
    # Let's try that offset, then 0x32 and 0x22A, just in case:
    tried = set()
    for start in (offset, 0x32, 0x22A):
        if start in tried:
            # the header offset may be equal to one of the usual offsets
            continue
        tried.add(start)
        extracted_data = _mso_zlib_decompress(data, start)
        if extracted_data is not None:
            return extracted_data
    # None of the guessed offsets worked, let's try brute-forcing by looking
    # for potential zlib-compressed blocks starting with 0x78:
    logging.debug('Looking for potential zlib-compressed blocks in MSO file')
    for match in re_zlib_header.finditer(data):
        start = match.start()
        if start in tried:
            # already attempted above, no need to decompress again
            continue
        extracted_data = _mso_zlib_decompress(data, start)
        if extracted_data is not None:
            return extracted_data
    raise RuntimeError('Unable to decompress data from a MSO/ActiveMime file')
  477 +
  478 +
424 #--- FUNCTIONS ---------------------------------------------------------------- 479 #--- FUNCTIONS ----------------------------------------------------------------
425 480
426 def copytoken_help(decompressed_current, decompressed_chunk_start): 481 def copytoken_help(decompressed_current, decompressed_chunk_start):
@@ -1351,16 +1406,19 @@ class VBA_Parser(object): @@ -1351,16 +1406,19 @@ class VBA_Parser(object):
1351 # get the filename: 1406 # get the filename:
1352 fname = bindata.get(ATTR_NAME, 'noname.mso') 1407 fname = bindata.get(ATTR_NAME, 'noname.mso')
1353 # decode the base64 activemime 1408 # decode the base64 activemime
1354 - activemime = binascii.a2b_base64(bindata.text)  
1355 - # decompress the zlib data starting at offset 0x32, which is the OLE container:  
1356 - # TODO: handle different offsets => separate function  
1357 - ole_data = zlib.decompress(activemime[0x32:])  
1358 - try:  
1359 - self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))  
1360 - except:  
1361 - logging.debug('%s is not a valid OLE file' % fname)  
1362 - continue 1409 + mso_data = binascii.a2b_base64(bindata.text)
  1410 + if is_mso_file(mso_data):
  1411 + # decompress the zlib data stored in the MSO file, which is the OLE container:
  1412 + # TODO: handle different offsets => separate function
  1413 + ole_data = mso_file_extract(mso_data)
  1414 + try:
  1415 + self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))
  1416 + except:
  1417 + logging.error('%s does not contain a valid OLE file' % fname)
  1418 + else:
  1419 + logging.error('%s is not a valid MSO file' % fname)
1363 except: 1420 except:
  1421 + # TODO: differentiate exceptions for each parsing stage
1364 logging.exception('Failed XML parsing for file %r' % self.filename) 1422 logging.exception('Failed XML parsing for file %r' % self.filename)
1365 pass 1423 pass
1366 # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"): 1424 # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"):
@@ -1381,6 +1439,7 @@ class VBA_Parser(object): @@ -1381,6 +1439,7 @@ class VBA_Parser(object):
1381 for part in mhtml.walk(): 1439 for part in mhtml.walk():
1382 content_type = part.get_content_type() # always returns a value 1440 content_type = part.get_content_type() # always returns a value
1383 fname = part.get_filename(None) # returns None if it fails 1441 fname = part.get_filename(None) # returns None if it fails
  1442 + # TODO: get content-location if no filename
1384 logging.debug('MHTML part: filename=%r, content-type=%r' % (fname, content_type)) 1443 logging.debug('MHTML part: filename=%r, content-type=%r' % (fname, content_type))
1385 part_data = part.get_payload(decode=True) 1444 part_data = part.get_payload(decode=True)
1386 # VBA macros are stored in a binary file named "editdata.mso". 1445 # VBA macros are stored in a binary file named "editdata.mso".
@@ -1391,15 +1450,15 @@ class VBA_Parser(object): @@ -1391,15 +1450,15 @@ class VBA_Parser(object):
1391 if isinstance(part_data, str) and is_mso_file(part_data): 1450 if isinstance(part_data, str) and is_mso_file(part_data):
1392 logging.debug('Found ActiveMime header, decompressing MSO container') 1451 logging.debug('Found ActiveMime header, decompressing MSO container')
1393 try: 1452 try:
1394 - ole_data = zlib.decompress(part_data[0x32:]) 1453 + ole_data = mso_file_extract(part_data)
1395 try: 1454 try:
1396 # TODO: check if it is actually an OLE file 1455 # TODO: check if it is actually an OLE file
1397 # TODO: get the MSO filename from content_location? 1456 # TODO: get the MSO filename from content_location?
1398 self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data)) 1457 self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))
1399 except: 1458 except:
1400 - logging.debug('%s is not a valid OLE file' % fname) 1459 + logging.debug('%s does not contain a valid OLE file' % fname)
1401 except: 1460 except:
1402 - logging.error('Failed decompressing an MSO container in %r - %s' 1461 + logging.exception('Failed decompressing an MSO container in %r - %s'
1403 % (fname, MSG_OLEVBA_ISSUES)) 1462 % (fname, MSG_OLEVBA_ISSUES))
1404 # TODO: bug here - need to split in smaller functions/classes? 1463 # TODO: bug here - need to split in smaller functions/classes?
1405 except: 1464 except:
@@ -1768,9 +1827,9 @@ def main(): @@ -1768,9 +1827,9 @@ def main():
1768 print 'olevba %s - http://decalage.info/python/oletools' % __version__ 1827 print 'olevba %s - http://decalage.info/python/oletools' % __version__
1769 1828
1770 # TODO: option to set logging level, none by default 1829 # TODO: option to set logging level, none by default
1771 - logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG) #.WARNING) #INFO) 1830 + logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) #.DEBUG) #INFO)
1772 # For now, all logging is disabled: 1831 # For now, all logging is disabled:
1773 - #logging.disable(logging.CRITICAL) 1832 + logging.disable(logging.CRITICAL)
1774 1833
1775 if options.input: 1834 if options.input:
1776 # input file provided with VBA source code to be analyzed directly: 1835 # input file provided with VBA source code to be analyzed directly: