Commit 75259a45f4e9b375b7c22f830ddf725b648b8917

Authored by Philippe Lagadec
1 parent 4795c8b9

olevba: improved MSO file parsing, taking into account various data offsets (fixed issue #12)

Also improved detection of MSO files, avoiding incorrect parsing errors (fixed issue #7).
Showing 1 changed file with 76 additions and 17 deletions
oletools/olevba.py
... ... @@ -11,7 +11,7 @@ Supported formats:
11 11 - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
12 12 - PowerPoint 2007+ (.pptm, .ppsm)
13 13 - Word 2003 XML (.xml)
14   -- Word Single File Web Page / MHTML (.mht)
  14 +- Word/Excel Single File Web Page / MHTML (.mht)
15 15  
16 16 Author: Philippe Lagadec - http://www.decalage.info
17 17 License: BSD, see source code or documentation
... ... @@ -132,7 +132,10 @@ https://github.com/unixfreak0037/officeparser
132 132 # (issue #10 reported by Greg from SpamStopsHere)
133 133 # 2015-05-24 v0.28 PL: - improved support for MHTML files with modified header
134 134 # (issue #11 reported by Thomas Chopitea)
135   -# 2015-05-26 v0.29 PL: - improved MSO files parsing (issue #12)
  135 +# 2015-05-26 v0.29 PL: - improved MSO files parsing, taking into account
  136 +# various data offsets (issue #12)
  137 +# - improved detection of MSO files, avoiding incorrect
  138 +# parsing errors (issue #7)
136 139  
137 140 __version__ = '0.29'
138 141  
... ... @@ -416,11 +419,63 @@ def is_mso_file(data):
416 419 the ones created by Outlook in some cases, or Word/Excel when saving a
417 420 file with the MHTML format or the Word 2003 XML format.
418 421 This function only checks the ActiveMime magic at the beginning of data.
419   - :param data: bytes string
  422 + :param data: bytes string, MSO/ActiveMime file content
420 423 :return: bool, True if the file is MSO, False otherwise
421 424 """
422 425 return data.startswith(MSO_ACTIVEMIME_HEADER)
423 426  
  427 +
# regex to find zlib block headers, starting with byte 0x78 = 'x'
# (written as a bytes pattern because it is applied to binary file content)
re_zlib_header = re.compile(b'x')


def _mso_try_decompress(data, start):
    """
    Attempt to zlib-decompress the content of data starting at a given offset.

    :param data: bytes string, MSO/ActiveMime file content
    :param start: int, offset of the supposed zlib stream within data
    :return: bytes string with the uncompressed data, or None if the
        decompression failed (the failure is logged, not raised)
    """
    logging.debug('Attempting zlib decompression from MSO file offset 0x%X' % start)
    try:
        return zlib.decompress(data[start:])
    except Exception:
        # "except Exception" rather than a bare except, so that
        # KeyboardInterrupt and SystemExit are not swallowed here.
        logging.exception('zlib decompression failed')
        return None


def mso_file_extract(data):
    """
    Extract the data stored into a MSO/ActiveMime file, such as
    the ones created by Outlook in some cases, or Word/Excel when saving a
    file with the MHTML format or the Word 2003 XML format.

    The offset of the compressed data is first read from the header; if
    decompression fails there, the offsets 0x32 and 0x22A are tried, and as a
    last resort every position holding a 0x78 byte is tried as the start of a
    zlib stream. The first successful decompression is returned.

    :param data: bytes string, MSO/ActiveMime file content
    :return: bytes string, extracted data (uncompressed)

    raise a RuntimeError if the data cannot be extracted
    """
    # check the magic. This is an explicit test rather than an assert, because
    # asserts are removed when Python runs with -O, and because the documented
    # failure mode of this function is RuntimeError:
    if not is_mso_file(data):
        raise RuntimeError('Not a MSO/ActiveMime file (ActiveMime header not found)')
    # First, attempt to get the compressed data offset from the header
    # According to my tests, it should be an unsigned 16 bits integer,
    # at offset 0x1E (little endian) + add 46:
    try:
        offset = struct.unpack_from('<H', data, offset=0x1E)[0] + 46
    except Exception:
        logging.exception('Unable to parse MSO/ActiveMime file header')
        raise RuntimeError('Unable to parse MSO/ActiveMime file header')
    logging.debug('Parsing MSO file: data offset = 0x%X' % offset)
    # In all the samples seen so far, Word always uses an offset of 0x32,
    # and Excel 0x22A. But we read the offset from the header to be more
    # generic.
    # Let's try that offset, then 0x32 and 0x22A, just in case.
    # Offsets already attempted are remembered so that none is tried twice
    # (the header offset is often 0x32 or 0x22A itself):
    tried = set()
    for start in (offset, 0x32, 0x22A):
        if start in tried:
            continue
        tried.add(start)
        extracted_data = _mso_try_decompress(data, start)
        if extracted_data is not None:
            return extracted_data
    # None of the guessed offsets worked, let's try brute-forcing by looking
    # for potential zlib-compressed blocks starting with 0x78:
    logging.debug('Looking for potential zlib-compressed blocks in MSO file')
    for match in re_zlib_header.finditer(data):
        start = match.start()
        if start in tried:
            # this offset already failed above, no need to retry it
            continue
        extracted_data = _mso_try_decompress(data, start)
        if extracted_data is not None:
            return extracted_data
    raise RuntimeError('Unable to decompress data from a MSO/ActiveMime file')
  477 +
  478 +
424 479 #--- FUNCTIONS ----------------------------------------------------------------
425 480  
426 481 def copytoken_help(decompressed_current, decompressed_chunk_start):
... ... @@ -1351,16 +1406,19 @@ class VBA_Parser(object):
1351 1406 # get the filename:
1352 1407 fname = bindata.get(ATTR_NAME, 'noname.mso')
1353 1408 # decode the base64 activemime
1354   - activemime = binascii.a2b_base64(bindata.text)
1355   - # decompress the zlib data starting at offset 0x32, which is the OLE container:
1356   - # TODO: handle different offsets => separate function
1357   - ole_data = zlib.decompress(activemime[0x32:])
1358   - try:
1359   - self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))
1360   - except:
1361   - logging.debug('%s is not a valid OLE file' % fname)
1362   - continue
  1409 + mso_data = binascii.a2b_base64(bindata.text)
  1410 + if is_mso_file(mso_data):
  1411 + # decompress the zlib data stored in the MSO file, which is the OLE container:
  1412 + # TODO: handle different offsets => separate function
  1413 + ole_data = mso_file_extract(mso_data)
  1414 + try:
  1415 + self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))
  1416 + except:
  1417 + logging.error('%s does not contain a valid OLE file' % fname)
  1418 + else:
  1419 + logging.error('%s is not a valid MSO file' % fname)
1363 1420 except:
  1421 + # TODO: differentiate exceptions for each parsing stage
1364 1422 logging.exception('Failed XML parsing for file %r' % self.filename)
1365 1423 pass
1366 1424 # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"):
... ... @@ -1381,6 +1439,7 @@ class VBA_Parser(object):
1381 1439 for part in mhtml.walk():
1382 1440 content_type = part.get_content_type() # always returns a value
1383 1441 fname = part.get_filename(None) # returns None if it fails
  1442 + # TODO: get content-location if no filename
1384 1443 logging.debug('MHTML part: filename=%r, content-type=%r' % (fname, content_type))
1385 1444 part_data = part.get_payload(decode=True)
1386 1445 # VBA macros are stored in a binary file named "editdata.mso".
... ... @@ -1391,15 +1450,15 @@ class VBA_Parser(object):
1391 1450 if isinstance(part_data, str) and is_mso_file(part_data):
1392 1451 logging.debug('Found ActiveMime header, decompressing MSO container')
1393 1452 try:
1394   - ole_data = zlib.decompress(part_data[0x32:])
  1453 + ole_data = mso_file_extract(part_data)
1395 1454 try:
1396 1455 # TODO: check if it is actually an OLE file
1397 1456 # TODO: get the MSO filename from content_location?
1398 1457 self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))
1399 1458 except:
1400   - logging.debug('%s is not a valid OLE file' % fname)
  1459 + logging.debug('%s does not contain a valid OLE file' % fname)
1401 1460 except:
1402   - logging.error('Failed decompressing an MSO container in %r - %s'
  1461 + logging.exception('Failed decompressing an MSO container in %r - %s'
1403 1462 % (fname, MSG_OLEVBA_ISSUES))
1404 1463 # TODO: bug here - need to split in smaller functions/classes?
1405 1464 except:
... ... @@ -1768,9 +1827,9 @@ def main():
1768 1827 print 'olevba %s - http://decalage.info/python/oletools' % __version__
1769 1828  
1770 1829 # TODO: option to set logging level, none by default
1771   - logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG) #.WARNING) #INFO)
  1830 + logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) #.DEBUG) #INFO)
1772 1831 # For now, all logging is disabled:
1773   - #logging.disable(logging.CRITICAL)
  1832 + logging.disable(logging.CRITICAL)
1774 1833  
1775 1834 if options.input:
1776 1835 # input file provided with VBA source code to be analyzed directly:
... ...