Commit 75259a45f4e9b375b7c22f830ddf725b648b8917
1 parent
4795c8b9
olevba: improved MSO files parsing, taking into account
various data offsets (fixed issue #12) - improved detection of MSO files, avoiding incorrect parsing errors (fixed issue #7)
Showing
1 changed file
with
76 additions
and
17 deletions
oletools/olevba.py
| ... | ... | @@ -11,7 +11,7 @@ Supported formats: |
| 11 | 11 | - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) |
| 12 | 12 | - PowerPoint 2007+ (.pptm, .ppsm) |
| 13 | 13 | - Word 2003 XML (.xml) |
| 14 | -- Word Single File Web Page / MHTML (.mht) | |
| 14 | +- Word/Excel Single File Web Page / MHTML (.mht) | |
| 15 | 15 | |
| 16 | 16 | Author: Philippe Lagadec - http://www.decalage.info |
| 17 | 17 | License: BSD, see source code or documentation |
| ... | ... | @@ -132,7 +132,10 @@ https://github.com/unixfreak0037/officeparser |
| 132 | 132 | # (issue #10 reported by Greg from SpamStopsHere) |
| 133 | 133 | # 2015-05-24 v0.28 PL: - improved support for MHTML files with modified header |
| 134 | 134 | # (issue #11 reported by Thomas Chopitea) |
| 135 | -# 2015-05-26 v0.29 PL: - improved MSO files parsing (issue #12) | |
| 135 | +# 2015-05-26 v0.29 PL: - improved MSO files parsing, taking into account | |
| 136 | +# various data offsets (issue #12) | |
| 137 | +# - improved detection of MSO files, avoiding incorrect | |
| 138 | +# parsing errors (issue #7) | |
| 136 | 139 | |
| 137 | 140 | __version__ = '0.29' |
| 138 | 141 | |
| ... | ... | @@ -416,11 +419,63 @@ def is_mso_file(data): |
| 416 | 419 | the ones created by Outlook in some cases, or Word/Excel when saving a |
| 417 | 420 | file with the MHTML format or the Word 2003 XML format. |
| 418 | 421 | This function only checks the ActiveMime magic at the beginning of data. |
| 419 | - :param data: bytes string | |
| 422 | + :param data: bytes string, MSO/ActiveMime file content | |
| 420 | 423 | :return: bool, True if the file is MSO, False otherwise |
| 421 | 424 | """ |
| 422 | 425 | return data.startswith(MSO_ACTIVEMIME_HEADER) |
| 423 | 426 | |
| 427 | + | |
| 428 | +# regex to find zlib block headers, starting with byte 0x78 = 'x' | |
| 429 | +re_zlib_header = re.compile(r'x') | |
| 430 | + | |
| 431 | + | |
| 432 | +def mso_file_extract(data): | |
| 433 | + """ | |
| 434 | + Extract the data stored in an MSO/ActiveMime file, such as | |
| 435 | + the ones created by Outlook in some cases, or Word/Excel when saving a | |
| 436 | + file with the MHTML format or the Word 2003 XML format. | |
| 437 | + | |
| 438 | + :param data: bytes string, MSO/ActiveMime file content | |
| 439 | + :return: bytes string, extracted data (uncompressed) | |
| 440 | + | |
| 441 | + :raises RuntimeError: if the data cannot be extracted | |
| 442 | + """ | |
| 443 | + # check the magic: | |
| 444 | + assert is_mso_file(data) | |
| 445 | + # First, attempt to get the compressed data offset from the header | |
| 446 | + # According to my tests, the header stores an unsigned 16-bit integer | |
| 447 | + # (little endian) at offset 0x1E; the data offset is that value plus 46: | |
| 448 | + try: | |
| 449 | + offset = struct.unpack_from('<H', data, offset=0x1E)[0] + 46 | |
| 450 | + logging.debug('Parsing MSO file: data offset = 0x%X' % offset) | |
| 451 | + except: | |
| 452 | + logging.exception('Unable to parse MSO/ActiveMime file header') | |
| 453 | + raise RuntimeError('Unable to parse MSO/ActiveMime file header') | |
| 454 | + # In all the samples seen so far, Word always uses an offset of 0x32, | |
| 455 | + # and Excel 0x22A. But we read the offset from the header to be more | |
| 456 | + # generic. | |
| 457 | + # Let's try that offset, then 0x32 and 0x22A, just in case: | |
| 458 | + for start in (offset, 0x32, 0x22A): | |
| 459 | + try: | |
| 460 | + logging.debug('Attempting zlib decompression from MSO file offset 0x%X' % start) | |
| 461 | + extracted_data = zlib.decompress(data[start:]) | |
| 462 | + return extracted_data | |
| 463 | + except: | |
| 464 | + logging.exception('zlib decompression failed') | |
| 465 | + # None of the guessed offsets worked, let's try brute-forcing by looking | |
| 466 | + # for potential zlib-compressed blocks starting with 0x78: | |
| 467 | + logging.debug('Looking for potential zlib-compressed blocks in MSO file') | |
| 468 | + for match in re_zlib_header.finditer(data): | |
| 469 | + start = match.start() | |
| 470 | + try: | |
| 471 | + logging.debug('Attempting zlib decompression from MSO file offset 0x%X' % start) | |
| 472 | + extracted_data = zlib.decompress(data[start:]) | |
| 473 | + return extracted_data | |
| 474 | + except: | |
| 475 | + logging.exception('zlib decompression failed') | |
| 476 | + raise RuntimeError('Unable to decompress data from a MSO/ActiveMime file') | |
| 477 | + | |
| 478 | + | |
| 424 | 479 | #--- FUNCTIONS ---------------------------------------------------------------- |
| 425 | 480 | |
| 426 | 481 | def copytoken_help(decompressed_current, decompressed_chunk_start): |
| ... | ... | @@ -1351,16 +1406,19 @@ class VBA_Parser(object): |
| 1351 | 1406 | # get the filename: |
| 1352 | 1407 | fname = bindata.get(ATTR_NAME, 'noname.mso') |
| 1353 | 1408 | # decode the base64 activemime |
| 1354 | - activemime = binascii.a2b_base64(bindata.text) | |
| 1355 | - # decompress the zlib data starting at offset 0x32, which is the OLE container: | |
| 1356 | - # TODO: handle different offsets => separate function | |
| 1357 | - ole_data = zlib.decompress(activemime[0x32:]) | |
| 1358 | - try: | |
| 1359 | - self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data)) | |
| 1360 | - except: | |
| 1361 | - logging.debug('%s is not a valid OLE file' % fname) | |
| 1362 | - continue | |
| 1409 | + mso_data = binascii.a2b_base64(bindata.text) | |
| 1410 | + if is_mso_file(mso_data): | |
| 1411 | + # decompress the zlib data stored in the MSO file, which is the OLE container: | |
| 1412 | + # (the various possible data offsets are handled by mso_file_extract) | |
| 1413 | + ole_data = mso_file_extract(mso_data) | |
| 1414 | + try: | |
| 1415 | + self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data)) | |
| 1416 | + except: | |
| 1417 | + logging.error('%s does not contain a valid OLE file' % fname) | |
| 1418 | + else: | |
| 1419 | + logging.error('%s is not a valid MSO file' % fname) | |
| 1363 | 1420 | except: |
| 1421 | + # TODO: differentiate exceptions for each parsing stage | |
| 1364 | 1422 | logging.exception('Failed XML parsing for file %r' % self.filename) |
| 1365 | 1423 | pass |
| 1366 | 1424 | # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"): |
| ... | ... | @@ -1381,6 +1439,7 @@ class VBA_Parser(object): |
| 1381 | 1439 | for part in mhtml.walk(): |
| 1382 | 1440 | content_type = part.get_content_type() # always returns a value |
| 1383 | 1441 | fname = part.get_filename(None) # returns None if it fails |
| 1442 | + # TODO: get content-location if no filename | |
| 1384 | 1443 | logging.debug('MHTML part: filename=%r, content-type=%r' % (fname, content_type)) |
| 1385 | 1444 | part_data = part.get_payload(decode=True) |
| 1386 | 1445 | # VBA macros are stored in a binary file named "editdata.mso". |
| ... | ... | @@ -1391,15 +1450,15 @@ class VBA_Parser(object): |
| 1391 | 1450 | if isinstance(part_data, str) and is_mso_file(part_data): |
| 1392 | 1451 | logging.debug('Found ActiveMime header, decompressing MSO container') |
| 1393 | 1452 | try: |
| 1394 | - ole_data = zlib.decompress(part_data[0x32:]) | |
| 1453 | + ole_data = mso_file_extract(part_data) | |
| 1395 | 1454 | try: |
| 1396 | 1455 | # TODO: check if it is actually an OLE file |
| 1397 | 1456 | # TODO: get the MSO filename from content_location? |
| 1398 | 1457 | self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data)) |
| 1399 | 1458 | except: |
| 1400 | - logging.debug('%s is not a valid OLE file' % fname) | |
| 1459 | + logging.debug('%s does not contain a valid OLE file' % fname) | |
| 1401 | 1460 | except: |
| 1402 | - logging.error('Failed decompressing an MSO container in %r - %s' | |
| 1461 | + logging.exception('Failed decompressing an MSO container in %r - %s' | |
| 1403 | 1462 | % (fname, MSG_OLEVBA_ISSUES)) |
| 1404 | 1463 | # TODO: bug here - need to split in smaller functions/classes? |
| 1405 | 1464 | except: |
| ... | ... | @@ -1768,9 +1827,9 @@ def main(): |
| 1768 | 1827 | print 'olevba %s - http://decalage.info/python/oletools' % __version__ |
| 1769 | 1828 | |
| 1770 | 1829 | # TODO: option to set logging level, none by default |
| 1771 | - logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG) #.WARNING) #INFO) | |
| 1830 | + logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) #.DEBUG) #INFO) | |
| 1772 | 1831 | # For now, all logging is disabled: |
| 1773 | - #logging.disable(logging.CRITICAL) | |
| 1832 | + logging.disable(logging.CRITICAL) | |
| 1774 | 1833 | |
| 1775 | 1834 | if options.input: |
| 1776 | 1835 | # input file provided with VBA source code to be analyzed directly: | ... | ... |