Commit 2356048d3ed22779ce74db34aa8941b0c2ad9b90

Authored by decalage2
1 parent ae4f1882

olevba3: added support for Word/PowerPoint 2007+ XML (FlatOPC) - issue #283

Showing 1 changed file with 70 additions and 3 deletions
oletools/olevba3.py
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 """ 2 """
3 -olevba.py 3 +olevba3.py
4 4
5 olevba is a script to parse OLE and OpenXML files such as MS Office documents 5 olevba is a script to parse OLE and OpenXML files such as MS Office documents
6 (e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate 6 (e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate
7 and analyze malicious macros. 7 and analyze malicious macros.
8 8
  9 +olevba3 is the version of olevba that runs on Python 3.x.
  10 +
9 Supported formats: 11 Supported formats:
10 - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) 12 - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
11 - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) 13 - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
12 - PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) 14 - PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm)
  15 +- Word/PowerPoint 2007+ XML (aka Flat OPC)
13 - Word 2003 XML (.xml) 16 - Word 2003 XML (.xml)
14 - Word/Excel Single File Web Page / MHTML (.mht) 17 - Word/Excel Single File Web Page / MHTML (.mht)
15 - Publisher (.pub) 18 - Publisher (.pub)
@@ -198,8 +201,10 @@ from __future__ import print_function @@ -198,8 +201,10 @@ from __future__ import print_function
198 # 2017-06-15 PL: - deobfuscation line by line to handle large files 201 # 2017-06-15 PL: - deobfuscation line by line to handle large files
199 # 2017-07-11 v0.52 PL: - raise exception instead of sys.exit (issue #180) 202 # 2017-07-11 v0.52 PL: - raise exception instead of sys.exit (issue #180)
200 # 2018-03-19 PL: - removed pyparsing from the thirdparty subfolder 203 # 2018-03-19 PL: - removed pyparsing from the thirdparty subfolder
  204 +# 2018-05-13 v0.53 PL: - added support for Word/PowerPoint 2007+ XML (FlatOPC)
  205 +# (issue #283)
201 206
202 -__version__ = '0.52.3' 207 +__version__ = '0.53dev10'
203 208
204 #------------------------------------------------------------------------------ 209 #------------------------------------------------------------------------------
205 # TODO: 210 # TODO:
@@ -493,6 +498,7 @@ MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES @@ -493,6 +498,7 @@ MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES
493 # Container types: 498 # Container types:
494 TYPE_OLE = 'OLE' 499 TYPE_OLE = 'OLE'
495 TYPE_OpenXML = 'OpenXML' 500 TYPE_OpenXML = 'OpenXML'
  501 +TYPE_FlatOPC_XML = 'FlatOPC_XML'
496 TYPE_Word2003_XML = 'Word2003_XML' 502 TYPE_Word2003_XML = 'Word2003_XML'
497 TYPE_MHTML = 'MHTML' 503 TYPE_MHTML = 'MHTML'
498 TYPE_TEXT = 'Text' 504 TYPE_TEXT = 'Text'
@@ -502,6 +508,7 @@ TYPE_PPT = 'PPT' @@ -502,6 +508,7 @@ TYPE_PPT = 'PPT'
502 TYPE2TAG = { 508 TYPE2TAG = {
503 TYPE_OLE: 'OLE:', 509 TYPE_OLE: 'OLE:',
504 TYPE_OpenXML: 'OpX:', 510 TYPE_OpenXML: 'OpX:',
  511 + TYPE_FlatOPC_XML: 'FlX:',
505 TYPE_Word2003_XML: 'XML:', 512 TYPE_Word2003_XML: 'XML:',
506 TYPE_MHTML: 'MHT:', 513 TYPE_MHTML: 'MHT:',
507 TYPE_TEXT: 'TXT:', 514 TYPE_TEXT: 'TXT:',
@@ -522,6 +529,18 @@ NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}' @@ -522,6 +529,18 @@ NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}'
522 TAG_BINDATA = NS_W + 'binData' 529 TAG_BINDATA = NS_W + 'binData'
523 ATTR_NAME = NS_W + 'name' 530 ATTR_NAME = NS_W + 'name'
524 531
  532 +# Namespaces and tags for Word/PowerPoint 2007+ XML parsing:
  533 +# root: <pkg:package xmlns:pkg="http://schemas.microsoft.com/office/2006/xmlPackage">
  534 +NS_XMLPACKAGE = '{http://schemas.microsoft.com/office/2006/xmlPackage}'
  535 +TAG_PACKAGE = NS_XMLPACKAGE + 'package'
  536 +# the tag <pkg:part> includes <pkg:binaryData> that contains the VBA macro code in Base64:
  537 +# <pkg:part pkg:name="/word/vbaProject.bin" pkg:contentType="application/vnd.ms-office.vbaProject"><pkg:binaryData>
  538 +TAG_PKGPART = NS_XMLPACKAGE + 'part'
  539 +ATTR_PKG_NAME = NS_XMLPACKAGE + 'name'
  540 +ATTR_PKG_CONTENTTYPE = NS_XMLPACKAGE + 'contentType'
  541 +CTYPE_VBAPROJECT = "application/vnd.ms-office.vbaProject"
  542 +TAG_PKGBINDATA = NS_XMLPACKAGE + 'binaryData'
  543 +
525 # Keywords to detect auto-executable macros 544 # Keywords to detect auto-executable macros
526 AUTOEXEC_KEYWORDS = { 545 AUTOEXEC_KEYWORDS = {
527 # MS Word: 546 # MS Word:
@@ -2350,6 +2369,9 @@ class VBA_Parser(object): @@ -2350,6 +2369,9 @@ class VBA_Parser(object):
2350 # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace 2369 # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
2351 if b'http://schemas.microsoft.com/office/word/2003/wordml' in data: 2370 if b'http://schemas.microsoft.com/office/word/2003/wordml' in data:
2352 self.open_word2003xml(data) 2371 self.open_word2003xml(data)
  2372 + # check if it is a Word/PowerPoint 2007+ XML file (Flat OPC): must contain the namespace
  2373 + if b'http://schemas.microsoft.com/office/2006/xmlPackage' in data:
  2374 + self.open_flatopc(data)
2353 # store a lowercase version for the next tests: 2375 # store a lowercase version for the next tests:
2354 data_lowercase = data.lower() 2376 data_lowercase = data.lower()
2355 # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"): 2377 # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"):
@@ -2500,6 +2522,51 @@ class VBA_Parser(object): @@ -2500,6 +2522,51 @@ class VBA_Parser(object):
2500 log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc)) 2522 log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
2501 log.debug('Trace:', exc_info=True) 2523 log.debug('Trace:', exc_info=True)
2502 2524
  2525 + def open_flatopc(self, data):
  2526 + """
  2527 + Open a Word or PowerPoint 2007+ XML file, aka "Flat OPC"
  2528 + :param data: file contents in a string or bytes
  2529 + :return: nothing
  2530 + """
  2531 + log.info('Opening Flat OPC Word/PowerPoint XML file %s' % self.filename)
  2532 + try:
  2533 + # parse the XML content
  2534 + # TODO: handle XML parsing exceptions
  2535 + et = ET.fromstring(data)
  2536 + # TODO: check root node namespace and tag
  2537 + # find all the pkg:part elements:
  2538 + for pkgpart in et.iter(TAG_PKGPART):
  2539 + fname = pkgpart.get(ATTR_PKG_NAME, 'unknown')
  2540 + content_type = pkgpart.get(ATTR_PKG_CONTENTTYPE, 'unknown')
  2541 + if content_type == CTYPE_VBAPROJECT:
  2542 + for bindata in pkgpart.iterfind(TAG_PKGBINDATA):
  2543 + try:
  2544 + ole_data = binascii.a2b_base64(bindata.text)
  2545 + self.ole_subfiles.append(
  2546 + VBA_Parser(filename=fname, data=ole_data,
  2547 + relaxed=self.relaxed))
  2548 + except OlevbaBaseException as exc:
  2549 + if self.relaxed:
  2550 + log.info('Error parsing subfile {0}: {1}'
  2551 + .format(fname, exc))
  2552 + log.debug('Trace:', exc_info=True)
  2553 + else:
  2554 + raise SubstreamOpenError(self.filename, fname, exc)
  2555 + # set type only if parsing succeeds
  2556 + self.type = TYPE_FlatOPC_XML
  2557 + except OlevbaBaseException as exc:
  2558 + if self.relaxed:
  2559 + log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
  2560 + log.debug('Trace:', exc_info=True)
  2561 + else:
  2562 + raise
  2563 + except Exception as exc:
  2564 + # TODO: differentiate exceptions for each parsing stage
  2565 + # (but ET is different libs, no good exception description in API)
  2566 + # found: XMLSyntaxError
  2567 + log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
  2568 + log.debug('Trace:', exc_info=True)
  2569 +
2503 def open_mht(self, data): 2570 def open_mht(self, data):
2504 """ 2571 """
2505 Open a MHTML file 2572 Open a MHTML file
@@ -3399,7 +3466,7 @@ def main(cmd_line_args=None): @@ -3399,7 +3466,7 @@ def main(cmd_line_args=None):
3399 url='http://decalage.info/python/oletools', 3466 url='http://decalage.info/python/oletools',
3400 type='MetaInformation', _json_is_first=True) 3467 type='MetaInformation', _json_is_first=True)
3401 else: 3468 else:
3402 - print('olevba %s - http://decalage.info/python/oletools' % __version__) 3469 + print('olevba3 %s - http://decalage.info/python/oletools' % __version__)
3403 3470
3404 logging.basicConfig(level=options.loglevel, format='%(levelname)-8s %(message)s') 3471 logging.basicConfig(level=options.loglevel, format='%(levelname)-8s %(message)s')
3405 # enable logging in the modules: 3472 # enable logging in the modules: