Commit 2356048d3ed22779ce74db34aa8941b0c2ad9b90

Authored by decalage2
1 parent ae4f1882

olevba3: added support for Word/PowerPoint 2007+ XML (FlatOPC) - issue #283

Showing 1 changed file with 70 additions and 3 deletions
oletools/olevba3.py
1 1 #!/usr/bin/env python
2 2 """
3   -olevba.py
  3 +olevba3.py
4 4  
5 5 olevba is a script to parse OLE and OpenXML files such as MS Office documents
6 6 (e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate
7 7 and analyze malicious macros.
8 8  
  9 +olevba3 is the version of olevba that runs on Python 3.x.
  10 +
9 11 Supported formats:
10 12 - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
11 13 - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
12 14 - PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm)
  15 +- Word/PowerPoint 2007+ XML (aka Flat OPC)
13 16 - Word 2003 XML (.xml)
14 17 - Word/Excel Single File Web Page / MHTML (.mht)
15 18 - Publisher (.pub)
... ... @@ -198,8 +201,10 @@ from __future__ import print_function
198 201 # 2017-06-15 PL: - deobfuscation line by line to handle large files
199 202 # 2017-07-11 v0.52 PL: - raise exception instead of sys.exit (issue #180)
200 203 # 2018-03-19 PL: - removed pyparsing from the thirdparty subfolder
  204 +# 2018-05-13 v0.53 PL: - added support for Word/PowerPoint 2007+ XML (FlatOPC)
  205 +# (issue #283)
201 206  
202   -__version__ = '0.52.3'
  207 +__version__ = '0.53dev10'
203 208  
204 209 #------------------------------------------------------------------------------
205 210 # TODO:
... ... @@ -493,6 +498,7 @@ MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES
493 498 # Container types:
494 499 TYPE_OLE = 'OLE'
495 500 TYPE_OpenXML = 'OpenXML'
  501 +TYPE_FlatOPC_XML = 'FlatOPC_XML'
496 502 TYPE_Word2003_XML = 'Word2003_XML'
497 503 TYPE_MHTML = 'MHTML'
498 504 TYPE_TEXT = 'Text'
... ... @@ -502,6 +508,7 @@ TYPE_PPT = 'PPT'
502 508 TYPE2TAG = {
503 509 TYPE_OLE: 'OLE:',
504 510 TYPE_OpenXML: 'OpX:',
  511 + TYPE_FlatOPC_XML: 'FlX:',
505 512 TYPE_Word2003_XML: 'XML:',
506 513 TYPE_MHTML: 'MHT:',
507 514 TYPE_TEXT: 'TXT:',
... ... @@ -522,6 +529,18 @@ NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}'
522 529 TAG_BINDATA = NS_W + 'binData'
523 530 ATTR_NAME = NS_W + 'name'
524 531  
  532 +# Namespaces and tags for Word/PowerPoint 2007+ XML parsing:
  533 +# root: <pkg:package xmlns:pkg="http://schemas.microsoft.com/office/2006/xmlPackage">
  534 +NS_XMLPACKAGE = '{http://schemas.microsoft.com/office/2006/xmlPackage}'
  535 +TAG_PACKAGE = NS_XMLPACKAGE + 'package'
  536 +# each <pkg:part> tag holding a VBA project includes a <pkg:binaryData> child whose text is the Base64-encoded vbaProject.bin (an OLE file containing the VBA macros):
  537 +# <pkg:part pkg:name="/word/vbaProject.bin" pkg:contentType="application/vnd.ms-office.vbaProject"><pkg:binaryData>
  538 +TAG_PKGPART = NS_XMLPACKAGE + 'part'
  539 +ATTR_PKG_NAME = NS_XMLPACKAGE + 'name'
  540 +ATTR_PKG_CONTENTTYPE = NS_XMLPACKAGE + 'contentType'
  541 +CTYPE_VBAPROJECT = "application/vnd.ms-office.vbaProject"
  542 +TAG_PKGBINDATA = NS_XMLPACKAGE + 'binaryData'
  543 +
525 544 # Keywords to detect auto-executable macros
526 545 AUTOEXEC_KEYWORDS = {
527 546 # MS Word:
... ... @@ -2350,6 +2369,9 @@ class VBA_Parser(object):
2350 2369 # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
2351 2370 if b'http://schemas.microsoft.com/office/word/2003/wordml' in data:
2352 2371 self.open_word2003xml(data)
  2372 + # check if it is a Word/PowerPoint 2007+ XML file (Flat OPC): must contain the namespace
  2373 + if b'http://schemas.microsoft.com/office/2006/xmlPackage' in data:
  2374 + self.open_flatopc(data)
2353 2375 # store a lowercase version for the next tests:
2354 2376 data_lowercase = data.lower()
2355 2377 # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"):
... ... @@ -2500,6 +2522,51 @@ class VBA_Parser(object):
2500 2522 log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
2501 2523 log.debug('Trace:', exc_info=True)
2502 2524  
  2525 + def open_flatopc(self, data):
  2526 + """
  2527 + Open a Word or PowerPoint 2007+ XML file, aka "Flat OPC"
  2528 + :param data: file contents in a string or bytes
  2529 + :return: nothing
  2530 + """
  2531 + log.info('Opening Flat OPC Word/PowerPoint XML file %s' % self.filename)
  2532 + try:
  2533 + # parse the XML content
  2534 + # TODO: handle XML parsing exceptions
  2535 + et = ET.fromstring(data)
  2536 + # TODO: check root node namespace and tag
  2537 + # find all the pkg:part elements:
  2538 + for pkgpart in et.iter(TAG_PKGPART):
  2539 + fname = pkgpart.get(ATTR_PKG_NAME, 'unknown')
  2540 + content_type = pkgpart.get(ATTR_PKG_CONTENTTYPE, 'unknown')
  2541 + if content_type == CTYPE_VBAPROJECT:
  2542 + for bindata in pkgpart.iterfind(TAG_PKGBINDATA):
  2543 + try:
  2544 + ole_data = binascii.a2b_base64(bindata.text)
  2545 + self.ole_subfiles.append(
  2546 + VBA_Parser(filename=fname, data=ole_data,
  2547 + relaxed=self.relaxed))
  2548 + except OlevbaBaseException as exc:
  2549 + if self.relaxed:
  2550 + log.info('Error parsing subfile {0}: {1}'
  2551 + .format(fname, exc))
  2552 + log.debug('Trace:', exc_info=True)
  2553 + else:
  2554 + raise SubstreamOpenError(self.filename, fname, exc)
  2555 + # set type only if parsing succeeds
  2556 + self.type = TYPE_FlatOPC_XML
  2557 + except OlevbaBaseException as exc:
  2558 + if self.relaxed:
  2559 + log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
  2560 + log.debug('Trace:', exc_info=True)
  2561 + else:
  2562 + raise
  2563 + except Exception as exc:
  2564 + # TODO: differentiate exceptions for each parsing stage
  2565 + # (but ET is different libs, no good exception description in API)
  2566 + # found: XMLSyntaxError
  2567 + log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
  2568 + log.debug('Trace:', exc_info=True)
  2569 +
2503 2570 def open_mht(self, data):
2504 2571 """
2505 2572 Open a MHTML file
... ... @@ -3399,7 +3466,7 @@ def main(cmd_line_args=None):
3399 3466 url='http://decalage.info/python/oletools',
3400 3467 type='MetaInformation', _json_is_first=True)
3401 3468 else:
3402   - print('olevba %s - http://decalage.info/python/oletools' % __version__)
  3469 + print('olevba3 %s - http://decalage.info/python/oletools' % __version__)
3403 3470  
3404 3471 logging.basicConfig(level=options.loglevel, format='%(levelname)-8s %(message)s')
3405 3472 # enable logging in the modules:
... ...