Commit 2356048d3ed22779ce74db34aa8941b0c2ad9b90
1 parent
ae4f1882
olevba3: added support for Word/PowerPoint 2007+ XML (FlatOPC) - issue #283
Showing
1 changed file
with
70 additions
and
3 deletions
oletools/olevba3.py
| 1 | #!/usr/bin/env python | 1 | #!/usr/bin/env python |
| 2 | """ | 2 | """ |
| 3 | -olevba.py | 3 | +olevba3.py |
| 4 | 4 | ||
| 5 | olevba is a script to parse OLE and OpenXML files such as MS Office documents | 5 | olevba is a script to parse OLE and OpenXML files such as MS Office documents |
| 6 | (e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate | 6 | (e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate |
| 7 | and analyze malicious macros. | 7 | and analyze malicious macros. |
| 8 | 8 | ||
| 9 | +olevba3 is the version of olevba that runs on Python 3.x. | ||
| 10 | + | ||
| 9 | Supported formats: | 11 | Supported formats: |
| 10 | - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) | 12 | - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) |
| 11 | - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) | 13 | - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) |
| 12 | - PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) | 14 | - PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) |
| 15 | +- Word/PowerPoint 2007+ XML (aka Flat OPC) | ||
| 13 | - Word 2003 XML (.xml) | 16 | - Word 2003 XML (.xml) |
| 14 | - Word/Excel Single File Web Page / MHTML (.mht) | 17 | - Word/Excel Single File Web Page / MHTML (.mht) |
| 15 | - Publisher (.pub) | 18 | - Publisher (.pub) |
| @@ -198,8 +201,10 @@ from __future__ import print_function | @@ -198,8 +201,10 @@ from __future__ import print_function | ||
| 198 | # 2017-06-15 PL: - deobfuscation line by line to handle large files | 201 | # 2017-06-15 PL: - deobfuscation line by line to handle large files |
| 199 | # 2017-07-11 v0.52 PL: - raise exception instead of sys.exit (issue #180) | 202 | # 2017-07-11 v0.52 PL: - raise exception instead of sys.exit (issue #180) |
| 200 | # 2018-03-19 PL: - removed pyparsing from the thirdparty subfolder | 203 | # 2018-03-19 PL: - removed pyparsing from the thirdparty subfolder |
| 204 | +# 2018-05-13 v0.53 PL: - added support for Word/PowerPoint 2007+ XML (FlatOPC) | ||
| 205 | +# (issue #283) | ||
| 201 | 206 | ||
| 202 | -__version__ = '0.52.3' | 207 | +__version__ = '0.53dev10' |
| 203 | 208 | ||
| 204 | #------------------------------------------------------------------------------ | 209 | #------------------------------------------------------------------------------ |
| 205 | # TODO: | 210 | # TODO: |
| @@ -493,6 +498,7 @@ MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES | @@ -493,6 +498,7 @@ MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES | ||
| 493 | # Container types: | 498 | # Container types: |
| 494 | TYPE_OLE = 'OLE' | 499 | TYPE_OLE = 'OLE' |
| 495 | TYPE_OpenXML = 'OpenXML' | 500 | TYPE_OpenXML = 'OpenXML' |
| 501 | +TYPE_FlatOPC_XML = 'FlatOPC_XML' | ||
| 496 | TYPE_Word2003_XML = 'Word2003_XML' | 502 | TYPE_Word2003_XML = 'Word2003_XML' |
| 497 | TYPE_MHTML = 'MHTML' | 503 | TYPE_MHTML = 'MHTML' |
| 498 | TYPE_TEXT = 'Text' | 504 | TYPE_TEXT = 'Text' |
| @@ -502,6 +508,7 @@ TYPE_PPT = 'PPT' | @@ -502,6 +508,7 @@ TYPE_PPT = 'PPT' | ||
| 502 | TYPE2TAG = { | 508 | TYPE2TAG = { |
| 503 | TYPE_OLE: 'OLE:', | 509 | TYPE_OLE: 'OLE:', |
| 504 | TYPE_OpenXML: 'OpX:', | 510 | TYPE_OpenXML: 'OpX:', |
| 511 | + TYPE_FlatOPC_XML: 'FlX:', | ||
| 505 | TYPE_Word2003_XML: 'XML:', | 512 | TYPE_Word2003_XML: 'XML:', |
| 506 | TYPE_MHTML: 'MHT:', | 513 | TYPE_MHTML: 'MHT:', |
| 507 | TYPE_TEXT: 'TXT:', | 514 | TYPE_TEXT: 'TXT:', |
| @@ -522,6 +529,18 @@ NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}' | @@ -522,6 +529,18 @@ NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}' | ||
| 522 | TAG_BINDATA = NS_W + 'binData' | 529 | TAG_BINDATA = NS_W + 'binData' |
| 523 | ATTR_NAME = NS_W + 'name' | 530 | ATTR_NAME = NS_W + 'name' |
| 524 | 531 | ||
# Namespace, tags and attributes used to parse Word/PowerPoint 2007+ XML
# files (Flat OPC). The root element of such a file looks like:
#   <pkg:package xmlns:pkg="http://schemas.microsoft.com/office/2006/xmlPackage">
# and the VBA project is stored in Base64 inside a part such as:
#   <pkg:part pkg:name="/word/vbaProject.bin"
#             pkg:contentType="application/vnd.ms-office.vbaProject"><pkg:binaryData>
# Names are written with the namespace URI in curly braces, directly
# followed by the local name.
NS_XMLPACKAGE = '{http://schemas.microsoft.com/office/2006/xmlPackage}'
# elements:
TAG_PACKAGE = '{0}package'.format(NS_XMLPACKAGE)
TAG_PKGPART = '{0}part'.format(NS_XMLPACKAGE)
TAG_PKGBINDATA = '{0}binaryData'.format(NS_XMLPACKAGE)
# attributes of <pkg:part>:
ATTR_PKG_NAME = '{0}name'.format(NS_XMLPACKAGE)
ATTR_PKG_CONTENTTYPE = '{0}contentType'.format(NS_XMLPACKAGE)
# content type identifying the part that holds the VBA project:
CTYPE_VBAPROJECT = 'application/vnd.ms-office.vbaProject'
| 543 | + | ||
| 525 | # Keywords to detect auto-executable macros | 544 | # Keywords to detect auto-executable macros |
| 526 | AUTOEXEC_KEYWORDS = { | 545 | AUTOEXEC_KEYWORDS = { |
| 527 | # MS Word: | 546 | # MS Word: |
| @@ -2350,6 +2369,9 @@ class VBA_Parser(object): | @@ -2350,6 +2369,9 @@ class VBA_Parser(object): | ||
| 2350 | # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace | 2369 | # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace |
| 2351 | if b'http://schemas.microsoft.com/office/word/2003/wordml' in data: | 2370 | if b'http://schemas.microsoft.com/office/word/2003/wordml' in data: |
| 2352 | self.open_word2003xml(data) | 2371 | self.open_word2003xml(data) |
| 2372 | + # check if it is a Word/PowerPoint 2007+ XML file (Flat OPC): must contain the namespace | ||
| 2373 | + if b'http://schemas.microsoft.com/office/2006/xmlPackage' in data: | ||
| 2374 | + self.open_flatopc(data) | ||
| 2353 | # store a lowercase version for the next tests: | 2375 | # store a lowercase version for the next tests: |
| 2354 | data_lowercase = data.lower() | 2376 | data_lowercase = data.lower() |
| 2355 | # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"): | 2377 | # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"): |
| @@ -2500,6 +2522,51 @@ class VBA_Parser(object): | @@ -2500,6 +2522,51 @@ class VBA_Parser(object): | ||
| 2500 | log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc)) | 2522 | log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc)) |
| 2501 | log.debug('Trace:', exc_info=True) | 2523 | log.debug('Trace:', exc_info=True) |
| 2502 | 2524 | ||
def open_flatopc(self, data):
    """
    Open a Word or PowerPoint 2007+ XML file, aka "Flat OPC"

    The XML is parsed and every <pkg:part> element whose content type is
    the VBA project type is inspected: each <pkg:binaryData> child is
    decoded from Base64 and opened as a VBA_Parser subfile, which is
    appended to self.ole_subfiles. self.type is set to TYPE_FlatOPC_XML
    only once the whole document has been walked without a fatal error.

    A payload that is empty or is not valid Base64 is logged and skipped,
    so that one broken part cannot hide the other VBA projects of the file.

    :param data: file contents in a string or bytes
    :return: nothing
    :raises SubstreamOpenError: when a decoded subfile cannot be parsed and
        self.relaxed is false (NOTE(review): this relies on
        SubstreamOpenError deriving from OlevbaBaseException so that the
        outer handler re-raises it - confirm against its definition)
    """
    log.info('Opening Flat OPC Word/PowerPoint XML file %s', self.filename)
    try:
        # parse the XML content
        et = ET.fromstring(data)
        # TODO: check root node namespace and tag
        # find all the pkg:part elements:
        for pkgpart in et.iter(TAG_PKGPART):
            fname = pkgpart.get(ATTR_PKG_NAME, 'unknown')
            content_type = pkgpart.get(ATTR_PKG_CONTENTTYPE, 'unknown')
            if content_type != CTYPE_VBAPROJECT:
                # only the parts holding a VBA project are of interest
                continue
            for bindata in pkgpart.iterfind(TAG_PKGBINDATA):
                # Decoding is handled separately from subfile parsing: an
                # empty element has text=None (TypeError) and malformed
                # Base64 raises binascii.Error/ValueError. Neither must
                # abort the processing of the remaining parts.
                if not bindata.text:
                    log.info('Empty binaryData in part %s of file %r, skipped',
                             fname, self.filename)
                    continue
                try:
                    ole_data = binascii.a2b_base64(bindata.text)
                except (binascii.Error, TypeError, ValueError) as exc:
                    log.info('Failed Base64 decoding of part %s in file %r (%s)',
                             fname, self.filename, exc)
                    log.debug('Trace:', exc_info=True)
                    continue
                try:
                    self.ole_subfiles.append(
                        VBA_Parser(filename=fname, data=ole_data,
                                   relaxed=self.relaxed))
                except OlevbaBaseException as exc:
                    if self.relaxed:
                        log.info('Error parsing subfile {0}: {1}'
                                 .format(fname, exc))
                        log.debug('Trace:', exc_info=True)
                    else:
                        raise SubstreamOpenError(self.filename, fname, exc)
        # set type only if parsing succeeds
        self.type = TYPE_FlatOPC_XML
    except OlevbaBaseException as exc:
        if self.relaxed:
            log.info('Failed XML parsing for file %r (%s)', self.filename, exc)
            log.debug('Trace:', exc_info=True)
        else:
            raise
    except Exception as exc:
        # TODO: differentiate exceptions for each parsing stage
        # (ET may be provided by different libraries, and their APIs do not
        # document a common exception type; seen so far: XMLSyntaxError)
        log.info('Failed XML parsing for file %r (%s)', self.filename, exc)
        log.debug('Trace:', exc_info=True)
| 2503 | def open_mht(self, data): | 2570 | def open_mht(self, data): |
| 2504 | """ | 2571 | """ |
| 2505 | Open a MHTML file | 2572 | Open a MHTML file |
| @@ -3399,7 +3466,7 @@ def main(cmd_line_args=None): | @@ -3399,7 +3466,7 @@ def main(cmd_line_args=None): | ||
| 3399 | url='http://decalage.info/python/oletools', | 3466 | url='http://decalage.info/python/oletools', |
| 3400 | type='MetaInformation', _json_is_first=True) | 3467 | type='MetaInformation', _json_is_first=True) |
| 3401 | else: | 3468 | else: |
| 3402 | - print('olevba %s - http://decalage.info/python/oletools' % __version__) | 3469 | + print('olevba3 %s - http://decalage.info/python/oletools' % __version__) |
| 3403 | 3470 | ||
| 3404 | logging.basicConfig(level=options.loglevel, format='%(levelname)-8s %(message)s') | 3471 | logging.basicConfig(level=options.loglevel, format='%(levelname)-8s %(message)s') |
| 3405 | # enable logging in the modules: | 3472 | # enable logging in the modules: |