Commit 2356048d3ed22779ce74db34aa8941b0c2ad9b90
1 parent: ae4f1882
olevba3: added support for Word/PowerPoint 2007+ XML (FlatOPC) - issue #283
Showing 1 changed file with 70 additions and 3 deletions
oletools/olevba3.py
| 1 | 1 | #!/usr/bin/env python |
| 2 | 2 | """ |
| 3 | -olevba.py | |
| 3 | +olevba3.py | |
| 4 | 4 | |
| 5 | 5 | olevba is a script to parse OLE and OpenXML files such as MS Office documents |
| 6 | 6 | (e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate |
| 7 | 7 | and analyze malicious macros. |
| 8 | 8 | |
| 9 | +olevba3 is the version of olevba that runs on Python 3.x. | |
| 10 | + | |
| 9 | 11 | Supported formats: |
| 10 | 12 | - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) |
| 11 | 13 | - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) |
| 12 | 14 | - PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) |
| 15 | +- Word/PowerPoint 2007+ XML (aka Flat OPC) | |
| 13 | 16 | - Word 2003 XML (.xml) |
| 14 | 17 | - Word/Excel Single File Web Page / MHTML (.mht) |
| 15 | 18 | - Publisher (.pub) |
| ... | ... | @@ -198,8 +201,10 @@ from __future__ import print_function |
| 198 | 201 | # 2017-06-15 PL: - deobfuscation line by line to handle large files |
| 199 | 202 | # 2017-07-11 v0.52 PL: - raise exception instead of sys.exit (issue #180) |
| 200 | 203 | # 2018-03-19 PL: - removed pyparsing from the thirdparty subfolder |
| 204 | +# 2018-05-13 v0.53 PL: - added support for Word/PowerPoint 2007+ XML (FlatOPC) | |
| 205 | +# (issue #283) | |
| 201 | 206 | |
| 202 | -__version__ = '0.52.3' | |
| 207 | +__version__ = '0.53dev10' | |
| 203 | 208 | |
| 204 | 209 | #------------------------------------------------------------------------------ |
| 205 | 210 | # TODO: |
| ... | ... | @@ -493,6 +498,7 @@ MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES |
| 493 | 498 | # Container types: |
| 494 | 499 | TYPE_OLE = 'OLE' |
| 495 | 500 | TYPE_OpenXML = 'OpenXML' |
| 501 | +TYPE_FlatOPC_XML = 'FlatOPC_XML' | |
| 496 | 502 | TYPE_Word2003_XML = 'Word2003_XML' |
| 497 | 503 | TYPE_MHTML = 'MHTML' |
| 498 | 504 | TYPE_TEXT = 'Text' |
| ... | ... | @@ -502,6 +508,7 @@ TYPE_PPT = 'PPT' |
| 502 | 508 | TYPE2TAG = { |
| 503 | 509 | TYPE_OLE: 'OLE:', |
| 504 | 510 | TYPE_OpenXML: 'OpX:', |
| 511 | + TYPE_FlatOPC_XML: 'FlX:', | |
| 505 | 512 | TYPE_Word2003_XML: 'XML:', |
| 506 | 513 | TYPE_MHTML: 'MHT:', |
| 507 | 514 | TYPE_TEXT: 'TXT:', |
| ... | ... | @@ -522,6 +529,18 @@ NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}' |
| 522 | 529 | TAG_BINDATA = NS_W + 'binData' |
| 523 | 530 | ATTR_NAME = NS_W + 'name' |
| 524 | 531 | |
| 532 | +# Namespaces and tags for Word/PowerPoint 2007+ XML parsing: | |
| 533 | +# root: <pkg:package xmlns:pkg="http://schemas.microsoft.com/office/2006/xmlPackage"> | |
| 534 | +NS_XMLPACKAGE = '{http://schemas.microsoft.com/office/2006/xmlPackage}' | |
| 535 | +TAG_PACKAGE = NS_XMLPACKAGE + 'package' | |
| 536 | +# the tag <pkg:part> includes <pkg:binaryData> that contains the VBA macro code in Base64: | |
| 537 | +# <pkg:part pkg:name="/word/vbaProject.bin" pkg:contentType="application/vnd.ms-office.vbaProject"><pkg:binaryData> | |
| 538 | +TAG_PKGPART = NS_XMLPACKAGE + 'part' | |
| 539 | +ATTR_PKG_NAME = NS_XMLPACKAGE + 'name' | |
| 540 | +ATTR_PKG_CONTENTTYPE = NS_XMLPACKAGE + 'contentType' | |
| 541 | +CTYPE_VBAPROJECT = "application/vnd.ms-office.vbaProject" | |
| 542 | +TAG_PKGBINDATA = NS_XMLPACKAGE + 'binaryData' | |
| 543 | + | |
| 525 | 544 | # Keywords to detect auto-executable macros |
| 526 | 545 | AUTOEXEC_KEYWORDS = { |
| 527 | 546 | # MS Word: |
| ... | ... | @@ -2350,6 +2369,9 @@ class VBA_Parser(object): |
| 2350 | 2369 | # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace |
| 2351 | 2370 | if b'http://schemas.microsoft.com/office/word/2003/wordml' in data: |
| 2352 | 2371 | self.open_word2003xml(data) |
| 2372 | + # check if it is a Word/PowerPoint 2007+ XML file (Flat OPC): must contain the namespace | |
| 2373 | + if b'http://schemas.microsoft.com/office/2006/xmlPackage' in data: | |
| 2374 | + self.open_flatopc(data) | |
| 2353 | 2375 | # store a lowercase version for the next tests: |
| 2354 | 2376 | data_lowercase = data.lower() |
| 2355 | 2377 | # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"): |
| ... | ... | @@ -2500,6 +2522,51 @@ class VBA_Parser(object): |
| 2500 | 2522 | log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc)) |
| 2501 | 2523 | log.debug('Trace:', exc_info=True) |
| 2502 | 2524 | |
| 2525 | + def open_flatopc(self, data): | |
| 2526 | + """ | |
| 2527 | + Open a Word or PowerPoint 2007+ XML file, aka "Flat OPC" | |
| 2528 | + :param data: file contents in a string or bytes | |
| 2529 | + :return: nothing | |
| 2530 | + """ | |
| 2531 | + log.info('Opening Flat OPC Word/PowerPoint XML file %s' % self.filename) | |
| 2532 | + try: | |
| 2533 | + # parse the XML content | |
| 2534 | + # TODO: handle XML parsing exceptions | |
| 2535 | + et = ET.fromstring(data) | |
| 2536 | + # TODO: check root node namespace and tag | |
| 2537 | + # find all the pkg:part elements: | |
| 2538 | + for pkgpart in et.iter(TAG_PKGPART): | |
| 2539 | + fname = pkgpart.get(ATTR_PKG_NAME, 'unknown') | |
| 2540 | + content_type = pkgpart.get(ATTR_PKG_CONTENTTYPE, 'unknown') | |
| 2541 | + if content_type == CTYPE_VBAPROJECT: | |
| 2542 | + for bindata in pkgpart.iterfind(TAG_PKGBINDATA): | |
| 2543 | + try: | |
| 2544 | + ole_data = binascii.a2b_base64(bindata.text) | |
| 2545 | + self.ole_subfiles.append( | |
| 2546 | + VBA_Parser(filename=fname, data=ole_data, | |
| 2547 | + relaxed=self.relaxed)) | |
| 2548 | + except OlevbaBaseException as exc: | |
| 2549 | + if self.relaxed: | |
| 2550 | + log.info('Error parsing subfile {0}: {1}' | |
| 2551 | + .format(fname, exc)) | |
| 2552 | + log.debug('Trace:', exc_info=True) | |
| 2553 | + else: | |
| 2554 | + raise SubstreamOpenError(self.filename, fname, exc) | |
| 2555 | + # set type only if parsing succeeds | |
| 2556 | + self.type = TYPE_FlatOPC_XML | |
| 2557 | + except OlevbaBaseException as exc: | |
| 2558 | + if self.relaxed: | |
| 2559 | + log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc)) | |
| 2560 | + log.debug('Trace:', exc_info=True) | |
| 2561 | + else: | |
| 2562 | + raise | |
| 2563 | + except Exception as exc: | |
| 2564 | + # TODO: differentiate exceptions for each parsing stage | |
| 2565 | + # (but ET is different libs, no good exception description in API) | |
| 2566 | + # found: XMLSyntaxError | |
| 2567 | + log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc)) | |
| 2568 | + log.debug('Trace:', exc_info=True) | |
| 2569 | + | |
| 2503 | 2570 | def open_mht(self, data): |
| 2504 | 2571 | """ |
| 2505 | 2572 | Open a MHTML file |
| ... | ... | @@ -3399,7 +3466,7 @@ def main(cmd_line_args=None): |
| 3399 | 3466 | url='http://decalage.info/python/oletools', |
| 3400 | 3467 | type='MetaInformation', _json_is_first=True) |
| 3401 | 3468 | else: |
| 3402 | - print('olevba %s - http://decalage.info/python/oletools' % __version__) | |
| 3469 | + print('olevba3 %s - http://decalage.info/python/oletools' % __version__) | |
| 3403 | 3470 | |
| 3404 | 3471 | logging.basicConfig(level=options.loglevel, format='%(levelname)-8s %(message)s') |
| 3405 | 3472 | # enable logging in the modules: | ... | ... |