Commit b7b13bb58d7e362a470ff0febed7d7714d0256d0

Authored by decalage2
1 parent 31b535f3

olevba 0.53dev4: added support for Word/PowerPoint 2007+ XML format, aka Flat OPC (issue #283)

Showing 2 changed files with 67 additions and 2 deletions
oletools/olevba.py
... ... @@ -10,6 +10,7 @@ Supported formats:
10 10 - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
11 11 - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
12 12 - PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm)
  13 +- Word/PowerPoint 2007+ XML (aka Flat OPC)
13 14 - Word 2003 XML (.xml)
14 15 - Word/Excel Single File Web Page / MHTML (.mht)
15 16 - Publisher (.pub)
... ... @@ -203,8 +204,10 @@ from __future__ import print_function
203 204 # 2017-11-24 PL: - added keywords to detect self-modifying macros and
204 205 # attempts to disable macro security (issue #221)
205 206 # 2018-03-19 PL: - removed pyparsing from the thirdparty subfolder
  207 +# 2018-04-15 v0.53 PL: - added support for Word/PowerPoint 2007+ XML (FlatOPC)
  208 +# (issue #283)
206 209  
207   -__version__ = '0.52.3'
  210 +__version__ = '0.53dev4'
208 211  
209 212 #------------------------------------------------------------------------------
210 213 # TODO:
... ... @@ -482,6 +485,7 @@ MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES
482 485 # Container types:
483 486 TYPE_OLE = 'OLE'
484 487 TYPE_OpenXML = 'OpenXML'
  488 +TYPE_FlatOPC_XML = 'FlatOPC_XML'
485 489 TYPE_Word2003_XML = 'Word2003_XML'
486 490 TYPE_MHTML = 'MHTML'
487 491 TYPE_TEXT = 'Text'
... ... @@ -491,6 +495,7 @@ TYPE_PPT = 'PPT'
491 495 TYPE2TAG = {
492 496 TYPE_OLE: 'OLE:',
493 497 TYPE_OpenXML: 'OpX:',
  498 + TYPE_FlatOPC_XML: 'FlX:',
494 499 TYPE_Word2003_XML: 'XML:',
495 500 TYPE_MHTML: 'MHT:',
496 501 TYPE_TEXT: 'TXT:',
... ... @@ -511,6 +516,18 @@ NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}'
511 516 TAG_BINDATA = NS_W + 'binData'
512 517 ATTR_NAME = NS_W + 'name'
513 518  
  519 +# Namespaces and tags for Word/PowerPoint 2007+ XML parsing:
  520 +# root: <pkg:package xmlns:pkg="http://schemas.microsoft.com/office/2006/xmlPackage">
  521 +NS_XMLPACKAGE = '{http://schemas.microsoft.com/office/2006/xmlPackage}'
  522 +TAG_PACKAGE = NS_XMLPACKAGE + 'package'
  523 +# each <pkg:part> tag wraps a <pkg:binaryData> element; for the VBA project part it holds the vbaProject.bin OLE file encoded in Base64:
  524 +# <pkg:part pkg:name="/word/vbaProject.bin" pkg:contentType="application/vnd.ms-office.vbaProject"><pkg:binaryData>
  525 +TAG_PKGPART = NS_XMLPACKAGE + 'part'
  526 +ATTR_PKG_NAME = NS_XMLPACKAGE + 'name'
  527 +ATTR_PKG_CONTENTTYPE = NS_XMLPACKAGE + 'contentType'
  528 +CTYPE_VBAPROJECT = "application/vnd.ms-office.vbaProject"
  529 +TAG_PKGBINDATA = NS_XMLPACKAGE + 'binaryData'
  530 +
514 531 # Keywords to detect auto-executable macros
515 532 AUTOEXEC_KEYWORDS = {
516 533 # MS Word:
... ... @@ -2343,6 +2360,9 @@ class VBA_Parser(object):
2343 2360 # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
2344 2361 if b'http://schemas.microsoft.com/office/word/2003/wordml' in data:
2345 2362 self.open_word2003xml(data)
  2363 + # check if it is a Word/PowerPoint 2007+ XML file (Flat OPC): must contain the namespace
  2364 + if b'http://schemas.microsoft.com/office/2006/xmlPackage' in data:
  2365 + self.open_flatopc(data)
2346 2366 # store a lowercase version for the next tests:
2347 2367 data_lowercase = data.lower()
2348 2368 # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"):
... ... @@ -2493,6 +2513,51 @@ class VBA_Parser(object):
2493 2513 log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
2494 2514 log.debug('Trace:', exc_info=True)
2495 2515  
  2516 + def open_flatopc(self, data):
  2517 + """
  2518 + Open a Word or PowerPoint 2007+ XML file, aka "Flat OPC"
  2519 + :param data: file contents in a string or bytes
  2520 + :return: nothing
  2521 + """
  2522 + log.info('Opening Flat OPC Word/PowerPoint XML file %s' % self.filename)
  2523 + try:
  2524 + # parse the XML content
  2525 + # TODO: handle XML parsing exceptions
  2526 + et = ET.fromstring(data)
  2527 + # TODO: check root node namespace and tag
  2528 + # find all the pkg:part elements:
  2529 + for pkgpart in et.iter(TAG_PKGPART):
  2530 + fname = pkgpart.get(ATTR_PKG_NAME, 'unknown')
  2531 + content_type = pkgpart.get(ATTR_PKG_CONTENTTYPE, 'unknown')
  2532 + if content_type == CTYPE_VBAPROJECT:
  2533 + for bindata in pkgpart.iterfind(TAG_PKGBINDATA):
  2534 + try:
  2535 + ole_data = binascii.a2b_base64(bindata.text)
  2536 + self.ole_subfiles.append(
  2537 + VBA_Parser(filename=fname, data=ole_data,
  2538 + relaxed=self.relaxed))
  2539 + except OlevbaBaseException as exc:
  2540 + if self.relaxed:
  2541 + log.info('Error parsing subfile {0}: {1}'
  2542 + .format(fname, exc))
  2543 + log.debug('Trace:', exc_info=True)
  2544 + else:
  2545 + raise SubstreamOpenError(self.filename, fname, exc)
  2546 + # set type only if parsing succeeds
  2547 + self.type = TYPE_FlatOPC_XML
  2548 + except OlevbaBaseException as exc:
  2549 + if self.relaxed:
  2550 + log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
  2551 + log.debug('Trace:', exc_info=True)
  2552 + else:
  2553 + raise
  2554 + except Exception as exc:
  2555 + # TODO: differentiate exceptions for each parsing stage
  2556 + # (but ET is different libs, no good exception description in API)
  2557 + # found: XMLSyntaxError
  2558 + log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
  2559 + log.debug('Trace:', exc_info=True)
  2560 +
2496 2561 def open_mht(self, data):
2497 2562 """
2498 2563 Open a MHTML file
... ...
setup.py
... ... @@ -43,7 +43,7 @@ import os, fnmatch
43 43 #--- METADATA -----------------------------------------------------------------
44 44  
45 45 name = "oletools"
46   -version = '0.53dev3'
  46 +version = '0.53dev4'
47 47 desc = "Python tools to analyze security characteristics of MS Office and OLE files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), for Malware Analysis and Incident Response #DFIR"
48 48 long_desc = open('oletools/README.rst').read()
49 49 author = "Philippe Lagadec"
... ...