Commit b7b13bb58d7e362a470ff0febed7d7714d0256d0

Authored by decalage2
1 parent 31b535f3

olevba 0.53dev4: added support for Word/PowerPoint 2007+ XML format, aka Flat OPC (issue #283)

Showing 2 changed files with 67 additions and 2 deletions
oletools/olevba.py
@@ -10,6 +10,7 @@ Supported formats: @@ -10,6 +10,7 @@ Supported formats:
10 - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) 10 - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
11 - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) 11 - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
12 - PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) 12 - PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm)
  13 +- Word/PowerPoint 2007+ XML (aka Flat OPC)
13 - Word 2003 XML (.xml) 14 - Word 2003 XML (.xml)
14 - Word/Excel Single File Web Page / MHTML (.mht) 15 - Word/Excel Single File Web Page / MHTML (.mht)
15 - Publisher (.pub) 16 - Publisher (.pub)
@@ -203,8 +204,10 @@ from __future__ import print_function @@ -203,8 +204,10 @@ from __future__ import print_function
203 # 2017-11-24 PL: - added keywords to detect self-modifying macros and 204 # 2017-11-24 PL: - added keywords to detect self-modifying macros and
204 # attempts to disable macro security (issue #221) 205 # attempts to disable macro security (issue #221)
205 # 2018-03-19 PL: - removed pyparsing from the thirdparty subfolder 206 # 2018-03-19 PL: - removed pyparsing from the thirdparty subfolder
  207 +# 2018-04-15 v0.53 PL: - added support for Word/PowerPoint 2007+ XML (FlatOPC)
  208 +# (issue #283)
206 209
207 -__version__ = '0.52.3' 210 +__version__ = '0.53dev4'
208 211
209 #------------------------------------------------------------------------------ 212 #------------------------------------------------------------------------------
210 # TODO: 213 # TODO:
@@ -482,6 +485,7 @@ MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES @@ -482,6 +485,7 @@ MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES
482 # Container types: 485 # Container types:
483 TYPE_OLE = 'OLE' 486 TYPE_OLE = 'OLE'
484 TYPE_OpenXML = 'OpenXML' 487 TYPE_OpenXML = 'OpenXML'
  488 +TYPE_FlatOPC_XML = 'FlatOPC_XML'
485 TYPE_Word2003_XML = 'Word2003_XML' 489 TYPE_Word2003_XML = 'Word2003_XML'
486 TYPE_MHTML = 'MHTML' 490 TYPE_MHTML = 'MHTML'
487 TYPE_TEXT = 'Text' 491 TYPE_TEXT = 'Text'
@@ -491,6 +495,7 @@ TYPE_PPT = 'PPT' @@ -491,6 +495,7 @@ TYPE_PPT = 'PPT'
491 TYPE2TAG = { 495 TYPE2TAG = {
492 TYPE_OLE: 'OLE:', 496 TYPE_OLE: 'OLE:',
493 TYPE_OpenXML: 'OpX:', 497 TYPE_OpenXML: 'OpX:',
  498 + TYPE_FlatOPC_XML: 'FlX:',
494 TYPE_Word2003_XML: 'XML:', 499 TYPE_Word2003_XML: 'XML:',
495 TYPE_MHTML: 'MHT:', 500 TYPE_MHTML: 'MHT:',
496 TYPE_TEXT: 'TXT:', 501 TYPE_TEXT: 'TXT:',
@@ -511,6 +516,18 @@ NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}' @@ -511,6 +516,18 @@ NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}'
511 TAG_BINDATA = NS_W + 'binData' 516 TAG_BINDATA = NS_W + 'binData'
512 ATTR_NAME = NS_W + 'name' 517 ATTR_NAME = NS_W + 'name'
513 518
  519 +# Namespaces and tags for Word/PowerPoint 2007+ XML parsing:
  520 +# root: <pkg:package xmlns:pkg="http://schemas.microsoft.com/office/2006/xmlPackage">
  521 +NS_XMLPACKAGE = '{http://schemas.microsoft.com/office/2006/xmlPackage}'
  522 +TAG_PACKAGE = NS_XMLPACKAGE + 'package'
  523 +# the tag <pkg:part> includes <pkg:binaryData> that contains the VBA macro code in Base64:
  524 +# <pkg:part pkg:name="/word/vbaProject.bin" pkg:contentType="application/vnd.ms-office.vbaProject"><pkg:binaryData>
  525 +TAG_PKGPART = NS_XMLPACKAGE + 'part'
  526 +ATTR_PKG_NAME = NS_XMLPACKAGE + 'name'
  527 +ATTR_PKG_CONTENTTYPE = NS_XMLPACKAGE + 'contentType'
  528 +CTYPE_VBAPROJECT = "application/vnd.ms-office.vbaProject"
  529 +TAG_PKGBINDATA = NS_XMLPACKAGE + 'binaryData'
  530 +
514 # Keywords to detect auto-executable macros 531 # Keywords to detect auto-executable macros
515 AUTOEXEC_KEYWORDS = { 532 AUTOEXEC_KEYWORDS = {
516 # MS Word: 533 # MS Word:
@@ -2343,6 +2360,9 @@ class VBA_Parser(object): @@ -2343,6 +2360,9 @@ class VBA_Parser(object):
2343 # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace 2360 # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
2344 if b'http://schemas.microsoft.com/office/word/2003/wordml' in data: 2361 if b'http://schemas.microsoft.com/office/word/2003/wordml' in data:
2345 self.open_word2003xml(data) 2362 self.open_word2003xml(data)
  2363 + # check if it is a Word/PowerPoint 2007+ XML file (Flat OPC): must contain the namespace
  2364 + if b'http://schemas.microsoft.com/office/2006/xmlPackage' in data:
  2365 + self.open_flatopc(data)
2346 # store a lowercase version for the next tests: 2366 # store a lowercase version for the next tests:
2347 data_lowercase = data.lower() 2367 data_lowercase = data.lower()
2348 # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"): 2368 # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"):
@@ -2493,6 +2513,51 @@ class VBA_Parser(object): @@ -2493,6 +2513,51 @@ class VBA_Parser(object):
2493 log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc)) 2513 log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
2494 log.debug('Trace:', exc_info=True) 2514 log.debug('Trace:', exc_info=True)
2495 2515
  2516 + def open_flatopc(self, data):
  2517 + """
  2518 + Open a Word or PowerPoint 2007+ XML file, aka "Flat OPC"
  2519 + :param data: file contents in a string or bytes
  2520 + :return: nothing
  2521 + """
  2522 + log.info('Opening Flat OPC Word/PowerPoint XML file %s' % self.filename)
  2523 + try:
  2524 + # parse the XML content
  2525 + # TODO: handle XML parsing exceptions
  2526 + et = ET.fromstring(data)
  2527 + # TODO: check root node namespace and tag
  2528 + # find all the pkg:part elements:
  2529 + for pkgpart in et.iter(TAG_PKGPART):
  2530 + fname = pkgpart.get(ATTR_PKG_NAME, 'unknown')
  2531 + content_type = pkgpart.get(ATTR_PKG_CONTENTTYPE, 'unknown')
  2532 + if content_type == CTYPE_VBAPROJECT:
  2533 + for bindata in pkgpart.iterfind(TAG_PKGBINDATA):
  2534 + try:
  2535 + ole_data = binascii.a2b_base64(bindata.text)
  2536 + self.ole_subfiles.append(
  2537 + VBA_Parser(filename=fname, data=ole_data,
  2538 + relaxed=self.relaxed))
  2539 + except OlevbaBaseException as exc:
  2540 + if self.relaxed:
  2541 + log.info('Error parsing subfile {0}: {1}'
  2542 + .format(fname, exc))
  2543 + log.debug('Trace:', exc_info=True)
  2544 + else:
  2545 + raise SubstreamOpenError(self.filename, fname, exc)
  2546 + # set type only if parsing succeeds
  2547 + self.type = TYPE_FlatOPC_XML
  2548 + except OlevbaBaseException as exc:
  2549 + if self.relaxed:
  2550 + log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
  2551 + log.debug('Trace:', exc_info=True)
  2552 + else:
  2553 + raise
  2554 + except Exception as exc:
  2555 + # TODO: differentiate exceptions for each parsing stage
  2556 + # (but ET is different libs, no good exception description in API)
  2557 + # found: XMLSyntaxError
  2558 + log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
  2559 + log.debug('Trace:', exc_info=True)
  2560 +
2496 def open_mht(self, data): 2561 def open_mht(self, data):
2497 """ 2562 """
2498 Open a MHTML file 2563 Open a MHTML file
setup.py
@@ -43,7 +43,7 @@ import os, fnmatch @@ -43,7 +43,7 @@ import os, fnmatch
43 #--- METADATA ----------------------------------------------------------------- 43 #--- METADATA -----------------------------------------------------------------
44 44
45 name = "oletools" 45 name = "oletools"
46 -version = '0.53dev3' 46 +version = '0.53dev4'
47 desc = "Python tools to analyze security characteristics of MS Office and OLE files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), for Malware Analysis and Incident Response #DFIR" 47 desc = "Python tools to analyze security characteristics of MS Office and OLE files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), for Malware Analysis and Incident Response #DFIR"
48 long_desc = open('oletools/README.rst').read() 48 long_desc = open('oletools/README.rst').read()
49 author = "Philippe Lagadec" 49 author = "Philippe Lagadec"