Commit 6bc009b3cea1eab1e2058acf5d911660ec8f4be8

Authored by Philippe Lagadec
1 parent 5a23b490

olevba 0.03: added support for OpenXML formats, can find the VBA project root anywhere in the file

Showing 1 changed file with 121 additions and 36 deletions
oletools/olevba.py
1 1 #!/usr/bin/env python
2 2 """
3   -olevba.py v0.02 2014-08-15
  3 +olevba.py v0.03 2014-08-15
4 4  
5   -olevba is a script to parse OLE files such as MS Office documents (e.g. Word,
6   -Excel), to extract VBA Macro code in clear text.
  5 +olevba is a script to parse OLE and OpenXML files such as MS Office documents
  6 +(e.g. Word, Excel), to extract VBA Macro code in clear text.
7 7  
8   -olevba project website: http://www.decalage.info/python/olevba
  8 +Supported formats:
  9 +- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
  10 +- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
  11 +- PowerPoint 2007+ (.pptm, .ppsm)
  12 +
  13 +Author: Philippe Lagadec - http://www.decalage.info
  14 +License: BSD, see source code or documentation
9 15  
10 16 olevba is part of the python-oletools package:
11 17 http://www.decalage.info/python/oletools
12 18  
  19 +olevba is based on source code from officeparser by John William Davison
  20 +https://github.com/unixfreak0037/officeparser
  21 +
13 22 Usage: olevba.py <file>
14 23 """
15 24  
16   -__version__ = '0.02'
  25 +__version__ = '0.03'
17 26  
18 27 #=== LICENSE ==================================================================
19 28  
... ... @@ -69,16 +78,22 @@ __version__ = '0.02'
69 78 # 2014-08-05 v0.01 PL: - first version based on officeparser code
70 79 # 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser
71 80 # 2014-08-15 PL: - fixed incorrect value check in PROJECTHELPFILEPATH Record
  81 +# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats
  82 +# and to find the VBA project root anywhere in the file
72 83  
73 84 #------------------------------------------------------------------------------
74 85 # TODO:
  86 +# + extract_macros should yield filename, code
75 87 # + optparse
76 88 # + nicer output
77   -# + output to file
78 89 # + setup logging (common with other oletools)
79   -# + support OpenXML files
  90 +# + update readme, wiki and decalage.info, pypi (link to sample files)
  91 +
  92 +# TODO later:
  93 +# + output to file
80 94 # + process several files in dirs or zips with password
81 95 # + look for VBA in embedded documents (e.g. Excel in Word)
  96 +# + support SRP streams (see Lenny's article + links and sample)
82 97 # - python 3.x support
83 98 # - add support for PowerPoint macros (see libclamav, libgsf)
84 99 # - check VBA macros in Visio, Access, Project, etc
... ... @@ -99,6 +114,7 @@ import sys, logging
99 114 import struct
100 115 import cStringIO
101 116 import math
  117 +import zipfile
102 118  
103 119 from thirdparty.OleFileIO_PL import OleFileIO_PL
104 120  
... ... @@ -248,29 +264,72 @@ def decompress_stream (compressed_container):
248 264 return decompressed_container
249 265  
250 266  
251   -def extract_macros(ole):
def extract_macros_ole(ole):
    """
    Extract VBA macros from an OLE file.

    ole: an open OleFileIO object. Every storage named "VBA"
    (case-insensitive) is examined; when its parent storage has the layout
    of a VBA project root, the macros are extracted with _extract_vba.
    Nothing is returned.

    Known locations of the VBA project root (different in MS Word, Excel, etc):
    - Word 97-2003: Macros
    - Excel 97-2003: _VBA_PROJECT_CUR
    - PowerPoint 97-2003: not supported yet (different file structure)
    - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project
      is the root of vbaProject.bin.
    - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word
    - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word
    - Visio 2007: not supported yet (different file structure)
    """
    # According to MS-OVBA section 2.2.1:
    # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream
    # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream
    # - all names are case-insensitive
    required_streams = ('PROJECT', 'VBA/_VBA_PROJECT', 'VBA/dir')

    def locate_stream(root, relative_path):
        # Return the full path of root + relative_path when it exists and is
        # a stream, otherwise a false value.
        candidate = root + relative_path
        is_stream = (ole.exists(candidate)
                     and ole.get_type(candidate) == OleFileIO_PL.STGTY_STREAM)
        if not is_stream:
            logging.debug('Missing %s stream, this is not a valid VBA project structure' % relative_path)
            return False
        logging.debug('Found %s stream: %s' % (relative_path, candidate))
        return candidate

    # Look for any storage containing those storage/streams:
    for entry in ole.listdir(streams=False, storages=True):
        # Only storages whose last path component is "VBA" are candidates:
        if entry[-1].upper() != 'VBA':
            continue
        logging.debug('Found VBA storage: %s' % ('/'.join(entry)))
        parent = '/'.join(entry[:-1])
        # Add a trailing slash, unless the parent is the root of the OLE file
        # (used later to append all the child streams/storages):
        vba_root = parent + '/' if parent != '' else parent
        logging.debug('Checking vba_root="%s"' % vba_root)

        # The three mandatory streams are checked in order; the first one
        # missing disqualifies this candidate without checking the others.
        located = {}
        for relative_path in required_streams:
            full_path = locate_stream(vba_root, relative_path)
            if not full_path:
                break
            located[relative_path] = full_path
        else:
            # Now we are pretty sure it is a VBA project structure
            logging.debug('VBA root storage: "%s"' % vba_root)
            # extract all VBA macros from that VBA root storage:
            _extract_vba(ole, vba_root, located['PROJECT'], located['VBA/dir'])
  319 +
  320 +
  321 +
  322 +def _extract_vba (ole, vba_root, project_path, dir_path):
  323 + """
  324 + Extract VBA macros from an OleFileIO object.
  325 + Internal function, do not call directly.
  326 +
  327 + vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream
  328 + project_path: path to the PROJECT stream
  329 + This is a generator, yielding (filename, stream path, VBA source code) for each VBA code stream
  330 + """
  331 + # Open the PROJECT stream:
  332 + project = ole.openstream(project_path)
274 333  
275 334 # sample content of the PROJECT stream:
276 335  
... ... @@ -313,11 +372,6 @@ def extract_macros(ole):
313 372 elif name == 'BaseClass':
314 373 code_modules[value] = FORM_EXTENSION
315 374  
316   - # Find the dir stream
317   - dir_path = vba_root + '/VBA/dir'
318   - if not ole.exists(dir_path):
319   - logging.debug('missing dir stream')
320   - return None
321 375 # read data from dir stream (compressed)
322 376 dir_compressed = ole.openstream(dir_path).read()
323 377  
... ... @@ -620,7 +674,7 @@ def extract_macros(ole):
620 674 logging.debug("StreamName = {0}".format(MODULESTREAMNAME_StreamName))
621 675 logging.debug("TextOffset = {0}".format(MODULEOFFSET_TextOffset))
622 676  
623   - code_path = vba_root + '/VBA/' + MODULESTREAMNAME_StreamName
  677 + code_path = vba_root + 'VBA/' + MODULESTREAMNAME_StreamName
624 678 #TODO: test if stream exists
625 679 code_data = ole.openstream(code_path).read()
626 680 logging.debug("length of code_data = {0}".format(len(code_data)))
... ... @@ -642,6 +696,39 @@ def extract_macros(ole):
642 696 return
643 697  
644 698  
def extract_macros (filename):
    """
    Extract VBA macros from a file on disk, either OLE or OpenXML.

    filename: path to the file to be analyzed.

    - If the file is an OLE file, it is opened and passed to
      extract_macros_ole.
    - If it is a zip archive (OpenXML), every member whose name ends with
      "vbaproject.bin" (case-insensitive) is read into memory, opened as an
      OLE file and passed to extract_macros_ole. Members that cannot be
      opened as OLE are skipped with a debug message.
    - Otherwise an error is logged.

    Nothing is returned. OLE and zip objects are closed even when the
    extraction raises an exception.
    """
    if OleFileIO_PL.isOleFile(filename):
        # This looks like an OLE file
        logging.info('Extracting VBA Macros from OLE file %s' % filename)
        ole = OleFileIO_PL.OleFileIO(filename)
        try:
            extract_macros_ole(ole)
        finally:
            ole.close()
    elif zipfile.is_zipfile(filename):
        # This looks like a zip file, need to look for vbaProject.bin inside
        #TODO: here we could even look for any OLE file inside the archive
        #...because vbaProject.bin can be renamed:
        # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
        logging.info('Opening ZIP/OpenXML file %s' % filename)
        z = zipfile.ZipFile(filename)
        try:
            for f in z.namelist():
                if not f.lower().endswith('vbaproject.bin'):
                    continue
                logging.debug('Opening OLE VBA storage %s within zip' % f)
                # ZipFile.read opens, reads and closes the member in one call
                vbafile = cStringIO.StringIO(z.read(f))
                try:
                    ole = OleFileIO_PL.OleFileIO(vbafile)
                except Exception:
                    # Best effort: a member that is not a valid OLE file is
                    # skipped, but KeyboardInterrupt/SystemExit still propagate.
                    logging.debug('%s is not a valid OLE file' % f)
                    continue
                try:
                    logging.info('Extracting VBA Macros from %s/%s' % (filename, f))
                    extract_macros_ole(ole)
                finally:
                    ole.close()
        finally:
            z.close()
    else:
        logging.error('%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % filename)
  729 +
  730 +
  731 +
645 732 #=== MAIN =====================================================================
646 733  
647 734 if __name__ == '__main__':
... ... @@ -650,9 +737,7 @@ if __name__ == '__main__':
650 737 print __doc__
651 738 sys.exit(1)
652 739  
653   - logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING)
  740 + logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
654 741  
655   - ole = OleFileIO_PL.OleFileIO(sys.argv[1])
656   - extract_macros(ole)
  742 + extract_macros(sys.argv[1])
657 743  
658   - ole.close()
... ...