Commit 6bc009b3cea1eab1e2058acf5d911660ec8f4be8

Authored by Philippe Lagadec
1 parent 5a23b490

olevba 0.03: added support for OpenXML formats, can find the VBA project root anywhere in the file

Showing 1 changed file with 121 additions and 36 deletions
oletools/olevba.py
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 """ 2 """
3 -olevba.py v0.02 2014-08-15 3 +olevba.py v0.03 2014-08-15
4 4
5 -olevba is a script to parse OLE files such as MS Office documents (e.g. Word,  
6 -Excel), to extract VBA Macro code in clear text. 5 +olevba is a script to parse OLE and OpenXML files such as MS Office documents
  6 +(e.g. Word, Excel), to extract VBA Macro code in clear text.
7 7
8 -olevba project website: http://www.decalage.info/python/olevba 8 +Supported formats:
  9 +- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
  10 +- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
  11 +- PowerPoint 2007+ (.pptm, .ppsm)
  12 +
  13 +Author: Philippe Lagadec - http://www.decalage.info
  14 +License: BSD, see source code or documentation
9 15
10 olevba is part of the python-oletools package: 16 olevba is part of the python-oletools package:
11 http://www.decalage.info/python/oletools 17 http://www.decalage.info/python/oletools
12 18
  19 +olevba is based on source code from officeparser by John William Davison
  20 +https://github.com/unixfreak0037/officeparser
  21 +
13 Usage: olevba.py <file> 22 Usage: olevba.py <file>
14 """ 23 """
15 24
16 -__version__ = '0.02' 25 +__version__ = '0.03'
17 26
18 #=== LICENSE ================================================================== 27 #=== LICENSE ==================================================================
19 28
@@ -69,16 +78,22 @@ __version__ = '0.02' @@ -69,16 +78,22 @@ __version__ = '0.02'
69 # 2014-08-05 v0.01 PL: - first version based on officeparser code 78 # 2014-08-05 v0.01 PL: - first version based on officeparser code
70 # 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser 79 # 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser
71 # 2014-08-15 PL: - fixed incorrect value check in PROJECTHELPFILEPATH Record 80 # 2014-08-15 PL: - fixed incorrect value check in PROJECTHELPFILEPATH Record
  81 +# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats
  82 +# and to find the VBA project root anywhere in the file
72 83
73 #------------------------------------------------------------------------------ 84 #------------------------------------------------------------------------------
74 # TODO: 85 # TODO:
  86 +# + extract_macros should yield filename, code
75 # + optparse 87 # + optparse
76 # + nicer output 88 # + nicer output
77 -# + output to file  
78 # + setup logging (common with other oletools) 89 # + setup logging (common with other oletools)
79 -# + support OpenXML files 90 +# + update readme, wiki and decalage.info, pypi (link to sample files)
  91 +
  92 +# TODO later:
  93 +# + output to file
80 # + process several files in dirs or zips with password 94 # + process several files in dirs or zips with password
81 # + look for VBA in embedded documents (e.g. Excel in Word) 95 # + look for VBA in embedded documents (e.g. Excel in Word)
  96 +# + support SRP streams (see Lenny's article + links and sample)
82 # - python 3.x support 97 # - python 3.x support
83 # - add support for PowerPoint macros (see libclamav, libgsf) 98 # - add support for PowerPoint macros (see libclamav, libgsf)
84 # - check VBA macros in Visio, Access, Project, etc 99 # - check VBA macros in Visio, Access, Project, etc
@@ -99,6 +114,7 @@ import sys, logging @@ -99,6 +114,7 @@ import sys, logging
99 import struct 114 import struct
100 import cStringIO 115 import cStringIO
101 import math 116 import math
  117 +import zipfile
102 118
103 from thirdparty.OleFileIO_PL import OleFileIO_PL 119 from thirdparty.OleFileIO_PL import OleFileIO_PL
104 120
@@ -248,29 +264,72 @@ def decompress_stream (compressed_container): @@ -248,29 +264,72 @@ def decompress_stream (compressed_container):
248 return decompressed_container 264 return decompressed_container
249 265
250 266
251 -def extract_macros(ole): 267 +def extract_macros_ole(ole):
252 """ 268 """
253 Extract VBA macros from an OLE file 269 Extract VBA macros from an OLE file
254 """ 270 """
255 # Find the VBA project root (different in MS Word, Excel, etc): 271 # Find the VBA project root (different in MS Word, Excel, etc):
256 - vba_root = None  
257 - for stream in ('Macros', '_VBA_PROJECT_CUR'):  
258 - if ole.exists(stream):  
259 - logging.debug('found VBA root stream: %s' % stream)  
260 - vba_root = stream  
261 - break  
262 - if vba_root is None:  
263 - logging.debug('VBA root stream not found')  
264 - return None  
265 - # Find the PROJECT stream:  
266 - project = None  
267 - project_path = vba_root + '/PROJECT'  
268 - if ole.exists(project_path):  
269 - logging.debug('found PROJECT stream: %s' % project_path)  
270 - project = ole.openstream(project_path)  
271 - else:  
272 - logging.debug('missing PROJECT stream')  
273 - return None 272 + # - Word 97-2003: Macros
  273 + # - Excel 97-2003: _VBA_PROJECT_CUR
  274 + # - PowerPoint 97-2003: not supported yet (different file structure)
  275 + # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin.
  276 + # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word
  277 + # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word
  278 + # - Visio 2007: not supported yet (different file structure)
  279 +
  280 + # According to MS-OVBA section 2.2.1:
  281 + # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream
  282 + # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream
  283 + # - all names are case-insensitive
  284 +
  285 + # Look for any storage containing those storage/streams:
  286 + for storage in ole.listdir(streams=False, storages=True):
  287 + # Look for a storage ending with "VBA":
  288 + if storage[-1].upper() == 'VBA':
  289 + logging.debug('Found VBA storage: %s' % ('/'.join(storage)))
  290 + vba_root = '/'.join(storage[:-1])
  291 + # Add a trailing slash to vba_root, unless it is the root of the OLE file:
  292 + # (used later to append all the child streams/storages)
  293 + if vba_root != '':
  294 + vba_root += '/'
  295 + logging.debug('Checking vba_root="%s"' % vba_root)
  296 +
  297 + def check_vba_stream(ole, vba_root, stream_path):
  298 + full_path = vba_root + stream_path
  299 + if ole.exists(full_path) and ole.get_type(full_path) == OleFileIO_PL.STGTY_STREAM:
  300 + logging.debug('Found %s stream: %s' % (stream_path, full_path))
  301 + return full_path
  302 + else:
  303 + logging.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path)
  304 + return False
  305 +
  306 + # Check if the VBA root storage also contains a PROJECT stream:
  307 + project_path = check_vba_stream(ole, vba_root, 'PROJECT')
  308 + if not project_path: continue
  309 + # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream:
  310 + vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT')
  311 + if not vba_project_path: continue
  312 + # Check if the VBA root storage also contains a VBA/dir stream:
  313 + dir_path = check_vba_stream(ole, vba_root, 'VBA/dir')
  314 + if not dir_path: continue
  315 + # Now we are pretty sure it is a VBA project structure
  316 + logging.debug('VBA root storage: "%s"' % vba_root)
  317 + # extract all VBA macros from that VBA root storage:
  318 + _extract_vba(ole, vba_root, project_path, dir_path)
  319 +
  320 +
  321 +
  322 +def _extract_vba (ole, vba_root, project_path, dir_path):
  323 + """
  324 + Extract VBA macros from an OleFileIO object.
  325 + Internal function, do not call directly.
  326 +
  327 + vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream
  328 + project_path: path to the PROJECT stream
  329 + This is a generator, yielding (filename, stream path, VBA source code) for each VBA code stream
  330 + """
  331 + # Open the PROJECT stream:
  332 + project = ole.openstream(project_path)
274 333
275 # sample content of the PROJECT stream: 334 # sample content of the PROJECT stream:
276 335
@@ -313,11 +372,6 @@ def extract_macros(ole): @@ -313,11 +372,6 @@ def extract_macros(ole):
313 elif name == 'BaseClass': 372 elif name == 'BaseClass':
314 code_modules[value] = FORM_EXTENSION 373 code_modules[value] = FORM_EXTENSION
315 374
316 - # Find the dir stream  
317 - dir_path = vba_root + '/VBA/dir'  
318 - if not ole.exists(dir_path):  
319 - logging.debug('missing dir stream')  
320 - return None  
321 # read data from dir stream (compressed) 375 # read data from dir stream (compressed)
322 dir_compressed = ole.openstream(dir_path).read() 376 dir_compressed = ole.openstream(dir_path).read()
323 377
@@ -620,7 +674,7 @@ def extract_macros(ole): @@ -620,7 +674,7 @@ def extract_macros(ole):
620 logging.debug("StreamName = {0}".format(MODULESTREAMNAME_StreamName)) 674 logging.debug("StreamName = {0}".format(MODULESTREAMNAME_StreamName))
621 logging.debug("TextOffset = {0}".format(MODULEOFFSET_TextOffset)) 675 logging.debug("TextOffset = {0}".format(MODULEOFFSET_TextOffset))
622 676
623 - code_path = vba_root + '/VBA/' + MODULESTREAMNAME_StreamName 677 + code_path = vba_root + 'VBA/' + MODULESTREAMNAME_StreamName
624 #TODO: test if stream exists 678 #TODO: test if stream exists
625 code_data = ole.openstream(code_path).read() 679 code_data = ole.openstream(code_path).read()
626 logging.debug("length of code_data = {0}".format(len(code_data))) 680 logging.debug("length of code_data = {0}".format(len(code_data)))
@@ -642,6 +696,39 @@ def extract_macros(ole): @@ -642,6 +696,39 @@ def extract_macros(ole):
642 return 696 return
643 697
644 698
  699 +def extract_macros (filename):
  700 + if OleFileIO_PL.isOleFile(filename):
  701 + # This looks like an OLE file
  702 + logging.info('Extracting VBA Macros from OLE file %s' % filename)
  703 + ole = OleFileIO_PL.OleFileIO(filename)
  704 + extract_macros_ole(ole)
  705 + ole.close()
  706 + elif zipfile.is_zipfile(filename):
  707 + # This looks like a zip file, need to look for vbaProject.bin inside
  708 + #TODO: here we could even look for any OLE file inside the archive
  709 + #...because vbaProject.bin can be renamed:
  710 + # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
  711 + logging.info('Opening ZIP/OpenXML file %s' % filename)
  712 + z = zipfile.ZipFile(filename)
  713 + for f in z.namelist():
  714 + if f.lower().endswith('vbaproject.bin'):
  715 + logging.debug('Opening OLE VBA storage %s within zip' % f)
  716 + vbadata = z.open(f).read()
  717 + vbafile = cStringIO.StringIO(vbadata)
  718 + try:
  719 + ole = OleFileIO_PL.OleFileIO(vbafile)
  720 + except:
  721 + logging.debug('%s is not a valid OLE file' % f)
  722 + continue
  723 + logging.info('Extracting VBA Macros from %s/%s' % (filename, f))
  724 + extract_macros_ole(ole)
  725 + ole.close()
  726 + z.close()
  727 + else:
  728 + logging.error('%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % filename)
  729 +
  730 +
  731 +
645 #=== MAIN ===================================================================== 732 #=== MAIN =====================================================================
646 733
647 if __name__ == '__main__': 734 if __name__ == '__main__':
@@ -650,9 +737,7 @@ if __name__ == '__main__': @@ -650,9 +737,7 @@ if __name__ == '__main__':
650 print __doc__ 737 print __doc__
651 sys.exit(1) 738 sys.exit(1)
652 739
653 - logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) 740 + logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
654 741
655 - ole = OleFileIO_PL.OleFileIO(sys.argv[1])  
656 - extract_macros(ole) 742 + extract_macros(sys.argv[1])
657 743
658 - ole.close()