Commit 6bc009b3cea1eab1e2058acf5d911660ec8f4be8
1 parent
5a23b490
olevba 0.03: added support for OpenXML formats, can find the VBA project root anywhere in the file
Showing
1 changed file
with
121 additions
and
36 deletions
oletools/olevba.py
| 1 | #!/usr/bin/env python | 1 | #!/usr/bin/env python |
| 2 | """ | 2 | """ |
| 3 | -olevba.py v0.02 2014-08-15 | 3 | +olevba.py v0.03 2014-08-15 |
| 4 | 4 | ||
| 5 | -olevba is a script to parse OLE files such as MS Office documents (e.g. Word, | ||
| 6 | -Excel), to extract VBA Macro code in clear text. | 5 | +olevba is a script to parse OLE and OpenXML files such as MS Office documents |
| 6 | +(e.g. Word, Excel), to extract VBA Macro code in clear text. | ||
| 7 | 7 | ||
| 8 | -olevba project website: http://www.decalage.info/python/olevba | 8 | +Supported formats: |
| 9 | +- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) | ||
| 10 | +- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) | ||
| 11 | +- PowerPoint 2007+ (.pptm, .ppsm) | ||
| 12 | + | ||
| 13 | +Author: Philippe Lagadec - http://www.decalage.info | ||
| 14 | +License: BSD, see source code or documentation | ||
| 9 | 15 | ||
| 10 | olevba is part of the python-oletools package: | 16 | olevba is part of the python-oletools package: |
| 11 | http://www.decalage.info/python/oletools | 17 | http://www.decalage.info/python/oletools |
| 12 | 18 | ||
| 19 | +olevba is based on source code from officeparser by John William Davison | ||
| 20 | +https://github.com/unixfreak0037/officeparser | ||
| 21 | + | ||
| 13 | Usage: olevba.py <file> | 22 | Usage: olevba.py <file> |
| 14 | """ | 23 | """ |
| 15 | 24 | ||
| 16 | -__version__ = '0.02' | 25 | +__version__ = '0.03' |
| 17 | 26 | ||
| 18 | #=== LICENSE ================================================================== | 27 | #=== LICENSE ================================================================== |
| 19 | 28 | ||
| @@ -69,16 +78,22 @@ __version__ = '0.02' | @@ -69,16 +78,22 @@ __version__ = '0.02' | ||
| 69 | # 2014-08-05 v0.01 PL: - first version based on officeparser code | 78 | # 2014-08-05 v0.01 PL: - first version based on officeparser code |
| 70 | # 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser | 79 | # 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser |
| 71 | # 2014-08-15 PL: - fixed incorrect value check in PROJECTHELPFILEPATH Record | 80 | # 2014-08-15 PL: - fixed incorrect value check in PROJECTHELPFILEPATH Record |
| 81 | +# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats | ||
| 82 | +# and to find the VBA project root anywhere in the file | ||
| 72 | 83 | ||
| 73 | #------------------------------------------------------------------------------ | 84 | #------------------------------------------------------------------------------ |
| 74 | # TODO: | 85 | # TODO: |
| 86 | +# + extract_macros should yield filename, code | ||
| 75 | # + optparse | 87 | # + optparse |
| 76 | # + nicer output | 88 | # + nicer output |
| 77 | -# + output to file | ||
| 78 | # + setup logging (common with other oletools) | 89 | # + setup logging (common with other oletools) |
| 79 | -# + support OpenXML files | 90 | +# + update readme, wiki and decalage.info, pypi (link to sample files) |
| 91 | + | ||
| 92 | +# TODO later: | ||
| 93 | +# + output to file | ||
| 80 | # + process several files in dirs or zips with password | 94 | # + process several files in dirs or zips with password |
| 81 | # + look for VBA in embedded documents (e.g. Excel in Word) | 95 | # + look for VBA in embedded documents (e.g. Excel in Word) |
| 96 | +# + support SRP streams (see Lenny's article + links and sample) | ||
| 82 | # - python 3.x support | 97 | # - python 3.x support |
| 83 | # - add support for PowerPoint macros (see libclamav, libgsf) | 98 | # - add support for PowerPoint macros (see libclamav, libgsf) |
| 84 | # - check VBA macros in Visio, Access, Project, etc | 99 | # - check VBA macros in Visio, Access, Project, etc |
| @@ -99,6 +114,7 @@ import sys, logging | @@ -99,6 +114,7 @@ import sys, logging | ||
| 99 | import struct | 114 | import struct |
| 100 | import cStringIO | 115 | import cStringIO |
| 101 | import math | 116 | import math |
| 117 | +import zipfile | ||
| 102 | 118 | ||
| 103 | from thirdparty.OleFileIO_PL import OleFileIO_PL | 119 | from thirdparty.OleFileIO_PL import OleFileIO_PL |
| 104 | 120 | ||
| @@ -248,29 +264,72 @@ def decompress_stream (compressed_container): | @@ -248,29 +264,72 @@ def decompress_stream (compressed_container): | ||
| 248 | return decompressed_container | 264 | return decompressed_container |
| 249 | 265 | ||
| 250 | 266 | ||
| 251 | -def extract_macros(ole): | 267 | +def extract_macros_ole(ole): |
| 252 | """ | 268 | """ |
| 253 | Extract VBA macros from an OLE file | 269 | Extract VBA macros from an OLE file |
| 254 | """ | 270 | """ |
| 255 | # Find the VBA project root (different in MS Word, Excel, etc): | 271 | # Find the VBA project root (different in MS Word, Excel, etc): |
| 256 | - vba_root = None | ||
| 257 | - for stream in ('Macros', '_VBA_PROJECT_CUR'): | ||
| 258 | - if ole.exists(stream): | ||
| 259 | - logging.debug('found VBA root stream: %s' % stream) | ||
| 260 | - vba_root = stream | ||
| 261 | - break | ||
| 262 | - if vba_root is None: | ||
| 263 | - logging.debug('VBA root stream not found') | ||
| 264 | - return None | ||
| 265 | - # Find the PROJECT stream: | ||
| 266 | - project = None | ||
| 267 | - project_path = vba_root + '/PROJECT' | ||
| 268 | - if ole.exists(project_path): | ||
| 269 | - logging.debug('found PROJECT stream: %s' % project_path) | ||
| 270 | - project = ole.openstream(project_path) | ||
| 271 | - else: | ||
| 272 | - logging.debug('missing PROJECT stream') | ||
| 273 | - return None | 272 | + # - Word 97-2003: Macros |
| 273 | + # - Excel 97-2003: _VBA_PROJECT_CUR | ||
| 274 | + # - PowerPoint 97-2003: not supported yet (different file structure) | ||
| 275 | + # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin. | ||
| 276 | + # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word | ||
| 277 | + # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word | ||
| 278 | + # - Visio 2007: not supported yet (different file structure) | ||
| 279 | + | ||
| 280 | + # According to MS-OVBA section 2.2.1: | ||
| 281 | + # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream | ||
| 282 | + # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream | ||
| 283 | + # - all names are case-insensitive | ||
| 284 | + | ||
| 285 | + # Look for any storage containing those storage/streams: | ||
| 286 | + for storage in ole.listdir(streams=False, storages=True): | ||
| 287 | + # Look for a storage ending with "VBA": | ||
| 288 | + if storage[-1].upper() == 'VBA': | ||
| 289 | + logging.debug('Found VBA storage: %s' % ('/'.join(storage))) | ||
| 290 | + vba_root = '/'.join(storage[:-1]) | ||
| 291 | + # Add a trailing slash to vba_root, unless it is the root of the OLE file: | ||
| 292 | + # (used later to append all the child streams/storages) | ||
| 293 | + if vba_root != '': | ||
| 294 | + vba_root += '/' | ||
| 295 | + logging.debug('Checking vba_root="%s"' % vba_root) | ||
| 296 | + | ||
| 297 | + def check_vba_stream(ole, vba_root, stream_path): | ||
| 298 | + full_path = vba_root + stream_path | ||
| 299 | + if ole.exists(full_path) and ole.get_type(full_path) == OleFileIO_PL.STGTY_STREAM: | ||
| 300 | + logging.debug('Found %s stream: %s' % (stream_path, full_path)) | ||
| 301 | + return full_path | ||
| 302 | + else: | ||
| 303 | + logging.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path) | ||
| 304 | + return False | ||
| 305 | + | ||
| 306 | + # Check if the VBA root storage also contains a PROJECT stream: | ||
| 307 | + project_path = check_vba_stream(ole, vba_root, 'PROJECT') | ||
| 308 | + if not project_path: continue | ||
| 309 | + # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream: | ||
| 310 | + vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT') | ||
| 311 | + if not vba_project_path: continue | ||
| 312 | + # Check if the VBA root storage also contains a VBA/dir stream: | ||
| 313 | + dir_path = check_vba_stream(ole, vba_root, 'VBA/dir') | ||
| 314 | + if not dir_path: continue | ||
| 315 | + # Now we are pretty sure it is a VBA project structure | ||
| 316 | + logging.debug('VBA root storage: "%s"' % vba_root) | ||
| 317 | + # extract all VBA macros from that VBA root storage: | ||
| 318 | + _extract_vba(ole, vba_root, project_path, dir_path) | ||
| 319 | + | ||
| 320 | + | ||
| 321 | + | ||
| 322 | +def _extract_vba (ole, vba_root, project_path, dir_path): | ||
| 323 | + """ | ||
| 324 | + Extract VBA macros from an OleFileIO object. | ||
| 325 | + Internal function, do not call directly. | ||
| 326 | + | ||
| 327 | + vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream | ||
| 328 | + project_path: path to the PROJECT stream | ||
| 329 | + This is a generator, yielding (filename, stream path, VBA source code) for each VBA code stream | ||
| 330 | + """ | ||
| 331 | + # Open the PROJECT stream: | ||
| 332 | + project = ole.openstream(project_path) | ||
| 274 | 333 | ||
| 275 | # sample content of the PROJECT stream: | 334 | # sample content of the PROJECT stream: |
| 276 | 335 | ||
| @@ -313,11 +372,6 @@ def extract_macros(ole): | @@ -313,11 +372,6 @@ def extract_macros(ole): | ||
| 313 | elif name == 'BaseClass': | 372 | elif name == 'BaseClass': |
| 314 | code_modules[value] = FORM_EXTENSION | 373 | code_modules[value] = FORM_EXTENSION |
| 315 | 374 | ||
| 316 | - # Find the dir stream | ||
| 317 | - dir_path = vba_root + '/VBA/dir' | ||
| 318 | - if not ole.exists(dir_path): | ||
| 319 | - logging.debug('missing dir stream') | ||
| 320 | - return None | ||
| 321 | # read data from dir stream (compressed) | 375 | # read data from dir stream (compressed) |
| 322 | dir_compressed = ole.openstream(dir_path).read() | 376 | dir_compressed = ole.openstream(dir_path).read() |
| 323 | 377 | ||
| @@ -620,7 +674,7 @@ def extract_macros(ole): | @@ -620,7 +674,7 @@ def extract_macros(ole): | ||
| 620 | logging.debug("StreamName = {0}".format(MODULESTREAMNAME_StreamName)) | 674 | logging.debug("StreamName = {0}".format(MODULESTREAMNAME_StreamName)) |
| 621 | logging.debug("TextOffset = {0}".format(MODULEOFFSET_TextOffset)) | 675 | logging.debug("TextOffset = {0}".format(MODULEOFFSET_TextOffset)) |
| 622 | 676 | ||
| 623 | - code_path = vba_root + '/VBA/' + MODULESTREAMNAME_StreamName | 677 | + code_path = vba_root + 'VBA/' + MODULESTREAMNAME_StreamName |
| 624 | #TODO: test if stream exists | 678 | #TODO: test if stream exists |
| 625 | code_data = ole.openstream(code_path).read() | 679 | code_data = ole.openstream(code_path).read() |
| 626 | logging.debug("length of code_data = {0}".format(len(code_data))) | 680 | logging.debug("length of code_data = {0}".format(len(code_data))) |
| @@ -642,6 +696,39 @@ def extract_macros(ole): | @@ -642,6 +696,39 @@ def extract_macros(ole): | ||
| 642 | return | 696 | return |
| 643 | 697 | ||
| 644 | 698 | ||
| 699 | +def extract_macros (filename): | ||
| 700 | + if OleFileIO_PL.isOleFile(filename): | ||
| 701 | + # This looks like an OLE file | ||
| 702 | + logging.info('Extracting VBA Macros from OLE file %s' % filename) | ||
| 703 | + ole = OleFileIO_PL.OleFileIO(filename) | ||
| 704 | + extract_macros_ole(ole) | ||
| 705 | + ole.close() | ||
| 706 | + elif zipfile.is_zipfile(filename): | ||
| 707 | + # This looks like a zip file, need to look for vbaProject.bin inside | ||
| 708 | + #TODO: here we could even look for any OLE file inside the archive | ||
| 709 | + #...because vbaProject.bin can be renamed: | ||
| 710 | + # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18 | ||
| 711 | + logging.info('Opening ZIP/OpenXML file %s' % filename) | ||
| 712 | + z = zipfile.ZipFile(filename) | ||
| 713 | + for f in z.namelist(): | ||
| 714 | + if f.lower().endswith('vbaproject.bin'): | ||
| 715 | + logging.debug('Opening OLE VBA storage %s within zip' % f) | ||
| 716 | + vbadata = z.open(f).read() | ||
| 717 | + vbafile = cStringIO.StringIO(vbadata) | ||
| 718 | + try: | ||
| 719 | + ole = OleFileIO_PL.OleFileIO(vbafile) | ||
| 720 | + except: | ||
| 721 | + logging.debug('%s is not a valid OLE file' % f) | ||
| 722 | + continue | ||
| 723 | + logging.info('Extracting VBA Macros from %s/%s' % (filename, f)) | ||
| 724 | + extract_macros_ole(ole) | ||
| 725 | + ole.close() | ||
| 726 | + z.close() | ||
| 727 | + else: | ||
| 728 | + logging.error('%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % filename) | ||
| 729 | + | ||
| 730 | + | ||
| 731 | + | ||
| 645 | #=== MAIN ===================================================================== | 732 | #=== MAIN ===================================================================== |
| 646 | 733 | ||
| 647 | if __name__ == '__main__': | 734 | if __name__ == '__main__': |
| @@ -650,9 +737,7 @@ if __name__ == '__main__': | @@ -650,9 +737,7 @@ if __name__ == '__main__': | ||
| 650 | print __doc__ | 737 | print __doc__ |
| 651 | sys.exit(1) | 738 | sys.exit(1) |
| 652 | 739 | ||
| 653 | - logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) | 740 | + logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO) |
| 654 | 741 | ||
| 655 | - ole = OleFileIO_PL.OleFileIO(sys.argv[1]) | ||
| 656 | - extract_macros(ole) | 742 | + extract_macros(sys.argv[1]) |
| 657 | 743 | ||
| 658 | - ole.close() |