Commit 6bc009b3cea1eab1e2058acf5d911660ec8f4be8
1 parent
5a23b490
olevba 0.03: added support for OpenXML formats, can find the VBA project root anywhere in the file
Showing
1 changed file
with
121 additions
and
36 deletions
oletools/olevba.py
| 1 | 1 | #!/usr/bin/env python |
| 2 | 2 | """ |
| 3 | -olevba.py v0.02 2014-08-15 | |
| 3 | +olevba.py v0.03 2014-08-15 | |
| 4 | 4 | |
| 5 | -olevba is a script to parse OLE files such as MS Office documents (e.g. Word, | |
| 6 | -Excel), to extract VBA Macro code in clear text. | |
| 5 | +olevba is a script to parse OLE and OpenXML files such as MS Office documents | |
| 6 | +(e.g. Word, Excel), to extract VBA Macro code in clear text. | |
| 7 | 7 | |
| 8 | -olevba project website: http://www.decalage.info/python/olevba | |
| 8 | +Supported formats: | |
| 9 | +- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) | |
| 10 | +- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) | |
| 11 | +- PowerPoint 2007+ (.pptm, .ppsm) | |
| 12 | + | |
| 13 | +Author: Philippe Lagadec - http://www.decalage.info | |
| 14 | +License: BSD, see source code or documentation | |
| 9 | 15 | |
| 10 | 16 | olevba is part of the python-oletools package: |
| 11 | 17 | http://www.decalage.info/python/oletools |
| 12 | 18 | |
| 19 | +olevba is based on source code from officeparser by John William Davison | |
| 20 | +https://github.com/unixfreak0037/officeparser | |
| 21 | + | |
| 13 | 22 | Usage: olevba.py <file> |
| 14 | 23 | """ |
| 15 | 24 | |
| 16 | -__version__ = '0.02' | |
| 25 | +__version__ = '0.03' | |
| 17 | 26 | |
| 18 | 27 | #=== LICENSE ================================================================== |
| 19 | 28 | |
| ... | ... | @@ -69,16 +78,22 @@ __version__ = '0.02' |
| 69 | 78 | # 2014-08-05 v0.01 PL: - first version based on officeparser code |
| 70 | 79 | # 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser |
| 71 | 80 | # 2014-08-15 PL: - fixed incorrect value check in PROJECTHELPFILEPATH Record |
| 81 | +# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats | |
| 82 | +# and to find the VBA project root anywhere in the file | |
| 72 | 83 | |
| 73 | 84 | #------------------------------------------------------------------------------ |
| 74 | 85 | # TODO: |
| 86 | +# + extract_macros should yield filename, code | |
| 75 | 87 | # + optparse |
| 76 | 88 | # + nicer output |
| 77 | -# + output to file | |
| 78 | 89 | # + setup logging (common with other oletools) |
| 79 | -# + support OpenXML files | |
| 90 | +# + update readme, wiki and decalage.info, pypi (link to sample files) | |
| 91 | + | |
| 92 | +# TODO later: | |
| 93 | +# + output to file | |
| 80 | 94 | # + process several files in dirs or zips with password |
| 81 | 95 | # + look for VBA in embedded documents (e.g. Excel in Word) |
| 96 | +# + support SRP streams (see Lenny's article + links and sample) | |
| 82 | 97 | # - python 3.x support |
| 83 | 98 | # - add support for PowerPoint 97-2003 macros (see libclamav, libgsf) |
| 84 | 99 | # - check VBA macros in Visio, Access, Project, etc |
| ... | ... | @@ -99,6 +114,7 @@ import sys, logging |
| 99 | 114 | import struct |
| 100 | 115 | import cStringIO |
| 101 | 116 | import math |
| 117 | +import zipfile | |
| 102 | 118 | |
| 103 | 119 | from thirdparty.OleFileIO_PL import OleFileIO_PL |
| 104 | 120 | |
| ... | ... | @@ -248,29 +264,72 @@ def decompress_stream (compressed_container): |
| 248 | 264 | return decompressed_container |
| 249 | 265 | |
| 250 | 266 | |
| 251 | -def extract_macros(ole): | |
| 267 | +def extract_macros_ole(ole): | |
| 252 | 268 | """ |
| 253 | 269 | Extract VBA macros from an OLE file |
| 254 | 270 | """ |
| 255 | 271 | # Find the VBA project root (different in MS Word, Excel, etc): |
| 256 | - vba_root = None | |
| 257 | - for stream in ('Macros', '_VBA_PROJECT_CUR'): | |
| 258 | - if ole.exists(stream): | |
| 259 | - logging.debug('found VBA root stream: %s' % stream) | |
| 260 | - vba_root = stream | |
| 261 | - break | |
| 262 | - if vba_root is None: | |
| 263 | - logging.debug('VBA root stream not found') | |
| 264 | - return None | |
| 265 | - # Find the PROJECT stream: | |
| 266 | - project = None | |
| 267 | - project_path = vba_root + '/PROJECT' | |
| 268 | - if ole.exists(project_path): | |
| 269 | - logging.debug('found PROJECT stream: %s' % project_path) | |
| 270 | - project = ole.openstream(project_path) | |
| 271 | - else: | |
| 272 | - logging.debug('missing PROJECT stream') | |
| 273 | - return None | |
| 272 | + # - Word 97-2003: Macros | |
| 273 | + # - Excel 97-2003: _VBA_PROJECT_CUR | |
| 274 | + # - PowerPoint 97-2003: not supported yet (different file structure) | |
| 275 | + # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin. | |
| 276 | + # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word | |
| 277 | + # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word | |
| 278 | + # - Visio 2007: not supported yet (different file structure) | |
| 279 | + | |
| 280 | + # According to MS-OVBA section 2.2.1: | |
| 281 | + # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream | |
| 282 | + # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream | |
| 283 | + # - all names are case-insensitive | |
| 284 | + | |
| 285 | + # Look for any storage containing those storage/streams: | |
| 286 | + for storage in ole.listdir(streams=False, storages=True): | |
| 287 | + # Look for a storage ending with "VBA": | |
| 288 | + if storage[-1].upper() == 'VBA': | |
| 289 | + logging.debug('Found VBA storage: %s' % ('/'.join(storage))) | |
| 290 | + vba_root = '/'.join(storage[:-1]) | |
| 291 | + # Add a trailing slash to vba_root, unless it is the root of the OLE file: | |
| 292 | + # (used later to append all the child streams/storages) | |
| 293 | + if vba_root != '': | |
| 294 | + vba_root += '/' | |
| 295 | + logging.debug('Checking vba_root="%s"' % vba_root) | |
| 296 | + | |
| 297 | + def check_vba_stream(ole, vba_root, stream_path): | |
| 298 | + full_path = vba_root + stream_path | |
| 299 | + if ole.exists(full_path) and ole.get_type(full_path) == OleFileIO_PL.STGTY_STREAM: | |
| 300 | + logging.debug('Found %s stream: %s' % (stream_path, full_path)) | |
| 301 | + return full_path | |
| 302 | + else: | |
| 303 | + logging.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path) | |
| 304 | + return False | |
| 305 | + | |
| 306 | + # Check if the VBA root storage also contains a PROJECT stream: | |
| 307 | + project_path = check_vba_stream(ole, vba_root, 'PROJECT') | |
| 308 | + if not project_path: continue | |
| 309 | + # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream: | |
| 310 | + vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT') | |
| 311 | + if not vba_project_path: continue | |
| 312 | + # Check if the VBA root storage also contains a VBA/dir stream: | |
| 313 | + dir_path = check_vba_stream(ole, vba_root, 'VBA/dir') | |
| 314 | + if not dir_path: continue | |
| 315 | + # Now we are pretty sure it is a VBA project structure | |
| 316 | + logging.debug('VBA root storage: "%s"' % vba_root) | |
| 317 | + # extract all VBA macros from that VBA root storage: | |
| 318 | + _extract_vba(ole, vba_root, project_path, dir_path) | |
| 319 | + | |
| 320 | + | |
| 321 | + | |
| 322 | +def _extract_vba (ole, vba_root, project_path, dir_path): | |
| 323 | + """ | |
| 324 | + Extract VBA macros from an OleFileIO object. | |
| 325 | + Internal function, do not call directly. | |
| 326 | + | |
| 327 | + vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream | |
| 328 | + project_path: path to the PROJECT stream; dir_path: path to the VBA/dir stream | |
| 329 | + Intended to be a generator, yielding (filename, stream path, VBA source code) for each VBA code stream (TODO confirm: extract_macros_ole currently discards the return value) | |
| 330 | + """ | |
| 331 | + # Open the PROJECT stream: | |
| 332 | + project = ole.openstream(project_path) | |
| 274 | 333 | |
| 275 | 334 | # sample content of the PROJECT stream: |
| 276 | 335 | |
| ... | ... | @@ -313,11 +372,6 @@ def extract_macros(ole): |
| 313 | 372 | elif name == 'BaseClass': |
| 314 | 373 | code_modules[value] = FORM_EXTENSION |
| 315 | 374 | |
| 316 | - # Find the dir stream | |
| 317 | - dir_path = vba_root + '/VBA/dir' | |
| 318 | - if not ole.exists(dir_path): | |
| 319 | - logging.debug('missing dir stream') | |
| 320 | - return None | |
| 321 | 375 | # read data from dir stream (compressed) |
| 322 | 376 | dir_compressed = ole.openstream(dir_path).read() |
| 323 | 377 | |
| ... | ... | @@ -620,7 +674,7 @@ def extract_macros(ole): |
| 620 | 674 | logging.debug("StreamName = {0}".format(MODULESTREAMNAME_StreamName)) |
| 621 | 675 | logging.debug("TextOffset = {0}".format(MODULEOFFSET_TextOffset)) |
| 622 | 676 | |
| 623 | - code_path = vba_root + '/VBA/' + MODULESTREAMNAME_StreamName | |
| 677 | + code_path = vba_root + 'VBA/' + MODULESTREAMNAME_StreamName | |
| 624 | 678 | #TODO: test if stream exists |
| 625 | 679 | code_data = ole.openstream(code_path).read() |
| 626 | 680 | logging.debug("length of code_data = {0}".format(len(code_data))) |
| ... | ... | @@ -642,6 +696,39 @@ def extract_macros(ole): |
| 642 | 696 | return |
| 643 | 697 | |
| 644 | 698 | |
| 699 | +def extract_macros (filename): | |
| 700 | + if OleFileIO_PL.isOleFile(filename): | |
| 701 | + # This looks like an OLE file | |
| 702 | + logging.info('Extracting VBA Macros from OLE file %s' % filename) | |
| 703 | + ole = OleFileIO_PL.OleFileIO(filename) | |
| 704 | + extract_macros_ole(ole) | |
| 705 | + ole.close() | |
| 706 | + elif zipfile.is_zipfile(filename): | |
| 707 | + # This looks like a zip file, need to look for vbaProject.bin inside | |
| 708 | + #TODO: here we could even look for any OLE file inside the archive | |
| 709 | + #...because vbaProject.bin can be renamed: | |
| 710 | + # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18 | |
| 711 | + logging.info('Opening ZIP/OpenXML file %s' % filename) | |
| 712 | + z = zipfile.ZipFile(filename) | |
| 713 | + for f in z.namelist(): | |
| 714 | + if f.lower().endswith('vbaproject.bin'): | |
| 715 | + logging.debug('Opening OLE VBA storage %s within zip' % f) | |
| 716 | + vbadata = z.open(f).read() | |
| 717 | + vbafile = cStringIO.StringIO(vbadata) | |
| 718 | + try: | |
| 719 | + ole = OleFileIO_PL.OleFileIO(vbafile) | |
| 720 | + except: | |
| 721 | + logging.debug('%s is not a valid OLE file' % f) | |
| 722 | + continue | |
| 723 | + logging.info('Extracting VBA Macros from %s/%s' % (filename, f)) | |
| 724 | + extract_macros_ole(ole) | |
| 725 | + ole.close() | |
| 726 | + z.close() | |
| 727 | + else: | |
| 728 | + logging.error('%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % filename) | |
| 729 | + | |
| 730 | + | |
| 731 | + | |
| 645 | 732 | #=== MAIN ===================================================================== |
| 646 | 733 | |
| 647 | 734 | if __name__ == '__main__': |
| ... | ... | @@ -650,9 +737,7 @@ if __name__ == '__main__': |
| 650 | 737 | print __doc__ |
| 651 | 738 | sys.exit(1) |
| 652 | 739 | |
| 653 | - logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) | |
| 740 | + logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO) | |
| 654 | 741 | |
| 655 | - ole = OleFileIO_PL.OleFileIO(sys.argv[1]) | |
| 656 | - extract_macros(ole) | |
| 742 | + extract_macros(sys.argv[1]) | |
| 657 | 743 | |
| 658 | - ole.close() | ... | ... |