Commit 698fabf4a0f65263bcc254420b388cf081701afe
1 parent
31b5fcab
olevba: refactored the code, now with a class-based API and a new method to detect VBA macros
Showing 1 changed file with 242 additions and 99 deletions
oletools/olevba.py
| ... | ... | @@ -79,12 +79,16 @@ Usage: olevba.py <file> |
| 79 | 79 | # 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats |
| 80 | 80 | # and to find the VBA project root anywhere in the file |
| 81 | 81 | # 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL |
| 82 | +# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API | |
| 83 | +# - added detect_vba_macros | |
| 82 | 84 | |
| 83 | -__version__ = '0.04' | |
| 85 | +__version__ = '0.05' | |
| 84 | 86 | |
| 85 | 87 | #------------------------------------------------------------------------------ |
| 86 | 88 | # TODO: |
| 87 | -# + extract_macros should yield filename, code | |
| 89 | +# + do not use logging, but a provided logger (null logger by default) | |
| 90 | +# + by default, do not display empty macros containing only lines with Attribute=... (option) | |
| 91 | +# (...unless it can be used to hide code: to be tested) | |
| 88 | 92 | # + optparse |
| 89 | 93 | # + nicer output |
| 90 | 94 | # + setup logging (common with other oletools) |
| ... | ... | @@ -125,8 +129,6 @@ MODULE_EXTENSION = "bas" |
| 125 | 129 | CLASS_EXTENSION = "cls" |
| 126 | 130 | FORM_EXTENSION = "frm" |
| 127 | 131 | |
| 128 | -BINFILE_PATH = "xl/vbaProject.bin" | |
| 129 | - | |
| 130 | 132 | |
| 131 | 133 | #--- FUNCTIONS ---------------------------------------------------------------- |
| 132 | 134 | |
| ... | ... | @@ -265,61 +267,6 @@ def decompress_stream (compressed_container): |
| 265 | 267 | return decompressed_container |
| 266 | 268 | |
| 267 | 269 | |
| 268 | -def extract_macros_ole(ole): | |
| 269 | - """ | |
| 270 | - Extract VBA macros from an OLE file | |
| 271 | - """ | |
| 272 | - # Find the VBA project root (different in MS Word, Excel, etc): | |
| 273 | - # - Word 97-2003: Macros | |
| 274 | - # - Excel 97-2003: _VBA_PROJECT_CUR | |
| 275 | - # - PowerPoint 97-2003: not supported yet (different file structure) | |
| 276 | - # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin. | |
| 277 | - # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word | |
| 278 | - # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word | |
| 279 | - # - Visio 2007: not supported yet (different file structure) | |
| 280 | - | |
| 281 | - # According to MS-OVBA section 2.2.1: | |
| 282 | - # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream | |
| 283 | - # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream | |
| 284 | - # - all names are case-insensitive | |
| 285 | - | |
| 286 | - # Look for any storage containing those storage/streams: | |
| 287 | - for storage in ole.listdir(streams=False, storages=True): | |
| 288 | - # Look for a storage ending with "VBA": | |
| 289 | - if storage[-1].upper() == 'VBA': | |
| 290 | - logging.debug('Found VBA storage: %s' % ('/'.join(storage))) | |
| 291 | - vba_root = '/'.join(storage[:-1]) | |
| 292 | - # Add a trailing slash to vba_root, unless it is the root of the OLE file: | |
| 293 | - # (used later to append all the child streams/storages) | |
| 294 | - if vba_root != '': | |
| 295 | - vba_root += '/' | |
| 296 | - logging.debug('Checking vba_root="%s"' % vba_root) | |
| 297 | - | |
| 298 | - def check_vba_stream(ole, vba_root, stream_path): | |
| 299 | - full_path = vba_root + stream_path | |
| 300 | - if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM: | |
| 301 | - logging.debug('Found %s stream: %s' % (stream_path, full_path)) | |
| 302 | - return full_path | |
| 303 | - else: | |
| 304 | - logging.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path) | |
| 305 | - return False | |
| 306 | - | |
| 307 | - # Check if the VBA root storage also contains a PROJECT stream: | |
| 308 | - project_path = check_vba_stream(ole, vba_root, 'PROJECT') | |
| 309 | - if not project_path: continue | |
| 310 | - # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream: | |
| 311 | - vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT') | |
| 312 | - if not vba_project_path: continue | |
| 313 | - # Check if the VBA root storage also contains a VBA/dir stream: | |
| 314 | - dir_path = check_vba_stream(ole, vba_root, 'VBA/dir') | |
| 315 | - if not dir_path: continue | |
| 316 | - # Now we are pretty sure it is a VBA project structure | |
| 317 | - logging.debug('VBA root storage: "%s"' % vba_root) | |
| 318 | - # extract all VBA macros from that VBA root storage: | |
| 319 | - _extract_vba(ole, vba_root, project_path, dir_path) | |
| 320 | - | |
| 321 | - | |
| 322 | - | |
| 323 | 270 | def _extract_vba (ole, vba_root, project_path, dir_path): |
| 324 | 271 | """ |
| 325 | 272 | Extract VBA macros from an OleFileIO object. |
| ... | ... | @@ -327,7 +274,7 @@ def _extract_vba (ole, vba_root, project_path, dir_path): |
| 327 | 274 | |
| 328 | 275 | vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream |
| 329 | 276 | project_path: path to the PROJECT stream |
| 330 | - This is a generator, yielding (filename, stream path, VBA source code) for each VBA code stream | |
| 277 | + This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream | |
| 331 | 278 | """ |
| 332 | 279 | # Open the PROJECT stream: |
| 333 | 280 | project = ole.openstream(project_path) |
| ... | ... | @@ -685,49 +632,225 @@ def _extract_vba (ole, vba_root, project_path, dir_path): |
| 685 | 632 | code_data = decompress_stream(code_data) |
| 686 | 633 | filext = code_modules[MODULENAME_ModuleName] |
| 687 | 634 | filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext) |
| 688 | - #TODO: return list of strings or dict instead of printing | |
| 689 | - print '-'*79 | |
| 690 | - print filename | |
| 691 | - print '' | |
| 692 | - print code_data | |
| 693 | - print '' | |
| 635 | + yield (code_path, filename, code_data) | |
| 636 | + # print '-'*79 | |
| 637 | + # print filename | |
| 638 | + # print '' | |
| 639 | + # print code_data | |
| 640 | + # print '' | |
| 694 | 641 | logging.debug('extracted file {0}'.format(filename)) |
| 695 | 642 | else: |
| 696 | 643 | logging.warning("module stream {0} has code data length 0".format(MODULESTREAMNAME_StreamName)) |
| 697 | 644 | return |
| 698 | 645 | |
| 699 | 646 | |
| 700 | -def extract_macros (filename): | |
| 701 | - if olefile.isOleFile(filename): | |
| 702 | - # This looks like an OLE file | |
| 703 | - logging.info('Extracting VBA Macros from OLE file %s' % filename) | |
| 704 | - ole = olefile.OleFileIO(filename) | |
| 705 | - extract_macros_ole(ole) | |
| 706 | - ole.close() | |
| 707 | - elif zipfile.is_zipfile(filename): | |
| 708 | - # This looks like a zip file, need to look for vbaProject.bin inside | |
| 709 | - #TODO: here we should look for any OLE file inside the archive | |
| 710 | - #...because vbaProject.bin can be renamed: | |
| 711 | - # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18 | |
| 712 | - logging.info('Opening ZIP/OpenXML file %s' % filename) | |
| 713 | - z = zipfile.ZipFile(filename) | |
| 714 | - for f in z.namelist(): | |
| 715 | - if f.lower().endswith('vbaproject.bin'): | |
| 716 | - logging.debug('Opening OLE VBA storage %s within zip' % f) | |
| 717 | - vbadata = z.open(f).read() | |
| 718 | - vbafile = cStringIO.StringIO(vbadata) | |
| 719 | - try: | |
| 720 | - ole = olefile.OleFileIO(vbafile) | |
| 721 | - except: | |
| 722 | - logging.debug('%s is not a valid OLE file' % f) | |
| 723 | - continue | |
| 724 | - logging.info('Extracting VBA Macros from %s/%s' % (filename, f)) | |
| 725 | - extract_macros_ole(ole) | |
| 726 | - ole.close() | |
| 727 | - z.close() | |
| 728 | - else: | |
| 729 | - logging.error('%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % filename) | |
| 647 | +#=== CLASSES ================================================================= | |
| 648 | + | |
| 649 | +class VBA_Parser(object): | |
| 650 | + """ | |
| 651 | + Class to parse MS Office files, to detect VBA macros and extract VBA source code | |
| 652 | + Supported file formats: | |
| 653 | + - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) | |
| 654 | + - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) | |
| 655 | + - PowerPoint 2007+ (.pptm, .ppsm) | |
| 656 | + """ | |
| 730 | 657 | |
| 658 | + def __init__(self, _file, filename=None): | |
| 659 | + """ | |
| 660 | + Constructor for VBA_Parser | |
| 661 | + | |
| 662 | + :param _file: path of file to parse, file-like object or file content | |
| 663 | + :param filename: actual filename if _file is a file-like object or file content | |
| 664 | + in a bytes string | |
| 665 | + """ | |
| 666 | + #TODO: also support olefile and zipfile as input | |
| 667 | + self.file = _file | |
| 668 | + self.ole_file = None | |
| 669 | + self.ole_subfiles = [] | |
| 670 | + self.filename = filename | |
| 671 | + self.type = None | |
| 672 | + self.vba_projects = None | |
| 673 | + if filename is None: | |
| 674 | + if isinstance(_file, basestring): | |
| 675 | + if len(_file) < olefile.MINIMAL_OLEFILE_SIZE: | |
| 676 | + self.filename = _file | |
| 677 | + else: | |
| 678 | + self.filename = '<file in bytes string>' | |
| 679 | + else: | |
| 680 | + self.filename = '<file-like object>' | |
| 681 | + if olefile.isOleFile(_file): | |
| 682 | + # This looks like an OLE file | |
| 683 | + logging.info('Parsing OLE file %s' % self.filename) | |
| 684 | + self.ole_file = olefile.OleFileIO(_file) | |
| 685 | + self.type = 'OLE' | |
| 686 | + elif zipfile.is_zipfile(_file): | |
| 687 | + # This looks like a zip file, need to look for vbaProject.bin inside | |
| 688 | + # It can be any OLE file inside the archive | |
| 689 | + #...because vbaProject.bin can be renamed: | |
| 690 | + # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18 | |
| 691 | + logging.info('Opening ZIP/OpenXML file %s' % self.filename) | |
| 692 | + self.type = 'OpenXML' | |
| 693 | + z = zipfile.ZipFile(_file) | |
| 694 | + #TODO: check if this is actually an OpenXML file | |
| 695 | + # check each file within the zip if it is an OLE file, by reading its magic: | |
| 696 | + for subfile in z.namelist(): | |
| 697 | + magic = z.open(subfile).read(len(olefile.MAGIC)) | |
| 698 | + if magic == olefile.MAGIC: | |
| 699 | + logging.debug('Opening OLE file %s within zip' % subfile) | |
| 700 | + ole_data = z.open(subfile).read() | |
| 701 | + try: | |
| 702 | + self.ole_subfiles.append(VBA_Parser(ole_data, filename=subfile)) | |
| 703 | + except: | |
| 704 | + logging.debug('%s is not a valid OLE file' % subfile) | |
| 705 | + continue | |
| 706 | + z.close() | |
| 707 | + else: | |
| 708 | + msg = '%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % self.filename | |
| 709 | + logging.error(msg) | |
| 710 | + raise TypeError(msg) | |
| 711 | + | |
| 712 | + def find_vba_projects (self): | |
| 713 | + """ | |
| 714 | + Finds all the VBA projects stored in an OLE file. | |
| 715 | + | |
| 716 | + Return None if the file is not OLE but OpenXML. | |
| 717 | + Return a list of tuples (vba_root, project_path, dir_path) for each VBA project. | |
| 718 | + vba_root is the path of the root OLE storage containing the VBA project, | |
| 719 | + including a trailing slash unless it is the root of the OLE file. | |
| 720 | + project_path is the path of the OLE stream named "PROJECT" within the VBA project. | |
| 721 | + dir_path is the path of the OLE stream named "VBA/dir" within the VBA project. | |
| 722 | + | |
| 723 | + If this function returns an empty list for one of the supported formats | |
| | 724 | + (i.e. Word, Excel, PowerPoint except PowerPoint 97-2003), then the | |
| 725 | + file does not contain VBA macros. | |
| 726 | + | |
| 727 | + :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path) | |
| 728 | + for each VBA project found if OLE file | |
| 729 | + """ | |
| 730 | + # if the file is not OLE but OpenXML, return None: | |
| 731 | + if self.ole_file is None: | |
| 732 | + return None | |
| 733 | + | |
| 734 | + # if this method has already been called, return previous result: | |
| 735 | + if self.vba_projects is not None: | |
| 736 | + return self.vba_projects | |
| 737 | + | |
| 738 | + # Find the VBA project root (different in MS Word, Excel, etc): | |
| 739 | + # - Word 97-2003: Macros | |
| 740 | + # - Excel 97-2003: _VBA_PROJECT_CUR | |
| 741 | + # - PowerPoint 97-2003: not supported yet (different file structure) | |
| 742 | + # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin. | |
| 743 | + # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word | |
| 744 | + # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word | |
| 745 | + # - Visio 2007: not supported yet (different file structure) | |
| 746 | + | |
| 747 | + # According to MS-OVBA section 2.2.1: | |
| 748 | + # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream | |
| 749 | + # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream | |
| 750 | + # - all names are case-insensitive | |
| 751 | + | |
| 752 | + # start with an empty list: | |
| 753 | + self.vba_projects = [] | |
| 754 | + # Look for any storage containing those storage/streams: | |
| 755 | + ole = self.ole_file | |
| 756 | + for storage in ole.listdir(streams=False, storages=True): | |
| 757 | + # Look for a storage ending with "VBA": | |
| 758 | + if storage[-1].upper() == 'VBA': | |
| 759 | + logging.debug('Found VBA storage: %s' % ('/'.join(storage))) | |
| 760 | + vba_root = '/'.join(storage[:-1]) | |
| 761 | + # Add a trailing slash to vba_root, unless it is the root of the OLE file: | |
| 762 | + # (used later to append all the child streams/storages) | |
| 763 | + if vba_root != '': | |
| 764 | + vba_root += '/' | |
| 765 | + logging.debug('Checking vba_root="%s"' % vba_root) | |
| 766 | + | |
| 767 | + def check_vba_stream(ole, vba_root, stream_path): | |
| 768 | + full_path = vba_root + stream_path | |
| 769 | + if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM: | |
| 770 | + logging.debug('Found %s stream: %s' % (stream_path, full_path)) | |
| 771 | + return full_path | |
| 772 | + else: | |
| 773 | + logging.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path) | |
| 774 | + return False | |
| 775 | + | |
| 776 | + # Check if the VBA root storage also contains a PROJECT stream: | |
| 777 | + project_path = check_vba_stream(ole, vba_root, 'PROJECT') | |
| 778 | + if not project_path: continue | |
| 779 | + # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream: | |
| 780 | + vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT') | |
| 781 | + if not vba_project_path: continue | |
| 782 | + # Check if the VBA root storage also contains a VBA/dir stream: | |
| 783 | + dir_path = check_vba_stream(ole, vba_root, 'VBA/dir') | |
| 784 | + if not dir_path: continue | |
| 785 | + # Now we are pretty sure it is a VBA project structure | |
| 786 | + logging.debug('VBA root storage: "%s"' % vba_root) | |
| 787 | + # append the results to the list as a tuple for later use: | |
| 788 | + self.vba_projects.append((vba_root, project_path, dir_path)) | |
| 789 | + return self.vba_projects | |
| 790 | + | |
| 791 | + def detect_vba_macros(self): | |
| 792 | + """ | |
| 793 | + Detect the potential presence of VBA macros in the file, by checking | |
| 794 | + if it contains VBA projects. Both OLE and OpenXML files are supported. | |
| 795 | + | |
| 796 | + Important: for now, results are accurate only for Word, Excel and PowerPoint | |
| | 797 | + EXCEPT PowerPoint 97-2003, which has a different structure for VBA. | |
| 798 | + | |
| 799 | + Note: this method does NOT attempt to check the actual presence or validity | |
| 800 | + of VBA macro source code, so there might be false positives. | |
| 801 | + It may also detect VBA macros in files embedded within the main file, | |
| 802 | + for example an Excel workbook with macros embedded into a Word | |
| 803 | + document without macros may be detected, without distinction. | |
| 804 | + | |
| 805 | + :return: bool, True if at least one VBA project has been found, False otherwise | |
| 806 | + """ | |
| 807 | + #TODO: return None or raise exception if format not supported like PPT 97-2003 | |
| 808 | + #TODO: return the number of VBA projects found instead of True/False? | |
| 809 | + # if OpenXML, check all the OLE subfiles: | |
| 810 | + if self.ole_file is None: | |
| 811 | + for ole_subfile in self.ole_subfiles: | |
| 812 | + if ole_subfile.detect_vba_macros(): | |
| 813 | + return True | |
| 814 | + return False | |
| 815 | + # otherwise it's an OLE file, find VBA projects: | |
| 816 | + vba_projects = self.find_vba_projects() | |
| 817 | + if len(vba_projects) == 0: | |
| 818 | + return False | |
| 819 | + else: | |
| 820 | + return True | |
| 821 | + | |
| 822 | + | |
| 823 | + def extract_macros (self): | |
| 824 | + """ | |
| 825 | + Extract and decompress source code for each VBA macro found in the file | |
| 826 | + | |
| 827 | + Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found | |
| 828 | + If the file is OLE, filename is the path of the file. | |
| 829 | + If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros | |
| 830 | + within the zip archive, e.g. word/vbaProject.bin. | |
| 831 | + """ | |
| 832 | + if self.ole_file is None: | |
| 833 | + for ole_subfile in self.ole_subfiles: | |
| 834 | + for results in ole_subfile.extract_macros(): | |
| 835 | + yield results | |
| 836 | + else: | |
| 837 | + self.find_vba_projects() | |
| 838 | + for vba_root, project_path, dir_path in self.vba_projects: | |
| 839 | + # extract all VBA macros from that VBA root storage: | |
| 840 | + for stream_path, vba_filename, vba_code in _extract_vba(self.ole_file, vba_root, project_path, dir_path): | |
| 841 | + yield (self.filename, stream_path, vba_filename, vba_code) | |
| 842 | + | |
| 843 | + | |
| 844 | + def close(self): | |
| 845 | + """ | |
| 846 | + Close all the open files. This method must be called after usage, if | |
| 847 | + the application is opening many files. | |
| 848 | + """ | |
| 849 | + if self.ole_file is None: | |
| 850 | + for ole_subfile in self.ole_subfiles: | |
| 851 | + ole_subfile.close() | |
| 852 | + else: | |
| 853 | + self.ole_file.close() | |
| 731 | 854 | |
| 732 | 855 | |
| 733 | 856 | #=== MAIN ===================================================================== |
| ... | ... | @@ -738,8 +861,28 @@ if __name__ == '__main__': |
| 738 | 861 | print __doc__ |
| 739 | 862 | sys.exit(1) |
| 740 | 863 | |
| 741 | - logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO) | |
| 742 | - | |
| 743 | - extract_macros(sys.argv[1]) | |
| 864 | + logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) #INFO) | |
| 865 | + | |
| 866 | + #TODO: option parser | |
| 867 | + fname = sys.argv[1] | |
| 868 | + print '='*79 | |
| 869 | + print 'File:', fname | |
| 870 | + try: | |
| 871 | + vba = VBA_Parser(fname) | |
| 872 | + print 'Type:', vba.type | |
| 873 | + if vba.detect_vba_macros(): | |
| 874 | + print 'Contains VBA Macros:' | |
| 875 | + for (filename, stream_path, vba_filename, vba_code) in vba.extract_macros(): | |
| 876 | + print '-'*79 | |
| 877 | + print 'Filename :', filename | |
| 878 | + print 'OLE stream :', stream_path | |
| 879 | + print 'VBA filename:', vba_filename | |
| 880 | + print '- '*39 | |
| 881 | + print vba_code | |
| 882 | + else: | |
| 883 | + print 'No VBA macros found.' | |
| 884 | + except TypeError: | |
| 885 | + raise | |
| 886 | + print sys.exc_value | |
| 744 | 887 | |
| 745 | 888 | # This was coded while listening to "Dust" from I Love You But I've Chosen Darkness |
| 746 | 889 | \ No newline at end of file | ... | ... |