Commit 698fabf4a0f65263bcc254420b388cf081701afe

Authored by Philippe Lagadec
1 parent 31b5fcab

olevba: refactored the code, now with a class-based API and a new method to detect VBA macros

Showing 1 changed file with 242 additions and 99 deletions
oletools/olevba.py
@@ -79,12 +79,16 @@ Usage: olevba.py <file> @@ -79,12 +79,16 @@ Usage: olevba.py <file>
79 # 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats 79 # 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats
80 # and to find the VBA project root anywhere in the file 80 # and to find the VBA project root anywhere in the file
81 # 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL 81 # 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL
  82 +# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API
  83 +# - added detect_vba_macros
82 84
83 -__version__ = '0.04' 85 +__version__ = '0.05'
84 86
85 #------------------------------------------------------------------------------ 87 #------------------------------------------------------------------------------
86 # TODO: 88 # TODO:
87 -# + extract_macros should yield filename, code 89 +# + do not use logging, but a provided logger (null logger by default)
  90 +# + by default, do not display empty macros containing only lines with Attribute=... (option)
  91 +# (...unless it can be used to hide code: to be tested)
88 # + optparse 92 # + optparse
89 # + nicer output 93 # + nicer output
90 # + setup logging (common with other oletools) 94 # + setup logging (common with other oletools)
@@ -125,8 +129,6 @@ MODULE_EXTENSION = "bas" @@ -125,8 +129,6 @@ MODULE_EXTENSION = "bas"
125 CLASS_EXTENSION = "cls" 129 CLASS_EXTENSION = "cls"
126 FORM_EXTENSION = "frm" 130 FORM_EXTENSION = "frm"
127 131
128 -BINFILE_PATH = "xl/vbaProject.bin"  
129 -  
130 132
131 #--- FUNCTIONS ---------------------------------------------------------------- 133 #--- FUNCTIONS ----------------------------------------------------------------
132 134
@@ -265,61 +267,6 @@ def decompress_stream (compressed_container): @@ -265,61 +267,6 @@ def decompress_stream (compressed_container):
265 return decompressed_container 267 return decompressed_container
266 268
267 269
268 -def extract_macros_ole(ole):  
269 - """  
270 - Extract VBA macros from an OLE file  
271 - """  
272 - # Find the VBA project root (different in MS Word, Excel, etc):  
273 - # - Word 97-2003: Macros  
274 - # - Excel 97-2003: _VBA_PROJECT_CUR  
275 - # - PowerPoint 97-2003: not supported yet (different file structure)  
276 - # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin.  
277 - # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word  
278 - # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word  
279 - # - Visio 2007: not supported yet (different file structure)  
280 -  
281 - # According to MS-OVBA section 2.2.1:  
282 - # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream  
283 - # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream  
284 - # - all names are case-insensitive  
285 -  
286 - # Look for any storage containing those storage/streams:  
287 - for storage in ole.listdir(streams=False, storages=True):  
288 - # Look for a storage ending with "VBA":  
289 - if storage[-1].upper() == 'VBA':  
290 - logging.debug('Found VBA storage: %s' % ('/'.join(storage)))  
291 - vba_root = '/'.join(storage[:-1])  
292 - # Add a trailing slash to vba_root, unless it is the root of the OLE file:  
293 - # (used later to append all the child streams/storages)  
294 - if vba_root != '':  
295 - vba_root += '/'  
296 - logging.debug('Checking vba_root="%s"' % vba_root)  
297 -  
298 - def check_vba_stream(ole, vba_root, stream_path):  
299 - full_path = vba_root + stream_path  
300 - if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM:  
301 - logging.debug('Found %s stream: %s' % (stream_path, full_path))  
302 - return full_path  
303 - else:  
304 - logging.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path)  
305 - return False  
306 -  
307 - # Check if the VBA root storage also contains a PROJECT stream:  
308 - project_path = check_vba_stream(ole, vba_root, 'PROJECT')  
309 - if not project_path: continue  
310 - # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream:  
311 - vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT')  
312 - if not vba_project_path: continue  
313 - # Check if the VBA root storage also contains a VBA/dir stream:  
314 - dir_path = check_vba_stream(ole, vba_root, 'VBA/dir')  
315 - if not dir_path: continue  
316 - # Now we are pretty sure it is a VBA project structure  
317 - logging.debug('VBA root storage: "%s"' % vba_root)  
318 - # extract all VBA macros from that VBA root storage:  
319 - _extract_vba(ole, vba_root, project_path, dir_path)  
320 -  
321 -  
322 -  
323 def _extract_vba (ole, vba_root, project_path, dir_path): 270 def _extract_vba (ole, vba_root, project_path, dir_path):
324 """ 271 """
325 Extract VBA macros from an OleFileIO object. 272 Extract VBA macros from an OleFileIO object.
@@ -327,7 +274,7 @@ def _extract_vba (ole, vba_root, project_path, dir_path): @@ -327,7 +274,7 @@ def _extract_vba (ole, vba_root, project_path, dir_path):
327 274
328 vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream 275 vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream
329 vba_project: path to the PROJECT stream 276 vba_project: path to the PROJECT stream
330 - This is a generator, yielding (filename, stream path, VBA source code) for each VBA code stream 277 + This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream
331 """ 278 """
332 # Open the PROJECT stream: 279 # Open the PROJECT stream:
333 project = ole.openstream(project_path) 280 project = ole.openstream(project_path)
@@ -685,49 +632,225 @@ def _extract_vba (ole, vba_root, project_path, dir_path): @@ -685,49 +632,225 @@ def _extract_vba (ole, vba_root, project_path, dir_path):
685 code_data = decompress_stream(code_data) 632 code_data = decompress_stream(code_data)
686 filext = code_modules[MODULENAME_ModuleName] 633 filext = code_modules[MODULENAME_ModuleName]
687 filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext) 634 filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext)
688 - #TODO: return list of strings or dict instead of printing  
689 - print '-'*79  
690 - print filename  
691 - print ''  
692 - print code_data  
693 - print '' 635 + yield (code_path, filename, code_data)
  636 + # print '-'*79
  637 + # print filename
  638 + # print ''
  639 + # print code_data
  640 + # print ''
694 logging.debug('extracted file {0}'.format(filename)) 641 logging.debug('extracted file {0}'.format(filename))
695 else: 642 else:
696 logging.warning("module stream {0} has code data length 0".format(MODULESTREAMNAME_StreamName)) 643 logging.warning("module stream {0} has code data length 0".format(MODULESTREAMNAME_StreamName))
697 return 644 return
698 645
699 646
700 -def extract_macros (filename):  
701 - if olefile.isOleFile(filename):  
702 - # This looks like an OLE file  
703 - logging.info('Extracting VBA Macros from OLE file %s' % filename)  
704 - ole = olefile.OleFileIO(filename)  
705 - extract_macros_ole(ole)  
706 - ole.close()  
707 - elif zipfile.is_zipfile(filename):  
708 - # This looks like a zip file, need to look for vbaProject.bin inside  
709 - #TODO: here we should look for any OLE file inside the archive  
710 - #...because vbaProject.bin can be renamed:  
711 - # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18  
712 - logging.info('Opening ZIP/OpenXML file %s' % filename)  
713 - z = zipfile.ZipFile(filename)  
714 - for f in z.namelist():  
715 - if f.lower().endswith('vbaproject.bin'):  
716 - logging.debug('Opening OLE VBA storage %s within zip' % f)  
717 - vbadata = z.open(f).read()  
718 - vbafile = cStringIO.StringIO(vbadata)  
719 - try:  
720 - ole = olefile.OleFileIO(vbafile)  
721 - except:  
722 - logging.debug('%s is not a valid OLE file' % f)  
723 - continue  
724 - logging.info('Extracting VBA Macros from %s/%s' % (filename, f))  
725 - extract_macros_ole(ole)  
726 - ole.close()  
727 - z.close()  
728 - else:  
729 - logging.error('%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % filename) 647 +#=== CLASSES =================================================================
  648 +
class VBA_Parser(object):
    """
    Class to parse MS Office files, to detect VBA macros and extract VBA source code
    Supported file formats:
    - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
    - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
    - PowerPoint 2007+ (.pptm, .ppsm)

    Attributes set by the constructor:
    - file: the object passed as _file, unchanged
    - filename: name used in log messages and in extract_macros() results
    - type: 'OLE' or 'OpenXML'
    - ole_file: olefile.OleFileIO object if type is 'OLE', None otherwise
    - ole_subfiles: list of VBA_Parser objects, one per OLE file found inside
      an OpenXML (zip) container; empty for OLE files
    - vba_projects: cached result of find_vba_projects(), None until computed
    """

    def __init__(self, _file, filename=None):
        """
        Constructor for VBA_Parser

        :param _file: path of file to parse, file-like object or file content
        :param filename: actual filename if _file is a file-like object or file content
            in a bytes string
        :raises TypeError: if _file is neither an OLE file nor a zip archive
        """
        #TODO: also support olefile and zipfile as input
        # NOTE(review): _file is handed unchanged to zipfile.is_zipfile and
        # zipfile.ZipFile; confirm that zip content in a bytes string is
        # actually accepted there.
        self.file = _file
        self.ole_file = None
        self.ole_subfiles = []
        self.filename = filename
        self.type = None
        self.vba_projects = None
        if filename is None:
            if isinstance(_file, basestring):
                # a string shorter than the smallest possible OLE file is
                # assumed to be a path rather than file content:
                if len(_file) < olefile.MINIMAL_OLEFILE_SIZE:
                    self.filename = _file
                else:
                    self.filename = '<file in bytes string>'
            else:
                self.filename = '<file-like object>'
        if olefile.isOleFile(_file):
            # This looks like an OLE file
            logging.info('Parsing OLE file %s' % self.filename)
            self.ole_file = olefile.OleFileIO(_file)
            self.type = 'OLE'
        elif zipfile.is_zipfile(_file):
            # This looks like a zip file, need to look for vbaProject.bin inside
            # It can be any OLE file inside the archive
            #...because vbaProject.bin can be renamed:
            # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
            logging.info('Opening ZIP/OpenXML file %s' % self.filename)
            self.type = 'OpenXML'
            z = zipfile.ZipFile(_file)
            # try/finally so that the archive is closed even if reading one
            # of its members raises an exception:
            try:
                #TODO: check if this is actually an OpenXML file
                # check each file within the zip if it is an OLE file, by reading its magic:
                for subfile in z.namelist():
                    magic = z.open(subfile).read(len(olefile.MAGIC))
                    if magic != olefile.MAGIC:
                        continue
                    logging.debug('Opening OLE file %s within zip' % subfile)
                    ole_data = z.read(subfile)
                    try:
                        self.ole_subfiles.append(VBA_Parser(ole_data, filename=subfile))
                    except Exception:
                        # best effort: a member with the OLE magic that cannot
                        # be parsed is skipped (KeyboardInterrupt and
                        # SystemExit are no longer swallowed here)
                        logging.debug('%s is not a valid OLE file' % subfile)
                        continue
            finally:
                z.close()
        else:
            msg = '%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % self.filename
            logging.error(msg)
            raise TypeError(msg)

    @staticmethod
    def _check_vba_stream(ole, vba_root, stream_path):
        """
        Check that vba_root + stream_path exists in ole and is a stream.

        :return: the full stream path if found, None otherwise
        """
        full_path = vba_root + stream_path
        if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM:
            logging.debug('Found %s stream: %s' % (stream_path, full_path))
            return full_path
        logging.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path)
        return None

    def find_vba_projects (self):
        """
        Finds all the VBA projects stored in an OLE file.

        Return None if the file is not OLE but OpenXML.
        Return a list of tuples (vba_root, project_path, dir_path) for each VBA project.
        vba_root is the path of the root OLE storage containing the VBA project,
        including a trailing slash unless it is the root of the OLE file.
        project_path is the path of the OLE stream named "PROJECT" within the VBA project.
        dir_path is the path of the OLE stream named "VBA/dir" within the VBA project.

        If this function returns an empty list for one of the supported formats
        (i.e. Word, Excel, Powerpoint except Powerpoint 97-2003), then the
        file does not contain VBA macros.

        The result is cached in self.vba_projects once the scan has completed.

        :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path)
            for each VBA project found if OLE file
        """
        # if the file is not OLE but OpenXML, return None:
        if self.ole_file is None:
            return None

        # if this method has already been called, return previous result:
        if self.vba_projects is not None:
            return self.vba_projects

        # Find the VBA project root (different in MS Word, Excel, etc):
        # - Word 97-2003: Macros
        # - Excel 97-2003: _VBA_PROJECT_CUR
        # - PowerPoint 97-2003: not supported yet (different file structure)
        # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin.
        # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word
        # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word
        # - Visio 2007: not supported yet (different file structure)

        # According to MS-OVBA section 2.2.1:
        # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream
        # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream
        # - all names are case-insensitive

        # Collect results in a local list and only publish it to the cache at
        # the end, so that an exception during the scan cannot leave a partial
        # list cached as if it were the final answer:
        projects = []
        ole = self.ole_file
        # Look for any storage containing those storage/streams:
        for storage in ole.listdir(streams=False, storages=True):
            # Look for a storage ending with "VBA":
            if storage[-1].upper() != 'VBA':
                continue
            logging.debug('Found VBA storage: %s' % ('/'.join(storage)))
            vba_root = '/'.join(storage[:-1])
            # Add a trailing slash to vba_root, unless it is the root of the OLE file:
            # (used later to append all the child streams/storages)
            if vba_root != '':
                vba_root += '/'
            logging.debug('Checking vba_root="%s"' % vba_root)

            # Check if the VBA root storage also contains a PROJECT stream:
            project_path = self._check_vba_stream(ole, vba_root, 'PROJECT')
            if not project_path:
                continue
            # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream:
            vba_project_path = self._check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT')
            if not vba_project_path:
                continue
            # Check if the VBA root storage also contains a VBA/dir stream:
            dir_path = self._check_vba_stream(ole, vba_root, 'VBA/dir')
            if not dir_path:
                continue
            # Now we are pretty sure it is a VBA project structure
            logging.debug('VBA root storage: "%s"' % vba_root)
            # append the results to the list as a tuple for later use:
            projects.append((vba_root, project_path, dir_path))
        self.vba_projects = projects
        return self.vba_projects

    def detect_vba_macros(self):
        """
        Detect the potential presence of VBA macros in the file, by checking
        if it contains VBA projects. Both OLE and OpenXML files are supported.

        Important: for now, results are accurate only for Word, Excel and PowerPoint
        EXCEPT Powerpoint 97-2003, which has a different structure for VBA.

        Note: this method does NOT attempt to check the actual presence or validity
        of VBA macro source code, so there might be false positives.
        It may also detect VBA macros in files embedded within the main file,
        for example an Excel workbook with macros embedded into a Word
        document without macros may be detected, without distinction.

        :return: bool, True if at least one VBA project has been found, False otherwise
        """
        #TODO: return None or raise exception if format not supported like PPT 97-2003
        #TODO: return the number of VBA projects found instead of True/False?
        # if OpenXML, check all the OLE subfiles:
        if self.ole_file is None:
            return any(ole_subfile.detect_vba_macros()
                       for ole_subfile in self.ole_subfiles)
        # otherwise it's an OLE file, find VBA projects:
        return bool(self.find_vba_projects())

    def extract_macros (self):
        """
        Extract and decompress source code for each VBA macro found in the file

        Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found
        If the file is OLE, filename is the path of the file.
        If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros
        within the zip archive, e.g. word/vbaProject.bin.
        """
        if self.ole_file is None:
            # OpenXML: delegate to the parser of each embedded OLE file
            for ole_subfile in self.ole_subfiles:
                for results in ole_subfile.extract_macros():
                    yield results
        else:
            for vba_root, project_path, dir_path in self.find_vba_projects():
                # extract all VBA macros from that VBA root storage:
                for stream_path, vba_filename, vba_code in _extract_vba(self.ole_file, vba_root, project_path, dir_path):
                    yield (self.filename, stream_path, vba_filename, vba_code)

    def close(self):
        """
        Close all the open files. This method must be called after usage, if
        the application is opening many files.
        """
        if self.ole_file is None:
            for ole_subfile in self.ole_subfiles:
                ole_subfile.close()
        else:
            self.ole_file.close()
731 854
732 855
733 #=== MAIN ===================================================================== 856 #=== MAIN =====================================================================
@@ -738,8 +861,28 @@ if __name__ == '__main__': @@ -738,8 +861,28 @@ if __name__ == '__main__':
738 print __doc__ 861 print __doc__
739 sys.exit(1) 862 sys.exit(1)
740 863
741 - logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)  
742 -  
743 - extract_macros(sys.argv[1]) 864 + logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) #INFO)
  865 +
  866 + #TODO: option parser
  867 + fname = sys.argv[1]
  868 + print '='*79
  869 + print 'File:', fname
  870 + try:
  871 + vba = VBA_Parser(fname)
  872 + print 'Type:', vba.type
  873 + if vba.detect_vba_macros():
  874 + print 'Contains VBA Macros:'
  875 + for (filename, stream_path, vba_filename, vba_code) in vba.extract_macros():
  876 + print '-'*79
  877 + print 'Filename :', filename
  878 + print 'OLE stream :', stream_path
  879 + print 'VBA filename:', vba_filename
  880 + print '- '*39
  881 + print vba_code
  882 + else:
  883 + print 'No VBA macros found.'
  884 + except TypeError:
  885 + raise
  886 + print sys.exc_value
744 887
745 # This was coded while listening to "Dust" from I Love You But I've Chosen Darkness 888 # This was coded while listening to "Dust" from I Love You But I've Chosen Darkness
746 \ No newline at end of file 889 \ No newline at end of file