Commit 698fabf4a0f65263bcc254420b388cf081701afe

Authored by Philippe Lagadec
1 parent 31b5fcab

olevba: refactored the code, now with a class-based API and a new method to detect VBA macros

Showing 1 changed file with 242 additions and 99 deletions
oletools/olevba.py
... ... @@ -79,12 +79,16 @@ Usage: olevba.py <file>
79 79 # 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats
80 80 # and to find the VBA project root anywhere in the file
81 81 # 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL
  82 +# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API
  83 +# - added detect_vba_macros
82 84  
83   -__version__ = '0.04'
  85 +__version__ = '0.05'
84 86  
85 87 #------------------------------------------------------------------------------
86 88 # TODO:
87   -# + extract_macros should yield filename, code
  89 +# + do not use logging, but a provided logger (null logger by default)
  90 +# + by default, do not display empty macros containing only lines with Attribute=... (option)
  91 +# (...unless it can be used to hide code: to be tested)
88 92 # + optparse
89 93 # + nicer output
90 94 # + setup logging (common with other oletools)
... ... @@ -125,8 +129,6 @@ MODULE_EXTENSION = "bas"
125 129 CLASS_EXTENSION = "cls"
126 130 FORM_EXTENSION = "frm"
127 131  
128   -BINFILE_PATH = "xl/vbaProject.bin"
129   -
130 132  
131 133 #--- FUNCTIONS ----------------------------------------------------------------
132 134  
... ... @@ -265,61 +267,6 @@ def decompress_stream (compressed_container):
265 267 return decompressed_container
266 268  
267 269  
268   -def extract_macros_ole(ole):
269   - """
270   - Extract VBA macros from an OLE file
271   - """
272   - # Find the VBA project root (different in MS Word, Excel, etc):
273   - # - Word 97-2003: Macros
274   - # - Excel 97-2003: _VBA_PROJECT_CUR
275   - # - PowerPoint 97-2003: not supported yet (different file structure)
276   - # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin.
277   - # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word
278   - # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word
279   - # - Visio 2007: not supported yet (different file structure)
280   -
281   - # According to MS-OVBA section 2.2.1:
282   - # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream
283   - # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream
284   - # - all names are case-insensitive
285   -
286   - # Look for any storage containing those storage/streams:
287   - for storage in ole.listdir(streams=False, storages=True):
288   - # Look for a storage ending with "VBA":
289   - if storage[-1].upper() == 'VBA':
290   - logging.debug('Found VBA storage: %s' % ('/'.join(storage)))
291   - vba_root = '/'.join(storage[:-1])
292   - # Add a trailing slash to vba_root, unless it is the root of the OLE file:
293   - # (used later to append all the child streams/storages)
294   - if vba_root != '':
295   - vba_root += '/'
296   - logging.debug('Checking vba_root="%s"' % vba_root)
297   -
298   - def check_vba_stream(ole, vba_root, stream_path):
299   - full_path = vba_root + stream_path
300   - if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM:
301   - logging.debug('Found %s stream: %s' % (stream_path, full_path))
302   - return full_path
303   - else:
304   - logging.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path)
305   - return False
306   -
307   - # Check if the VBA root storage also contains a PROJECT stream:
308   - project_path = check_vba_stream(ole, vba_root, 'PROJECT')
309   - if not project_path: continue
310   - # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream:
311   - vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT')
312   - if not vba_project_path: continue
313   - # Check if the VBA root storage also contains a VBA/dir stream:
314   - dir_path = check_vba_stream(ole, vba_root, 'VBA/dir')
315   - if not dir_path: continue
316   - # Now we are pretty sure it is a VBA project structure
317   - logging.debug('VBA root storage: "%s"' % vba_root)
318   - # extract all VBA macros from that VBA root storage:
319   - _extract_vba(ole, vba_root, project_path, dir_path)
320   -
321   -
322   -
323 270 def _extract_vba (ole, vba_root, project_path, dir_path):
324 271 """
325 272 Extract VBA macros from an OleFileIO object.
... ... @@ -327,7 +274,7 @@ def _extract_vba (ole, vba_root, project_path, dir_path):
327 274  
328 275 vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream
329 276 vba_project: path to the PROJECT stream
330   - This is a generator, yielding (filename, stream path, VBA source code) for each VBA code stream
  277 + This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream
331 278 """
332 279 # Open the PROJECT stream:
333 280 project = ole.openstream(project_path)
... ... @@ -685,49 +632,225 @@ def _extract_vba (ole, vba_root, project_path, dir_path):
685 632 code_data = decompress_stream(code_data)
686 633 filext = code_modules[MODULENAME_ModuleName]
687 634 filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext)
688   - #TODO: return list of strings or dict instead of printing
689   - print '-'*79
690   - print filename
691   - print ''
692   - print code_data
693   - print ''
  635 + yield (code_path, filename, code_data)
  636 + # print '-'*79
  637 + # print filename
  638 + # print ''
  639 + # print code_data
  640 + # print ''
694 641 logging.debug('extracted file {0}'.format(filename))
695 642 else:
696 643 logging.warning("module stream {0} has code data length 0".format(MODULESTREAMNAME_StreamName))
697 644 return
698 645  
699 646  
700   -def extract_macros (filename):
701   - if olefile.isOleFile(filename):
702   - # This looks like an OLE file
703   - logging.info('Extracting VBA Macros from OLE file %s' % filename)
704   - ole = olefile.OleFileIO(filename)
705   - extract_macros_ole(ole)
706   - ole.close()
707   - elif zipfile.is_zipfile(filename):
708   - # This looks like a zip file, need to look for vbaProject.bin inside
709   - #TODO: here we should look for any OLE file inside the archive
710   - #...because vbaProject.bin can be renamed:
711   - # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
712   - logging.info('Opening ZIP/OpenXML file %s' % filename)
713   - z = zipfile.ZipFile(filename)
714   - for f in z.namelist():
715   - if f.lower().endswith('vbaproject.bin'):
716   - logging.debug('Opening OLE VBA storage %s within zip' % f)
717   - vbadata = z.open(f).read()
718   - vbafile = cStringIO.StringIO(vbadata)
719   - try:
720   - ole = olefile.OleFileIO(vbafile)
721   - except:
722   - logging.debug('%s is not a valid OLE file' % f)
723   - continue
724   - logging.info('Extracting VBA Macros from %s/%s' % (filename, f))
725   - extract_macros_ole(ole)
726   - ole.close()
727   - z.close()
728   - else:
729   - logging.error('%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % filename)
  647 +#=== CLASSES =================================================================
  648 +
class VBA_Parser(object):
    """
    Class to parse MS Office files, to detect VBA macros and extract VBA source code
    Supported file formats:
    - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
    - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
    - PowerPoint 2007+ (.pptm, .ppsm)

    Attributes set by the constructor:
    - file: the object passed as _file
    - filename: name used in log messages and in extract_macros results
    - type: 'OLE' or 'OpenXML'
    - ole_file: the opened OLE file object if type is 'OLE', else None
    - ole_subfiles: list of VBA_Parser objects, one per OLE file found in a zip
    - vba_projects: cached result of find_vba_projects (None until first call)
    """

    def __init__(self, _file, filename=None):
        """
        Constructor for VBA_Parser

        :param _file: path of file to parse, file-like object or file content
        :param filename: actual filename if _file is a file-like object or file content
            in a bytes string
        :raises TypeError: if _file is neither an OLE file nor a zip archive
        """
        #TODO: also support olefile and zipfile as input
        self.file = _file
        self.ole_file = None
        self.ole_subfiles = []
        self.filename = filename
        self.type = None
        self.vba_projects = None
        if filename is None:
            self.filename = self._guess_filename(_file)
        if olefile.isOleFile(_file):
            # This looks like an OLE file
            logging.info('Parsing OLE file %s', self.filename)
            self.ole_file = olefile.OleFileIO(_file)
            self.type = 'OLE'
        elif zipfile.is_zipfile(_file):
            # This looks like a zip file, need to look for vbaProject.bin inside
            # It can be any OLE file inside the archive
            #...because vbaProject.bin can be renamed:
            # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
            # NOTE(review): zipfile treats a string argument as a path, so an
            # OpenXML file passed as content in a bytes string is presumably
            # not recognised by this branch - to be confirmed.
            logging.info('Opening ZIP/OpenXML file %s', self.filename)
            self.type = 'OpenXML'
            #TODO: check if this is actually an OpenXML file
            self._open_ole_subfiles(_file)
        else:
            msg = '%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % self.filename
            logging.error(msg)
            raise TypeError(msg)

    @staticmethod
    def _guess_filename(_file):
        """
        Build a display name for _file when the caller did not provide one.

        :param _file: path of file, file-like object or file content
        :return: _file itself if it is a string shorter than the minimal size
            of an OLE file (i.e. treated as a path), otherwise a placeholder
        """
        if not isinstance(_file, basestring):
            return '<file-like object>'
        if len(_file) < olefile.MINIMAL_OLEFILE_SIZE:
            return _file
        return '<file in bytes string>'

    def _open_ole_subfiles(self, _file):
        """
        Open every OLE file stored in a zip archive and append a VBA_Parser
        for each of them to self.ole_subfiles.

        :param _file: zip archive, passed as is to zipfile.ZipFile
        """
        magic_len = len(olefile.MAGIC)
        z = zipfile.ZipFile(_file)
        try:
            # check each file within the zip if it is an OLE file, by reading its magic:
            for subfile in z.namelist():
                member = z.open(subfile)
                try:
                    magic = member.read(magic_len)
                finally:
                    member.close()
                if magic != olefile.MAGIC:
                    continue
                logging.debug('Opening OLE file %s within zip', subfile)
                ole_data = z.read(subfile)
                try:
                    self.ole_subfiles.append(VBA_Parser(ole_data, filename=subfile))
                except Exception:
                    # best effort: skip this subfile and go on with the others.
                    # Only Exception is caught, so that KeyboardInterrupt and
                    # SystemExit are not swallowed.
                    logging.debug('%s is not a valid OLE file', subfile)
        finally:
            # always release the archive, even if reading a member failed
            z.close()

    def _check_vba_stream(self, vba_root, stream_path):
        """
        Check if a stream exists under a candidate VBA root storage.

        :param vba_root: path of the candidate root storage, with a trailing
            slash unless it is the root of the OLE file
        :param stream_path: path of the stream, relative to vba_root
        :return: full path of the stream if it exists and is a stream, None otherwise
        """
        ole = self.ole_file
        full_path = vba_root + stream_path
        if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM:
            logging.debug('Found %s stream: %s', stream_path, full_path)
            return full_path
        logging.debug('Missing %s stream, this is not a valid VBA project structure', stream_path)
        return None

    def find_vba_projects(self):
        """
        Finds all the VBA projects stored in an OLE file.

        Return None if the file is not OLE but OpenXML.
        Return a list of tuples (vba_root, project_path, dir_path) for each VBA project.
        vba_root is the path of the root OLE storage containing the VBA project,
        including a trailing slash unless it is the root of the OLE file.
        project_path is the path of the OLE stream named "PROJECT" within the VBA project.
        dir_path is the path of the OLE stream named "VBA/dir" within the VBA project.

        If this function returns an empty list for one of the supported formats
        (i.e. Word, Excel, Powerpoint except Powerpoint 97-2003), then the
        file does not contain VBA macros.

        The result is cached in self.vba_projects, only once the scan has
        completed: if the scan raises an exception, nothing is cached.

        :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path)
            for each VBA project found if OLE file
        """
        # if the file is not OLE but OpenXML, return None:
        if self.ole_file is None:
            return None

        # if this method has already been called, return previous result:
        if self.vba_projects is not None:
            return self.vba_projects

        # Find the VBA project root (different in MS Word, Excel, etc):
        # - Word 97-2003: Macros
        # - Excel 97-2003: _VBA_PROJECT_CUR
        # - PowerPoint 97-2003: not supported yet (different file structure)
        # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin.
        # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word
        # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word
        # - Visio 2007: not supported yet (different file structure)

        # According to MS-OVBA section 2.2.1:
        # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream
        # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream
        # - all names are case-insensitive

        # collect results in a local list, stored on self only at the end:
        projects = []
        # Look for any storage containing those storage/streams:
        for storage in self.ole_file.listdir(streams=False, storages=True):
            # Look for a storage ending with "VBA":
            if storage[-1].upper() != 'VBA':
                continue
            logging.debug('Found VBA storage: %s', '/'.join(storage))
            vba_root = '/'.join(storage[:-1])
            # Add a trailing slash to vba_root, unless it is the root of the OLE file:
            # (used later to append all the child streams/storages)
            if vba_root != '':
                vba_root += '/'
            logging.debug('Checking vba_root="%s"', vba_root)

            # Check if the VBA root storage also contains a PROJECT stream:
            project_path = self._check_vba_stream(vba_root, 'PROJECT')
            if not project_path:
                continue
            # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream:
            if not self._check_vba_stream(vba_root, 'VBA/_VBA_PROJECT'):
                continue
            # Check if the VBA root storage also contains a VBA/dir stream:
            dir_path = self._check_vba_stream(vba_root, 'VBA/dir')
            if not dir_path:
                continue
            # Now we are pretty sure it is a VBA project structure
            logging.debug('VBA root storage: "%s"', vba_root)
            # append the results to the list as a tuple for later use:
            projects.append((vba_root, project_path, dir_path))
        self.vba_projects = projects
        return projects

    def detect_vba_macros(self):
        """
        Detect the potential presence of VBA macros in the file, by checking
        if it contains VBA projects. Both OLE and OpenXML files are supported.

        Important: for now, results are accurate only for Word, Excel and PowerPoint
        EXCEPT Powerpoint 97-2003, which has a different structure for VBA.

        Note: this method does NOT attempt to check the actual presence or validity
        of VBA macro source code, so there might be false positives.
        It may also detect VBA macros in files embedded within the main file,
        for example an Excel workbook with macros embedded into a Word
        document without macros may be detected, without distinction.

        :return: bool, True if at least one VBA project has been found, False otherwise
        """
        #TODO: return None or raise exception if format not supported like PPT 97-2003
        #TODO: return the number of VBA projects found instead of True/False?
        # if OpenXML, check all the OLE subfiles:
        if self.ole_file is None:
            return any(ole_subfile.detect_vba_macros()
                       for ole_subfile in self.ole_subfiles)
        # otherwise it's an OLE file, find VBA projects:
        return bool(self.find_vba_projects())

    def extract_macros(self):
        """
        Extract and decompress source code for each VBA macro found in the file

        Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found
        If the file is OLE, filename is the path of the file.
        If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros
        within the zip archive, e.g. word/vbaProject.bin.
        """
        if self.ole_file is None:
            for ole_subfile in self.ole_subfiles:
                for results in ole_subfile.extract_macros():
                    yield results
        else:
            for vba_root, project_path, dir_path in self.find_vba_projects():
                # extract all VBA macros from that VBA root storage:
                for stream_path, vba_filename, vba_code in _extract_vba(
                        self.ole_file, vba_root, project_path, dir_path):
                    yield (self.filename, stream_path, vba_filename, vba_code)

    def close(self):
        """
        Close all the open files. This method must be called after usage, if
        the application is opening many files.
        """
        if self.ole_file is None:
            for ole_subfile in self.ole_subfiles:
                ole_subfile.close()
        else:
            self.ole_file.close()
731 854  
732 855  
733 856 #=== MAIN =====================================================================
... ... @@ -738,8 +861,28 @@ if __name__ == &#39;__main__&#39;:
738 861 print __doc__
739 862 sys.exit(1)
740 863  
741   - logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
742   -
743   - extract_macros(sys.argv[1])
  864 + logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) #INFO)
  865 +
  866 + #TODO: option parser
  867 + fname = sys.argv[1]
  868 + print '='*79
  869 + print 'File:', fname
  870 + try:
  871 + vba = VBA_Parser(fname)
  872 + print 'Type:', vba.type
  873 + if vba.detect_vba_macros():
  874 + print 'Contains VBA Macros:'
  875 + for (filename, stream_path, vba_filename, vba_code) in vba.extract_macros():
  876 + print '-'*79
  877 + print 'Filename :', filename
  878 + print 'OLE stream :', stream_path
  879 + print 'VBA filename:', vba_filename
  880 + print '- '*39
  881 + print vba_code
  882 + else:
  883 + print 'No VBA macros found.'
  884 + except TypeError:
  885 + raise
  886 + print sys.exc_value
744 887  
745 888 # This was coded while listening to "Dust" from I Love You But I've Chosen Darkness
746 889 \ No newline at end of file
... ...