Commit e6148632893ae77754319a20284481033d2bbd8b

Authored by Philippe Lagadec
1 parent 7c2a7d81

olevba: improved VBA_Parser, refactored the main CLI functions

Showing 2 changed files with 125 additions and 56 deletions
oletools/olevba.py
... ... @@ -145,6 +145,7 @@ https://github.com/unixfreak0037/officeparser
145 145 # 2015-07-09 v0.33 PL: - removed usage of sys.stderr which causes issues
146 146 # 2015-07-12 PL: - added Hex function decoding to VBA Parser
147 147 # 2015-07-13 PL: - added Base64 function decoding to VBA Parser
  148 +# 2015-09-06 PL: - improved VBA_Parser, refactored the main functions
148 149  
149 150 __version__ = '0.33'
150 151  
... ... @@ -1468,6 +1469,16 @@ class VBA_Scanner(object):
1468 1469 self.code_base64 = ''
1469 1470 self.code_dridex = ''
1470 1471 self.code_vba = ''
  1472 + self.strReverse = None
  1473 + # results = None before scanning, then a list of tuples after scanning
  1474 + self.results = None
  1475 + self.autoexec_keywords = None
  1476 + self.suspicious_keywords = None
  1477 + self.iocs = None
  1478 + self.hex_strings = None
  1479 + self.base64_strings = None
  1480 + self.dridex_strings = None
  1481 + self.vba_strings = None
1471 1482  
1472 1483  
1473 1484 def scan(self, include_decoded_strings=False):
... ... @@ -1558,6 +1569,7 @@ class VBA_Scanner(object):
1558 1569 for encoded, decoded in self.vba_strings:
1559 1570 if include_decoded_strings or is_printable(decoded):
1560 1571 results.append(('VBA string', decoded, encoded))
  1572 + self.results = results
1561 1573 return results
1562 1574  
1563 1575 def scan_summary(self):
... ... @@ -1569,7 +1581,9 @@ class VBA_Scanner(object):
1569 1581 :return: tuple with the number of items found for each category:
1570 1582 (autoexec, suspicious, IOCs, hex, base64, dridex, vba)
1571 1583 """
1572   - self.scan()
  1584 + # avoid scanning the same code twice:
  1585 + if self.results is None:
  1586 + self.scan()
1573 1587 return (len(self.autoexec_keywords), len(self.suspicious_keywords),
1574 1588 len(self.iocs), len(self.hex_strings), len(self.base64_strings),
1575 1589 len(self.dridex_strings), len(self.vba_strings))
... ... @@ -1630,6 +1644,22 @@ class VBA_Parser(object):
1630 1644 self.filename = filename
1631 1645 self.type = None
1632 1646 self.vba_projects = None
  1647 + self.contains_macros = None # will be set to True or False by detect_vba_macros
  1648 + self.vba_code_all_modules = None # to store the source code of all modules
  1649 + # list of tuples for each module: (subfilename, stream_path, vba_filename, vba_code)
  1650 + self.modules = None
  1651 + # Analysis results: list of tuples (type, keyword, description) - See VBA_Scanner
  1652 + self.analysis_results = None
  1653 + # statistics for the scan summary and flags
  1654 + self.nb_macros = 0
  1655 + self.nb_autoexec = 0
  1656 + self.nb_suspicious = 0
  1657 + self.nb_iocs = 0
  1658 + self.nb_hexstrings = 0
  1659 + self.nb_base64strings = 0
  1660 + self.nb_dridexstrings = 0
  1661 + self.nb_vbastrings = 0
  1662 +
1633 1663 # if filename is None:
1634 1664 # if isinstance(_file, basestring):
1635 1665 # if len(_file) < olefile.MINIMAL_OLEFILE_SIZE:
... ... @@ -1857,19 +1887,25 @@ class VBA_Parser(object):
1857 1887 """
1858 1888 #TODO: return None or raise exception if format not supported like PPT 97-2003
1859 1889 #TODO: return the number of VBA projects found instead of True/False?
  1890 + # if this method was already called, return the previous result:
  1891 + if self.contains_macros is not None:
  1892 + return self.contains_macros
1860 1893 # if OpenXML, check all the OLE subfiles:
1861 1894 if self.ole_file is None:
1862 1895 for ole_subfile in self.ole_subfiles:
1863 1896 if ole_subfile.detect_vba_macros():
  1897 + self.contains_macros = True
1864 1898 return True
  1899 + # otherwise, no macro found:
  1900 + self.contains_macros = False
1865 1901 return False
1866 1902 # otherwise it's an OLE file, find VBA projects:
1867 1903 vba_projects = self.find_vba_projects()
1868 1904 if len(vba_projects) == 0:
1869   - return False
  1905 + self.contains_macros = False
1870 1906 else:
1871   - return True
1872   -
  1907 + self.contains_macros = True
  1908 + return self.contains_macros
1873 1909  
1874 1910 def extract_macros(self):
1875 1911 """
... ... @@ -1893,6 +1929,52 @@ class VBA_Parser(object):
1893 1929 yield (self.filename, stream_path, vba_filename, vba_code)
1894 1930  
1895 1931  
  1932 + def extract_all_macros(self):
  1933 + """
  1934 + Extract and decompress source code for each VBA macro found in the file
  1935 + by calling extract_macros(), store the results as a list of tuples
  1936 + (filename, stream_path, vba_filename, vba_code) in self.modules.
  1937 + See extract_macros for details.
  1938 + """
  1939 + if self.modules is None:
  1940 + self.modules = []
  1941 + for (subfilename, stream_path, vba_filename, vba_code) in self.extract_macros():
  1942 + self.modules.append((subfilename, stream_path, vba_filename, vba_code))
  1943 + self.nb_macros = len(self.modules)
  1944 + return self.modules
  1945 +
  1946 +
  1947 +
  1948 + def analyze_macros(self, show_decoded_strings=False):
  1949 + """
  1950 + Runs extract_all_macros() and analyzes the source code of all VBA macros
  1951 + found in the file.
  1952 + """
  1953 + if self.detect_vba_macros():
  1954 + # variable to merge source code from all modules:
  1955 + if self.vba_code_all_modules is None:
  1956 + self.vba_code_all_modules = ''
  1957 + for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros():
  1958 + #TODO: filter code? (each module)
  1959 + self.vba_code_all_modules += vba_code + '\n'
  1960 + # Analyze the whole code at once:
  1961 + scanner = VBA_Scanner(self.vba_code_all_modules)
  1962 + self.analysis_results = scanner.scan(show_decoded_strings)
  1963 + autoexec, suspicious, iocs, hexstrings, base64strings, dridex, vbastrings = scanner.scan_summary()
  1964 + self.nb_autoexec += autoexec
  1965 + self.nb_suspicious += suspicious
  1966 + self.nb_iocs += iocs
  1967 + self.nb_hexstrings += hexstrings
  1968 + self.nb_base64strings += base64strings
  1969 + self.nb_dridexstrings += dridex
  1970 + self.nb_vbastrings += vbastrings
  1971 +
  1972 + return self.analysis_results
  1973 +
  1974 +
  1975 +
  1976 +
  1977 +
1896 1978 def close(self):
1897 1979 """
1898 1980 Close all the open files. This method must be called after usage, if
... ... @@ -1905,7 +1987,7 @@ class VBA_Parser(object):
1905 1987 self.ole_file.close()
1906 1988  
1907 1989  
1908   -def print_analysis(vba_code, show_decoded_strings=False):
  1990 +def print_analysis(vba_parser, show_decoded_strings=False):
1909 1991 """
1910 1992 Analyze the provided VBA code, and print the results in a table
1911 1993  
... ... @@ -1916,7 +1998,8 @@ def print_analysis(vba_code, show_decoded_strings=False):
1916 1998 # print a waiting message only if the output is not redirected to a file:
1917 1999 if sys.stdout.isatty():
1918 2000 print 'Analysis...\r',
1919   - results = scan_vba(vba_code, show_decoded_strings)
  2001 + sys.stdout.flush()
  2002 + results = vba_parser.analyze_macros(show_decoded_strings)
1920 2003 if results:
1921 2004 t = prettytable.PrettyTable(('Type', 'Keyword', 'Description'))
1922 2005 t.align = 'l'
... ... @@ -1967,9 +2050,7 @@ def process_file(container, filename, data, show_decoded_strings=False,
1967 2050 print 'Type:', vba.type
1968 2051 if vba.detect_vba_macros():
1969 2052 #print 'Contains VBA Macros:'
1970   - # variable to merge source code from all modules:
1971   - vba_code_all_modules = ''
1972   - for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
  2053 + for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_all_macros():
1973 2054 if hide_attributes:
1974 2055 # hide attribute lines:
1975 2056 vba_code_filtered = filter_vba(vba_code)
... ... @@ -1986,15 +2067,15 @@ def process_file(container, filename, data, show_decoded_strings=False,
1986 2067 else:
1987 2068 print vba_code_filtered
1988 2069 if not global_analysis and not vba_code_only:
  2070 + #TODO: remove this option
  2071 + raise NotImplementedError
1989 2072 print '- ' * 39
1990 2073 print 'ANALYSIS:'
1991 2074 # analyse each module's code, filtered to avoid false positives:
1992   - print_analysis(vba_code_filtered, show_decoded_strings)
1993   - else:
1994   - vba_code_all_modules += vba_code_filtered + '\n'
  2075 + print_analysis(vba, show_decoded_strings)
1995 2076 if global_analysis and not vba_code_only:
1996 2077 # analyse the code from all modules at once:
1997   - print_analysis(vba_code_all_modules, show_decoded_strings)
  2078 + print_analysis(vba, show_decoded_strings)
1998 2079 else:
1999 2080 print 'No VBA macros found.'
2000 2081 except: #TypeError:
... ... @@ -2005,6 +2086,13 @@ def process_file(container, filename, data, show_decoded_strings=False,
2005 2086 traceback.print_exc()
2006 2087 print ''
2007 2088  
  2089 +# short tag to display file types in triage mode:
  2090 +TYPE2TAG = {
  2091 + TYPE_OLE: 'OLE:',
  2092 + TYPE_OpenXML: 'OpX:',
  2093 + TYPE_Word2003_XML: 'XML:',
  2094 + TYPE_MHTML: 'MHT:',
  2095 +}
2008 2096  
2009 2097 def process_file_triage(container, filename, data):
2010 2098 """
... ... @@ -2016,56 +2104,30 @@ def process_file_triage(container, filename, data):
2016 2104 :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
2017 2105 """
2018 2106 #TODO: replace print by writing to a provided output file (sys.stdout by default)
2019   - nb_macros = 0
2020   - nb_autoexec = 0
2021   - nb_suspicious = 0
2022   - nb_iocs = 0
2023   - nb_hexstrings = 0
2024   - nb_base64strings = 0
2025   - nb_dridexstrings = 0
2026   - nb_vbastrings = 0
2027 2107 # ftype = 'Other'
2028 2108 message = ''
2029 2109 try:
2030 2110 #TODO: handle olefile errors, when an OLE file is malformed
2031 2111 vba = VBA_Parser(filename, data)
2032 2112 if vba.detect_vba_macros():
2033   - for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
2034   - nb_macros += 1
2035   - if vba_code.strip() != '':
2036   - # print a waiting message only if the output is not redirected to a file:
2037   - if sys.stdout.isatty():
2038   - print 'Analysis...\r',
2039   - # analyse the whole code, filtered to avoid false positives:
2040   - scanner = VBA_Scanner(filter_vba(vba_code))
2041   - autoexec, suspicious, iocs, hexstrings, base64strings, dridex, vbastrings = scanner.scan_summary()
2042   - nb_autoexec += autoexec
2043   - nb_suspicious += suspicious
2044   - nb_iocs += iocs
2045   - nb_hexstrings += hexstrings
2046   - nb_base64strings += base64strings
2047   - nb_dridexstrings += dridex
2048   - nb_vbastrings += vbastrings
2049   - if vba.type == TYPE_OLE:
2050   - flags = 'OLE:'
2051   - elif vba.type == TYPE_OpenXML:
2052   - flags = 'OpX:'
2053   - elif vba.type == TYPE_Word2003_XML:
2054   - flags = 'XML:'
2055   - elif vba.type == TYPE_MHTML:
2056   - flags = 'MHT:'
  2113 + # print a waiting message only if the output is not redirected to a file:
  2114 + if sys.stdout.isatty():
  2115 + print 'Analysis...\r',
  2116 + sys.stdout.flush()
  2117 + vba.analyze_macros()
  2118 + flags = TYPE2TAG[vba.type]
2057 2119 macros = autoexec = suspicious = iocs = hexstrings = base64obf = dridex = vba_obf = '-'
2058   - if nb_macros: macros = 'M'
2059   - if nb_autoexec: autoexec = 'A'
2060   - if nb_suspicious: suspicious = 'S'
2061   - if nb_iocs: iocs = 'I'
2062   - if nb_hexstrings: hexstrings = 'H'
2063   - if nb_base64strings: base64obf = 'B'
2064   - if nb_dridexstrings: dridex = 'D'
2065   - if nb_vbastrings: vba_obf = 'V'
  2120 + if vba.nb_macros: macros = 'M'
  2121 + if vba.nb_autoexec: autoexec = 'A'
  2122 + if vba.nb_suspicious: suspicious = 'S'
  2123 + if vba.nb_iocs: iocs = 'I'
  2124 + if vba.nb_hexstrings: hexstrings = 'H'
  2125 + if vba.nb_base64strings: base64obf = 'B'
  2126 + if vba.nb_dridexstrings: dridex = 'D'
  2127 + if vba.nb_vbastrings: vba_obf = 'V'
2066 2128 flags += '%s%s%s%s%s%s%s%s' % (macros, autoexec, suspicious, iocs, hexstrings,
2067 2129 base64obf, dridex, vba_obf)
2068   -
  2130 + # old table display:
2069 2131 # macros = autoexec = suspicious = iocs = hexstrings = 'no'
2070 2132 # if nb_macros: macros = 'YES:%d' % nb_macros
2071 2133 # if nb_autoexec: autoexec = 'YES:%d' % nb_autoexec
... ... @@ -2123,7 +2185,7 @@ def main():
2123 2185 parser.add_option("-r", action="store_true", dest="recursive",
2124 2186 help='find files recursively in subdirectories.')
2125 2187 parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
2126   - help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)')
  2188 + help='if the file is a zip archive, open all files from it, using the provided password (requires Python 2.6+)')
2127 2189 parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
2128 2190 help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
2129 2191 parser.add_option("-t", '--triage', action="store_true", dest="triage_mode",
... ... @@ -2162,17 +2224,23 @@ def main():
2162 2224 logging.disable(logging.CRITICAL)
2163 2225  
2164 2226 if options.input:
  2227 + #TODO: remove this option
  2228 + raise NotImplementedError
2165 2229 # input file provided with VBA source code to be analyzed directly:
2166 2230 print 'Analysis of VBA source code from %s:' % options.input
2167 2231 vba_code = open(options.input).read()
2168 2232 print_analysis(vba_code, show_decoded_strings=options.show_decoded_strings)
2169 2233 sys.exit()
2170 2234  
  2235 + # Old display with number of items detected:
2171 2236 # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('Type', 'Macros', 'AutoEx', 'Susp.', 'IOCs', 'HexStr')
2172 2237 # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('-'*8, '-'*7, '-'*7, '-'*7, '-'*7, '-'*7)
  2238 +
  2239 + # Column headers (skipped only in detailed mode without triage mode)
2173 2240 if not options.detailed_mode or options.triage_mode:
2174 2241 print '%-12s %-65s' % ('Flags', 'Filename')
2175 2242 print '%-12s %-65s' % ('-' * 11, '-' * 65)
  2243 +
2176 2244 previous_container = None
2177 2245 count = 0
2178 2246 container = filename = data = None
... ... @@ -2203,6 +2271,7 @@ def main():
2203 2271 if count == 1 and not options.triage_mode and not options.detailed_mode:
2204 2272 # if options -t and -d were not specified and it's a single file, print details:
2205 2273 #TODO: avoid doing the analysis twice by storing results
  2274 + #TODO: all the CLI functions should be methods of a class VBA_Parser_CLI
2206 2275 process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings,
2207 2276 display_code=options.display_code, global_analysis=options.global_analysis,
2208 2277 hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only)
... ...
setup.py 100755 → 100644
... ... @@ -37,7 +37,7 @@ import sys, os, fnmatch
37 37 #--- METADATA -----------------------------------------------------------------
38 38  
39 39 name = "oletools"
40   -version = '0.12'
  40 +version = '0.13'
41 41 desc = "Python tools to analyze security characteristics of MS Office and OLE files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), for Malware Analysis and Incident Response #DFIR"
42 42 long_desc = open('oletools/README.rst').read()
43 43 author ="Philippe Lagadec"
... ...