Commit e636b4f8ad1666964596f6a7cae79414d32571bd

Authored by decalage2
1 parent 1541d5de

olevba: reverted to python 2.7 version, moved python 3 version to olevba3.py

oletools/olevba.py
@@ -215,7 +215,7 @@ __version__ = '0.50' @@ -215,7 +215,7 @@ __version__ = '0.50'
215 215
216 import sys, logging 216 import sys, logging
217 import struct 217 import struct
218 -from _io import StringIO,BytesIO 218 +import cStringIO
219 import math 219 import math
220 import zipfile 220 import zipfile
221 import re 221 import re
@@ -240,9 +240,9 @@ except ImportError: @@ -240,9 +240,9 @@ except ImportError:
240 # Python <2.5: standalone ElementTree install 240 # Python <2.5: standalone ElementTree install
241 import elementtree.cElementTree as ET 241 import elementtree.cElementTree as ET
242 except ImportError: 242 except ImportError:
243 - raise(ImportError, "lxml or ElementTree are not installed, " \ 243 + raise ImportError, "lxml or ElementTree are not installed, " \
244 + "see http://codespeak.net/lxml " \ 244 + "see http://codespeak.net/lxml " \
245 - + "or http://effbot.org/zone/element-index.htm") 245 + + "or http://effbot.org/zone/element-index.htm"
246 246
247 import thirdparty.olefile as olefile 247 import thirdparty.olefile as olefile
248 from thirdparty.prettytable import prettytable 248 from thirdparty.prettytable import prettytable
@@ -421,7 +421,7 @@ TYPE2TAG = { @@ -421,7 +421,7 @@ TYPE2TAG = {
421 421
422 422
423 # MSO files ActiveMime header magic 423 # MSO files ActiveMime header magic
424 -MSO_ACTIVEMIME_HEADER = b'ActiveMime' 424 +MSO_ACTIVEMIME_HEADER = 'ActiveMime'
425 425
426 MODULE_EXTENSION = "bas" 426 MODULE_EXTENSION = "bas"
427 CLASS_EXTENSION = "cls" 427 CLASS_EXTENSION = "cls"
@@ -630,7 +630,7 @@ re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"') @@ -630,7 +630,7 @@ re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"')
630 re_nothex_check = re.compile(r'[G-Zg-z]') 630 re_nothex_check = re.compile(r'[G-Zg-z]')
631 631
632 # regex to extract printable strings (at least 5 chars) from VBA Forms: 632 # regex to extract printable strings (at least 5 chars) from VBA Forms:
633 -re_printable_string = re.compile(rb'[\t\r\n\x20-\xFF]{5,}') 633 +re_printable_string = re.compile(r'[\t\r\n\x20-\xFF]{5,}')
634 634
635 635
636 # === PARTIAL VBA GRAMMAR ==================================================== 636 # === PARTIAL VBA GRAMMAR ====================================================
@@ -1060,10 +1060,10 @@ def decompress_stream(compressed_container): @@ -1060,10 +1060,10 @@ def decompress_stream(compressed_container):
1060 # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the 1060 # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the
1061 # DecompressedBuffer (section 2.4.1.1.2). 1061 # DecompressedBuffer (section 2.4.1.1.2).
1062 1062
1063 - decompressed_container = b'' # result 1063 + decompressed_container = '' # result
1064 compressed_current = 0 1064 compressed_current = 0
1065 1065
1066 - sig_byte = compressed_container[compressed_current] 1066 + sig_byte = ord(compressed_container[compressed_current])
1067 if sig_byte != 0x01: 1067 if sig_byte != 0x01:
1068 raise ValueError('invalid signature byte {0:02X}'.format(sig_byte)) 1068 raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))
1069 1069
@@ -1109,7 +1109,7 @@ def decompress_stream(compressed_container): @@ -1109,7 +1109,7 @@ def decompress_stream(compressed_container):
1109 # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk 1109 # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
1110 # uncompressed chunk: read the next 4096 bytes as-is 1110 # uncompressed chunk: read the next 4096 bytes as-is
1111 #TODO: check if there are at least 4096 bytes left 1111 #TODO: check if there are at least 4096 bytes left
1112 - decompressed_container += bytes([compressed_container[compressed_current:compressed_current + 4096]]) 1112 + decompressed_container += compressed_container[compressed_current:compressed_current + 4096]
1113 compressed_current += 4096 1113 compressed_current += 4096
1114 else: 1114 else:
1115 # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk 1115 # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
@@ -1120,9 +1120,9 @@ def decompress_stream(compressed_container): @@ -1120,9 +1120,9 @@ def decompress_stream(compressed_container):
1120 # log.debug('compressed_current = %d / compressed_end = %d' % (compressed_current, compressed_end)) 1120 # log.debug('compressed_current = %d / compressed_end = %d' % (compressed_current, compressed_end))
1121 # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or 1121 # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
1122 # copy tokens (reference to a previous literal token) 1122 # copy tokens (reference to a previous literal token)
1123 - flag_byte = compressed_container[compressed_current] 1123 + flag_byte = ord(compressed_container[compressed_current])
1124 compressed_current += 1 1124 compressed_current += 1
1125 - for bit_index in range(0, 8): 1125 + for bit_index in xrange(0, 8):
1126 # log.debug('bit_index=%d / compressed_current=%d / compressed_end=%d' % (bit_index, compressed_current, compressed_end)) 1126 # log.debug('bit_index=%d / compressed_current=%d / compressed_end=%d' % (bit_index, compressed_current, compressed_end))
1127 if compressed_current >= compressed_end: 1127 if compressed_current >= compressed_end:
1128 break 1128 break
@@ -1132,7 +1132,7 @@ def decompress_stream(compressed_container): @@ -1132,7 +1132,7 @@ def decompress_stream(compressed_container):
1132 #log.debug('bit_index=%d: flag_bit=%d' % (bit_index, flag_bit)) 1132 #log.debug('bit_index=%d: flag_bit=%d' % (bit_index, flag_bit))
1133 if flag_bit == 0: # LiteralToken 1133 if flag_bit == 0: # LiteralToken
1134 # copy one byte directly to output 1134 # copy one byte directly to output
1135 - decompressed_container += bytes([compressed_container[compressed_current]]) 1135 + decompressed_container += compressed_container[compressed_current]
1136 compressed_current += 1 1136 compressed_current += 1
1137 else: # CopyToken 1137 else: # CopyToken
1138 # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken 1138 # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
@@ -1147,8 +1147,8 @@ def decompress_stream(compressed_container): @@ -1147,8 +1147,8 @@ def decompress_stream(compressed_container):
1147 offset = (temp1 >> temp2) + 1 1147 offset = (temp1 >> temp2) + 1
1148 #log.debug('offset=%d length=%d' % (offset, length)) 1148 #log.debug('offset=%d length=%d' % (offset, length))
1149 copy_source = len(decompressed_container) - offset 1149 copy_source = len(decompressed_container) - offset
1150 - for index in range(copy_source, copy_source + length):  
1151 - decompressed_container += bytes([decompressed_container[index]]) 1150 + for index in xrange(copy_source, copy_source + length):
  1151 + decompressed_container += decompressed_container[index]
1152 compressed_current += 2 1152 compressed_current += 2
1153 return decompressed_container 1153 return decompressed_container
1154 1154
@@ -1191,7 +1191,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): @@ -1191,7 +1191,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1191 code_modules = {} 1191 code_modules = {}
1192 1192
1193 for line in project: 1193 for line in project:
1194 - line = line.strip().decode('utf-8','ignore') 1194 + line = line.strip()
1195 if '=' in line: 1195 if '=' in line:
1196 # split line at the 1st equal sign: 1196 # split line at the 1st equal sign:
1197 name, value = line.split('=', 1) 1197 name, value = line.split('=', 1)
@@ -1222,7 +1222,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): @@ -1222,7 +1222,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1222 else: 1222 else:
1223 raise UnexpectedDataError(dir_path, name, expected, value) 1223 raise UnexpectedDataError(dir_path, name, expected, value)
1224 1224
1225 - dir_stream = BytesIO(decompress_stream(dir_compressed)) 1225 + dir_stream = cStringIO.StringIO(decompress_stream(dir_compressed))
1226 1226
1227 # PROJECTSYSKIND Record 1227 # PROJECTSYSKIND Record
1228 projectsyskind_id = struct.unpack("<H", dir_stream.read(2))[0] 1228 projectsyskind_id = struct.unpack("<H", dir_stream.read(2))[0]
@@ -1484,7 +1484,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): @@ -1484,7 +1484,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1484 uni_out = lambda unicode_text: unicode_text.encode('utf-8', 'replace') 1484 uni_out = lambda unicode_text: unicode_text.encode('utf-8', 'replace')
1485 1485
1486 log.debug("parsing {0} modules".format(projectmodules_count)) 1486 log.debug("parsing {0} modules".format(projectmodules_count))
1487 - for projectmodule_index in range(0, projectmodules_count): 1487 + for projectmodule_index in xrange(0, projectmodules_count):
1488 try: 1488 try:
1489 modulename_id = struct.unpack("<H", dir_stream.read(2))[0] 1489 modulename_id = struct.unpack("<H", dir_stream.read(2))[0]
1490 check_value('MODULENAME_Id', 0x0019, modulename_id) 1490 check_value('MODULENAME_Id', 0x0019, modulename_id)
@@ -1881,19 +1881,19 @@ def json2ascii(json_obj, encoding='utf8', errors='replace'): @@ -1881,19 +1881,19 @@ def json2ascii(json_obj, encoding='utf8', errors='replace'):
1881 pass 1881 pass
1882 elif isinstance(json_obj, str): 1882 elif isinstance(json_obj, str):
1883 # de-code and re-encode 1883 # de-code and re-encode
1884 - dencoded = json_obj 1884 + dencoded = json_obj.decode(encoding, errors).encode(encoding, errors)
1885 if dencoded != json_obj: 1885 if dencoded != json_obj:
1886 log.debug('json2ascii: replaced: {0} (len {1})' 1886 log.debug('json2ascii: replaced: {0} (len {1})'
1887 .format(json_obj, len(json_obj))) 1887 .format(json_obj, len(json_obj)))
1888 log.debug('json2ascii: with: {0} (len {1})' 1888 log.debug('json2ascii: with: {0} (len {1})'
1889 .format(dencoded, len(dencoded))) 1889 .format(dencoded, len(dencoded)))
1890 return dencoded 1890 return dencoded
1891 - elif isinstance(json_obj, bytes): 1891 + elif isinstance(json_obj, unicode):
1892 log.debug('json2ascii: encode unicode: {0}' 1892 log.debug('json2ascii: encode unicode: {0}'
1893 - .format(json_obj.decode(encoding, errors))) 1893 + .format(json_obj.encode(encoding, errors)))
1894 # cannot put original into logger 1894 # cannot put original into logger
1895 # print 'original: ' json_obj 1895 # print 'original: ' json_obj
1896 - return json_obj.decode(encoding, errors) 1896 + return json_obj.encode(encoding, errors)
1897 elif isinstance(json_obj, dict): 1897 elif isinstance(json_obj, dict):
1898 for key in json_obj: 1898 for key in json_obj:
1899 json_obj[key] = json2ascii(json_obj[key]) 1899 json_obj[key] = json2ascii(json_obj[key])
@@ -1931,18 +1931,18 @@ def print_json(json_dict=None, _json_is_last=False, **json_parts): @@ -1931,18 +1931,18 @@ def print_json(json_dict=None, _json_is_last=False, **json_parts):
1931 json_dict = json_parts 1931 json_dict = json_parts
1932 1932
1933 if not _have_printed_json_start: 1933 if not _have_printed_json_start:
1934 - print('[') 1934 + print '['
1935 _have_printed_json_start = True 1935 _have_printed_json_start = True
1936 1936
1937 lines = json.dumps(json2ascii(json_dict), check_circular=False, 1937 lines = json.dumps(json2ascii(json_dict), check_circular=False,
1938 indent=4, ensure_ascii=False).splitlines() 1938 indent=4, ensure_ascii=False).splitlines()
1939 for line in lines[:-1]: 1939 for line in lines[:-1]:
1940 - print(' {0}'.format(line)) 1940 + print ' {0}'.format(line)
1941 if _json_is_last: 1941 if _json_is_last:
1942 - print(' {0}'.format(lines[-1])) # print last line without comma  
1943 - print(']') 1942 + print ' {0}'.format(lines[-1]) # print last line without comma
  1943 + print ']'
1944 else: 1944 else:
1945 - print(' {0},'.format(lines[-1])) # print last line with comma 1945 + print ' {0},'.format(lines[-1]) # print last line with comma
1946 1946
1947 1947
1948 class VBA_Scanner(object): 1948 class VBA_Scanner(object):
@@ -1959,10 +1959,10 @@ class VBA_Scanner(object): @@ -1959,10 +1959,10 @@ class VBA_Scanner(object):
1959 """ 1959 """
1960 # join long lines ending with " _": 1960 # join long lines ending with " _":
1961 self.code = vba_collapse_long_lines(vba_code) 1961 self.code = vba_collapse_long_lines(vba_code)
1962 - self.code_hex = b''  
1963 - self.code_hex_rev = b''  
1964 - self.code_rev_hex = b''  
1965 - self.code_base64 = b'' 1962 + self.code_hex = ''
  1963 + self.code_hex_rev = ''
  1964 + self.code_rev_hex = ''
  1965 + self.code_base64 = ''
1966 self.code_dridex = '' 1966 self.code_dridex = ''
1967 self.code_vba = '' 1967 self.code_vba = ''
1968 self.strReverse = None 1968 self.strReverse = None
@@ -1995,19 +1995,19 @@ class VBA_Scanner(object): @@ -1995,19 +1995,19 @@ class VBA_Scanner(object):
1995 if 'strreverse' in self.code.lower(): self.strReverse = True 1995 if 'strreverse' in self.code.lower(): self.strReverse = True
1996 # Then append the decoded strings to the VBA code, to detect obfuscated IOCs and keywords: 1996 # Then append the decoded strings to the VBA code, to detect obfuscated IOCs and keywords:
1997 for encoded, decoded in self.hex_strings: 1997 for encoded, decoded in self.hex_strings:
1998 - self.code_hex += b'\n' + decoded 1998 + self.code_hex += '\n' + decoded
1999 # if the code contains "StrReverse", also append the hex strings in reverse order: 1999 # if the code contains "StrReverse", also append the hex strings in reverse order:
2000 if self.strReverse: 2000 if self.strReverse:
2001 # StrReverse after hex decoding: 2001 # StrReverse after hex decoding:
2002 - self.code_hex_rev += b'\n' + decoded[::-1] 2002 + self.code_hex_rev += '\n' + decoded[::-1]
2003 # StrReverse before hex decoding: 2003 # StrReverse before hex decoding:
2004 - self.code_rev_hex += b'\n' + binascii.unhexlify(encoded[::-1]) 2004 + self.code_rev_hex += '\n' + binascii.unhexlify(encoded[::-1])
2005 #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/ 2005 #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/
2006 #TODO: also append the full code reversed if StrReverse? (risk of false positives?) 2006 #TODO: also append the full code reversed if StrReverse? (risk of false positives?)
2007 # Detect Base64-encoded strings 2007 # Detect Base64-encoded strings
2008 self.base64_strings = detect_base64_strings(self.code) 2008 self.base64_strings = detect_base64_strings(self.code)
2009 for encoded, decoded in self.base64_strings: 2009 for encoded, decoded in self.base64_strings:
2010 - self.code_base64 += b'\n' + decoded 2010 + self.code_base64 += '\n' + decoded
2011 # Detect Dridex-encoded strings 2011 # Detect Dridex-encoded strings
2012 self.dridex_strings = detect_dridex_strings(self.code) 2012 self.dridex_strings = detect_dridex_strings(self.code)
2013 for encoded, decoded in self.dridex_strings: 2013 for encoded, decoded in self.dridex_strings:
@@ -2026,15 +2026,13 @@ class VBA_Scanner(object): @@ -2026,15 +2026,13 @@ class VBA_Scanner(object):
2026 2026
2027 for code, obfuscation in ( 2027 for code, obfuscation in (
2028 (self.code, None), 2028 (self.code, None),
2029 - (self.code_hex.decode('utf-8','replace'), 'Hex'), 2029 + (self.code_hex, 'Hex'),
2030 (self.code_hex_rev, 'Hex+StrReverse'), 2030 (self.code_hex_rev, 'Hex+StrReverse'),
2031 (self.code_rev_hex, 'StrReverse+Hex'), 2031 (self.code_rev_hex, 'StrReverse+Hex'),
2032 - (self.code_base64.decode('utf-8', 'replace'), 'Base64'), 2032 + (self.code_base64, 'Base64'),
2033 (self.code_dridex, 'Dridex'), 2033 (self.code_dridex, 'Dridex'),
2034 (self.code_vba, 'VBA expression'), 2034 (self.code_vba, 'VBA expression'),
2035 ): 2035 ):
2036 - if isinstance(code,bytes):  
2037 - code=code.decode('utf-8','replace')  
2038 self.autoexec_keywords += detect_autoexec(code, obfuscation) 2036 self.autoexec_keywords += detect_autoexec(code, obfuscation)
2039 self.suspicious_keywords += detect_suspicious(code, obfuscation) 2037 self.suspicious_keywords += detect_suspicious(code, obfuscation)
2040 self.iocs += detect_patterns(code, obfuscation) 2038 self.iocs += detect_patterns(code, obfuscation)
@@ -2160,7 +2158,7 @@ class VBA_Parser(object): @@ -2160,7 +2158,7 @@ class VBA_Parser(object):
2160 _file = filename 2158 _file = filename
2161 else: 2159 else:
2162 # file already read in memory, make it a file-like object for zipfile: 2160 # file already read in memory, make it a file-like object for zipfile:
2163 - _file = BytesIO(data) 2161 + _file = cStringIO.StringIO(data)
2164 #self.file = _file 2162 #self.file = _file
2165 self.ole_file = None 2163 self.ole_file = None
2166 self.ole_subfiles = [] 2164 self.ole_subfiles = []
@@ -2209,7 +2207,7 @@ class VBA_Parser(object): @@ -2209,7 +2207,7 @@ class VBA_Parser(object):
2209 if data is None: 2207 if data is None:
2210 data = open(filename, 'rb').read() 2208 data = open(filename, 'rb').read()
2211 # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace 2209 # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
2212 - if b'http://schemas.microsoft.com/office/word/2003/wordml' in data: 2210 + if 'http://schemas.microsoft.com/office/word/2003/wordml' in data:
2213 self.open_word2003xml(data) 2211 self.open_word2003xml(data)
2214 # store a lowercase version for the next tests: 2212 # store a lowercase version for the next tests:
2215 data_lowercase = data.lower() 2213 data_lowercase = data.lower()
@@ -2219,14 +2217,14 @@ class VBA_Parser(object): @@ -2219,14 +2217,14 @@ class VBA_Parser(object):
2219 # and even whitespaces in between "MIME", "-", "Version" and ":". The version number is ignored. 2217 # and even whitespaces in between "MIME", "-", "Version" and ":". The version number is ignored.
2220 # And the line is case insensitive. 2218 # And the line is case insensitive.
2221 # so we'll just check the presence of mime, version and multipart anywhere: 2219 # so we'll just check the presence of mime, version and multipart anywhere:
2222 - if self.type is None and b'mime' in data_lowercase and b'version' in data_lowercase \  
2223 - and b'multipart' in data_lowercase: 2220 + if self.type is None and 'mime' in data_lowercase and 'version' in data_lowercase \
  2221 + and 'multipart' in data_lowercase:
2224 self.open_mht(data) 2222 self.open_mht(data)
2225 #TODO: handle exceptions 2223 #TODO: handle exceptions
2226 #TODO: Excel 2003 XML 2224 #TODO: Excel 2003 XML
2227 # Check if this is a plain text VBA or VBScript file: 2225 # Check if this is a plain text VBA or VBScript file:
2228 # To avoid scanning binary files, we simply check for some control chars: 2226 # To avoid scanning binary files, we simply check for some control chars:
2229 - if self.type is None and b'\x00' not in data: 2227 + if self.type is None and '\x00' not in data:
2230 self.open_text(data) 2228 self.open_text(data)
2231 if self.type is None: 2229 if self.type is None:
2232 # At this stage, could not match a known format: 2230 # At this stage, could not match a known format:
@@ -2360,8 +2358,6 @@ class VBA_Parser(object): @@ -2360,8 +2358,6 @@ class VBA_Parser(object):
2360 """ 2358 """
2361 log.info('Opening MHTML file %s' % self.filename) 2359 log.info('Opening MHTML file %s' % self.filename)
2362 try: 2360 try:
2363 - if isinstance(data,bytes):  
2364 - data = data.decode('utf8', 'replace')  
2365 # parse the MIME content 2361 # parse the MIME content
2366 # remove any leading whitespace or newline (workaround for issue in email package) 2362 # remove any leading whitespace or newline (workaround for issue in email package)
2367 stripped_data = data.lstrip('\r\n\t ') 2363 stripped_data = data.lstrip('\r\n\t ')
@@ -2391,8 +2387,7 @@ class VBA_Parser(object): @@ -2391,8 +2387,7 @@ class VBA_Parser(object):
2391 # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded. 2387 # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
2392 # decompress the zlib data starting at offset 0x32, which is the OLE container: 2388 # decompress the zlib data starting at offset 0x32, which is the OLE container:
2393 # check ActiveMime header: 2389 # check ActiveMime header:
2394 -  
2395 - if (isinstance(part_data, str) or isinstance(part_data, bytes)) and is_mso_file(part_data): 2390 + if isinstance(part_data, str) and is_mso_file(part_data):
2396 log.debug('Found ActiveMime header, decompressing MSO container') 2391 log.debug('Found ActiveMime header, decompressing MSO container')
2397 try: 2392 try:
2398 ole_data = mso_file_extract(part_data) 2393 ole_data = mso_file_extract(part_data)
@@ -2463,8 +2458,6 @@ class VBA_Parser(object): @@ -2463,8 +2458,6 @@ class VBA_Parser(object):
2463 """ 2458 """
2464 log.info('Opening text file %s' % self.filename) 2459 log.info('Opening text file %s' % self.filename)
2465 # directly store the source code: 2460 # directly store the source code:
2466 - if isinstance(data,bytes):  
2467 - data=data.decode('utf8','replace')  
2468 self.vba_code_all_modules = data 2461 self.vba_code_all_modules = data
2469 self.contains_macros = True 2462 self.contains_macros = True
2470 # set type only if parsing succeeds 2463 # set type only if parsing succeeds
@@ -2603,7 +2596,7 @@ class VBA_Parser(object): @@ -2603,7 +2596,7 @@ class VBA_Parser(object):
2603 # Also look for VBA code in any stream including orphans 2596 # Also look for VBA code in any stream including orphans
2604 # (happens in some malformed files) 2597 # (happens in some malformed files)
2605 ole = self.ole_file 2598 ole = self.ole_file
2606 - for sid in range(len(ole.direntries)): 2599 + for sid in xrange(len(ole.direntries)):
2607 # check if id is already done above: 2600 # check if id is already done above:
2608 log.debug('Checking DirEntry #%d' % sid) 2601 log.debug('Checking DirEntry #%d' % sid)
2609 d = ole.direntries[sid] 2602 d = ole.direntries[sid]
@@ -2621,7 +2614,7 @@ class VBA_Parser(object): @@ -2621,7 +2614,7 @@ class VBA_Parser(object):
2621 log.debug('%r...[much more data]...%r' % (data[:100], data[-50:])) 2614 log.debug('%r...[much more data]...%r' % (data[:100], data[-50:]))
2622 else: 2615 else:
2623 log.debug(repr(data)) 2616 log.debug(repr(data))
2624 - if 'Attribut' in data.decode('utf-8','ignore'): 2617 + if 'Attribut' in data:
2625 log.debug('Found VBA compressed code') 2618 log.debug('Found VBA compressed code')
2626 self.contains_macros = True 2619 self.contains_macros = True
2627 except IOError as exc: 2620 except IOError as exc:
@@ -2669,7 +2662,7 @@ class VBA_Parser(object): @@ -2669,7 +2662,7 @@ class VBA_Parser(object):
2669 # Also look for VBA code in any stream including orphans 2662 # Also look for VBA code in any stream including orphans
2670 # (happens in some malformed files) 2663 # (happens in some malformed files)
2671 ole = self.ole_file 2664 ole = self.ole_file
2672 - for sid in range(len(ole.direntries)): 2665 + for sid in xrange(len(ole.direntries)):
2673 # check if id is already done above: 2666 # check if id is already done above:
2674 log.debug('Checking DirEntry #%d' % sid) 2667 log.debug('Checking DirEntry #%d' % sid)
2675 if sid in vba_stream_ids: 2668 if sid in vba_stream_ids:
@@ -2684,7 +2677,7 @@ class VBA_Parser(object): @@ -2684,7 +2677,7 @@ class VBA_Parser(object):
2684 # read data 2677 # read data
2685 log.debug('Reading data from stream %r' % d.name) 2678 log.debug('Reading data from stream %r' % d.name)
2686 data = ole._open(d.isectStart, d.size).read() 2679 data = ole._open(d.isectStart, d.size).read()
2687 - for match in re.finditer(rb'\x00Attribut[^e]', data, flags=re.IGNORECASE): 2680 + for match in re.finditer(r'\x00Attribut[^e]', data, flags=re.IGNORECASE):
2688 start = match.start() - 3 2681 start = match.start() - 3
2689 log.debug('Found VBA compressed code at index %X' % start) 2682 log.debug('Found VBA compressed code at index %X' % start)
2690 compressed_code = data[start:] 2683 compressed_code = data[start:]
@@ -2727,9 +2720,9 @@ class VBA_Parser(object): @@ -2727,9 +2720,9 @@ class VBA_Parser(object):
2727 self.vba_code_all_modules = '' 2720 self.vba_code_all_modules = ''
2728 for (_, _, _, vba_code) in self.extract_all_macros(): 2721 for (_, _, _, vba_code) in self.extract_all_macros():
2729 #TODO: filter code? (each module) 2722 #TODO: filter code? (each module)
2730 - self.vba_code_all_modules += vba_code.decode('utf-8', 'ignore') + '\n' 2723 + self.vba_code_all_modules += vba_code + '\n'
2731 for (_, _, form_string) in self.extract_form_strings(): 2724 for (_, _, form_string) in self.extract_form_strings():
2732 - self.vba_code_all_modules += form_string.decode('utf-8', 'ignore') + '\n' 2725 + self.vba_code_all_modules += form_string + '\n'
2733 # Analyze the whole code at once: 2726 # Analyze the whole code at once:
2734 scanner = VBA_Scanner(self.vba_code_all_modules) 2727 scanner = VBA_Scanner(self.vba_code_all_modules)
2735 self.analysis_results = scanner.scan(show_decoded_strings, deobfuscate) 2728 self.analysis_results = scanner.scan(show_decoded_strings, deobfuscate)
@@ -2904,7 +2897,7 @@ class VBA_Parser_CLI(VBA_Parser): @@ -2904,7 +2897,7 @@ class VBA_Parser_CLI(VBA_Parser):
2904 """ 2897 """
2905 # print a waiting message only if the output is not redirected to a file: 2898 # print a waiting message only if the output is not redirected to a file:
2906 if sys.stdout.isatty(): 2899 if sys.stdout.isatty():
2907 - print('Analysis...\r') 2900 + print 'Analysis...\r',
2908 sys.stdout.flush() 2901 sys.stdout.flush()
2909 results = self.analyze_macros(show_decoded_strings, deobfuscate) 2902 results = self.analyze_macros(show_decoded_strings, deobfuscate)
2910 if results: 2903 if results:
@@ -2920,9 +2913,9 @@ class VBA_Parser_CLI(VBA_Parser): @@ -2920,9 +2913,9 @@ class VBA_Parser_CLI(VBA_Parser):
2920 if not is_printable(description): 2913 if not is_printable(description):
2921 description = repr(description) 2914 description = repr(description)
2922 t.add_row((kw_type, keyword, description)) 2915 t.add_row((kw_type, keyword, description))
2923 - print(t) 2916 + print t
2924 else: 2917 else:
2925 - print('No suspicious keyword or IOC found.') 2918 + print 'No suspicious keyword or IOC found.'
2926 2919
2927 def print_analysis_json(self, show_decoded_strings=False, deobfuscate=False): 2920 def print_analysis_json(self, show_decoded_strings=False, deobfuscate=False):
2928 """ 2921 """
@@ -2936,7 +2929,7 @@ class VBA_Parser_CLI(VBA_Parser): @@ -2936,7 +2929,7 @@ class VBA_Parser_CLI(VBA_Parser):
2936 """ 2929 """
2937 # print a waiting message only if the output is not redirected to a file: 2930 # print a waiting message only if the output is not redirected to a file:
2938 if sys.stdout.isatty(): 2931 if sys.stdout.isatty():
2939 - print('Analysis...\r') 2932 + print 'Analysis...\r',
2940 sys.stdout.flush() 2933 sys.stdout.flush()
2941 return [dict(type=kw_type, keyword=keyword, description=description) 2934 return [dict(type=kw_type, keyword=keyword, description=description)
2942 for kw_type, keyword, description in self.analyze_macros(show_decoded_strings, deobfuscate)] 2935 for kw_type, keyword, description in self.analyze_macros(show_decoded_strings, deobfuscate)]
@@ -2965,44 +2958,42 @@ class VBA_Parser_CLI(VBA_Parser): @@ -2965,44 +2958,42 @@ class VBA_Parser_CLI(VBA_Parser):
2965 display_filename = '%s in %s' % (self.filename, self.container) 2958 display_filename = '%s in %s' % (self.filename, self.container)
2966 else: 2959 else:
2967 display_filename = self.filename 2960 display_filename = self.filename
2968 - print('=' * 79)  
2969 - print('FILE:', display_filename) 2961 + print '=' * 79
  2962 + print 'FILE:', display_filename
2970 try: 2963 try:
2971 #TODO: handle olefile errors, when an OLE file is malformed 2964 #TODO: handle olefile errors, when an OLE file is malformed
2972 - print('Type: %s' % self.type) 2965 + print 'Type:', self.type
2973 if self.detect_vba_macros(): 2966 if self.detect_vba_macros():
2974 #print 'Contains VBA Macros:' 2967 #print 'Contains VBA Macros:'
2975 for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros(): 2968 for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros():
2976 if hide_attributes: 2969 if hide_attributes:
2977 # hide attribute lines: 2970 # hide attribute lines:
2978 - if isinstance(vba_code,bytes):  
2979 - vba_code =vba_code.decode('utf-8','replace')  
2980 vba_code_filtered = filter_vba(vba_code) 2971 vba_code_filtered = filter_vba(vba_code)
2981 else: 2972 else:
2982 vba_code_filtered = vba_code 2973 vba_code_filtered = vba_code
2983 - print('-' * 79)  
2984 - print('VBA MACRO %s ' % vba_filename)  
2985 - print('in file: %s - OLE stream: %s' % (subfilename, repr(stream_path))) 2974 + print '-' * 79
  2975 + print 'VBA MACRO %s ' % vba_filename
  2976 + print 'in file: %s - OLE stream: %s' % (subfilename, repr(stream_path))
2986 if display_code: 2977 if display_code:
2987 - print('- ' * 39) 2978 + print '- ' * 39
2988 # detect empty macros: 2979 # detect empty macros:
2989 if vba_code_filtered.strip() == '': 2980 if vba_code_filtered.strip() == '':
2990 - print('(empty macro)') 2981 + print '(empty macro)'
2991 else: 2982 else:
2992 - print(vba_code_filtered) 2983 + print vba_code_filtered
2993 for (subfilename, stream_path, form_string) in self.extract_form_strings(): 2984 for (subfilename, stream_path, form_string) in self.extract_form_strings():
2994 - print('-' * 79)  
2995 - print('VBA FORM STRING IN %r - OLE stream: %r' % (subfilename, stream_path))  
2996 - print('- ' * 39)  
2997 - print(form_string.decode('utf-8', 'ignore')) 2985 + print '-' * 79
  2986 + print 'VBA FORM STRING IN %r - OLE stream: %r' % (subfilename, stream_path)
  2987 + print '- ' * 39
  2988 + print form_string
2998 if not vba_code_only: 2989 if not vba_code_only:
2999 # analyse the code from all modules at once: 2990 # analyse the code from all modules at once:
3000 self.print_analysis(show_decoded_strings, deobfuscate) 2991 self.print_analysis(show_decoded_strings, deobfuscate)
3001 if show_deobfuscated_code: 2992 if show_deobfuscated_code:
3002 - print('MACRO SOURCE CODE WITH DEOBFUSCATED VBA STRINGS (EXPERIMENTAL):\n\n')  
3003 - print(self.reveal()) 2993 + print 'MACRO SOURCE CODE WITH DEOBFUSCATED VBA STRINGS (EXPERIMENTAL):\n\n'
  2994 + print self.reveal()
3004 else: 2995 else:
3005 - print('No VBA macros found.') 2996 + print 'No VBA macros found.'
3006 except OlevbaBaseException: 2997 except OlevbaBaseException:
3007 raise 2998 raise
3008 except Exception as exc: 2999 except Exception as exc:
@@ -3010,7 +3001,7 @@ class VBA_Parser_CLI(VBA_Parser): @@ -3010,7 +3001,7 @@ class VBA_Parser_CLI(VBA_Parser):
3010 log.info('Error processing file %s (%s)' % (self.filename, exc)) 3001 log.info('Error processing file %s (%s)' % (self.filename, exc))
3011 log.debug('Traceback:', exc_info=True) 3002 log.debug('Traceback:', exc_info=True)
3012 raise ProcessingError(self.filename, exc) 3003 raise ProcessingError(self.filename, exc)
3013 - print('') 3004 + print ''
3014 3005
3015 3006
3016 def process_file_json(self, show_decoded_strings=False, 3007 def process_file_json(self, show_decoded_strings=False,
@@ -3057,7 +3048,7 @@ class VBA_Parser_CLI(VBA_Parser): @@ -3057,7 +3048,7 @@ class VBA_Parser_CLI(VBA_Parser):
3057 curr_macro = {} 3048 curr_macro = {}
3058 if hide_attributes: 3049 if hide_attributes:
3059 # hide attribute lines: 3050 # hide attribute lines:
3060 - vba_code_filtered = filter_vba(vba_code.decode('utf-8','replace')) 3051 + vba_code_filtered = filter_vba(vba_code)
3061 else: 3052 else:
3062 vba_code_filtered = vba_code 3053 vba_code_filtered = vba_code
3063 3054
@@ -3096,7 +3087,7 @@ class VBA_Parser_CLI(VBA_Parser): @@ -3096,7 +3087,7 @@ class VBA_Parser_CLI(VBA_Parser):
3096 if self.detect_vba_macros(): 3087 if self.detect_vba_macros():
3097 # print a waiting message only if the output is not redirected to a file: 3088 # print a waiting message only if the output is not redirected to a file:
3098 if sys.stdout.isatty(): 3089 if sys.stdout.isatty():
3099 - print('Analysis...\r') 3090 + print 'Analysis...\r',
3100 sys.stdout.flush() 3091 sys.stdout.flush()
3101 self.analyze_macros(show_decoded_strings=show_decoded_strings, 3092 self.analyze_macros(show_decoded_strings=show_decoded_strings,
3102 deobfuscate=deobfuscate) 3093 deobfuscate=deobfuscate)
@@ -3114,7 +3105,7 @@ class VBA_Parser_CLI(VBA_Parser): @@ -3114,7 +3105,7 @@ class VBA_Parser_CLI(VBA_Parser):
3114 base64obf, dridex, vba_obf) 3105 base64obf, dridex, vba_obf)
3115 3106
3116 line = '%-12s %s' % (flags, self.filename) 3107 line = '%-12s %s' % (flags, self.filename)
3117 - print(line) 3108 + print line
3118 3109
3119 # old table display: 3110 # old table display:
3120 # macros = autoexec = suspicious = iocs = hexstrings = 'no' 3111 # macros = autoexec = suspicious = iocs = hexstrings = 'no'
@@ -3207,7 +3198,7 @@ def main(): @@ -3207,7 +3198,7 @@ def main():
3207 3198
3208 # Print help if no arguments are passed 3199 # Print help if no arguments are passed
3209 if len(args) == 0: 3200 if len(args) == 0:
3210 - print(__doc__) 3201 + print __doc__
3211 parser.print_help() 3202 parser.print_help()
3212 sys.exit(RETURN_WRONG_ARGS) 3203 sys.exit(RETURN_WRONG_ARGS)
3213 3204
@@ -3218,7 +3209,7 @@ def main(): @@ -3218,7 +3209,7 @@ def main():
3218 url='http://decalage.info/python/oletools', 3209 url='http://decalage.info/python/oletools',
3219 type='MetaInformation') 3210 type='MetaInformation')
3220 else: 3211 else:
3221 - print('olevba %s - http://decalage.info/python/oletools' % __version__) 3212 + print 'olevba %s - http://decalage.info/python/oletools' % __version__
3222 3213
3223 logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s') 3214 logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s')
3224 # enable logging in the modules: 3215 # enable logging in the modules:
@@ -3238,8 +3229,8 @@ def main(): @@ -3238,8 +3229,8 @@ def main():
3238 # Column headers (do not know how many files there will be yet, so if no output_mode 3229 # Column headers (do not know how many files there will be yet, so if no output_mode
3239 # was specified, we will print triage for first file --> need these headers) 3230 # was specified, we will print triage for first file --> need these headers)
3240 if options.output_mode in ('triage', 'unspecified'): 3231 if options.output_mode in ('triage', 'unspecified'):
3241 - print('%-12s %-65s' % ('Flags', 'Filename'))  
3242 - print('%-12s %-65s' % ('-' * 11, '-' * 65)) 3232 + print '%-12s %-65s' % ('Flags', 'Filename')
  3233 + print '%-12s %-65s' % ('-' * 11, '-' * 65)
3243 3234
3244 previous_container = None 3235 previous_container = None
3245 count = 0 3236 count = 0
@@ -3257,14 +3248,14 @@ def main(): @@ -3257,14 +3248,14 @@ def main():
3257 if isinstance(data, Exception): 3248 if isinstance(data, Exception):
3258 if isinstance(data, PathNotFoundException): 3249 if isinstance(data, PathNotFoundException):
3259 if options.output_mode in ('triage', 'unspecified'): 3250 if options.output_mode in ('triage', 'unspecified'):
3260 - print('%-12s %s - File not found' % ('?', filename)) 3251 + print '%-12s %s - File not found' % ('?', filename)
3261 elif options.output_mode != 'json': 3252 elif options.output_mode != 'json':
3262 log.error('Given path %r does not exist!' % filename) 3253 log.error('Given path %r does not exist!' % filename)
3263 return_code = RETURN_FILE_NOT_FOUND if return_code == 0 \ 3254 return_code = RETURN_FILE_NOT_FOUND if return_code == 0 \
3264 else RETURN_SEVERAL_ERRS 3255 else RETURN_SEVERAL_ERRS
3265 else: 3256 else:
3266 if options.output_mode in ('triage', 'unspecified'): 3257 if options.output_mode in ('triage', 'unspecified'):
3267 - print('%-12s %s - Failed to read from zip file %s' % ('?', filename, container)) 3258 + print '%-12s %s - Failed to read from zip file %s' % ('?', filename, container)
3268 elif options.output_mode != 'json': 3259 elif options.output_mode != 'json':
3269 log.error('Exception opening/reading %r from zip file %r: %s' 3260 log.error('Exception opening/reading %r from zip file %r: %s'
3270 % (filename, container, data)) 3261 % (filename, container, data))
@@ -3291,7 +3282,7 @@ def main(): @@ -3291,7 +3282,7 @@ def main():
3291 # print container name when it changes: 3282 # print container name when it changes:
3292 if container != previous_container: 3283 if container != previous_container:
3293 if container is not None: 3284 if container is not None:
3294 - print('\nFiles in %s:' % container) 3285 + print '\nFiles in %s:' % container
3295 previous_container = container 3286 previous_container = container
3296 # summarized output for triage: 3287 # summarized output for triage:
3297 vba_parser.process_file_triage(show_decoded_strings=options.show_decoded_strings, 3288 vba_parser.process_file_triage(show_decoded_strings=options.show_decoded_strings,
@@ -3309,8 +3300,8 @@ def main(): @@ -3309,8 +3300,8 @@ def main():
3309 3300
3310 except (SubstreamOpenError, UnexpectedDataError) as exc: 3301 except (SubstreamOpenError, UnexpectedDataError) as exc:
3311 if options.output_mode in ('triage', 'unspecified'): 3302 if options.output_mode in ('triage', 'unspecified'):
3312 - print('%-12s %s - Error opening substream or uenxpected ' \  
3313 - 'content' % ('?', filename)) 3303 + print '%-12s %s - Error opening substream or uenxpected ' \
  3304 + 'content' % ('?', filename)
3314 elif options.output_mode == 'json': 3305 elif options.output_mode == 'json':
3315 print_json(file=filename, type='error', 3306 print_json(file=filename, type='error',
3316 error=type(exc).__name__, message=str(exc)) 3307 error=type(exc).__name__, message=str(exc))
@@ -3321,7 +3312,7 @@ def main(): @@ -3321,7 +3312,7 @@ def main():
3321 else RETURN_SEVERAL_ERRS 3312 else RETURN_SEVERAL_ERRS
3322 except FileOpenError as exc: 3313 except FileOpenError as exc:
3323 if options.output_mode in ('triage', 'unspecified'): 3314 if options.output_mode in ('triage', 'unspecified'):
3324 - print('%-12s %s - File format not supported' % ('?', filename)) 3315 + print '%-12s %s - File format not supported' % ('?', filename)
3325 elif options.output_mode == 'json': 3316 elif options.output_mode == 'json':
3326 print_json(file=filename, type='error', 3317 print_json(file=filename, type='error',
3327 error=type(exc).__name__, message=str(exc)) 3318 error=type(exc).__name__, message=str(exc))
@@ -3331,7 +3322,7 @@ def main(): @@ -3331,7 +3322,7 @@ def main():
3331 else RETURN_SEVERAL_ERRS 3322 else RETURN_SEVERAL_ERRS
3332 except ProcessingError as exc: 3323 except ProcessingError as exc:
3333 if options.output_mode in ('triage', 'unspecified'): 3324 if options.output_mode in ('triage', 'unspecified'):
3334 - print('%-12s %s - %s' % ('!ERROR', filename, exc.orig_exc)) 3325 + print '%-12s %s - %s' % ('!ERROR', filename, exc.orig_exc)
3335 elif options.output_mode == 'json': 3326 elif options.output_mode == 'json':
3336 print_json(file=filename, type='error', 3327 print_json(file=filename, type='error',
3337 error=type(exc).__name__, 3328 error=type(exc).__name__,
@@ -3346,9 +3337,9 @@ def main(): @@ -3346,9 +3337,9 @@ def main():
3346 vba_parser.close() 3337 vba_parser.close()
3347 3338
3348 if options.output_mode == 'triage': 3339 if options.output_mode == 'triage':
3349 - print('\n(Flags: OpX=OpenXML, XML=Word2003XML, MHT=MHTML, TXT=Text, M=Macros, ' \ 3340 + print '\n(Flags: OpX=OpenXML, XML=Word2003XML, MHT=MHTML, TXT=Text, M=Macros, ' \
3350 'A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, ' \ 3341 'A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, ' \
3351 - 'B=Base64 strings, D=Dridex strings, V=VBA strings, ?=Unknown)\n') 3342 + 'B=Base64 strings, D=Dridex strings, V=VBA strings, ?=Unknown)\n'
3352 3343
3353 if count == 1 and options.output_mode == 'unspecified': 3344 if count == 1 and options.output_mode == 'unspecified':
3354 # if options -t, -d and -j were not specified and it's a single file, print details: 3345 # if options -t, -d and -j were not specified and it's a single file, print details:
oletools/olevba3.py 0 → 100755
  1 +#!/usr/bin/env python
  2 +"""
  3 +olevba.py
  4 +
  5 +olevba is a script to parse OLE and OpenXML files such as MS Office documents
  6 +(e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate
  7 +and analyze malicious macros.
  8 +
  9 +Supported formats:
  10 +- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
  11 +- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
  12 +- PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm)
  13 +- Word 2003 XML (.xml)
  14 +- Word/Excel Single File Web Page / MHTML (.mht)
  15 +
  16 +Author: Philippe Lagadec - http://www.decalage.info
  17 +License: BSD, see source code or documentation
  18 +
  19 +olevba is part of the python-oletools package:
  20 +http://www.decalage.info/python/oletools
  21 +
  22 +olevba is based on source code from officeparser by John William Davison
  23 +https://github.com/unixfreak0037/officeparser
  24 +"""
  25 +
  26 +# === LICENSE ==================================================================
  27 +
  28 +# olevba is copyright (c) 2014-2016 Philippe Lagadec (http://www.decalage.info)
  29 +# All rights reserved.
  30 +#
  31 +# Redistribution and use in source and binary forms, with or without modification,
  32 +# are permitted provided that the following conditions are met:
  33 +#
  34 +# * Redistributions of source code must retain the above copyright notice, this
  35 +# list of conditions and the following disclaimer.
  36 +# * Redistributions in binary form must reproduce the above copyright notice,
  37 +# this list of conditions and the following disclaimer in the documentation
  38 +# and/or other materials provided with the distribution.
  39 +#
  40 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  41 +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  42 +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  43 +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  44 +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  45 +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  46 +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  47 +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  48 +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  49 +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  50 +
  51 +
  52 +# olevba contains modified source code from the officeparser project, published
  53 +# under the following MIT License (MIT):
  54 +#
  55 +# officeparser is copyright (c) 2014 John William Davison
  56 +#
  57 +# Permission is hereby granted, free of charge, to any person obtaining a copy
  58 +# of this software and associated documentation files (the "Software"), to deal
  59 +# in the Software without restriction, including without limitation the rights
  60 +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  61 +# copies of the Software, and to permit persons to whom the Software is
  62 +# furnished to do so, subject to the following conditions:
  63 +#
  64 +# The above copyright notice and this permission notice shall be included in all
  65 +# copies or substantial portions of the Software.
  66 +#
  67 +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  68 +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  69 +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  70 +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  71 +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  72 +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  73 +# SOFTWARE.
  74 +
  75 +#------------------------------------------------------------------------------
  76 +# CHANGELOG:
  77 +# 2014-08-05 v0.01 PL: - first version based on officeparser code
  78 +# 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser
  79 +# 2014-08-15 PL: - fixed incorrect value check in projecthelpfilepath Record
  80 +# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats
  81 +# and to find the VBA project root anywhere in the file
  82 +# 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL
  83 +# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API
  84 +# - added detect_vba_macros
  85 +# 2014-12-10 v0.06 PL: - hide first lines with VB attributes
  86 +# - detect auto-executable macros
  87 +# - ignore empty macros
  88 +# 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive
  89 +# 2014-12-15 v0.08 PL: - improved display for empty macros
  90 +# - added pattern extraction
  91 +# 2014-12-25 v0.09 PL: - added suspicious keywords detection
  92 +# 2014-12-27 v0.10 PL: - added OptionParser, main and process_file
  93 +# - uses xglob to scan several files with wildcards
  94 +# - option -r to recurse subdirectories
  95 +# - option -z to scan files in password-protected zips
  96 +# 2015-01-02 v0.11 PL: - improved filter_vba to detect colons
  97 +# 2015-01-03 v0.12 PL: - fixed detect_patterns to detect all patterns
  98 +# - process_file: improved display, shows container file
  99 +# - improved list of executable file extensions
  100 +# 2015-01-04 v0.13 PL: - added several suspicious keywords, improved display
  101 +# 2015-01-08 v0.14 PL: - added hex strings detection and decoding
  102 +# - fixed issue #2, decoding VBA stream names using
  103 +# specified codepage and unicode stream names
  104 +# 2015-01-11 v0.15 PL: - added new triage mode, options -t and -d
  105 +# 2015-01-16 v0.16 PL: - fix for issue #3 (exception when module name="text")
  106 +# - added several suspicious keywords
  107 +# - added option -i to analyze VBA source code directly
  108 +# 2015-01-17 v0.17 PL: - removed .com from the list of executable extensions
  109 +# - added scan_vba to run all detection algorithms
  110 +# - decoded hex strings are now also scanned + reversed
  111 +# 2015-01-23 v0.18 PL: - fixed issue #3, case-insensitive search in code_modules
  112 +# 2015-01-24 v0.19 PL: - improved the detection of IOCs obfuscated with hex
  113 +# strings and StrReverse
  114 +# 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded
  115 +# 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding
  116 +# - improved display, shows obfuscation name
  117 +# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename
  118 +# - added Base64 obfuscation decoding (contribution from
  119 +# @JamesHabben)
  120 +# 2015-02-03 v0.23 PL: - triage now uses VBA_Scanner results, shows Base64 and
  121 +# Dridex strings
  122 +# - exception handling in detect_base64_strings
  123 +# 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display
  124 +# - display exceptions with stack trace
  125 +# - added several suspicious keywords
  126 +# - improved Base64 detection and decoding
  127 +# - fixed triage mode not to scan attrib lines
  128 +# 2015-03-04 v0.25 PL: - added support for Word 2003 XML
  129 +# 2015-03-22 v0.26 PL: - added suspicious keywords for sandboxing and
  130 +# virtualisation detection
  131 +# 2015-05-06 v0.27 PL: - added support for MHTML files with VBA macros
  132 +# (issue #10 reported by Greg from SpamStopsHere)
  133 +# 2015-05-24 v0.28 PL: - improved support for MHTML files with modified header
  134 +# (issue #11 reported by Thomas Chopitea)
  135 +# 2015-05-26 v0.29 PL: - improved MSO files parsing, taking into account
  136 +# various data offsets (issue #12)
  137 +# - improved detection of MSO files, avoiding incorrect
  138 +# parsing errors (issue #7)
  139 +# 2015-05-29 v0.30 PL: - added suspicious keywords suggested by @ozhermit,
  140 +# Davy Douhine (issue #9), issue #13
  141 +# 2015-06-16 v0.31 PL: - added generic VBA expression deobfuscation (chr,asc,etc)
  142 +# 2015-06-19 PL: - added options -a, -c, --each, --attr
  143 +# 2015-06-21 v0.32 PL: - always display decoded strings which are printable
  144 +# - fix VBA_Scanner.scan to return raw strings, not repr()
  145 +# 2015-07-09 v0.40 PL: - removed usage of sys.stderr which causes issues
  146 +# 2015-07-12 PL: - added Hex function decoding to VBA Parser
  147 +# 2015-07-13 PL: - added Base64 function decoding to VBA Parser
  148 +# 2015-09-06 PL: - improved VBA_Parser, refactored the main functions
  149 +# 2015-09-13 PL: - moved main functions to a class VBA_Parser_CLI
  150 +# - fixed issue when analysis was done twice
  151 +# 2015-09-15 PL: - remove duplicate IOCs from results
  152 +# 2015-09-16 PL: - join long VBA lines ending with underscore before scan
  153 +# - disabled unused option --each
  154 +# 2015-09-22 v0.41 PL: - added new option --reveal
  155 +# - added suspicious strings for PowerShell.exe options
  156 +# 2015-10-09 v0.42 PL: - VBA_Parser: split each format into a separate method
  157 +# 2015-10-10 PL: - added support for text files with VBA source code
  158 +# 2015-11-17 PL: - fixed bug with --decode option
  159 +# 2015-12-16 PL: - fixed bug in main (no options input anymore)
  160 +# - improved logging, added -l option
  161 +# 2016-01-31 PL: - fixed issue #31 in VBA_Parser.open_mht
  162 +# - fixed issue #32 by monkeypatching email.feedparser
  163 +# 2016-02-07 PL: - KeyboardInterrupt is now raised properly
  164 +# 2016-02-20 v0.43 PL: - fixed issue #34 in the VBA parser and vba_chr
  165 +# 2016-02-29 PL: - added Workbook_Activate to suspicious keywords
  166 +# 2016-03-08 v0.44 PL: - added VBA Form strings extraction and analysis
  167 +# 2016-03-04 v0.45 CH: - added JSON output (by Christian Herdtweck)
  168 +# 2016-03-16 CH: - added option --no-deobfuscate (temporary)
  169 +# 2016-04-19 v0.46 PL: - new option --deobf instead of --no-deobfuscate
  170 +# - updated suspicious keywords
  171 +# 2016-05-04 v0.47 PL: - look for VBA code in any stream including orphans
  172 +# 2016-04-28 CH: - return an exit code depending on the results
  173 +# - improved error and exception handling
  174 +# - improved JSON output
  175 +# 2016-05-12 CH: - added support for PowerPoint 97-2003 files
  176 +# 2016-06-06 CH: - improved handling of unicode VBA module names
  177 +# 2016-06-07 CH: - added option --relaxed, stricter parsing by default
  178 +# 2016-06-12 v0.50 PL: - fixed small bugs in VBA parsing code
  179 +# 2016-07-01 PL: - fixed issue #58 with format() to support Python 2.6
  180 +# 2016-07-29 CH: - fixed several bugs including #73 (Mac Roman encoding)
  181 +
  182 +__version__ = '0.50'
  183 +
  184 +#------------------------------------------------------------------------------
  185 +# TODO:
  186 +# + setup logging (common with other oletools)
  187 +# + add xor bruteforcing like bbharvest
  188 +# + options -a and -c should imply -d
  189 +
  190 +# TODO later:
  191 +# + performance improvement: instead of searching each keyword separately,
  192 +# first split vba code into a list of words (per line), then check each
  193 +# word against a dict. (or put vba words into a set/dict?)
  194 +# + for regex, maybe combine them into a single re with named groups?
  195 +# + add Yara support, include sample rules? plugins like balbuzard?
  196 +# + add balbuzard support
  197 +# + output to file (replace print by file.write, sys.stdout by default)
  198 +# + look for VBA in embedded documents (e.g. Excel in Word)
  199 +# + support SRP streams (see Lenny's article + links and sample)
  200 +# - python 3.x support
  201 +# - check VBA macros in Visio, Access, Project, etc
  202 +# - extract_macros: convert to a class, split long function into smaller methods
  203 +# - extract_macros: read bytes from stream file objects instead of strings
  204 +# - extract_macros: use combined struct.unpack instead of many calls
  205 +# - all except clauses should target specific exceptions
  206 +
  207 +#------------------------------------------------------------------------------
  208 +# REFERENCES:
  209 +# - [MS-OVBA]: Microsoft Office VBA File Format Structure
  210 +# http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx
  211 +# - officeparser: https://github.com/unixfreak0037/officeparser
  212 +
  213 +
  214 +#--- IMPORTS ------------------------------------------------------------------
  215 +
  216 +import sys, logging
  217 +import struct
  218 +from _io import StringIO,BytesIO
  219 +import math
  220 +import zipfile
  221 +import re
  222 +import optparse
  223 +import binascii
  224 +import base64
  225 +import zlib
  226 +import email # for MHTML parsing
  227 +import string # for printable
  228 +import json # for json output mode (argument --json)
  229 +
  230 +# import lxml or ElementTree for XML parsing:
# Pick the best available ElementTree implementation and bind it to ET.
# Order of preference: lxml, the C-accelerated stdlib module (Python 2.5 to
# 3.8), the regular stdlib module (the only one left since Python 3.9, where
# xml.etree.cElementTree was removed), then the legacy standalone package.
try:
    # lxml: best performance for XML processing
    import lxml.etree as ET
except ImportError:
    try:
        # Python 2.5+: batteries included
        import xml.etree.cElementTree as ET
    except ImportError:
        try:
            # Python 3.9+: cElementTree is gone, ElementTree is the only name
            import xml.etree.ElementTree as ET
        except ImportError:
            try:
                # Python <2.5: standalone ElementTree install
                import elementtree.cElementTree as ET
            except ImportError:
                # NOTE: this must be a call, not "raise(ImportError, msg)":
                # raising a tuple is a TypeError on Python 3.
                raise ImportError("lxml or ElementTree are not installed, "
                                  "see http://codespeak.net/lxml "
                                  "or http://effbot.org/zone/element-index.htm")
  246 +
  247 +import thirdparty.olefile as olefile
  248 +from thirdparty.prettytable import prettytable
  249 +from thirdparty.xglob import xglob, PathNotFoundException
  250 +from thirdparty.pyparsing.pyparsing import \
  251 + CaselessKeyword, CaselessLiteral, Combine, Forward, Literal, \
  252 + Optional, QuotedString,Regex, Suppress, Word, WordStart, \
  253 + alphanums, alphas, hexnums,nums, opAssoc, srange, \
  254 + infixNotation
  255 +import ppt_parser
  256 +
  257 +# monkeypatch email to fix issue #32:
  258 +# allow header lines without ":"
  259 +import email.feedparser
  260 +email.feedparser.headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:?|[\t ])')
  261 +
  262 +
  263 +# === LOGGING =================================================================
  264 +
class NullHandler(logging.Handler):
    """
    Logging handler that silently drops every record.

    It is attached to this module's logger so that nothing is printed when
    the main application has not configured logging itself.
    Python 2.7 ships logging.NullHandler, but Python 2.6 does not, hence
    this local copy:
    see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library
    """

    def emit(self, record):
        """Ignore the given record: no output is produced."""
        return None
  274 +
def get_logger(name, level=logging.CRITICAL+1):
    """
    Return a logger for this module without touching the root logger.

    Leaving the root logger alone avoids getting other modules' logs on the
    screen. A logger already registered under the same name is reused as-is
    (adding a second handler would double every message); a new logger only
    gets a NullHandler, since it is up to the application to configure its
    own logging.

    :param name: name of the logger to create or reuse
    :param level: level applied to the logger in both cases; the default
        CRITICAL+1 disables all logging
    :return: the logging.Logger object
    """
    # The registry must be checked BEFORE calling getLogger, because
    # getLogger registers the name as a side effect.
    #NOTE: another less intrusive but more "hackish" solution would be to
    #      use getLogger then test if its effective level is not default.
    is_new = name not in logging.Logger.manager.loggerDict
    logger = logging.getLogger(name)
    if is_new:
        logger.addHandler(NullHandler())
    # make sure level is OK, for new and reused loggers alike:
    logger.setLevel(level)
    return logger
  300 +
  301 +# a global logger object used for debugging:
  302 +log = get_logger('olevba')
  303 +
  304 +
  305 +#=== EXCEPTIONS ==============================================================
  306 +
class OlevbaBaseException(Exception):
    """
    Base class for exceptions produced here for simpler except clauses.

    Attributes:
    - msg: message given by the raiser (without the orig_exc suffix)
    - filename: name of the file concerned, or None
    - orig_exc: underlying exception that caused this one, or None
    """
    def __init__(self, msg, filename=None, orig_exc=None, **kwargs):
        if orig_exc:
            # append the text of the underlying exception to the message
            super(OlevbaBaseException, self).__init__(msg +
                                                      ' ({0})'.format(orig_exc),
                                                      **kwargs)
        else:
            super(OlevbaBaseException, self).__init__(msg, **kwargs)
        self.msg = msg
        self.filename = filename
        self.orig_exc = orig_exc


class FileOpenError(OlevbaBaseException):
    """ raised by VBA_Parser constructor if all open_... attempts failed

    probably means the file type is not supported
    """

    def __init__(self, filename, orig_exc=None):
        super(FileOpenError, self).__init__(
            'Failed to open file %s' % filename, filename, orig_exc)


class ProcessingError(OlevbaBaseException):
    """ raised by VBA_Parser.process_file* functions """

    def __init__(self, filename, orig_exc):
        super(ProcessingError, self).__init__(
            'Error processing file %s' % filename, filename, orig_exc)


class MsoExtractionError(RuntimeError, OlevbaBaseException):
    """ raised by mso_file_extract if parsing MSO/ActiveMIME data failed """

    def __init__(self, msg):
        # Initialise both bases explicitly. The first call must target
        # RuntimeError: calling MsoExtractionError.__init__ here would be
        # this very method calling itself, i.e. infinite recursion.
        RuntimeError.__init__(self, msg)
        OlevbaBaseException.__init__(self, msg)


class SubstreamOpenError(FileOpenError):
    """ special kind of FileOpenError: file is a substream of original file """

    def __init__(self, filename, subfilename, orig_exc=None):
        # the message shows "<filename>/<subfilename>" as the failing path
        super(SubstreamOpenError, self).__init__(
            str(filename) + '/' + str(subfilename), orig_exc)
        self.filename = filename   # overwrite setting in OlevbaBaseException
        self.subfilename = subfilename


class UnexpectedDataError(OlevbaBaseException):
    """ raised when parsing is strict (=not relaxed) and data is unexpected """

    def __init__(self, stream_path, variable, expected, value):
        # expected and value are rendered as 4-digit uppercase hex, so both
        # must be integers
        super(UnexpectedDataError, self).__init__(
            'Unexpected value in {0} for variable {1}: '
            'expected {2:04X} but found {3:04X}!'
            .format(stream_path, variable, expected, value))
        self.stream_path = stream_path
        self.variable = variable
        self.expected = expected
        self.value = value
  370 +
  371 +#--- CONSTANTS ----------------------------------------------------------------
  372 +
  373 +# return codes
  374 +RETURN_OK = 0
  375 +RETURN_WARNINGS = 1 # (reserved, not used yet)
  376 +RETURN_WRONG_ARGS = 2 # (fixed, built into optparse)
  377 +RETURN_FILE_NOT_FOUND = 3
  378 +RETURN_XGLOB_ERR = 4
  379 +RETURN_OPEN_ERROR = 5
  380 +RETURN_PARSE_ERROR = 6
  381 +RETURN_SEVERAL_ERRS = 7
  382 +RETURN_UNEXPECTED = 8
  383 +
  384 +# MAC codepages (from http://stackoverflow.com/questions/1592925/decoding-mac-os-text-in-python)
# Maps Mac codepage numbers to the name of a Python codec. Where Python has
# no dedicated Mac codec, the closest available codec is used instead (see
# the per-entry notes). Every value must be a name accepted by
# codecs.lookup(), otherwise decoding raises LookupError.
MAC_CODEPAGES = {
    10000: 'mac-roman',
    10001: 'shiftjis',  # not found: 'mac-shift-jis',
    10003: 'ascii',     # nothing appropriate found: 'mac-hangul',
    10008: 'gb2312',    # not found: 'mac-gb2312',
    10002: 'big5',      # not found: 'mac-big5',
    10005: 'hebrew',    # not found: 'mac-hebrew',
    10004: 'mac-arabic',
    10006: 'mac-greek',
    10081: 'mac-turkish',
    10021: 'thai',      # not found: mac-thai',
    10029: 'maccentraleurope',  # not found: 'mac-east europe',
    10007: 'ascii',     # nothing appropriate found: 'mac-russian',
}
  399 +
  400 +# URL and message to report issues:
  401 +URL_OLEVBA_ISSUES = 'https://github.com/decalage2/oletools/issues'
  402 +MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES
  403 +
  404 +# Container types:
TYPE_OLE = 'OLE'
TYPE_OpenXML = 'OpenXML'
TYPE_Word2003_XML = 'Word2003_XML'
TYPE_MHTML = 'MHTML'
TYPE_TEXT = 'Text'
TYPE_PPT = 'PPT'

# short tag to display file types in triage mode:
# (every tag is 3 letters followed by a colon, so all tags have the same
# width and shape)
TYPE2TAG = {
    TYPE_OLE: 'OLE:',
    TYPE_OpenXML: 'OpX:',
    TYPE_Word2003_XML: 'XML:',
    TYPE_MHTML: 'MHT:',
    TYPE_TEXT: 'TXT:',
    TYPE_PPT: 'PPT:',
}
  421 +
  422 +
  423 +# MSO files ActiveMime header magic
  424 +MSO_ACTIVEMIME_HEADER = b'ActiveMime'
  425 +
  426 +MODULE_EXTENSION = "bas"
  427 +CLASS_EXTENSION = "cls"
  428 +FORM_EXTENSION = "frm"
  429 +
  430 +# Namespaces and tags for Word2003 XML parsing:
  431 +NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}'
  432 +# the tag <w:binData w:name="editdata.mso"> contains the VBA macro code:
  433 +TAG_BINDATA = NS_W + 'binData'
  434 +ATTR_NAME = NS_W + 'name'
  435 +
  436 +# Keywords to detect auto-executable macros
  437 +AUTOEXEC_KEYWORDS = {
  438 + # MS Word:
  439 + 'Runs when the Word document is opened':
  440 + ('AutoExec', 'AutoOpen', 'Document_Open', 'DocumentOpen'),
  441 + 'Runs when the Word document is closed':
  442 + ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose'),
  443 + 'Runs when the Word document is modified':
  444 + ('DocumentChange',),
  445 + 'Runs when a new Word document is created':
  446 + ('AutoNew', 'Document_New', 'NewDocument'),
  447 +
  448 + # MS Excel:
  449 + 'Runs when the Excel Workbook is opened':
  450 + ('Auto_Open', 'Workbook_Open', 'Workbook_Activate'),
  451 + 'Runs when the Excel Workbook is closed':
  452 + ('Auto_Close', 'Workbook_Close'),
  453 +
  454 + #TODO: full list in MS specs??
  455 +}
  456 +
# Suspicious Keywords that may be used by malware
# See VBA language reference: http://msdn.microsoft.com/en-us/library/office/jj692818%28v=office.15%29.aspx
# Structure: each key is a human-readable description of a suspicious
# behaviour, and each value is a tuple of the keywords associated with it.
SUSPICIOUS_KEYWORDS = {
    #TODO: use regex to support variable whitespaces
    'May read system environment variables':
        ('Environ',),
    'May open a file':
        ('Open',),
    'May write to a file (if combined with Open)':
    #TODO: regex to find Open+Write on same line
        ('Write', 'Put', 'Output', 'Print #'),
    'May read or write a binary file (if combined with Open)':
    #TODO: regex to find Open+Binary on same line
        ('Binary',),
    'May copy a file':
        ('FileCopy', 'CopyFile'),
    #FileCopy: http://msdn.microsoft.com/en-us/library/office/gg264390%28v=office.15%29.aspx
    #CopyFile: http://msdn.microsoft.com/en-us/library/office/gg264089%28v=office.15%29.aspx
    'May delete a file':
        ('Kill',),
    'May create a text file':
        ('CreateTextFile', 'ADODB.Stream', 'WriteText', 'SaveToFile'),
    #CreateTextFile: http://msdn.microsoft.com/en-us/library/office/gg264617%28v=office.15%29.aspx
    #ADODB.Stream sample: http://pastebin.com/Z4TMyuq6
    'May run an executable file or a system command':
        ('Shell', 'vbNormal', 'vbNormalFocus', 'vbHide', 'vbMinimizedFocus', 'vbMaximizedFocus', 'vbNormalNoFocus',
         'vbMinimizedNoFocus', 'WScript.Shell', 'Run', 'ShellExecute'),
    #Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx
    #WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6
    'May run PowerShell commands':
    #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
    #also: https://bitbucket.org/decalage/oletools/issues/14/olevba-library-update-ioc
    # ref: https://blog.netspi.com/15-ways-to-bypass-the-powershell-execution-policy/
    # TODO: add support for keywords starting with a non-alpha character, such as "-noexit"
    # TODO: '-command', '-EncodedCommand', '-scriptblock'
        ('PowerShell', 'noexit', 'ExecutionPolicy', 'noprofile', 'command', 'EncodedCommand',
         'invoke-command', 'scriptblock', 'Invoke-Expression', 'AuthorizationManager'),
    'May run an executable file or a system command using PowerShell':
        ('Start-Process',),
    'May hide the application':
        ('Application.Visible', 'ShowWindow', 'SW_HIDE'),
    'May create a directory':
        ('MkDir',),
    'May save the current workbook':
        ('ActiveWorkbook.SaveAs',),
    'May change which directory contains files to open at startup':
    #TODO: confirm the actual effect
        ('Application.AltStartupPath',),
    'May create an OLE object':
        ('CreateObject',),
    'May create an OLE object using PowerShell':
        ('New-Object',),
    'May run an application (if combined with CreateObject)':
        ('Shell.Application',),
    'May enumerate application windows (if combined with Shell.Application object)':
        ('Windows', 'FindWindow'),
    'May run code from a DLL':
    #TODO: regex to find declare+lib on same line
        ('Lib',),
    'May inject code into another process':
        ('CreateThread', 'VirtualAlloc', # (issue #9) suggested by Davy Douhine - used by MSF payload
        ),
    'May download files from the Internet':
    #TODO: regex to find urlmon+URLDownloadToFileA on same line
        ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP',
         'MSXML2.ServerXMLHTTP', # suggested in issue #13
         'User-Agent', # sample from @ozhermit: http://pastebin.com/MPc3iV6z
        ),
    'May download files from the Internet using PowerShell':
    #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
        ('Net.WebClient', 'DownloadFile', 'DownloadString'),
    'May control another application by simulating user keystrokes':
        ('SendKeys', 'AppActivate'),
    #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx
    'May attempt to obfuscate malicious function calls':
        ('CallByName',),
    #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx
    'May attempt to obfuscate specific strings':
    #TODO: regex to find several Chr*, not just one
        ('Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor'),
    #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx
    'May read or write registry keys':
    #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
        ('RegOpenKeyExA', 'RegOpenKeyEx', 'RegCloseKey'),
    'May read registry keys':
    #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
        ('RegQueryValueExA', 'RegQueryValueEx',
         'RegRead', #with Wscript.Shell
        ),
    'May detect virtualization':
    # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
        (r'SYSTEM\ControlSet001\Services\Disk\Enum', 'VIRTUAL', 'VMWARE', 'VBOX'),
    'May detect Anubis Sandbox':
    # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
    # NOTES: this sample also checks App.EXEName but that seems to be a bug, it works in VB6 but not in VBA
    # ref: http://www.syssec-project.eu/m/page-media/3/disarm-raid11.pdf
        ('GetVolumeInformationA', 'GetVolumeInformation', # with kernel32.dll
         '1824245000', r'HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows NT\CurrentVersion\ProductId',
         '76487-337-8429955-22614', 'andy', 'sample', r'C:\exec\exec.exe', 'popupkiller'
        ),
    'May detect Sandboxie':
    # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
    # ref: http://www.cplusplus.com/forum/windows/96874/
        ('SbieDll.dll', 'SandboxieControlWndClass'),
    'May detect Sunbelt Sandbox':
    # ref: http://www.cplusplus.com/forum/windows/96874/
        (r'C:\file.exe',),
    'May detect Norman Sandbox':
    # ref: http://www.cplusplus.com/forum/windows/96874/
        ('currentuser',),
    'May detect CW Sandbox':
    # ref: http://www.cplusplus.com/forum/windows/96874/
        ('Schmidti',),
    'May detect WinJail Sandbox':
    # ref: http://www.cplusplus.com/forum/windows/96874/
        ('Afx:400000:0',),
}
  574 +
# Regular Expression for a URL:
# http://en.wikipedia.org/wiki/Uniform_resource_locator
# http://www.w3.org/Addressing/URL/uri-spec.html
#TODO: also support username:password@server
#TODO: other protocols (file, gopher, wais, ...?)
# The pieces below are plain pattern strings that are concatenated into the
# final expressions; only re_url and the RE_PATTERNS entries are compiled.
SCHEME = r'\b(?:http|ftp)s?'
# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})'
DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + ')'
#TODO: IPv6 - see https://www.debuggex.com/
# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. [db8:0cec::99:123a]
NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'
IPv4 = r'(?:' + NUMBER_0_255 + r'\.){3}' + NUMBER_0_255
# IPv4 must come before the DNS name because it is more specific
SERVER = r'(?:' + IPv4 + '|' + DNS_NAME + ')'
PORT = r'(?:\:[0-9]{1,5})?'
SERVER_PORT = SERVER + PORT
URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?'  # [^\.\,\)\(\s"]
URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH
re_url = re.compile(URL_RE)


# Patterns to be extracted (IP addresses, URLs, etc)
# From patterns.py in balbuzard
# Each entry is a (pattern name, compiled regex) pair.
RE_PATTERNS = (
    ('URL', re.compile(URL_RE)),
    ('IPv4 address', re.compile(IPv4)),
    # TODO: add IPv6
    # NOTE: the trailing word boundary must be a raw string: a plain '\b' is
    # the backspace character (0x08), which made this pattern never match
    # ordinary text.
    ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@' + SERVER + r'\b')),
    # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')),
    # Executable file name with known extensions (except .com which is present in many URLs, and .application):
    ("Executable file name", re.compile(
        r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b")),
    # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
    # TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types
    # TODO: add win & unix file paths
    #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')),
)
  613 +
# Hexadecimal-encoded strings: at least four pairs of hex digits in a row.
re_hex_string = re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')

# Base64-encoded strings.
# Earlier, looser version kept for reference:
#re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?"')
# Stricter version from balbuzard (fewer false positives). BASE64_RE is the
# bare pattern without surrounding double quotes, so that it can be reused
# (see quoted_base64_string below).
BASE64_RE = r'(?:[A-Za-z0-9+/]{4}){1,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)?'
re_base64_string = re.compile('"' + BASE64_RE + '"')
# Common strings (all lowercase) which match the base64 regex although they
# are not base64 data:
BASE64_WHITELIST = {
    'thisdocument',
    'thisworkbook',
    'test',
    'temp',
    'http',
    'open',
    'exit',
}

# Strings encoded with a specific Dridex algorithm
# (see https://github.com/JamesHabben/MalwareStuff)
re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"')
# Matches any letter that cannot be a hex digit, to check that a candidate is
# not just a hex string:
re_nothex_check = re.compile(r'[G-Zg-z]')

# Printable strings (at least 5 chars) to be extracted from VBA Forms:
re_printable_string = re.compile(rb'[\t\r\n\x20-\xFF]{5,}')
  634 +
  635 +
  636 +# === PARTIAL VBA GRAMMAR ====================================================
  637 +
  638 +# REFERENCES:
  639 +# - [MS-VBAL]: VBA Language Specification
  640 +# https://msdn.microsoft.com/en-us/library/dd361851.aspx
  641 +# - pyparsing: http://pyparsing.wikispaces.com/
  642 +
  643 +# TODO: set whitespaces according to VBA
  644 +# TODO: merge extended lines before parsing
  645 +
# VBA identifier chars (from MS-VBAL 3.3.5)
# Used with WordStart() below so that tokens are not matched in the middle of
# an identifier.
vba_identifier_chars = alphanums + '_'
  648 +
class VbaExpressionString(str):
    """
    Marker subclass of str: it adds no behaviour of its own.

    It exists only so that strings produced by evaluating obfuscated VBA
    expressions (Chr, StrReverse, etc) can be told apart from plain strings.
    Every VBA expression parse action is expected to wrap its result in
    VbaExpressionString, so that isinstance(s, VbaExpressionString) is True
    only for such results.
    (see detect_vba_strings)
    """
    # TODO: use Unicode everywhere instead of str
  660 +
  661 +
  662 +# --- NUMBER TOKENS ----------------------------------------------------------
  663 +
  664 +# 3.3.2 Number Tokens
  665 +# INTEGER = integer-literal ["%" / "&" / "^"]
  666 +# integer-literal = decimal-literal / octal-literal / hex-literal
  667 +# decimal-literal = 1*decimal-digit
  668 +# octal-literal = "&" [%x004F / %x006F] 1*octal-digit
  669 +# ; & or &o or &O
  670 +# hex-literal = "&" (%x0048 / %x0068) 1*hex-digit
  671 +# ; &h or &H
  672 +# octal-digit = "0" / "1" / "2" / "3" / "4" / "5" / "6" / "7"
  673 +# decimal-digit = octal-digit / "8" / "9"
  674 +# hex-digit = decimal-digit / %x0041-0046 / %x0061-0066 ;A-F / a-f
  675 +
# NOTE: here Combine() is required to avoid spaces between elements
# NOTE: here WordStart is necessary to avoid matching a number preceded by
#       letters or underscore (e.g. "VBT1" or "ABC_34"), when using scanString
# In all three literals the optional one-character type suffix (%, & or ^)
# is matched but suppressed, and the parse action converts the remaining
# digits to a Python int.
decimal_literal = Combine(WordStart(vba_identifier_chars) + Word(nums)
                          + Suppress(Optional(Word('%&^', exact=1))))
decimal_literal.setParseAction(lambda t: int(t[0]))

# octal: the '&' prefix and optional 'o'/'O' are suppressed, digits parsed in base 8
octal_literal = Combine(Suppress(Literal('&') + Optional((CaselessLiteral('o')))) + Word(srange('[0-7]'))
                        + Suppress(Optional(Word('%&^', exact=1))))
octal_literal.setParseAction(lambda t: int(t[0], base=8))

# hexadecimal: the '&h'/'&H' prefix is suppressed, digits parsed in base 16
hex_literal = Combine(Suppress(CaselessLiteral('&h')) + Word(srange('[0-9a-fA-F]'))
                      + Suppress(Optional(Word('%&^', exact=1))))
hex_literal.setParseAction(lambda t: int(t[0], base=16))

# any integer literal, in one of the three notations above
integer = decimal_literal | octal_literal | hex_literal
  692 +
  693 +
  694 +# --- QUOTED STRINGS ---------------------------------------------------------
  695 +
  696 +# 3.3.4 String Tokens
  697 +# STRING = double-quote *string-character (double-quote / line-continuation / LINE-END)
  698 +# double-quote = %x0022 ; "
  699 +# string-character = NO-LINE-CONTINUATION ((double-quote double-quote) termination-character)
  700 +
# double-quoted string; a doubled double-quote ("") inside it is the escape
# sequence for a literal double-quote. The parse action returns a plain str.
quoted_string = QuotedString('"', escQuote='""')
quoted_string.setParseAction(lambda t: str(t[0]))


#--- VBA Expressions ---------------------------------------------------------

# See MS-VBAL 5.6 Expressions

# need to pre-declare using Forward() because it is recursive
# VBA string expression and integer expression
# (both are given their actual definition further below, using "<<=")
vba_expr_str = Forward()
vba_expr_int = Forward()
  713 +
  714 +# --- CHR --------------------------------------------------------------------
  715 +
  716 +# MS-VBAL 6.1.2.11.1.4 Chr / Chr$
  717 +# Function Chr(CharCode As Long) As Variant
  718 +# Function Chr$(CharCode As Long) As String
  719 +# Parameter Description
  720 +# CharCode Long whose value is a code point.
  721 +# Returns a String data value consisting of a single character containing the character whose code
  722 +# point is the data value of the argument.
  723 +# - If the argument is not in the range 0 to 255, Error Number 5 ("Invalid procedure call or
  724 +# argument") is raised unless the implementation supports a character set with a larger code point
  725 +# range.
  726 +# - If the argument value is in the range of 0 to 127, it is interpreted as a 7-bit ASCII code point.
  727 +# - If the argument value is in the range of 128 to 255, the code point interpretation of the value is
  728 +# implementation defined.
  729 +# - Chr$ has the same runtime semantics as Chr, however the declared type of its function result is
  730 +# String rather than Variant.
  731 +
  732 +# 6.1.2.11.1.5 ChrB / ChrB$
  733 +# Function ChrB(CharCode As Long) As Variant
  734 +# Function ChrB$(CharCode As Long) As String
  735 +# CharCode Long whose value is a code point.
  736 +# Returns a String data value consisting of a single byte character whose code point value is the
  737 +# data value of the argument.
  738 +# - If the argument is not in the range 0 to 255, Error Number 6 ("Overflow") is raised.
  739 +# - ChrB$ has the same runtime semantics as ChrB however the declared type of its function result
  740 +# is String rather than Variant.
  741 +# - Note: the ChrB function is used with byte data contained in a String. Instead of returning a
  742 +# character, which may be one or two bytes, ChrB always returns a single byte. The ChrW function
  743 +# returns a String containing the Unicode character except on platforms where Unicode is not
  744 +# supported, in which case, the behavior is identical to the Chr function.
  745 +
  746 +# 6.1.2.11.1.6 ChrW/ ChrW$
  747 +# Function ChrW(CharCode As Long) As Variant
  748 +# Function ChrW$(CharCode As Long) As String
  749 +# CharCode Long whose value is a code point.
  750 +# Returns a String data value consisting of a single character containing the character whose code
  751 +# point is the data value of the argument.
  752 +# - If the argument is not in the range -32,767 to 65,535 then Error Number 5 ("Invalid procedure
  753 +# call or argument") is raised.
  754 +# - If the argument is a negative value it is treated as if it was the value: CharCode + 65,536.
  755 +# - If the implemented uses 16-bit Unicode code points argument, data value is interpreted as a 16-
  756 +# bit Unicode code point.
  757 +# - If the implementation does not support Unicode, ChrW has the same semantics as Chr.
  758 +# - ChrW$ has the same runtime semantics as ChrW, however the declared type of its function result
  759 +# is String rather than Variant.
  760 +
# Chr, Chr$, ChrB, ChrW(int) => char
# The function name (with its optional B/W variant and optional '$' suffix),
# the opening and the closing parentheses are all suppressed, so the only
# token left is the value of the integer expression given as argument.
vba_chr = Suppress(
    Combine(WordStart(vba_identifier_chars) + CaselessLiteral('Chr')
            + Optional(CaselessLiteral('B') | CaselessLiteral('W')) + Optional('$'))
    + '(') + vba_expr_int + Suppress(')')
  766 +
def vba_chr_tostr(t):
    """
    parse action for vba_chr: convert the integer argument of Chr/ChrB/ChrW
    to the corresponding character.

    :param t: parse results, t[0] is the integer code point
    :return: VbaExpressionString containing the character. If the code point
        cannot be converted, the error is logged and the original expression
        is returned as text, i.e. 'Chr(<value>)'.
    """
    # read the argument before the try block, so that it is always bound when
    # the exception handler formats its messages:
    i = t[0]
    try:
        # normal, non-unicode character:
        if 0 <= i <= 255:
            return VbaExpressionString(chr(i))
        try:
            # Python 2: build a unicode character, then encode it as UTF-8
            wide_char = unichr(i)
        except NameError:
            # Python 3: unichr does not exist, chr covers all code points
            return VbaExpressionString(chr(i))
        return VbaExpressionString(wide_char.encode('utf-8', 'backslashreplace'))
    except (ValueError, OverflowError):
        # ValueError: code point out of range (including negative values)
        # OverflowError: integer too large to be converted at all
        log.exception('ERROR: incorrect parameter value for chr(): %r' % i)
        return VbaExpressionString('Chr(%r)' % i)

vba_chr.setParseAction(vba_chr_tostr)
  780 +
  781 +
  782 +# --- ASC --------------------------------------------------------------------
  783 +
# Asc(string) => int
#TODO: see MS-VBAL 6.1.2.11.1.1 page 240 => AscB, AscW
vba_asc = Suppress(CaselessKeyword('Asc') + '(') + vba_expr_str + Suppress(')')
# VBA's Asc returns the character code of the FIRST character of its
# argument, so only that character is passed to ord(): calling ord() on a
# string longer than one character raises a TypeError.
vba_asc.setParseAction(lambda t: ord(t[0][0]))
  788 +
  789 +
  790 +# --- VAL --------------------------------------------------------------------
  791 +
# Val(string) => int
# TODO: make sure the behavior of VBA's val is fully covered
# (here the argument is only stripped of surrounding whitespace and passed
# to int())
vba_val = Suppress(CaselessKeyword('Val') + '(') + vba_expr_str + Suppress(')')
vba_val.setParseAction(lambda t: int(t[0].strip()))


# --- StrReverse() --------------------------------------------------------------------

# StrReverse(string) => string
# (the parse action returns the argument with its characters in reverse order)
strReverse = Suppress(CaselessKeyword('StrReverse') + '(') + vba_expr_str + Suppress(')')
strReverse.setParseAction(lambda t: VbaExpressionString(str(t[0])[::-1]))


# --- ENVIRON() --------------------------------------------------------------------

# Environ("name") => just translated to "%name%", that is enough for malware analysis
environ = Suppress(CaselessKeyword('Environ') + '(') + vba_expr_str + Suppress(')')
environ.setParseAction(lambda t: VbaExpressionString('%%%s%%' % t[0]))
  810 +
  811 +
  812 +# --- IDENTIFIER -------------------------------------------------------------
  813 +
  814 +#TODO: see MS-VBAL 3.3.5 page 33
  815 +# 3.3.5 Identifier Tokens
  816 +# Latin-identifier = first-Latin-identifier-character *subsequent-Latin-identifier-character
  817 +# first-Latin-identifier-character = (%x0041-005A / %x0061-007A) ; A-Z / a-z
  818 +# subsequent-Latin-identifier-character = first-Latin-identifier-character / DIGIT / %x5F ; underscore
# identifier: one letter, followed by any number of letters, digits or underscores
latin_identifier = Word(initChars=alphas, bodyChars=alphanums + '_')

# --- HEX FUNCTION -----------------------------------------------------------

# match any custom function name with a hex string as argument:
# TODO: accept vba_expr_str_item as argument, check if it is a hex or base64 string at runtime

# quoted string of at least two hexadecimal numbers of two digits:
# (the surrounding double quotes are suppressed)
quoted_hex_string = Suppress('"') + Combine(Word(hexnums, exact=2) * (2, None)) + Suppress('"')
quoted_hex_string.setParseAction(lambda t: str(t[0]))

# any_identifier("<hex string>"): the function name and the parentheses are
# suppressed, and the parse action returns the hex string decoded with
# binascii.a2b_hex
hex_function_call = Suppress(latin_identifier) + Suppress('(') + \
                    quoted_hex_string('hex_string') + Suppress(')')
hex_function_call.setParseAction(lambda t: VbaExpressionString(binascii.a2b_hex(t.hex_string)))
  833 +
  834 +
  835 +# --- BASE64 FUNCTION -----------------------------------------------------------
  836 +
# match any custom function name with a Base64 string as argument:
# TODO: accept vba_expr_str_item as argument, check if it is a hex or base64 string at runtime

# quoted string matching the Base64 pattern BASE64_RE:
# (the surrounding double quotes are suppressed)
quoted_base64_string = Suppress('"') + Regex(BASE64_RE) + Suppress('"')
quoted_base64_string.setParseAction(lambda t: str(t[0]))

# any_identifier("<base64 string>"): the function name and the parentheses
# are suppressed, and the parse action returns the string decoded with
# binascii.a2b_base64
base64_function_call = Suppress(latin_identifier) + Suppress('(') + \
                       quoted_base64_string('base64_string') + Suppress(')')
base64_function_call.setParseAction(lambda t: VbaExpressionString(binascii.a2b_base64(t.base64_string)))
  847 +
  848 +
  849 +# ---STRING EXPRESSION -------------------------------------------------------
  850 +
def concat_strings_list(tokens):
    """
    parse action joining the operands of a VBA string concatenation
    (operators '+' or '&') into a single string.

    :param tokens: parse results; tokens[0] is expected to be a sequence
        alternating operands and operators, such as [a,'&',b,'&',c,...]
    :return: VbaExpressionString, all the operands concatenated in order
    """
    operands_and_operators = tokens[0]
    # operands are at the even positions, operators at the odd ones:
    operands = operands_and_operators[::2]
    joined = ''.join(operands)
    return VbaExpressionString(joined)
  859 +
  860 +
# basic items of a string expression: any of the string-producing elements
# defined above
vba_expr_str_item = (vba_chr | strReverse | environ | quoted_string | hex_function_call | base64_function_call)

# actual definition of the string expression pre-declared with Forward():
# items combined with the binary operators '+' or '&', both evaluated by
# concat_strings_list
vba_expr_str <<= infixNotation(vba_expr_str_item,
                               [
                                   ("+", 2, opAssoc.LEFT, concat_strings_list),
                                   ("&", 2, opAssoc.LEFT, concat_strings_list),
                               ])
  868 +
  869 +
  870 +# --- INTEGER EXPRESSION -------------------------------------------------------
  871 +
def sum_ints_list(tokens):
    """
    parse action adding up the operands of a VBA integer expression using
    the operator '+'.

    :param tokens: parse results; tokens[0] is expected to be a sequence
        alternating integers and operators, such as [a,'+',b,'+',c,...]
    :return: the sum of all the integers
    """
    total = 0
    # operands are at the even positions, operators at the odd ones:
    for operand in tokens[0][::2]:
        total += operand
    return total
  880 +
  881 +
def subtract_ints_list(tokens):
    """
    parse action to subtract integers in a VBA expression with operator '-'

    :param tokens: parse results; tokens[0] is expected to be a sequence
        alternating integers and operators, such as [a,'-',b,'-',c,...]
    :return: int, result of the left-to-right subtraction ((a-b)-c)...
    """
    # reduce is a builtin only in Python 2; functools.reduce exists in both
    # Python 2.6+ and Python 3:
    from functools import reduce
    # operands are at the even positions, operators at the odd ones:
    integers = tokens[0][::2]
    return reduce(lambda x, y: x - y, integers)
  890 +
  891 +
def multiply_ints_list(tokens):
    """
    parse action to multiply integers in a VBA expression with operator '*'

    :param tokens: parse results; tokens[0] is expected to be a sequence
        alternating integers and operators, such as [a,'*',b,'*',c,...]
    :return: int, product of all the integers
    """
    # reduce is a builtin only in Python 2; functools.reduce exists in both
    # Python 2.6+ and Python 3:
    from functools import reduce
    # operands are at the even positions, operators at the odd ones:
    integers = tokens[0][::2]
    return reduce(lambda x, y: x * y, integers)
  900 +
  901 +
def divide_ints_list(tokens):
    """
    parse action to divide integers in a VBA expression with operator '/'

    :param tokens: parse results; tokens[0] is expected to be a sequence
        alternating integers and operators, such as [a,'/',b,'/',c,...]
    :return: int, result of the left-to-right division ((a/b)/c)...
    :raise ZeroDivisionError: if one of the divisors is zero
    """
    # reduce is a builtin only in Python 2; functools.reduce exists in both
    # Python 2.6+ and Python 3:
    from functools import reduce
    # operands are at the even positions, operators at the odd ones:
    integers = tokens[0][::2]
    # Floor division gives the same integer result on Python 2 and 3: with
    # '/', Python 3 would return a float, which cannot be used afterwards as
    # a character code.
    return reduce(lambda x, y: x // y, integers)
  910 +
  911 +
# basic items of an integer expression
vba_expr_int_item = (vba_asc | vba_val | integer)


def _multiply_divide_ints_list(tokens):
    """
    parse action to evaluate a chain mixing the operators '*' and '/',
    strictly from left to right.

    :param tokens: parse results; tokens[0] is expected to be a sequence
        alternating integers and operators, such as [a,'*',b,'/',c,...]
    :return: result of ((a*b)/c)...
    """
    chain = tokens[0]
    result = chain[0]
    # walk through the (operator, operand) pairs following the first operand,
    # delegating each step to the single-operator parse actions so that the
    # arithmetic stays defined in one place:
    for operator, operand in zip(chain[1::2], chain[2::2]):
        if operator == '*':
            result = multiply_ints_list([[result, '*', operand]])
        else:
            result = divide_ints_list([[result, '/', operand]])
    return result


# operators associativity:
# https://en.wikipedia.org/wiki/Operator_associativity

# infixNotation takes one entry per precedence level, from the highest to the
# lowest. '*' and '/' have the same precedence, so they must share a single
# level: with one level each, "100 / 5 * 2" was evaluated as 100 / (5 * 2).
# '-' and '+' may stay on separate levels, since grouping the subtractions
# first does not change the result of a mixed chain.
vba_expr_int <<= infixNotation(vba_expr_int_item,
                               [
                                   (Literal("*") | Literal("/"), 2, opAssoc.LEFT, _multiply_divide_ints_list),
                                   ("-", 2, opAssoc.LEFT, subtract_ints_list),
                                   ("+", 2, opAssoc.LEFT, sum_ints_list),
                               ])
  924 +
  925 +
  926 +# see detect_vba_strings for the deobfuscation code using this grammar
  927 +
  928 +# === MSO/ActiveMime files parsing ===========================================
  929 +
def is_mso_file(data):
    """
    Tell whether the provided data looks like the content of a
    MSO/ActiveMime file. Such files are created by Outlook in some cases, or
    by Word/Excel when saving a file with the MHTML format or the Word 2003
    XML format.
    Only the ActiveMime magic at the very beginning of data is checked.

    :param data: bytes string, MSO/ActiveMime file content
    :return: bool, True if the file is MSO, False otherwise
    """
    magic = MSO_ACTIVEMIME_HEADER
    return data.startswith(magic)
  940 +
  941 +
# regex to find zlib block headers, starting with byte 0x78 = 'x'
# It is applied to the raw content of MSO files (see mso_file_extract), so it
# has to be a bytes pattern: on Python 3 a str pattern cannot be used on
# bytes data. On Python 2, b'x' is the same object type as 'x'.
re_zlib_header = re.compile(b'x')
  944 +
  945 +
def mso_file_extract(data):
    """
    Extract and decompress the data stored into a MSO/ActiveMime file. Such
    files are created by Outlook in some cases, or by Word/Excel when saving
    a file with the MHTML format or the Word 2003 XML format.

    Decompression is attempted first at the offset read from the header, then
    at two known offsets, and finally at every position where a byte 0x78 is
    found.

    :param data: bytes string, MSO/ActiveMime file content
    :return: bytes string, extracted data (uncompressed)

    raise a MsoExtractionError if the data cannot be extracted
    """
    # check the magic:
    assert is_mso_file(data)

    # The compressed data offset is read from the header: according to the
    # author's tests, it should be an unsigned 16 bits integer at offset 0x1E
    # (little endian), to which 46 must be added.
    try:
        header_offset = struct.unpack_from('<H', data, offset=0x1E)[0] + 46
        log.debug('Parsing MSO file: data offset = 0x%X' % header_offset)
    except struct.error as exc:
        log.info('Unable to parse MSO/ActiveMime file header (%s)' % exc)
        log.debug('Trace:', exc_info=True)
        raise MsoExtractionError('Unable to parse MSO/ActiveMime file header')

    # Candidate offsets, the one from the header first. In all the samples
    # seen so far, Word always uses an offset of 0x32, and Excel 0x22A.
    candidate_offsets = [header_offset, 0x32, 0x22A]
    for candidate in candidate_offsets:
        try:
            log.debug('Attempting zlib decompression from MSO file offset 0x%X' % candidate)
            return zlib.decompress(data[candidate:])
        except zlib.error as exc:
            log.info('zlib decompression failed for offset %s (%s)'
                     % (candidate, exc))
            log.debug('Trace:', exc_info=True)

    # None of the guessed offsets worked, let's try brute-forcing by looking
    # for potential zlib-compressed blocks starting with 0x78:
    log.debug('Looking for potential zlib-compressed blocks in MSO file')
    for candidate in (found.start() for found in re_zlib_header.finditer(data)):
        try:
            log.debug('Attempting zlib decompression from MSO file offset 0x%X' % candidate)
            return zlib.decompress(data[candidate:])
        except zlib.error as exc:
            log.info('zlib decompression failed (%s)' % exc)
            log.debug('Trace:', exc_info=True)

    # every attempt failed:
    raise MsoExtractionError('Unable to decompress data from a MSO/ActiveMime file')
  999 +
  1000 +
  1001 +#--- FUNCTIONS ----------------------------------------------------------------
  1002 +
# all the characters considered printable by is_printable, built only once
_PRINTABLE_SET = set(string.printable)

def is_printable(s):
    """
    Tell whether string s is made only of printable ASCII characters, i.e.
    characters contained in string.printable.
    This is similar to Python 3's str.isprintable, for Python 2.x.

    :param s: str
    :return: bool, True if s has no character outside of string.printable
    """
    # inspired from http://stackoverflow.com/questions/3636928/test-if-a-python-string-is-printable
    # collect the characters of s which are not printable; s is printable
    # when there are none:
    non_printable = set(s).difference(_PRINTABLE_SET)
    return not non_printable
  1017 +
  1018 +
def copytoken_help(decompressed_current, decompressed_chunk_start):
    """
    compute bit masks to decode a CopyToken according to MS-OVBA 2.4.1.3.19.1 CopyToken Help

    decompressed_current: number of decompressed bytes so far, i.e. len(decompressed_container)
    decompressed_chunk_start: offset of the current chunk in the decompressed container
    return length_mask, offset_mask, bit_count, maximum_length
    raise ValueError if decompressed_current is not greater than decompressed_chunk_start
    """
    difference = decompressed_current - decompressed_chunk_start
    if difference < 1:
        # nothing has been decompressed yet in this chunk, so there is
        # nothing a CopyToken could refer to
        raise ValueError('invalid CopyToken: no decompressed data in the current chunk')
    # bit_count = ceil(log2(difference)), with a minimum of 4.
    # For any integer n >= 1, ceil(log2(n)) == (n - 1).bit_length(): this is
    # exact, whereas the floating point logarithm may be rounded the wrong
    # way for exact powers of two.
    bit_count = max((difference - 1).bit_length(), 4)
    # a CopyToken is 16 bits: the high bit_count bits store the offset, the
    # remaining low bits store the length
    length_mask = 0xFFFF >> bit_count
    offset_mask = ~length_mask
    maximum_length = length_mask + 3
    return length_mask, offset_mask, bit_count, maximum_length
  1034 +
  1035 +
def decompress_stream(compressed_container):
    """
    Decompress a stream according to MS-OVBA section 2.4.1

    compressed_container: bytes string compressed according to the MS-OVBA 2.4.1.3.6 Compression algorithm
    return the decompressed container as a bytes string
    raise ValueError if the signature byte, a chunk signature or a chunk size is invalid
    """
    # 2.4.1.2 State Variables

    # The following state is maintained for the CompressedContainer (section 2.4.1.1.1):
    # CompressedRecordEnd: The location of the byte after the last byte in the CompressedContainer (section 2.4.1.1.1).
    # CompressedCurrent: The location of the next byte in the CompressedContainer (section 2.4.1.1.1) to be read by
    #                    decompression or to be written by compression.

    # The following state is maintained for the current CompressedChunk (section 2.4.1.1.4):
    # CompressedChunkStart: The location of the first byte of the CompressedChunk (section 2.4.1.1.4) within the
    #                       CompressedContainer (section 2.4.1.1.1).

    # The following state is maintained for a DecompressedBuffer (section 2.4.1.1.2):
    # DecompressedCurrent: The location of the next byte in the DecompressedBuffer (section 2.4.1.1.2) to be written by
    #                      decompression or to be read by compression.
    # DecompressedBufferEnd: The location of the byte after the last byte in the DecompressedBuffer (section 2.4.1.1.2).

    # The following state is maintained for the current DecompressedChunk (section 2.4.1.1.3):
    # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the
    #                         DecompressedBuffer (section 2.4.1.1.2).

    # The result is accumulated in a mutable bytearray: appending to an
    # immutable bytes object copies the whole buffer for every single byte.
    decompressed_container = bytearray()
    compressed_current = 0
    container_length = len(compressed_container)

    sig_byte = compressed_container[compressed_current]
    if sig_byte != 0x01:
        raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))

    compressed_current += 1

    #NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that
    # CompressedRecordEnd = len(compressed_container)
    while compressed_current < container_length:
        # 2.4.1.1.5
        compressed_chunk_start = compressed_current
        # chunk header = first 16 bits
        compressed_chunk_header = \
            struct.unpack("<H", compressed_container[compressed_chunk_start:compressed_chunk_start + 2])[0]
        # chunk size = 12 first bits of header + 3
        chunk_size = (compressed_chunk_header & 0x0FFF) + 3
        # chunk signature = 3 next bits - should always be 0b011
        chunk_signature = (compressed_chunk_header >> 12) & 0x07
        if chunk_signature != 0b011:
            raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream')
        # chunk flag = next bit - 1 == compressed, 0 == uncompressed
        chunk_flag = (compressed_chunk_header >> 15) & 0x01
        log.debug("chunk size = {0}, compressed flag = {1}".format(chunk_size, chunk_flag))

        #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096)
        # The minimum size is 3 bytes
        # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value
        # in chunk header before adding 3).
        # Also the first test is not useful since a 12 bits value cannot be larger than 4095.
        if chunk_flag == 1 and chunk_size > 4098:
            raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1')
        if chunk_flag == 0 and chunk_size != 4098:
            raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0')

        # check if chunk_size goes beyond the compressed data, instead of silently cutting it:
        #TODO: raise an exception?
        if compressed_chunk_start + chunk_size > container_length:
            log.warning('Chunk size is larger than remaining compressed data')
        compressed_end = min(container_length, compressed_chunk_start + chunk_size)
        # read after chunk header:
        compressed_current = compressed_chunk_start + 2

        if chunk_flag == 0:
            # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
            # uncompressed chunk: read the next 4096 bytes as-is
            #TODO: check if there are at least 4096 bytes left
            # NOTE: the slice is already a bytes string and is appended
            # directly; wrapping it as bytes([slice]) raises a TypeError.
            decompressed_container += compressed_container[compressed_current:compressed_current + 4096]
            compressed_current += 4096
        else:
            # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
            # compressed chunk
            decompressed_chunk_start = len(decompressed_container)
            while compressed_current < compressed_end:
                # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence
                # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
                # copy tokens (reference to a previous literal token)
                flag_byte = compressed_container[compressed_current]
                compressed_current += 1
                for bit_index in range(0, 8):
                    if compressed_current >= compressed_end:
                        break
                    # MS-OVBA 2.4.1.3.5 Decompressing a Token
                    # MS-OVBA 2.4.1.3.17 Extract FlagBit
                    flag_bit = (flag_byte >> bit_index) & 1
                    if flag_bit == 0:  # LiteralToken
                        # copy one byte directly to output
                        decompressed_container.append(compressed_container[compressed_current])
                        compressed_current += 1
                    else:  # CopyToken
                        # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
                        copy_token = \
                            struct.unpack("<H", compressed_container[compressed_current:compressed_current + 2])[0]
                        #TODO: check this
                        length_mask, offset_mask, bit_count, _ = copytoken_help(
                            len(decompressed_container), decompressed_chunk_start)
                        length = (copy_token & length_mask) + 3
                        temp1 = copy_token & offset_mask
                        temp2 = 16 - bit_count
                        offset = (temp1 >> temp2) + 1
                        copy_source = len(decompressed_container) - offset
                        # copy byte by byte: the copied range may overlap the
                        # bytes being appended (when length > offset), so a
                        # single slice copy would not be equivalent
                        for index in range(copy_source, copy_source + length):
                            decompressed_container.append(decompressed_container[index])
                        compressed_current += 2
    return bytes(decompressed_container)
  1154 +
  1155 +
def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
    """
    Extract VBA macros from an OleFileIO object.
    Internal function, do not call directly.

    This is a generator, yielding (stream path, VBA filename, VBA source code)
    for each VBA code stream. The source code is yielded exactly as returned
    by decompress_stream (it is not decoded here).

    :param ole: object providing openstream(path); only that method is used
    :param vba_root: path to the VBA root storage, containing the VBA storage
        and the PROJECT stream. It is concatenated directly with 'VBA/' to
        build the code stream paths.
    :param project_path: path to the PROJECT stream
    :param dir_path: path to the (compressed) dir stream
    :param relaxed: If True, only create info/debug log entry if data is not as expected
        (e.g. opening substream fails); if False, raise an error in this case
    :raises UnexpectedDataError: if a field of the dir stream does not have
        the expected value (only when relaxed is False, except for an invalid
        REFERENCENAME reserved field or an unknown reference record, which
        always raise because parsing cannot continue reliably)
    :raises SubstreamOpenError: if the code stream of a module cannot be
        opened and relaxed is False
    """
    # Open the PROJECT stream:
    project = ole.openstream(project_path)
    log.debug('relaxed is %s' % relaxed)

    # sample content of the PROJECT stream:

    ##    ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}"
    ##    Document=ThisDocument/&H00000000
    ##    Module=NewMacros
    ##    Name="Project"
    ##    HelpContextID="0"
    ##    VersionCompatible32="393222000"
    ##    CMG="F1F301E705E705E705E705"
    ##    DPB="8F8D7FE3831F2020202020"
    ##    GC="2D2FDD81E51EE61EE6E1"
    ##
    ##    [Host Extender Info]
    ##    &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000
    ##    &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000
    ##
    ##    [Workspace]
    ##    ThisDocument=22, 29, 339, 477, Z
    ##    NewMacros=-4, 42, 832, 510, C

    # maps lowercase module name -> file extension to use for that module
    code_modules = {}

    for line in project:
        line = line.strip().decode('utf-8', 'ignore')
        if '=' in line:
            # split line at the 1st equal sign:
            name, module_key = line.split('=', 1)
            # The value is converted to lowercase, to allow case-insensitive matching (issue #3)
            module_key = module_key.lower()
            if name == 'Document':
                # split value at the 1st slash, keep 1st part:
                module_key = module_key.split('/', 1)[0]
                code_modules[module_key] = CLASS_EXTENSION
            elif name == 'Module':
                code_modules[module_key] = MODULE_EXTENSION
            elif name == 'Class':
                code_modules[module_key] = CLASS_EXTENSION
            elif name == 'BaseClass':
                code_modules[module_key] = FORM_EXTENSION

    # read data from dir stream (compressed)
    dir_compressed = ole.openstream(dir_path).read()

    def check_value(name, expected, value):
        # Report a field that does not hold its expected constant: logged in
        # relaxed mode, raised otherwise.
        if expected != value:
            if relaxed:
                log.error("invalid value for {0} expected {1:04X} got {2:04X}"
                          .format(name, expected, value))
            else:
                raise UnexpectedDataError(dir_path, name, expected, value)

    dir_stream = BytesIO(decompress_stream(dir_compressed))

    def read_word():
        # Read one little-endian unsigned 16-bit integer from the dir stream.
        return struct.unpack("<H", dir_stream.read(2))[0]

    def read_dword():
        # Read one little-endian unsigned 32-bit integer from the dir stream.
        return struct.unpack("<L", dir_stream.read(4))[0]

    # PROJECTSYSKIND Record
    check_value('PROJECTSYSKIND_Id', 0x0001, read_word())
    check_value('PROJECTSYSKIND_Size', 0x0004, read_dword())
    projectsyskind_syskind = read_dword()
    syskind_names = {
        0x00: "16-bit Windows",
        0x01: "32-bit Windows",
        0x02: "Macintosh",
        0x03: "64-bit Windows",
    }
    if projectsyskind_syskind in syskind_names:
        log.debug(syskind_names[projectsyskind_syskind])
    else:
        log.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(projectsyskind_syskind))

    # PROJECTLCID Record
    check_value('PROJECTLCID_Id', 0x0002, read_word())
    check_value('PROJECTLCID_Size', 0x0004, read_dword())
    check_value('PROJECTLCID_Lcid', 0x409, read_dword())

    # PROJECTLCIDINVOKE Record
    check_value('PROJECTLCIDINVOKE_Id', 0x0014, read_word())
    check_value('PROJECTLCIDINVOKE_Size', 0x0004, read_dword())
    check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, read_dword())

    # PROJECTCODEPAGE Record
    check_value('PROJECTCODEPAGE_Id', 0x0003, read_word())
    check_value('PROJECTCODEPAGE_Size', 0x0002, read_dword())
    projectcodepage_codepage = read_word()

    # PROJECTNAME Record
    check_value('PROJECTNAME_Id', 0x0004, read_word())
    projectname_sizeof_projectname = read_dword()
    if projectname_sizeof_projectname < 1 or projectname_sizeof_projectname > 128:
        log.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(projectname_sizeof_projectname))
    dir_stream.read(projectname_sizeof_projectname)  # ProjectName (not used)

    # PROJECTDOCSTRING Record
    check_value('PROJECTDOCSTRING_Id', 0x0005, read_word())
    projectdocstring_sizeof_docstring = read_dword()
    if projectdocstring_sizeof_docstring > 2000:
        log.error(
            "PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(projectdocstring_sizeof_docstring))
    dir_stream.read(projectdocstring_sizeof_docstring)  # DocString (not used)
    check_value('PROJECTDOCSTRING_Reserved', 0x0040, read_word())
    projectdocstring_sizeof_docstring_unicode = read_dword()
    if projectdocstring_sizeof_docstring_unicode % 2 != 0:
        log.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even")
    dir_stream.read(projectdocstring_sizeof_docstring_unicode)  # DocStringUnicode (not used)

    # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7
    check_value('PROJECTHELPFILEPATH_Id', 0x0006, read_word())
    projecthelpfilepath_sizeof_helpfile1 = read_dword()
    if projecthelpfilepath_sizeof_helpfile1 > 260:
        log.error(
            "PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(projecthelpfilepath_sizeof_helpfile1))
    projecthelpfilepath_helpfile1 = dir_stream.read(projecthelpfilepath_sizeof_helpfile1)
    check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, read_word())
    projecthelpfilepath_sizeof_helpfile2 = read_dword()
    if projecthelpfilepath_sizeof_helpfile2 != projecthelpfilepath_sizeof_helpfile1:
        log.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2")
    projecthelpfilepath_helpfile2 = dir_stream.read(projecthelpfilepath_sizeof_helpfile2)
    if projecthelpfilepath_helpfile2 != projecthelpfilepath_helpfile1:
        log.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2")

    # PROJECTHELPCONTEXT Record
    check_value('PROJECTHELPCONTEXT_Id', 0x0007, read_word())
    check_value('PROJECTHELPCONTEXT_Size', 0x0004, read_dword())
    read_dword()  # HelpContext (not used)

    # PROJECTLIBFLAGS Record
    check_value('PROJECTLIBFLAGS_Id', 0x0008, read_word())
    check_value('PROJECTLIBFLAGS_Size', 0x0004, read_dword())
    check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, read_dword())

    # PROJECTVERSION Record
    check_value('PROJECTVERSION_Id', 0x0009, read_word())
    check_value('PROJECTVERSION_Reserved', 0x0004, read_dword())
    read_dword()  # VersionMajor (not used)
    read_word()  # VersionMinor (not used)

    # PROJECTCONSTANTS Record
    check_value('PROJECTCONSTANTS_Id', 0x000C, read_word())
    projectconstants_sizeof_constants = read_dword()
    if projectconstants_sizeof_constants > 1015:
        log.error(
            "PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(projectconstants_sizeof_constants))
    dir_stream.read(projectconstants_sizeof_constants)  # Constants (not used)
    check_value('PROJECTCONSTANTS_Reserved', 0x003C, read_word())
    projectconstants_sizeof_constants_unicode = read_dword()
    if projectconstants_sizeof_constants_unicode % 2 != 0:
        log.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even")
    dir_stream.read(projectconstants_sizeof_constants_unicode)  # ConstantsUnicode (not used)

    # array of REFERENCE records, terminated by the PROJECTMODULES id (0x000F)
    while True:
        check = read_word()
        log.debug("reference type = {0:04X}".format(check))
        if check == 0x000F:
            break

        if check == 0x0016:
            # REFERENCENAME
            dir_stream.read(read_dword())  # Name (not used)
            reference_reserved = read_word()
            if reference_reserved not in (0x003E, 0x000D):
                # report the value actually read from the dir stream
                raise UnexpectedDataError(dir_path, 'REFERENCE_Reserved',
                                          (0x003E, 0x000D), reference_reserved)
            dir_stream.read(read_dword())  # NameUnicode (not used)

        elif check == 0x0033:
            # REFERENCEORIGINAL (followed by REFERENCECONTROL)
            dir_stream.read(read_dword())  # LibidOriginal (not used)

        elif check == 0x002F:
            # REFERENCECONTROL
            read_dword()  # SizeTwiddled (ignored)
            dir_stream.read(read_dword())  # LibidTwiddled (not used)
            check_value('REFERENCECONTROL_Reserved1', 0x0000, read_dword())
            check_value('REFERENCECONTROL_Reserved2', 0x0000, read_word())
            # optional field: NameRecordExtended, recognised by its id 0x0016;
            # otherwise the word just read is already Reserved3
            check2 = read_word()
            if check2 == 0x0016:
                dir_stream.read(read_dword())  # Name (not used)
                check_value('REFERENCECONTROL_NameRecordExtended_Reserved', 0x003E,
                            read_word())
                dir_stream.read(read_dword())  # NameUnicode (not used)
                referencecontrol_reserved3 = read_word()
            else:
                referencecontrol_reserved3 = check2

            check_value('REFERENCECONTROL_Reserved3', 0x0030, referencecontrol_reserved3)
            read_dword()  # SizeExtended (not used)
            dir_stream.read(read_dword())  # LibidExtended (not used)
            read_dword()  # Reserved4 (not used)
            read_word()  # Reserved5 (not used)
            dir_stream.read(16)  # OriginalTypeLib (not used)
            read_dword()  # Cookie (not used)

        elif check == 0x000D:
            # REFERENCEREGISTERED
            read_dword()  # Size (not used)
            dir_stream.read(read_dword())  # Libid (not used)
            check_value('REFERENCEREGISTERED_Reserved1', 0x0000, read_dword())
            check_value('REFERENCEREGISTERED_Reserved2', 0x0000, read_word())

        elif check == 0x000E:
            # REFERENCEPROJECT
            read_dword()  # Size (not used)
            dir_stream.read(read_dword())  # LibidAbsolute (not used)
            dir_stream.read(read_dword())  # LibidRelative (not used)
            read_dword()  # MajorVersion (not used)
            read_word()  # MinorVersion (not used)

        else:
            # The record layout is unknown, so the rest of the stream cannot
            # be parsed. Raise instead of terminating the whole interpreter.
            log.error('invalid or unknown check Id {0:04X}'.format(check))
            raise UnexpectedDataError(dir_path, 'reference type',
                                      (0x000F, 0x0016, 0x0033, 0x002F, 0x000D, 0x000E),
                                      check)

    # PROJECTMODULES Record (its id 0x000F was consumed by the loop above)
    check_value('PROJECTMODULES_Size', 0x0002, read_dword())
    projectmodules_count = read_word()
    check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, read_word())
    check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, read_dword())
    read_word()  # ProjectCookieRecord cookie (not used)

    # codec used to decode the byte strings of the dir stream
    log.debug('Project CodePage = %d' % projectcodepage_codepage)
    if projectcodepage_codepage in MAC_CODEPAGES:
        vba_codec = MAC_CODEPAGES[projectcodepage_codepage]
    else:
        vba_codec = 'cp%d' % projectcodepage_codepage

    log.debug("parsing {0} modules".format(projectmodules_count))
    for projectmodule_index in range(0, projectmodules_count):
        try:
            check_value('MODULENAME_Id', 0x0019, read_word())
            modulename_modulename = dir_stream.read(read_dword())
            # Preset the values filled in by optional sections, so that a
            # module lacking a section never reuses data of the previous one.
            modulename_unicode_modulename_unicode = ''
            modulestreamname_streamname = None
            modulestreamname_streamname_unicode = ''
            moduleoffset_textoffset = None
            # account for optional sections
            section_id = read_word()
            if section_id == 0x0047:
                # MODULENAMEUNICODE
                modulename_unicode_modulename_unicode = dir_stream.read(
                    read_dword()).decode('UTF-16LE', 'replace')
                # just guessing that this is the same encoding as used in OleFileIO
                section_id = read_word()
            if section_id == 0x001A:
                # MODULESTREAMNAME
                modulestreamname_streamname = dir_stream.read(read_dword())
                check_value('MODULESTREAMNAME_Reserved', 0x0032, read_word())
                modulestreamname_streamname_unicode = dir_stream.read(
                    read_dword()).decode('UTF-16LE', 'replace')
                # just guessing that this is the same encoding as used in OleFileIO
                section_id = read_word()
            if section_id == 0x001C:
                # MODULEDOCSTRING
                dir_stream.read(read_dword())  # DocString (not used)
                check_value('MODULEDOCSTRING_Reserved', 0x0048, read_word())
                dir_stream.read(read_dword())  # DocStringUnicode (not used)
                section_id = read_word()
            if section_id == 0x0031:
                # MODULEOFFSET
                check_value('MODULEOFFSET_Size', 0x0004, read_dword())
                moduleoffset_textoffset = read_dword()
                section_id = read_word()
            if section_id == 0x001E:
                # MODULEHELPCONTEXT
                check_value('MODULEHELPCONTEXT_Size', 0x0004, read_dword())
                read_dword()  # HelpContext (not used)
                section_id = read_word()
            if section_id == 0x002C:
                # MODULECOOKIE
                check_value('MODULECOOKIE_Size', 0x0002, read_dword())
                read_word()  # Cookie (not used)
                section_id = read_word()
            if section_id == 0x0021 or section_id == 0x0022:
                # MODULETYPE
                read_dword()  # Reserved (not used)
                section_id = read_word()
            if section_id == 0x0025:
                # MODULEREADONLY
                check_value('MODULEREADONLY_Reserved', 0x0000, read_dword())
                section_id = read_word()
            if section_id == 0x0028:
                # MODULEPRIVATE
                check_value('MODULEPRIVATE_Reserved', 0x0000, read_dword())
                section_id = read_word()
            if section_id == 0x002B:  # TERMINATOR
                check_value('MODULE_Reserved', 0x0000, read_dword())
                section_id = None
            if section_id is not None:
                log.warning('unknown or invalid module section id {0:04X}'.format(section_id))

            if modulestreamname_streamname is None or moduleoffset_textoffset is None:
                # without these two records the code stream cannot be located
                raise ValueError('MODULESTREAMNAME or MODULEOFFSET record missing in module {0}'
                                 .format(projectmodule_index))

            # The module name is a byte string: decode it so that it can be
            # matched against the (text) names found in the PROJECT stream.
            module_name = modulename_modulename.decode(vba_codec, 'replace')
            log.debug("ModuleName = {0}".format(module_name))
            log.debug("ModuleNameUnicode = {0}".format(modulename_unicode_modulename_unicode))
            log.debug("StreamName = {0}".format(modulestreamname_streamname))
            try:
                streamname_unicode = modulestreamname_streamname.decode(vba_codec)
            except UnicodeError:
                log.debug('failed to decode stream name {0!r} with codec {1}'
                          .format(modulestreamname_streamname, vba_codec))
                streamname_unicode = modulestreamname_streamname.decode(vba_codec, errors='replace')
            log.debug("StreamName.decode('%s') = %s" % (vba_codec, streamname_unicode))
            log.debug("StreamNameUnicode = {0}".format(modulestreamname_streamname_unicode))
            log.debug("TextOffset = {0}".format(moduleoffset_textoffset))

            code_data = None
            try_names = streamname_unicode, \
                        modulename_unicode_modulename_unicode, \
                        modulestreamname_streamname_unicode
            for stream_name in try_names:
                # TODO: if olefile._find were less private, could replace this
                #        try-except with calls to it
                try:
                    code_path = vba_root + u'VBA/' + stream_name
                    log.debug('opening VBA code stream %s' % code_path)
                    code_data = ole.openstream(code_path).read()
                    break
                except IOError as ioe:
                    log.debug('failed to open stream VBA/%r (%r), try other name'
                              % (stream_name, ioe))

            if code_data is None:
                log.info("Could not open stream %d of %d ('VBA/' + one of %r)!"
                         % (projectmodule_index, projectmodules_count,
                            '/'.join("'" + stream_name + "'"
                                     for stream_name in try_names)))
                if relaxed:
                    continue  # ... with next submodule
                else:
                    raise SubstreamOpenError('[BASE]', 'VBA/' +
                                             modulename_unicode_modulename_unicode)

            log.debug("length of code_data = {0}".format(len(code_data)))
            log.debug("offset of code_data = {0}".format(moduleoffset_textoffset))
            # the compressed source code starts at TextOffset in the stream
            code_data = code_data[moduleoffset_textoffset:]
            if len(code_data) > 0:
                code_data = decompress_stream(code_data)
                # case-insensitive search in the code_modules dict to find the file extension:
                filext = code_modules.get(module_name.lower(), 'bin')
                filename = '{0}.{1}'.format(module_name, filext)
                #TODO: also yield the codepage so that callers can decode it properly
                yield (code_path, filename, code_data)
                log.debug('extracted file {0}'.format(filename))
            else:
                log.warning("module stream {0} has code data length 0".format(streamname_unicode))
        except (UnexpectedDataError, SubstreamOpenError):
            raise
        except Exception:
            log.info('Error parsing module {0} of {1} in _extract_vba:'
                     .format(projectmodule_index, projectmodules_count),
                     exc_info=True)
            if not relaxed:
                raise
  1650 +
  1651 +
def vba_collapse_long_lines(vba_code):
    """
    Join VBA statements that were split over several lines with the
    continuation marker (a space followed by an underscore at the end of a
    line). Each marker and the line ending that follows it are replaced by a
    single space.

    :param vba_code: str, VBA module code
    :return: str, VBA module code with long lines collapsed
    """
    # TODO: use a regex instead, to allow whitespaces after the underscore?
    # CRLF has to be processed before the lone CR and LF variants, otherwise
    # a stray LF would be left behind.
    for line_ending in ('\r\n', '\r', '\n'):
        vba_code = vba_code.replace(' _' + line_ending, ' ')
    return vba_code
  1665 +
  1666 +
def filter_vba(vba_code):
    """
    Strip the leading "Attribute VB_" lines from VBA source code. These lines
    are automatically added by MS Office and not displayed in the VBA Editor.
    This should only be used when displaying source code for human analysis.

    Note: a line containing a colon is never stripped (and ends the
    filtering), because the colon could be used to hide malicious
    instructions on the same line.

    :param vba_code: str, VBA source code
    :return: str, filtered VBA source code, with lines joined by "\\n"
    """
    lines = vba_code.splitlines()
    # index of the first line to keep; defaults to "nothing left" when every
    # line is a plain attribute line
    first_kept = len(lines)
    for position, text in enumerate(lines):
        if ':' in text or not text.startswith("Attribute VB_"):
            first_kept = position
            break
    #TODO: also remove empty lines?
    return '\n'.join(lines[first_kept:])
  1689 +
  1690 +
def detect_autoexec(vba_code, obfuscation=None):
    """
    Look for keywords of macros that run automatically when triggered by
    specific actions (e.g. when a document is opened or closed).

    Every keyword of AUTOEXEC_KEYWORDS is searched as a case-insensitive
    regular expression delimited by word boundaries.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword, description)
    """
    #TODO: merge code with detect_suspicious
    #TODO: if keyword is already a compiled regex, use it as-is
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    return [
        (keyword, description + suffix)
        for description, keywords in AUTOEXEC_KEYWORDS.items()
        for keyword in keywords
        # search using regex to detect word boundaries:
        if re.search(r'(?i)\b' + keyword + r'\b', vba_code)
    ]
  1716 +
  1717 +
def detect_suspicious(vba_code, obfuscation=None):
    """
    Look for suspicious keywords corresponding to potential malware behaviour.

    Each keyword from SUSPICIOUS_KEYWORDS is used as a regex fragment, matched
    case-insensitively between word boundaries.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword, description)
    """
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    hits = []
    for description, keywords in SUSPICIOUS_KEYWORDS.items():
        label = description + suffix
        hits.extend(
            (keyword, label)
            for keyword in keywords
            if re.search(r'(?i)\b' + keyword + r'\b', vba_code)
        )
    return hits
  1740 +
  1741 +
def detect_patterns(vba_code, obfuscation=None):
    """
    Extract values matching the patterns listed in RE_PATTERNS (IP addresses,
    URLs, e-mail addresses, executable file names, etc.).

    A given value is reported only once, for the first pattern that matches it.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to the pattern type
    :return: list of str tuples (pattern type, value)
    """
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    seen = set()
    hits = []
    for pattern_type, pattern_re in RE_PATTERNS:
        for match in pattern_re.finditer(vba_code):
            text = match.group()
            if text in seen:
                continue
            hits.append((pattern_type + suffix, text))
            seen.add(text)
    return hits
  1762 +
  1763 +
def detect_hex_strings(vba_code):
    """
    Find strings encoded in hexadecimal within the VBA code and decode them.

    :param vba_code: str, VBA source code
    :return: list of tuples (encoded string, decoded bytes), one per distinct match
    """
    seen = set()
    pairs = []
    for match in re_hex_string.finditer(vba_code):
        hex_text = match.group()
        if hex_text in seen:
            continue
        pairs.append((hex_text, binascii.unhexlify(hex_text)))
        seen.add(hex_text)
    return pairs
  1780 +
  1781 +
def detect_base64_strings(vba_code):
    """
    Find strings encoded in base64 within the VBA code and decode them.

    Candidates that re_nothex_check does not match (i.e. plain hex strings)
    and candidates listed in BASE64_WHITELIST are ignored.

    :param vba_code: str, VBA source code
    :return: list of tuples (encoded string, decoded bytes), one per distinct match
    """
    #TODO: avoid matching simple hex strings as base64?
    seen = set()
    pairs = []
    for match in re_base64_string.finditer(vba_code):
        # drop the surrounding double quotes
        candidate = match.group().strip('"')
        if re_nothex_check.search(candidate) is None:
            # nothing but hex digits: not considered base64
            continue
        if candidate in seen or candidate.lower() in BASE64_WHITELIST:
            continue
        try:
            decoded = base64.b64decode(candidate)
        except (TypeError, ValueError) as exc:
            # a decoding failure means it is likely not base64 at all
            log.debug('Failed to base64-decode (%s)' % exc)
        else:
            pairs.append((candidate, decoded))
            seen.add(candidate)
    return pairs
  1808 +
  1809 +
def detect_dridex_strings(vba_code):
    """
    Find strings obfuscated with a specific algorithm seen in Dridex samples,
    and decode them with the bundled DridexUrlDecoder.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    # imported lazily, only when this detection is actually used
    from thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode

    seen = set()
    pairs = []
    for match in re_dridex_string.finditer(vba_code):
        # strip the first and last character (the quotes matched by the regex)
        candidate = match.group()[1:-1]
        if re_nothex_check.search(candidate) is None:
            # nothing but hex digits: skip
            continue
        if candidate in seen:
            continue
        try:
            decoded = DridexUrlDecode(candidate)
        except Exception as exc:
            # any failure means it is likely not a dridex-encoded string
            log.debug('Failed to Dridex-decode (%s)' % exc)
        else:
            pairs.append((candidate, decoded))
            seen.add(candidate)
    return pairs
  1835 +
  1836 +
def detect_vba_strings(vba_code):
    """
    Find strings obfuscated with VBA expressions (Chr, Asc, Val, StrReverse,
    etc.) using the vba_expr_str grammar, and return their evaluated form.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    # TODO: handle exceptions
    # IMPORTANT: tabs must be expanded first so that the start/end offsets
    # reported by pyparsing index into the very same string.
    expanded = vba_code.expandtabs()
    seen = set()
    pairs = []
    for tokens, start, end in vba_expr_str.scanString(expanded):
        decoded = tokens[0]
        if not isinstance(decoded, VbaExpressionString):
            # plain string literal, not a VBA expression
            continue
        encoded = expanded[start:end]
        # skip duplicates and expressions that evaluate to themselves
        if encoded in seen or decoded == encoded:
            continue
        pairs.append((encoded, decoded))
        seen.add(encoded)
    return pairs
  1870 +
  1871 +
def json2ascii(json_obj, encoding='utf8', errors='replace'):
    """
    Recursively convert every bytes string in a JSON-like structure to str,
    so that the result can be dumped with json.dumps without trouble.

    None, bool, int, float and str values are returned unchanged. Dicts and
    lists are updated in place (and returned); tuples are immutable, so a new
    tuple is returned for them. Any other type is logged and left as is.

    :param json_obj: object to sanitize (None, scalar, str, bytes, dict, list or tuple)
    :param encoding: str, encoding used to decode bytes strings
    :param errors: str, error handling scheme passed to bytes.decode
    :return: the sanitized object
    """
    if json_obj is None or isinstance(json_obj, (bool, int, float, str)):
        return json_obj
    if isinstance(json_obj, bytes):
        decoded = json_obj.decode(encoding, errors)
        # the raw bytes cannot safely go into the logger, log the decoded form
        log.debug('json2ascii: encode unicode: {0}'.format(decoded))
        return decoded
    if isinstance(json_obj, dict):
        # only values are replaced, so iterating over the keys stays safe
        for key in json_obj:
            json_obj[key] = json2ascii(json_obj[key], encoding, errors)
        return json_obj
    if isinstance(json_obj, list):
        # write back by index: rebinding the loop variable would change nothing
        for index, item in enumerate(json_obj):
            json_obj[index] = json2ascii(item, encoding, errors)
        return json_obj
    if isinstance(json_obj, tuple):
        return tuple(json2ascii(item, encoding, errors) for item in json_obj)
    log.debug('unexpected type in json2ascii: {0} -- leave as is'
              .format(type(json_obj)))
    return json_obj
  1907 +
  1908 +
# module-level flag: True once the opening '[' of the top-level list was printed
_have_printed_json_start = False


def print_json(json_dict=None, _json_is_last=False, **json_parts):
    """ print json.dumps(json2ascii(..)) line by line, as one entry of a json list

    can use in two ways:
    (1) print_json(some_dict)
    (2) print_json(key1=value1, key2=value2, ...)

    The first call prints the opening '[' of the top-level list; every entry
    is followed by a comma, except the last one which is followed by ']'.

    :param json_dict: None or dict, the entry to print (usage 1)
    :param bool _json_is_last: set to True only for very last entry to complete
                               the top-level json-list
    :param json_parts: key=value pairs making up the entry (usage 2)
    :raises ValueError: if both forms are mixed, or json_dict is not a dict
    """
    global _have_printed_json_start

    if json_dict and json_parts:
        raise ValueError('Invalid json argument: want either single dict or '
                         'key=value parts but got both)')
    if json_dict is not None and not isinstance(json_dict, dict):
        raise ValueError('Invalid json argument: want either single dict or '
                         'key=value parts but got {0} instead of dict)'
                         .format(type(json_dict)))
    if json_parts:
        json_dict = json_parts

    if not _have_printed_json_start:
        print('[')
        _have_printed_json_start = True

    dumped = json.dumps(json2ascii(json_dict), check_circular=False,
                        indent=4, ensure_ascii=False)
    lines = dumped.splitlines()
    final_line = lines[-1]
    for line in lines[:-1]:
        print(' {0}'.format(line))
    if not _json_is_last:
        # more entries will follow: terminate this one with a comma
        print(' {0},'.format(final_line))
        return
    # very last entry: no comma, then close the top-level list
    print(' {0}'.format(final_line))
    print(']')
  1946 +
  1947 +
class VBA_Scanner(object):
    """
    Class to scan the source code of a VBA module to find obfuscated strings,
    suspicious keywords, IOCs, auto-executable macros, etc.
    """

    def __init__(self, vba_code):
        """
        VBA_Scanner constructor

        :param vba_code: str, VBA source code to be analyzed
        """
        # join long lines ending with " _":
        self.code = vba_collapse_long_lines(vba_code)
        # decoded payloads, scanned in addition to the plain code by scan():
        self.code_hex = b''
        self.code_hex_rev = b''
        self.code_rev_hex = b''
        self.code_base64 = b''
        self.code_dridex = ''
        self.code_vba = ''
        # None before scanning, then True if the code mentions StrReverse
        self.strReverse = None
        # results = None before scanning, then a list of tuples after scanning
        self.results = None
        self.autoexec_keywords = None
        self.suspicious_keywords = None
        self.iocs = None
        self.hex_strings = None
        self.base64_strings = None
        self.dridex_strings = None
        self.vba_strings = None

    def scan(self, include_decoded_strings=False, deobfuscate=False):
        """
        Analyze the provided VBA code to detect suspicious keywords,
        auto-executable macros, IOC patterns, obfuscation patterns
        such as hex-encoded strings.

        All intermediate buffers are rebuilt on every call, so scanning the
        same object several times always yields the same results.

        :param include_decoded_strings: bool, if True, all encoded strings will be included with their decoded content.
        :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
        :return: list of tuples (type, keyword, description)
            (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String',
            'Dridex string' or 'VBA string')
        """
        # First, detect and extract hex-encoded strings:
        self.hex_strings = detect_hex_strings(self.code)
        # detect if the code contains StrReverse:
        self.strReverse = 'strreverse' in self.code.lower()
        # Then gather the decoded strings, to detect obfuscated IOCs and keywords:
        self.code_hex = b''.join(b'\n' + decoded
                                 for _, decoded in self.hex_strings)
        if self.strReverse:
            # StrReverse applied after hex decoding:
            self.code_hex_rev = b''.join(b'\n' + decoded[::-1]
                                         for _, decoded in self.hex_strings)
            # StrReverse applied before hex decoding:
            # example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/
            self.code_rev_hex = b''.join(b'\n' + binascii.unhexlify(encoded[::-1])
                                         for encoded, _ in self.hex_strings)
        else:
            self.code_hex_rev = b''
            self.code_rev_hex = b''
        #TODO: also append the full code reversed if StrReverse? (risk of false positives?)
        # Detect Base64-encoded strings
        self.base64_strings = detect_base64_strings(self.code)
        self.code_base64 = b''.join(b'\n' + decoded
                                    for _, decoded in self.base64_strings)
        # Detect Dridex-encoded strings
        self.dridex_strings = detect_dridex_strings(self.code)
        self.code_dridex = ''.join('\n' + decoded
                                   for _, decoded in self.dridex_strings)
        # Detect obfuscated strings in VBA expressions
        if deobfuscate:
            self.vba_strings = detect_vba_strings(self.code)
        else:
            self.vba_strings = []
        self.code_vba = ''.join('\n' + decoded
                                for _, decoded in self.vba_strings)

        self.autoexec_keywords = []
        self.suspicious_keywords = []
        self.iocs = []

        # scan the plain code, then each deobfuscated variant of it
        for code, obfuscation in (
                (self.code, None),
                (self.code_hex, 'Hex'),
                (self.code_hex_rev, 'Hex+StrReverse'),
                (self.code_rev_hex, 'StrReverse+Hex'),
                (self.code_base64, 'Base64'),
                (self.code_dridex, 'Dridex'),
                (self.code_vba, 'VBA expression'),
        ):
            # the detectors work on text: decoded payloads are bytes
            if isinstance(code, bytes):
                code = code.decode('utf-8', 'replace')
            self.autoexec_keywords += detect_autoexec(code, obfuscation)
            self.suspicious_keywords += detect_suspicious(code, obfuscation)
            self.iocs += detect_patterns(code, obfuscation)

        # If encoded strings were discovered, add an item to suspicious keywords:
        if self.hex_strings:
            self.suspicious_keywords.append(('Hex Strings',
                'Hex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.base64_strings:
            self.suspicious_keywords.append(('Base64 Strings',
                'Base64-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.dridex_strings:
            self.suspicious_keywords.append(('Dridex Strings',
                'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.vba_strings:
            self.suspicious_keywords.append(('VBA obfuscated Strings',
                'VBA string expressions were detected, may be used to obfuscate strings (option --decode to see all)'))

        results = []
        # report each keyword only once per category (first description wins)
        for category, pairs in (('AutoExec', self.autoexec_keywords),
                                ('Suspicious', self.suspicious_keywords)):
            seen = set()
            for keyword, description in pairs:
                if keyword not in seen:
                    results.append((category, keyword, description))
                    seen.add(keyword)
        seen = set()
        for pattern_type, value in self.iocs:
            if value not in seen:
                results.append(('IOC', value, pattern_type))
                seen.add(value)

        # include decoded strings only if they are printable or if --decode option:
        for label, pairs in (('Hex String', self.hex_strings),
                             ('Base64 String', self.base64_strings),
                             ('Dridex string', self.dridex_strings),
                             ('VBA string', self.vba_strings)):
            for encoded, decoded in pairs:
                if include_decoded_strings or is_printable(decoded):
                    results.append((label, decoded, encoded))
        self.results = results
        return results

    def scan_summary(self):
        """
        Analyze the provided VBA code to detect suspicious keywords,
        auto-executable macros, IOC patterns, obfuscation patterns
        such as hex-encoded strings.

        :return: tuple with the number of items found for each category:
            (autoexec, suspicious, IOCs, hex, base64, dridex, vba)
        """
        # avoid scanning the same code twice:
        if self.results is None:
            self.scan()
        return (len(self.autoexec_keywords), len(self.suspicious_keywords),
                len(self.iocs), len(self.hex_strings), len(self.base64_strings),
                len(self.dridex_strings), len(self.vba_strings))
  2103 +
  2104 +
def scan_vba(vba_code, include_decoded_strings, deobfuscate=False):
    """
    Convenience wrapper: build a VBA_Scanner for the given code and run its
    scan() method once, to detect suspicious keywords, auto-executable macros,
    IOC patterns and obfuscation patterns such as hex-encoded strings.

    :param vba_code: str, VBA source code to be analyzed
    :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content.
    :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
    :return: list of tuples (type, keyword, description), as returned by VBA_Scanner.scan
    """
    scanner = VBA_Scanner(vba_code)
    return scanner.scan(include_decoded_strings, deobfuscate)
  2119 +
  2120 +
  2121 +#=== CLASSES =================================================================
  2122 +
  2123 +class VBA_Parser(object):
  2124 + """
  2125 + Class to parse MS Office files, to detect VBA macros and extract VBA source code
  2126 + Supported file formats:
  2127 + - Word 97-2003 (.doc, .dot)
  2128 + - Word 2007+ (.docm, .dotm)
  2129 + - Word 2003 XML (.xml)
  2130 + - Word MHT - Single File Web Page / MHTML (.mht)
  2131 + - Excel 97-2003 (.xls)
  2132 + - Excel 2007+ (.xlsm, .xlsb)
  2133 + - PowerPoint 97-2003 (.ppt)
  2134 + - PowerPoint 2007+ (.pptm, .ppsm)
  2135 + """
  2136 +
  2137 + def __init__(self, filename, data=None, container=None, relaxed=False):
  2138 + """
  2139 + Constructor for VBA_Parser
  2140 +
  2141 + :param filename: filename or path of file to parse, or file-like object
  2142 +
  2143 + :param data: None or bytes str, if None the file will be read from disk (or from the file-like object).
  2144 + If data is provided as a bytes string, it will be parsed as the content of the file in memory,
  2145 + and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb').
  2146 +
  2147 + :param container: str, path and filename of container if the file is within
  2148 + a zip archive, None otherwise.
  2149 +
  2150 + :param relaxed: if True, treat mal-formed documents and missing streams more like MS office:
  2151 + do nothing; if False (default), raise errors in these cases
  2152 +
  2153 + raises a FileOpenError if all attemps to interpret the data header failed
  2154 + """
  2155 + #TODO: filename should only be a string, data should be used for the file-like object
  2156 + #TODO: filename should be mandatory, optional data is a string or file-like object
  2157 + #TODO: also support olefile and zipfile as input
  2158 + if data is None:
  2159 + # open file from disk:
  2160 + _file = filename
  2161 + else:
  2162 + # file already read in memory, make it a file-like object for zipfile:
  2163 + _file = BytesIO(data)
  2164 + #self.file = _file
  2165 + self.ole_file = None
  2166 + self.ole_subfiles = []
  2167 + self.filename = filename
  2168 + self.container = container
  2169 + self.relaxed = relaxed
  2170 + self.type = None
  2171 + self.vba_projects = None
  2172 + self.vba_forms = None
  2173 + self.contains_macros = None # will be set to True or False by detect_macros
  2174 + self.vba_code_all_modules = None # to store the source code of all modules
  2175 + # list of tuples for each module: (subfilename, stream_path, vba_filename, vba_code)
  2176 + self.modules = None
  2177 + # Analysis results: list of tuples (type, keyword, description) - See VBA_Scanner
  2178 + self.analysis_results = None
  2179 + # statistics for the scan summary and flags
  2180 + self.nb_macros = 0
  2181 + self.nb_autoexec = 0
  2182 + self.nb_suspicious = 0
  2183 + self.nb_iocs = 0
  2184 + self.nb_hexstrings = 0
  2185 + self.nb_base64strings = 0
  2186 + self.nb_dridexstrings = 0
  2187 + self.nb_vbastrings = 0
  2188 +
  2189 + # if filename is None:
  2190 + # if isinstance(_file, basestring):
  2191 + # if len(_file) < olefile.MINIMAL_OLEFILE_SIZE:
  2192 + # self.filename = _file
  2193 + # else:
  2194 + # self.filename = '<file in bytes string>'
  2195 + # else:
  2196 + # self.filename = '<file-like object>'
  2197 + if olefile.isOleFile(_file):
  2198 + # This looks like an OLE file
  2199 + self.open_ole(_file)
  2200 +
  2201 + # if this worked, try whether it is a ppt file (special ole file)
  2202 + self.open_ppt()
  2203 + if self.type is None and zipfile.is_zipfile(_file):
  2204 + # Zip file, which may be an OpenXML document
  2205 + self.open_openxml(_file)
  2206 + if self.type is None:
  2207 + # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML,
  2208 + # or a plain text file containing VBA code
  2209 + if data is None:
  2210 + data = open(filename, 'rb').read()
  2211 + # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
  2212 + if b'http://schemas.microsoft.com/office/word/2003/wordml' in data:
  2213 + self.open_word2003xml(data)
  2214 + # store a lowercase version for the next tests:
  2215 + data_lowercase = data.lower()
  2216 + # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"):
  2217 + # According to my tests, these files usually start with "MIME-Version: 1.0" on the 1st line
  2218 + # BUT Word accepts a blank line or other MIME headers inserted before,
  2219 + # and even whitespaces in between "MIME", "-", "Version" and ":". The version number is ignored.
  2220 + # And the line is case insensitive.
  2221 + # so we'll just check the presence of mime, version and multipart anywhere:
  2222 + if self.type is None and b'mime' in data_lowercase and b'version' in data_lowercase \
  2223 + and b'multipart' in data_lowercase:
  2224 + self.open_mht(data)
  2225 + #TODO: handle exceptions
  2226 + #TODO: Excel 2003 XML
  2227 + # Check if this is a plain text VBA or VBScript file:
  2228 + # To avoid scanning binary files, we simply check for some control chars:
  2229 + if self.type is None and b'\x00' not in data:
  2230 + self.open_text(data)
  2231 + if self.type is None:
  2232 + # At this stage, could not match a known format:
  2233 + msg = '%s is not a supported file type, cannot extract VBA Macros.' % self.filename
  2234 + log.info(msg)
  2235 + raise FileOpenError(msg)
  2236 +
  2237 + def open_ole(self, _file):
  2238 + """
  2239 + Open an OLE file
  2240 + :param _file: filename or file contents in a file object
  2241 + :return: nothing
  2242 + """
  2243 + log.info('Opening OLE file %s' % self.filename)
  2244 + try:
  2245 + # Open and parse the OLE file, using unicode for path names:
  2246 + self.ole_file = olefile.OleFileIO(_file, path_encoding=None)
  2247 + # set type only if parsing succeeds
  2248 + self.type = TYPE_OLE
  2249 + except (IOError, TypeError, ValueError) as exc:
  2250 + # TODO: handle OLE parsing exceptions
  2251 + log.info('Failed OLE parsing for file %r (%s)' % (self.filename, exc))
  2252 + log.debug('Trace:', exc_info=True)
  2253 +
  2254 +
  2255 + def open_openxml(self, _file):
  2256 + """
  2257 + Open an OpenXML file
  2258 + :param _file: filename or file contents in a file object
  2259 + :return: nothing
  2260 + """
  2261 + # This looks like a zip file, need to look for vbaProject.bin inside
  2262 + # It can be any OLE file inside the archive
  2263 + #...because vbaProject.bin can be renamed:
  2264 + # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
  2265 + log.info('Opening ZIP/OpenXML file %s' % self.filename)
  2266 + try:
  2267 + z = zipfile.ZipFile(_file)
  2268 + #TODO: check if this is actually an OpenXML file
  2269 + #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically
  2270 + # check each file within the zip if it is an OLE file, by reading its magic:
  2271 + for subfile in z.namelist():
  2272 + magic = z.open(subfile).read(len(olefile.MAGIC))
  2273 + if magic == olefile.MAGIC:
  2274 + log.debug('Opening OLE file %s within zip' % subfile)
  2275 + ole_data = z.open(subfile).read()
  2276 + try:
  2277 + self.ole_subfiles.append(
  2278 + VBA_Parser(filename=subfile, data=ole_data,
  2279 + relaxed=self.relaxed))
  2280 + except OlevbaBaseException as exc:
  2281 + if self.relaxed:
  2282 + log.info('%s is not a valid OLE file (%s)' % (subfile, exc))
  2283 + log.debug('Trace:', exc_info=True)
  2284 + continue
  2285 + else:
  2286 + raise SubstreamOpenError(self.filename, subfile,
  2287 + exc)
  2288 + z.close()
  2289 + # set type only if parsing succeeds
  2290 + self.type = TYPE_OpenXML
  2291 + except OlevbaBaseException as exc:
  2292 + if self.relaxed:
  2293 + log.info('Error {0} caught in Zip/OpenXML parsing for file {1}'
  2294 + .format(exc, self.filename))
  2295 + log.debug('Trace:', exc_info=True)
  2296 + else:
  2297 + raise
  2298 + except (RuntimeError, zipfile.BadZipfile, zipfile.LargeZipFile, IOError) as exc:
  2299 + # TODO: handle parsing exceptions
  2300 + log.info('Failed Zip/OpenXML parsing for file %r (%s)'
  2301 + % (self.filename, exc))
  2302 + log.debug('Trace:', exc_info=True)
  2303 +
  2304 + def open_word2003xml(self, data):
  2305 + """
  2306 + Open a Word 2003 XML file
  2307 + :param data: file contents in a string or bytes
  2308 + :return: nothing
  2309 + """
  2310 + log.info('Opening Word 2003 XML file %s' % self.filename)
  2311 + try:
  2312 + # parse the XML content
  2313 + # TODO: handle XML parsing exceptions
  2314 + et = ET.fromstring(data)
  2315 + # find all the binData elements:
  2316 + for bindata in et.getiterator(TAG_BINDATA):
  2317 + # the binData content is an OLE container for the VBA project, compressed
  2318 + # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
  2319 + # get the filename:
  2320 + fname = bindata.get(ATTR_NAME, 'noname.mso')
  2321 + # decode the base64 activemime
  2322 + mso_data = binascii.a2b_base64(bindata.text)
  2323 + if is_mso_file(mso_data):
  2324 + # decompress the zlib data stored in the MSO file, which is the OLE container:
  2325 + # TODO: handle different offsets => separate function
  2326 + try:
  2327 + ole_data = mso_file_extract(mso_data)
  2328 + self.ole_subfiles.append(
  2329 + VBA_Parser(filename=fname, data=ole_data,
  2330 + relaxed=self.relaxed))
  2331 + except OlevbaBaseException as exc:
  2332 + if self.relaxed:
  2333 + log.info('Error parsing subfile {0}: {1}'
  2334 + .format(fname, exc))
  2335 + log.debug('Trace:', exc_info=True)
  2336 + else:
  2337 + raise SubstreamOpenError(self.filename, fname, exc)
  2338 + else:
  2339 + log.info('%s is not a valid MSO file' % fname)
  2340 + # set type only if parsing succeeds
  2341 + self.type = TYPE_Word2003_XML
  2342 + except OlevbaBaseException as exc:
  2343 + if self.relaxed:
  2344 + log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
  2345 + log.debug('Trace:', exc_info=True)
  2346 + else:
  2347 + raise
  2348 + except Exception as exc:
  2349 + # TODO: differentiate exceptions for each parsing stage
  2350 + # (but ET is different libs, no good exception description in API)
  2351 + # found: XMLSyntaxError
  2352 + log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
  2353 + log.debug('Trace:', exc_info=True)
  2354 +
  2355 + def open_mht(self, data):
  2356 + """
  2357 + Open a MHTML file
  2358 + :param data: file contents in a string or bytes
  2359 + :return: nothing
  2360 + """
  2361 + log.info('Opening MHTML file %s' % self.filename)
  2362 + try:
  2363 + if isinstance(data,bytes):
  2364 + data = data.decode('utf8', 'replace')
  2365 + # parse the MIME content
  2366 + # remove any leading whitespace or newline (workaround for issue in email package)
  2367 + stripped_data = data.lstrip('\r\n\t ')
  2368 + # strip any junk from the beginning of the file
  2369 + # (issue #31 fix by Greg C - gdigreg)
  2370 + # TODO: improve keywords to avoid false positives
  2371 + mime_offset = stripped_data.find('MIME')
  2372 + content_offset = stripped_data.find('Content')
  2373 + # if "MIME" is found, and located before "Content":
  2374 + if -1 < mime_offset <= content_offset:
  2375 + stripped_data = stripped_data[mime_offset:]
  2376 + # else if "Content" is found, and before "MIME"
  2377 + # TODO: can it work without "MIME" at all?
  2378 + elif content_offset > -1:
  2379 + stripped_data = stripped_data[content_offset:]
  2380 + # TODO: quick and dirty fix: insert a standard line with MIME-Version header?
  2381 + mhtml = email.message_from_string(stripped_data)
  2382 + # find all the attached files:
  2383 + for part in mhtml.walk():
  2384 + content_type = part.get_content_type() # always returns a value
  2385 + fname = part.get_filename(None) # returns None if it fails
  2386 + # TODO: get content-location if no filename
  2387 + log.debug('MHTML part: filename=%r, content-type=%r' % (fname, content_type))
  2388 + part_data = part.get_payload(decode=True)
  2389 + # VBA macros are stored in a binary file named "editdata.mso".
  2390 + # the data content is an OLE container for the VBA project, compressed
  2391 + # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
  2392 + # decompress the zlib data starting at offset 0x32, which is the OLE container:
  2393 + # check ActiveMime header:
  2394 +
  2395 + if (isinstance(part_data, str) or isinstance(part_data, bytes)) and is_mso_file(part_data):
  2396 + log.debug('Found ActiveMime header, decompressing MSO container')
  2397 + try:
  2398 + ole_data = mso_file_extract(part_data)
  2399 +
  2400 + # TODO: check if it is actually an OLE file
  2401 + # TODO: get the MSO filename from content_location?
  2402 + self.ole_subfiles.append(
  2403 + VBA_Parser(filename=fname, data=ole_data,
  2404 + relaxed=self.relaxed))
  2405 + except OlevbaBaseException as exc:
  2406 + if self.relaxed:
  2407 + log.info('%s does not contain a valid OLE file (%s)'
  2408 + % (fname, exc))
  2409 + log.debug('Trace:', exc_info=True)
  2410 + # TODO: bug here - need to split in smaller functions/classes?
  2411 + else:
  2412 + raise SubstreamOpenError(self.filename, fname, exc)
  2413 + else:
  2414 + log.debug('type(part_data) = %s' % type(part_data))
  2415 + try:
  2416 + log.debug('part_data[0:20] = %r' % part_data[0:20])
  2417 + except TypeError as err:
  2418 + log.debug('part_data has no __getitem__')
  2419 + # set type only if parsing succeeds
  2420 + self.type = TYPE_MHTML
  2421 + except OlevbaBaseException:
  2422 + raise
  2423 + except Exception:
  2424 + log.info('Failed MIME parsing for file %r - %s'
  2425 + % (self.filename, MSG_OLEVBA_ISSUES))
  2426 + log.debug('Trace:', exc_info=True)
  2427 +
  2428 + def open_ppt(self):
  2429 + """ try to interpret self.ole_file as PowerPoint 97-2003 using PptParser
  2430 +
  2431 + Although self.ole_file is a valid olefile.OleFileIO, we set
  2432 + self.ole_file = None in here and instead set self.ole_subfiles to the
  2433 + VBA ole streams found within the main ole file. That makes most of the
  2434 + code below treat this like an OpenXML file and only look at the
  2435 + ole_subfiles (except find_vba_* which needs to explicitly check for
  2436 + self.type)
  2437 + """
  2438 +
  2439 + log.info('Check whether OLE file is PPT')
  2440 + ppt_parser.enable_logging()
  2441 + try:
  2442 + ppt = ppt_parser.PptParser(self.ole_file, fast_fail=True)
  2443 + for vba_data in ppt.iter_vba_data():
  2444 + self.ole_subfiles.append(VBA_Parser(None, vba_data,
  2445 + container='PptParser'))
  2446 + log.info('File is PPT')
  2447 + self.ole_file.close() # just in case
  2448 + self.ole_file = None # required to make other methods look at ole_subfiles
  2449 + self.type = TYPE_PPT
  2450 + except Exception as exc:
  2451 + if self.container == 'PptParser':
  2452 + # this is a subfile of a ppt --> to be expected that is no ppt
  2453 + log.debug('PPT subfile is not a PPT file')
  2454 + else:
  2455 + log.debug("File appears not to be a ppt file (%s)" % exc)
  2456 +
  2457 +
  2458 + def open_text(self, data):
  2459 + """
  2460 + Open a text file containing VBA or VBScript source code
  2461 + :param data: file contents in a string or bytes
  2462 + :return: nothing
  2463 + """
  2464 + log.info('Opening text file %s' % self.filename)
  2465 + # directly store the source code:
  2466 + if isinstance(data,bytes):
  2467 + data=data.decode('utf8','replace')
  2468 + self.vba_code_all_modules = data
  2469 + self.contains_macros = True
  2470 + # set type only if parsing succeeds
  2471 + self.type = TYPE_TEXT
  2472 +
  2473 +
  2474 + def find_vba_projects(self):
  2475 + """
  2476 + Finds all the VBA projects stored in an OLE file.
  2477 +
  2478 + Return None if the file is not OLE but OpenXML.
  2479 + Return a list of tuples (vba_root, project_path, dir_path) for each VBA project.
  2480 + vba_root is the path of the root OLE storage containing the VBA project,
  2481 + including a trailing slash unless it is the root of the OLE file.
  2482 + project_path is the path of the OLE stream named "PROJECT" within the VBA project.
  2483 + dir_path is the path of the OLE stream named "VBA/dir" within the VBA project.
  2484 +
  2485 + If this function returns an empty list for one of the supported formats
  2486 + (i.e. Word, Excel, Powerpoint), then the file does not contain VBA macros.
  2487 +
  2488 + :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path)
  2489 + for each VBA project found if OLE file
  2490 + """
  2491 + log.debug('VBA_Parser.find_vba_projects')
  2492 +
  2493 + # if the file is not OLE but OpenXML, return None:
  2494 + if self.ole_file is None and self.type != TYPE_PPT:
  2495 + return None
  2496 +
  2497 + # if this method has already been called, return previous result:
  2498 + if self.vba_projects is not None:
  2499 + return self.vba_projects
  2500 +
  2501 + # if this is a ppt file (PowerPoint 97-2003):
  2502 + # self.ole_file is None but the ole_subfiles do contain vba_projects
  2503 + # (like for OpenXML files).
  2504 + if self.type == TYPE_PPT:
  2505 + # TODO: so far, this function is never called for PPT files, but
  2506 + # if that happens, the information is lost which ole file contains
  2507 + # which storage!
  2508 + log.warning('Returned info is not complete for PPT types!')
  2509 + self.vba_projects = []
  2510 + for subfile in self.ole_subfiles:
  2511 + self.vba_projects.extend(subfile.find_vba_projects())
  2512 + return self.vba_projects
  2513 +
  2514 + # Find the VBA project root (different in MS Word, Excel, etc):
  2515 + # - Word 97-2003: Macros
  2516 + # - Excel 97-2003: _VBA_PROJECT_CUR
  2517 + # - PowerPoint 97-2003: PptParser has identified ole_subfiles
  2518 + # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin.
  2519 + # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word
  2520 + # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word
  2521 + # - Visio 2007: not supported yet (different file structure)
  2522 +
  2523 + # According to MS-OVBA section 2.2.1:
  2524 + # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream
  2525 + # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream
  2526 + # - all names are case-insensitive
  2527 +
  2528 + def check_vba_stream(ole, vba_root, stream_path):
  2529 + full_path = vba_root + stream_path
  2530 + if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM:
  2531 + log.debug('Found %s stream: %s' % (stream_path, full_path))
  2532 + return full_path
  2533 + else:
  2534 + log.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path)
  2535 + return False
  2536 +
  2537 + # start with an empty list:
  2538 + self.vba_projects = []
  2539 + # Look for any storage containing those storage/streams:
  2540 + ole = self.ole_file
  2541 + for storage in ole.listdir(streams=False, storages=True):
  2542 + log.debug('Checking storage %r' % storage)
  2543 + # Look for a storage ending with "VBA":
  2544 + if storage[-1].upper() == 'VBA':
  2545 + log.debug('Found VBA storage: %s' % ('/'.join(storage)))
  2546 + vba_root = '/'.join(storage[:-1])
  2547 + # Add a trailing slash to vba_root, unless it is the root of the OLE file:
  2548 + # (used later to append all the child streams/storages)
  2549 + if vba_root != '':
  2550 + vba_root += '/'
  2551 + log.debug('Checking vba_root="%s"' % vba_root)
  2552 +
  2553 + # Check if the VBA root storage also contains a PROJECT stream:
  2554 + project_path = check_vba_stream(ole, vba_root, 'PROJECT')
  2555 + if not project_path: continue
  2556 + # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream:
  2557 + vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT')
  2558 + if not vba_project_path: continue
  2559 + # Check if the VBA root storage also contains a VBA/dir stream:
  2560 + dir_path = check_vba_stream(ole, vba_root, 'VBA/dir')
  2561 + if not dir_path: continue
  2562 + # Now we are pretty sure it is a VBA project structure
  2563 + log.debug('VBA root storage: "%s"' % vba_root)
  2564 + # append the results to the list as a tuple for later use:
  2565 + self.vba_projects.append((vba_root, project_path, dir_path))
  2566 + return self.vba_projects
  2567 +
  2568 + def detect_vba_macros(self):
  2569 + """
  2570 + Detect the potential presence of VBA macros in the file, by checking
  2571 + if it contains VBA projects. Both OLE and OpenXML files are supported.
  2572 +
  2573 + Important: for now, results are accurate only for Word, Excel and PowerPoint
  2574 +
  2575 + Note: this method does NOT attempt to check the actual presence or validity
  2576 + of VBA macro source code, so there might be false positives.
  2577 + It may also detect VBA macros in files embedded within the main file,
  2578 + for example an Excel workbook with macros embedded into a Word
  2579 + document without macros may be detected, without distinction.
  2580 +
  2581 + :return: bool, True if at least one VBA project has been found, False otherwise
  2582 + """
  2583 + #TODO: return None or raise exception if format not supported
  2584 + #TODO: return the number of VBA projects found instead of True/False?
  2585 + # if this method was already called, return the previous result:
  2586 + if self.contains_macros is not None:
  2587 + return self.contains_macros
  2588 + # if OpenXML/PPT, check all the OLE subfiles:
  2589 + if self.ole_file is None:
  2590 + for ole_subfile in self.ole_subfiles:
  2591 + if ole_subfile.detect_vba_macros():
  2592 + self.contains_macros = True
  2593 + return True
  2594 + # otherwise, no macro found:
  2595 + self.contains_macros = False
  2596 + return False
  2597 + # otherwise it's an OLE file, find VBA projects:
  2598 + vba_projects = self.find_vba_projects()
  2599 + if len(vba_projects) == 0:
  2600 + self.contains_macros = False
  2601 + else:
  2602 + self.contains_macros = True
  2603 + # Also look for VBA code in any stream including orphans
  2604 + # (happens in some malformed files)
  2605 + ole = self.ole_file
  2606 + for sid in range(len(ole.direntries)):
  2607 + # check if id is already done above:
  2608 + log.debug('Checking DirEntry #%d' % sid)
  2609 + d = ole.direntries[sid]
  2610 + if d is None:
  2611 + # this direntry is not part of the tree: either unused or an orphan
  2612 + d = ole._load_direntry(sid)
  2613 + log.debug('This DirEntry is an orphan or unused')
  2614 + if d.entry_type == olefile.STGTY_STREAM:
  2615 + # read data
  2616 + log.debug('Reading data from stream %r - size: %d bytes' % (d.name, d.size))
  2617 + try:
  2618 + data = ole._open(d.isectStart, d.size).read()
  2619 + log.debug('Read %d bytes' % len(data))
  2620 + if len(data) > 200:
  2621 + log.debug('%r...[much more data]...%r' % (data[:100], data[-50:]))
  2622 + else:
  2623 + log.debug(repr(data))
  2624 + if 'Attribut' in data.decode('utf-8','ignore'):
  2625 + log.debug('Found VBA compressed code')
  2626 + self.contains_macros = True
  2627 + except IOError as exc:
  2628 + if self.relaxed:
  2629 + log.info('Error when reading OLE Stream %r' % d.name)
  2630 + log.debug('Trace:', exc_trace=True)
  2631 + else:
  2632 + raise SubstreamOpenError(self.filename, d.name, exc)
  2633 + return self.contains_macros
  2634 +
  2635 + def extract_macros(self):
  2636 + """
  2637 + Extract and decompress source code for each VBA macro found in the file
  2638 +
  2639 + Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found
  2640 + If the file is OLE, filename is the path of the file.
  2641 + If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros
  2642 + within the zip archive, e.g. word/vbaProject.bin.
  2643 + If the file is PPT, result is as for OpenXML but filename is useless
  2644 + """
  2645 + log.debug('extract_macros:')
  2646 + if self.ole_file is None:
  2647 + # This may be either an OpenXML/PPT or a text file:
  2648 + if self.type == TYPE_TEXT:
  2649 + # This is a text file, yield the full code:
  2650 + yield (self.filename, '', self.filename, self.vba_code_all_modules)
  2651 + else:
  2652 + # OpenXML/PPT: recursively yield results from each OLE subfile:
  2653 + for ole_subfile in self.ole_subfiles:
  2654 + for results in ole_subfile.extract_macros():
  2655 + yield results
  2656 + else:
  2657 + # This is an OLE file:
  2658 + self.find_vba_projects()
  2659 + # set of stream ids
  2660 + vba_stream_ids = set()
  2661 + for vba_root, project_path, dir_path in self.vba_projects:
  2662 + # extract all VBA macros from that VBA root storage:
  2663 + for stream_path, vba_filename, vba_code in \
  2664 + _extract_vba(self.ole_file, vba_root, project_path,
  2665 + dir_path, self.relaxed):
  2666 + # store direntry ids in a set:
  2667 + vba_stream_ids.add(self.ole_file._find(stream_path))
  2668 + yield (self.filename, stream_path, vba_filename, vba_code)
  2669 + # Also look for VBA code in any stream including orphans
  2670 + # (happens in some malformed files)
  2671 + ole = self.ole_file
  2672 + for sid in range(len(ole.direntries)):
  2673 + # check if id is already done above:
  2674 + log.debug('Checking DirEntry #%d' % sid)
  2675 + if sid in vba_stream_ids:
  2676 + log.debug('Already extracted')
  2677 + continue
  2678 + d = ole.direntries[sid]
  2679 + if d is None:
  2680 + # this direntry is not part of the tree: either unused or an orphan
  2681 + d = ole._load_direntry(sid)
  2682 + log.debug('This DirEntry is an orphan or unused')
  2683 + if d.entry_type == olefile.STGTY_STREAM:
  2684 + # read data
  2685 + log.debug('Reading data from stream %r' % d.name)
  2686 + data = ole._open(d.isectStart, d.size).read()
  2687 + for match in re.finditer(rb'\x00Attribut[^e]', data, flags=re.IGNORECASE):
  2688 + start = match.start() - 3
  2689 + log.debug('Found VBA compressed code at index %X' % start)
  2690 + compressed_code = data[start:]
  2691 + try:
  2692 + vba_code = decompress_stream(compressed_code)
  2693 + yield (self.filename, d.name, d.name, vba_code)
  2694 + except Exception as exc:
  2695 + # display the exception with full stack trace for debugging
  2696 + log.debug('Error processing stream %r in file %r (%s)' % (d.name, self.filename, exc))
  2697 + log.debug('Traceback:', exc_info=True)
  2698 + # do not raise the error, as it is unlikely to be a compressed macro stream
  2699 +
  2700 + def extract_all_macros(self):
  2701 + """
  2702 + Extract and decompress source code for each VBA macro found in the file
  2703 + by calling extract_macros(), store the results as a list of tuples
  2704 + (filename, stream_path, vba_filename, vba_code) in self.modules.
  2705 + See extract_macros for details.
  2706 + """
  2707 + if self.modules is None:
  2708 + self.modules = []
  2709 + for (subfilename, stream_path, vba_filename, vba_code) in self.extract_macros():
  2710 + self.modules.append((subfilename, stream_path, vba_filename, vba_code))
  2711 + self.nb_macros = len(self.modules)
  2712 + return self.modules
  2713 +
  2714 +
  2715 +
  2716 + def analyze_macros(self, show_decoded_strings=False, deobfuscate=False):
  2717 + """
  2718 + runs extract_macros and analyze the source code of all VBA macros
  2719 + found in the file.
  2720 + """
  2721 + if self.detect_vba_macros():
  2722 + # if the analysis was already done, avoid doing it twice:
  2723 + if self.analysis_results is not None:
  2724 + return self.analysis_results
  2725 + # variable to merge source code from all modules:
  2726 + if self.vba_code_all_modules is None:
  2727 + self.vba_code_all_modules = ''
  2728 + for (_, _, _, vba_code) in self.extract_all_macros():
  2729 + #TODO: filter code? (each module)
  2730 + self.vba_code_all_modules += vba_code.decode('utf-8', 'ignore') + '\n'
  2731 + for (_, _, form_string) in self.extract_form_strings():
  2732 + self.vba_code_all_modules += form_string.decode('utf-8', 'ignore') + '\n'
  2733 + # Analyze the whole code at once:
  2734 + scanner = VBA_Scanner(self.vba_code_all_modules)
  2735 + self.analysis_results = scanner.scan(show_decoded_strings, deobfuscate)
  2736 + autoexec, suspicious, iocs, hexstrings, base64strings, dridex, vbastrings = scanner.scan_summary()
  2737 + self.nb_autoexec += autoexec
  2738 + self.nb_suspicious += suspicious
  2739 + self.nb_iocs += iocs
  2740 + self.nb_hexstrings += hexstrings
  2741 + self.nb_base64strings += base64strings
  2742 + self.nb_dridexstrings += dridex
  2743 + self.nb_vbastrings += vbastrings
  2744 +
  2745 + return self.analysis_results
  2746 +
  2747 +
  2748 + def reveal(self):
  2749 + # we only want printable strings:
  2750 + analysis = self.analyze_macros(show_decoded_strings=False)
  2751 + # to avoid replacing short strings contained into longer strings, we sort the analysis results
  2752 + # based on the length of the encoded string, in reverse order:
  2753 + analysis = sorted(analysis, key=lambda type_decoded_encoded: len(type_decoded_encoded[2]), reverse=True)
  2754 + # normally now self.vba_code_all_modules contains source code from all modules
  2755 + deobf_code = self.vba_code_all_modules
  2756 + for kw_type, decoded, encoded in analysis:
  2757 + if kw_type == 'VBA string':
  2758 + #print '%3d occurences: %r => %r' % (deobf_code.count(encoded), encoded, decoded)
  2759 + # need to add double quotes around the decoded strings
  2760 + # after escaping double-quotes as double-double-quotes for VBA:
  2761 + decoded = decoded.replace('"', '""')
  2762 + deobf_code = deobf_code.replace(encoded, '"%s"' % decoded)
  2763 + return deobf_code
  2764 + #TODO: repasser l'analyse plusieurs fois si des chaines hex ou base64 sont revelees
  2765 +
  2766 +
  2767 + def find_vba_forms(self):
  2768 + """
  2769 + Finds all the VBA forms stored in an OLE file.
  2770 +
  2771 + Return None if the file is not OLE but OpenXML.
  2772 + Return a list of tuples (vba_root, project_path, dir_path) for each VBA project.
  2773 + vba_root is the path of the root OLE storage containing the VBA project,
  2774 + including a trailing slash unless it is the root of the OLE file.
  2775 + project_path is the path of the OLE stream named "PROJECT" within the VBA project.
  2776 + dir_path is the path of the OLE stream named "VBA/dir" within the VBA project.
  2777 +
  2778 + If this function returns an empty list for one of the supported formats
  2779 + (i.e. Word, Excel, Powerpoint), then the file does not contain VBA forms.
  2780 +
  2781 + :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path)
  2782 + for each VBA project found if OLE file
  2783 + """
  2784 + log.debug('VBA_Parser.find_vba_forms')
  2785 +
  2786 + # if the file is not OLE but OpenXML, return None:
  2787 + if self.ole_file is None and self.type != TYPE_PPT:
  2788 + return None
  2789 +
  2790 + # if this method has already been called, return previous result:
  2791 + # if self.vba_projects is not None:
  2792 + # return self.vba_projects
  2793 +
  2794 + # According to MS-OFORMS section 2.1.2 Control Streams:
  2795 + # - A parent control, that is, a control that can contain embedded controls,
  2796 + # MUST be persisted as a storage that contains multiple streams.
  2797 + # - All parent controls MUST contain a FormControl. The FormControl
  2798 + # properties are persisted to a stream (1) as specified in section 2.1.1.2.
  2799 + # The name of this stream (1) MUST be "f".
  2800 + # - Embedded controls that cannot themselves contain other embedded
  2801 + # controls are persisted sequentially as FormEmbeddedActiveXControls
  2802 + # to a stream (1) contained in the same storage as the parent control.
  2803 + # The name of this stream (1) MUST be "o".
  2804 + # - all names are case-insensitive
  2805 +
  2806 + if self.type == TYPE_PPT:
  2807 + # TODO: so far, this function is never called for PPT files, but
  2808 + # if that happens, the information is lost which ole file contains
  2809 + # which storage!
  2810 + ole_files = self.ole_subfiles
  2811 + log.warning('Returned info is not complete for PPT types!')
  2812 + else:
  2813 + ole_files = [self.ole_file, ]
  2814 +
  2815 + # start with an empty list:
  2816 + self.vba_forms = []
  2817 +
  2818 + # Loop over ole streams
  2819 + for ole in ole_files:
  2820 + # Look for any storage containing those storage/streams:
  2821 + for storage in ole.listdir(streams=False, storages=True):
  2822 + log.debug('Checking storage %r' % storage)
  2823 + # Look for two streams named 'o' and 'f':
  2824 + o_stream = storage + ['o']
  2825 + f_stream = storage + ['f']
  2826 + log.debug('Checking if streams %r and %r exist' % (f_stream, o_stream))
  2827 + if ole.exists(o_stream) and ole.get_type(o_stream) == olefile.STGTY_STREAM \
  2828 + and ole.exists(f_stream) and ole.get_type(f_stream) == olefile.STGTY_STREAM:
  2829 + form_path = '/'.join(storage)
  2830 + log.debug('Found VBA Form: %r' % form_path)
  2831 + self.vba_forms.append(storage)
  2832 + return self.vba_forms
  2833 +
  2834 + def extract_form_strings(self):
  2835 + """
  2836 + Extract printable strings from each VBA Form found in the file
  2837 +
  2838 + Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found
  2839 + If the file is OLE, filename is the path of the file.
  2840 + If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros
  2841 + within the zip archive, e.g. word/vbaProject.bin.
  2842 + If the file is PPT, result is as for OpenXML but filename is useless
  2843 + """
  2844 + if self.ole_file is None:
  2845 + # This may be either an OpenXML/PPT or a text file:
  2846 + if self.type == TYPE_TEXT:
  2847 + # This is a text file, return no results:
  2848 + return
  2849 + else:
  2850 + # OpenXML/PPT: recursively yield results from each OLE subfile:
  2851 + for ole_subfile in self.ole_subfiles:
  2852 + for results in ole_subfile.extract_form_strings():
  2853 + yield results
  2854 + else:
  2855 + # This is an OLE file:
  2856 + self.find_vba_forms()
  2857 + ole = self.ole_file
  2858 + for form_storage in self.vba_forms:
  2859 + o_stream = form_storage + ['o']
  2860 + log.debug('Opening form object stream %r' % '/'.join(o_stream))
  2861 + form_data = ole.openstream(o_stream).read()
  2862 + # Extract printable strings from the form object stream "o":
  2863 + for m in re_printable_string.finditer(form_data):
  2864 + log.debug('Printable string found in form: %r' % m.group())
  2865 + yield (self.filename, '/'.join(o_stream), m.group())
  2866 +
  2867 +
  2868 + def close(self):
  2869 + """
  2870 + Close all the open files. This method must be called after usage, if
  2871 + the application is opening many files.
  2872 + """
  2873 + if self.ole_file is None:
  2874 + if self.ole_subfiles is not None:
  2875 + for ole_subfile in self.ole_subfiles:
  2876 + ole_subfile.close()
  2877 + else:
  2878 + self.ole_file.close()
  2879 +
  2880 +
  2881 +
  2882 +class VBA_Parser_CLI(VBA_Parser):
  2883 + """
  2884 + VBA parser and analyzer, adding methods for the command line interface
  2885 + of olevba. (see VBA_Parser)
  2886 + """
  2887 +
  2888 + def __init__(self, *args, **kwargs):
  2889 + """
  2890 + Constructor for VBA_Parser_CLI.
  2891 + Calls __init__ from VBA_Parser with all arguments --> see doc there
  2892 + """
  2893 + super(VBA_Parser_CLI, self).__init__(*args, **kwargs)
  2894 +
  2895 +
  2896 + def print_analysis(self, show_decoded_strings=False, deobfuscate=False):
  2897 + """
  2898 + Analyze the provided VBA code, and print the results in a table
  2899 +
  2900 + :param vba_code: str, VBA source code to be analyzed
  2901 + :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
  2902 + :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
  2903 + :return: None
  2904 + """
  2905 + # print a waiting message only if the output is not redirected to a file:
  2906 + if sys.stdout.isatty():
  2907 + print('Analysis...\r')
  2908 + sys.stdout.flush()
  2909 + results = self.analyze_macros(show_decoded_strings, deobfuscate)
  2910 + if results:
  2911 + t = prettytable.PrettyTable(('Type', 'Keyword', 'Description'))
  2912 + t.align = 'l'
  2913 + t.max_width['Type'] = 10
  2914 + t.max_width['Keyword'] = 20
  2915 + t.max_width['Description'] = 39
  2916 + for kw_type, keyword, description in results:
  2917 + # handle non printable strings:
  2918 + if not is_printable(keyword):
  2919 + keyword = repr(keyword)
  2920 + if not is_printable(description):
  2921 + description = repr(description)
  2922 + t.add_row((kw_type, keyword, description))
  2923 + print(t)
  2924 + else:
  2925 + print('No suspicious keyword or IOC found.')
  2926 +
  2927 + def print_analysis_json(self, show_decoded_strings=False, deobfuscate=False):
  2928 + """
  2929 + Analyze the provided VBA code, and return the results in json format
  2930 +
  2931 + :param vba_code: str, VBA source code to be analyzed
  2932 + :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
  2933 + :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
  2934 +
  2935 + :return: dict
  2936 + """
  2937 + # print a waiting message only if the output is not redirected to a file:
  2938 + if sys.stdout.isatty():
  2939 + print('Analysis...\r')
  2940 + sys.stdout.flush()
  2941 + return [dict(type=kw_type, keyword=keyword, description=description)
  2942 + for kw_type, keyword, description in self.analyze_macros(show_decoded_strings, deobfuscate)]
  2943 +
  2944 + def process_file(self, show_decoded_strings=False,
  2945 + display_code=True, hide_attributes=True,
  2946 + vba_code_only=False, show_deobfuscated_code=False,
  2947 + deobfuscate=False):
  2948 + """
  2949 + Process a single file
  2950 +
  2951 + :param filename: str, path and filename of file on disk, or within the container.
  2952 + :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
  2953 + :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
  2954 + :param display_code: bool, if False VBA source code is not displayed (default True)
  2955 + :param global_analysis: bool, if True all modules are merged for a single analysis (default),
  2956 + otherwise each module is analyzed separately (old behaviour)
  2957 + :param hide_attributes: bool, if True the first lines starting with "Attribute VB" are hidden (default)
  2958 + :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
  2959 + """
  2960 + #TODO: replace print by writing to a provided output file (sys.stdout by default)
  2961 + # fix conflicting parameters:
  2962 + if vba_code_only and not display_code:
  2963 + display_code = True
  2964 + if self.container:
  2965 + display_filename = '%s in %s' % (self.filename, self.container)
  2966 + else:
  2967 + display_filename = self.filename
  2968 + print('=' * 79)
  2969 + print('FILE:', display_filename)
  2970 + try:
  2971 + #TODO: handle olefile errors, when an OLE file is malformed
  2972 + print('Type: %s' % self.type)
  2973 + if self.detect_vba_macros():
  2974 + #print 'Contains VBA Macros:'
  2975 + for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros():
  2976 + if hide_attributes:
  2977 + # hide attribute lines:
  2978 + if isinstance(vba_code,bytes):
  2979 + vba_code =vba_code.decode('utf-8','replace')
  2980 + vba_code_filtered = filter_vba(vba_code)
  2981 + else:
  2982 + vba_code_filtered = vba_code
  2983 + print('-' * 79)
  2984 + print('VBA MACRO %s ' % vba_filename)
  2985 + print('in file: %s - OLE stream: %s' % (subfilename, repr(stream_path)))
  2986 + if display_code:
  2987 + print('- ' * 39)
  2988 + # detect empty macros:
  2989 + if vba_code_filtered.strip() == '':
  2990 + print('(empty macro)')
  2991 + else:
  2992 + print(vba_code_filtered)
  2993 + for (subfilename, stream_path, form_string) in self.extract_form_strings():
  2994 + print('-' * 79)
  2995 + print('VBA FORM STRING IN %r - OLE stream: %r' % (subfilename, stream_path))
  2996 + print('- ' * 39)
  2997 + print(form_string.decode('utf-8', 'ignore'))
  2998 + if not vba_code_only:
  2999 + # analyse the code from all modules at once:
  3000 + self.print_analysis(show_decoded_strings, deobfuscate)
  3001 + if show_deobfuscated_code:
  3002 + print('MACRO SOURCE CODE WITH DEOBFUSCATED VBA STRINGS (EXPERIMENTAL):\n\n')
  3003 + print(self.reveal())
  3004 + else:
  3005 + print('No VBA macros found.')
  3006 + except OlevbaBaseException:
  3007 + raise
  3008 + except Exception as exc:
  3009 + # display the exception with full stack trace for debugging
  3010 + log.info('Error processing file %s (%s)' % (self.filename, exc))
  3011 + log.debug('Traceback:', exc_info=True)
  3012 + raise ProcessingError(self.filename, exc)
  3013 + print('')
  3014 +
  3015 +
  3016 + def process_file_json(self, show_decoded_strings=False,
  3017 + display_code=True, hide_attributes=True,
  3018 + vba_code_only=False, show_deobfuscated_code=False,
  3019 + deobfuscate=False):
  3020 + """
  3021 + Process a single file
  3022 +
  3023 + every "show" or "print" here is to be translated as "add to json"
  3024 +
  3025 + :param filename: str, path and filename of file on disk, or within the container.
  3026 + :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
  3027 + :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
  3028 + :param display_code: bool, if False VBA source code is not displayed (default True)
  3029 + :param global_analysis: bool, if True all modules are merged for a single analysis (default),
  3030 + otherwise each module is analyzed separately (old behaviour)
  3031 + :param hide_attributes: bool, if True the first lines starting with "Attribute VB" are hidden (default)
  3032 + :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
  3033 + """
  3034 + #TODO: fix conflicting parameters (?)
  3035 +
  3036 + if vba_code_only and not display_code:
  3037 + display_code = True
  3038 +
  3039 + result = {}
  3040 +
  3041 + if self.container:
  3042 + result['container'] = self.container
  3043 + else:
  3044 + result['container'] = None
  3045 + result['file'] = self.filename
  3046 + result['json_conversion_successful'] = False
  3047 + result['analysis'] = None
  3048 + result['code_deobfuscated'] = None
  3049 + result['do_deobfuscate'] = deobfuscate
  3050 +
  3051 + try:
  3052 + #TODO: handle olefile errors, when an OLE file is malformed
  3053 + result['type'] = self.type
  3054 + macros = []
  3055 + if self.detect_vba_macros():
  3056 + for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros():
  3057 + curr_macro = {}
  3058 + if hide_attributes:
  3059 + # hide attribute lines:
  3060 + vba_code_filtered = filter_vba(vba_code.decode('utf-8','replace'))
  3061 + else:
  3062 + vba_code_filtered = vba_code
  3063 +
  3064 + curr_macro['vba_filename'] = vba_filename
  3065 + curr_macro['subfilename'] = subfilename
  3066 + curr_macro['ole_stream'] = stream_path
  3067 + if display_code:
  3068 + curr_macro['code'] = vba_code_filtered.strip()
  3069 + else:
  3070 + curr_macro['code'] = None
  3071 + macros.append(curr_macro)
  3072 + if not vba_code_only:
  3073 + # analyse the code from all modules at once:
  3074 + result['analysis'] = self.print_analysis_json(show_decoded_strings,
  3075 + deobfuscate)
  3076 + if show_deobfuscated_code:
  3077 + result['code_deobfuscated'] = self.reveal()
  3078 + result['macros'] = macros
  3079 + result['json_conversion_successful'] = True
  3080 + except Exception as exc:
  3081 + # display the exception with full stack trace for debugging
  3082 + log.info('Error processing file %s (%s)' % (self.filename, exc))
  3083 + log.debug('Traceback:', exc_info=True)
  3084 + raise ProcessingError(self.filename, exc)
  3085 +
  3086 + return result
  3087 +
  3088 +
  3089 + def process_file_triage(self, show_decoded_strings=False, deobfuscate=False):
  3090 + """
  3091 + Process a file in triage mode, showing only summary results on one line.
  3092 + """
  3093 + #TODO: replace print by writing to a provided output file (sys.stdout by default)
  3094 + try:
  3095 + #TODO: handle olefile errors, when an OLE file is malformed
  3096 + if self.detect_vba_macros():
  3097 + # print a waiting message only if the output is not redirected to a file:
  3098 + if sys.stdout.isatty():
  3099 + print('Analysis...\r')
  3100 + sys.stdout.flush()
  3101 + self.analyze_macros(show_decoded_strings=show_decoded_strings,
  3102 + deobfuscate=deobfuscate)
  3103 + flags = TYPE2TAG[self.type]
  3104 + macros = autoexec = suspicious = iocs = hexstrings = base64obf = dridex = vba_obf = '-'
  3105 + if self.contains_macros: macros = 'M'
  3106 + if self.nb_autoexec: autoexec = 'A'
  3107 + if self.nb_suspicious: suspicious = 'S'
  3108 + if self.nb_iocs: iocs = 'I'
  3109 + if self.nb_hexstrings: hexstrings = 'H'
  3110 + if self.nb_base64strings: base64obf = 'B'
  3111 + if self.nb_dridexstrings: dridex = 'D'
  3112 + if self.nb_vbastrings: vba_obf = 'V'
  3113 + flags += '%s%s%s%s%s%s%s%s' % (macros, autoexec, suspicious, iocs, hexstrings,
  3114 + base64obf, dridex, vba_obf)
  3115 +
  3116 + line = '%-12s %s' % (flags, self.filename)
  3117 + print(line)
  3118 +
  3119 + # old table display:
  3120 + # macros = autoexec = suspicious = iocs = hexstrings = 'no'
  3121 + # if nb_macros: macros = 'YES:%d' % nb_macros
  3122 + # if nb_autoexec: autoexec = 'YES:%d' % nb_autoexec
  3123 + # if nb_suspicious: suspicious = 'YES:%d' % nb_suspicious
  3124 + # if nb_iocs: iocs = 'YES:%d' % nb_iocs
  3125 + # if nb_hexstrings: hexstrings = 'YES:%d' % nb_hexstrings
  3126 + # # 2nd line = info
  3127 + # print '%-8s %-7s %-7s %-7s %-7s %-7s' % (self.type, macros, autoexec, suspicious, iocs, hexstrings)
  3128 + except Exception as exc:
  3129 + # display the exception with full stack trace for debugging only
  3130 + log.debug('Error processing file %s (%s)' % (self.filename, exc),
  3131 + exc_info=True)
  3132 + raise ProcessingError(self.filename, exc)
  3133 +
  3134 +
  3135 + # t = prettytable.PrettyTable(('filename', 'type', 'macros', 'autoexec', 'suspicious', 'ioc', 'hexstrings'),
  3136 + # header=False, border=False)
  3137 + # t.align = 'l'
  3138 + # t.max_width['filename'] = 30
  3139 + # t.max_width['type'] = 10
  3140 + # t.max_width['macros'] = 6
  3141 + # t.max_width['autoexec'] = 6
  3142 + # t.max_width['suspicious'] = 6
  3143 + # t.max_width['ioc'] = 6
  3144 + # t.max_width['hexstrings'] = 6
  3145 + # t.add_row((filename, ftype, macros, autoexec, suspicious, iocs, hexstrings))
  3146 + # print t
  3147 +
  3148 +
  3149 +#=== MAIN =====================================================================
  3150 +
def main():
    """
    Main function, called when olevba is run from the command line

    Parses the command line options, then processes every file yielded by
    xglob.iter_files in the selected output mode (triage, detailed or json).
    When no mode is given, each file is shown in triage format and, if exactly
    one file was processed successfully, its detailed results are printed too.

    This function never returns: it always terminates with sys.exit(), using
    RETURN_WRONG_ARGS when no file argument was given, RETURN_OK when all went
    well, the specific RETURN_* code when a single kind of error occurred,
    RETURN_SEVERAL_ERRS when errors of different steps accumulated, and
    RETURN_UNEXPECTED for an unhandled exception.
    """
    DEFAULT_LOG_LEVEL = "warning" # Default log level
    LOG_LEVELS = {
        'debug':    logging.DEBUG,
        'info':     logging.INFO,
        'warning':  logging.WARNING,
        'error':    logging.ERROR,
        'critical': logging.CRITICAL
        }

    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)
    # parser.add_option('-o', '--outfile', dest='outfile',
    #     help='output file')
    # parser.add_option('-c', '--csv', dest='csv',
    #     help='export results to a CSV file')
    parser.add_option("-r", action="store_true", dest="recursive",
                      help='find files recursively in subdirectories.')
    parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
                      help='if the file is a zip archive, open all files from it, using the provided password (requires Python 2.6+)')
    parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
                      help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
    # output mode; could make this even simpler with add_option(type='choice') but that would make
    # cmd line interface incompatible...
    modes = optparse.OptionGroup(parser, title='Output mode (mutually exclusive)')
    modes.add_option("-t", '--triage', action="store_const", dest="output_mode",
                     const='triage', default='unspecified',
                     help='triage mode, display results as a summary table (default for multiple files)')
    modes.add_option("-d", '--detailed', action="store_const", dest="output_mode",
                     const='detailed', default='unspecified',
                     help='detailed mode, display full results (default for single file)')
    modes.add_option("-j", '--json', action="store_const", dest="output_mode",
                     const='json', default='unspecified',
                     help='json mode, detailed in json format (never default)')
    parser.add_option_group(modes)
    parser.add_option("-a", '--analysis', action="store_false", dest="display_code", default=True,
                      help='display only analysis results, not the macro source code')
    parser.add_option("-c", '--code', action="store_true", dest="vba_code_only", default=False,
                      help='display only VBA source code, do not analyze it')
    parser.add_option("--decode", action="store_true", dest="show_decoded_strings",
                      help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex, VBA).')
    parser.add_option("--attr", action="store_false", dest="hide_attributes", default=True,
                      help='display the attribute lines at the beginning of VBA source code')
    parser.add_option("--reveal", action="store_true", dest="show_deobfuscated_code",
                      help='display the macro source code after replacing all the obfuscated strings by their decoded content.')
    # 'choice' makes optparse reject unknown level names with a usage error,
    # instead of a KeyError traceback when the level is looked up below.
    # Choices are listed from least to most severe.
    parser.add_option('-l', '--loglevel', dest="loglevel", action="store",
                      type='choice', choices=sorted(LOG_LEVELS, key=LOG_LEVELS.get),
                      default=DEFAULT_LOG_LEVEL,
                      help="logging level debug/info/warning/error/critical (default=%default)")
    parser.add_option('--deobf', dest="deobfuscate", action="store_true", default=False,
                      help="Attempt to deobfuscate VBA expressions (slow)")
    parser.add_option('--relaxed', dest="relaxed", action="store_true", default=False,
                      help="Do not raise errors if opening of substream fails")

    (options, args) = parser.parse_args()

    # Print help if no arguments are passed
    if len(args) == 0:
        print(__doc__)
        parser.print_help()
        sys.exit(RETURN_WRONG_ARGS)

    # provide info about tool and its version
    if options.output_mode == 'json':
        # prints opening [
        print_json(script_name='olevba', version=__version__,
                   url='http://decalage.info/python/oletools',
                   type='MetaInformation')
    else:
        print('olevba %s - http://decalage.info/python/oletools' % __version__)

    logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s')
    # enable logging in the modules:
    log.setLevel(logging.NOTSET)

    # Old display with number of items detected:
    # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('Type', 'Macros', 'AutoEx', 'Susp.', 'IOCs', 'HexStr')
    # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('-'*8, '-'*7, '-'*7, '-'*7, '-'*7, '-'*7)

    # with the option --reveal, make sure --deobf is also enabled:
    if options.show_deobfuscated_code and not options.deobfuscate:
        log.info('set --deobf because --reveal was set')
        options.deobfuscate = True
    if options.output_mode == 'triage' and options.show_deobfuscated_code:
        log.info('ignoring option --reveal in triage output mode')

    # Column headers (do not know how many files there will be yet, so if no output_mode
    # was specified, we will print triage for first file --> need these headers)
    if options.output_mode in ('triage', 'unspecified'):
        print('%-12s %-65s' % ('Flags', 'Filename'))
        print('%-12s %-65s' % ('-' * 11, '-' * 65))

    previous_container = None
    count = 0
    container = filename = data = None
    vba_parser = None
    # parser of the most recent file that was processed without error; kept
    # separately because vba_parser is reset/replaced for every file
    processed_parser = None
    return_code = RETURN_OK
    try:
        for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
                                                          zip_password=options.zip_password, zip_fname=options.zip_fname):
            # ignore directory names stored in zip files:
            if container and filename.endswith('/'):
                continue

            # handle errors from xglob
            if isinstance(data, Exception):
                if isinstance(data, PathNotFoundException):
                    if options.output_mode in ('triage', 'unspecified'):
                        print('%-12s %s - File not found' % ('?', filename))
                    elif options.output_mode != 'json':
                        log.error('Given path %r does not exist!' % filename)
                    return_code = RETURN_FILE_NOT_FOUND if return_code == 0 \
                                                    else RETURN_SEVERAL_ERRS
                else:
                    if options.output_mode in ('triage', 'unspecified'):
                        print('%-12s %s - Failed to read from zip file %s' % ('?', filename, container))
                    elif options.output_mode != 'json':
                        log.error('Exception opening/reading %r from zip file %r: %s'
                                  % (filename, container, data))
                    return_code = RETURN_XGLOB_ERR if return_code == 0 \
                                                    else RETURN_SEVERAL_ERRS
                if options.output_mode == 'json':
                    print_json(file=filename, type='error',
                               error=type(data).__name__, message=str(data))
                continue

            # forget the parser of the previous file, so that the finally
            # clause below only closes a parser opened for the current file
            vba_parser = None
            try:
                # Open the file
                vba_parser = VBA_Parser_CLI(filename, data=data, container=container,
                                            relaxed=options.relaxed)

                if options.output_mode == 'detailed':
                    # fully detailed output
                    vba_parser.process_file(show_decoded_strings=options.show_decoded_strings,
                                            display_code=options.display_code,
                                            hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only,
                                            show_deobfuscated_code=options.show_deobfuscated_code,
                                            deobfuscate=options.deobfuscate)
                elif options.output_mode in ('triage', 'unspecified'):
                    # print container name when it changes:
                    if container != previous_container:
                        if container is not None:
                            print('\nFiles in %s:' % container)
                        previous_container = container
                    # summarized output for triage:
                    vba_parser.process_file_triage(show_decoded_strings=options.show_decoded_strings,
                                                   deobfuscate=options.deobfuscate)
                elif options.output_mode == 'json':
                    print_json(
                        vba_parser.process_file_json(show_decoded_strings=options.show_decoded_strings,
                                                     display_code=options.display_code,
                                                     hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only,
                                                     show_deobfuscated_code=options.show_deobfuscated_code,
                                                     deobfuscate=options.deobfuscate))
                else:  # (should be impossible)
                    raise ValueError('unexpected output mode: "{0}"!'.format(options.output_mode))
                # only reached when the file was processed without exception
                count += 1
                processed_parser = vba_parser

            except (SubstreamOpenError, UnexpectedDataError) as exc:
                if options.output_mode in ('triage', 'unspecified'):
                    print('%-12s %s - Error opening substream or unexpected ' \
                          'content' % ('?', filename))
                elif options.output_mode == 'json':
                    print_json(file=filename, type='error',
                               error=type(exc).__name__, message=str(exc))
                else:
                    log.exception('Error opening substream or unexpected '
                                  'content in %s' % filename)
                return_code = RETURN_OPEN_ERROR if return_code == 0 \
                                                else RETURN_SEVERAL_ERRS
            except FileOpenError as exc:
                if options.output_mode in ('triage', 'unspecified'):
                    print('%-12s %s - File format not supported' % ('?', filename))
                elif options.output_mode == 'json':
                    print_json(file=filename, type='error',
                               error=type(exc).__name__, message=str(exc))
                else:
                    log.exception('Failed to open %s -- probably not supported!' % filename)
                return_code = RETURN_OPEN_ERROR if return_code == 0 \
                                                else RETURN_SEVERAL_ERRS
            except ProcessingError as exc:
                if options.output_mode in ('triage', 'unspecified'):
                    print('%-12s %s - %s' % ('!ERROR', filename, exc.orig_exc))
                elif options.output_mode == 'json':
                    print_json(file=filename, type='error',
                               error=type(exc).__name__,
                               message=str(exc.orig_exc))
                else:
                    log.exception('Error processing file %s (%s)!'
                                  % (filename, exc.orig_exc))
                return_code = RETURN_PARSE_ERROR if return_code == 0 \
                                                else RETURN_SEVERAL_ERRS
            finally:
                if vba_parser is not None:
                    vba_parser.close()

        if options.output_mode == 'triage':
            print('\n(Flags: OpX=OpenXML, XML=Word2003XML, MHT=MHTML, TXT=Text, M=Macros, ' \
                  'A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, ' \
                  'B=Base64 strings, D=Dridex strings, V=VBA strings, ?=Unknown)\n')

        if count == 1 and options.output_mode == 'unspecified':
            # if options -t, -d and -j were not specified and it's a single file, print details:
            # (use the parser of the file that was actually processed, which is
            # not necessarily the last one opened if a later file failed)
            processed_parser.process_file(show_decoded_strings=options.show_decoded_strings,
                                          display_code=options.display_code,
                                          hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only,
                                          show_deobfuscated_code=options.show_deobfuscated_code,
                                          deobfuscate=options.deobfuscate)

        if options.output_mode == 'json':
            # print last json entry (a last one without a comma) and closing ]
            print_json(type='MetaInformation', return_code=return_code,
                       n_processed=count, _json_is_last=True)

    except Exception as exc:
        # some unexpected error, maybe some of the types caught in except clauses
        # above were not sufficient. This is very bad, so log complete trace at exception level
        # and do not care about output mode
        # (log.exception always attaches the traceback, no exc_info needed)
        log.exception('Unhandled exception in main: %s', exc)
        return_code = RETURN_UNEXPECTED    # even if there were others before -- this is more important
        # TODO: print msg with URL to report issues (except in JSON mode)

    # done. exit
    log.debug('will exit now with code %s' % return_code)
    sys.exit(return_code)
  3377 +
# Script entry point: run the command line interface only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
  3380 +
  3381 +# This was coded while listening to "Dust" from I Love You But I've Chosen Darkness