Commit b1711cc0edc82fbc2e4a505e6d2551e0643dd884

Authored by Sébastien Larinier
1 parent f3dc0d66

extract and decode vba without option

Showing 1 changed file with 25 additions and 25 deletions
oletools/olevba.py
@@ -214,7 +214,7 @@ __version__ = '0.48' @@ -214,7 +214,7 @@ __version__ = '0.48'
214 214
215 import sys, logging 215 import sys, logging
216 import struct 216 import struct
217 -from _io import StringIO 217 +from _io import StringIO,BytesIO
218 import math 218 import math
219 import zipfile 219 import zipfile
220 import re 220 import re
@@ -613,7 +613,7 @@ re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"') @@ -613,7 +613,7 @@ re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"')
613 re_nothex_check = re.compile(r'[G-Zg-z]') 613 re_nothex_check = re.compile(r'[G-Zg-z]')
614 614
615 # regex to extract printable strings (at least 5 chars) from VBA Forms: 615 # regex to extract printable strings (at least 5 chars) from VBA Forms:
616 -re_printable_string = re.compile(r'[\t\r\n\x20-\xFF]{5,}') 616 +re_printable_string = re.compile(rb'[\t\r\n\x20-\xFF]{5,}')
617 617
618 618
619 # === PARTIAL VBA GRAMMAR ==================================================== 619 # === PARTIAL VBA GRAMMAR ====================================================
@@ -1043,10 +1043,10 @@ def decompress_stream(compressed_container): @@ -1043,10 +1043,10 @@ def decompress_stream(compressed_container):
1043 # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the 1043 # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the
1044 # DecompressedBuffer (section 2.4.1.1.2). 1044 # DecompressedBuffer (section 2.4.1.1.2).
1045 1045
1046 - decompressed_container = '' # result 1046 + decompressed_container = b'' # result
1047 compressed_current = 0 1047 compressed_current = 0
1048 1048
1049 - sig_byte = ord(compressed_container[compressed_current]) 1049 + sig_byte = compressed_container[compressed_current]
1050 if sig_byte != 0x01: 1050 if sig_byte != 0x01:
1051 raise ValueError('invalid signature byte {0:02X}'.format(sig_byte)) 1051 raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))
1052 1052
@@ -1092,7 +1092,7 @@ def decompress_stream(compressed_container): @@ -1092,7 +1092,7 @@ def decompress_stream(compressed_container):
1092 # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk 1092 # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
1093 # uncompressed chunk: read the next 4096 bytes as-is 1093 # uncompressed chunk: read the next 4096 bytes as-is
1094 #TODO: check if there are at least 4096 bytes left 1094 #TODO: check if there are at least 4096 bytes left
1095 - decompressed_container += compressed_container[compressed_current:compressed_current + 4096] 1095 + decompressed_container += bytes([compressed_container[compressed_current:compressed_current + 4096]])
1096 compressed_current += 4096 1096 compressed_current += 4096
1097 else: 1097 else:
1098 # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk 1098 # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
@@ -1103,7 +1103,7 @@ def decompress_stream(compressed_container): @@ -1103,7 +1103,7 @@ def decompress_stream(compressed_container):
1103 # log.debug('compressed_current = %d / compressed_end = %d' % (compressed_current, compressed_end)) 1103 # log.debug('compressed_current = %d / compressed_end = %d' % (compressed_current, compressed_end))
1104 # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or 1104 # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
1105 # copy tokens (reference to a previous literal token) 1105 # copy tokens (reference to a previous literal token)
1106 - flag_byte = ord(compressed_container[compressed_current]) 1106 + flag_byte = compressed_container[compressed_current]
1107 compressed_current += 1 1107 compressed_current += 1
1108 for bit_index in range(0, 8): 1108 for bit_index in range(0, 8):
1109 # log.debug('bit_index=%d / compressed_current=%d / compressed_end=%d' % (bit_index, compressed_current, compressed_end)) 1109 # log.debug('bit_index=%d / compressed_current=%d / compressed_end=%d' % (bit_index, compressed_current, compressed_end))
@@ -1115,7 +1115,7 @@ def decompress_stream(compressed_container): @@ -1115,7 +1115,7 @@ def decompress_stream(compressed_container):
1115 #log.debug('bit_index=%d: flag_bit=%d' % (bit_index, flag_bit)) 1115 #log.debug('bit_index=%d: flag_bit=%d' % (bit_index, flag_bit))
1116 if flag_bit == 0: # LiteralToken 1116 if flag_bit == 0: # LiteralToken
1117 # copy one byte directly to output 1117 # copy one byte directly to output
1118 - decompressed_container += compressed_container[compressed_current] 1118 + decompressed_container += bytes([compressed_container[compressed_current]])
1119 compressed_current += 1 1119 compressed_current += 1
1120 else: # CopyToken 1120 else: # CopyToken
1121 # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken 1121 # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
@@ -1130,8 +1130,8 @@ def decompress_stream(compressed_container): @@ -1130,8 +1130,8 @@ def decompress_stream(compressed_container):
1130 offset = (temp1 >> temp2) + 1 1130 offset = (temp1 >> temp2) + 1
1131 #log.debug('offset=%d length=%d' % (offset, length)) 1131 #log.debug('offset=%d length=%d' % (offset, length))
1132 copy_source = len(decompressed_container) - offset 1132 copy_source = len(decompressed_container) - offset
1133 - for index in xrange(copy_source, copy_source + length): 1133 + for index in range(copy_source, copy_source + length):
1134 - decompressed_container += decompressed_container[index] 1134 + decompressed_container += bytes([decompressed_container[index]])
1135 compressed_current += 2 1135 compressed_current += 2
1136 return decompressed_container 1136 return decompressed_container
1137 1137
@@ -1174,7 +1174,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): @@ -1174,7 +1174,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1174 code_modules = {} 1174 code_modules = {}
1175 1175
1176 for line in project: 1176 for line in project:
1177 - line = line.strip() 1177 + line = line.strip().decode('utf-8','ignore')
1178 if '=' in line: 1178 if '=' in line:
1179 # split line at the 1st equal sign: 1179 # split line at the 1st equal sign:
1180 name, value = line.split('=', 1) 1180 name, value = line.split('=', 1)
@@ -1205,7 +1205,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): @@ -1205,7 +1205,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1205 else: 1205 else:
1206 raise UnexpectedDataError(dir_path, name, expected, value) 1206 raise UnexpectedDataError(dir_path, name, expected, value)
1207 1207
1208 - dir_stream = StringIO(decompress_stream(dir_compressed)) 1208 + dir_stream = BytesIO(decompress_stream(dir_compressed))
1209 1209
1210 # PROJECTSYSKIND Record 1210 # PROJECTSYSKIND Record
1211 projectsyskind_id = struct.unpack("<H", dir_stream.read(2))[0] 1211 projectsyskind_id = struct.unpack("<H", dir_stream.read(2))[0]
@@ -1465,7 +1465,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): @@ -1465,7 +1465,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1465 uni_out = lambda unicode_text: unicode_text.encode('utf-8', 'replace') 1465 uni_out = lambda unicode_text: unicode_text.encode('utf-8', 'replace')
1466 1466
1467 log.debug("parsing {0} modules".format(projectmodules_count)) 1467 log.debug("parsing {0} modules".format(projectmodules_count))
1468 - for projectmodule_index in xrange(0, projectmodules_count): 1468 + for projectmodule_index in range(0, projectmodules_count):
1469 try: 1469 try:
1470 modulename_id = struct.unpack("<H", dir_stream.read(2))[0] 1470 modulename_id = struct.unpack("<H", dir_stream.read(2))[0]
1471 check_value('MODULENAME_Id', 0x0019, modulename_id) 1471 check_value('MODULENAME_Id', 0x0019, modulename_id)
@@ -1932,10 +1932,10 @@ class VBA_Scanner(object): @@ -1932,10 +1932,10 @@ class VBA_Scanner(object):
1932 """ 1932 """
1933 # join long lines ending with " _": 1933 # join long lines ending with " _":
1934 self.code = vba_collapse_long_lines(vba_code) 1934 self.code = vba_collapse_long_lines(vba_code)
1935 - self.code_hex = '' 1935 + self.code_hex = b''
1936 self.code_hex_rev = '' 1936 self.code_hex_rev = ''
1937 self.code_rev_hex = '' 1937 self.code_rev_hex = ''
1938 - self.code_base64 = '' 1938 + self.code_base64 = b''
1939 self.code_dridex = '' 1939 self.code_dridex = ''
1940 self.code_vba = '' 1940 self.code_vba = ''
1941 self.strReverse = None 1941 self.strReverse = None
@@ -1968,7 +1968,7 @@ class VBA_Scanner(object): @@ -1968,7 +1968,7 @@ class VBA_Scanner(object):
1968 if 'strreverse' in self.code.lower(): self.strReverse = True 1968 if 'strreverse' in self.code.lower(): self.strReverse = True
1969 # Then append the decoded strings to the VBA code, to detect obfuscated IOCs and keywords: 1969 # Then append the decoded strings to the VBA code, to detect obfuscated IOCs and keywords:
1970 for encoded, decoded in self.hex_strings: 1970 for encoded, decoded in self.hex_strings:
1971 - self.code_hex += '\n' + decoded 1971 + self.code_hex += b'\n' + decoded
1972 # if the code contains "StrReverse", also append the hex strings in reverse order: 1972 # if the code contains "StrReverse", also append the hex strings in reverse order:
1973 if self.strReverse: 1973 if self.strReverse:
1974 # StrReverse after hex decoding: 1974 # StrReverse after hex decoding:
@@ -1980,7 +1980,7 @@ class VBA_Scanner(object): @@ -1980,7 +1980,7 @@ class VBA_Scanner(object):
1980 # Detect Base64-encoded strings 1980 # Detect Base64-encoded strings
1981 self.base64_strings = detect_base64_strings(self.code) 1981 self.base64_strings = detect_base64_strings(self.code)
1982 for encoded, decoded in self.base64_strings: 1982 for encoded, decoded in self.base64_strings:
1983 - self.code_base64 += '\n' + decoded 1983 + self.code_base64 += b'\n' + decoded
1984 # Detect Dridex-encoded strings 1984 # Detect Dridex-encoded strings
1985 self.dridex_strings = detect_dridex_strings(self.code) 1985 self.dridex_strings = detect_dridex_strings(self.code)
1986 for encoded, decoded in self.dridex_strings: 1986 for encoded, decoded in self.dridex_strings:
@@ -1999,10 +1999,10 @@ class VBA_Scanner(object): @@ -1999,10 +1999,10 @@ class VBA_Scanner(object):
1999 1999
2000 for code, obfuscation in ( 2000 for code, obfuscation in (
2001 (self.code, None), 2001 (self.code, None),
2002 - (self.code_hex, 'Hex'), 2002 + (self.code_hex.decode('utf-8','replace'), 'Hex'),
2003 (self.code_hex_rev, 'Hex+StrReverse'), 2003 (self.code_hex_rev, 'Hex+StrReverse'),
2004 (self.code_rev_hex, 'StrReverse+Hex'), 2004 (self.code_rev_hex, 'StrReverse+Hex'),
2005 - (self.code_base64, 'Base64'), 2005 + (self.code_base64.decode('utf-8', 'replace'), 'Base64'),
2006 (self.code_dridex, 'Dridex'), 2006 (self.code_dridex, 'Dridex'),
2007 (self.code_vba, 'VBA expression'), 2007 (self.code_vba, 'VBA expression'),
2008 ): 2008 ):
@@ -2587,7 +2587,7 @@ class VBA_Parser(object): @@ -2587,7 +2587,7 @@ class VBA_Parser(object):
2587 log.debug('%r...[much more data]...%r' % (data[:100], data[-50:])) 2587 log.debug('%r...[much more data]...%r' % (data[:100], data[-50:]))
2588 else: 2588 else:
2589 log.debug(repr(data)) 2589 log.debug(repr(data))
2590 - if 'Attribut' in data: 2590 + if 'Attribut' in data.decode('utf-8','ignore'):
2591 log.debug('Found VBA compressed code') 2591 log.debug('Found VBA compressed code')
2592 self.contains_macros = True 2592 self.contains_macros = True
2593 except IOError as exc: 2593 except IOError as exc:
@@ -2650,7 +2650,7 @@ class VBA_Parser(object): @@ -2650,7 +2650,7 @@ class VBA_Parser(object):
2650 # read data 2650 # read data
2651 log.debug('Reading data from stream %r' % d.name) 2651 log.debug('Reading data from stream %r' % d.name)
2652 data = ole._open(d.isectStart, d.size).read() 2652 data = ole._open(d.isectStart, d.size).read()
2653 - for match in re.finditer(r'\x00Attribut[^e]', data, flags=re.IGNORECASE): 2653 + for match in re.finditer(rb'\x00Attribut[^e]', data, flags=re.IGNORECASE):
2654 start = match.start() - 3 2654 start = match.start() - 3
2655 log.debug('Found VBA compressed code at index %X' % start) 2655 log.debug('Found VBA compressed code at index %X' % start)
2656 compressed_code = data[start:] 2656 compressed_code = data[start:]
@@ -2693,9 +2693,9 @@ class VBA_Parser(object): @@ -2693,9 +2693,9 @@ class VBA_Parser(object):
2693 self.vba_code_all_modules = '' 2693 self.vba_code_all_modules = ''
2694 for (_, _, _, vba_code) in self.extract_all_macros(): 2694 for (_, _, _, vba_code) in self.extract_all_macros():
2695 #TODO: filter code? (each module) 2695 #TODO: filter code? (each module)
2696 - self.vba_code_all_modules += vba_code + '\n' 2696 + self.vba_code_all_modules += vba_code.decode('utf-8', 'ignore') + '\n'
2697 for (_, _, form_string) in self.extract_form_strings(): 2697 for (_, _, form_string) in self.extract_form_strings():
2698 - self.vba_code_all_modules += form_string + '\n' 2698 + self.vba_code_all_modules += form_string.decode('utf-8', 'ignore') + '\n'
2699 # Analyze the whole code at once: 2699 # Analyze the whole code at once:
2700 scanner = VBA_Scanner(self.vba_code_all_modules) 2700 scanner = VBA_Scanner(self.vba_code_all_modules)
2701 self.analysis_results = scanner.scan(show_decoded_strings, deobfuscate) 2701 self.analysis_results = scanner.scan(show_decoded_strings, deobfuscate)
@@ -2935,13 +2935,13 @@ class VBA_Parser_CLI(VBA_Parser): @@ -2935,13 +2935,13 @@ class VBA_Parser_CLI(VBA_Parser):
2935 print('FILE:', display_filename) 2935 print('FILE:', display_filename)
2936 try: 2936 try:
2937 #TODO: handle olefile errors, when an OLE file is malformed 2937 #TODO: handle olefile errors, when an OLE file is malformed
2938 - print('Type: %s', self.type) 2938 + print('Type: %s' % self.type)
2939 if self.detect_vba_macros(): 2939 if self.detect_vba_macros():
2940 #print 'Contains VBA Macros:' 2940 #print 'Contains VBA Macros:'
2941 for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros(): 2941 for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros():
2942 if hide_attributes: 2942 if hide_attributes:
2943 # hide attribute lines: 2943 # hide attribute lines:
2944 - vba_code_filtered = filter_vba(vba_code) 2944 + vba_code_filtered = filter_vba(vba_code.decode('utf-8','replace'))
2945 else: 2945 else:
2946 vba_code_filtered = vba_code 2946 vba_code_filtered = vba_code
2947 print('-' * 79) 2947 print('-' * 79)
@@ -2958,7 +2958,7 @@ class VBA_Parser_CLI(VBA_Parser): @@ -2958,7 +2958,7 @@ class VBA_Parser_CLI(VBA_Parser):
2958 print('-' * 79) 2958 print('-' * 79)
2959 print('VBA FORM STRING IN %r - OLE stream: %r' % (subfilename, stream_path)) 2959 print('VBA FORM STRING IN %r - OLE stream: %r' % (subfilename, stream_path))
2960 print('- ' * 39) 2960 print('- ' * 39)
2961 - print(form_string) 2961 + print(form_string.decode('utf-8', 'ignore'))
2962 if not vba_code_only: 2962 if not vba_code_only:
2963 # analyse the code from all modules at once: 2963 # analyse the code from all modules at once:
2964 self.print_analysis(show_decoded_strings, deobfuscate) 2964 self.print_analysis(show_decoded_strings, deobfuscate)