Commit b1711cc0edc82fbc2e4a505e6d2551e0643dd884

Authored by Sébastien Larinier
1 parent f3dc0d66

Extract and decode VBA without option (port str handling to bytes for Python 3)

Showing 1 changed file with 25 additions and 25 deletions
oletools/olevba.py
... ... @@ -214,7 +214,7 @@ __version__ = '0.48'
214 214  
215 215 import sys, logging
216 216 import struct
217   -from _io import StringIO
  217 +from _io import StringIO,BytesIO
218 218 import math
219 219 import zipfile
220 220 import re
... ... @@ -613,7 +613,7 @@ re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"')
613 613 re_nothex_check = re.compile(r'[G-Zg-z]')
614 614  
615 615 # regex to extract printable strings (at least 5 chars) from VBA Forms:
616   -re_printable_string = re.compile(r'[\t\r\n\x20-\xFF]{5,}')
  616 +re_printable_string = re.compile(rb'[\t\r\n\x20-\xFF]{5,}')
617 617  
618 618  
619 619 # === PARTIAL VBA GRAMMAR ====================================================
... ... @@ -1043,10 +1043,10 @@ def decompress_stream(compressed_container):
1043 1043 # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the
1044 1044 # DecompressedBuffer (section 2.4.1.1.2).
1045 1045  
1046   - decompressed_container = '' # result
  1046 + decompressed_container = b'' # result
1047 1047 compressed_current = 0
1048 1048  
1049   - sig_byte = ord(compressed_container[compressed_current])
  1049 + sig_byte = compressed_container[compressed_current]
1050 1050 if sig_byte != 0x01:
1051 1051 raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))
1052 1052  
... ... @@ -1092,7 +1092,7 @@ def decompress_stream(compressed_container):
1092 1092 # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
1093 1093 # uncompressed chunk: read the next 4096 bytes as-is
1094 1094 #TODO: check if there are at least 4096 bytes left
1095   - decompressed_container += compressed_container[compressed_current:compressed_current + 4096]
  1095 + decompressed_container += bytes([compressed_container[compressed_current:compressed_current + 4096]])
1096 1096 compressed_current += 4096
1097 1097 else:
1098 1098 # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
... ... @@ -1103,7 +1103,7 @@ def decompress_stream(compressed_container):
1103 1103 # log.debug('compressed_current = %d / compressed_end = %d' % (compressed_current, compressed_end))
1104 1104 # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
1105 1105 # copy tokens (reference to a previous literal token)
1106   - flag_byte = ord(compressed_container[compressed_current])
  1106 + flag_byte = compressed_container[compressed_current]
1107 1107 compressed_current += 1
1108 1108 for bit_index in range(0, 8):
1109 1109 # log.debug('bit_index=%d / compressed_current=%d / compressed_end=%d' % (bit_index, compressed_current, compressed_end))
... ... @@ -1115,7 +1115,7 @@ def decompress_stream(compressed_container):
1115 1115 #log.debug('bit_index=%d: flag_bit=%d' % (bit_index, flag_bit))
1116 1116 if flag_bit == 0: # LiteralToken
1117 1117 # copy one byte directly to output
1118   - decompressed_container += compressed_container[compressed_current]
  1118 + decompressed_container += bytes([compressed_container[compressed_current]])
1119 1119 compressed_current += 1
1120 1120 else: # CopyToken
1121 1121 # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
... ... @@ -1130,8 +1130,8 @@ def decompress_stream(compressed_container):
1130 1130 offset = (temp1 >> temp2) + 1
1131 1131 #log.debug('offset=%d length=%d' % (offset, length))
1132 1132 copy_source = len(decompressed_container) - offset
1133   - for index in xrange(copy_source, copy_source + length):
1134   - decompressed_container += decompressed_container[index]
  1133 + for index in range(copy_source, copy_source + length):
  1134 + decompressed_container += bytes([decompressed_container[index]])
1135 1135 compressed_current += 2
1136 1136 return decompressed_container
1137 1137  
... ... @@ -1174,7 +1174,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1174 1174 code_modules = {}
1175 1175  
1176 1176 for line in project:
1177   - line = line.strip()
  1177 + line = line.strip().decode('utf-8','ignore')
1178 1178 if '=' in line:
1179 1179 # split line at the 1st equal sign:
1180 1180 name, value = line.split('=', 1)
... ... @@ -1205,7 +1205,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1205 1205 else:
1206 1206 raise UnexpectedDataError(dir_path, name, expected, value)
1207 1207  
1208   - dir_stream = StringIO(decompress_stream(dir_compressed))
  1208 + dir_stream = BytesIO(decompress_stream(dir_compressed))
1209 1209  
1210 1210 # PROJECTSYSKIND Record
1211 1211 projectsyskind_id = struct.unpack("<H", dir_stream.read(2))[0]
... ... @@ -1465,7 +1465,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1465 1465 uni_out = lambda unicode_text: unicode_text.encode('utf-8', 'replace')
1466 1466  
1467 1467 log.debug("parsing {0} modules".format(projectmodules_count))
1468   - for projectmodule_index in xrange(0, projectmodules_count):
  1468 + for projectmodule_index in range(0, projectmodules_count):
1469 1469 try:
1470 1470 modulename_id = struct.unpack("<H", dir_stream.read(2))[0]
1471 1471 check_value('MODULENAME_Id', 0x0019, modulename_id)
... ... @@ -1932,10 +1932,10 @@ class VBA_Scanner(object):
1932 1932 """
1933 1933 # join long lines ending with " _":
1934 1934 self.code = vba_collapse_long_lines(vba_code)
1935   - self.code_hex = ''
  1935 + self.code_hex = b''
1936 1936 self.code_hex_rev = ''
1937 1937 self.code_rev_hex = ''
1938   - self.code_base64 = ''
  1938 + self.code_base64 = b''
1939 1939 self.code_dridex = ''
1940 1940 self.code_vba = ''
1941 1941 self.strReverse = None
... ... @@ -1968,7 +1968,7 @@ class VBA_Scanner(object):
1968 1968 if 'strreverse' in self.code.lower(): self.strReverse = True
1969 1969 # Then append the decoded strings to the VBA code, to detect obfuscated IOCs and keywords:
1970 1970 for encoded, decoded in self.hex_strings:
1971   - self.code_hex += '\n' + decoded
  1971 + self.code_hex += b'\n' + decoded
1972 1972 # if the code contains "StrReverse", also append the hex strings in reverse order:
1973 1973 if self.strReverse:
1974 1974 # StrReverse after hex decoding:
... ... @@ -1980,7 +1980,7 @@ class VBA_Scanner(object):
1980 1980 # Detect Base64-encoded strings
1981 1981 self.base64_strings = detect_base64_strings(self.code)
1982 1982 for encoded, decoded in self.base64_strings:
1983   - self.code_base64 += '\n' + decoded
  1983 + self.code_base64 += b'\n' + decoded
1984 1984 # Detect Dridex-encoded strings
1985 1985 self.dridex_strings = detect_dridex_strings(self.code)
1986 1986 for encoded, decoded in self.dridex_strings:
... ... @@ -1999,10 +1999,10 @@ class VBA_Scanner(object):
1999 1999  
2000 2000 for code, obfuscation in (
2001 2001 (self.code, None),
2002   - (self.code_hex, 'Hex'),
  2002 + (self.code_hex.decode('utf-8','replace'), 'Hex'),
2003 2003 (self.code_hex_rev, 'Hex+StrReverse'),
2004 2004 (self.code_rev_hex, 'StrReverse+Hex'),
2005   - (self.code_base64, 'Base64'),
  2005 + (self.code_base64.decode('utf-8', 'replace'), 'Base64'),
2006 2006 (self.code_dridex, 'Dridex'),
2007 2007 (self.code_vba, 'VBA expression'),
2008 2008 ):
... ... @@ -2587,7 +2587,7 @@ class VBA_Parser(object):
2587 2587 log.debug('%r...[much more data]...%r' % (data[:100], data[-50:]))
2588 2588 else:
2589 2589 log.debug(repr(data))
2590   - if 'Attribut' in data:
  2590 + if 'Attribut' in data.decode('utf-8','ignore'):
2591 2591 log.debug('Found VBA compressed code')
2592 2592 self.contains_macros = True
2593 2593 except IOError as exc:
... ... @@ -2650,7 +2650,7 @@ class VBA_Parser(object):
2650 2650 # read data
2651 2651 log.debug('Reading data from stream %r' % d.name)
2652 2652 data = ole._open(d.isectStart, d.size).read()
2653   - for match in re.finditer(r'\x00Attribut[^e]', data, flags=re.IGNORECASE):
  2653 + for match in re.finditer(rb'\x00Attribut[^e]', data, flags=re.IGNORECASE):
2654 2654 start = match.start() - 3
2655 2655 log.debug('Found VBA compressed code at index %X' % start)
2656 2656 compressed_code = data[start:]
... ... @@ -2693,9 +2693,9 @@ class VBA_Parser(object):
2693 2693 self.vba_code_all_modules = ''
2694 2694 for (_, _, _, vba_code) in self.extract_all_macros():
2695 2695 #TODO: filter code? (each module)
2696   - self.vba_code_all_modules += vba_code + '\n'
  2696 + self.vba_code_all_modules += vba_code.decode('utf-8', 'ignore') + '\n'
2697 2697 for (_, _, form_string) in self.extract_form_strings():
2698   - self.vba_code_all_modules += form_string + '\n'
  2698 + self.vba_code_all_modules += form_string.decode('utf-8', 'ignore') + '\n'
2699 2699 # Analyze the whole code at once:
2700 2700 scanner = VBA_Scanner(self.vba_code_all_modules)
2701 2701 self.analysis_results = scanner.scan(show_decoded_strings, deobfuscate)
... ... @@ -2935,13 +2935,13 @@ class VBA_Parser_CLI(VBA_Parser):
2935 2935 print('FILE:', display_filename)
2936 2936 try:
2937 2937 #TODO: handle olefile errors, when an OLE file is malformed
2938   - print('Type: %s', self.type)
  2938 + print('Type: %s' % self.type)
2939 2939 if self.detect_vba_macros():
2940 2940 #print 'Contains VBA Macros:'
2941 2941 for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros():
2942 2942 if hide_attributes:
2943 2943 # hide attribute lines:
2944   - vba_code_filtered = filter_vba(vba_code)
  2944 + vba_code_filtered = filter_vba(vba_code.decode('utf-8','replace'))
2945 2945 else:
2946 2946 vba_code_filtered = vba_code
2947 2947 print('-' * 79)
... ... @@ -2958,7 +2958,7 @@ class VBA_Parser_CLI(VBA_Parser):
2958 2958 print('-' * 79)
2959 2959 print('VBA FORM STRING IN %r - OLE stream: %r' % (subfilename, stream_path))
2960 2960 print('- ' * 39)
2961   - print(form_string)
  2961 + print(form_string.decode('utf-8', 'ignore'))
2962 2962 if not vba_code_only:
2963 2963 # analyse the code from all modules at once:
2964 2964 self.print_analysis(show_decoded_strings, deobfuscate)
... ...