Commit b1711cc0edc82fbc2e4a505e6d2551e0643dd884
1 parent: f3dc0d66
extract and decode vba without option
Showing 1 changed file with 25 additions and 25 deletions
oletools/olevba.py
| ... | ... | @@ -214,7 +214,7 @@ __version__ = '0.48' |
| 214 | 214 | |
| 215 | 215 | import sys, logging |
| 216 | 216 | import struct |
| 217 | -from _io import StringIO | |
| 217 | +from _io import StringIO,BytesIO | |
| 218 | 218 | import math |
| 219 | 219 | import zipfile |
| 220 | 220 | import re |
| ... | ... | @@ -613,7 +613,7 @@ re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"') |
| 613 | 613 | re_nothex_check = re.compile(r'[G-Zg-z]') |
| 614 | 614 | |
| 615 | 615 | # regex to extract printable strings (at least 5 chars) from VBA Forms: |
| 616 | -re_printable_string = re.compile(r'[\t\r\n\x20-\xFF]{5,}') | |
| 616 | +re_printable_string = re.compile(rb'[\t\r\n\x20-\xFF]{5,}') | |
| 617 | 617 | |
| 618 | 618 | |
| 619 | 619 | # === PARTIAL VBA GRAMMAR ==================================================== |
| ... | ... | @@ -1043,10 +1043,10 @@ def decompress_stream(compressed_container): |
| 1043 | 1043 | # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the |
| 1044 | 1044 | # DecompressedBuffer (section 2.4.1.1.2). |
| 1045 | 1045 | |
| 1046 | - decompressed_container = '' # result | |
| 1046 | + decompressed_container = b'' # result | |
| 1047 | 1047 | compressed_current = 0 |
| 1048 | 1048 | |
| 1049 | - sig_byte = ord(compressed_container[compressed_current]) | |
| 1049 | + sig_byte = compressed_container[compressed_current] | |
| 1050 | 1050 | if sig_byte != 0x01: |
| 1051 | 1051 | raise ValueError('invalid signature byte {0:02X}'.format(sig_byte)) |
| 1052 | 1052 | |
| ... | ... | @@ -1092,7 +1092,7 @@ def decompress_stream(compressed_container): |
| 1092 | 1092 | # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk |
| 1093 | 1093 | # uncompressed chunk: read the next 4096 bytes as-is |
| 1094 | 1094 | #TODO: check if there are at least 4096 bytes left |
| 1095 | - decompressed_container += compressed_container[compressed_current:compressed_current + 4096] | |
| 1095 | + decompressed_container += bytes([compressed_container[compressed_current:compressed_current + 4096]]) | |
| 1096 | 1096 | compressed_current += 4096 |
| 1097 | 1097 | else: |
| 1098 | 1098 | # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk |
| ... | ... | @@ -1103,7 +1103,7 @@ def decompress_stream(compressed_container): |
| 1103 | 1103 | # log.debug('compressed_current = %d / compressed_end = %d' % (compressed_current, compressed_end)) |
| 1104 | 1104 | # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or |
| 1105 | 1105 | # copy tokens (reference to a previous literal token) |
| 1106 | - flag_byte = ord(compressed_container[compressed_current]) | |
| 1106 | + flag_byte = compressed_container[compressed_current] | |
| 1107 | 1107 | compressed_current += 1 |
| 1108 | 1108 | for bit_index in range(0, 8): |
| 1109 | 1109 | # log.debug('bit_index=%d / compressed_current=%d / compressed_end=%d' % (bit_index, compressed_current, compressed_end)) |
| ... | ... | @@ -1115,7 +1115,7 @@ def decompress_stream(compressed_container): |
| 1115 | 1115 | #log.debug('bit_index=%d: flag_bit=%d' % (bit_index, flag_bit)) |
| 1116 | 1116 | if flag_bit == 0: # LiteralToken |
| 1117 | 1117 | # copy one byte directly to output |
| 1118 | - decompressed_container += compressed_container[compressed_current] | |
| 1118 | + decompressed_container += bytes([compressed_container[compressed_current]]) | |
| 1119 | 1119 | compressed_current += 1 |
| 1120 | 1120 | else: # CopyToken |
| 1121 | 1121 | # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken |
| ... | ... | @@ -1130,8 +1130,8 @@ def decompress_stream(compressed_container): |
| 1130 | 1130 | offset = (temp1 >> temp2) + 1 |
| 1131 | 1131 | #log.debug('offset=%d length=%d' % (offset, length)) |
| 1132 | 1132 | copy_source = len(decompressed_container) - offset |
| 1133 | - for index in xrange(copy_source, copy_source + length): | |
| 1134 | - decompressed_container += decompressed_container[index] | |
| 1133 | + for index in range(copy_source, copy_source + length): | |
| 1134 | + decompressed_container += bytes([decompressed_container[index]]) | |
| 1135 | 1135 | compressed_current += 2 |
| 1136 | 1136 | return decompressed_container |
| 1137 | 1137 | |
| ... | ... | @@ -1174,7 +1174,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): |
| 1174 | 1174 | code_modules = {} |
| 1175 | 1175 | |
| 1176 | 1176 | for line in project: |
| 1177 | - line = line.strip() | |
| 1177 | + line = line.strip().decode('utf-8','ignore') | |
| 1178 | 1178 | if '=' in line: |
| 1179 | 1179 | # split line at the 1st equal sign: |
| 1180 | 1180 | name, value = line.split('=', 1) |
| ... | ... | @@ -1205,7 +1205,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): |
| 1205 | 1205 | else: |
| 1206 | 1206 | raise UnexpectedDataError(dir_path, name, expected, value) |
| 1207 | 1207 | |
| 1208 | - dir_stream = StringIO(decompress_stream(dir_compressed)) | |
| 1208 | + dir_stream = BytesIO(decompress_stream(dir_compressed)) | |
| 1209 | 1209 | |
| 1210 | 1210 | # PROJECTSYSKIND Record |
| 1211 | 1211 | projectsyskind_id = struct.unpack("<H", dir_stream.read(2))[0] |
| ... | ... | @@ -1465,7 +1465,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): |
| 1465 | 1465 | uni_out = lambda unicode_text: unicode_text.encode('utf-8', 'replace') |
| 1466 | 1466 | |
| 1467 | 1467 | log.debug("parsing {0} modules".format(projectmodules_count)) |
| 1468 | - for projectmodule_index in xrange(0, projectmodules_count): | |
| 1468 | + for projectmodule_index in range(0, projectmodules_count): | |
| 1469 | 1469 | try: |
| 1470 | 1470 | modulename_id = struct.unpack("<H", dir_stream.read(2))[0] |
| 1471 | 1471 | check_value('MODULENAME_Id', 0x0019, modulename_id) |
| ... | ... | @@ -1932,10 +1932,10 @@ class VBA_Scanner(object): |
| 1932 | 1932 | """ |
| 1933 | 1933 | # join long lines ending with " _": |
| 1934 | 1934 | self.code = vba_collapse_long_lines(vba_code) |
| 1935 | - self.code_hex = '' | |
| 1935 | + self.code_hex = b'' | |
| 1936 | 1936 | self.code_hex_rev = '' |
| 1937 | 1937 | self.code_rev_hex = '' |
| 1938 | - self.code_base64 = '' | |
| 1938 | + self.code_base64 = b'' | |
| 1939 | 1939 | self.code_dridex = '' |
| 1940 | 1940 | self.code_vba = '' |
| 1941 | 1941 | self.strReverse = None |
| ... | ... | @@ -1968,7 +1968,7 @@ class VBA_Scanner(object): |
| 1968 | 1968 | if 'strreverse' in self.code.lower(): self.strReverse = True |
| 1969 | 1969 | # Then append the decoded strings to the VBA code, to detect obfuscated IOCs and keywords: |
| 1970 | 1970 | for encoded, decoded in self.hex_strings: |
| 1971 | - self.code_hex += '\n' + decoded | |
| 1971 | + self.code_hex += b'\n' + decoded | |
| 1972 | 1972 | # if the code contains "StrReverse", also append the hex strings in reverse order: |
| 1973 | 1973 | if self.strReverse: |
| 1974 | 1974 | # StrReverse after hex decoding: |
| ... | ... | @@ -1980,7 +1980,7 @@ class VBA_Scanner(object): |
| 1980 | 1980 | # Detect Base64-encoded strings |
| 1981 | 1981 | self.base64_strings = detect_base64_strings(self.code) |
| 1982 | 1982 | for encoded, decoded in self.base64_strings: |
| 1983 | - self.code_base64 += '\n' + decoded | |
| 1983 | + self.code_base64 += b'\n' + decoded | |
| 1984 | 1984 | # Detect Dridex-encoded strings |
| 1985 | 1985 | self.dridex_strings = detect_dridex_strings(self.code) |
| 1986 | 1986 | for encoded, decoded in self.dridex_strings: |
| ... | ... | @@ -1999,10 +1999,10 @@ class VBA_Scanner(object): |
| 1999 | 1999 | |
| 2000 | 2000 | for code, obfuscation in ( |
| 2001 | 2001 | (self.code, None), |
| 2002 | - (self.code_hex, 'Hex'), | |
| 2002 | + (self.code_hex.decode('utf-8','replace'), 'Hex'), | |
| 2003 | 2003 | (self.code_hex_rev, 'Hex+StrReverse'), |
| 2004 | 2004 | (self.code_rev_hex, 'StrReverse+Hex'), |
| 2005 | - (self.code_base64, 'Base64'), | |
| 2005 | + (self.code_base64.decode('utf-8', 'replace'), 'Base64'), | |
| 2006 | 2006 | (self.code_dridex, 'Dridex'), |
| 2007 | 2007 | (self.code_vba, 'VBA expression'), |
| 2008 | 2008 | ): |
| ... | ... | @@ -2587,7 +2587,7 @@ class VBA_Parser(object): |
| 2587 | 2587 | log.debug('%r...[much more data]...%r' % (data[:100], data[-50:])) |
| 2588 | 2588 | else: |
| 2589 | 2589 | log.debug(repr(data)) |
| 2590 | - if 'Attribut' in data: | |
| 2590 | + if 'Attribut' in data.decode('utf-8','ignore'): | |
| 2591 | 2591 | log.debug('Found VBA compressed code') |
| 2592 | 2592 | self.contains_macros = True |
| 2593 | 2593 | except IOError as exc: |
| ... | ... | @@ -2650,7 +2650,7 @@ class VBA_Parser(object): |
| 2650 | 2650 | # read data |
| 2651 | 2651 | log.debug('Reading data from stream %r' % d.name) |
| 2652 | 2652 | data = ole._open(d.isectStart, d.size).read() |
| 2653 | - for match in re.finditer(r'\x00Attribut[^e]', data, flags=re.IGNORECASE): | |
| 2653 | + for match in re.finditer(rb'\x00Attribut[^e]', data, flags=re.IGNORECASE): | |
| 2654 | 2654 | start = match.start() - 3 |
| 2655 | 2655 | log.debug('Found VBA compressed code at index %X' % start) |
| 2656 | 2656 | compressed_code = data[start:] |
| ... | ... | @@ -2693,9 +2693,9 @@ class VBA_Parser(object): |
| 2693 | 2693 | self.vba_code_all_modules = '' |
| 2694 | 2694 | for (_, _, _, vba_code) in self.extract_all_macros(): |
| 2695 | 2695 | #TODO: filter code? (each module) |
| 2696 | - self.vba_code_all_modules += vba_code + '\n' | |
| 2696 | + self.vba_code_all_modules += vba_code.decode('utf-8', 'ignore') + '\n' | |
| 2697 | 2697 | for (_, _, form_string) in self.extract_form_strings(): |
| 2698 | - self.vba_code_all_modules += form_string + '\n' | |
| 2698 | + self.vba_code_all_modules += form_string.decode('utf-8', 'ignore') + '\n' | |
| 2699 | 2699 | # Analyze the whole code at once: |
| 2700 | 2700 | scanner = VBA_Scanner(self.vba_code_all_modules) |
| 2701 | 2701 | self.analysis_results = scanner.scan(show_decoded_strings, deobfuscate) |
| ... | ... | @@ -2935,13 +2935,13 @@ class VBA_Parser_CLI(VBA_Parser): |
| 2935 | 2935 | print('FILE:', display_filename) |
| 2936 | 2936 | try: |
| 2937 | 2937 | #TODO: handle olefile errors, when an OLE file is malformed |
| 2938 | - print('Type: %s', self.type) | |
| 2938 | + print('Type: %s' % self.type) | |
| 2939 | 2939 | if self.detect_vba_macros(): |
| 2940 | 2940 | #print 'Contains VBA Macros:' |
| 2941 | 2941 | for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros(): |
| 2942 | 2942 | if hide_attributes: |
| 2943 | 2943 | # hide attribute lines: |
| 2944 | - vba_code_filtered = filter_vba(vba_code) | |
| 2944 | + vba_code_filtered = filter_vba(vba_code.decode('utf-8','replace')) | |
| 2945 | 2945 | else: |
| 2946 | 2946 | vba_code_filtered = vba_code |
| 2947 | 2947 | print('-' * 79) |
| ... | ... | @@ -2958,7 +2958,7 @@ class VBA_Parser_CLI(VBA_Parser): |
| 2958 | 2958 | print('-' * 79) |
| 2959 | 2959 | print('VBA FORM STRING IN %r - OLE stream: %r' % (subfilename, stream_path)) |
| 2960 | 2960 | print('- ' * 39) |
| 2961 | - print(form_string) | |
| 2961 | + print(form_string.decode('utf-8', 'ignore')) | |
| 2962 | 2962 | if not vba_code_only: |
| 2963 | 2963 | # analyse the code from all modules at once: |
| 2964 | 2964 | self.print_analysis(show_decoded_strings, deobfuscate) | ... | ... |