Commit ea37768589cffccc2835cfa503dfe50e893773de

Authored by decalage2
1 parent 19911393

sync olevba and olevba3 (work in progress): updated decompress_stream to use a bytearray

oletools/olevba.py
... ... @@ -322,6 +322,9 @@ email.feedparser.headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:?|[\t
322 322  
323 323 if sys.version_info[0] <= 2:
324 324 # Python 2.x
  325 + # to use ord on bytes/bytearray items the same way in Python 2+3
  326 + # on Python 2, just use the normal ord() because items are bytes
  327 + byte_ord = ord
325 328 if sys.version_info[1] <= 6:
326 329 # Python 2.6
327 330 # use is_zipfile backported from Python 2.7:
... ... @@ -331,6 +334,9 @@ if sys.version_info[0] <= 2:
331 334 from zipfile import is_zipfile
332 335 else:
333 336 # Python 3.x+
  337 + # to use ord on bytes/bytearray items the same way in Python 2+3
  338 + # on Python 3, items are int, so just return the item
  339 + byte_ord = lambda x: x
334 340 from zipfile import is_zipfile
335 341 # xrange is now called range:
336 342 xrange = range
... ... @@ -1235,10 +1241,13 @@ def decompress_stream(compressed_container):
1235 1241 # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the
1236 1242 # DecompressedBuffer (section 2.4.1.1.2).
1237 1243  
1238   - decompressed_container = '' # result
  1244 + # Check the input is a bytearray:
  1245 + if not isinstance(compressed_container, bytearray):
  1246 + raise TypeError('decompress_stream requires a bytearray as input')
  1247 + decompressed_container = bytearray() # result
1239 1248 compressed_current = 0
1240 1249  
1241   - sig_byte = ord(compressed_container[compressed_current])
  1250 + sig_byte = compressed_container[compressed_current]
1242 1251 if sig_byte != 0x01:
1243 1252 raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))
1244 1253  
... ... @@ -1284,7 +1293,7 @@ def decompress_stream(compressed_container):
1284 1293 # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
1285 1294 # uncompressed chunk: read the next 4096 bytes as-is
1286 1295 #TODO: check if there are at least 4096 bytes left
1287   - decompressed_container += compressed_container[compressed_current:compressed_current + 4096]
  1296 + decompressed_container.extend([compressed_container[compressed_current:compressed_current + 4096]])
1288 1297 compressed_current += 4096
1289 1298 else:
1290 1299 # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
... ... @@ -1295,7 +1304,7 @@ def decompress_stream(compressed_container):
1295 1304 # log.debug('compressed_current = %d / compressed_end = %d' % (compressed_current, compressed_end))
1296 1305 # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
1297 1306 # copy tokens (reference to a previous literal token)
1298   - flag_byte = ord(compressed_container[compressed_current])
  1307 + flag_byte = compressed_container[compressed_current]
1299 1308 compressed_current += 1
1300 1309 for bit_index in xrange(0, 8):
1301 1310 # log.debug('bit_index=%d / compressed_current=%d / compressed_end=%d' % (bit_index, compressed_current, compressed_end))
... ... @@ -1307,7 +1316,7 @@ def decompress_stream(compressed_container):
1307 1316 #log.debug('bit_index=%d: flag_bit=%d' % (bit_index, flag_bit))
1308 1317 if flag_bit == 0: # LiteralToken
1309 1318 # copy one byte directly to output
1310   - decompressed_container += compressed_container[compressed_current]
  1319 + decompressed_container.extend([compressed_container[compressed_current]])
1311 1320 compressed_current += 1
1312 1321 else: # CopyToken
1313 1322 # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
... ... @@ -1323,9 +1332,9 @@ def decompress_stream(compressed_container):
1323 1332 #log.debug('offset=%d length=%d' % (offset, length))
1324 1333 copy_source = len(decompressed_container) - offset
1325 1334 for index in xrange(copy_source, copy_source + length):
1326   - decompressed_container += decompressed_container[index]
  1335 + decompressed_container.extend([decompressed_container[index]])
1327 1336 compressed_current += 2
1328   - return decompressed_container
  1337 + return bytes(decompressed_container)
1329 1338  
1330 1339  
1331 1340 def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
... ... @@ -1366,6 +1375,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1366 1375 code_modules = {}
1367 1376  
1368 1377 for line in project:
  1378 + log.debug('PROJECT: %r' % line)
1369 1379 line = line.strip()
1370 1380 if '=' in line:
1371 1381 # split line at the 1st equal sign:
... ... @@ -1397,7 +1407,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1397 1407 else:
1398 1408 raise UnexpectedDataError(dir_path, name, expected, value)
1399 1409  
1400   - dir_stream = BytesIO(decompress_stream(dir_compressed))
  1410 + dir_stream = BytesIO(decompress_stream(bytearray(dir_compressed)))
1401 1411  
1402 1412 # PROJECTSYSKIND Record
1403 1413 projectsyskind_id = struct.unpack("<H", dir_stream.read(2))[0]
... ... @@ -1813,7 +1823,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1813 1823 log.debug("offset of code_data = {0}".format(moduleoffset_textoffset))
1814 1824 code_data = code_data[moduleoffset_textoffset:]
1815 1825 if len(code_data) > 0:
1816   - code_data = decompress_stream(code_data)
  1826 + code_data = decompress_stream(bytearray(code_data))
1817 1827 # case-insensitive search in the code_modules dict to find the file extension:
1818 1828 filext = code_modules.get(modulename_modulename.lower(), 'bin')
1819 1829 filename = '{0}.{1}'.format(modulename_modulename, filext)
... ... @@ -2120,7 +2130,6 @@ def print_json(json_dict=None, _json_is_first=False, _json_is_last=False,
2120 2130 :param bool _json_is_last: set to True only for very last entry to complete
2121 2131 the top-level json-list
2122 2132 """
2123   -
2124 2133 if json_dict and json_parts:
2125 2134 raise ValueError('Invalid json argument: want either single dict or '
2126 2135 'key=value parts but got both)')
... ... @@ -2949,7 +2958,7 @@ class VBA_Parser(object):
2949 2958 log.debug('Found VBA compressed code at index %X' % start)
2950 2959 compressed_code = data[start:]
2951 2960 try:
2952   - vba_code = decompress_stream(compressed_code)
  2961 + vba_code = decompress_stream(bytearray(compressed_code))
2953 2962 yield (self.filename, d.name, d.name, vba_code)
2954 2963 except Exception as exc:
2955 2964 # display the exception with full stack trace for debugging
... ...
oletools/olevba3.py
... ... @@ -1232,6 +1232,9 @@ def decompress_stream(compressed_container):
1232 1232 # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the
1233 1233 # DecompressedBuffer (section 2.4.1.1.2).
1234 1234  
  1235 + # Check the input is a bytearray:
  1236 + if not isinstance(compressed_container, bytearray):
  1237 + raise TypeError('decompress_stream requires a bytearray as input')
1235 1238 decompressed_container = bytearray() # result
1236 1239 compressed_current = 0
1237 1240  
... ... @@ -1294,7 +1297,7 @@ def decompress_stream(compressed_container):
1294 1297 # copy tokens (reference to a previous literal token)
1295 1298 flag_byte = compressed_container[compressed_current]
1296 1299 compressed_current += 1
1297   - for bit_index in range(0, 8):
  1300 + for bit_index in xrange(0, 8):
1298 1301 # log.debug('bit_index=%d / compressed_current=%d / compressed_end=%d' % (bit_index, compressed_current, compressed_end))
1299 1302 if compressed_current >= compressed_end:
1300 1303 break
... ... @@ -1319,7 +1322,7 @@ def decompress_stream(compressed_container):
1319 1322 offset = (temp1 >> temp2) + 1
1320 1323 #log.debug('offset=%d length=%d' % (offset, length))
1321 1324 copy_source = len(decompressed_container) - offset
1322   - for index in range(copy_source, copy_source + length):
  1325 + for index in xrange(copy_source, copy_source + length):
1323 1326 decompressed_container.extend([decompressed_container[index]])
1324 1327 compressed_current += 2
1325 1328 return bytes(decompressed_container)
... ... @@ -1394,7 +1397,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1394 1397 else:
1395 1398 raise UnexpectedDataError(dir_path, name, expected, value)
1396 1399  
1397   - dir_stream = BytesIO(decompress_stream(dir_compressed))
  1400 + dir_stream = BytesIO(decompress_stream(bytearray(dir_compressed)))
1398 1401  
1399 1402 # PROJECTSYSKIND Record
1400 1403 projectsyskind_id = struct.unpack("<H", dir_stream.read(2))[0]
... ... @@ -1541,7 +1544,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1541 1544 # So let's ignore it, otherwise it crashes on some files (issue #132)
1542 1545 # PR #135 by @c1fe:
1543 1546 # contrary to the specification I think that the unicode name
1544   - # is optional. if reference_reserved is not 0x003E I think it
  1547 + # is optional. if reference_reserved is not 0x003E I think it
1545 1548 # is actually the start of another REFERENCE record
1546 1549 # at least when projectsyskind_syskind == 0x02 (Macintosh)
1547 1550 if reference_reserved == 0x003E:
... ... @@ -1671,7 +1674,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1671 1674 uni_out = lambda unicode_text: unicode_text.encode('utf-8', 'replace')
1672 1675  
1673 1676 log.debug("parsing {0} modules".format(projectmodules_count))
1674   - for projectmodule_index in range(0, projectmodules_count):
  1677 + for projectmodule_index in xrange(0, projectmodules_count):
1675 1678 try:
1676 1679 modulename_id = struct.unpack("<H", dir_stream.read(2))[0]
1677 1680 check_value('MODULENAME_Id', 0x0019, modulename_id)
... ... @@ -1810,7 +1813,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1810 1813 log.debug("offset of code_data = {0}".format(moduleoffset_textoffset))
1811 1814 code_data = code_data[moduleoffset_textoffset:]
1812 1815 if len(code_data) > 0:
1813   - code_data = decompress_stream(code_data)
  1816 + code_data = decompress_stream(bytearray(code_data))
1814 1817 # case-insensitive search in the code_modules dict to find the file extension:
1815 1818 filext = code_modules.get(modulename_modulename.lower(), 'bin')
1816 1819 filename = '{0}.{1}'.format(modulename_modulename, filext)
... ... @@ -2950,7 +2953,7 @@ class VBA_Parser(object):
2950 2953 log.debug('Found VBA compressed code at index %X' % start)
2951 2954 compressed_code = data[start:]
2952 2955 try:
2953   - vba_code = decompress_stream(compressed_code)
  2956 + vba_code = decompress_stream(bytearray(compressed_code))
2954 2957 yield (self.filename, d.name, d.name, vba_code)
2955 2958 except Exception as exc:
2956 2959 # display the exception with full stack trace for debugging
... ... @@ -2978,6 +2981,8 @@ class VBA_Parser(object):
2978 2981 """
2979 2982 runs extract_macros and analyze the source code of all VBA macros
2980 2983 found in the file.
  2984 + All results are stored in self.analysis_results.
  2985 + If called more than once, simply returns the previous results.
2981 2986 """
2982 2987 if self.detect_vba_macros():
2983 2988 # if the analysis was already done, avoid doing it twice:
... ... @@ -3390,16 +3395,6 @@ class VBA_Parser_CLI(VBA_Parser):
3390 3395  
3391 3396 line = '%-12s %s' % (flags, self.filename)
3392 3397 print(line)
3393   -
3394   - # old table display:
3395   - # macros = autoexec = suspicious = iocs = hexstrings = 'no'
3396   - # if nb_macros: macros = 'YES:%d' % nb_macros
3397   - # if nb_autoexec: autoexec = 'YES:%d' % nb_autoexec
3398   - # if nb_suspicious: suspicious = 'YES:%d' % nb_suspicious
3399   - # if nb_iocs: iocs = 'YES:%d' % nb_iocs
3400   - # if nb_hexstrings: hexstrings = 'YES:%d' % nb_hexstrings
3401   - # # 2nd line = info
3402   - # print '%-8s %-7s %-7s %-7s %-7s %-7s' % (self.type, macros, autoexec, suspicious, iocs, hexstrings)
3403 3398 except Exception as exc:
3404 3399 # display the exception with full stack trace for debugging only
3405 3400 log.debug('Error processing file %s (%s)' % (self.filename, exc),
... ... @@ -3407,20 +3402,6 @@ class VBA_Parser_CLI(VBA_Parser):
3407 3402 raise ProcessingError(self.filename, exc)
3408 3403  
3409 3404  
3410   - # t = prettytable.PrettyTable(('filename', 'type', 'macros', 'autoexec', 'suspicious', 'ioc', 'hexstrings'),
3411   - # header=False, border=False)
3412   - # t.align = 'l'
3413   - # t.max_width['filename'] = 30
3414   - # t.max_width['type'] = 10
3415   - # t.max_width['macros'] = 6
3416   - # t.max_width['autoexec'] = 6
3417   - # t.max_width['suspicious'] = 6
3418   - # t.max_width['ioc'] = 6
3419   - # t.max_width['hexstrings'] = 6
3420   - # t.add_row((filename, ftype, macros, autoexec, suspicious, iocs, hexstrings))
3421   - # print t
3422   -
3423   -
3424 3405 #=== MAIN =====================================================================
3425 3406  
3426 3407 def parse_args(cmd_line_args=None):
... ... @@ -3515,10 +3496,6 @@ def main(cmd_line_args=None):
3515 3496 # enable logging in the modules:
3516 3497 enable_logging()
3517 3498  
3518   - # Old display with number of items detected:
3519   - # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('Type', 'Macros', 'AutoEx', 'Susp.', 'IOCs', 'HexStr')
3520   - # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('-'*8, '-'*7, '-'*7, '-'*7, '-'*7, '-'*7)
3521   -
3522 3499 # with the option --reveal, make sure --deobf is also enabled:
3523 3500 if options.show_deobfuscated_code and not options.deobfuscate:
3524 3501 log.info('set --deobf because --reveal was set')
... ...