Commit 07333a3a869722c50b1b199a4d6311e1b7eb4858

Authored by decalage2
1 parent 558b3748

olevba: added encoding option to VBA_Parser (work in progress), fixed some comments for Sphinx

Showing 1 changed file with 46 additions and 32 deletions
oletools/olevba.py
@@ -7,14 +7,14 @@ olevba is a script to parse OLE and OpenXML files such as MS Office documents @@ -7,14 +7,14 @@ olevba is a script to parse OLE and OpenXML files such as MS Office documents
7 and analyze malicious macros. 7 and analyze malicious macros.
8 8
9 Supported formats: 9 Supported formats:
10 -- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)  
11 -- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)  
12 -- PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm)  
13 -- Word/PowerPoint 2007+ XML (aka Flat OPC)  
14 -- Word 2003 XML (.xml)  
15 -- Word/Excel Single File Web Page / MHTML (.mht)  
16 -- Publisher (.pub)  
17 -- raises an error if run with files encrypted using MS Crypto API RC4 10 + - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
  11 + - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
  12 + - PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm)
  13 + - Word/PowerPoint 2007+ XML (aka Flat OPC)
  14 + - Word 2003 XML (.xml)
  15 + - Word/Excel Single File Web Page / MHTML (.mht)
  16 + - Publisher (.pub)
  17 + - raises an error if run with files encrypted using MS Crypto API RC4
18 18
19 Author: Philippe Lagadec - http://www.decalage.info 19 Author: Philippe Lagadec - http://www.decalage.info
20 License: BSD, see source code or documentation 20 License: BSD, see source code or documentation
@@ -328,6 +328,8 @@ if sys.version_info[0] <= 2: @@ -328,6 +328,8 @@ if sys.version_info[0] <= 2:
328 # to use ord on bytes/bytearray items the same way in Python 2+3 328 # to use ord on bytes/bytearray items the same way in Python 2+3
329 # on Python 2, just use the normal ord() because items are bytes 329 # on Python 2, just use the normal ord() because items are bytes
330 byte_ord = ord 330 byte_ord = ord
  331 + #: Default string encoding for the olevba API
  332 + DEFAULT_API_ENCODING = 'utf8' # on Python 2: UTF-8 (bytes)
331 else: 333 else:
332 # Python 3.x+ 334 # Python 3.x+
333 # to use ord on bytes/bytearray items the same way in Python 2+3 335 # to use ord on bytes/bytearray items the same way in Python 2+3
@@ -338,6 +340,8 @@ else: @@ -338,6 +340,8 @@ else:
338 # unichr does not exist anymore, only chr: 340 # unichr does not exist anymore, only chr:
339 unichr = chr 341 unichr = chr
340 from functools import reduce 342 from functools import reduce
  343 + #: Default string encoding for the olevba API
  344 + DEFAULT_API_ENCODING = None # on Python 3: None (unicode)
341 345
342 346
343 # === PYTHON 3.0 - 3.4 SUPPORT ====================================================== 347 # === PYTHON 3.0 - 3.4 SUPPORT ======================================================
@@ -1338,7 +1342,7 @@ class VBA_Module(object): @@ -1338,7 +1342,7 @@ class VBA_Module(object):
1338 :param olefile.OleStream dir_stream: olefile.OleStream, file object containing the module record 1342 :param olefile.OleStream dir_stream: olefile.OleStream, file object containing the module record
1339 :param int module_index: int, index of the module in the VBA project list 1343 :param int module_index: int, index of the module in the VBA project list
1340 """ 1344 """
1341 - #: store a reference to the VBA project for later use: 1345 + #: reference to the VBA project for later use
1342 self.project = project 1346 self.project = project
1343 #: VBA project name 1347 #: VBA project name
1344 self.name = None 1348 self.name = None
@@ -2423,7 +2427,7 @@ def scan_vba(vba_code, include_decoded_strings, deobfuscate=False): @@ -2423,7 +2427,7 @@ def scan_vba(vba_code, include_decoded_strings, deobfuscate=False):
2423 :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content. 2427 :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content.
2424 :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) 2428 :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
2425 :return: list of tuples (type, keyword, description) 2429 :return: list of tuples (type, keyword, description)
2426 - (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String') 2430 + with type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String'
2427 """ 2431 """
2428 return VBA_Scanner(vba_code).scan(include_decoded_strings, deobfuscate) 2432 return VBA_Scanner(vba_code).scan(include_decoded_strings, deobfuscate)
2429 2433
@@ -2433,38 +2437,32 @@ def scan_vba(vba_code, include_decoded_strings, deobfuscate=False): @@ -2433,38 +2437,32 @@ def scan_vba(vba_code, include_decoded_strings, deobfuscate=False):
2433 class VBA_Parser(object): 2437 class VBA_Parser(object):
2434 """ 2438 """
2435 Class to parse MS Office files, to detect VBA macros and extract VBA source code 2439 Class to parse MS Office files, to detect VBA macros and extract VBA source code
2436 - Supported file formats:  
2437 - - Word 97-2003 (.doc, .dot)  
2438 - - Word 2007+ (.docm, .dotm)  
2439 - - Word 2003 XML (.xml)  
2440 - - Word MHT - Single File Web Page / MHTML (.mht)  
2441 - - Excel 97-2003 (.xls)  
2442 - - Excel 2007+ (.xlsm, .xlsb)  
2443 - - PowerPoint 97-2003 (.ppt)  
2444 - - PowerPoint 2007+ (.pptm, .ppsm)  
2445 """ 2440 """
2446 2441
2447 - def __init__(self, filename, data=None, container=None, relaxed=False): 2442 + def __init__(self, filename, data=None, container=None, relaxed=False, encoding=DEFAULT_API_ENCODING):
2448 """ 2443 """
2449 Constructor for VBA_Parser 2444 Constructor for VBA_Parser
2450 2445
2451 - :param filename: filename or path of file to parse, or file-like object 2446 + :param str filename: filename or path of file to parse, or file-like object
2452 2447
2453 - :param data: None or bytes str, if None the file will be read from disk (or from the file-like object).  
2454 - If data is provided as a bytes string, it will be parsed as the content of the file in memory,  
2455 - and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb'). 2448 + :param bytes data: None or bytes str, if None the file will be read from disk (or from the file-like object).
  2449 + If data is provided as a bytes string, it will be parsed as the content of the file in memory,
  2450 + and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb').
2456 2451
2457 - :param container: str, path and filename of container if the file is within  
2458 - a zip archive, None otherwise. 2452 + :param str container: str, path and filename of container if the file is within
  2453 + a zip archive, None otherwise.
2459 2454
2460 - :param relaxed: if True, treat mal-formed documents and missing streams more like MS office:  
2461 - do nothing; if False (default), raise errors in these cases 2455 + :param bool relaxed: if True, treat malformed documents and missing streams more like MS Office:
  2456 + do nothing; if False (default), raise errors in these cases
2462 2457
2463 - raises a FileOpenError if all attemps to interpret the data header failed 2458 + :param str encoding: encoding for VBA source code and strings.
  2459 + Default: UTF-8 bytes strings on Python 2, unicode strings on Python 3 (None)
  2460 +
  2461 + raises a FileOpenError if all attempts to interpret the data header failed.
2464 """ 2462 """
2465 - #TODO: filename should only be a string, data should be used for the file-like object  
2466 - #TODO: filename should be mandatory, optional data is a string or file-like object  
2467 - #TODO: also support olefile and zipfile as input 2463 + # TODO: filename should only be a string, data should be used for the file-like object
  2464 + # TODO: filename should be mandatory, optional data is a string or file-like object
  2465 + # TODO: also support olefile and zipfile as input
2468 if data is None: 2466 if data is None:
2469 # open file from disk: 2467 # open file from disk:
2470 _file = filename 2468 _file = filename
@@ -2495,6 +2493,8 @@ class VBA_Parser(object): @@ -2495,6 +2493,8 @@ class VBA_Parser(object):
2495 self.nb_base64strings = 0 2493 self.nb_base64strings = 0
2496 self.nb_dridexstrings = 0 2494 self.nb_dridexstrings = 0
2497 self.nb_vbastrings = 0 2495 self.nb_vbastrings = 0
  2496 + #: Encoding for VBA source code and strings returned by all methods
  2497 + self.encoding = encoding
2498 2498
2499 # if filename is None: 2499 # if filename is None:
2500 # if isinstance(_file, basestring): 2500 # if isinstance(_file, basestring):
@@ -3000,6 +3000,19 @@ class VBA_Parser(object): @@ -3000,6 +3000,19 @@ class VBA_Parser(object):
3000 raise SubstreamOpenError(self.filename, d.name, exc) 3000 raise SubstreamOpenError(self.filename, d.name, exc)
3001 return self.contains_macros 3001 return self.contains_macros
3002 3002
  3003 + def encode_string(self, unicode_str):
  3004 + """
  3005 + Encode a unicode string to bytes or str, using the specified encoding
  3006 + for the VBA_Parser. By default, it will be bytes/UTF-8 on Python 2, and
  3007 + a normal unicode string on Python 3.
  3008 + :param str unicode_str: string to be encoded
  3009 + :return: encoded string
  3010 + """
  3011 + if self.encoding is None:
  3012 + return unicode_str
  3013 + else:
  3014 + return unicode_str.encode(self.encoding, errors='replace')
  3015 +
3003 def extract_macros(self): 3016 def extract_macros(self):
3004 """ 3017 """
3005 Extract and decompress source code for each VBA macro found in the file 3018 Extract and decompress source code for each VBA macro found in the file
@@ -3062,6 +3075,7 @@ class VBA_Parser(object): @@ -3062,6 +3075,7 @@ class VBA_Parser(object):
3062 compressed_code = data[start:] 3075 compressed_code = data[start:]
3063 try: 3076 try:
3064 vba_code = decompress_stream(bytearray(compressed_code)) 3077 vba_code = decompress_stream(bytearray(compressed_code))
  3078 + # TODO: vba_code = self.encode_string(vba_code)
3065 yield (self.filename, d.name, d.name, vba_code) 3079 yield (self.filename, d.name, d.name, vba_code)
3066 except Exception as exc: 3080 except Exception as exc:
3067 # display the exception with full stack trace for debugging 3081 # display the exception with full stack trace for debugging