Commit 07333a3a869722c50b1b199a4d6311e1b7eb4858

Authored by decalage2
1 parent 558b3748

olevba: added encoding option to VBA_Parser (work in progress), fixed some comments for Sphinx

Showing 1 changed file with 46 additions and 32 deletions
oletools/olevba.py
... ... @@ -7,14 +7,14 @@ olevba is a script to parse OLE and OpenXML files such as MS Office documents
7 7 and analyze malicious macros.
8 8  
9 9 Supported formats:
10   -- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
11   -- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
12   -- PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm)
13   -- Word/PowerPoint 2007+ XML (aka Flat OPC)
14   -- Word 2003 XML (.xml)
15   -- Word/Excel Single File Web Page / MHTML (.mht)
16   -- Publisher (.pub)
17   -- raises an error if run with files encrypted using MS Crypto API RC4
  10 + - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
  11 + - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
  12 + - PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm)
  13 + - Word/PowerPoint 2007+ XML (aka Flat OPC)
  14 + - Word 2003 XML (.xml)
  15 + - Word/Excel Single File Web Page / MHTML (.mht)
  16 + - Publisher (.pub)
  17 + - raises an error if run with files encrypted using MS Crypto API RC4
18 18  
19 19 Author: Philippe Lagadec - http://www.decalage.info
20 20 License: BSD, see source code or documentation
... ... @@ -328,6 +328,8 @@ if sys.version_info[0] <= 2:
328 328 # to use ord on bytes/bytearray items the same way in Python 2+3
329 329 # on Python 2, just use the normal ord() because items are bytes
330 330 byte_ord = ord
  331 + #: Default string encoding for the olevba API
  332 + DEFAULT_API_ENCODING = 'utf8' # on Python 2: UTF-8 (bytes)
331 333 else:
332 334 # Python 3.x+
333 335 # to use ord on bytes/bytearray items the same way in Python 2+3
... ... @@ -338,6 +340,8 @@ else:
338 340 # unichr does not exist anymore, only chr:
339 341 unichr = chr
340 342 from functools import reduce
  343 + #: Default string encoding for the olevba API
  344 + DEFAULT_API_ENCODING = None # on Python 3: None (unicode)
341 345  
342 346  
343 347 # === PYTHON 3.0 - 3.4 SUPPORT ======================================================
... ... @@ -1338,7 +1342,7 @@ class VBA_Module(object):
1338 1342 :param olefile.OleStream dir_stream: olefile.OleStream, file object containing the module record
1339 1343 :param int module_index: int, index of the module in the VBA project list
1340 1344 """
1341   - #: store a reference to the VBA project for later use:
  1345 + #: reference to the VBA project for later use
1342 1346 self.project = project
1343 1347 #: VBA project name
1344 1348 self.name = None
... ... @@ -2423,7 +2427,7 @@ def scan_vba(vba_code, include_decoded_strings, deobfuscate=False):
2423 2427 :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content.
2424 2428 :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
2425 2429 :return: list of tuples (type, keyword, description)
2426   - (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String')
  2430 + with type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String'
2427 2431 """
2428 2432 return VBA_Scanner(vba_code).scan(include_decoded_strings, deobfuscate)
2429 2433  
... ... @@ -2433,38 +2437,32 @@ def scan_vba(vba_code, include_decoded_strings, deobfuscate=False):
2433 2437 class VBA_Parser(object):
2434 2438 """
2435 2439 Class to parse MS Office files, to detect VBA macros and extract VBA source code
2436   - Supported file formats:
2437   - - Word 97-2003 (.doc, .dot)
2438   - - Word 2007+ (.docm, .dotm)
2439   - - Word 2003 XML (.xml)
2440   - - Word MHT - Single File Web Page / MHTML (.mht)
2441   - - Excel 97-2003 (.xls)
2442   - - Excel 2007+ (.xlsm, .xlsb)
2443   - - PowerPoint 97-2003 (.ppt)
2444   - - PowerPoint 2007+ (.pptm, .ppsm)
2445 2440 """
2446 2441  
2447   - def __init__(self, filename, data=None, container=None, relaxed=False):
  2442 + def __init__(self, filename, data=None, container=None, relaxed=False, encoding=DEFAULT_API_ENCODING):
2448 2443 """
2449 2444 Constructor for VBA_Parser
2450 2445  
2451   - :param filename: filename or path of file to parse, or file-like object
  2446 + :param str filename: filename or path of file to parse, or file-like object
2452 2447  
2453   - :param data: None or bytes str, if None the file will be read from disk (or from the file-like object).
2454   - If data is provided as a bytes string, it will be parsed as the content of the file in memory,
2455   - and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb').
  2448 + :param bytes data: None or bytes str, if None the file will be read from disk (or from the file-like object).
  2449 + If data is provided as a bytes string, it will be parsed as the content of the file in memory,
  2450 + and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb').
2456 2451  
2457   - :param container: str, path and filename of container if the file is within
2458   - a zip archive, None otherwise.
  2452 + :param str container: path and filename of container if the file is within
  2453 + a zip archive, None otherwise.
2459 2454  
2460   - :param relaxed: if True, treat mal-formed documents and missing streams more like MS office:
2461   - do nothing; if False (default), raise errors in these cases
  2455 + :param bool relaxed: if True, treat malformed documents and missing streams more like MS Office:
  2456 + do nothing; if False (default), raise errors in these cases
2462 2457  
2463   - raises a FileOpenError if all attemps to interpret the data header failed
  2458 + :param str encoding: encoding for VBA source code and strings.
  2459 + Default: 'utf8' (UTF-8 byte strings) on Python 2, None (unicode strings) on Python 3
  2460 +
  2461 + raises a FileOpenError if all attempts to interpret the data header failed.
2464 2462 """
2465   - #TODO: filename should only be a string, data should be used for the file-like object
2466   - #TODO: filename should be mandatory, optional data is a string or file-like object
2467   - #TODO: also support olefile and zipfile as input
  2463 + # TODO: filename should only be a string, data should be used for the file-like object
  2464 + # TODO: filename should be mandatory, optional data is a string or file-like object
  2465 + # TODO: also support olefile and zipfile as input
2468 2466 if data is None:
2469 2467 # open file from disk:
2470 2468 _file = filename
... ... @@ -2495,6 +2493,8 @@ class VBA_Parser(object):
2495 2493 self.nb_base64strings = 0
2496 2494 self.nb_dridexstrings = 0
2497 2495 self.nb_vbastrings = 0
  2496 + #: Encoding for VBA source code and strings returned by all methods
  2497 + self.encoding = encoding
2498 2498  
2499 2499 # if filename is None:
2500 2500 # if isinstance(_file, basestring):
... ... @@ -3000,6 +3000,19 @@ class VBA_Parser(object):
3000 3000 raise SubstreamOpenError(self.filename, d.name, exc)
3001 3001 return self.contains_macros
3002 3002  
  3003 + def encode_string(self, unicode_str):
  3004 + """
  3005 + Encode a unicode string to bytes or str, using the specified encoding
  3006 + for the VBA_Parser. By default, it will be bytes/UTF-8 on Python 2, and
  3007 + a normal unicode string on Python 3.
  3008 + :param str unicode_str: string to be encoded
  3009 + :return: encoded string
  3010 + """
  3011 + if self.encoding is None:
  3012 + return unicode_str
  3013 + else:
  3014 + return unicode_str.encode(self.encoding, errors='replace')
  3015 +
3003 3016 def extract_macros(self):
3004 3017 """
3005 3018 Extract and decompress source code for each VBA macro found in the file
... ... @@ -3062,6 +3075,7 @@ class VBA_Parser(object):
3062 3075 compressed_code = data[start:]
3063 3076 try:
3064 3077 vba_code = decompress_stream(bytearray(compressed_code))
  3078 + # TODO vba_code = self.encode_string(vba_code)
3065 3079 yield (self.filename, d.name, d.name, vba_code)
3066 3080 except Exception as exc:
3067 3081 # display the exception with full stack trace for debugging
... ...