Commit 07333a3a869722c50b1b199a4d6311e1b7eb4858
1 parent
558b3748
olevba: added encoding option to VBA_Parser (work in progress), fixed some comments for Sphinx
Showing 1 changed file with 46 additions and 32 deletions
oletools/olevba.py
| @@ -7,14 +7,14 @@ olevba is a script to parse OLE and OpenXML files such as MS Office documents | @@ -7,14 +7,14 @@ olevba is a script to parse OLE and OpenXML files such as MS Office documents | ||
| 7 | and analyze malicious macros. | 7 | and analyze malicious macros. |
| 8 | 8 | ||
| 9 | Supported formats: | 9 | Supported formats: |
| 10 | -- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) | ||
| 11 | -- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) | ||
| 12 | -- PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) | ||
| 13 | -- Word/PowerPoint 2007+ XML (aka Flat OPC) | ||
| 14 | -- Word 2003 XML (.xml) | ||
| 15 | -- Word/Excel Single File Web Page / MHTML (.mht) | ||
| 16 | -- Publisher (.pub) | ||
| 17 | -- raises an error if run with files encrypted using MS Crypto API RC4 | 10 | + - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) |
| 11 | + - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) | ||
| 12 | + - PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) | ||
| 13 | + - Word/PowerPoint 2007+ XML (aka Flat OPC) | ||
| 14 | + - Word 2003 XML (.xml) | ||
| 15 | + - Word/Excel Single File Web Page / MHTML (.mht) | ||
| 16 | + - Publisher (.pub) | ||
| 17 | + - raises an error if run with files encrypted using MS Crypto API RC4 | ||
| 18 | 18 | ||
| 19 | Author: Philippe Lagadec - http://www.decalage.info | 19 | Author: Philippe Lagadec - http://www.decalage.info |
| 20 | License: BSD, see source code or documentation | 20 | License: BSD, see source code or documentation |
| @@ -328,6 +328,8 @@ if sys.version_info[0] <= 2: | @@ -328,6 +328,8 @@ if sys.version_info[0] <= 2: | ||
| 328 | # to use ord on bytes/bytearray items the same way in Python 2+3 | 328 | # to use ord on bytes/bytearray items the same way in Python 2+3 |
| 329 | # on Python 2, just use the normal ord() because items are bytes | 329 | # on Python 2, just use the normal ord() because items are bytes |
| 330 | byte_ord = ord | 330 | byte_ord = ord |
| 331 | + #: Default string encoding for the olevba API | ||
| 332 | + DEFAULT_API_ENCODING = 'utf8' # on Python 2: UTF-8 (bytes) | ||
| 331 | else: | 333 | else: |
| 332 | # Python 3.x+ | 334 | # Python 3.x+ |
| 333 | # to use ord on bytes/bytearray items the same way in Python 2+3 | 335 | # to use ord on bytes/bytearray items the same way in Python 2+3 |
| @@ -338,6 +340,8 @@ else: | @@ -338,6 +340,8 @@ else: | ||
| 338 | # unichr does not exist anymore, only chr: | 340 | # unichr does not exist anymore, only chr: |
| 339 | unichr = chr | 341 | unichr = chr |
| 340 | from functools import reduce | 342 | from functools import reduce |
| 343 | + #: Default string encoding for the olevba API | ||
| 344 | + DEFAULT_API_ENCODING = None # on Python 3: None (unicode) | ||
| 341 | 345 | ||
| 342 | 346 | ||
| 343 | # === PYTHON 3.0 - 3.4 SUPPORT ====================================================== | 347 | # === PYTHON 3.0 - 3.4 SUPPORT ====================================================== |
| @@ -1338,7 +1342,7 @@ class VBA_Module(object): | @@ -1338,7 +1342,7 @@ class VBA_Module(object): | ||
| 1338 | :param olefile.OleStream dir_stream: olefile.OleStream, file object containing the module record | 1342 | :param olefile.OleStream dir_stream: olefile.OleStream, file object containing the module record |
| 1339 | :param int module_index: int, index of the module in the VBA project list | 1343 | :param int module_index: int, index of the module in the VBA project list |
| 1340 | """ | 1344 | """ |
| 1341 | - #: store a reference to the VBA project for later use: | 1345 | + #: reference to the VBA project for later use |
| 1342 | self.project = project | 1346 | self.project = project |
| 1343 | #: VBA project name | 1347 | #: VBA project name |
| 1344 | self.name = None | 1348 | self.name = None |
| @@ -2423,7 +2427,7 @@ def scan_vba(vba_code, include_decoded_strings, deobfuscate=False): | @@ -2423,7 +2427,7 @@ def scan_vba(vba_code, include_decoded_strings, deobfuscate=False): | ||
| 2423 | :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content. | 2427 | :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content. |
| 2424 | :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) | 2428 | :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) |
| 2425 | :return: list of tuples (type, keyword, description) | 2429 | :return: list of tuples (type, keyword, description) |
| 2426 | - (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String') | 2430 | + with type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String' |
| 2427 | """ | 2431 | """ |
| 2428 | return VBA_Scanner(vba_code).scan(include_decoded_strings, deobfuscate) | 2432 | return VBA_Scanner(vba_code).scan(include_decoded_strings, deobfuscate) |
| 2429 | 2433 | ||
| @@ -2433,38 +2437,32 @@ def scan_vba(vba_code, include_decoded_strings, deobfuscate=False): | @@ -2433,38 +2437,32 @@ def scan_vba(vba_code, include_decoded_strings, deobfuscate=False): | ||
| 2433 | class VBA_Parser(object): | 2437 | class VBA_Parser(object): |
| 2434 | """ | 2438 | """ |
| 2435 | Class to parse MS Office files, to detect VBA macros and extract VBA source code | 2439 | Class to parse MS Office files, to detect VBA macros and extract VBA source code |
| 2436 | - Supported file formats: | ||
| 2437 | - - Word 97-2003 (.doc, .dot) | ||
| 2438 | - - Word 2007+ (.docm, .dotm) | ||
| 2439 | - - Word 2003 XML (.xml) | ||
| 2440 | - - Word MHT - Single File Web Page / MHTML (.mht) | ||
| 2441 | - - Excel 97-2003 (.xls) | ||
| 2442 | - - Excel 2007+ (.xlsm, .xlsb) | ||
| 2443 | - - PowerPoint 97-2003 (.ppt) | ||
| 2444 | - - PowerPoint 2007+ (.pptm, .ppsm) | ||
| 2445 | """ | 2440 | """ |
| 2446 | 2441 | ||
| 2447 | - def __init__(self, filename, data=None, container=None, relaxed=False): | 2442 | + def __init__(self, filename, data=None, container=None, relaxed=False, encoding=DEFAULT_API_ENCODING): |
| 2448 | """ | 2443 | """ |
| 2449 | Constructor for VBA_Parser | 2444 | Constructor for VBA_Parser |
| 2450 | 2445 | ||
| 2451 | - :param filename: filename or path of file to parse, or file-like object | 2446 | + :param str filename: filename or path of file to parse, or file-like object |
| 2452 | 2447 | ||
| 2453 | - :param data: None or bytes str, if None the file will be read from disk (or from the file-like object). | ||
| 2454 | - If data is provided as a bytes string, it will be parsed as the content of the file in memory, | ||
| 2455 | - and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb'). | 2448 | + :param bytes data: None or bytes str, if None the file will be read from disk (or from the file-like object). |
| 2449 | + If data is provided as a bytes string, it will be parsed as the content of the file in memory, | ||
| 2450 | + and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb'). | ||
| 2456 | 2451 | ||
| 2457 | - :param container: str, path and filename of container if the file is within | ||
| 2458 | - a zip archive, None otherwise. | 2452 | + :param str container: str, path and filename of container if the file is within |
| 2453 | + a zip archive, None otherwise. | ||
| 2459 | 2454 | ||
| 2460 | - :param relaxed: if True, treat mal-formed documents and missing streams more like MS office: | ||
| 2461 | - do nothing; if False (default), raise errors in these cases | 2455 | + :param bool relaxed: if True, treat mal-formed documents and missing streams more like MS office: |
| 2456 | + do nothing; if False (default), raise errors in these cases | ||
| 2462 | 2457 | ||
| 2463 | - raises a FileOpenError if all attemps to interpret the data header failed | 2458 | + :param str encoding: encoding for VBA source code and strings. |
| 2459 | + Default: UTF-8 bytes strings on Python 2, unicode strings on Python 3 (None) | ||
| 2460 | + | ||
| 2461 | + raises a FileOpenError if all attempts to interpret the data header failed. | ||
| 2464 | """ | 2462 | """ |
| 2465 | - #TODO: filename should only be a string, data should be used for the file-like object | ||
| 2466 | - #TODO: filename should be mandatory, optional data is a string or file-like object | ||
| 2467 | - #TODO: also support olefile and zipfile as input | 2463 | + # TODO: filename should only be a string, data should be used for the file-like object |
| 2464 | + # TODO: filename should be mandatory, optional data is a string or file-like object | ||
| 2465 | + # TODO: also support olefile and zipfile as input | ||
| 2468 | if data is None: | 2466 | if data is None: |
| 2469 | # open file from disk: | 2467 | # open file from disk: |
| 2470 | _file = filename | 2468 | _file = filename |
| @@ -2495,6 +2493,8 @@ class VBA_Parser(object): | @@ -2495,6 +2493,8 @@ class VBA_Parser(object): | ||
| 2495 | self.nb_base64strings = 0 | 2493 | self.nb_base64strings = 0 |
| 2496 | self.nb_dridexstrings = 0 | 2494 | self.nb_dridexstrings = 0 |
| 2497 | self.nb_vbastrings = 0 | 2495 | self.nb_vbastrings = 0 |
| 2496 | + #: Encoding for VBA source code and strings returned by all methods | ||
| 2497 | + self.encoding = encoding | ||
| 2498 | 2498 | ||
| 2499 | # if filename is None: | 2499 | # if filename is None: |
| 2500 | # if isinstance(_file, basestring): | 2500 | # if isinstance(_file, basestring): |
| @@ -3000,6 +3000,19 @@ class VBA_Parser(object): | @@ -3000,6 +3000,19 @@ class VBA_Parser(object): | ||
| 3000 | raise SubstreamOpenError(self.filename, d.name, exc) | 3000 | raise SubstreamOpenError(self.filename, d.name, exc) |
| 3001 | return self.contains_macros | 3001 | return self.contains_macros |
| 3002 | 3002 | ||
| 3003 | + def encode_string(self, unicode_str): | ||
| 3004 | + """ | ||
| 3005 | + Encode a unicode string to bytes or str, using the specified encoding | ||
| 3006 | + for the VBA_Parser. By default, it will be bytes/UTF-8 on Python 2, and | ||
| 3007 | + a normal unicode string on Python 3. | ||
| 3008 | + :param str unicode_str: string to be encoded | ||
| 3009 | + :return: encoded string | ||
| 3010 | + """ | ||
| 3011 | + if self.encoding is None: | ||
| 3012 | + return unicode_str | ||
| 3013 | + else: | ||
| 3014 | + return unicode_str.encode(self.encoding, errors='replace') | ||
| 3015 | + | ||
| 3003 | def extract_macros(self): | 3016 | def extract_macros(self): |
| 3004 | """ | 3017 | """ |
| 3005 | Extract and decompress source code for each VBA macro found in the file | 3018 | Extract and decompress source code for each VBA macro found in the file |
| @@ -3062,6 +3075,7 @@ class VBA_Parser(object): | @@ -3062,6 +3075,7 @@ class VBA_Parser(object): | ||
| 3062 | compressed_code = data[start:] | 3075 | compressed_code = data[start:] |
| 3063 | try: | 3076 | try: |
| 3064 | vba_code = decompress_stream(bytearray(compressed_code)) | 3077 | vba_code = decompress_stream(bytearray(compressed_code)) |
| 3078 | + # TODO vba_code = self.encode_string(vba_code) | ||
| 3065 | yield (self.filename, d.name, d.name, vba_code) | 3079 | yield (self.filename, d.name, d.name, vba_code) |
| 3066 | except Exception as exc: | 3080 | except Exception as exc: |
| 3067 | # display the exception with full stack trace for debugging | 3081 | # display the exception with full stack trace for debugging |