Commit 07333a3a869722c50b1b199a4d6311e1b7eb4858
1 parent: 558b3748
olevba: added encoding option to VBA_Parser (work in progress), fixed some comments for Sphinx
Showing 1 changed file with 46 additions and 32 deletions
oletools/olevba.py
| ... | ... | @@ -7,14 +7,14 @@ olevba is a script to parse OLE and OpenXML files such as MS Office documents |
| 7 | 7 | and analyze malicious macros. |
| 8 | 8 | |
| 9 | 9 | Supported formats: |
| 10 | -- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) | |
| 11 | -- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) | |
| 12 | -- PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) | |
| 13 | -- Word/PowerPoint 2007+ XML (aka Flat OPC) | |
| 14 | -- Word 2003 XML (.xml) | |
| 15 | -- Word/Excel Single File Web Page / MHTML (.mht) | |
| 16 | -- Publisher (.pub) | |
| 17 | -- raises an error if run with files encrypted using MS Crypto API RC4 | |
| 10 | + - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) | |
| 11 | + - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) | |
| 12 | + - PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) | |
| 13 | + - Word/PowerPoint 2007+ XML (aka Flat OPC) | |
| 14 | + - Word 2003 XML (.xml) | |
| 15 | + - Word/Excel Single File Web Page / MHTML (.mht) | |
| 16 | + - Publisher (.pub) | |
| 17 | + - raises an error if run with files encrypted using MS Crypto API RC4 | |
| 18 | 18 | |
| 19 | 19 | Author: Philippe Lagadec - http://www.decalage.info |
| 20 | 20 | License: BSD, see source code or documentation |
| ... | ... | @@ -328,6 +328,8 @@ if sys.version_info[0] <= 2: |
| 328 | 328 | # to use ord on bytes/bytearray items the same way in Python 2+3 |
| 329 | 329 | # on Python 2, just use the normal ord() because items are bytes |
| 330 | 330 | byte_ord = ord |
| 331 | + #: Default string encoding for the olevba API | |
| 332 | + DEFAULT_API_ENCODING = 'utf8' # on Python 2: UTF-8 (bytes) | |
| 331 | 333 | else: |
| 332 | 334 | # Python 3.x+ |
| 333 | 335 | # to use ord on bytes/bytearray items the same way in Python 2+3 |
| ... | ... | @@ -338,6 +340,8 @@ else: |
| 338 | 340 | # unichr does not exist anymore, only chr: |
| 339 | 341 | unichr = chr |
| 340 | 342 | from functools import reduce |
| 343 | + #: Default string encoding for the olevba API | |
| 344 | + DEFAULT_API_ENCODING = None # on Python 3: None (unicode) | |
| 341 | 345 | |
| 342 | 346 | |
| 343 | 347 | # === PYTHON 3.0 - 3.4 SUPPORT ====================================================== |
| ... | ... | @@ -1338,7 +1342,7 @@ class VBA_Module(object): |
| 1338 | 1342 | :param olefile.OleStream dir_stream: olefile.OleStream, file object containing the module record |
| 1339 | 1343 | :param int module_index: int, index of the module in the VBA project list |
| 1340 | 1344 | """ |
| 1341 | - #: store a reference to the VBA project for later use: | |
| 1345 | + #: reference to the VBA project for later use | |
| 1342 | 1346 | self.project = project |
| 1343 | 1347 | #: VBA project name |
| 1344 | 1348 | self.name = None |
| ... | ... | @@ -2423,7 +2427,7 @@ def scan_vba(vba_code, include_decoded_strings, deobfuscate=False): |
| 2423 | 2427 | :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content. |
| 2424 | 2428 | :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) |
| 2425 | 2429 | :return: list of tuples (type, keyword, description) |
| 2426 | - (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String') | |
| 2430 | + with type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String' | |
| 2427 | 2431 | """ |
| 2428 | 2432 | return VBA_Scanner(vba_code).scan(include_decoded_strings, deobfuscate) |
| 2429 | 2433 | |
| ... | ... | @@ -2433,38 +2437,32 @@ def scan_vba(vba_code, include_decoded_strings, deobfuscate=False): |
| 2433 | 2437 | class VBA_Parser(object): |
| 2434 | 2438 | """ |
| 2435 | 2439 | Class to parse MS Office files, to detect VBA macros and extract VBA source code |
| 2436 | - Supported file formats: | |
| 2437 | - - Word 97-2003 (.doc, .dot) | |
| 2438 | - - Word 2007+ (.docm, .dotm) | |
| 2439 | - - Word 2003 XML (.xml) | |
| 2440 | - - Word MHT - Single File Web Page / MHTML (.mht) | |
| 2441 | - - Excel 97-2003 (.xls) | |
| 2442 | - - Excel 2007+ (.xlsm, .xlsb) | |
| 2443 | - - PowerPoint 97-2003 (.ppt) | |
| 2444 | - - PowerPoint 2007+ (.pptm, .ppsm) | |
| 2445 | 2440 | """ |
| 2446 | 2441 | |
| 2447 | - def __init__(self, filename, data=None, container=None, relaxed=False): | |
| 2442 | + def __init__(self, filename, data=None, container=None, relaxed=False, encoding=DEFAULT_API_ENCODING): | |
| 2448 | 2443 | """ |
| 2449 | 2444 | Constructor for VBA_Parser |
| 2450 | 2445 | |
| 2451 | - :param filename: filename or path of file to parse, or file-like object | |
| 2446 | + :param str filename: filename or path of file to parse, or file-like object | |
| 2452 | 2447 | |
| 2453 | - :param data: None or bytes str, if None the file will be read from disk (or from the file-like object). | |
| 2454 | - If data is provided as a bytes string, it will be parsed as the content of the file in memory, | |
| 2455 | - and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb'). | |
| 2448 | + :param bytes data: None or bytes str, if None the file will be read from disk (or from the file-like object). | |
| 2449 | + If data is provided as a bytes string, it will be parsed as the content of the file in memory, | |
| 2450 | + and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb'). | |
| 2456 | 2451 | |
| 2457 | - :param container: str, path and filename of container if the file is within | |
| 2458 | - a zip archive, None otherwise. | |
| 2452 | + :param str container: str, path and filename of container if the file is within | |
| 2453 | + a zip archive, None otherwise. | |
| 2459 | 2454 | |
| 2460 | - :param relaxed: if True, treat mal-formed documents and missing streams more like MS office: | |
| 2461 | - do nothing; if False (default), raise errors in these cases | |
| 2455 | + :param bool relaxed: if True, treat mal-formed documents and missing streams more like MS office: | |
| 2456 | + do nothing; if False (default), raise errors in these cases | |
| 2462 | 2457 | |
| 2463 | - raises a FileOpenError if all attemps to interpret the data header failed | |
| 2458 | + :param str encoding: encoding for VBA source code and strings. | |
| 2459 | + Default: UTF-8 bytes strings on Python 2, unicode strings on Python 3 (None) | |
| 2460 | + | |
| 2461 | + raises a FileOpenError if all attempts to interpret the data header failed. | |
| 2464 | 2462 | """ |
| 2465 | - #TODO: filename should only be a string, data should be used for the file-like object | |
| 2466 | - #TODO: filename should be mandatory, optional data is a string or file-like object | |
| 2467 | - #TODO: also support olefile and zipfile as input | |
| 2463 | + # TODO: filename should only be a string, data should be used for the file-like object | |
| 2464 | + # TODO: filename should be mandatory, optional data is a string or file-like object | |
| 2465 | + # TODO: also support olefile and zipfile as input | |
| 2468 | 2466 | if data is None: |
| 2469 | 2467 | # open file from disk: |
| 2470 | 2468 | _file = filename |
| ... | ... | @@ -2495,6 +2493,8 @@ class VBA_Parser(object): |
| 2495 | 2493 | self.nb_base64strings = 0 |
| 2496 | 2494 | self.nb_dridexstrings = 0 |
| 2497 | 2495 | self.nb_vbastrings = 0 |
| 2496 | + #: Encoding for VBA source code and strings returned by all methods | |
| 2497 | + self.encoding = encoding | |
| 2498 | 2498 | |
| 2499 | 2499 | # if filename is None: |
| 2500 | 2500 | # if isinstance(_file, basestring): |
| ... | ... | @@ -3000,6 +3000,19 @@ class VBA_Parser(object): |
| 3000 | 3000 | raise SubstreamOpenError(self.filename, d.name, exc) |
| 3001 | 3001 | return self.contains_macros |
| 3002 | 3002 | |
| 3003 | + def encode_string(self, unicode_str): | |
| 3004 | + """ | |
| 3005 | + Encode a unicode string to bytes or str, using the specified encoding | |
| 3006 | + for the VBA_parser. By default, it will be bytes/UTF-8 on Python 2, and | |
| 3007 | + a normal unicode string on Python 3. | |
| 3008 | + :param str unicode_str: string to be encoded | |
| 3009 | + :return: encoded string | |
| 3010 | + """ | |
| 3011 | + if self.encoding is None: | |
| 3012 | + return unicode_str | |
| 3013 | + else: | |
| 3014 | + return unicode_str.encode(self.encoding, errors='replace') | |
| 3015 | + | |
| 3003 | 3016 | def extract_macros(self): |
| 3004 | 3017 | """ |
| 3005 | 3018 | Extract and decompress source code for each VBA macro found in the file |
| ... | ... | @@ -3062,6 +3075,7 @@ class VBA_Parser(object): |
| 3062 | 3075 | compressed_code = data[start:] |
| 3063 | 3076 | try: |
| 3064 | 3077 | vba_code = decompress_stream(bytearray(compressed_code)) |
| 3078 | + # TODO vba_code = self.encode_string(vba_code) | |
| 3065 | 3079 | yield (self.filename, d.name, d.name, vba_code) |
| 3066 | 3080 | except Exception as exc: |
| 3067 | 3081 | # display the exception with full stack trace for debugging | ... | ... |