Commit 6b091f95c491a6eb7a45ed98c3c925da5b46b38e
Committed by Philippe Lagadec
1 parent: 1c84a13f
Some more fixes related to python3 (#160)
- Improve decoding
- Use backslashreplace in order to keep the non-UTF-8 characters.
- Fix Python 3.6+ support
- Generic fix for the lack of backslashreplace support for decoding in Python 3.0 to 3.4
Showing 1 changed file with 32 additions and 9 deletions
oletools/olevba3.py
| @@ -238,6 +238,8 @@ import email # for MHTML parsing | @@ -238,6 +238,8 @@ import email # for MHTML parsing | ||
| 238 | import string # for printable | 238 | import string # for printable |
| 239 | import json # for json output mode (argument --json) | 239 | import json # for json output mode (argument --json) |
| 240 | 240 | ||
| 241 | +from pyparsing import ParserElement | ||
| 242 | + | ||
| 241 | # import lxml or ElementTree for XML parsing: | 243 | # import lxml or ElementTree for XML parsing: |
| 242 | try: | 244 | try: |
| 243 | # lxml: best performance for XML processing | 245 | # lxml: best performance for XML processing |
| @@ -287,6 +289,25 @@ else: | @@ -287,6 +289,25 @@ else: | ||
| 287 | # xrange is now called range: | 289 | # xrange is now called range: |
| 288 | xrange = range | 290 | xrange = range |
| 289 | 291 | ||
| 292 | + | ||
| 293 | +# === PYTHON 3.0 - 3.4 SUPPORT ====================================================== | ||
| 294 | + | ||
| 295 | +# From https://gist.github.com/ynkdir/867347/c5e188a4886bc2dd71876c7e069a7b00b6c16c61 | ||
| 296 | + | ||
| 297 | +if sys.version_info >= (3, 0) and sys.version_info < (3, 5): | ||
| 298 | + import codecs | ||
| 299 | + | ||
| 300 | + _backslashreplace_errors = codecs.lookup_error("backslashreplace") | ||
| 301 | + | ||
| 302 | + def backslashreplace_errors(exc): | ||
| 303 | + if isinstance(exc, UnicodeDecodeError): | ||
| 304 | + u = "".join("\\x{0:02x}".format(c) for c in exc.object[exc.start:exc.end]) | ||
| 305 | + return (u, exc.end) | ||
| 306 | + return _backslashreplace_errors(exc) | ||
| 307 | + | ||
| 308 | + codecs.register_error("backslashreplace", backslashreplace_errors) | ||
| 309 | + | ||
| 310 | + | ||
| 290 | # === LOGGING ================================================================= | 311 | # === LOGGING ================================================================= |
| 291 | 312 | ||
| 292 | class NullHandler(logging.Handler): | 313 | class NullHandler(logging.Handler): |
| @@ -1535,7 +1556,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): | @@ -1535,7 +1556,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): | ||
| 1535 | modulename_id = struct.unpack("<H", dir_stream.read(2))[0] | 1556 | modulename_id = struct.unpack("<H", dir_stream.read(2))[0] |
| 1536 | check_value('MODULENAME_Id', 0x0019, modulename_id) | 1557 | check_value('MODULENAME_Id', 0x0019, modulename_id) |
| 1537 | modulename_sizeof_modulename = struct.unpack("<L", dir_stream.read(4))[0] | 1558 | modulename_sizeof_modulename = struct.unpack("<L", dir_stream.read(4))[0] |
| 1538 | - modulename_modulename = dir_stream.read(modulename_sizeof_modulename) | 1559 | + modulename_modulename = dir_stream.read(modulename_sizeof_modulename).decode('utf-8', 'backslashreplace') |
| 1539 | # TODO: preset variables to avoid "referenced before assignment" errors | 1560 | # TODO: preset variables to avoid "referenced before assignment" errors |
| 1540 | modulename_unicode_modulename_unicode = '' | 1561 | modulename_unicode_modulename_unicode = '' |
| 1541 | # account for optional sections | 1562 | # account for optional sections |
| @@ -1781,7 +1802,7 @@ def detect_suspicious(vba_code, obfuscation=None): | @@ -1781,7 +1802,7 @@ def detect_suspicious(vba_code, obfuscation=None): | ||
| 1781 | for description, keywords in SUSPICIOUS_KEYWORDS.items(): | 1802 | for description, keywords in SUSPICIOUS_KEYWORDS.items(): |
| 1782 | for keyword in keywords: | 1803 | for keyword in keywords: |
| 1783 | # search using regex to detect word boundaries: | 1804 | # search using regex to detect word boundaries: |
| 1784 | - match = re.search(r'(?i)\b' + keyword + r'\b', vba_code) | 1805 | + match = re.search(r'(?i)\b' + re.escape(keyword) + r'\b', vba_code) |
| 1785 | if match: | 1806 | if match: |
| 1786 | #if keyword.lower() in vba_code: | 1807 | #if keyword.lower() in vba_code: |
| 1787 | found_keyword = match.group() | 1808 | found_keyword = match.group() |
| @@ -1824,7 +1845,7 @@ def detect_hex_strings(vba_code): | @@ -1824,7 +1845,7 @@ def detect_hex_strings(vba_code): | ||
| 1824 | value = match.group() | 1845 | value = match.group() |
| 1825 | if value not in found: | 1846 | if value not in found: |
| 1826 | decoded = binascii.unhexlify(value) | 1847 | decoded = binascii.unhexlify(value) |
| 1827 | - results.append((value, decoded.decode('utf-8','replace'))) | 1848 | + results.append((value, decoded.decode('utf-8', 'backslashreplace'))) |
| 1828 | found.add(value) | 1849 | found.add(value) |
| 1829 | return results | 1850 | return results |
| 1830 | 1851 | ||
| @@ -2007,6 +2028,8 @@ class VBA_Scanner(object): | @@ -2007,6 +2028,8 @@ class VBA_Scanner(object): | ||
| 2007 | 2028 | ||
| 2008 | :param vba_code: str, VBA source code to be analyzed | 2029 | :param vba_code: str, VBA source code to be analyzed |
| 2009 | """ | 2030 | """ |
| 2031 | + if isinstance(vba_code, bytes): | ||
| 2032 | + vba_code = vba_code.decode('utf-8', 'backslashreplace') | ||
| 2010 | # join long lines ending with " _": | 2033 | # join long lines ending with " _": |
| 2011 | self.code = vba_collapse_long_lines(vba_code) | 2034 | self.code = vba_collapse_long_lines(vba_code) |
| 2012 | self.code_hex = '' | 2035 | self.code_hex = '' |
| @@ -2084,7 +2107,7 @@ class VBA_Scanner(object): | @@ -2084,7 +2107,7 @@ class VBA_Scanner(object): | ||
| 2084 | (self.code_vba, 'VBA expression'), | 2107 | (self.code_vba, 'VBA expression'), |
| 2085 | ): | 2108 | ): |
| 2086 | if isinstance(code,bytes): | 2109 | if isinstance(code,bytes): |
| 2087 | - code=code.decode('utf-8','replace') | 2110 | + code=code.decode('utf-8','backslashreplace') |
| 2088 | self.autoexec_keywords += detect_autoexec(code, obfuscation) | 2111 | self.autoexec_keywords += detect_autoexec(code, obfuscation) |
| 2089 | self.suspicious_keywords += detect_suspicious(code, obfuscation) | 2112 | self.suspicious_keywords += detect_suspicious(code, obfuscation) |
| 2090 | self.iocs += detect_patterns(code, obfuscation) | 2113 | self.iocs += detect_patterns(code, obfuscation) |
| @@ -2411,7 +2434,7 @@ class VBA_Parser(object): | @@ -2411,7 +2434,7 @@ class VBA_Parser(object): | ||
| 2411 | log.info('Opening MHTML file %s' % self.filename) | 2434 | log.info('Opening MHTML file %s' % self.filename) |
| 2412 | try: | 2435 | try: |
| 2413 | if isinstance(data,bytes): | 2436 | if isinstance(data,bytes): |
| 2414 | - data = data.decode('utf8', 'replace') | 2437 | + data = data.decode('utf8', 'backslashreplace') |
| 2415 | # parse the MIME content | 2438 | # parse the MIME content |
| 2416 | # remove any leading whitespace or newline (workaround for issue in email package) | 2439 | # remove any leading whitespace or newline (workaround for issue in email package) |
| 2417 | stripped_data = data.lstrip('\r\n\t ') | 2440 | stripped_data = data.lstrip('\r\n\t ') |
| @@ -2514,7 +2537,7 @@ class VBA_Parser(object): | @@ -2514,7 +2537,7 @@ class VBA_Parser(object): | ||
| 2514 | log.info('Opening text file %s' % self.filename) | 2537 | log.info('Opening text file %s' % self.filename) |
| 2515 | # directly store the source code: | 2538 | # directly store the source code: |
| 2516 | if isinstance(data,bytes): | 2539 | if isinstance(data,bytes): |
| 2517 | - data=data.decode('utf8','replace') | 2540 | + data=data.decode('utf8','backslashreplace') |
| 2518 | self.vba_code_all_modules = data | 2541 | self.vba_code_all_modules = data |
| 2519 | self.contains_macros = True | 2542 | self.contains_macros = True |
| 2520 | # set type only if parsing succeeds | 2543 | # set type only if parsing succeeds |
| @@ -2671,7 +2694,7 @@ class VBA_Parser(object): | @@ -2671,7 +2694,7 @@ class VBA_Parser(object): | ||
| 2671 | log.debug('%r...[much more data]...%r' % (data[:100], data[-50:])) | 2694 | log.debug('%r...[much more data]...%r' % (data[:100], data[-50:])) |
| 2672 | else: | 2695 | else: |
| 2673 | log.debug(repr(data)) | 2696 | log.debug(repr(data)) |
| 2674 | - if 'Attribut' in data.decode('utf-8','ignore'): | 2697 | + if 'Attribut' in data.decode('utf-8', 'ignore'): |
| 2675 | log.debug('Found VBA compressed code') | 2698 | log.debug('Found VBA compressed code') |
| 2676 | self.contains_macros = True | 2699 | self.contains_macros = True |
| 2677 | except IOError as exc: | 2700 | except IOError as exc: |
| @@ -3026,7 +3049,7 @@ class VBA_Parser_CLI(VBA_Parser): | @@ -3026,7 +3049,7 @@ class VBA_Parser_CLI(VBA_Parser): | ||
| 3026 | if hide_attributes: | 3049 | if hide_attributes: |
| 3027 | # hide attribute lines: | 3050 | # hide attribute lines: |
| 3028 | if isinstance(vba_code,bytes): | 3051 | if isinstance(vba_code,bytes): |
| 3029 | - vba_code =vba_code.decode('utf-8','replace') | 3052 | + vba_code =vba_code.decode('utf-8','backslashreplace') |
| 3030 | vba_code_filtered = filter_vba(vba_code) | 3053 | vba_code_filtered = filter_vba(vba_code) |
| 3031 | else: | 3054 | else: |
| 3032 | vba_code_filtered = vba_code | 3055 | vba_code_filtered = vba_code |
| @@ -3107,7 +3130,7 @@ class VBA_Parser_CLI(VBA_Parser): | @@ -3107,7 +3130,7 @@ class VBA_Parser_CLI(VBA_Parser): | ||
| 3107 | curr_macro = {} | 3130 | curr_macro = {} |
| 3108 | if hide_attributes: | 3131 | if hide_attributes: |
| 3109 | # hide attribute lines: | 3132 | # hide attribute lines: |
| 3110 | - vba_code_filtered = filter_vba(vba_code.decode('utf-8','replace')) | 3133 | + vba_code_filtered = filter_vba(vba_code.decode('utf-8','backslashreplace')) |
| 3111 | else: | 3134 | else: |
| 3112 | vba_code_filtered = vba_code | 3135 | vba_code_filtered = vba_code |
| 3113 | 3136 |