Commit 6b091f95c491a6eb7a45ed98c3c925da5b46b38e
Committed by Philippe Lagadec
1 parent: 1c84a13f
Some more fixes related to python3 (#160)
- Improve decoding.
- Use the 'backslashreplace' error handler in order to keep the non-UTF-8 characters.
- Fix Python 3.6+ support.
- Generic fix for the lack of 'backslashreplace' support when decoding in Python 3.0 to 3.4.
Showing 1 changed file with 32 additions and 9 deletions
oletools/olevba3.py
| ... | ... | @@ -238,6 +238,8 @@ import email # for MHTML parsing |
| 238 | 238 | import string # for printable |
| 239 | 239 | import json # for json output mode (argument --json) |
| 240 | 240 | |
| 241 | +from pyparsing import ParserElement | |
| 242 | + | |
| 241 | 243 | # import lxml or ElementTree for XML parsing: |
| 242 | 244 | try: |
| 243 | 245 | # lxml: best performance for XML processing |
| ... | ... | @@ -287,6 +289,25 @@ else: |
| 287 | 289 | # xrange is now called range: |
| 288 | 290 | xrange = range |
| 289 | 291 | |
| 292 | + | |
| 293 | +# === PYTHON 3.0 - 3.4 SUPPORT ====================================================== | |
| 294 | + | |
| 295 | +# From https://gist.github.com/ynkdir/867347/c5e188a4886bc2dd71876c7e069a7b00b6c16c61 | |
| 296 | + | |
| 297 | +if sys.version_info >= (3, 0) and sys.version_info < (3, 5): | |
| 298 | + import codecs | |
| 299 | + | |
| 300 | + _backslashreplace_errors = codecs.lookup_error("backslashreplace") | |
| 301 | + | |
| 302 | + def backslashreplace_errors(exc): | |
| 303 | + if isinstance(exc, UnicodeDecodeError): | |
| 304 | + u = "".join("\\x{0:02x}".format(c) for c in exc.object[exc.start:exc.end]) | |
| 305 | + return (u, exc.end) | |
| 306 | + return _backslashreplace_errors(exc) | |
| 307 | + | |
| 308 | + codecs.register_error("backslashreplace", backslashreplace_errors) | |
| 309 | + | |
| 310 | + | |
| 290 | 311 | # === LOGGING ================================================================= |
| 291 | 312 | |
| 292 | 313 | class NullHandler(logging.Handler): |
| ... | ... | @@ -1535,7 +1556,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): |
| 1535 | 1556 | modulename_id = struct.unpack("<H", dir_stream.read(2))[0] |
| 1536 | 1557 | check_value('MODULENAME_Id', 0x0019, modulename_id) |
| 1537 | 1558 | modulename_sizeof_modulename = struct.unpack("<L", dir_stream.read(4))[0] |
| 1538 | - modulename_modulename = dir_stream.read(modulename_sizeof_modulename) | |
| 1559 | + modulename_modulename = dir_stream.read(modulename_sizeof_modulename).decode('utf-8', 'backslashreplace') | |
| 1539 | 1560 | # TODO: preset variables to avoid "referenced before assignment" errors |
| 1540 | 1561 | modulename_unicode_modulename_unicode = '' |
| 1541 | 1562 | # account for optional sections |
| ... | ... | @@ -1781,7 +1802,7 @@ def detect_suspicious(vba_code, obfuscation=None): |
| 1781 | 1802 | for description, keywords in SUSPICIOUS_KEYWORDS.items(): |
| 1782 | 1803 | for keyword in keywords: |
| 1783 | 1804 | # search using regex to detect word boundaries: |
| 1784 | - match = re.search(r'(?i)\b' + keyword + r'\b', vba_code) | |
| 1805 | + match = re.search(r'(?i)\b' + re.escape(keyword) + r'\b', vba_code) | |
| 1785 | 1806 | if match: |
| 1786 | 1807 | #if keyword.lower() in vba_code: |
| 1787 | 1808 | found_keyword = match.group() |
| ... | ... | @@ -1824,7 +1845,7 @@ def detect_hex_strings(vba_code): |
| 1824 | 1845 | value = match.group() |
| 1825 | 1846 | if value not in found: |
| 1826 | 1847 | decoded = binascii.unhexlify(value) |
| 1827 | - results.append((value, decoded.decode('utf-8','replace'))) | |
| 1848 | + results.append((value, decoded.decode('utf-8', 'backslashreplace'))) | |
| 1828 | 1849 | found.add(value) |
| 1829 | 1850 | return results |
| 1830 | 1851 | |
| ... | ... | @@ -2007,6 +2028,8 @@ class VBA_Scanner(object): |
| 2007 | 2028 | |
| 2008 | 2029 | :param vba_code: str, VBA source code to be analyzed |
| 2009 | 2030 | """ |
| 2031 | + if isinstance(vba_code, bytes): | |
| 2032 | + vba_code = vba_code.decode('utf-8', 'backslashreplace') | |
| 2010 | 2033 | # join long lines ending with " _": |
| 2011 | 2034 | self.code = vba_collapse_long_lines(vba_code) |
| 2012 | 2035 | self.code_hex = '' |
| ... | ... | @@ -2084,7 +2107,7 @@ class VBA_Scanner(object): |
| 2084 | 2107 | (self.code_vba, 'VBA expression'), |
| 2085 | 2108 | ): |
| 2086 | 2109 | if isinstance(code,bytes): |
| 2087 | - code=code.decode('utf-8','replace') | |
| 2110 | + code=code.decode('utf-8','backslashreplace') | |
| 2088 | 2111 | self.autoexec_keywords += detect_autoexec(code, obfuscation) |
| 2089 | 2112 | self.suspicious_keywords += detect_suspicious(code, obfuscation) |
| 2090 | 2113 | self.iocs += detect_patterns(code, obfuscation) |
| ... | ... | @@ -2411,7 +2434,7 @@ class VBA_Parser(object): |
| 2411 | 2434 | log.info('Opening MHTML file %s' % self.filename) |
| 2412 | 2435 | try: |
| 2413 | 2436 | if isinstance(data,bytes): |
| 2414 | - data = data.decode('utf8', 'replace') | |
| 2437 | + data = data.decode('utf8', 'backslashreplace') | |
| 2415 | 2438 | # parse the MIME content |
| 2416 | 2439 | # remove any leading whitespace or newline (workaround for issue in email package) |
| 2417 | 2440 | stripped_data = data.lstrip('\r\n\t ') |
| ... | ... | @@ -2514,7 +2537,7 @@ class VBA_Parser(object): |
| 2514 | 2537 | log.info('Opening text file %s' % self.filename) |
| 2515 | 2538 | # directly store the source code: |
| 2516 | 2539 | if isinstance(data,bytes): |
| 2517 | - data=data.decode('utf8','replace') | |
| 2540 | + data=data.decode('utf8','backslashreplace') | |
| 2518 | 2541 | self.vba_code_all_modules = data |
| 2519 | 2542 | self.contains_macros = True |
| 2520 | 2543 | # set type only if parsing succeeds |
| ... | ... | @@ -2671,7 +2694,7 @@ class VBA_Parser(object): |
| 2671 | 2694 | log.debug('%r...[much more data]...%r' % (data[:100], data[-50:])) |
| 2672 | 2695 | else: |
| 2673 | 2696 | log.debug(repr(data)) |
| 2674 | - if 'Attribut' in data.decode('utf-8','ignore'): | |
| 2697 | + if 'Attribut' in data.decode('utf-8', 'ignore'): | |
| 2675 | 2698 | log.debug('Found VBA compressed code') |
| 2676 | 2699 | self.contains_macros = True |
| 2677 | 2700 | except IOError as exc: |
| ... | ... | @@ -3026,7 +3049,7 @@ class VBA_Parser_CLI(VBA_Parser): |
| 3026 | 3049 | if hide_attributes: |
| 3027 | 3050 | # hide attribute lines: |
| 3028 | 3051 | if isinstance(vba_code,bytes): |
| 3029 | - vba_code =vba_code.decode('utf-8','replace') | |
| 3052 | + vba_code =vba_code.decode('utf-8','backslashreplace') | |
| 3030 | 3053 | vba_code_filtered = filter_vba(vba_code) |
| 3031 | 3054 | else: |
| 3032 | 3055 | vba_code_filtered = vba_code |
| ... | ... | @@ -3107,7 +3130,7 @@ class VBA_Parser_CLI(VBA_Parser): |
| 3107 | 3130 | curr_macro = {} |
| 3108 | 3131 | if hide_attributes: |
| 3109 | 3132 | # hide attribute lines: |
| 3110 | - vba_code_filtered = filter_vba(vba_code.decode('utf-8','replace')) | |
| 3133 | + vba_code_filtered = filter_vba(vba_code.decode('utf-8','backslashreplace')) | |
| 3111 | 3134 | else: |
| 3112 | 3135 | vba_code_filtered = vba_code |
| 3113 | 3136 | ... | ... |