Commit 6b091f95c491a6eb7a45ed98c3c925da5b46b38e

Authored by Raphaël Vinot
Committed by Philippe Lagadec
1 parent 1c84a13f

Some more fixes related to python3 (#160)

- Improve decoding
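- Use the 'backslashreplace' error handler instead of 'replace' when decoding bytes as UTF-8, so that bytes which are not valid UTF-8 are kept as backslash escape sequences instead of being replaced.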
- Fix python 3.6+ support
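- Generic fallback for Python 3.0 to 3.4, where the built-in 'backslashreplace' error handler does not support decoding: register a wrapper that emits \xNN escapes for undecodable bytes and defers to the original handler otherwise.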
Showing 1 changed file with 32 additions and 9 deletions
oletools/olevba3.py
@@ -238,6 +238,8 @@ import email # for MHTML parsing @@ -238,6 +238,8 @@ import email # for MHTML parsing
238 import string # for printable 238 import string # for printable
239 import json # for json output mode (argument --json) 239 import json # for json output mode (argument --json)
240 240
  241 +from pyparsing import ParserElement
  242 +
241 # import lxml or ElementTree for XML parsing: 243 # import lxml or ElementTree for XML parsing:
242 try: 244 try:
243 # lxml: best performance for XML processing 245 # lxml: best performance for XML processing
@@ -287,6 +289,25 @@ else: @@ -287,6 +289,25 @@ else:
287 # xrange is now called range: 289 # xrange is now called range:
288 xrange = range 290 xrange = range
289 291
  292 +
  293 +# === PYTHON 3.0 - 3.4 SUPPORT ======================================================
  294 +
  295 +# From https://gist.github.com/ynkdir/867347/c5e188a4886bc2dd71876c7e069a7b00b6c16c61
  296 +
  297 +if sys.version_info >= (3, 0) and sys.version_info < (3, 5):
  298 + import codecs
  299 +
  300 + _backslashreplace_errors = codecs.lookup_error("backslashreplace")
  301 +
  302 + def backslashreplace_errors(exc):
  303 + if isinstance(exc, UnicodeDecodeError):
  304 + u = "".join("\\x{0:02x}".format(c) for c in exc.object[exc.start:exc.end])
  305 + return (u, exc.end)
  306 + return _backslashreplace_errors(exc)
  307 +
  308 + codecs.register_error("backslashreplace", backslashreplace_errors)
  309 +
  310 +
290 # === LOGGING ================================================================= 311 # === LOGGING =================================================================
291 312
292 class NullHandler(logging.Handler): 313 class NullHandler(logging.Handler):
@@ -1535,7 +1556,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): @@ -1535,7 +1556,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1535 modulename_id = struct.unpack("<H", dir_stream.read(2))[0] 1556 modulename_id = struct.unpack("<H", dir_stream.read(2))[0]
1536 check_value('MODULENAME_Id', 0x0019, modulename_id) 1557 check_value('MODULENAME_Id', 0x0019, modulename_id)
1537 modulename_sizeof_modulename = struct.unpack("<L", dir_stream.read(4))[0] 1558 modulename_sizeof_modulename = struct.unpack("<L", dir_stream.read(4))[0]
1538 - modulename_modulename = dir_stream.read(modulename_sizeof_modulename) 1559 + modulename_modulename = dir_stream.read(modulename_sizeof_modulename).decode('utf-8', 'backslashreplace')
1539 # TODO: preset variables to avoid "referenced before assignment" errors 1560 # TODO: preset variables to avoid "referenced before assignment" errors
1540 modulename_unicode_modulename_unicode = '' 1561 modulename_unicode_modulename_unicode = ''
1541 # account for optional sections 1562 # account for optional sections
@@ -1781,7 +1802,7 @@ def detect_suspicious(vba_code, obfuscation=None): @@ -1781,7 +1802,7 @@ def detect_suspicious(vba_code, obfuscation=None):
1781 for description, keywords in SUSPICIOUS_KEYWORDS.items(): 1802 for description, keywords in SUSPICIOUS_KEYWORDS.items():
1782 for keyword in keywords: 1803 for keyword in keywords:
1783 # search using regex to detect word boundaries: 1804 # search using regex to detect word boundaries:
1784 - match = re.search(r'(?i)\b' + keyword + r'\b', vba_code) 1805 + match = re.search(r'(?i)\b' + re.escape(keyword) + r'\b', vba_code)
1785 if match: 1806 if match:
1786 #if keyword.lower() in vba_code: 1807 #if keyword.lower() in vba_code:
1787 found_keyword = match.group() 1808 found_keyword = match.group()
@@ -1824,7 +1845,7 @@ def detect_hex_strings(vba_code): @@ -1824,7 +1845,7 @@ def detect_hex_strings(vba_code):
1824 value = match.group() 1845 value = match.group()
1825 if value not in found: 1846 if value not in found:
1826 decoded = binascii.unhexlify(value) 1847 decoded = binascii.unhexlify(value)
1827 - results.append((value, decoded.decode('utf-8','replace'))) 1848 + results.append((value, decoded.decode('utf-8', 'backslashreplace')))
1828 found.add(value) 1849 found.add(value)
1829 return results 1850 return results
1830 1851
@@ -2007,6 +2028,8 @@ class VBA_Scanner(object): @@ -2007,6 +2028,8 @@ class VBA_Scanner(object):
2007 2028
2008 :param vba_code: str, VBA source code to be analyzed 2029 :param vba_code: str, VBA source code to be analyzed
2009 """ 2030 """
  2031 + if isinstance(vba_code, bytes):
  2032 + vba_code = vba_code.decode('utf-8', 'backslashreplace')
2010 # join long lines ending with " _": 2033 # join long lines ending with " _":
2011 self.code = vba_collapse_long_lines(vba_code) 2034 self.code = vba_collapse_long_lines(vba_code)
2012 self.code_hex = '' 2035 self.code_hex = ''
@@ -2084,7 +2107,7 @@ class VBA_Scanner(object): @@ -2084,7 +2107,7 @@ class VBA_Scanner(object):
2084 (self.code_vba, 'VBA expression'), 2107 (self.code_vba, 'VBA expression'),
2085 ): 2108 ):
2086 if isinstance(code,bytes): 2109 if isinstance(code,bytes):
2087 - code=code.decode('utf-8','replace') 2110 + code=code.decode('utf-8','backslashreplace')
2088 self.autoexec_keywords += detect_autoexec(code, obfuscation) 2111 self.autoexec_keywords += detect_autoexec(code, obfuscation)
2089 self.suspicious_keywords += detect_suspicious(code, obfuscation) 2112 self.suspicious_keywords += detect_suspicious(code, obfuscation)
2090 self.iocs += detect_patterns(code, obfuscation) 2113 self.iocs += detect_patterns(code, obfuscation)
@@ -2411,7 +2434,7 @@ class VBA_Parser(object): @@ -2411,7 +2434,7 @@ class VBA_Parser(object):
2411 log.info('Opening MHTML file %s' % self.filename) 2434 log.info('Opening MHTML file %s' % self.filename)
2412 try: 2435 try:
2413 if isinstance(data,bytes): 2436 if isinstance(data,bytes):
2414 - data = data.decode('utf8', 'replace') 2437 + data = data.decode('utf8', 'backslashreplace')
2415 # parse the MIME content 2438 # parse the MIME content
2416 # remove any leading whitespace or newline (workaround for issue in email package) 2439 # remove any leading whitespace or newline (workaround for issue in email package)
2417 stripped_data = data.lstrip('\r\n\t ') 2440 stripped_data = data.lstrip('\r\n\t ')
@@ -2514,7 +2537,7 @@ class VBA_Parser(object): @@ -2514,7 +2537,7 @@ class VBA_Parser(object):
2514 log.info('Opening text file %s' % self.filename) 2537 log.info('Opening text file %s' % self.filename)
2515 # directly store the source code: 2538 # directly store the source code:
2516 if isinstance(data,bytes): 2539 if isinstance(data,bytes):
2517 - data=data.decode('utf8','replace') 2540 + data=data.decode('utf8','backslashreplace')
2518 self.vba_code_all_modules = data 2541 self.vba_code_all_modules = data
2519 self.contains_macros = True 2542 self.contains_macros = True
2520 # set type only if parsing succeeds 2543 # set type only if parsing succeeds
@@ -2671,7 +2694,7 @@ class VBA_Parser(object): @@ -2671,7 +2694,7 @@ class VBA_Parser(object):
2671 log.debug('%r...[much more data]...%r' % (data[:100], data[-50:])) 2694 log.debug('%r...[much more data]...%r' % (data[:100], data[-50:]))
2672 else: 2695 else:
2673 log.debug(repr(data)) 2696 log.debug(repr(data))
2674 - if 'Attribut' in data.decode('utf-8','ignore'): 2697 + if 'Attribut' in data.decode('utf-8', 'ignore'):
2675 log.debug('Found VBA compressed code') 2698 log.debug('Found VBA compressed code')
2676 self.contains_macros = True 2699 self.contains_macros = True
2677 except IOError as exc: 2700 except IOError as exc:
@@ -3026,7 +3049,7 @@ class VBA_Parser_CLI(VBA_Parser): @@ -3026,7 +3049,7 @@ class VBA_Parser_CLI(VBA_Parser):
3026 if hide_attributes: 3049 if hide_attributes:
3027 # hide attribute lines: 3050 # hide attribute lines:
3028 if isinstance(vba_code,bytes): 3051 if isinstance(vba_code,bytes):
3029 - vba_code =vba_code.decode('utf-8','replace') 3052 + vba_code =vba_code.decode('utf-8','backslashreplace')
3030 vba_code_filtered = filter_vba(vba_code) 3053 vba_code_filtered = filter_vba(vba_code)
3031 else: 3054 else:
3032 vba_code_filtered = vba_code 3055 vba_code_filtered = vba_code
@@ -3107,7 +3130,7 @@ class VBA_Parser_CLI(VBA_Parser): @@ -3107,7 +3130,7 @@ class VBA_Parser_CLI(VBA_Parser):
3107 curr_macro = {} 3130 curr_macro = {}
3108 if hide_attributes: 3131 if hide_attributes:
3109 # hide attribute lines: 3132 # hide attribute lines:
3110 - vba_code_filtered = filter_vba(vba_code.decode('utf-8','replace')) 3133 + vba_code_filtered = filter_vba(vba_code.decode('utf-8','backslashreplace'))
3111 else: 3134 else:
3112 vba_code_filtered = vba_code 3135 vba_code_filtered = vba_code
3113 3136