Commit 6b091f95c491a6eb7a45ed98c3c925da5b46b38e

Authored by Raphaël Vinot
Committed by Philippe Lagadec
1 parent 1c84a13f

Some more fixes related to python3 (#160)

- Improve decoding
- Decode with the `backslashreplace` error handler instead of `replace`, so that non-UTF-8 bytes are kept (as `\xNN` escapes) rather than lost.
- Fix python 3.6+ support
- Generic fix for the `backslashreplace` error handler not supporting decoding in Python 3.0 to 3.4
Showing 1 changed file with 32 additions and 9 deletions
oletools/olevba3.py
... ... @@ -238,6 +238,8 @@ import email # for MHTML parsing
238 238 import string # for printable
239 239 import json # for json output mode (argument --json)
240 240  
  241 +from pyparsing import ParserElement
  242 +
241 243 # import lxml or ElementTree for XML parsing:
242 244 try:
243 245 # lxml: best performance for XML processing
... ... @@ -287,6 +289,25 @@ else:
287 289 # xrange is now called range:
288 290 xrange = range
289 291  
  292 +
  293 +# === PYTHON 3.0 - 3.4 SUPPORT ======================================================
  294 +
  295 +# From https://gist.github.com/ynkdir/867347/c5e188a4886bc2dd71876c7e069a7b00b6c16c61
  296 +
if (3, 0) <= sys.version_info < (3, 5):
    import codecs

    # Keep a reference to the handler currently registered under this name,
    # so that anything other than a decoding error can be delegated to it.
    _backslashreplace_errors = codecs.lookup_error("backslashreplace")

    def backslashreplace_errors(exc):
        """
        Error handler registered as "backslashreplace" on Python 3.0 - 3.4.

        Decoding errors are handled here by turning each offending byte into
        a "\\xNN" escape; every other error is passed to the handler that was
        registered under the same name before this one.

        :param exc: the Unicode error raised by the codec
        :return: tuple (replacement text, position at which to resume)
        """
        if not isinstance(exc, UnicodeDecodeError):
            return _backslashreplace_errors(exc)
        # one "\xNN" escape per byte of the undecodable range
        offending = exc.object[exc.start:exc.end]
        escaped = "".join(["\\x{0:02x}".format(byte) for byte in offending])
        return (escaped, exc.end)

    codecs.register_error("backslashreplace", backslashreplace_errors)
  309 +
  310 +
290 311 # === LOGGING =================================================================
291 312  
292 313 class NullHandler(logging.Handler):
... ... @@ -1535,7 +1556,7 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1535 1556 modulename_id = struct.unpack("<H", dir_stream.read(2))[0]
1536 1557 check_value('MODULENAME_Id', 0x0019, modulename_id)
1537 1558 modulename_sizeof_modulename = struct.unpack("<L", dir_stream.read(4))[0]
1538   - modulename_modulename = dir_stream.read(modulename_sizeof_modulename)
  1559 + modulename_modulename = dir_stream.read(modulename_sizeof_modulename).decode('utf-8', 'backslashreplace')
1539 1560 # TODO: preset variables to avoid "referenced before assignment" errors
1540 1561 modulename_unicode_modulename_unicode = ''
1541 1562 # account for optional sections
... ... @@ -1781,7 +1802,7 @@ def detect_suspicious(vba_code, obfuscation=None):
1781 1802 for description, keywords in SUSPICIOUS_KEYWORDS.items():
1782 1803 for keyword in keywords:
1783 1804 # search using regex to detect word boundaries:
1784   - match = re.search(r'(?i)\b' + keyword + r'\b', vba_code)
  1805 + match = re.search(r'(?i)\b' + re.escape(keyword) + r'\b', vba_code)
1785 1806 if match:
1786 1807 #if keyword.lower() in vba_code:
1787 1808 found_keyword = match.group()
... ... @@ -1824,7 +1845,7 @@ def detect_hex_strings(vba_code):
1824 1845 value = match.group()
1825 1846 if value not in found:
1826 1847 decoded = binascii.unhexlify(value)
1827   - results.append((value, decoded.decode('utf-8','replace')))
  1848 + results.append((value, decoded.decode('utf-8', 'backslashreplace')))
1828 1849 found.add(value)
1829 1850 return results
1830 1851  
... ... @@ -2007,6 +2028,8 @@ class VBA_Scanner(object):
2007 2028  
2008 2029 :param vba_code: str, VBA source code to be analyzed
2009 2030 """
  2031 + if isinstance(vba_code, bytes):
  2032 + vba_code = vba_code.decode('utf-8', 'backslashreplace')
2010 2033 # join long lines ending with " _":
2011 2034 self.code = vba_collapse_long_lines(vba_code)
2012 2035 self.code_hex = ''
... ... @@ -2084,7 +2107,7 @@ class VBA_Scanner(object):
2084 2107 (self.code_vba, 'VBA expression'),
2085 2108 ):
2086 2109 if isinstance(code,bytes):
2087   - code=code.decode('utf-8','replace')
  2110 + code=code.decode('utf-8','backslashreplace')
2088 2111 self.autoexec_keywords += detect_autoexec(code, obfuscation)
2089 2112 self.suspicious_keywords += detect_suspicious(code, obfuscation)
2090 2113 self.iocs += detect_patterns(code, obfuscation)
... ... @@ -2411,7 +2434,7 @@ class VBA_Parser(object):
2411 2434 log.info('Opening MHTML file %s' % self.filename)
2412 2435 try:
2413 2436 if isinstance(data,bytes):
2414   - data = data.decode('utf8', 'replace')
  2437 + data = data.decode('utf8', 'backslashreplace')
2415 2438 # parse the MIME content
2416 2439 # remove any leading whitespace or newline (workaround for issue in email package)
2417 2440 stripped_data = data.lstrip('\r\n\t ')
... ... @@ -2514,7 +2537,7 @@ class VBA_Parser(object):
2514 2537 log.info('Opening text file %s' % self.filename)
2515 2538 # directly store the source code:
2516 2539 if isinstance(data,bytes):
2517   - data=data.decode('utf8','replace')
  2540 + data=data.decode('utf8','backslashreplace')
2518 2541 self.vba_code_all_modules = data
2519 2542 self.contains_macros = True
2520 2543 # set type only if parsing succeeds
... ... @@ -2671,7 +2694,7 @@ class VBA_Parser(object):
2671 2694 log.debug('%r...[much more data]...%r' % (data[:100], data[-50:]))
2672 2695 else:
2673 2696 log.debug(repr(data))
2674   - if 'Attribut' in data.decode('utf-8','ignore'):
  2697 + if 'Attribut' in data.decode('utf-8', 'ignore'):
2675 2698 log.debug('Found VBA compressed code')
2676 2699 self.contains_macros = True
2677 2700 except IOError as exc:
... ... @@ -3026,7 +3049,7 @@ class VBA_Parser_CLI(VBA_Parser):
3026 3049 if hide_attributes:
3027 3050 # hide attribute lines:
3028 3051 if isinstance(vba_code,bytes):
3029   - vba_code =vba_code.decode('utf-8','replace')
  3052 + vba_code =vba_code.decode('utf-8','backslashreplace')
3030 3053 vba_code_filtered = filter_vba(vba_code)
3031 3054 else:
3032 3055 vba_code_filtered = vba_code
... ... @@ -3107,7 +3130,7 @@ class VBA_Parser_CLI(VBA_Parser):
3107 3130 curr_macro = {}
3108 3131 if hide_attributes:
3109 3132 # hide attribute lines:
3110   - vba_code_filtered = filter_vba(vba_code.decode('utf-8','replace'))
  3133 + vba_code_filtered = filter_vba(vba_code.decode('utf-8','backslashreplace'))
3111 3134 else:
3112 3135 vba_code_filtered = vba_code
3113 3136  
... ...