Commit b038d9276d05d9a84445d7ff741c422aa65162da

Authored by decalage2
1 parent fc17c53d

olevba: convert bytes to unicode for Python 3, escape keywords for regex search (issue #106)

Showing 1 changed file with 12 additions and 2 deletions
oletools/olevba.py
@@ -2055,7 +2055,7 @@ def detect_autoexec(vba_code, obfuscation=None):
2055 for keyword in keywords: 2055 for keyword in keywords:
2056 #TODO: if keyword is already a compiled regex, use it as-is 2056 #TODO: if keyword is already a compiled regex, use it as-is
2057 # search using regex to detect word boundaries: 2057 # search using regex to detect word boundaries:
2058 - match = re.search(r'(?i)\b' + keyword + r'\b', vba_code) 2058 + match = re.search(r'(?i)\b' + re.escape(keyword) + r'\b', vba_code)
2059 if match: 2059 if match:
2060 #if keyword.lower() in vba_code: 2060 #if keyword.lower() in vba_code:
2061 found_keyword = match.group() 2061 found_keyword = match.group()
@@ -2081,7 +2081,8 @@ def detect_suspicious(vba_code, obfuscation=None):
2081 for description, keywords in SUSPICIOUS_KEYWORDS.items(): 2081 for description, keywords in SUSPICIOUS_KEYWORDS.items():
2082 for keyword in keywords: 2082 for keyword in keywords:
2083 # search using regex to detect word boundaries: 2083 # search using regex to detect word boundaries:
2084 - match = re.search(r'(?i)\b' + keyword + r'\b', vba_code) 2084 + # note: each keyword must be escaped if it contains special chars such as '\'
  2085 + match = re.search(r'(?i)\b' + re.escape(keyword) + r'\b', vba_code)
2085 if match: 2086 if match:
2086 #if keyword.lower() in vba_code: 2087 #if keyword.lower() in vba_code:
2087 found_keyword = match.group() 2088 found_keyword = match.group()
@@ -2128,6 +2129,9 @@ def detect_hex_strings(vba_code):
2128 value = match.group() 2129 value = match.group()
2129 if value not in found: 2130 if value not in found:
2130 decoded = binascii.unhexlify(value) 2131 decoded = binascii.unhexlify(value)
  2132 + # On python 3, convert it to unicode
  2133 + if not PYTHON2:
  2134 + decoded = decoded.decode('utf8', errors='replace')
2131 results.append((value, decoded)) 2135 results.append((value, decoded))
2132 found.add(value) 2136 found.add(value)
2133 return results 2137 return results
@@ -2153,6 +2157,9 @@ def detect_base64_strings(vba_code):
2153 if value not in found and value.lower() not in BASE64_WHITELIST: 2157 if value not in found and value.lower() not in BASE64_WHITELIST:
2154 try: 2158 try:
2155 decoded = base64.b64decode(value) 2159 decoded = base64.b64decode(value)
  2160 + # On python 3, convert it to unicode
  2161 + if not PYTHON2:
  2162 + decoded = decoded.decode('utf8', errors='replace')
2156 results.append((value, decoded)) 2163 results.append((value, decoded))
2157 found.add(value) 2164 found.add(value)
2158 except (TypeError, ValueError) as exc: 2165 except (TypeError, ValueError) as exc:
@@ -2181,6 +2188,9 @@ def detect_dridex_strings(vba_code):
2181 if value not in found: 2188 if value not in found:
2182 try: 2189 try:
2183 decoded = DridexUrlDecode(value) 2190 decoded = DridexUrlDecode(value)
  2191 + # On python 3, convert it to unicode
  2192 + if not PYTHON2:
  2193 + decoded = decoded.decode('utf8', errors='replace')
2184 results.append((value, decoded)) 2194 results.append((value, decoded))
2185 found.add(value) 2195 found.add(value)
2186 except Exception as exc: 2196 except Exception as exc: