Commit 67f0725b7be892dcda47317ee5e180dce881d410

Authored by Philippe Lagadec
1 parent 8f37786d

olevba: improved Base64 detection and decoding

Showing 1 changed file with 18 additions and 8 deletions
oletools/olevba.py
... ... @@ -120,6 +120,7 @@ https://github.com/unixfreak0037/officeparser
120 120 # 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display
121 121 # - display exceptions with stack trace
122 122 # - added several suspicious keywords
  123 +# - improved Base64 detection and decoding
123 124  
124 125 __version__ = '0.24'
125 126  
... ... @@ -127,6 +128,8 @@ __version__ = '0.24'
127 128 # TODO:
128 129 # + do not use logging, but a provided logger (null logger by default)
129 130 # + setup logging (common with other oletools)
  131 +# + add xor bruteforcing like bbharvest
  132 +# + add chr() decoding
130 133  
131 134 # TODO later:
132 135 # + performance improvement: instead of searching each keyword separately,
... ... @@ -249,7 +252,7 @@ SUSPICIOUS_KEYWORDS = {
249 252 ('Lib',),
250 253 'May download files from the Internet':
251 254 #TODO: regex to find urlmon+URLDownloadToFileA on same line
252   - ('URLDownloadToFileA',),
  255 + ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP'),
253 256 'May control another application by simulating user keystrokes':
254 257 ('SendKeys', 'AppActivate'),
255 258 #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx
... ... @@ -303,8 +306,16 @@ re_hex_string = re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')
303 306  
304 307 # regex to detect strings encoded in base64
305 308 #re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?"')
306   -# alternate version from balbuzard:
307   -re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4}){2,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)"')
  309 +# better version from balbuzard, fewer false positives:
  310 +re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4}){1,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)?"')
  311 +# whitelist of common strings that match the base64 regex but are not base64 strings (entries must be all lowercase):
  312 +BASE64_WHITELIST = set(['thisdocument'])
  313 +
  314 +# regex to detect strings encoded with a specific Dridex algorithm
  315 +# (see https://github.com/JamesHabben/MalwareStuff)
  316 +re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"')
  317 +# regex to check that it is not just a hex string:
  318 +re_dridex_check = re.compile(r'[G-Zg-z]')
308 319  
309 320 #--- FUNCTIONS ----------------------------------------------------------------
310 321  
... ... @@ -956,8 +967,10 @@ def detect_base64_strings(vba_code):
956 967 results = []
957 968 found = set()
958 969 for match in re_base64_string.finditer(vba_code):
959   - value = match.group()
960   - if value not in found:
  970 + # extract the base64 string without quotes:
  971 + value = match.group().strip('"')
  972 + # only keep new values and not in the whitelist:
  973 + if value not in found and value.lower() not in BASE64_WHITELIST:
961 974 try:
962 975 decoded = base64.b64decode(value)
963 976 results.append((value, decoded))
... ... @@ -978,9 +991,6 @@ def detect_dridex_strings(vba_code):
978 991 from thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode
979 992 results = []
980 993 found = set()
981   - re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"')
982   - # regex to check that it is not just a hex string:
983   - re_dridex_check = re.compile(r'[G-Zg-z]')
984 994 for match in re_dridex_string.finditer(vba_code):
985 995 value = match.group()[1:-1]
986 996 if not re_dridex_check.search(value):
... ...