Commit 67f0725b7be892dcda47317ee5e180dce881d410

Authored by Philippe Lagadec
1 parent 8f37786d

olevba: improved Base64 detection and decoding

Showing 1 changed file with 18 additions and 8 deletions
oletools/olevba.py
@@ -120,6 +120,7 @@ https://github.com/unixfreak0037/officeparser @@ -120,6 +120,7 @@ https://github.com/unixfreak0037/officeparser
120 # 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display 120 # 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display
121 # - display exceptions with stack trace 121 # - display exceptions with stack trace
122 # - added several suspicious keywords 122 # - added several suspicious keywords
  123 +# - improved Base64 detection and decoding
123 124
124 __version__ = '0.24' 125 __version__ = '0.24'
125 126
@@ -127,6 +128,8 @@ __version__ = '0.24' @@ -127,6 +128,8 @@ __version__ = '0.24'
127 # TODO: 128 # TODO:
128 # + do not use logging, but a provided logger (null logger by default) 129 # + do not use logging, but a provided logger (null logger by default)
129 # + setup logging (common with other oletools) 130 # + setup logging (common with other oletools)
  131 +# + add xor bruteforcing like bbharvest
  132 +# + add chr() decoding
130 133
131 # TODO later: 134 # TODO later:
132 # + performance improvement: instead of searching each keyword separately, 135 # + performance improvement: instead of searching each keyword separately,
@@ -249,7 +252,7 @@ SUSPICIOUS_KEYWORDS = { @@ -249,7 +252,7 @@ SUSPICIOUS_KEYWORDS = {
249 ('Lib',), 252 ('Lib',),
250 'May download files from the Internet': 253 'May download files from the Internet':
251 #TODO: regex to find urlmon+URLDownloadToFileA on same line 254 #TODO: regex to find urlmon+URLDownloadToFileA on same line
252 - ('URLDownloadToFileA',), 255 + ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP'),
253 'May control another application by simulating user keystrokes': 256 'May control another application by simulating user keystrokes':
254 ('SendKeys', 'AppActivate'), 257 ('SendKeys', 'AppActivate'),
255 #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx 258 #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx
@@ -303,8 +306,16 @@ re_hex_string = re.compile(r'(?:[0-9A-Fa-f]{2}){4,}') @@ -303,8 +306,16 @@ re_hex_string = re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')
303 306
304 # regex to detect strings encoded in base64 307 # regex to detect strings encoded in base64
305 #re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?"') 308 #re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?"')
306 -# alternate version from balbuzard:  
307 -re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4}){2,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)"') 309 +# better version from balbuzard, fewer false positives:
  310 +re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4}){1,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)?"')
  311 +# whitelist of common strings that match the base64 regex but are not base64 strings (all lowercase):
  312 +BASE64_WHITELIST = set(['thisdocument'])
  313 +
  314 +# regex to detect strings encoded with a specific Dridex algorithm
  315 +# (see https://github.com/JamesHabben/MalwareStuff)
  316 +re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"')
  317 +# regex to check that it is not just a hex string:
  318 +re_dridex_check = re.compile(r'[G-Zg-z]')
308 319
309 #--- FUNCTIONS ---------------------------------------------------------------- 320 #--- FUNCTIONS ----------------------------------------------------------------
310 321
@@ -956,8 +967,10 @@ def detect_base64_strings(vba_code): @@ -956,8 +967,10 @@ def detect_base64_strings(vba_code):
956 results = [] 967 results = []
957 found = set() 968 found = set()
958 for match in re_base64_string.finditer(vba_code): 969 for match in re_base64_string.finditer(vba_code):
959 - value = match.group()  
960 - if value not in found: 970 + # extract the base64 string without quotes:
  971 + value = match.group().strip('"')
  972 + # only keep new values and not in the whitelist:
  973 + if value not in found and value.lower() not in BASE64_WHITELIST:
961 try: 974 try:
962 decoded = base64.b64decode(value) 975 decoded = base64.b64decode(value)
963 results.append((value, decoded)) 976 results.append((value, decoded))
@@ -978,9 +991,6 @@ def detect_dridex_strings(vba_code): @@ -978,9 +991,6 @@ def detect_dridex_strings(vba_code):
978 from thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode 991 from thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode
979 results = [] 992 results = []
980 found = set() 993 found = set()
981 - re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"')  
982 - # regex to check that it is not just a hex string:  
983 - re_dridex_check = re.compile(r'[G-Zg-z]')  
984 for match in re_dridex_string.finditer(vba_code): 994 for match in re_dridex_string.finditer(vba_code):
985 value = match.group()[1:-1] 995 value = match.group()[1:-1]
986 if not re_dridex_check.search(value): 996 if not re_dridex_check.search(value):