Commit f1944c359c5205b83c72519acc2678562114f859

Authored by Philippe Lagadec
1 parent 58773840

olevba: added generic VBA expression deobfuscation (chr,asc,etc) using pyparsing

Showing 1 changed file with 219 additions and 7 deletions
oletools/olevba.py
... ... @@ -138,15 +138,15 @@ https://github.com/unixfreak0037/officeparser
138 138 # parsing errors (issue #7)
139 139 # 2015-05-29 v0.30 PL: - added suspicious keywords suggested by @ozhermit,
140 140 # Davy Douhine (issue #9), issue #13
  141 +# 2015-06-16 v0.31 PL: - added generic VBA expression deobfuscation (chr,asc,etc)
141 142  
142   -__version__ = '0.30'
  143 +__version__ = '0.31'
143 144  
144 145 #------------------------------------------------------------------------------
145 146 # TODO:
146 147 # + do not use logging, but a provided logger (null logger by default)
147 148 # + setup logging (common with other oletools)
148 149 # + add xor bruteforcing like bbharvest
149   -# + add chr() decoding
150 150  
151 151 # TODO later:
152 152 # + performance improvement: instead of searching each keyword separately,
... ... @@ -209,6 +209,11 @@ import thirdparty.olefile as olefile
209 209 from thirdparty.prettytable import prettytable
210 210 from thirdparty.xglob import xglob
211 211  
  212 +# TODO: move to thirdparty
  213 +from pyparsing import *
  214 +
  215 +
  216 +
212 217 #--- CONSTANTS ----------------------------------------------------------------
213 218  
214 219 # URL and message to report issues:
... ... @@ -391,13 +396,15 @@ re_url = re.compile(URL_RE)
# Patterns used to extract IOCs (indicators of compromise) from VBA code.
# Each entry is a tuple (pattern description, compiled regular expression).
RE_PATTERNS = (
    ('URL', re.compile(URL_RE)),
    ('IPv4 address', re.compile(IPv4)),
    # TODO: add IPv6
    # NOTE: the trailing word boundary must be a raw string: a plain '\b' is
    # the backspace character (0x08), which made this pattern match only
    # e-mail addresses immediately followed by a backspace.
    ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@' + SERVER + r'\b')),
    # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')),
    # Executable file name with known extensions (except .com which is present in many URLs, and .application):
    ("Executable file name", re.compile(
        r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b")),
    # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
    # TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types
    # TODO: add win & unix file paths
    #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')),
)
403 410  
... ... @@ -419,6 +426,163 @@ re_dridex_string = re.compile(r&#39;&quot;[0-9A-Za-z]{20,}&quot;&#39;)
419 426 re_nothex_check = re.compile(r'[G-Zg-z]')
420 427  
421 428  
  429 +# === PARTIAL VBA GRAMMAR ====================================================
  430 +
  431 +# REFERENCES:
  432 +# - [MS-VBAL]: VBA Language Specification
  433 +# https://msdn.microsoft.com/en-us/library/dd361851.aspx
  434 +# - pyparsing: http://pyparsing.wikispaces.com/
  435 +
# VBA identifier chars (from MS-VBAL 3.3.5): letters, digits and underscore.
# Used with WordStart() in the grammar below, so that number literals and
# keywords are not matched when they are the tail of a longer identifier.
vba_identifier_chars = alphanums + '_'
  438 +
class VbaExpressionString(str):
    """
    Marker subclass of str: it adds no behaviour of its own.

    Parse actions of the VBA expression grammar (Chr, StrReverse, etc) wrap
    their result in this class, whereas a plain quoted string stays a regular
    str. As a result, isinstance(s, VbaExpressionString) is True only for
    strings that were produced by evaluating a VBA expression.
    (see detect_vba_strings)
    """
  449 +
  450 +
  451 +# --- NUMBER TOKENS ----------------------------------------------------------
  452 +
  453 +# 3.3.2 Number Tokens
  454 +# INTEGER = integer-literal ["%" / "&" / "^"]
  455 +# integer-literal = decimal-literal / octal-literal / hex-literal
  456 +# decimal-literal = 1*decimal-digit
  457 +# octal-literal = "&" [%x004F / %x006F] 1*octal-digit
  458 +# ; & or &o or &O
  459 +# hex-literal = "&" (%x0048 / %x0068) 1*hex-digit
  460 +# ; &h or &H
  461 +# octal-digit = "0" / "1" / "2" / "3" / "4" / "5" / "6" / "7"
  462 +# decimal-digit = octal-digit / "8" / "9"
  463 +# hex-digit = decimal-digit / %x0041-0046 / %x0061-0066 ;A-F / a-f
  464 +
  465 +# NOTE: here Combine() is required to avoid spaces between elements
  466 +# NOTE: here WordStart is necessary to avoid matching a number preceded by
  467 +# letters or underscore (e.g. "VBT1" or "ABC_34"), when using scanString
# Decimal literal: digits not preceded by an identifier character, followed by
# an optional type suffix (%, & or ^) which is matched but discarded.
decimal_literal = Combine(
    WordStart(vba_identifier_chars)
    + Word(nums)
    + Suppress(Optional(Word('%&^', exact=1)))
).setParseAction(lambda toks: int(toks[0]))

# Octal literal: "&" or "&o"/"&O" prefix (discarded), octal digits, optional
# type suffix (discarded).
octal_literal = Combine(
    Suppress(Literal('&') + Optional((CaselessLiteral('o'))))
    + Word(srange('[0-7]'))
    + Suppress(Optional(Word('%&^', exact=1)))
).setParseAction(lambda toks: int(toks[0], 8))

# Hexadecimal literal: "&h"/"&H" prefix (discarded), hex digits, optional type
# suffix (discarded).
hex_literal = Combine(
    Suppress(CaselessLiteral('&h'))
    + Word(srange('[0-9a-fA-F]'))
    + Suppress(Optional(Word('%&^', exact=1)))
).setParseAction(lambda toks: int(toks[0], 16))

# Any integer literal; alternatives are tried in this order, first match wins.
integer = MatchFirst([decimal_literal, octal_literal, hex_literal])
  481 +
  482 +
  483 +# --- QUOTED STRINGS ---------------------------------------------------------
  484 +
  485 +# 3.3.4 String Tokens
  486 +# STRING = double-quote *string-character (double-quote / line-continuation / LINE-END)
  487 +# double-quote = %x0022 ; "
  488 +# string-character = NO-LINE-CONTINUATION ((double-quote double-quote) / non-line-termination-character)
  489 +
# Double-quoted VBA string literal; a doubled quote ("") inside the literal is
# an escaped quote. The token is converted to a plain str (not a
# VbaExpressionString), so that simple literals can be told apart from
# evaluated expressions.
quoted_string = QuotedString('"', escQuote='""').setParseAction(
    lambda toks: str(toks[0]))
  492 +
  493 +
  494 +#--- VBA Expressions ---------------------------------------------------------
  495 +
  496 +# See MS-VBAL 5.6 Expressions
  497 +
# need to pre-declare using Forward() because it is recursive:
# string functions such as Chr() take an integer expression as argument, and
# integer functions such as Asc() take a string expression.
# The actual grammars are attached further below with the "<<=" operator.
# VBA string expression and integer expression
vba_expr_str = Forward()
vba_expr_int = Forward()
  502 +
  503 +# --- CHR --------------------------------------------------------------------
  504 +
# Chr, Chr$, ChrB, ChrW(int) => char
def vba_chr_to_str(s, loc, toks):
    """
    Parse action for Chr(): convert the integer argument to a one-character
    VbaExpressionString.

    :param s: str, the whole string being parsed
    :param loc: int, location of the match in s
    :param toks: parsed tokens, toks[0] is the evaluated integer argument
    :return: VbaExpressionString containing the corresponding character
    :raises ParseException: if the argument is not a valid character code.
        pyparsing then treats the expression as a non-match and carries on,
        instead of aborting the whole scan with a ValueError/OverflowError.
    """
    code = toks[0]
    try:
        return VbaExpressionString(chr(code))
    except (ValueError, OverflowError):
        # e.g. negative value, or value above the range supported by chr()
        raise ParseException(s, loc, 'Chr() argument out of range: %r' % (code,))


vba_chr = Suppress(
    Combine(WordStart(vba_identifier_chars) + CaselessLiteral('Chr')
            + Optional(CaselessLiteral('B') | CaselessLiteral('W')) + Optional('$'))
    + '(') + vba_expr_int + Suppress(')')
vba_chr.setParseAction(vba_chr_to_str)
  511 +
  512 +
  513 +# --- ASC --------------------------------------------------------------------
  514 +
# Asc(char) => int
#TODO: see MS-VBAL 6.1.2.11.1.1 page 240 => AscB, AscW
def vba_asc_to_int(s, loc, toks):
    """
    Parse action for Asc(): return the character code of the FIRST character
    of the string argument, as VBA does (the rest of the string is ignored).

    :param s: str, the whole string being parsed
    :param loc: int, location of the match in s
    :param toks: parsed tokens, toks[0] is the evaluated string argument
    :return: int, code of the first character
    :raises ParseException: if the string is empty (a run-time error in VBA),
        so that pyparsing treats the expression as a non-match instead of
        aborting the whole scan with a TypeError.
    """
    text = toks[0]
    if not text:
        raise ParseException(s, loc, 'Asc() called with an empty string')
    return ord(text[0])


vba_asc = Suppress(CaselessKeyword('Asc') + '(') + vba_expr_str + Suppress(')')
vba_asc.setParseAction(vba_asc_to_int)
  519 +
  520 +
  521 +# --- VAL --------------------------------------------------------------------
  522 +
# Val(string) => int
# TODO: make sure the behavior of VBA's val is fully covered
#       (fractional and exponent parts are currently ignored: only the
#       integer prefix is used, since the result feeds integer expressions)

# Numeric prefix recognised by Val(): &H hex, &O octal, or signed decimal
re_vba_val_number = re.compile(
    r'(?i)(?:&h(?P<hex>[0-9a-f]+)|&o(?P<oct>[0-7]+)|(?P<dec>[+-]?[0-9]+))')


def vba_val_to_int(s, loc, toks):
    """
    Parse action for Val(): convert the string argument to an integer the way
    VBA does, i.e. whitespace is removed, the longest numeric prefix is read,
    and anything after it is ignored. A string without any numeric prefix
    evaluates to 0 (it does not raise an error).

    :param s: str, the whole string being parsed (unused)
    :param loc: int, location of the match in s (unused)
    :param toks: parsed tokens, toks[0] is the evaluated string argument
    :return: int
    """
    # remove all whitespace, including inside the number ("1 2 3" => 123)
    text = ''.join(toks[0].split())
    match = re_vba_val_number.match(text)
    if match is None:
        return 0
    if match.group('hex'):
        return int(match.group('hex'), 16)
    if match.group('oct'):
        return int(match.group('oct'), 8)
    return int(match.group('dec'))


vba_val = Suppress(CaselessKeyword('Val') + '(') + vba_expr_str + Suppress(')')
vba_val.setParseAction(vba_val_to_int)
  527 +
  528 +
  529 +# --- StrReverse() --------------------------------------------------------------------
  530 +
# StrReverse(string) => string with its characters in reverse order
strReverse = (
    Suppress(CaselessKeyword('StrReverse') + '(')
    + vba_expr_str
    + Suppress(')')
).setParseAction(
    lambda toks: VbaExpressionString(''.join(reversed(str(toks[0])))))
  534 +
  535 +
  536 +# --- ENVIRON() --------------------------------------------------------------------
  537 +
# Environ("name") is not evaluated: it is simply rewritten as "%name%", which
# is enough for malware analysis
environ = (
    Suppress(CaselessKeyword('Environ') + '(')
    + vba_expr_str
    + Suppress(')')
).setParseAction(
    lambda toks: VbaExpressionString('%' + toks[0] + '%'))
  541 +
  542 +
  543 +# ---STRING EXPRESSION -------------------------------------------------------
  544 +
def concat_strings_list(tokens):
    """
    Parse action: evaluate a VBA string concatenation ('+' or '&' operators).

    :param tokens: parse results whose first item is the flat sequence
        [a, op, b, op, c, ...] of operands alternating with operators
    :return: VbaExpressionString, all the operands joined together
    """
    sequence = tokens[0]
    # operands sit at even positions, operators at odd positions
    operands = [sequence[pos] for pos in range(0, len(sequence), 2)]
    joined = ''.join(operands)
    return VbaExpressionString(joined)
  553 +
  554 +
# Operand of a string expression: a string function call or a quoted literal.
# Alternatives are tried in this order and the first match wins.
vba_expr_str_item = (vba_chr | strReverse | environ | quoted_string)

# String expression: operands concatenated with '+' or '&'.
# The operator list is ordered by precedence, so '+' is grouped before '&';
# both operators use the same parse action (concatenation).
# "<<=" attaches this grammar to the Forward() placeholder declared above.
vba_expr_str <<= infixNotation(vba_expr_str_item,
    [
        ("+", 2, opAssoc.LEFT, concat_strings_list),
        ("&", 2, opAssoc.LEFT, concat_strings_list),
    ])
  562 +
  563 +
  564 +# ---INTEGER EXPRESSION ------------------------------------------------------
  565 +
def sum_ints_list(tokens):
    """
    Parse action: evaluate a VBA integer addition (operator '+').

    :param tokens: parse results whose first item is the flat sequence
        [a, '+', b, '+', c, ...] of integers alternating with the operator
    :return: int, sum of all the operands
    """
    total = 0
    # operands sit at even positions, '+' operators at odd positions
    for operand in tokens[0][::2]:
        total += operand
    return total
  574 +
  575 +
# Operand of an integer expression: an integer function call or a literal.
# Alternatives are tried in this order and the first match wins.
vba_expr_int_item = (vba_asc | vba_val | integer)

# Integer expression: operands added with '+' (the only operator supported).
# "<<=" attaches this grammar to the Forward() placeholder declared above.
vba_expr_int <<= infixNotation(vba_expr_int_item,
    [
        ("+", 2, opAssoc.LEFT, sum_ints_list),
    ])
  582 +
  583 +
  584 +# see detect_vba_strings for the deobfuscation code using this grammar
  585 +
422 586 # === MSO/ActiveMime files parsing ===========================================
423 587  
424 588 def is_mso_file(data):
... ... @@ -1186,6 +1350,41 @@ def detect_dridex_strings(vba_code):
1186 1350 return results
1187 1351  
1188 1352  
def detect_vba_strings(vba_code):
    """
    Detect if the VBA code contains strings obfuscated with VBA expressions
    using keywords such as Chr, Asc, Val, StrReverse, etc.

    Scanning is best-effort: if the grammar raises an unexpected exception
    (e.g. from a parse action, or because of too deeply nested expressions),
    the error is logged and the strings found so far are returned, so that a
    single malformed expression does not abort the whole analysis.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string), without
        duplicates, in order of first appearance
    """
    results = []
    found = set()
    # IMPORTANT: to extract the actual VBA expressions found in the code,
    # we must expand tabs to have the same string as pyparsing.
    # Otherwise, start and end offsets are incorrect.
    vba_code = vba_code.expandtabs()
    try:
        for tokens, start, end in vba_expr_str.scanString(vba_code):
            decoded = tokens[0]
            if not isinstance(decoded, VbaExpressionString):
                # plain quoted string, not a VBA expression
                continue
            encoded = vba_code[start:end]
            # avoid duplicates and simple strings:
            if encoded in found or decoded == encoded:
                continue
            found.add(encoded)
            results.append((encoded, decoded))
    except Exception:
        # scanString is a generator: once it has raised, scanning cannot be
        # resumed, so keep what was decoded before the error.
        import logging
        logging.exception('Error while parsing VBA expressions, '
                          'results may be incomplete')
    return results
  1386 +
  1387 +
1189 1388 class VBA_Scanner(object):
1190 1389 """
1191 1390 Class to scan the source code of a VBA module to find obfuscated strings,
... ... @@ -1204,6 +1403,7 @@ class VBA_Scanner(object):
1204 1403 self.code_rev_hex = ''
1205 1404 self.code_base64 = ''
1206 1405 self.code_dridex = ''
  1406 + self.code_vba = ''
1207 1407  
1208 1408  
1209 1409 def scan(self, include_decoded_strings=False):
... ... @@ -1240,6 +1440,10 @@ class VBA_Scanner(object):
1240 1440 self.dridex_strings = detect_dridex_strings(self.code)
1241 1441 for encoded, decoded in self.dridex_strings:
1242 1442 self.code_dridex += '\n' + decoded
  1443 + # Detect obfuscated strings in VBA expressions
  1444 + self.vba_strings = detect_vba_strings(self.code)
  1445 + for encoded, decoded in self.vba_strings:
  1446 + self.code_vba += '\n' + decoded
1243 1447 results = []
1244 1448 self.autoexec_keywords = []
1245 1449 self.suspicious_keywords = []
... ... @@ -1252,6 +1456,7 @@ class VBA_Scanner(object):
1252 1456 (self.code_rev_hex, 'StrReverse+Hex'),
1253 1457 (self.code_base64, 'Base64'),
1254 1458 (self.code_dridex, 'Dridex'),
  1459 + (self.code_vba, 'VBA expression'),
1255 1460 ):
1256 1461 self.autoexec_keywords += detect_autoexec(code, obfuscation)
1257 1462 self.suspicious_keywords += detect_suspicious(code, obfuscation)
... ... @@ -1267,6 +1472,9 @@ class VBA_Scanner(object):
1267 1472 if self.dridex_strings:
1268 1473 self.suspicious_keywords.append(('Dridex Strings',
1269 1474 'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
  1475 + if self.vba_strings:
  1476 + self.suspicious_keywords.append(('VBA obfuscated Strings',
  1477 + 'VBA string expressions were detected, may be used to obfuscate strings (option --decode to see all)'))
1270 1478 for keyword, description in self.autoexec_keywords:
1271 1479 results.append(('AutoExec', keyword, description))
1272 1480 for keyword, description in self.suspicious_keywords:
... ... @@ -1275,11 +1483,13 @@ class VBA_Scanner(object):
1275 1483 results.append(('IOC', value, pattern_type))
1276 1484 if include_decoded_strings:
1277 1485 for encoded, decoded in self.hex_strings:
1278   - results.append(('Hex String', repr(decoded), encoded))
  1486 + results.append(('Hex String', repr(decoded), repr(encoded)))
1279 1487 for encoded, decoded in self.base64_strings:
1280   - results.append(('Base64 String', repr(decoded), encoded))
  1488 + results.append(('Base64 String', repr(decoded), repr(encoded)))
1281 1489 for encoded, decoded in self.dridex_strings:
1282   - results.append(('Dridex string', repr(decoded), encoded))
  1490 + results.append(('Dridex string', repr(decoded), repr(encoded)))
  1491 + for encoded, decoded in self.vba_strings:
  1492 + results.append(('VBA string', repr(decoded), repr(encoded)))
1283 1493 return results
1284 1494  
1285 1495 def scan_summary(self):
... ... @@ -1821,7 +2031,9 @@ def main():
1821 2031 parser.add_option("-i", "--input", dest='input', type='str', default=None,
1822 2032 help='input file containing VBA source code to be analyzed (no parsing)')
1823 2033 parser.add_option("--decode", action="store_true", dest="show_decoded_strings",
1824   - help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex).')
  2034 + help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex, VBA).')
  2035 +
  2036 + # TODO: --novba to disable VBA expressions parsing
1825 2037  
1826 2038 (options, args) = parser.parse_args()
1827 2039  
... ...