Commit f1944c359c5205b83c72519acc2678562114f859
1 parent
58773840
olevba: added generic VBA expression deobfuscation (chr,asc,etc) using pyparsing
Showing
1 changed file
with
219 additions
and
7 deletions
oletools/olevba.py
| ... | ... | @@ -138,15 +138,15 @@ https://github.com/unixfreak0037/officeparser |
| 138 | 138 | # parsing errors (issue #7) |
| 139 | 139 | # 2015-05-29 v0.30 PL: - added suspicious keywords suggested by @ozhermit, |
| 140 | 140 | # Davy Douhine (issue #9), issue #13 |
| 141 | +# 2015-06-16 v0.31 PL: - added generic VBA expression deobfuscation (chr,asc,etc) | |
| 141 | 142 | |
| 142 | -__version__ = '0.30' | |
| 143 | +__version__ = '0.31' | |
| 143 | 144 | |
| 144 | 145 | #------------------------------------------------------------------------------ |
| 145 | 146 | # TODO: |
| 146 | 147 | # + do not use logging, but a provided logger (null logger by default) |
| 147 | 148 | # + setup logging (common with other oletools) |
| 148 | 149 | # + add xor bruteforcing like bbharvest |
| 149 | -# + add chr() decoding | |
| 150 | 150 | |
| 151 | 151 | # TODO later: |
| 152 | 152 | # + performance improvement: instead of searching each keyword separately, |
| ... | ... | @@ -209,6 +209,11 @@ import thirdparty.olefile as olefile |
| 209 | 209 | from thirdparty.prettytable import prettytable |
| 210 | 210 | from thirdparty.xglob import xglob |
| 211 | 211 | |
| 212 | +# TODO: move to thirdparty | |
| 213 | +from pyparsing import * | |
| 214 | + | |
| 215 | + | |
| 216 | + | |
| 212 | 217 | #--- CONSTANTS ---------------------------------------------------------------- |
| 213 | 218 | |
| 214 | 219 | # URL and message to report issues: |
| ... | ... | @@ -391,13 +396,15 @@ re_url = re.compile(URL_RE) |
| 391 | 396 | RE_PATTERNS = ( |
| 392 | 397 | ('URL', re.compile(URL_RE)), |
| 393 | 398 | ('IPv4 address', re.compile(IPv4)), |
| 399 | + # TODO: add IPv6 | |
| 394 | 400 | ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@' + SERVER + '\b')), |
| 395 | 401 | # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')), |
| 396 | 402 | # Executable file name with known extensions (except .com which is present in many URLs, and .application): |
| 397 | 403 | ("Executable file name", re.compile( |
| 398 | 404 | r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b")), |
| 399 | 405 | # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/ |
| 400 | - #TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types | |
| 406 | + # TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types | |
| 407 | + # TODO: add win & unix file paths | |
| 401 | 408 | #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')), |
| 402 | 409 | ) |
| 403 | 410 | |
| ... | ... | @@ -419,6 +426,163 @@ re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"') |
| 419 | 426 | re_nothex_check = re.compile(r'[G-Zg-z]') |
| 420 | 427 | |
| 421 | 428 | |
| 429 | +# === PARTIAL VBA GRAMMAR ==================================================== | |
| 430 | + | |
| 431 | +# REFERENCES: | |
| 432 | +# - [MS-VBAL]: VBA Language Specification | |
| 433 | +# https://msdn.microsoft.com/en-us/library/dd361851.aspx | |
| 434 | +# - pyparsing: http://pyparsing.wikispaces.com/ | |
| 435 | + | |
| 436 | +# VBA identifier chars (from MS-VBAL 3.3.5) | |
| 437 | +vba_identifier_chars = alphanums + '_' | |
| 438 | + | |
| 439 | +class VbaExpressionString(str): | |
| 440 | + """ | |
| 441 | + Class identical to str, used to distinguish plain strings from strings | |
| 442 | + obfuscated using VBA expressions (Chr, StrReverse, etc) | |
| 443 | + Usage: each VBA expression parse action should convert strings to | |
| 444 | + VbaExpressionString. | |
| 445 | + Then isinstance(s, VbaExpressionString) is True only for VBA expressions. | |
| 446 | + (see detect_vba_strings) | |
| 447 | + """ | |
| 448 | + pass | |
| 449 | + | |
| 450 | + | |
| 451 | +# --- NUMBER TOKENS ---------------------------------------------------------- | |
| 452 | + | |
| 453 | +# 3.3.2 Number Tokens | |
| 454 | +# INTEGER = integer-literal ["%" / "&" / "^"] | |
| 455 | +# integer-literal = decimal-literal / octal-literal / hex-literal | |
| 456 | +# decimal-literal = 1*decimal-digit | |
| 457 | +# octal-literal = "&" [%x004F / %x006F] 1*octal-digit | |
| 458 | +# ; & or &o or &O | |
| 459 | +# hex-literal = "&" (%x0048 / %x0068) 1*hex-digit | |
| 460 | +# ; &h or &H | |
| 461 | +# octal-digit = "0" / "1" / "2" / "3" / "4" / "5" / "6" / "7" | |
| 462 | +# decimal-digit = octal-digit / "8" / "9" | |
| 463 | +# hex-digit = decimal-digit / %x0041-0046 / %x0061-0066 ;A-F / a-f | |
| 464 | + | |
| 465 | +# NOTE: here Combine() is required to avoid spaces between elements | |
| 466 | +# NOTE: here WordStart is necessary to avoid matching a number preceded by | |
| 467 | +# letters or underscore (e.g. "VBT1" or "ABC_34"), when using scanString | |
| 468 | +decimal_literal = Combine(WordStart(vba_identifier_chars) + Word(nums) | |
| 469 | + + Suppress(Optional(Word('%&^', exact=1)))) | |
| 470 | +decimal_literal.setParseAction(lambda t: int(t[0])) | |
| 471 | + | |
| 472 | +octal_literal = Combine(Suppress(Literal('&') + Optional((CaselessLiteral('o')))) + Word(srange('[0-7]')) | |
| 473 | + + Suppress(Optional(Word('%&^', exact=1)))) | |
| 474 | +octal_literal.setParseAction(lambda t: int(t[0], base=8)) | |
| 475 | + | |
| 476 | +hex_literal = Combine(Suppress(CaselessLiteral('&h')) + Word(srange('[0-9a-fA-F]')) | |
| 477 | + + Suppress(Optional(Word('%&^', exact=1)))) | |
| 478 | +hex_literal.setParseAction(lambda t: int(t[0], base=16)) | |
| 479 | + | |
| 480 | +integer = decimal_literal | octal_literal | hex_literal | |
| 481 | + | |
| 482 | + | |
| 483 | +# --- QUOTED STRINGS --------------------------------------------------------- | |
| 484 | + | |
| 485 | +# 3.3.4 String Tokens | |
| 486 | +# STRING = double-quote *string-character (double-quote / line-continuation / LINE-END) | |
| 487 | +# double-quote = %x0022 ; " | |
| 488 | +# string-character = NO-LINE-CONTINUATION ((double-quote double-quote) / non-line-termination-character) | |
| 489 | + | |
| 490 | +quoted_string = QuotedString('"', escQuote='""') | |
| 491 | +quoted_string.setParseAction(lambda t: str(t[0])) | |
| 492 | + | |
| 493 | + | |
| 494 | +#--- VBA Expressions --------------------------------------------------------- | |
| 495 | + | |
| 496 | +# See MS-VBAL 5.6 Expressions | |
| 497 | + | |
| 498 | +# need to pre-declare using Forward() because it is recursive | |
| 499 | +# VBA string expression and integer expression | |
| 500 | +vba_expr_str = Forward() | |
| 501 | +vba_expr_int = Forward() | |
| 502 | + | |
| 503 | +# --- CHR -------------------------------------------------------------------- | |
| 504 | + | |
| 505 | +# Chr, Chr$, ChrB, ChrW(int) => char | |
| 506 | +vba_chr = Suppress( | |
| 507 | + Combine(WordStart(vba_identifier_chars) + CaselessLiteral('Chr') | |
| 508 | + + Optional(CaselessLiteral('B') | CaselessLiteral('W')) + Optional('$')) | |
| 509 | + + '(') + vba_expr_int + Suppress(')') | |
| 510 | +vba_chr.setParseAction(lambda t: VbaExpressionString(chr(t[0]))) | |
| 511 | + | |
| 512 | + | |
| 513 | +# --- ASC -------------------------------------------------------------------- | |
| 514 | + | |
| 515 | +# Asc(char) => int | |
| 516 | +#TODO: see MS-VBAL 6.1.2.11.1.1 page 240 => AscB, AscW | |
| 517 | +vba_asc = Suppress(CaselessKeyword('Asc') + '(') + vba_expr_str + Suppress(')') | |
| 518 | +vba_asc.setParseAction(lambda t: ord(t[0])) | |
| 519 | + | |
| 520 | + | |
| 521 | +# --- VAL -------------------------------------------------------------------- | |
| 522 | + | |
| 523 | +# Val(string) => int | |
| 524 | +# TODO: make sure the behavior of VBA's val is fully covered | |
| 525 | +vba_val = Suppress(CaselessKeyword('Val') + '(') + vba_expr_str + Suppress(')') | |
| 526 | +vba_val.setParseAction(lambda t: int(t[0].strip())) | |
| 527 | + | |
| 528 | + | |
| 529 | +# --- StrReverse() -------------------------------------------------------------------- | |
| 530 | + | |
| 531 | +# StrReverse(string) => string | |
| 532 | +strReverse = Suppress(CaselessKeyword('StrReverse') + '(') + vba_expr_str + Suppress(')') | |
| 533 | +strReverse.setParseAction(lambda t: VbaExpressionString(str(t[0])[::-1])) | |
| 534 | + | |
| 535 | + | |
| 536 | +# --- ENVIRON() -------------------------------------------------------------------- | |
| 537 | + | |
| 538 | +# Environ("name") => just translated to "%name%", that is enough for malware analysis | |
| 539 | +environ = Suppress(CaselessKeyword('Environ') + '(') + vba_expr_str + Suppress(')') | |
| 540 | +environ.setParseAction(lambda t: VbaExpressionString('%%%s%%' % t[0])) | |
| 541 | + | |
| 542 | + | |
| 543 | +# ---STRING EXPRESSION ------------------------------------------------------- | |
| 544 | + | |
| 545 | +def concat_strings_list(tokens): | |
| 546 | + """ | |
| 547 | + parse action to concatenate strings in a VBA expression with operators '+' or '&' | |
| 548 | + """ | |
| 549 | + # extract argument from the tokens: | |
| 550 | + # expected to be a tuple containing a list of strings such as [a,'&',b,'&',c,...] | |
| 551 | + strings = tokens[0][::2] | |
| 552 | + return VbaExpressionString(''.join(strings)) | |
| 553 | + | |
| 554 | + | |
| 555 | +vba_expr_str_item = (vba_chr | strReverse | environ | quoted_string) | |
| 556 | + | |
| 557 | +vba_expr_str <<= infixNotation(vba_expr_str_item, | |
| 558 | + [ | |
| 559 | + ("+", 2, opAssoc.LEFT, concat_strings_list), | |
| 560 | + ("&", 2, opAssoc.LEFT, concat_strings_list), | |
| 561 | + ]) | |
| 562 | + | |
| 563 | + | |
| 564 | +# ---INTEGER EXPRESSION ------------------------------------------------------ | |
| 565 | + | |
| 566 | +def sum_ints_list(tokens): | |
| 567 | + """ | |
| 568 | + parse action to sum integers in a VBA expression with operator '+' | |
| 569 | + """ | |
| 570 | + # extract argument from the tokens: | |
| 571 | + # expected to be a tuple containing a list of integers such as [a,'+',b,'+',c,...] | |
| 572 | + integers = tokens[0][::2] | |
| 573 | + return sum(integers) | |
| 574 | + | |
| 575 | + | |
| 576 | +vba_expr_int_item = (vba_asc | vba_val | integer) | |
| 577 | + | |
| 578 | +vba_expr_int <<= infixNotation(vba_expr_int_item, | |
| 579 | + [ | |
| 580 | + ("+", 2, opAssoc.LEFT, sum_ints_list), | |
| 581 | + ]) | |
| 582 | + | |
| 583 | + | |
| 584 | +# see detect_vba_strings for the deobfuscation code using this grammar | |
| 585 | + | |
| 422 | 586 | # === MSO/ActiveMime files parsing =========================================== |
| 423 | 587 | |
| 424 | 588 | def is_mso_file(data): |
| ... | ... | @@ -1186,6 +1350,41 @@ def detect_dridex_strings(vba_code): |
| 1186 | 1350 | return results |
| 1187 | 1351 | |
| 1188 | 1352 | |
| 1353 | +def detect_vba_strings(vba_code): | |
| 1354 | + """ | |
| 1355 | + Detect if the VBA code contains strings obfuscated with VBA expressions | |
| 1356 | + using keywords such as Chr, Asc, Val, StrReverse, etc. | |
| 1357 | + | |
| 1358 | + :param vba_code: str, VBA source code | |
| 1359 | + :return: list of str tuples (encoded string, decoded string) | |
| 1360 | + """ | |
| 1361 | + # TODO: handle exceptions | |
| 1362 | + results = [] | |
| 1363 | + found = set() | |
| 1364 | + # IMPORTANT: to extract the actual VBA expressions found in the code, | |
| 1365 | + # we must expand tabs to have the same string as pyparsing. | |
| 1366 | + # Otherwise, start and end offsets are incorrect. | |
| 1367 | + vba_code = vba_code.expandtabs() | |
| 1368 | + for tokens, start, end in vba_expr_str.scanString(vba_code): | |
| 1369 | + encoded = vba_code[start:end] | |
| 1370 | + decoded = tokens[0] | |
| 1371 | + if isinstance(decoded, VbaExpressionString): | |
| 1372 | + # This is a VBA expression, not a simple string | |
| 1373 | + # print 'VBA EXPRESSION: encoded=%r => decoded=%r' % (encoded, decoded) | |
| 1374 | + # remove parentheses and quotes from original string: | |
| 1375 | + # if encoded.startswith('(') and encoded.endswith(')'): | |
| 1376 | + # encoded = encoded[1:-1] | |
| 1377 | + # if encoded.startswith('"') and encoded.endswith('"'): | |
| 1378 | + # encoded = encoded[1:-1] | |
| 1379 | + # avoid duplicates and simple strings: | |
| 1380 | + if encoded not in found and decoded != encoded: | |
| 1381 | + results.append((encoded, decoded)) | |
| 1382 | + found.add(encoded) | |
| 1383 | + # else: | |
| 1384 | + # print 'VBA STRING: encoded=%r => decoded=%r' % (encoded, decoded) | |
| 1385 | + return results | |
| 1386 | + | |
| 1387 | + | |
| 1189 | 1388 | class VBA_Scanner(object): |
| 1190 | 1389 | """ |
| 1191 | 1390 | Class to scan the source code of a VBA module to find obfuscated strings, |
| ... | ... | @@ -1204,6 +1403,7 @@ class VBA_Scanner(object): |
| 1204 | 1403 | self.code_rev_hex = '' |
| 1205 | 1404 | self.code_base64 = '' |
| 1206 | 1405 | self.code_dridex = '' |
| 1406 | + self.code_vba = '' | |
| 1207 | 1407 | |
| 1208 | 1408 | |
| 1209 | 1409 | def scan(self, include_decoded_strings=False): |
| ... | ... | @@ -1240,6 +1440,10 @@ class VBA_Scanner(object): |
| 1240 | 1440 | self.dridex_strings = detect_dridex_strings(self.code) |
| 1241 | 1441 | for encoded, decoded in self.dridex_strings: |
| 1242 | 1442 | self.code_dridex += '\n' + decoded |
| 1443 | + # Detect obfuscated strings in VBA expressions | |
| 1444 | + self.vba_strings = detect_vba_strings(self.code) | |
| 1445 | + for encoded, decoded in self.vba_strings: | |
| 1446 | + self.code_vba += '\n' + decoded | |
| 1243 | 1447 | results = [] |
| 1244 | 1448 | self.autoexec_keywords = [] |
| 1245 | 1449 | self.suspicious_keywords = [] |
| ... | ... | @@ -1252,6 +1456,7 @@ class VBA_Scanner(object): |
| 1252 | 1456 | (self.code_rev_hex, 'StrReverse+Hex'), |
| 1253 | 1457 | (self.code_base64, 'Base64'), |
| 1254 | 1458 | (self.code_dridex, 'Dridex'), |
| 1459 | + (self.code_vba, 'VBA expression'), | |
| 1255 | 1460 | ): |
| 1256 | 1461 | self.autoexec_keywords += detect_autoexec(code, obfuscation) |
| 1257 | 1462 | self.suspicious_keywords += detect_suspicious(code, obfuscation) |
| ... | ... | @@ -1267,6 +1472,9 @@ class VBA_Scanner(object): |
| 1267 | 1472 | if self.dridex_strings: |
| 1268 | 1473 | self.suspicious_keywords.append(('Dridex Strings', |
| 1269 | 1474 | 'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) |
| 1475 | + if self.vba_strings: | |
| 1476 | + self.suspicious_keywords.append(('VBA obfuscated Strings', | |
| 1477 | + 'VBA string expressions were detected, may be used to obfuscate strings (option --decode to see all)')) | |
| 1270 | 1478 | for keyword, description in self.autoexec_keywords: |
| 1271 | 1479 | results.append(('AutoExec', keyword, description)) |
| 1272 | 1480 | for keyword, description in self.suspicious_keywords: |
| ... | ... | @@ -1275,11 +1483,13 @@ class VBA_Scanner(object): |
| 1275 | 1483 | results.append(('IOC', value, pattern_type)) |
| 1276 | 1484 | if include_decoded_strings: |
| 1277 | 1485 | for encoded, decoded in self.hex_strings: |
| 1278 | - results.append(('Hex String', repr(decoded), encoded)) | |
| 1486 | + results.append(('Hex String', repr(decoded), repr(encoded))) | |
| 1279 | 1487 | for encoded, decoded in self.base64_strings: |
| 1280 | - results.append(('Base64 String', repr(decoded), encoded)) | |
| 1488 | + results.append(('Base64 String', repr(decoded), repr(encoded))) | |
| 1281 | 1489 | for encoded, decoded in self.dridex_strings: |
| 1282 | - results.append(('Dridex string', repr(decoded), encoded)) | |
| 1490 | + results.append(('Dridex string', repr(decoded), repr(encoded))) | |
| 1491 | + for encoded, decoded in self.vba_strings: | |
| 1492 | + results.append(('VBA string', repr(decoded), repr(encoded))) | |
| 1283 | 1493 | return results |
| 1284 | 1494 | |
| 1285 | 1495 | def scan_summary(self): |
| ... | ... | @@ -1821,7 +2031,9 @@ def main(): |
| 1821 | 2031 | parser.add_option("-i", "--input", dest='input', type='str', default=None, |
| 1822 | 2032 | help='input file containing VBA source code to be analyzed (no parsing)') |
| 1823 | 2033 | parser.add_option("--decode", action="store_true", dest="show_decoded_strings", |
| 1824 | - help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex).') | |
| 2034 | + help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex, VBA).') | |
| 2035 | + | |
| 2036 | + # TODO: --novba to disable VBA expressions parsing | |
| 1825 | 2037 | |
| 1826 | 2038 | (options, args) = parser.parse_args() |
| 1827 | 2039 | ... | ... |