Commit 99e9238ea6094146bc72e0596d6fe61b88bd3c22

Authored by Philippe Lagadec
1 parent 1281a509

olevba: added pattern extraction

Showing 1 changed file with 52 additions and 2 deletions
oletools/olevba.py
@@ -86,8 +86,9 @@ Usage: olevba.py <file> @@ -86,8 +86,9 @@ Usage: olevba.py <file>
86 # - ignore empty macros 86 # - ignore empty macros
87 # 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive 87 # 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive
88 # 2014-12-15 v0.08 PL: - improved display for empty macros 88 # 2014-12-15 v0.08 PL: - improved display for empty macros
  89 +# - added pattern extraction
89 90
90 -__version__ = '0.07' 91 +__version__ = '0.08'
91 92
92 #------------------------------------------------------------------------------ 93 #------------------------------------------------------------------------------
93 # TODO: 94 # TODO:
@@ -96,6 +97,13 @@ __version__ = '0.07' @@ -96,6 +97,13 @@ __version__ = '0.07'
96 # + nicer output 97 # + nicer output
97 # + setup logging (common with other oletools) 98 # + setup logging (common with other oletools)
98 # + update readme, wiki and decalage.info, pypi (link to sample files) 99 # + update readme, wiki and decalage.info, pypi (link to sample files)
  100 +# + performance improvement: instead of searching each keyword separately,
  101 +# first split vba code into a list of words (per line), then check each
  102 +# word against a dict. (or put vba words into a set/dict?)
  103 +# + for regex, maybe combine them into a single re with named groups?
  104 +# + add Yara support, include sample rules? plugins like balbuzard?
  105 +# + add balbuzard support
  106 +# + move main into functions
99 107
100 # TODO later: 108 # TODO later:
101 # + output to file 109 # + output to file
@@ -103,7 +111,7 @@ __version__ = '0.07' @@ -103,7 +111,7 @@ __version__ = '0.07'
103 # + look for VBA in embedded documents (e.g. Excel in Word) 111 # + look for VBA in embedded documents (e.g. Excel in Word)
104 # + support SRP streams (see Lenny's article + links and sample) 112 # + support SRP streams (see Lenny's article + links and sample)
105 # - python 3.x support 113 # - python 3.x support
106 -# - add support for PowerPoint macros (see libclamav, libgsf) 114 +# - add support for PowerPoint macros (see libclamav, libgsf), use oledump heuristic?
107 # - check VBA macros in Visio, Access, Project, etc 115 # - check VBA macros in Visio, Access, Project, etc
108 # - extract_macros: convert to a class, split long function into smaller methods 116 # - extract_macros: convert to a class, split long function into smaller methods
109 # - extract_macros: read bytes from stream file objects instead of strings 117 # - extract_macros: read bytes from stream file objects instead of strings
@@ -123,6 +131,7 @@ import struct @@ -123,6 +131,7 @@ import struct
123 import cStringIO 131 import cStringIO
124 import math 132 import math
125 import zipfile 133 import zipfile
  134 +import re
126 135
127 import thirdparty.olefile as olefile 136 import thirdparty.olefile as olefile
128 137
@@ -153,6 +162,16 @@ AUTOEXEC_KEYWORDS = { @@ -153,6 +162,16 @@ AUTOEXEC_KEYWORDS = {
153 #TODO: full list in MS specs?? 162 #TODO: full list in MS specs??
154 } 163 }
155 164
  165 +# Patterns to be extracted (IP addresses, URLs, etc)
  166 +# From patterns.py in balbuzard
  167 +RE_PATTERNS = (
  168 + ('URL', re.compile(r'(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~])*[^\.\,\)\(\s]')),
  169 + ('IPv4 address', re.compile(r"\b(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\b")),
  170 + ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@(?:[A-Z0-9-]+\.)+(?:[A-Z]{2,12}|XN--[A-Z0-9]{4,18})\b')),
  171 + ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')),
  172 + ("Executable file name", re.compile(r"(?i)\b\w+\.(EXE|COM|VBS|JS|VBE|JSE|BAT|CMD|DLL|SCR|CLASS|JAR)\b")),
  173 + )
  174 +
156 175
157 #--- FUNCTIONS ---------------------------------------------------------------- 176 #--- FUNCTIONS ----------------------------------------------------------------
158 177
@@ -696,6 +715,7 @@ def detect_autoexec(vba_code): @@ -696,6 +715,7 @@ def detect_autoexec(vba_code):
696 :param vba_code: str, VBA source code 715 :param vba_code: str, VBA source code
697 :return: list of str tuples (keyword, description) 716 :return: list of str tuples (keyword, description)
698 """ 717 """
  718 + #TODO: use regex to find keywords with word boundaries
699 # case-insensitive search 719 # case-insensitive search
700 vba_code = vba_code.lower() 720 vba_code = vba_code.lower()
701 results = [] 721 results = []
@@ -706,6 +726,22 @@ def detect_autoexec(vba_code): @@ -706,6 +726,22 @@ def detect_autoexec(vba_code):
706 return results 726 return results
707 727
708 728
  729 +def detect_patterns(vba_code):
  730 + """
  731 + Detect if the VBA code contains specific patterns such as IP addresses,
  732 + URLs, e-mail addresses, executable file names, etc.
  733 +
  734 + :param vba_code: str, VBA source code
  735 + :return: list of str tuples (pattern type, value)
  736 + """
  737 + results = []
  738 + for pattern_type, pattern_re in RE_PATTERNS:
  739 + match = pattern_re.search(vba_code)
  740 + if match is not None:
  741 + results.append((pattern_type, match.group()))
  742 + return results
  743 +
  744 +
709 #=== CLASSES ================================================================= 745 #=== CLASSES =================================================================
710 746
711 class VBA_Parser(object): 747 class VBA_Parser(object):
@@ -964,6 +1000,20 @@ if __name__ == '__main__': @@ -964,6 +1000,20 @@ if __name__ == '__main__':
964 else: 1000 else:
965 print 'Auto-executable macro keywords: None found' 1001 print 'Auto-executable macro keywords: None found'
966 1002
  1003 + print '- '*39
  1004 + patterns = detect_patterns(vba_code)
  1005 + if patterns:
  1006 + print 'Patterns found:'
  1007 + t = prettytable.PrettyTable(('Value', 'Pattern type'))
  1008 + t.align = 'l'
  1009 + t.max_width['Value'] = 40
  1010 + t.max_width['Pattern type'] = 39
  1011 + for pattern_type, value in patterns:
  1012 + t.add_row((value, pattern_type))
  1013 + print t
  1014 + else:
  1015 + print 'Patterns: None found'
  1016 +
967 else: 1017 else:
968 print 'No VBA macros found.' 1018 print 'No VBA macros found.'
969 except TypeError: 1019 except TypeError: