Commit 89272589d1dcecda43949dcd314c9fd109283c3e

Authored by Philippe Lagadec
1 parent b56f9ef7

olevba: fixed issue #4: regex for URL, e-mail and exe filename

Showing 1 changed file with 28 additions and 6 deletions
oletools/olevba.py
@@ -111,8 +111,9 @@ https://github.com/unixfreak0037/officeparser
111 # 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded 111 # 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded
112 # 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding 112 # 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding
113 # - improved display, shows obfuscation name 113 # - improved display, shows obfuscation name
  114 +# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename
114 115
115 -__version__ = '0.21' 116 +__version__ = '0.22'
116 117
117 #------------------------------------------------------------------------------ 118 #------------------------------------------------------------------------------
118 # TODO: 119 # TODO:
@@ -249,16 +250,37 @@ SUSPICIOUS_KEYWORDS = {
249 #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx 250 #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx
250 } 251 }
251 252
  253 +# Regular Expression for a URL:
  254 +# http://en.wikipedia.org/wiki/Uniform_resource_locator
  255 +# http://www.w3.org/Addressing/URL/uri-spec.html
  256 +#TODO: also support username:password@server
  257 +#TODO: other protocols (file, gopher, wais, ...?)
  258 +SCHEME = r'\b(?:http|ftp)s?'
  259 +# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
  260 +TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})'
  261 +DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + ')'
  262 +#TODO: IPv6 - see https://www.debuggex.com/
  263 +# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. [db8:0cec::99:123a]
  264 +NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'
  265 +IPv4 = r'(?:'+NUMBER_0_255+r'\.){3}'+NUMBER_0_255
  266 +# IPv4 must come before the DNS name because it is more specific
  267 +SERVER = r'(?:' + IPv4 + '|' + DNS_NAME + ')'
  268 +PORT = r'(?:\:[0-9]{1,5})?'
  269 +SERVER_PORT = SERVER + PORT
  270 +URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?' # [^\.\,\)\(\s"]
  271 +URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH
  272 +re_url = re.compile(URL_RE)
  273 +
  274 +
252 # Patterns to be extracted (IP addresses, URLs, etc) 275 # Patterns to be extracted (IP addresses, URLs, etc)
253 # From patterns.py in balbuzard 276 # From patterns.py in balbuzard
254 RE_PATTERNS = ( 277 RE_PATTERNS = (
255 - #TODO: check if this regex matches URLs with an IP address (various forms)  
256 - ('URL', re.compile(r'(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~])*[^\.\,\)\(\s]')),  
257 - ('IPv4 address', re.compile(r"\b(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\b")),  
258 - ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@(?:[A-Z0-9-]+\.)+(?:[A-Z]{2,12}|XN--[A-Z0-9]{4,18})\b')), 278 + ('URL', re.compile(URL_RE)),
  279 + ('IPv4 address', re.compile(IPv4)),
  280 + ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@'+SERVER+'\b')),
259 # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')), 281 # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')),
260 # Executable file name with known extensions (except .com which is present in many URLs, and .application): 282 # Executable file name with known extensions (except .com which is present in many URLs, and .application):
261 - ("Executable file name", re.compile(r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VB|VBS|JS|VBE|JSE|WS|WSF|WSC|WSH|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1|PS1XML|PS2|PS2XML|PSC1|PSC2|SCF|LNK|INF|REG)\b")), 283 + ("Executable file name", re.compile(r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b")),
262 # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/ 284 # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
263 #TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types 285 #TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types
264 #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')), 286 #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')),