Commit 89272589d1dcecda43949dcd314c9fd109283c3e

Authored by Philippe Lagadec
1 parent b56f9ef7

olevba: fixed issue #4: regex for URL, e-mail and exe filename

Showing 1 changed file with 28 additions and 6 deletions
oletools/olevba.py
... ... @@ -111,8 +111,9 @@ https://github.com/unixfreak0037/officeparser
111 111 # 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded
112 112 # 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding
113 113 # - improved display, shows obfuscation name
  114 +# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename
114 115  
115   -__version__ = '0.21'
  116 +__version__ = '0.22'
116 117  
117 118 #------------------------------------------------------------------------------
118 119 # TODO:
... ... @@ -249,16 +250,37 @@ SUSPICIOUS_KEYWORDS = {
249 250 #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx
250 251 }
251 252  
# Regular expression building blocks used to recognise URLs.
# Each constant is a pattern *fragment* (a plain string); the fragments are
# concatenated below into URL_RE, which is then compiled once as re_url.
# References:
#   http://en.wikipedia.org/wiki/Uniform_resource_locator
#   http://www.w3.org/Addressing/URL/uri-spec.html
#TODO: also support username:password@server
#TODO: other protocols (file, gopher, wais, ...?)

# Scheme: http, https, ftp or ftps, starting on a word boundary.
SCHEME = r'\b(?:http|ftp)s?'

# Top-level domain: either a punycode label (xn--...) or 2 to 20 letters.
# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})'

# Host name: one or more characters from letters/digits/hyphen/dot, then a
# dot and a TLD.
DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + r')'

#TODO: IPv6 - see https://www.debuggex.com/
# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. [db8:0cec::99:123a]

# One decimal byte (0-255). The longest alternatives come first so that the
# regex engine does not settle for a shorter prefix such as "25" out of "255".
NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'

# Dotted-quad IPv4 address. The \b anchors on both sides prevent matching a
# fragment of a longer run of digits (e.g. "234.5.6.7" inside "1234.5.6.7"),
# and stop the IPv4 alternative of SERVER from truncating a host name that
# merely starts with four numeric labels (e.g. "1.2.3.4a.example.com").
IPv4 = r'\b(?:' + NUMBER_0_255 + r'\.){3}' + NUMBER_0_255 + r'\b'

# IPv4 must come before the DNS name because it is more specific
SERVER = r'(?:' + IPv4 + r'|' + DNS_NAME + r')'

# Optional port: a colon followed by 1 to 5 digits.
PORT = r'(?:\:[0-9]{1,5})?'
SERVER_PORT = SERVER + PORT

# Optional path/query/fragment: a slash followed by any number of the
# characters listed in the class below.
URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?' # [^\.\,\)\(\s"]

# Complete URL pattern: scheme://server[:port][/path]
URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH
re_url = re.compile(URL_RE)
  273 +
  274 +
252 275 # Patterns to be extracted (IP addresses, URLs, etc)
253 276 # From patterns.py in balbuzard
254 277 RE_PATTERNS = (
255   - #TODO: check if this regex matches URLs with an IP address (various forms)
256   - ('URL', re.compile(r'(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~])*[^\.\,\)\(\s]')),
257   - ('IPv4 address', re.compile(r"\b(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\b")),
258   - ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@(?:[A-Z0-9-]+\.)+(?:[A-Z]{2,12}|XN--[A-Z0-9]{4,18})\b')),
  278 + ('URL', re.compile(URL_RE)),
  279 + ('IPv4 address', re.compile(IPv4)),
  280 + ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@'+SERVER+'\b')),
259 281 # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')),
260 282 # Executable file name with known extensions (except .com which is present in many URLs, and .application):
261   - ("Executable file name", re.compile(r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VB|VBS|JS|VBE|JSE|WS|WSF|WSC|WSH|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1|PS1XML|PS2|PS2XML|PSC1|PSC2|SCF|LNK|INF|REG)\b")),
  283 + ("Executable file name", re.compile(r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b")),
262 284 # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
263 285 #TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types
264 286 #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')),
... ...