Commit 89272589d1dcecda43949dcd314c9fd109283c3e
1 parent: b56f9ef7
olevba: fixed issue #4: regex for URL, e-mail and exe filename
Showing 1 changed file with 28 additions and 6 deletions
oletools/olevba.py
| ... | ... | @@ -111,8 +111,9 @@ https://github.com/unixfreak0037/officeparser |
| 111 | 111 | # 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded |
| 112 | 112 | # 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding |
| 113 | 113 | # - improved display, shows obfuscation name |
| 114 | +# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename | |
| 114 | 115 | |
| 115 | -__version__ = '0.21' | |
| 116 | +__version__ = '0.22' | |
| 116 | 117 | |
| 117 | 118 | #------------------------------------------------------------------------------ |
| 118 | 119 | # TODO: |
| ... | ... | @@ -249,16 +250,37 @@ SUSPICIOUS_KEYWORDS = { |
| 249 | 250 | #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx |
| 250 | 251 | } |
| 251 | 252 | |
| 253 | +# Regular Expression for a URL: | |
| 254 | +# http://en.wikipedia.org/wiki/Uniform_resource_locator | |
| 255 | +# http://www.w3.org/Addressing/URL/uri-spec.html | |
| 256 | +#TODO: also support username:password@server | |
| 257 | +#TODO: other protocols (file, gopher, wais, ...?) | |
| 258 | +SCHEME = r'\b(?:http|ftp)s?' | |
| 259 | +# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains | |
| 260 | +TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})' | |
| 261 | +DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + ')' | |
| 262 | +#TODO: IPv6 - see https://www.debuggex.com/ | |
| 263 | +# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. [db8:0cec::99:123a] | |
| 264 | +NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])' | |
| 265 | +IPv4 = r'(?:'+NUMBER_0_255+r'\.){3}'+NUMBER_0_255 | |
| 266 | +# IPv4 must come before the DNS name because it is more specific | |
| 267 | +SERVER = r'(?:' + IPv4 + '|' + DNS_NAME + ')' | |
| 268 | +PORT = r'(?:\:[0-9]{1,5})?' | |
| 269 | +SERVER_PORT = SERVER + PORT | |
| 270 | +URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?' # [^\.\,\)\(\s"] | |
| 271 | +URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH | |
| 272 | +re_url = re.compile(URL_RE) | |
| 273 | + | |
| 274 | + | |
| 252 | 275 | # Patterns to be extracted (IP addresses, URLs, etc) |
| 253 | 276 | # From patterns.py in balbuzard |
| 254 | 277 | RE_PATTERNS = ( |
| 255 | - #TODO: check if this regex matches URLs with an IP address (various forms) | |
| 256 | - ('URL', re.compile(r'(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~])*[^\.\,\)\(\s]')), | |
| 257 | - ('IPv4 address', re.compile(r"\b(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\b")), | |
| 258 | - ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@(?:[A-Z0-9-]+\.)+(?:[A-Z]{2,12}|XN--[A-Z0-9]{4,18})\b')), | |
| 278 | + ('URL', re.compile(URL_RE)), | |
| 279 | + ('IPv4 address', re.compile(IPv4)), | |
| 280 | + ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@'+SERVER+r'\b')),  # trailing r'\b' must be raw: a plain '\b' is a backspace char, not a word boundary | |
| 259 | 281 | # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')), |
| 260 | 282 | # Executable file name with known extensions (except .com which is present in many URLs, and .application): |
| 261 | - ("Executable file name", re.compile(r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VB|VBS|JS|VBE|JSE|WS|WSF|WSC|WSH|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1|PS1XML|PS2|PS2XML|PSC1|PSC2|SCF|LNK|INF|REG)\b")), | |
| 283 | + ("Executable file name", re.compile(r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b")), | |
| 262 | 284 | # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/ |
| 263 | 285 | #TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types |
| 264 | 286 | #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')), | ... | ... |