Commit a4ffb743f926d59e022f10313ca70d6af9f8c8b7
1 parent
41896bcf
olevba: changed line endings from CRLF to LF
Showing
1 changed file
with
1670 additions
and
1670 deletions
oletools/olevba.py
100644 → 100755
| 1 | -#!/usr/bin/env python | ||
| 2 | -""" | ||
| 3 | -olevba.py | ||
| 4 | - | ||
| 5 | -olevba is a script to parse OLE and OpenXML files such as MS Office documents | ||
| 6 | -(e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate | ||
| 7 | -and analyze malicious macros. | ||
| 8 | - | ||
| 9 | -Supported formats: | ||
| 10 | -- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) | ||
| 11 | -- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) | ||
| 12 | -- PowerPoint 2007+ (.pptm, .ppsm) | ||
| 13 | -- Word 2003 XML (.xml) | ||
| 14 | - | ||
| 15 | -Author: Philippe Lagadec - http://www.decalage.info | ||
| 16 | -License: BSD, see source code or documentation | ||
| 17 | - | ||
| 18 | -olevba is part of the python-oletools package: | ||
| 19 | -http://www.decalage.info/python/oletools | ||
| 20 | - | ||
| 21 | -olevba is based on source code from officeparser by John William Davison | ||
| 22 | -https://github.com/unixfreak0037/officeparser | ||
| 23 | -""" | ||
| 24 | - | ||
| 25 | -#=== LICENSE ================================================================== | ||
| 26 | - | ||
| 27 | -# olevba is copyright (c) 2014-2015 Philippe Lagadec (http://www.decalage.info) | ||
| 28 | -# All rights reserved. | ||
| 29 | -# | ||
| 30 | -# Redistribution and use in source and binary forms, with or without modification, | ||
| 31 | -# are permitted provided that the following conditions are met: | ||
| 32 | -# | ||
| 33 | -# * Redistributions of source code must retain the above copyright notice, this | ||
| 34 | -# list of conditions and the following disclaimer. | ||
| 35 | -# * Redistributions in binary form must reproduce the above copyright notice, | ||
| 36 | -# this list of conditions and the following disclaimer in the documentation | ||
| 37 | -# and/or other materials provided with the distribution. | ||
| 38 | -# | ||
| 39 | -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | ||
| 40 | -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||
| 41 | -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||
| 42 | -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | ||
| 43 | -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 44 | -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||
| 45 | -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | ||
| 46 | -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||
| 47 | -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
| 48 | -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 49 | - | ||
| 50 | - | ||
| 51 | -# olevba contains modified source code from the officeparser project, published | ||
| 52 | -# under the following MIT License (MIT): | ||
| 53 | -# | ||
| 54 | -# officeparser is copyright (c) 2014 John William Davison | ||
| 55 | -# | ||
| 56 | -# Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| 57 | -# of this software and associated documentation files (the "Software"), to deal | ||
| 58 | -# in the Software without restriction, including without limitation the rights | ||
| 59 | -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| 60 | -# copies of the Software, and to permit persons to whom the Software is | ||
| 61 | -# furnished to do so, subject to the following conditions: | ||
| 62 | -# | ||
| 63 | -# The above copyright notice and this permission notice shall be included in all | ||
| 64 | -# copies or substantial portions of the Software. | ||
| 65 | -# | ||
| 66 | -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| 67 | -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| 68 | -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
| 69 | -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| 70 | -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| 71 | -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| 72 | -# SOFTWARE. | ||
| 73 | - | ||
| 74 | -#------------------------------------------------------------------------------ | ||
| 75 | -# CHANGELOG: | ||
| 76 | -# 2014-08-05 v0.01 PL: - first version based on officeparser code | ||
| 77 | -# 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser | ||
| 78 | -# 2014-08-15 PL: - fixed incorrect value check in PROJECTHELPFILEPATH Record | ||
| 79 | -# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats | ||
| 80 | -# and to find the VBA project root anywhere in the file | ||
| 81 | -# 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL | ||
| 82 | -# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API | ||
| 83 | -# - added detect_vba_macros | ||
| 84 | -# 2014-12-10 v0.06 PL: - hide first lines with VB attributes | ||
| 85 | -# - detect auto-executable macros | ||
| 86 | -# - ignore empty macros | ||
| 87 | -# 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive | ||
| 88 | -# 2014-12-15 v0.08 PL: - improved display for empty macros | ||
| 89 | -# - added pattern extraction | ||
| 90 | -# 2014-12-25 v0.09 PL: - added suspicious keywords detection | ||
| 91 | -# 2014-12-27 v0.10 PL: - added OptionParser, main and process_file | ||
| 92 | -# - uses xglob to scan several files with wildcards | ||
| 93 | -# - option -r to recurse subdirectories | ||
| 94 | -# - option -z to scan files in password-protected zips | ||
| 95 | -# 2015-01-02 v0.11 PL: - improved filter_vba to detect colons | ||
| 96 | -# 2015-01-03 v0.12 PL: - fixed detect_patterns to detect all patterns | ||
| 97 | -# - process_file: improved display, shows container file | ||
| 98 | -# - improved list of executable file extensions | ||
| 99 | -# 2015-01-04 v0.13 PL: - added several suspicious keywords, improved display | ||
| 100 | -# 2015-01-08 v0.14 PL: - added hex strings detection and decoding | ||
| 101 | -# - fixed issue #2, decoding VBA stream names using | ||
| 102 | -# specified codepage and unicode stream names | ||
| 103 | -# 2015-01-11 v0.15 PL: - added new triage mode, options -t and -d | ||
| 104 | -# 2015-01-16 v0.16 PL: - fix for issue #3 (exception when module name="text") | ||
| 105 | -# - added several suspicious keywords | ||
| 106 | -# - added option -i to analyze VBA source code directly | ||
| 107 | -# 2015-01-17 v0.17 PL: - removed .com from the list of executable extensions | ||
| 108 | -# - added scan_vba to run all detection algorithms | ||
| 109 | -# - decoded hex strings are now also scanned + reversed | ||
| 110 | -# 2015-01-23 v0.18 PL: - fixed issue #3, case-insensitive search in code_modules | ||
| 111 | -# 2015-01-24 v0.19 PL: - improved the detection of IOCs obfuscated with hex | ||
| 112 | -# strings and StrReverse | ||
| 113 | -# 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded | ||
| 114 | -# 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding | ||
| 115 | -# - improved display, shows obfuscation name | ||
| 116 | -# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename | ||
| 117 | -# - added Base64 obfuscation decoding (contribution from | ||
| 118 | -# @JamesHabben) | ||
| 119 | -# 2015-02-03 v0.23 PL: - triage now uses VBA_Scanner results, shows Base64 and | ||
| 120 | -# Dridex strings | ||
| 121 | -# - exception handling in detect_base64_strings | ||
| 122 | -# 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display | ||
| 123 | -# - display exceptions with stack trace | ||
| 124 | -# - added several suspicious keywords | ||
| 125 | -# - improved Base64 detection and decoding | ||
| 126 | -# - fixed triage mode not to scan attrib lines | ||
| 127 | -# 2015-03-04 v0.25 PL: - added support for Word 2003 XML | ||
| 128 | - | ||
| 129 | -__version__ = '0.25' | ||
| 130 | - | ||
| 131 | -#------------------------------------------------------------------------------ | ||
| 132 | -# TODO: | ||
| 133 | -# + do not use logging, but a provided logger (null logger by default) | ||
| 134 | -# + setup logging (common with other oletools) | ||
| 135 | -# + add xor bruteforcing like bbharvest | ||
| 136 | -# + add chr() decoding | ||
| 137 | - | ||
| 138 | -# TODO later: | ||
| 139 | -# + performance improvement: instead of searching each keyword separately, | ||
| 140 | -# first split vba code into a list of words (per line), then check each | ||
| 141 | -# word against a dict. (or put vba words into a set/dict?) | ||
| 142 | -# + for regex, maybe combine them into a single re with named groups? | ||
| 143 | -# + add Yara support, include sample rules? plugins like balbuzard? | ||
| 144 | -# + add balbuzard support | ||
| 145 | -# + output to file (replace print by file.write, sys.stdout by default) | ||
| 146 | -# + look for VBA in embedded documents (e.g. Excel in Word) | ||
| 147 | -# + support SRP streams (see Lenny's article + links and sample) | ||
| 148 | -# - python 3.x support | ||
| 149 | -# - add support for PowerPoint macros (see libclamav, libgsf), use oledump heuristic? | ||
| 150 | -# - check VBA macros in Visio, Access, Project, etc | ||
| 151 | -# - extract_macros: convert to a class, split long function into smaller methods | ||
| 152 | -# - extract_macros: read bytes from stream file objects instead of strings | ||
| 153 | -# - extract_macros: use combined struct.unpack instead of many calls | ||
| 154 | - | ||
| 155 | -#------------------------------------------------------------------------------ | ||
| 156 | -# REFERENCES: | ||
| 157 | -# - [MS-OVBA]: Microsoft Office VBA File Format Structure | ||
| 158 | -# http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx | ||
| 159 | -# - officeparser: https://github.com/unixfreak0037/officeparser | ||
| 160 | - | ||
| 161 | - | ||
| 162 | -#--- IMPORTS ------------------------------------------------------------------ | ||
| 163 | - | ||
| 164 | -import sys, logging | ||
| 165 | -import struct | ||
| 166 | -import cStringIO | ||
| 167 | -import math | ||
| 168 | -import zipfile | ||
| 169 | -import re | ||
| 170 | -import optparse | ||
| 171 | -import os.path | ||
| 172 | -import binascii | ||
| 173 | -import base64 | ||
| 174 | -import traceback | ||
| 175 | -import zlib | ||
| 176 | - | ||
| 177 | -# import lxml or ElementTree for XML parsing: | ||
| 178 | -try: | ||
| 179 | - # lxml: best performance for XML processing | ||
| 180 | - import lxml.etree as ET | ||
| 181 | -except ImportError: | ||
| 182 | - try: | ||
| 183 | - # Python 2.5+: batteries included | ||
| 184 | - import xml.etree.cElementTree as ET | ||
| 185 | - except ImportError: | ||
| 186 | - try: | ||
| 187 | - # Python <2.5: standalone ElementTree install | ||
| 188 | - import elementtree.cElementTree as ET | ||
| 189 | - except ImportError: | ||
| 190 | - raise ImportError, "lxml or ElementTree are not installed, "\ | ||
| 191 | - +"see http://codespeak.net/lxml "\ | ||
| 192 | - +"or http://effbot.org/zone/element-index.htm" | ||
| 193 | - | ||
| 194 | -import thirdparty.olefile as olefile | ||
| 195 | -from thirdparty.prettytable import prettytable | ||
| 196 | -from thirdparty.xglob import xglob | ||
| 197 | - | ||
| 198 | -#--- CONSTANTS ---------------------------------------------------------------- | ||
| 199 | - | ||
# Identifiers for the supported container file formats:
TYPE_OLE = 'OLE'
TYPE_OpenXML = 'OpenXML'
TYPE_Word2003_XML = 'Word2003_XML'

# File extensions associated with each kind of VBA code module:
MODULE_EXTENSION = "bas"
CLASS_EXTENSION = "cls"
FORM_EXTENSION = "frm"

# Word2003 XML parsing: namespace prefix in ElementTree "{uri}" notation,
# and the qualified tag/attribute names built from it.
NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}'
# the tag <w:binData w:name="editdata.mso"> contains the VBA macro code:
TAG_BINDATA = '%sbinData' % NS_W
ATTR_NAME = '%sname' % NS_W
| 213 | - | ||
# Keywords to detect auto-executable macros.
# Maps a description of the triggering event to the macro names
# associated with that event.
AUTOEXEC_KEYWORDS = dict((
    # MS Word:
    ('Runs when the Word document is opened',
     ('AutoExec', 'AutoOpen', 'Document_Open', 'DocumentOpen')),
    ('Runs when the Word document is closed',
     ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose')),
    ('Runs when the Word document is modified',
     ('DocumentChange',)),
    ('Runs when a new Word document is created',
     ('AutoNew', 'Document_New', 'NewDocument')),

    # MS Excel:
    ('Runs when the Excel Workbook is opened',
     ('Auto_Open', 'Workbook_Open')),
    ('Runs when the Excel Workbook is closed',
     ('Auto_Close', 'Workbook_Close')),

    #TODO: full list in MS specs??
))
| 234 | - | ||
# Suspicious Keywords that may be used by malware.
# Maps a description of the potential behaviour to the VBA keywords hinting at it.
# See VBA language reference: http://msdn.microsoft.com/en-us/library/office/jj692818%28v=office.15%29.aspx
#TODO: use regex to support variable whitespaces
SUSPICIOUS_KEYWORDS = dict((
    ('May read system environment variables',
     ('Environ',)),
    ('May open a file',
     ('Open',)),
    #TODO: regex to find Open+Write on same line
    ('May write to a file (if combined with Open)',
     ('Write', 'Put', 'Output', 'Print #')),
    #TODO: regex to find Open+Binary on same line
    ('May read or write a binary file (if combined with Open)',
     ('Binary',)),
    #FileCopy: http://msdn.microsoft.com/en-us/library/office/gg264390%28v=office.15%29.aspx
    #CopyFile: http://msdn.microsoft.com/en-us/library/office/gg264089%28v=office.15%29.aspx
    ('May copy a file',
     ('FileCopy', 'CopyFile')),
    ('May delete a file',
     ('Kill',)),
    #CreateTextFile: http://msdn.microsoft.com/en-us/library/office/gg264617%28v=office.15%29.aspx
    #ADODB.Stream sample: http://pastebin.com/Z4TMyuq6
    ('May create a text file',
     ('CreateTextFile', 'ADODB.Stream', 'WriteText', 'SaveToFile')),
    #Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx
    #WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6
    ('May run an executable file or a system command',
     ('Shell', 'vbNormal', 'vbNormalFocus', 'vbHide', 'vbMinimizedFocus',
      'vbMaximizedFocus', 'vbNormalNoFocus', 'vbMinimizedNoFocus',
      'WScript.Shell', 'Run')),
    ('May hide the application',
     ('Application.Visible', 'ShowWindow', 'SW_HIDE')),
    ('May create a directory',
     ('MkDir',)),
    ('May save the current workbook',
     ('ActiveWorkbook.SaveAs',)),
    #TODO: confirm the actual effect
    ('May change which directory contains files to open at startup',
     ('Application.AltStartupPath',)),
    ('May create an OLE object',
     ('CreateObject',)),
    ('May run an application (if combined with CreateObject)',
     ('Shell.Application',)),
    ('May enumerate application windows (if combined with Shell.Application object)',
     ('Windows', 'FindWindow')),
    #TODO: regex to find declare+lib on same line
    ('May run code from a DLL',
     ('Lib',)),
    #TODO: regex to find urlmon+URLDownloadToFileA on same line
    ('May download files from the Internet',
     ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP')),
    #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx
    ('May control another application by simulating user keystrokes',
     ('SendKeys', 'AppActivate')),
    #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx
    ('May attempt to obfuscate malicious function calls',
     ('CallByName',)),
    #TODO: regex to find several Chr*, not just one
    #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx
    ('May attempt to obfuscate specific strings',
     ('Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor')),
))
| 296 | - | ||
# Regular Expression for a URL:
# http://en.wikipedia.org/wiki/Uniform_resource_locator
# http://www.w3.org/Addressing/URL/uri-spec.html
#TODO: also support username:password@server
#TODO: other protocols (file, gopher, wais, ...?)
SCHEME = r'\b(?:http|ftp)s?'
# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})'
DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + r')'
#TODO: IPv6 - see https://www.debuggex.com/
# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. [db8:0cec::99:123a]
NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'
IPv4 = r'(?:' + NUMBER_0_255 + r'\.){3}' + NUMBER_0_255
# IPv4 must come before the DNS name because it is more specific
SERVER = r'(?:' + IPv4 + r'|' + DNS_NAME + r')'
PORT = r'(?:\:[0-9]{1,5})?'
SERVER_PORT = SERVER + PORT
URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?'  # [^\.\,\)\(\s"]
URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH
re_url = re.compile(URL_RE)


# Patterns to be extracted (IP addresses, URLs, etc)
# From patterns.py in balbuzard
# Each entry is a (pattern name, compiled regex) pair.
RE_PATTERNS = (
    ('URL', re_url),
    ('IPv4 address', re.compile(IPv4)),
    # NOTE: every fragment must be a raw string. A plain '\b' is the backspace
    # character (0x08), not a word boundary, and would prevent any match.
    ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@' + SERVER + r'\b')),
    # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')),
    # Executable file name with known extensions (except .com which is present in many URLs, and .application):
    ("Executable file name", re.compile(r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b")),
    # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
    #TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types
    #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')),
)
| 332 | - | ||
# Hex-encoded strings: an even number of hex digits, at least 4 pairs long
re_hex_string = re.compile(
    r'(?:[0-9A-Fa-f]{2}){4,}')

# Base64-encoded strings, enclosed in double quotes.
# Previous, more permissive version:
#re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?"')
# better version from balbuzard, less false positives:
re_base64_string = re.compile(
    r'"(?:[A-Za-z0-9+/]{4}){1,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)?"')
# common strings which match the Base64 regex without being Base64 data
# (all entries are lowercase):
BASE64_WHITELIST = set((
    'thisdocument',
    'thisworkbook',
    'test',
    'temp',
    'http',
    'open',
    'exit',
))

# Strings encoded with a specific Dridex algorithm: at least 20 alphanumeric
# characters enclosed in double quotes
# (see https://github.com/JamesHabben/MalwareStuff)
re_dridex_string = re.compile(
    r'"[0-9A-Za-z]{20,}"')
# matches any letter which is not a hex digit, to check that a string is not
# just a hex string:
re_nothex_check = re.compile(
    r'[G-Zg-z]')
| 348 | - | ||
| 349 | -#--- FUNCTIONS ---------------------------------------------------------------- | ||
| 350 | - | ||
def copytoken_help(decompressed_current, decompressed_chunk_start):
    """
    compute bit masks to decode a CopyToken according to MS-OVBA 2.4.1.3.19.1 CopyToken Help

    decompressed_current: number of decompressed bytes so far, i.e. len(decompressed_container)
    decompressed_chunk_start: offset of the current chunk in the decompressed container
    return length_mask, offset_mask, bit_count, maximum_length

    raise ValueError if no byte of the current chunk has been decompressed yet
    (decompressed_current <= decompressed_chunk_start): a CopyToken cannot be
    decoded in that case, since it has no previous data to refer to.
    """
    difference = decompressed_current - decompressed_chunk_start
    if difference <= 0:
        # Same exception type as the "math domain error" formerly raised by
        # math.log() here, but with a message that explains the problem.
        raise ValueError('invalid CopyToken: no decompressed data in the current chunk '
                         '(difference = {0})'.format(difference))
    # bit_count = ceil(log2(difference)), i.e. the smallest n such that
    # 2**n >= difference. Computed with integers only, because the floating
    # point result of math.log(x, 2) is not guaranteed to be exact.
    bit_count = 0
    while (1 << bit_count) < difference:
        bit_count += 1
    # the offset is always encoded on at least 4 bits
    bit_count = max(bit_count, 4)
    # the CopyToken is 16 bits wide: high bit_count bits = offset, remaining low bits = length
    length_mask = 0xFFFF >> bit_count
    offset_mask = ~length_mask
    maximum_length = length_mask + 3
    return length_mask, offset_mask, bit_count, maximum_length
| 366 | - | ||
| 367 | - | ||
def decompress_stream (compressed_container):
    """
    Decompress a stream according to MS-OVBA section 2.4.1

    compressed_container: string (bytes) compressed according to the MS-OVBA 2.4.1.3.6 Compression algorithm
    return the decompressed container as a string (bytes)

    raise ValueError if the signature byte or a chunk header is invalid, or if
    a CopyToken refers to data located before the start of the decompressed data.
    """
    # 2.4.1.2 State Variables

    # The following state is maintained for the CompressedContainer (section 2.4.1.1.1):
    # CompressedRecordEnd: The location of the byte after the last byte in the CompressedContainer (section 2.4.1.1.1).
    # CompressedCurrent: The location of the next byte in the CompressedContainer (section 2.4.1.1.1) to be read by
    #                    decompression or to be written by compression.

    # The following state is maintained for the current CompressedChunk (section 2.4.1.1.4):
    # CompressedChunkStart: The location of the first byte of the CompressedChunk (section 2.4.1.1.4) within the
    #                       CompressedContainer (section 2.4.1.1.1).

    # The following state is maintained for a DecompressedBuffer (section 2.4.1.1.2):
    # DecompressedCurrent: The location of the next byte in the DecompressedBuffer (section 2.4.1.1.2) to be written by
    #                      decompression or to be read by compression.
    # DecompressedBufferEnd: The location of the byte after the last byte in the DecompressedBuffer (section 2.4.1.1.2).

    # The following state is maintained for the current DecompressedChunk (section 2.4.1.1.3):
    # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the
    #                         DecompressedBuffer (section 2.4.1.1.2).

    # Both buffers are handled as bytearrays: indexing yields integers on
    # Python 2 and Python 3 alike, and the result can grow in place instead
    # of being rebuilt by string concatenation for every decoded byte.
    compressed = bytearray(compressed_container)
    #NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that
    # CompressedRecordEnd = len(compressed_container)
    compressed_record_end = len(compressed)
    decompressed_container = bytearray()  # result
    compressed_current = 0

    sig_byte = compressed[compressed_current]
    if sig_byte != 0x01:
        raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))

    compressed_current += 1

    while compressed_current < compressed_record_end:
        # 2.4.1.1.5
        compressed_chunk_start = compressed_current
        # chunk header = first 16 bits
        compressed_chunk_header = struct.unpack(
            "<H", bytes(compressed[compressed_chunk_start:compressed_chunk_start + 2]))[0]
        # chunk size = 12 first bits of header + 3
        chunk_size = (compressed_chunk_header & 0x0FFF) + 3
        # chunk signature = 3 next bits - should always be 0b011
        chunk_signature = (compressed_chunk_header >> 12) & 0x07
        if chunk_signature != 0b011:
            raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream')
        # chunk flag = next bit - 1 == compressed, 0 == uncompressed
        chunk_flag = (compressed_chunk_header >> 15) & 0x01
        # lazy %-formatting: the message is only built if debug logging is enabled
        logging.debug("chunk size = %d, compressed flag = %d", chunk_size, chunk_flag)

        #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096)
        # The minimum size is 3 bytes
        # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value
        # in chunk header before adding 3.
        # Also the first test is not useful since a 12 bits value cannot be larger than 4095.
        if chunk_flag == 1 and chunk_size > 4098:
            raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1')
        if chunk_flag == 0 and chunk_size != 4098:
            raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0')

        # check if chunk_size goes beyond the compressed data, instead of silently cutting it:
        #TODO: raise an exception?
        if compressed_chunk_start + chunk_size > compressed_record_end:
            logging.warning('Chunk size is larger than remaining compressed data')
        compressed_end = min(compressed_record_end, compressed_chunk_start + chunk_size)
        # read after chunk header:
        compressed_current = compressed_chunk_start + 2

        if chunk_flag == 0:
            # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
            # uncompressed chunk: read the next 4096 bytes as-is
            #TODO: check if there are at least 4096 bytes left
            decompressed_container.extend(compressed[compressed_current:compressed_current + 4096])
            compressed_current += 4096
            continue

        # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
        decompressed_chunk_start = len(decompressed_container)
        while compressed_current < compressed_end:
            # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence
            # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
            # copy tokens (reference to a previous literal token)
            flag_byte = compressed[compressed_current]
            compressed_current += 1
            for bit_index in range(8):
                if compressed_current >= compressed_end:
                    break
                # MS-OVBA 2.4.1.3.5 Decompressing a Token
                # MS-OVBA 2.4.1.3.17 Extract FlagBit
                flag_bit = (flag_byte >> bit_index) & 1
                if flag_bit == 0:  # LiteralToken
                    # copy one byte directly to output
                    decompressed_container.append(compressed[compressed_current])
                    compressed_current += 1
                    continue
                # CopyToken
                # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
                copy_token = struct.unpack(
                    "<H", bytes(compressed[compressed_current:compressed_current + 2]))[0]
                length_mask, offset_mask, bit_count, maximum_length = copytoken_help(
                    len(decompressed_container), decompressed_chunk_start)
                length = (copy_token & length_mask) + 3
                offset = ((copy_token & offset_mask) >> (16 - bit_count)) + 1
                copy_source = len(decompressed_container) - offset
                if copy_source < 0:
                    # A negative index would silently wrap around to the end of
                    # the buffer and copy unrelated bytes: reject it instead.
                    raise ValueError('Invalid CopyToken offset {0}: only {1} bytes decompressed so far'.format(
                        offset, len(decompressed_container)))
                # Copy byte by byte: source and destination may overlap, in which
                # case bytes appended by this loop are read again further on.
                for index in range(copy_source, copy_source + length):
                    decompressed_container.append(decompressed_container[index])
                compressed_current += 2
    return bytes(decompressed_container)
| 484 | - | ||
| 485 | - | ||
| 486 | -def _extract_vba (ole, vba_root, project_path, dir_path): | ||
| 487 | - """ | ||
| 488 | - Extract VBA macros from an OleFileIO object. | ||
| 489 | - Internal function, do not call directly. | ||
| 490 | - | ||
| 491 | - vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream | ||
| 492 | - vba_project: path to the PROJECT stream | ||
| 493 | - This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream | ||
| 494 | - """ | ||
| 495 | - # Open the PROJECT stream: | ||
| 496 | - project = ole.openstream(project_path) | ||
| 497 | - | ||
| 498 | - # sample content of the PROJECT stream: | ||
| 499 | - | ||
| 500 | - ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}" | ||
| 501 | - ## Document=ThisDocument/&H00000000 | ||
| 502 | - ## Module=NewMacros | ||
| 503 | - ## Name="Project" | ||
| 504 | - ## HelpContextID="0" | ||
| 505 | - ## VersionCompatible32="393222000" | ||
| 506 | - ## CMG="F1F301E705E705E705E705" | ||
| 507 | - ## DPB="8F8D7FE3831F2020202020" | ||
| 508 | - ## GC="2D2FDD81E51EE61EE6E1" | ||
| 509 | - ## | ||
| 510 | - ## [Host Extender Info] | ||
| 511 | - ## &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000 | ||
| 512 | - ## &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000 | ||
| 513 | - ## | ||
| 514 | - ## [Workspace] | ||
| 515 | - ## ThisDocument=22, 29, 339, 477, Z | ||
| 516 | - ## NewMacros=-4, 42, 832, 510, C | ||
| 517 | - | ||
| 518 | - code_modules = {} | ||
| 519 | - | ||
| 520 | - for line in project: | ||
| 521 | - line = line.strip() | ||
| 522 | - if '=' in line: | ||
| 523 | - # split line at the 1st equal sign: | ||
| 524 | - name, value = line.split('=', 1) | ||
| 525 | - # looking for code modules | ||
| 526 | - # add the code module as a key in the dictionary | ||
| 527 | - # the value will be the extension needed later | ||
| 528 | - # The value is converted to lowercase, to allow case-insensitive matching (issue #3) | ||
| 529 | - value = value.lower() | ||
| 530 | - if name == 'Document': | ||
| 531 | - # split value at the 1st slash, keep 1st part: | ||
| 532 | - value = value.split('/', 1)[0] | ||
| 533 | - code_modules[value] = CLASS_EXTENSION | ||
| 534 | - elif name == 'Module': | ||
| 535 | - code_modules[value] = MODULE_EXTENSION | ||
| 536 | - elif name == 'Class': | ||
| 537 | - code_modules[value] = CLASS_EXTENSION | ||
| 538 | - elif name == 'BaseClass': | ||
| 539 | - code_modules[value] = FORM_EXTENSION | ||
| 540 | - | ||
| 541 | - # read data from dir stream (compressed) | ||
| 542 | - dir_compressed = ole.openstream(dir_path).read() | ||
| 543 | - | ||
| 544 | - def check_value(name, expected, value): | ||
| 545 | - if expected != value: | ||
| 546 | - logging.error("invalid value for {0} expected {1:04X} got {2:04X}".format(name, expected, value)) | ||
| 547 | - | ||
| 548 | - dir_stream = cStringIO.StringIO(decompress_stream(dir_compressed)) | ||
| 549 | - | ||
| 550 | - # PROJECTSYSKIND Record | ||
| 551 | - PROJECTSYSKIND_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 552 | - check_value('PROJECTSYSKIND_Id', 0x0001, PROJECTSYSKIND_Id) | ||
| 553 | - PROJECTSYSKIND_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 554 | - check_value('PROJECTSYSKIND_Size', 0x0004, PROJECTSYSKIND_Size) | ||
| 555 | - PROJECTSYSKIND_SysKind = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 556 | - if PROJECTSYSKIND_SysKind == 0x00: | ||
| 557 | - logging.debug("16-bit Windows") | ||
| 558 | - elif PROJECTSYSKIND_SysKind == 0x01: | ||
| 559 | - logging.debug("32-bit Windows") | ||
| 560 | - elif PROJECTSYSKIND_SysKind == 0x02: | ||
| 561 | - logging.debug("Macintosh") | ||
| 562 | - elif PROJECTSYSKIND_SysKind == 0x03: | ||
| 563 | - logging.debug("64-bit Windows") | ||
| 564 | - else: | ||
| 565 | - logging.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(PROJECTSYSKIND_SysKind)) | ||
| 566 | - | ||
| 567 | - # PROJECTLCID Record | ||
| 568 | - PROJECTLCID_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 569 | - check_value('PROJECTLCID_Id', 0x0002, PROJECTLCID_Id) | ||
| 570 | - PROJECTLCID_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 571 | - check_value('PROJECTLCID_Size', 0x0004, PROJECTLCID_Size) | ||
| 572 | - PROJECTLCID_Lcid = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 573 | - check_value('PROJECTLCID_Lcid', 0x409, PROJECTLCID_Lcid) | ||
| 574 | - | ||
| 575 | - # PROJECTLCIDINVOKE Record | ||
| 576 | - PROJECTLCIDINVOKE_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 577 | - check_value('PROJECTLCIDINVOKE_Id', 0x0014, PROJECTLCIDINVOKE_Id) | ||
| 578 | - PROJECTLCIDINVOKE_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 579 | - check_value('PROJECTLCIDINVOKE_Size', 0x0004, PROJECTLCIDINVOKE_Size) | ||
| 580 | - PROJECTLCIDINVOKE_LcidInvoke = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 581 | - check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, PROJECTLCIDINVOKE_LcidInvoke) | ||
| 582 | - | ||
| 583 | - # PROJECTCODEPAGE Record | ||
| 584 | - PROJECTCODEPAGE_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 585 | - check_value('PROJECTCODEPAGE_Id', 0x0003, PROJECTCODEPAGE_Id) | ||
| 586 | - PROJECTCODEPAGE_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 587 | - check_value('PROJECTCODEPAGE_Size', 0x0002, PROJECTCODEPAGE_Size) | ||
| 588 | - PROJECTCODEPAGE_CodePage = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 589 | - | ||
| 590 | - # PROJECTNAME Record | ||
| 591 | - PROJECTNAME_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 592 | - check_value('PROJECTNAME_Id', 0x0004, PROJECTNAME_Id) | ||
| 593 | - PROJECTNAME_SizeOfProjectName = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 594 | - if PROJECTNAME_SizeOfProjectName < 1 or PROJECTNAME_SizeOfProjectName > 128: | ||
| 595 | - logging.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(PROJECTNAME_SizeOfProjectName)) | ||
| 596 | - PROJECTNAME_ProjectName = dir_stream.read(PROJECTNAME_SizeOfProjectName) | ||
| 597 | - | ||
| 598 | - # PROJECTDOCSTRING Record | ||
| 599 | - PROJECTDOCSTRING_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 600 | - check_value('PROJECTDOCSTRING_Id', 0x0005, PROJECTDOCSTRING_Id) | ||
| 601 | - PROJECTDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 602 | - if PROJECTNAME_SizeOfProjectName > 2000: | ||
| 603 | - logging.error("PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(PROJECTDOCSTRING_SizeOfDocString)) | ||
| 604 | - PROJECTDOCSTRING_DocString = dir_stream.read(PROJECTDOCSTRING_SizeOfDocString) | ||
| 605 | - PROJECTDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 606 | - check_value('PROJECTDOCSTRING_Reserved', 0x0040, PROJECTDOCSTRING_Reserved) | ||
| 607 | - PROJECTDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 608 | - if PROJECTDOCSTRING_SizeOfDocStringUnicode % 2 != 0: | ||
| 609 | - logging.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even") | ||
| 610 | - PROJECTDOCSTRING_DocStringUnicode = dir_stream.read(PROJECTDOCSTRING_SizeOfDocStringUnicode) | ||
| 611 | - | ||
| 612 | - # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7 | ||
| 613 | - PROJECTHELPFILEPATH_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 614 | - check_value('PROJECTHELPFILEPATH_Id', 0x0006, PROJECTHELPFILEPATH_Id) | ||
| 615 | - PROJECTHELPFILEPATH_SizeOfHelpFile1 = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 616 | - if PROJECTHELPFILEPATH_SizeOfHelpFile1 > 260: | ||
| 617 | - logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(PROJECTHELPFILEPATH_SizeOfHelpFile1)) | ||
| 618 | - PROJECTHELPFILEPATH_HelpFile1 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile1) | ||
| 619 | - PROJECTHELPFILEPATH_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 620 | - check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, PROJECTHELPFILEPATH_Reserved) | ||
| 621 | - PROJECTHELPFILEPATH_SizeOfHelpFile2 = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 622 | - if PROJECTHELPFILEPATH_SizeOfHelpFile2 != PROJECTHELPFILEPATH_SizeOfHelpFile1: | ||
| 623 | - logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2") | ||
| 624 | - PROJECTHELPFILEPATH_HelpFile2 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile2) | ||
| 625 | - if PROJECTHELPFILEPATH_HelpFile2 != PROJECTHELPFILEPATH_HelpFile1: | ||
| 626 | - logging.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2") | ||
| 627 | - | ||
| 628 | - # PROJECTHELPCONTEXT Record | ||
| 629 | - PROJECTHELPCONTEXT_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 630 | - check_value('PROJECTHELPCONTEXT_Id', 0x0007, PROJECTHELPCONTEXT_Id) | ||
| 631 | - PROJECTHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 632 | - check_value('PROJECTHELPCONTEXT_Size', 0x0004, PROJECTHELPCONTEXT_Size) | ||
| 633 | - PROJECTHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 634 | - | ||
| 635 | - # PROJECTLIBFLAGS Record | ||
| 636 | - PROJECTLIBFLAGS_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 637 | - check_value('PROJECTLIBFLAGS_Id', 0x0008, PROJECTLIBFLAGS_Id) | ||
| 638 | - PROJECTLIBFLAGS_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 639 | - check_value('PROJECTLIBFLAGS_Size', 0x0004, PROJECTLIBFLAGS_Size) | ||
| 640 | - PROJECTLIBFLAGS_ProjectLibFlags = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 641 | - check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, PROJECTLIBFLAGS_ProjectLibFlags) | ||
| 642 | - | ||
| 643 | - # PROJECTVERSION Record | ||
| 644 | - PROJECTVERSION_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 645 | - check_value('PROJECTVERSION_Id', 0x0009, PROJECTVERSION_Id) | ||
| 646 | - PROJECTVERSION_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 647 | - check_value('PROJECTVERSION_Reserved', 0x0004, PROJECTVERSION_Reserved) | ||
| 648 | - PROJECTVERSION_VersionMajor = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 649 | - PROJECTVERSION_VersionMinor = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 650 | - | ||
| 651 | - # PROJECTCONSTANTS Record | ||
| 652 | - PROJECTCONSTANTS_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 653 | - check_value('PROJECTCONSTANTS_Id', 0x000C, PROJECTCONSTANTS_Id) | ||
| 654 | - PROJECTCONSTANTS_SizeOfConstants = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 655 | - if PROJECTCONSTANTS_SizeOfConstants > 1015: | ||
| 656 | - logging.error("PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(PROJECTCONSTANTS_SizeOfConstants)) | ||
| 657 | - PROJECTCONSTANTS_Constants = dir_stream.read(PROJECTCONSTANTS_SizeOfConstants) | ||
| 658 | - PROJECTCONSTANTS_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 659 | - check_value('PROJECTCONSTANTS_Reserved', 0x003C, PROJECTCONSTANTS_Reserved) | ||
| 660 | - PROJECTCONSTANTS_SizeOfConstantsUnicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 661 | - if PROJECTCONSTANTS_SizeOfConstantsUnicode % 2 != 0: | ||
| 662 | - logging.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even") | ||
| 663 | - PROJECTCONSTANTS_ConstantsUnicode = dir_stream.read(PROJECTCONSTANTS_SizeOfConstantsUnicode) | ||
| 664 | - | ||
| 665 | - # array of REFERENCE records | ||
| 666 | - check = None | ||
| 667 | - while True: | ||
| 668 | - check = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 669 | - logging.debug("reference type = {0:04X}".format(check)) | ||
| 670 | - if check == 0x000F: | ||
| 671 | - break | ||
| 672 | - | ||
| 673 | - if check == 0x0016: | ||
| 674 | - # REFERENCENAME | ||
| 675 | - REFERENCE_Id = check | ||
| 676 | - REFERENCE_SizeOfName = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 677 | - REFERENCE_Name = dir_stream.read(REFERENCE_SizeOfName) | ||
| 678 | - REFERENCE_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 679 | - check_value('REFERENCE_Reserved', 0x003E, REFERENCE_Reserved) | ||
| 680 | - REFERENCE_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 681 | - REFERENCE_NameUnicode = dir_stream.read(REFERENCE_SizeOfNameUnicode) | ||
| 682 | - continue | ||
| 683 | - | ||
| 684 | - if check == 0x0033: | ||
| 685 | - # REFERENCEORIGINAL (followed by REFERENCECONTROL) | ||
| 686 | - REFERENCEORIGINAL_Id = check | ||
| 687 | - REFERENCEORIGINAL_SizeOfLibidOriginal = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 688 | - REFERENCEORIGINAL_LibidOriginal = dir_stream.read(REFERENCEORIGINAL_SizeOfLibidOriginal) | ||
| 689 | - continue | ||
| 690 | - | ||
| 691 | - if check == 0x002F: | ||
| 692 | - # REFERENCECONTROL | ||
| 693 | - REFERENCECONTROL_Id = check | ||
| 694 | - REFERENCECONTROL_SizeTwiddled = struct.unpack("<L", dir_stream.read(4))[0] # ignore | ||
| 695 | - REFERENCECONTROL_SizeOfLibidTwiddled = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 696 | - REFERENCECONTROL_LibidTwiddled = dir_stream.read(REFERENCECONTROL_SizeOfLibidTwiddled) | ||
| 697 | - REFERENCECONTROL_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0] # ignore | ||
| 698 | - check_value('REFERENCECONTROL_Reserved1', 0x0000, REFERENCECONTROL_Reserved1) | ||
| 699 | - REFERENCECONTROL_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0] # ignore | ||
| 700 | - check_value('REFERENCECONTROL_Reserved2', 0x0000, REFERENCECONTROL_Reserved2) | ||
| 701 | - # optional field | ||
| 702 | - check2 = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 703 | - if check2 == 0x0016: | ||
| 704 | - REFERENCECONTROL_NameRecordExtended_Id = check | ||
| 705 | - REFERENCECONTROL_NameRecordExtended_SizeofName = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 706 | - REFERENCECONTROL_NameRecordExtended_Name = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeofName) | ||
| 707 | - REFERENCECONTROL_NameRecordExtended_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 708 | - check_value('REFERENCECONTROL_NameRecordExtended_Reserved', 0x003E, REFERENCECONTROL_NameRecordExtended_Reserved) | ||
| 709 | - REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 710 | - REFERENCECONTROL_NameRecordExtended_NameUnicode = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode) | ||
| 711 | - REFERENCECONTROL_Reserved3 = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 712 | - else: | ||
| 713 | - REFERENCECONTROL_Reserved3 = check2 | ||
| 714 | - | ||
| 715 | - check_value('REFERENCECONTROL_Reserved3', 0x0030, REFERENCECONTROL_Reserved3) | ||
| 716 | - REFERENCECONTROL_SizeExtended = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 717 | - REFERENCECONTROL_SizeOfLibidExtended = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 718 | - REFERENCECONTROL_LibidExtended = dir_stream.read(REFERENCECONTROL_SizeOfLibidExtended) | ||
| 719 | - REFERENCECONTROL_Reserved4 = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 720 | - REFERENCECONTROL_Reserved5 = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 721 | - REFERENCECONTROL_OriginalTypeLib = dir_stream.read(16) | ||
| 722 | - REFERENCECONTROL_Cookie = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 723 | - continue | ||
| 724 | - | ||
| 725 | - if check == 0x000D: | ||
| 726 | - # REFERENCEREGISTERED | ||
| 727 | - REFERENCEREGISTERED_Id = check | ||
| 728 | - REFERENCEREGISTERED_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 729 | - REFERENCEREGISTERED_SizeOfLibid = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 730 | - REFERENCEREGISTERED_Libid = dir_stream.read(REFERENCEREGISTERED_SizeOfLibid) | ||
| 731 | - REFERENCEREGISTERED_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 732 | - check_value('REFERENCEREGISTERED_Reserved1', 0x0000, REFERENCEREGISTERED_Reserved1) | ||
| 733 | - REFERENCEREGISTERED_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 734 | - check_value('REFERENCEREGISTERED_Reserved2', 0x0000, REFERENCEREGISTERED_Reserved2) | ||
| 735 | - continue | ||
| 736 | - | ||
| 737 | - if check == 0x000E: | ||
| 738 | - # REFERENCEPROJECT | ||
| 739 | - REFERENCEPROJECT_Id = check | ||
| 740 | - REFERENCEPROJECT_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 741 | - REFERENCEPROJECT_SizeOfLibidAbsolute = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 742 | - REFERENCEPROJECT_LibidAbsolute = dir_stream.read(REFERENCEPROJECT_SizeOfLibidAbsolute) | ||
| 743 | - REFERENCEPROJECT_SizeOfLibidRelative = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 744 | - REFERENCEPROJECT_LibidRelative = dir_stream.read(REFERENCEPROJECT_SizeOfLibidRelative) | ||
| 745 | - REFERENCEPROJECT_MajorVersion = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 746 | - REFERENCEPROJECT_MinorVersion = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 747 | - continue | ||
| 748 | - | ||
| 749 | - logging.error('invalid or unknown check Id {0:04X}'.format(check)) | ||
| 750 | - sys.exit(0) | ||
| 751 | - | ||
| 752 | - PROJECTMODULES_Id = check #struct.unpack("<H", dir_stream.read(2))[0] | ||
| 753 | - check_value('PROJECTMODULES_Id', 0x000F, PROJECTMODULES_Id) | ||
| 754 | - PROJECTMODULES_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 755 | - check_value('PROJECTMODULES_Size', 0x0002, PROJECTMODULES_Size) | ||
| 756 | - PROJECTMODULES_Count = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 757 | - PROJECTMODULES_ProjectCookieRecord_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 758 | - check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, PROJECTMODULES_ProjectCookieRecord_Id) | ||
| 759 | - PROJECTMODULES_ProjectCookieRecord_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 760 | - check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, PROJECTMODULES_ProjectCookieRecord_Size) | ||
| 761 | - PROJECTMODULES_ProjectCookieRecord_Cookie = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 762 | - | ||
| 763 | - logging.debug("parsing {0} modules".format(PROJECTMODULES_Count)) | ||
| 764 | - for x in xrange(0, PROJECTMODULES_Count): | ||
| 765 | - MODULENAME_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 766 | - check_value('MODULENAME_Id', 0x0019, MODULENAME_Id) | ||
| 767 | - MODULENAME_SizeOfModuleName = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 768 | - MODULENAME_ModuleName = dir_stream.read(MODULENAME_SizeOfModuleName) | ||
| 769 | - # account for optional sections | ||
| 770 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 771 | - if section_id == 0x0047: | ||
| 772 | - MODULENAMEUNICODE_Id = section_id | ||
| 773 | - MODULENAMEUNICODE_SizeOfModuleNameUnicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 774 | - MODULENAMEUNICODE_ModuleNameUnicode = dir_stream.read(MODULENAMEUNICODE_SizeOfModuleNameUnicode) | ||
| 775 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 776 | - if section_id == 0x001A: | ||
| 777 | - MODULESTREAMNAME_id = section_id | ||
| 778 | - MODULESTREAMNAME_SizeOfStreamName = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 779 | - MODULESTREAMNAME_StreamName = dir_stream.read(MODULESTREAMNAME_SizeOfStreamName) | ||
| 780 | - MODULESTREAMNAME_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 781 | - check_value('MODULESTREAMNAME_Reserved', 0x0032, MODULESTREAMNAME_Reserved) | ||
| 782 | - MODULESTREAMNAME_SizeOfStreamNameUnicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 783 | - MODULESTREAMNAME_StreamNameUnicode = dir_stream.read(MODULESTREAMNAME_SizeOfStreamNameUnicode) | ||
| 784 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 785 | - if section_id == 0x001C: | ||
| 786 | - MODULEDOCSTRING_Id = section_id | ||
| 787 | - check_value('MODULEDOCSTRING_Id', 0x001C, MODULEDOCSTRING_Id) | ||
| 788 | - MODULEDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 789 | - MODULEDOCSTRING_DocString = dir_stream.read(MODULEDOCSTRING_SizeOfDocString) | ||
| 790 | - MODULEDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 791 | - check_value('MODULEDOCSTRING_Reserved', 0x0048, MODULEDOCSTRING_Reserved) | ||
| 792 | - MODULEDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 793 | - MODULEDOCSTRING_DocStringUnicode = dir_stream.read(MODULEDOCSTRING_SizeOfDocStringUnicode) | ||
| 794 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 795 | - if section_id == 0x0031: | ||
| 796 | - MODULEOFFSET_Id = section_id | ||
| 797 | - check_value('MODULEOFFSET_Id', 0x0031, MODULEOFFSET_Id) | ||
| 798 | - MODULEOFFSET_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 799 | - check_value('MODULEOFFSET_Size', 0x0004, MODULEOFFSET_Size) | ||
| 800 | - MODULEOFFSET_TextOffset = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 801 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 802 | - if section_id == 0x001E: | ||
| 803 | - MODULEHELPCONTEXT_Id = section_id | ||
| 804 | - check_value('MODULEHELPCONTEXT_Id', 0x001E, MODULEHELPCONTEXT_Id) | ||
| 805 | - MODULEHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 806 | - check_value('MODULEHELPCONTEXT_Size', 0x0004, MODULEHELPCONTEXT_Size) | ||
| 807 | - MODULEHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 808 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 809 | - if section_id == 0x002C: | ||
| 810 | - MODULECOOKIE_Id = section_id | ||
| 811 | - check_value('MODULECOOKIE_Id', 0x002C, MODULECOOKIE_Id) | ||
| 812 | - MODULECOOKIE_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 813 | - check_value('MODULECOOKIE_Size', 0x0002, MODULECOOKIE_Size) | ||
| 814 | - MODULECOOKIE_Cookie = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 815 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 816 | - if section_id == 0x0021 or section_id == 0x0022: | ||
| 817 | - MODULETYPE_Id = section_id | ||
| 818 | - MODULETYPE_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 819 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 820 | - if section_id == 0x0025: | ||
| 821 | - MODULEREADONLY_Id = section_id | ||
| 822 | - check_value('MODULEREADONLY_Id', 0x0025, MODULEREADONLY_Id) | ||
| 823 | - MODULEREADONLY_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 824 | - check_value('MODULEREADONLY_Reserved', 0x0000, MODULEREADONLY_Reserved) | ||
| 825 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 826 | - if section_id == 0x0028: | ||
| 827 | - MODULEPRIVATE_Id = section_id | ||
| 828 | - check_value('MODULEPRIVATE_Id', 0x0028, MODULEPRIVATE_Id) | ||
| 829 | - MODULEPRIVATE_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 830 | - check_value('MODULEPRIVATE_Reserved', 0x0000, MODULEPRIVATE_Reserved) | ||
| 831 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 832 | - if section_id == 0x002B: # TERMINATOR | ||
| 833 | - MODULE_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 834 | - check_value('MODULE_Reserved', 0x0000, MODULE_Reserved) | ||
| 835 | - section_id = None | ||
| 836 | - if section_id != None: | ||
| 837 | - logging.warning('unknown or invalid module section id {0:04X}'.format(section_id)) | ||
| 838 | - | ||
| 839 | - logging.debug('Project CodePage = %d' % PROJECTCODEPAGE_CodePage) | ||
| 840 | - vba_codec = 'cp%d' % PROJECTCODEPAGE_CodePage | ||
| 841 | - logging.debug("ModuleName = {0}".format(MODULENAME_ModuleName)) | ||
| 842 | - logging.debug("StreamName = {0}".format(repr(MODULESTREAMNAME_StreamName))) | ||
| 843 | - streamname_unicode = MODULESTREAMNAME_StreamName.decode(vba_codec) | ||
| 844 | - logging.debug("StreamName.decode('%s') = %s" % (vba_codec, repr(streamname_unicode))) | ||
| 845 | - logging.debug("StreamNameUnicode = {0}".format(repr(MODULESTREAMNAME_StreamNameUnicode))) | ||
| 846 | - logging.debug("TextOffset = {0}".format(MODULEOFFSET_TextOffset)) | ||
| 847 | - | ||
| 848 | - code_path = vba_root + u'VBA/' + streamname_unicode | ||
| 849 | - #TODO: test if stream exists | ||
| 850 | - logging.debug('opening VBA code stream %s' % repr(code_path)) | ||
| 851 | - code_data = ole.openstream(code_path).read() | ||
| 852 | - logging.debug("length of code_data = {0}".format(len(code_data))) | ||
| 853 | - logging.debug("offset of code_data = {0}".format(MODULEOFFSET_TextOffset)) | ||
| 854 | - code_data = code_data[MODULEOFFSET_TextOffset:] | ||
| 855 | - if len(code_data) > 0: | ||
| 856 | - code_data = decompress_stream(code_data) | ||
| 857 | - # case-insensitive search in the code_modules dict to find the file extension: | ||
| 858 | - filext = code_modules.get(MODULENAME_ModuleName.lower(), 'bin') | ||
| 859 | - filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext) | ||
| 860 | - #TODO: also yield the codepage so that callers can decode it properly | ||
| 861 | - yield (code_path, filename, code_data) | ||
| 862 | - # print '-'*79 | ||
| 863 | - # print filename | ||
| 864 | - # print '' | ||
| 865 | - # print code_data | ||
| 866 | - # print '' | ||
| 867 | - logging.debug('extracted file {0}'.format(filename)) | ||
| 868 | - else: | ||
| 869 | - logging.warning("module stream {0} has code data length 0".format(MODULESTREAMNAME_StreamName)) | ||
| 870 | - return | ||
| 871 | - | ||
| 872 | - | ||
def filter_vba(vba_code):
    """
    Strip the leading "Attribute VB_" header lines from VBA source code.

    Only the uninterrupted run of such lines at the very top of the code is
    dropped; the scan stops at the first line that does not qualify. A line
    containing a colon is never dropped, because a colon could be used to
    hide malicious instructions on an "Attribute" line.
    Meant for displaying source code for human analysis only.

    :param vba_code: str, VBA source code
    :return: str, filtered VBA source code (remaining lines joined with '\\n')
    """
    lines = vba_code.splitlines()
    skipped = 0
    # count how many header lines can safely be hidden:
    while skipped < len(lines):
        current = lines[skipped]
        if ':' in current or not current.startswith("Attribute VB_"):
            break
        skipped += 1
    #TODO: also remove empty lines?
    return '\n'.join(lines[skipped:])
| 895 | - | ||
| 896 | - | ||
def detect_autoexec(vba_code, obfuscation=None):
    """
    Look for keywords of macros that run automatically when triggered by
    specific actions (e.g. when a document is opened or closed).

    Each keyword from AUTOEXEC_KEYWORDS is searched case-insensitively and
    must be delimited by word boundaries to match.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword, description)
    """
    #TODO: merge code with detect_suspicious
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    matches = []
    for description, keywords in AUTOEXEC_KEYWORDS.items():
        for keyword in keywords:
            #TODO: if keyword is already a compiled regex, use it as-is
            # the keyword is inserted as-is into a case-insensitive regex
            # with word boundaries on both sides:
            pattern = r'(?i)\b' + keyword + r'\b'
            if re.search(pattern, vba_code):
                matches.append((keyword, description + suffix))
    return matches
| 922 | - | ||
| 923 | - | ||
def detect_suspicious(vba_code, obfuscation=None):
    """
    Look for suspicious keywords corresponding to potential malware
    behaviour in the VBA code.

    Each keyword from SUSPICIOUS_KEYWORDS is searched case-insensitively and
    must be delimited by word boundaries to match.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword, description)
    """
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    matches = []
    for description, keywords in SUSPICIOUS_KEYWORDS.items():
        for keyword in keywords:
            # the keyword is inserted as-is into a case-insensitive regex
            # with word boundaries on both sides:
            pattern = r'(?i)\b' + keyword + r'\b'
            if re.search(pattern, vba_code):
                matches.append((keyword, description + suffix))
    return matches
| 946 | - | ||
| 947 | - | ||
def detect_patterns(vba_code, obfuscation=None):
    """
    Extract values matching the regular expressions listed in RE_PATTERNS
    (e.g. IP addresses, URLs, e-mail addresses, executable file names)
    from the VBA code.

    A given value is reported only once, for the first pattern that
    matches it: duplicates are filtered across all pattern types.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to the pattern type
    :return: list of str tuples (pattern type, value)
    """
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    seen = set()
    hits = []
    for pattern_type, pattern_re in RE_PATTERNS:
        for match in pattern_re.finditer(vba_code):
            text = match.group()
            if text in seen:
                # already reported (possibly under another pattern type)
                continue
            seen.add(text)
            hits.append((pattern_type + suffix, text))
    return hits
| 968 | - | ||
| 969 | - | ||
def detect_hex_strings(vba_code):
    """
    Find strings encoded in hexadecimal within the VBA code and decode them.

    Candidates are the matches of the re_hex_string regular expression;
    each distinct encoded string is decoded and reported once.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    seen = set()
    decoded_pairs = []
    for match in re_hex_string.finditer(vba_code):
        encoded = match.group()
        if encoded in seen:
            continue
        # unhexlify raises if the match is not valid hex of even length
        decoded_pairs.append((encoded, binascii.unhexlify(encoded)))
        seen.add(encoded)
    return decoded_pairs
| 986 | - | ||
| 987 | - | ||
def detect_base64_strings(vba_code):
    """
    Detect if the VBA code contains strings encoded in base64.

    Candidates are the matches of the re_base64_string regular expression,
    with surrounding double quotes removed. A candidate is ignored when
    re_nothex_check finds nothing in it (i.e. it is just a hex string), when
    its lowercase form is in BASE64_WHITELIST, or when it cannot be decoded.
    Each distinct decodable string is reported once.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    #TODO: avoid matching simple hex strings as base64?
    results = []
    found = set()
    for match in re_base64_string.finditer(vba_code):
        # extract the base64 string without quotes:
        value = match.group().strip('"')
        # check it is not just a hex string:
        if not re_nothex_check.search(value):
            continue
        # only keep new values and not in the whitelist:
        if value in found or value.lower() in BASE64_WHITELIST:
            continue
        try:
            decoded = base64.b64decode(value)
        except (TypeError, ValueError):
            # Decoding errors only (TypeError on Python 2, binascii.Error,
            # a ValueError subclass, on Python 3): the candidate is likely
            # not a base64-encoded string. Anything else (KeyboardInterrupt,
            # SystemExit, real bugs) is deliberately left to propagate.
            continue
        results.append((value, decoded))
        found.add(value)
    return results
| 1014 | - | ||
| 1015 | - | ||
def detect_dridex_strings(vba_code):
    """
    Detect if the VBA code contains strings obfuscated with a specific algorithm found in Dridex samples.

    Candidates are the matches of the re_dridex_string regular expression,
    with their first and last characters removed. A candidate is ignored
    when re_nothex_check finds nothing in it (i.e. it is just a hex string)
    or when the decoder fails on it. Each distinct decodable string is
    reported once.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    # imported here so that the third-party decoder is only loaded on use
    from thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode
    results = []
    found = set()
    for match in re_dridex_string.finditer(vba_code):
        # drop the first and last characters of the match:
        value = match.group()[1:-1]
        # check it is not just a hex string:
        if not re_nothex_check.search(value):
            continue
        if value in found:
            continue
        try:
            decoded = DridexUrlDecode(value)
        except Exception:
            # If the decoder fails, it is likely not a dridex-encoded string.
            # "except Exception" rather than a bare "except" so that
            # KeyboardInterrupt and SystemExit are not swallowed.
            continue
        results.append((value, decoded))
        found.add(value)
    return results
| 1040 | - | ||
| 1041 | - | ||
class VBA_Scanner (object):
    """
    Class to scan the source code of a VBA module to find obfuscated strings,
    suspicious keywords, IOCs, auto-executable macros, etc.

    After scan() has run, the detailed findings are also available as
    attributes: hex_strings, base64_strings, dridex_strings,
    autoexec_keywords, suspicious_keywords and iocs.
    """

    def __init__(self, vba_code):
        """
        VBA_Scanner constructor

        :param vba_code: str, VBA source code to be analyzed
        """
        self.code = vba_code
        # Buffers holding the decoded form of every obfuscated string found,
        # one per line, so that keywords/IOCs hidden by encoding can be detected:
        self.code_hex = ''
        self.code_hex_rev = ''
        self.code_rev_hex = ''
        self.code_base64 = ''
        self.code_dridex = ''

    def scan(self, include_decoded_strings=False):
        """
        Analyze the provided VBA code to detect suspicious keywords,
        auto-executable macros, IOC patterns, obfuscation patterns
        such as hex-encoded strings.

        This method may be called several times: all buffers and result
        lists are rebuilt from scratch on each call.

        :param include_decoded_strings: bool, if True, all encoded strings will be included with their decoded content.
        :return: list of tuples (type, keyword, description)
        (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String')
        """
        # Reset the decoded-text buffers, otherwise a second call to scan()
        # (e.g. scan() followed by scan_summary()) would append the same
        # decoded strings again:
        self.code_hex = ''
        self.code_hex_rev = ''
        self.code_rev_hex = ''
        self.code_base64 = ''
        self.code_dridex = ''

        # First, detect and extract hex-encoded strings:
        self.hex_strings = detect_hex_strings(self.code)
        # detect if the code contains StrReverse:
        self.strReverse = 'strreverse' in self.code.lower()
        # Then append the decoded strings to the VBA code, to detect obfuscated IOCs and keywords:
        for encoded, decoded in self.hex_strings:
            self.code_hex += '\n' + decoded
            # if the code contains "StrReverse", also append the hex strings in reverse order:
            if self.strReverse:
                # StrReverse after hex decoding:
                self.code_hex_rev += '\n' + decoded[::-1]
                # StrReverse before hex decoding:
                self.code_rev_hex += '\n' + binascii.unhexlify(encoded[::-1])
                #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/
        #TODO: also append the full code reversed if StrReverse? (risk of false positives?)
        # Detect Base64-encoded strings
        self.base64_strings = detect_base64_strings(self.code)
        for encoded, decoded in self.base64_strings:
            self.code_base64 += '\n' + decoded
        # Detect Dridex-encoded strings
        self.dridex_strings = detect_dridex_strings(self.code)
        for encoded, decoded in self.dridex_strings:
            self.code_dridex += '\n' + decoded

        self.autoexec_keywords = []
        self.suspicious_keywords = []
        self.iocs = []

        # Run every detector on the clear-text code and on each decoded buffer;
        # the obfuscation label is passed on to the detectors:
        for code, obfuscation in (
                (self.code, None),
                (self.code_hex, 'Hex'),
                (self.code_hex_rev, 'Hex+StrReverse'),
                (self.code_rev_hex, 'StrReverse+Hex'),
                (self.code_base64, 'Base64'),
                (self.code_dridex, 'Dridex'),
        ):
            self.autoexec_keywords += detect_autoexec(code, obfuscation)
            self.suspicious_keywords += detect_suspicious(code, obfuscation)
            self.iocs += detect_patterns(code, obfuscation)

        # If encoded strings were discovered, add an item to suspicious keywords:
        if self.hex_strings:
            self.suspicious_keywords.append(('Hex Strings',
                'Hex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.base64_strings:
            self.suspicious_keywords.append(('Base64 Strings',
                'Base64-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.dridex_strings:
            self.suspicious_keywords.append(('Dridex Strings',
                'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))

        # Flatten everything into a single list of (type, keyword, description):
        results = []
        for keyword, description in self.autoexec_keywords:
            results.append(('AutoExec', keyword, description))
        for keyword, description in self.suspicious_keywords:
            results.append(('Suspicious', keyword, description))
        for pattern_type, value in self.iocs:
            results.append(('IOC', value, pattern_type))
        if include_decoded_strings:
            for encoded, decoded in self.hex_strings:
                results.append(('Hex String', repr(decoded), encoded))
            for encoded, decoded in self.base64_strings:
                results.append(('Base64 String', repr(decoded), encoded))
            for encoded, decoded in self.dridex_strings:
                # type name spelled as documented ('Dridex String'):
                results.append(('Dridex String', repr(decoded), encoded))
        return results

    def scan_summary(self):
        """
        Analyze the provided VBA code to detect suspicious keywords,
        auto-executable macros, IOC patterns, obfuscation patterns
        such as hex-encoded strings.

        :return: tuple with the number of items found for each category:
            (autoexec, suspicious, IOCs, hex, base64, dridex)
        """
        self.scan()
        return (len(self.autoexec_keywords), len(self.suspicious_keywords),
                len(self.iocs), len(self.hex_strings), len(self.base64_strings),
                len(self.dridex_strings))
| 1151 | - | ||
| 1152 | - | ||
| 1153 | - | ||
def scan_vba(vba_code, include_decoded_strings=False):
    """
    Analyze the provided VBA code to detect suspicious keywords,
    auto-executable macros, IOC patterns, obfuscation patterns
    such as hex-encoded strings.
    (shortcut for VBA_Scanner(vba_code).scan())

    :param vba_code: str, VBA source code to be analyzed
    :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content
        (default False, same as VBA_Scanner.scan).
    :return: list of tuples (type, keyword, description)
    (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String')
    """
    return VBA_Scanner(vba_code).scan(include_decoded_strings)
| 1167 | - | ||
| 1168 | - | ||
| 1169 | -#=== CLASSES ================================================================= | ||
| 1170 | - | ||
class VBA_Parser(object):
    """
    Class to parse MS Office files, to detect VBA macros and extract VBA source code
    Supported file formats:
    - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
    - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
    - PowerPoint 2007+ (.pptm, .ppsm)
    - Word 2003 XML (.xml)
    """

    def __init__(self, filename, data=None):
        """
        Constructor for VBA_Parser

        :param filename: filename or path of file to parse, or file-like object

        :param data: None or bytes str, if None the file will be read from disk (or from the file-like object).
            If data is provided as a bytes string, it will be parsed as the content of the file in memory,
            and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb').

        :raises TypeError: if the file is neither OLE, nor zip/OpenXML, nor Word 2003 XML.
        """
        #TODO: filename should only be a string, data should be used for the file-like object
        #TODO: filename should be mandatory, optional data is a string or file-like object
        #TODO: also support olefile and zipfile as input
        if data is None:
            # open file from disk:
            _file = filename
        else:
            # file already read in memory, make it a file-like object for zipfile:
            _file = cStringIO.StringIO(data)
        self.ole_file = None
        self.ole_subfiles = []
        self.filename = filename
        self.type = None
        self.vba_projects = None
        if olefile.isOleFile(_file):
            # This looks like an OLE file
            logging.info('Parsing OLE file %s' % self.filename)
            # Open and parse the OLE file, using unicode for path names:
            self.ole_file = olefile.OleFileIO(_file, path_encoding=None)
            self.type = TYPE_OLE
            #TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet
        elif zipfile.is_zipfile(_file):
            # This looks like a zip file, need to look for vbaProject.bin inside
            # It can be any OLE file inside the archive
            #...because vbaProject.bin can be renamed:
            # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
            logging.info('Opening ZIP/OpenXML file %s' % self.filename)
            self.type = TYPE_OpenXML
            z = zipfile.ZipFile(_file)
            #TODO: check if this is actually an OpenXML file
            #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically?
            try:
                # check each file within the zip if it is an OLE file, by reading its magic:
                for subfile in z.namelist():
                    # only read the first bytes, and release the member handle right away:
                    member = z.open(subfile)
                    try:
                        magic = member.read(len(olefile.MAGIC))
                    finally:
                        member.close()
                    if magic == olefile.MAGIC:
                        logging.debug('Opening OLE file %s within zip' % subfile)
                        ole_data = z.read(subfile)
                        try:
                            self.ole_subfiles.append(VBA_Parser(filename=subfile, data=ole_data))
                        except Exception:
                            # best effort: skip members that only look like OLE files
                            logging.debug('%s is not a valid OLE file' % subfile)
                            continue
            finally:
                # always close the archive, even if a member could not be read:
                z.close()
        else:
            # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML,
            # or a plain text file containing VBA code
            if data is None:
                if hasattr(filename, 'read'):
                    # file-like object: the format checks above may have moved
                    # the current position, so rewind before reading everything
                    if hasattr(filename, 'seek'):
                        filename.seek(0)
                    data = filename.read()
                else:
                    f = open(filename, 'rb')
                    try:
                        data = f.read()
                    finally:
                        f.close()
            # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
            if 'http://schemas.microsoft.com/office/word/2003/wordml' in data:
                logging.info('Opening Word 2003 XML file %s' % self.filename)
                self.type = TYPE_Word2003_XML
                # parse the XML content
                et = ET.fromstring(data)
                # find all the binData elements:
                for bindata in et.getiterator(TAG_BINDATA):
                    # the binData content is an OLE container for the VBA project, compressed
                    # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
                    # get the filename:
                    fname = bindata.get(ATTR_NAME, 'noname.mso')
                    # decode the base64 activemime
                    activemime = binascii.a2b_base64(bindata.text)
                    # decompress the zlib data starting at offset 0x32, which is the OLE container:
                    ole_data = zlib.decompress(activemime[0x32:])
                    try:
                        self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))
                    except Exception:
                        # best effort: skip binData elements which are not OLE containers
                        logging.debug('%s is not a valid OLE file' % fname)
                        continue
                #TODO: handle exceptions
            #TODO: Excel 2003 XML
            #TODO: plain text VBA file
            else:
                msg = '%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % self.filename
                logging.error(msg)
                raise TypeError(msg)

    def find_vba_projects (self):
        """
        Finds all the VBA projects stored in an OLE file.

        Return None if the file is not OLE but OpenXML.
        Return a list of tuples (vba_root, project_path, dir_path) for each VBA project.
        vba_root is the path of the root OLE storage containing the VBA project,
        including a trailing slash unless it is the root of the OLE file.
        project_path is the path of the OLE stream named "PROJECT" within the VBA project.
        dir_path is the path of the OLE stream named "VBA/dir" within the VBA project.

        If this function returns an empty list for one of the supported formats
        (i.e. Word, Excel, Powerpoint except Powerpoint 97-2003), then the
        file does not contain VBA macros.

        The result is cached in self.vba_projects, so the OLE structure is
        only walked on the first call.

        :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path)
        for each VBA project found if OLE file
        """
        # if the file is not OLE but OpenXML, return None:
        if self.ole_file is None:
            return None

        # if this method has already been called, return previous result:
        if self.vba_projects is not None:
            return self.vba_projects

        # Find the VBA project root (different in MS Word, Excel, etc):
        # - Word 97-2003: Macros
        # - Excel 97-2003: _VBA_PROJECT_CUR
        # - PowerPoint 97-2003: not supported yet (different file structure)
        # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin.
        # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word
        # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word
        # - Visio 2007: not supported yet (different file structure)

        # According to MS-OVBA section 2.2.1:
        # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream
        # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream
        # - all names are case-insensitive

        def check_vba_stream(ole, vba_root, stream_path):
            # Return the full path of the stream if it exists under vba_root
            # and is really a stream, False otherwise.
            full_path = vba_root + stream_path
            if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM:
                logging.debug('Found %s stream: %s' % (stream_path, full_path))
                return full_path
            else:
                logging.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path)
                return False

        # start with an empty list:
        self.vba_projects = []
        # Look for any storage containing those storage/streams:
        ole = self.ole_file
        for storage in ole.listdir(streams=False, storages=True):
            # Look for a storage ending with "VBA":
            if storage[-1].upper() == 'VBA':
                logging.debug('Found VBA storage: %s' % ('/'.join(storage)))
                vba_root = '/'.join(storage[:-1])
                # Add a trailing slash to vba_root, unless it is the root of the OLE file:
                # (used later to append all the child streams/storages)
                if vba_root != '':
                    vba_root += '/'
                logging.debug('Checking vba_root="%s"' % vba_root)

                # Check if the VBA root storage also contains a PROJECT stream:
                project_path = check_vba_stream(ole, vba_root, 'PROJECT')
                if not project_path:
                    continue
                # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream:
                vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT')
                if not vba_project_path:
                    continue
                # Check if the VBA root storage also contains a VBA/dir stream:
                dir_path = check_vba_stream(ole, vba_root, 'VBA/dir')
                if not dir_path:
                    continue
                # Now we are pretty sure it is a VBA project structure
                logging.debug('VBA root storage: "%s"' % vba_root)
                # append the results to the list as a tuple for later use:
                self.vba_projects.append((vba_root, project_path, dir_path))
        return self.vba_projects

    def detect_vba_macros(self):
        """
        Detect the potential presence of VBA macros in the file, by checking
        if it contains VBA projects. Both OLE and OpenXML files are supported.

        Important: for now, results are accurate only for Word, Excel and PowerPoint
        EXCEPT Powerpoint 97-2003, which has a different structure for VBA.

        Note: this method does NOT attempt to check the actual presence or validity
        of VBA macro source code, so there might be false positives.
        It may also detect VBA macros in files embedded within the main file,
        for example an Excel workbook with macros embedded into a Word
        document without macros may be detected, without distinction.

        :return: bool, True if at least one VBA project has been found, False otherwise
        """
        #TODO: return None or raise exception if format not supported like PPT 97-2003
        #TODO: return the number of VBA projects found instead of True/False?
        # if OpenXML, check all the OLE subfiles:
        if self.ole_file is None:
            for ole_subfile in self.ole_subfiles:
                if ole_subfile.detect_vba_macros():
                    return True
            return False
        # otherwise it's an OLE file, find VBA projects:
        return bool(self.find_vba_projects())

    def extract_macros (self):
        """
        Extract and decompress source code for each VBA macro found in the file

        Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found
        If the file is OLE, filename is the path of the file.
        If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros
        within the zip archive, e.g. word/vbaProject.bin.
        """
        if self.ole_file is None:
            # container format: delegate to each embedded OLE file
            for ole_subfile in self.ole_subfiles:
                for results in ole_subfile.extract_macros():
                    yield results
        else:
            self.find_vba_projects()
            for vba_root, project_path, dir_path in self.vba_projects:
                # extract all VBA macros from that VBA root storage:
                for stream_path, vba_filename, vba_code in _extract_vba(self.ole_file, vba_root, project_path, dir_path):
                    yield (self.filename, stream_path, vba_filename, vba_code)

    def close(self):
        """
        Close all the open files. This method must be called after usage, if
        the application is opening many files.
        """
        if self.ole_file is None:
            for ole_subfile in self.ole_subfiles:
                ole_subfile.close()
        else:
            self.ole_file.close()
| 1418 | - | ||
| 1419 | - | ||
def print_analysis(vba_code, show_decoded_strings=False):
    """
    Scan the provided VBA code with scan_vba and print the findings as a
    three-column table (Type, Keyword, Description), or a one-line message
    when nothing was found.

    :param vba_code: str, VBA source code to be analyzed
    :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
    :return: None
    """
    findings = scan_vba(vba_code, show_decoded_strings)
    if not findings:
        print('No suspicious keyword or IOC found.')
        return
    table = prettytable.PrettyTable(('Type', 'Keyword', 'Description'))
    table.align = 'l'
    # column widths chosen so that the whole table fits in a 79-column terminal:
    table.max_width['Type'] = 10
    table.max_width['Keyword'] = 20
    table.max_width['Description'] = 39
    for finding_type, finding_keyword, finding_description in findings:
        table.add_row((finding_type, finding_keyword, finding_description))
    print(table)
| 1440 | - | ||
| 1441 | - | ||
| 1442 | - | ||
def process_file (container, filename, data, show_decoded_strings=False):
    """
    Process a single file in detailed mode: print its type, the source code
    of every VBA macro found, and the analysis of each macro.

    Errors raised while parsing or analysing the file are printed with a
    full stack trace and do not stop the caller; KeyboardInterrupt and
    SystemExit are NOT intercepted, so the user can still abort a long run.

    :param container: str, path and filename of container if the file is within
        a zip archive, None otherwise.
    :param filename: str, path and filename of file on disk, or within the container.
    :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
    :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
    """
    #TODO: replace print by writing to a provided output file (sys.stdout by default)
    if container:
        display_filename = '%s in %s' % (filename, container)
    else:
        display_filename = filename
    print('='*79)
    print('FILE: %s' % (display_filename,))
    try:
        #TODO: handle olefile errors, when an OLE file is malformed
        vba = VBA_Parser(filename, data)
        try:
            print('Type: %s' % (vba.type,))
            if vba.detect_vba_macros():
                for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
                    # hide attribute lines:
                    #TODO: option to disable attribute filtering
                    vba_code_filtered = filter_vba(vba_code)
                    print('-'*79)
                    print('VBA MACRO %s ' % vba_filename)
                    print('in file: %s - OLE stream: %s' % (subfilename, repr(stream_path)))
                    print('- '*39)
                    # detect empty macros:
                    if vba_code_filtered.strip() == '':
                        print('(empty macro)')
                    else:
                        print(vba_code_filtered)
                        print('- '*39)
                        print('ANALYSIS:')
                        # analyse the whole code, filtered to avoid false positives:
                        print_analysis(vba_code_filtered, show_decoded_strings)
            else:
                print('No VBA macros found.')
        finally:
            # release the OLE file handles, important when many files are processed:
            vba.close()
    except Exception:
        #TODO: print more info if debug mode
        # display the exception with full stack trace for debugging, but do not stop:
        traceback.print_exc()
    print('')
| 1492 | - | ||
| 1493 | - | ||
def process_file_triage (container, filename, data):
    """
    Process a single file in triage mode: print one summary line made of
    a flags field followed by the filename, and an error message if any.

    The flags field is a file type prefix ('OLE:', 'OpX:' or 'XML:') followed
    by one character per category, either its letter or '-' when nothing was
    found: M=Macros, A=Auto-executable, S=Suspicious keywords, I=IOCs,
    H=Hex strings, B=Base64 strings, D=Dridex strings.
    '?' means the file format is not supported, '!ERROR' that another error
    occurred. KeyboardInterrupt and SystemExit are not intercepted.

    :param container: str, path and filename of container if the file is within
        a zip archive, None otherwise.
    :param filename: str, path and filename of file on disk, or within the container.
    :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
    """
    #TODO: replace print by writing to a provided output file (sys.stdout by default)
    nb_macros = 0
    nb_autoexec = 0
    nb_suspicious = 0
    nb_iocs = 0
    nb_hexstrings = 0
    nb_base64strings = 0
    nb_dridexstrings = 0
    message = ''
    try:
        #TODO: handle olefile errors, when an OLE file is malformed
        vba = VBA_Parser(filename, data)
        try:
            if vba.detect_vba_macros():
                for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
                    nb_macros += 1
                    if vba_code.strip() != '':
                        # analyse the whole code, filtered to avoid false positives:
                        scanner = VBA_Scanner(filter_vba(vba_code))
                        (found_autoexec, found_suspicious, found_iocs, found_hex,
                         found_base64, found_dridex) = scanner.scan_summary()
                        nb_autoexec += found_autoexec
                        nb_suspicious += found_suspicious
                        nb_iocs += found_iocs
                        nb_hexstrings += found_hex
                        nb_base64strings += found_base64
                        nb_dridexstrings += found_dridex
            if vba.type == TYPE_OLE:
                flags = 'OLE:'
            elif vba.type == TYPE_OpenXML:
                flags = 'OpX:'
            elif vba.type == TYPE_Word2003_XML:
                flags = 'XML:'
            else:
                # not expected with the current parser, but keeps flags defined:
                flags = '???:'
        finally:
            # release the OLE file handles, important when many files are processed:
            vba.close()
        # one letter per category, '-' when the counter is zero:
        flags += ''.join((
            nb_macros and 'M' or '-',
            nb_autoexec and 'A' or '-',
            nb_suspicious and 'S' or '-',
            nb_iocs and 'I' or '-',
            nb_hexstrings and 'H' or '-',
            nb_base64strings and 'B' or '-',
            nb_dridexstrings and 'D' or '-',
        ))
    except TypeError:
        # file type not OLE nor OpenXML
        flags = '?'
        message = 'File format not supported'
    except Exception:
        # another error occurred
        #TODO: print more info if debug mode
        #TODO: distinguish real errors from incorrect file types
        flags = '!ERROR'
        # the exception instance itself, formatted with %s below:
        message = sys.exc_info()[1]
    line = '%-11s %s' % (flags, filename)
    if message:
        line += ' - %s' % message
    print(line)
| 1569 | - | ||
| 1570 | - # t = prettytable.PrettyTable(('filename', 'type', 'macros', 'autoexec', 'suspicious', 'ioc', 'hexstrings'), | ||
| 1571 | - # header=False, border=False) | ||
| 1572 | - # t.align = 'l' | ||
| 1573 | - # t.max_width['filename'] = 30 | ||
| 1574 | - # t.max_width['type'] = 10 | ||
| 1575 | - # t.max_width['macros'] = 6 | ||
| 1576 | - # t.max_width['autoexec'] = 6 | ||
| 1577 | - # t.max_width['suspicious'] = 6 | ||
| 1578 | - # t.max_width['ioc'] = 6 | ||
| 1579 | - # t.max_width['hexstrings'] = 6 | ||
| 1580 | - # t.add_row((filename, ftype, macros, autoexec, suspicious, iocs, hexstrings)) | ||
| 1581 | - # print t | ||
| 1582 | - | ||
def main_triage_quick():
    """
    Placeholder, not implemented: does nothing and returns None.
    """
    return None
| 1585 | - | ||
| 1586 | -#=== MAIN ===================================================================== | ||
| 1587 | - | ||
def main():
    """
    Main function, called when olevba is run from the command line

    Parses the command line options, then either analyzes a file containing
    VBA source code (option -i), or processes each file given as argument in
    triage mode (one summary line per file) or in detailed mode. When neither
    -t nor -d is given, triage mode is used, and if exactly one file was
    processed its detailed results are printed afterwards.
    """
    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)
    # parser.add_option('-o', '--outfile', dest='outfile',
    #     help='output file')
    # parser.add_option('-c', '--csv', dest='csv',
    #     help='export results to a CSV file')
    parser.add_option("-r", action="store_true", dest="recursive",
        help='find files recursively in subdirectories.')
    parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
        help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)')
    parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
        help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
    parser.add_option("-t", action="store_true", dest="triage_mode",
        help='triage mode, display results as a summary table (default for multiple files)')
    parser.add_option("-d", action="store_true", dest="detailed_mode",
        help='detailed mode, display full results (default for single file)')
    parser.add_option("-i", "--input", dest='input', type='str', default=None,
        help='input file containing VBA source code to be analyzed (no parsing)')
    parser.add_option("--decode", action="store_true", dest="show_decoded_strings",
        help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex).')

    (options, args) = parser.parse_args()

    # Print help if no arguments are passed
    if len(args) == 0 and not options.input:
        print(__doc__)
        parser.print_help()
        sys.exit()

    # print banner with version
    print('olevba %s - http://decalage.info/python/oletools' % __version__)

    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) #INFO)
    # For now, all logging is disabled:
    logging.disable(logging.CRITICAL)

    if options.input:
        # input file provided with VBA source code to be analyzed directly:
        print('Analysis of VBA source code from %s:' % options.input)
        input_file = open(options.input)
        try:
            vba_code = input_file.read()
        finally:
            input_file.close()
        print_analysis(vba_code, show_decoded_strings=options.show_decoded_strings)
        sys.exit()

    # triage output is used unless -d was given alone:
    triage_output = not options.detailed_mode or options.triage_mode
    if triage_output:
        print('%-11s %-65s' % ('Flags', 'Filename'))
        print('%-11s %-65s' % ('-'*11, '-'*65))
    previous_container = None
    count = 0
    # (container, filename, data) of the last file actually processed; the loop
    # variables cannot be used for that, because the last entry yielded may be
    # a directory name which was skipped:
    last_processed = None
    for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
        zip_password=options.zip_password, zip_fname=options.zip_fname):
        # ignore directory names stored in zip files:
        if container and filename.endswith('/'):
            continue
        if not triage_output:
            # fully detailed output
            process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings)
        else:
            # print container name when it changes:
            if container != previous_container:
                if container is not None:
                    print('\nFiles in %s:' % container)
                previous_container = container
            # summarized output for triage:
            process_file_triage(container, filename, data)
        last_processed = (container, filename, data)
        count += 1
    if triage_output:
        print('\n(Flags: OpX=OpenXML, XML=Word2003XML, M=Macros, A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, B=Base64 strings, D=Dridex strings, ?=Unknown)\n')

    if count == 1 and not options.triage_mode and not options.detailed_mode:
        # if options -t and -d were not specified and it's a single file, print details:
        #TODO: avoid doing the analysis twice by storing results
        container, filename, data = last_processed
        process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings)
| 1667 | - | ||
# Run the command line interface only when this file is executed as a script,
# not when it is imported as a module:
if __name__ == '__main__':
    main()
| 1670 | - | 1 | +#!/usr/bin/env python |
| 2 | +""" | ||
| 3 | +olevba.py | ||
| 4 | + | ||
| 5 | +olevba is a script to parse OLE and OpenXML files such as MS Office documents | ||
| 6 | +(e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate | ||
| 7 | +and analyze malicious macros. | ||
| 8 | + | ||
| 9 | +Supported formats: | ||
| 10 | +- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) | ||
| 11 | +- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) | ||
| 12 | +- PowerPoint 2007+ (.pptm, .ppsm) | ||
| 13 | +- Word 2003 XML (.xml) | ||
| 14 | + | ||
| 15 | +Author: Philippe Lagadec - http://www.decalage.info | ||
| 16 | +License: BSD, see source code or documentation | ||
| 17 | + | ||
| 18 | +olevba is part of the python-oletools package: | ||
| 19 | +http://www.decalage.info/python/oletools | ||
| 20 | + | ||
| 21 | +olevba is based on source code from officeparser by John William Davison | ||
| 22 | +https://github.com/unixfreak0037/officeparser | ||
| 23 | +""" | ||
| 24 | + | ||
| 25 | +#=== LICENSE ================================================================== | ||
| 26 | + | ||
| 27 | +# olevba is copyright (c) 2014-2015 Philippe Lagadec (http://www.decalage.info) | ||
| 28 | +# All rights reserved. | ||
| 29 | +# | ||
| 30 | +# Redistribution and use in source and binary forms, with or without modification, | ||
| 31 | +# are permitted provided that the following conditions are met: | ||
| 32 | +# | ||
| 33 | +# * Redistributions of source code must retain the above copyright notice, this | ||
| 34 | +# list of conditions and the following disclaimer. | ||
| 35 | +# * Redistributions in binary form must reproduce the above copyright notice, | ||
| 36 | +# this list of conditions and the following disclaimer in the documentation | ||
| 37 | +# and/or other materials provided with the distribution. | ||
| 38 | +# | ||
| 39 | +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | ||
| 40 | +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||
| 41 | +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||
| 42 | +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | ||
| 43 | +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 44 | +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||
| 45 | +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | ||
| 46 | +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||
| 47 | +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
| 48 | +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 49 | + | ||
| 50 | + | ||
| 51 | +# olevba contains modified source code from the officeparser project, published | ||
| 52 | +# under the following MIT License (MIT): | ||
| 53 | +# | ||
| 54 | +# officeparser is copyright (c) 2014 John William Davison | ||
| 55 | +# | ||
| 56 | +# Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| 57 | +# of this software and associated documentation files (the "Software"), to deal | ||
| 58 | +# in the Software without restriction, including without limitation the rights | ||
| 59 | +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| 60 | +# copies of the Software, and to permit persons to whom the Software is | ||
| 61 | +# furnished to do so, subject to the following conditions: | ||
| 62 | +# | ||
| 63 | +# The above copyright notice and this permission notice shall be included in all | ||
| 64 | +# copies or substantial portions of the Software. | ||
| 65 | +# | ||
| 66 | +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| 67 | +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| 68 | +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
| 69 | +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| 70 | +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| 71 | +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| 72 | +# SOFTWARE. | ||
| 73 | + | ||
| 74 | +#------------------------------------------------------------------------------ | ||
| 75 | +# CHANGELOG: | ||
| 76 | +# 2014-08-05 v0.01 PL: - first version based on officeparser code | ||
| 77 | +# 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser | ||
| 78 | +# 2014-08-15 PL: - fixed incorrect value check in PROJECTHELPFILEPATH Record | ||
| 79 | +# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats | ||
| 80 | +# and to find the VBA project root anywhere in the file | ||
| 81 | +# 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL | ||
| 82 | +# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API | ||
| 83 | +# - added detect_vba_macros | ||
| 84 | +# 2014-12-10 v0.06 PL: - hide first lines with VB attributes | ||
| 85 | +# - detect auto-executable macros | ||
| 86 | +# - ignore empty macros | ||
| 87 | +# 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive | ||
| 88 | +# 2014-12-15 v0.08 PL: - improved display for empty macros | ||
| 89 | +# - added pattern extraction | ||
| 90 | +# 2014-12-25 v0.09 PL: - added suspicious keywords detection | ||
| 91 | +# 2014-12-27 v0.10 PL: - added OptionParser, main and process_file | ||
| 92 | +# - uses xglob to scan several files with wildcards | ||
| 93 | +# - option -r to recurse subdirectories | ||
| 94 | +# - option -z to scan files in password-protected zips | ||
| 95 | +# 2015-01-02 v0.11 PL: - improved filter_vba to detect colons | ||
| 96 | +# 2015-01-03 v0.12 PL: - fixed detect_patterns to detect all patterns | ||
| 97 | +# - process_file: improved display, shows container file | ||
| 98 | +# - improved list of executable file extensions | ||
| 99 | +# 2015-01-04 v0.13 PL: - added several suspicious keywords, improved display | ||
| 100 | +# 2015-01-08 v0.14 PL: - added hex strings detection and decoding | ||
| 101 | +# - fixed issue #2, decoding VBA stream names using | ||
| 102 | +# specified codepage and unicode stream names | ||
| 103 | +# 2015-01-11 v0.15 PL: - added new triage mode, options -t and -d | ||
| 104 | +# 2015-01-16 v0.16 PL: - fix for issue #3 (exception when module name="text") | ||
| 105 | +# - added several suspicious keywords | ||
| 106 | +# - added option -i to analyze VBA source code directly | ||
| 107 | +# 2015-01-17 v0.17 PL: - removed .com from the list of executable extensions | ||
| 108 | +# - added scan_vba to run all detection algorithms | ||
| 109 | +# - decoded hex strings are now also scanned + reversed | ||
| 110 | +# 2015-01-23 v0.18 PL: - fixed issue #3, case-insensitive search in code_modules | ||
| 111 | +# 2015-01-24 v0.19 PL: - improved the detection of IOCs obfuscated with hex | ||
| 112 | +# strings and StrReverse | ||
| 113 | +# 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded | ||
| 114 | +# 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding | ||
| 115 | +# - improved display, shows obfuscation name | ||
| 116 | +# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename | ||
| 117 | +# - added Base64 obfuscation decoding (contribution from | ||
| 118 | +# @JamesHabben) | ||
| 119 | +# 2015-02-03 v0.23 PL: - triage now uses VBA_Scanner results, shows Base64 and | ||
| 120 | +# Dridex strings | ||
| 121 | +# - exception handling in detect_base64_strings | ||
| 122 | +# 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display | ||
| 123 | +# - display exceptions with stack trace | ||
| 124 | +# - added several suspicious keywords | ||
| 125 | +# - improved Base64 detection and decoding | ||
| 126 | +# - fixed triage mode not to scan attrib lines | ||
| 127 | +# 2015-03-04 v0.25 PL: - added support for Word 2003 XML | ||
| 128 | + | ||
| 129 | +__version__ = '0.25' | ||
| 130 | + | ||
| 131 | +#------------------------------------------------------------------------------ | ||
| 132 | +# TODO: | ||
| 133 | +# + do not use logging, but a provided logger (null logger by default) | ||
| 134 | +# + setup logging (common with other oletools) | ||
| 135 | +# + add xor bruteforcing like bbharvest | ||
| 136 | +# + add chr() decoding | ||
| 137 | + | ||
| 138 | +# TODO later: | ||
| 139 | +# + performance improvement: instead of searching each keyword separately, | ||
| 140 | +# first split vba code into a list of words (per line), then check each | ||
| 141 | +# word against a dict. (or put vba words into a set/dict?) | ||
| 142 | +# + for regex, maybe combine them into a single re with named groups? | ||
| 143 | +# + add Yara support, include sample rules? plugins like balbuzard? | ||
| 144 | +# + add balbuzard support | ||
| 145 | +# + output to file (replace print by file.write, sys.stdout by default) | ||
| 146 | +# + look for VBA in embedded documents (e.g. Excel in Word) | ||
| 147 | +# + support SRP streams (see Lenny's article + links and sample) | ||
| 148 | +# - python 3.x support | ||
| 149 | +# - add support for PowerPoint macros (see libclamav, libgsf), use oledump heuristic? | ||
| 150 | +# - check VBA macros in Visio, Access, Project, etc | ||
| 151 | +# - extract_macros: convert to a class, split long function into smaller methods | ||
| 152 | +# - extract_macros: read bytes from stream file objects instead of strings | ||
| 153 | +# - extract_macros: use combined struct.unpack instead of many calls | ||
| 154 | + | ||
| 155 | +#------------------------------------------------------------------------------ | ||
| 156 | +# REFERENCES: | ||
| 157 | +# - [MS-OVBA]: Microsoft Office VBA File Format Structure | ||
| 158 | +# http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx | ||
| 159 | +# - officeparser: https://github.com/unixfreak0037/officeparser | ||
| 160 | + | ||
| 161 | + | ||
| 162 | +#--- IMPORTS ------------------------------------------------------------------ | ||
| 163 | + | ||
| 164 | +import sys, logging | ||
| 165 | +import struct | ||
| 166 | +import cStringIO | ||
| 167 | +import math | ||
| 168 | +import zipfile | ||
| 169 | +import re | ||
| 170 | +import optparse | ||
| 171 | +import os.path | ||
| 172 | +import binascii | ||
| 173 | +import base64 | ||
| 174 | +import traceback | ||
| 175 | +import zlib | ||
| 176 | + | ||
| 177 | +# import lxml or ElementTree for XML parsing: | ||
| 178 | +try: | ||
| 179 | + # lxml: best performance for XML processing | ||
| 180 | + import lxml.etree as ET | ||
| 181 | +except ImportError: | ||
| 182 | + try: | ||
| 183 | + # Python 2.5+: batteries included | ||
| 184 | + import xml.etree.cElementTree as ET | ||
| 185 | + except ImportError: | ||
| 186 | + try: | ||
| 187 | + # Python <2.5: standalone ElementTree install | ||
| 188 | + import elementtree.cElementTree as ET | ||
| 189 | + except ImportError: | ||
| 190 | + raise ImportError, "lxml or ElementTree are not installed, "\ | ||
| 191 | + +"see http://codespeak.net/lxml "\ | ||
| 192 | + +"or http://effbot.org/zone/element-index.htm" | ||
| 193 | + | ||
| 194 | +import thirdparty.olefile as olefile | ||
| 195 | +from thirdparty.prettytable import prettytable | ||
| 196 | +from thirdparty.xglob import xglob | ||
| 197 | + | ||
| 198 | +#--- CONSTANTS ---------------------------------------------------------------- | ||
| 199 | + | ||
# Labels for the container file formats listed under "Supported formats" in the
# module docstring (OLE, OpenXML, Word 2003 XML).
# NOTE(review): the code that assigns/compares these labels is outside this section.
TYPE_OLE = 'OLE'
TYPE_OpenXML = 'OpenXML'
TYPE_Word2003_XML = 'Word2003_XML'

# File extensions associated by _extract_vba with each kind of code module declared
# in the PROJECT stream:
# - "Module=" entries                  => MODULE_EXTENSION
# - "Document=" and "Class=" entries   => CLASS_EXTENSION
# - "BaseClass=" entries               => FORM_EXTENSION
MODULE_EXTENSION = "bas"
CLASS_EXTENSION = "cls"
FORM_EXTENSION = "frm"

# Namespaces and tags for Word2003 XML parsing:
# (ElementTree-style qualified names: '{namespace URI}' followed by the local name)
NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}'
# the tag <w:binData w:name="editdata.mso"> contains the VBA macro code:
TAG_BINDATA = NS_W + 'binData'
ATTR_NAME = NS_W + 'name'
| 213 | + | ||
# Keywords to detect auto-executable macros
# Structure: {description of when the macro runs: tuple of macro/event names}
# Each description is a human-readable string; each value is always a tuple, even
# when it holds a single name (hence the trailing comma in one-element tuples).
# NOTE(review): the changelog states that detect_autoexec() matches these names
# case-insensitively - the matching code itself is outside this section.
AUTOEXEC_KEYWORDS = {
    # MS Word:
    'Runs when the Word document is opened':
        ('AutoExec', 'AutoOpen', 'Document_Open', 'DocumentOpen'),
    'Runs when the Word document is closed':
        ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose'),
    'Runs when the Word document is modified':
        ('DocumentChange',),
    'Runs when a new Word document is created':
        ('AutoNew', 'Document_New', 'NewDocument'),

    # MS Excel:
    'Runs when the Excel Workbook is opened':
        ('Auto_Open', 'Workbook_Open'),
    'Runs when the Excel Workbook is closed':
        ('Auto_Close', 'Workbook_Close'),

    #TODO: full list in MS specs??
}
| 234 | + | ||
# Suspicious Keywords that may be used by malware
# See VBA language reference: http://msdn.microsoft.com/en-us/library/office/jj692818%28v=office.15%29.aspx
# Structure: {description of the suspicious behaviour: tuple of VBA keywords}
# Each value is always a tuple, even when it holds a single keyword.
# Several entries are only meaningful in combination with another keyword (see the
# "if combined with ..." descriptions); the TODO notes below list the planned
# same-line regex checks that would make those combinations explicit.
# NOTE(review): the code that searches for these keywords is outside this section.
SUSPICIOUS_KEYWORDS = {
    #TODO: use regex to support variable whitespaces
    'May read system environment variables':
        ('Environ',),
    'May open a file':
        ('Open',),
    'May write to a file (if combined with Open)':
        #TODO: regex to find Open+Write on same line
        ('Write', 'Put', 'Output', 'Print #'),
    'May read or write a binary file (if combined with Open)':
        #TODO: regex to find Open+Binary on same line
        ('Binary',),
    'May copy a file':
        ('FileCopy', 'CopyFile'),
        #FileCopy: http://msdn.microsoft.com/en-us/library/office/gg264390%28v=office.15%29.aspx
        #CopyFile: http://msdn.microsoft.com/en-us/library/office/gg264089%28v=office.15%29.aspx
    'May delete a file':
        ('Kill',),
    'May create a text file':
        ('CreateTextFile','ADODB.Stream', 'WriteText', 'SaveToFile'),
        #CreateTextFile: http://msdn.microsoft.com/en-us/library/office/gg264617%28v=office.15%29.aspx
        #ADODB.Stream sample: http://pastebin.com/Z4TMyuq6
    'May run an executable file or a system command':
        ('Shell', 'vbNormal', 'vbNormalFocus', 'vbHide', 'vbMinimizedFocus', 'vbMaximizedFocus', 'vbNormalNoFocus',
         'vbMinimizedNoFocus', 'WScript.Shell', 'Run'),
        #Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx
        #WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6
    'May hide the application':
        ('Application.Visible', 'ShowWindow', 'SW_HIDE'),
    'May create a directory':
        ('MkDir',),
    'May save the current workbook':
        ('ActiveWorkbook.SaveAs',),
    'May change which directory contains files to open at startup':
        #TODO: confirm the actual effect
        ('Application.AltStartupPath',),
    'May create an OLE object':
        ('CreateObject',),
    'May run an application (if combined with CreateObject)':
        ('Shell.Application',),
    'May enumerate application windows (if combined with Shell.Application object)':
        ('Windows', 'FindWindow'),
    'May run code from a DLL':
        #TODO: regex to find declare+lib on same line
        ('Lib',),
    'May download files from the Internet':
        #TODO: regex to find urlmon+URLDownloadToFileA on same line
        ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP'),
    'May control another application by simulating user keystrokes':
        ('SendKeys', 'AppActivate'),
        #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx
    'May attempt to obfuscate malicious function calls':
        ('CallByName',),
        #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx
    'May attempt to obfuscate specific strings':
        #TODO: regex to find several Chr*, not just one
        ('Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor'),
        #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx
}
| 296 | + | ||
# Regular Expression for a URL:
# http://en.wikipedia.org/wiki/Uniform_resource_locator
# http://www.w3.org/Addressing/URL/uri-spec.html
# The URL regex is assembled from the smaller building blocks below; each block is
# a non-capturing group so that it can be concatenated or reused (e.g. SERVER is
# also used by the e-mail address pattern).
#TODO: also support username:password@server
#TODO: other protocols (file, gopher, wais, ...?)
SCHEME = r'\b(?:http|ftp)s?'
# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})'
DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + ')'
#TODO: IPv6 - see https://www.debuggex.com/
# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. [db8:0cec::99:123a]
NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'
IPv4 = r'(?:'+NUMBER_0_255+r'\.){3}'+NUMBER_0_255
# IPv4 must come before the DNS name because it is more specific
SERVER = r'(?:' + IPv4 + '|' + DNS_NAME + ')'
PORT = r'(?:\:[0-9]{1,5})?'
SERVER_PORT = SERVER + PORT
URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?' # [^\.\,\)\(\s"]
URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH
re_url = re.compile(URL_RE)


# Patterns to be extracted (IP addresses, URLs, etc)
# From patterns.py in balbuzard
# Structure: tuple of (pattern name, compiled regex) pairs.
RE_PATTERNS = (
    ('URL', re.compile(URL_RE)),
    ('IPv4 address', re.compile(IPv4)),
    # NOTE: the trailing word boundary MUST be a raw string: a plain '\b' is the
    # backspace character (0x08), which made the pattern require a literal
    # backspace after the address and therefore never match normal e-mail addresses.
    ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@'+SERVER+r'\b')),
    # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')),
    # Executable file name with known extensions (except .com which is present in many URLs, and .application):
    ("Executable file name", re.compile(r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b")),
    # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
    #TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types
    #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')),
    )
| 332 | + | ||
# regex to detect strings encoded in hexadecimal
# (a run of at least 4 pairs of hex digits, i.e. 8 or more hex characters)
re_hex_string = re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')

# regex to detect strings encoded in base64
#re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?"')
# better version from balbuzard, fewer false positives:
# matches a double-quoted string made of one or more groups of 4 base64 characters,
# optionally followed by a final padded group ('=' or '==') whose last character is
# restricted to the values listed in the character classes below.
re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4}){1,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)?"')
# white list of common strings matching the base64 regex, but which are not base64 strings (all lowercase):
BASE64_WHITELIST = set(['thisdocument', 'thisworkbook', 'test', 'temp', 'http', 'open', 'exit'])

# regex to detect strings encoded with a specific Dridex algorithm
# (see https://github.com/JamesHabben/MalwareStuff)
# matches a double-quoted run of at least 20 ASCII letters or digits
re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"')
# regex to check that it is not just a hex string:
# (matches any letter from G to Z, upper or lower case, i.e. a letter that cannot be a hex digit)
re_nothex_check = re.compile(r'[G-Zg-z]')
| 348 | + | ||
| 349 | +#--- FUNCTIONS ---------------------------------------------------------------- | ||
| 350 | + | ||
def copytoken_help(decompressed_current, decompressed_chunk_start):
    """
    compute bit masks to decode a CopyToken according to MS-OVBA 2.4.1.3.19.1 CopyToken Help

    decompressed_current: number of decompressed bytes so far, i.e. len(decompressed_container)
    decompressed_chunk_start: offset of the current chunk in the decompressed container
    return length_mask, offset_mask, bit_count, maximum_length
    raise ValueError if no byte of the current chunk has been decompressed yet
    (a CopyToken cannot reference data before at least one byte was written)
    """
    difference = decompressed_current - decompressed_chunk_start
    if difference < 1:
        # the logarithm is undefined here: report it explicitly instead of an
        # obscure "math domain error"
        raise ValueError('invalid CopyToken: no decompressed data in the current chunk '
                         '(difference = {0})'.format(difference))
    # bit_count = max(ceil(log2(difference)), 4), computed with integers only to
    # avoid any dependency on floating point rounding:
    bit_count = 4
    while (1 << bit_count) < difference:
        bit_count += 1
    length_mask = 0xFFFF >> bit_count
    offset_mask = ~length_mask
    maximum_length = (0xFFFF >> bit_count) + 3
    return length_mask, offset_mask, bit_count, maximum_length


def decompress_stream (compressed_container):
    """
    Decompress a stream according to MS-OVBA section 2.4.1

    compressed_container: string compressed according to the MS-OVBA 2.4.1.3.6 Compression algorithm
    return the decompressed container as a string (bytes)
    raise ValueError if the container is empty, if its signature byte is not 0x01
    or if a chunk header is invalid
    """
    # 2.4.1.2 State Variables

    # The following state is maintained for the CompressedContainer (section 2.4.1.1.1):
    # CompressedRecordEnd: The location of the byte after the last byte in the CompressedContainer (section 2.4.1.1.1).
    # CompressedCurrent: The location of the next byte in the CompressedContainer (section 2.4.1.1.1) to be read by
    #                    decompression or to be written by compression.

    # The following state is maintained for the current CompressedChunk (section 2.4.1.1.4):
    # CompressedChunkStart: The location of the first byte of the CompressedChunk (section 2.4.1.1.4) within the
    #                       CompressedContainer (section 2.4.1.1.1).

    # The following state is maintained for a DecompressedBuffer (section 2.4.1.1.2):
    # DecompressedCurrent: The location of the next byte in the DecompressedBuffer (section 2.4.1.1.2) to be written by
    #                      decompression or to be read by compression.
    # DecompressedBufferEnd: The location of the byte after the last byte in the DecompressedBuffer (section 2.4.1.1.2).

    # The following state is maintained for the current DecompressedChunk (section 2.4.1.1.3):
    # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the
    #                         DecompressedBuffer (section 2.4.1.1.2).

    # Both buffers are handled as bytearrays: indexing yields integers, and appending
    # to the result is cheap (growing an immutable string one byte at a time copies
    # the whole string again and again).
    compressed = bytearray(compressed_container)
    compressed_length = len(compressed)
    decompressed_container = bytearray()  # result
    compressed_current = 0

    if compressed_length == 0:
        raise ValueError('empty compressed container: missing signature byte')
    sig_byte = compressed[compressed_current]
    if sig_byte != 0x01:
        raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))

    compressed_current += 1

    #NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that
    # CompressedRecordEnd = len(compressed_container)
    while compressed_current < compressed_length:
        # 2.4.1.1.5
        compressed_chunk_start = compressed_current
        # chunk header = first 16 bits
        # (struct.error is raised if less than 2 bytes are left)
        compressed_chunk_header = struct.unpack(
            "<H", bytes(compressed[compressed_chunk_start:compressed_chunk_start + 2]))[0]
        # chunk size = 12 first bits of header + 3
        chunk_size = (compressed_chunk_header & 0x0FFF) + 3
        # chunk signature = 3 next bits - should always be 0b011
        chunk_signature = (compressed_chunk_header >> 12) & 0x07
        if chunk_signature != 0b011:
            raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream')
        # chunk flag = next bit - 1 == compressed, 0 == uncompressed
        chunk_flag = (compressed_chunk_header >> 15) & 0x01
        # arguments are passed to logging so that the message is only built when needed
        logging.debug("chunk size = %d, compressed flag = %d", chunk_size, chunk_flag)

        #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096)
        # The minimum size is 3 bytes
        # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value
        # in chunk header before adding 3).
        # Also the first test is not useful since a 12 bits value cannot be larger than 4095.
        if chunk_flag == 1 and chunk_size > 4098:
            raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1')
        if chunk_flag == 0 and chunk_size != 4098:
            raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0')

        # check if chunk_size goes beyond the compressed data, instead of silently cutting it:
        #TODO: raise an exception?
        if compressed_chunk_start + chunk_size > compressed_length:
            logging.warning('Chunk size is larger than remaining compressed data')
        compressed_end = min([compressed_length, compressed_chunk_start + chunk_size])
        # read after chunk header:
        compressed_current = compressed_chunk_start + 2

        if chunk_flag == 0:
            # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
            # uncompressed chunk: read the next 4096 bytes as-is
            #TODO: check if there are at least 4096 bytes left
            decompressed_container += compressed[compressed_current:compressed_current + 4096]
            compressed_current += 4096
        else:
            # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
            # compressed chunk
            decompressed_chunk_start = len(decompressed_container)
            while compressed_current < compressed_end:
                # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence
                # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
                # copy tokens (reference to a previous literal token)
                flag_byte = compressed[compressed_current]
                compressed_current += 1
                for bit_index in range(0, 8):
                    # the last token sequence of a chunk may contain less than 8 tokens:
                    if compressed_current >= compressed_end:
                        break
                    # MS-OVBA 2.4.1.3.5 Decompressing a Token
                    # MS-OVBA 2.4.1.3.17 Extract FlagBit
                    flag_bit = (flag_byte >> bit_index) & 1
                    if flag_bit == 0:  # LiteralToken
                        # copy one byte directly to output
                        decompressed_container.append(compressed[compressed_current])
                        compressed_current += 1
                    else:  # CopyToken
                        # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
                        copy_token = struct.unpack(
                            "<H", bytes(compressed[compressed_current:compressed_current + 2]))[0]
                        #TODO: check this
                        length_mask, offset_mask, bit_count, maximum_length = copytoken_help(
                            len(decompressed_container), decompressed_chunk_start)
                        length = (copy_token & length_mask) + 3
                        temp1 = copy_token & offset_mask
                        temp2 = 16 - bit_count
                        offset = (temp1 >> temp2) + 1
                        copy_source = len(decompressed_container) - offset
                        # The copy MUST be done byte by byte: source and destination
                        # may overlap (length > offset), in which case bytes appended
                        # by this very loop are read again.
                        #TODO: an offset larger than the data decompressed so far gives a
                        # negative copy_source, which is not rejected here
                        for index in range(copy_source, copy_source + length):
                            decompressed_container.append(decompressed_container[index])
                        compressed_current += 2
    # convert back to an immutable byte string (str on Python 2):
    return bytes(decompressed_container)
| 484 | + | ||
| 485 | + | ||
| 486 | +def _extract_vba (ole, vba_root, project_path, dir_path): | ||
| 487 | + """ | ||
| 488 | + Extract VBA macros from an OleFileIO object. | ||
| 489 | + Internal function, do not call directly. | ||
| 490 | + | ||
| 491 | + vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream | ||
| 492 | + vba_project: path to the PROJECT stream | ||
| 493 | + This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream | ||
| 494 | + """ | ||
| 495 | + # Open the PROJECT stream: | ||
| 496 | + project = ole.openstream(project_path) | ||
| 497 | + | ||
| 498 | + # sample content of the PROJECT stream: | ||
| 499 | + | ||
| 500 | + ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}" | ||
| 501 | + ## Document=ThisDocument/&H00000000 | ||
| 502 | + ## Module=NewMacros | ||
| 503 | + ## Name="Project" | ||
| 504 | + ## HelpContextID="0" | ||
| 505 | + ## VersionCompatible32="393222000" | ||
| 506 | + ## CMG="F1F301E705E705E705E705" | ||
| 507 | + ## DPB="8F8D7FE3831F2020202020" | ||
| 508 | + ## GC="2D2FDD81E51EE61EE6E1" | ||
| 509 | + ## | ||
| 510 | + ## [Host Extender Info] | ||
| 511 | + ## &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000 | ||
| 512 | + ## &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000 | ||
| 513 | + ## | ||
| 514 | + ## [Workspace] | ||
| 515 | + ## ThisDocument=22, 29, 339, 477, Z | ||
| 516 | + ## NewMacros=-4, 42, 832, 510, C | ||
| 517 | + | ||
| 518 | + code_modules = {} | ||
| 519 | + | ||
| 520 | + for line in project: | ||
| 521 | + line = line.strip() | ||
| 522 | + if '=' in line: | ||
| 523 | + # split line at the 1st equal sign: | ||
| 524 | + name, value = line.split('=', 1) | ||
| 525 | + # looking for code modules | ||
| 526 | + # add the code module as a key in the dictionary | ||
| 527 | + # the value will be the extension needed later | ||
| 528 | + # The value is converted to lowercase, to allow case-insensitive matching (issue #3) | ||
| 529 | + value = value.lower() | ||
| 530 | + if name == 'Document': | ||
| 531 | + # split value at the 1st slash, keep 1st part: | ||
| 532 | + value = value.split('/', 1)[0] | ||
| 533 | + code_modules[value] = CLASS_EXTENSION | ||
| 534 | + elif name == 'Module': | ||
| 535 | + code_modules[value] = MODULE_EXTENSION | ||
| 536 | + elif name == 'Class': | ||
| 537 | + code_modules[value] = CLASS_EXTENSION | ||
| 538 | + elif name == 'BaseClass': | ||
| 539 | + code_modules[value] = FORM_EXTENSION | ||
| 540 | + | ||
| 541 | + # read data from dir stream (compressed) | ||
| 542 | + dir_compressed = ole.openstream(dir_path).read() | ||
| 543 | + | ||
| 544 | + def check_value(name, expected, value): | ||
| 545 | + if expected != value: | ||
| 546 | + logging.error("invalid value for {0} expected {1:04X} got {2:04X}".format(name, expected, value)) | ||
| 547 | + | ||
| 548 | + dir_stream = cStringIO.StringIO(decompress_stream(dir_compressed)) | ||
| 549 | + | ||
| 550 | + # PROJECTSYSKIND Record | ||
| 551 | + PROJECTSYSKIND_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 552 | + check_value('PROJECTSYSKIND_Id', 0x0001, PROJECTSYSKIND_Id) | ||
| 553 | + PROJECTSYSKIND_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 554 | + check_value('PROJECTSYSKIND_Size', 0x0004, PROJECTSYSKIND_Size) | ||
| 555 | + PROJECTSYSKIND_SysKind = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 556 | + if PROJECTSYSKIND_SysKind == 0x00: | ||
| 557 | + logging.debug("16-bit Windows") | ||
| 558 | + elif PROJECTSYSKIND_SysKind == 0x01: | ||
| 559 | + logging.debug("32-bit Windows") | ||
| 560 | + elif PROJECTSYSKIND_SysKind == 0x02: | ||
| 561 | + logging.debug("Macintosh") | ||
| 562 | + elif PROJECTSYSKIND_SysKind == 0x03: | ||
| 563 | + logging.debug("64-bit Windows") | ||
| 564 | + else: | ||
| 565 | + logging.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(PROJECTSYSKIND_SysKind)) | ||
| 566 | + | ||
| 567 | + # PROJECTLCID Record | ||
| 568 | + PROJECTLCID_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 569 | + check_value('PROJECTLCID_Id', 0x0002, PROJECTLCID_Id) | ||
| 570 | + PROJECTLCID_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 571 | + check_value('PROJECTLCID_Size', 0x0004, PROJECTLCID_Size) | ||
| 572 | + PROJECTLCID_Lcid = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 573 | + check_value('PROJECTLCID_Lcid', 0x409, PROJECTLCID_Lcid) | ||
| 574 | + | ||
| 575 | + # PROJECTLCIDINVOKE Record | ||
| 576 | + PROJECTLCIDINVOKE_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 577 | + check_value('PROJECTLCIDINVOKE_Id', 0x0014, PROJECTLCIDINVOKE_Id) | ||
| 578 | + PROJECTLCIDINVOKE_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 579 | + check_value('PROJECTLCIDINVOKE_Size', 0x0004, PROJECTLCIDINVOKE_Size) | ||
| 580 | + PROJECTLCIDINVOKE_LcidInvoke = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 581 | + check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, PROJECTLCIDINVOKE_LcidInvoke) | ||
| 582 | + | ||
| 583 | + # PROJECTCODEPAGE Record | ||
| 584 | + PROJECTCODEPAGE_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 585 | + check_value('PROJECTCODEPAGE_Id', 0x0003, PROJECTCODEPAGE_Id) | ||
| 586 | + PROJECTCODEPAGE_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 587 | + check_value('PROJECTCODEPAGE_Size', 0x0002, PROJECTCODEPAGE_Size) | ||
| 588 | + PROJECTCODEPAGE_CodePage = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 589 | + | ||
| 590 | + # PROJECTNAME Record | ||
| 591 | + PROJECTNAME_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 592 | + check_value('PROJECTNAME_Id', 0x0004, PROJECTNAME_Id) | ||
| 593 | + PROJECTNAME_SizeOfProjectName = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 594 | + if PROJECTNAME_SizeOfProjectName < 1 or PROJECTNAME_SizeOfProjectName > 128: | ||
| 595 | + logging.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(PROJECTNAME_SizeOfProjectName)) | ||
| 596 | + PROJECTNAME_ProjectName = dir_stream.read(PROJECTNAME_SizeOfProjectName) | ||
| 597 | + | ||
| 598 | + # PROJECTDOCSTRING Record | ||
| 599 | + PROJECTDOCSTRING_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 600 | + check_value('PROJECTDOCSTRING_Id', 0x0005, PROJECTDOCSTRING_Id) | ||
| 601 | + PROJECTDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 602 | + if PROJECTNAME_SizeOfProjectName > 2000: | ||
| 603 | + logging.error("PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(PROJECTDOCSTRING_SizeOfDocString)) | ||
| 604 | + PROJECTDOCSTRING_DocString = dir_stream.read(PROJECTDOCSTRING_SizeOfDocString) | ||
| 605 | + PROJECTDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 606 | + check_value('PROJECTDOCSTRING_Reserved', 0x0040, PROJECTDOCSTRING_Reserved) | ||
| 607 | + PROJECTDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 608 | + if PROJECTDOCSTRING_SizeOfDocStringUnicode % 2 != 0: | ||
| 609 | + logging.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even") | ||
| 610 | + PROJECTDOCSTRING_DocStringUnicode = dir_stream.read(PROJECTDOCSTRING_SizeOfDocStringUnicode) | ||
| 611 | + | ||
| 612 | + # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7 | ||
| 613 | + PROJECTHELPFILEPATH_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 614 | + check_value('PROJECTHELPFILEPATH_Id', 0x0006, PROJECTHELPFILEPATH_Id) | ||
| 615 | + PROJECTHELPFILEPATH_SizeOfHelpFile1 = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 616 | + if PROJECTHELPFILEPATH_SizeOfHelpFile1 > 260: | ||
| 617 | + logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(PROJECTHELPFILEPATH_SizeOfHelpFile1)) | ||
| 618 | + PROJECTHELPFILEPATH_HelpFile1 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile1) | ||
| 619 | + PROJECTHELPFILEPATH_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 620 | + check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, PROJECTHELPFILEPATH_Reserved) | ||
| 621 | + PROJECTHELPFILEPATH_SizeOfHelpFile2 = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 622 | + if PROJECTHELPFILEPATH_SizeOfHelpFile2 != PROJECTHELPFILEPATH_SizeOfHelpFile1: | ||
| 623 | + logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2") | ||
| 624 | + PROJECTHELPFILEPATH_HelpFile2 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile2) | ||
| 625 | + if PROJECTHELPFILEPATH_HelpFile2 != PROJECTHELPFILEPATH_HelpFile1: | ||
| 626 | + logging.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2") | ||
| 627 | + | ||
| 628 | + # PROJECTHELPCONTEXT Record | ||
| 629 | + PROJECTHELPCONTEXT_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 630 | + check_value('PROJECTHELPCONTEXT_Id', 0x0007, PROJECTHELPCONTEXT_Id) | ||
| 631 | + PROJECTHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 632 | + check_value('PROJECTHELPCONTEXT_Size', 0x0004, PROJECTHELPCONTEXT_Size) | ||
| 633 | + PROJECTHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 634 | + | ||
| 635 | + # PROJECTLIBFLAGS Record | ||
| 636 | + PROJECTLIBFLAGS_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 637 | + check_value('PROJECTLIBFLAGS_Id', 0x0008, PROJECTLIBFLAGS_Id) | ||
| 638 | + PROJECTLIBFLAGS_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 639 | + check_value('PROJECTLIBFLAGS_Size', 0x0004, PROJECTLIBFLAGS_Size) | ||
| 640 | + PROJECTLIBFLAGS_ProjectLibFlags = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 641 | + check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, PROJECTLIBFLAGS_ProjectLibFlags) | ||
| 642 | + | ||
| 643 | + # PROJECTVERSION Record | ||
| 644 | + PROJECTVERSION_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 645 | + check_value('PROJECTVERSION_Id', 0x0009, PROJECTVERSION_Id) | ||
| 646 | + PROJECTVERSION_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 647 | + check_value('PROJECTVERSION_Reserved', 0x0004, PROJECTVERSION_Reserved) | ||
| 648 | + PROJECTVERSION_VersionMajor = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 649 | + PROJECTVERSION_VersionMinor = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 650 | + | ||
| 651 | + # PROJECTCONSTANTS Record | ||
| 652 | + PROJECTCONSTANTS_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 653 | + check_value('PROJECTCONSTANTS_Id', 0x000C, PROJECTCONSTANTS_Id) | ||
| 654 | + PROJECTCONSTANTS_SizeOfConstants = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 655 | + if PROJECTCONSTANTS_SizeOfConstants > 1015: | ||
| 656 | + logging.error("PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(PROJECTCONSTANTS_SizeOfConstants)) | ||
| 657 | + PROJECTCONSTANTS_Constants = dir_stream.read(PROJECTCONSTANTS_SizeOfConstants) | ||
| 658 | + PROJECTCONSTANTS_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 659 | + check_value('PROJECTCONSTANTS_Reserved', 0x003C, PROJECTCONSTANTS_Reserved) | ||
| 660 | + PROJECTCONSTANTS_SizeOfConstantsUnicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 661 | + if PROJECTCONSTANTS_SizeOfConstantsUnicode % 2 != 0: | ||
| 662 | + logging.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even") | ||
| 663 | + PROJECTCONSTANTS_ConstantsUnicode = dir_stream.read(PROJECTCONSTANTS_SizeOfConstantsUnicode) | ||
| 664 | + | ||
| 665 | + # array of REFERENCE records | ||
| 666 | + check = None | ||
| 667 | + while True: | ||
| 668 | + check = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 669 | + logging.debug("reference type = {0:04X}".format(check)) | ||
| 670 | + if check == 0x000F: | ||
| 671 | + break | ||
| 672 | + | ||
| 673 | + if check == 0x0016: | ||
| 674 | + # REFERENCENAME | ||
| 675 | + REFERENCE_Id = check | ||
| 676 | + REFERENCE_SizeOfName = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 677 | + REFERENCE_Name = dir_stream.read(REFERENCE_SizeOfName) | ||
| 678 | + REFERENCE_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 679 | + check_value('REFERENCE_Reserved', 0x003E, REFERENCE_Reserved) | ||
| 680 | + REFERENCE_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 681 | + REFERENCE_NameUnicode = dir_stream.read(REFERENCE_SizeOfNameUnicode) | ||
| 682 | + continue | ||
| 683 | + | ||
| 684 | + if check == 0x0033: | ||
| 685 | + # REFERENCEORIGINAL (followed by REFERENCECONTROL) | ||
| 686 | + REFERENCEORIGINAL_Id = check | ||
| 687 | + REFERENCEORIGINAL_SizeOfLibidOriginal = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 688 | + REFERENCEORIGINAL_LibidOriginal = dir_stream.read(REFERENCEORIGINAL_SizeOfLibidOriginal) | ||
| 689 | + continue | ||
| 690 | + | ||
| 691 | + if check == 0x002F: | ||
| 692 | + # REFERENCECONTROL | ||
| 693 | + REFERENCECONTROL_Id = check | ||
| 694 | + REFERENCECONTROL_SizeTwiddled = struct.unpack("<L", dir_stream.read(4))[0] # ignore | ||
| 695 | + REFERENCECONTROL_SizeOfLibidTwiddled = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 696 | + REFERENCECONTROL_LibidTwiddled = dir_stream.read(REFERENCECONTROL_SizeOfLibidTwiddled) | ||
| 697 | + REFERENCECONTROL_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0] # ignore | ||
| 698 | + check_value('REFERENCECONTROL_Reserved1', 0x0000, REFERENCECONTROL_Reserved1) | ||
| 699 | + REFERENCECONTROL_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0] # ignore | ||
| 700 | + check_value('REFERENCECONTROL_Reserved2', 0x0000, REFERENCECONTROL_Reserved2) | ||
| 701 | + # optional field | ||
| 702 | + check2 = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 703 | + if check2 == 0x0016: | ||
| 704 | + REFERENCECONTROL_NameRecordExtended_Id = check | ||
| 705 | + REFERENCECONTROL_NameRecordExtended_SizeofName = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 706 | + REFERENCECONTROL_NameRecordExtended_Name = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeofName) | ||
| 707 | + REFERENCECONTROL_NameRecordExtended_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 708 | + check_value('REFERENCECONTROL_NameRecordExtended_Reserved', 0x003E, REFERENCECONTROL_NameRecordExtended_Reserved) | ||
| 709 | + REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 710 | + REFERENCECONTROL_NameRecordExtended_NameUnicode = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode) | ||
| 711 | + REFERENCECONTROL_Reserved3 = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 712 | + else: | ||
| 713 | + REFERENCECONTROL_Reserved3 = check2 | ||
| 714 | + | ||
| 715 | + check_value('REFERENCECONTROL_Reserved3', 0x0030, REFERENCECONTROL_Reserved3) | ||
| 716 | + REFERENCECONTROL_SizeExtended = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 717 | + REFERENCECONTROL_SizeOfLibidExtended = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 718 | + REFERENCECONTROL_LibidExtended = dir_stream.read(REFERENCECONTROL_SizeOfLibidExtended) | ||
| 719 | + REFERENCECONTROL_Reserved4 = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 720 | + REFERENCECONTROL_Reserved5 = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 721 | + REFERENCECONTROL_OriginalTypeLib = dir_stream.read(16) | ||
| 722 | + REFERENCECONTROL_Cookie = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 723 | + continue | ||
| 724 | + | ||
| 725 | + if check == 0x000D: | ||
| 726 | + # REFERENCEREGISTERED | ||
| 727 | + REFERENCEREGISTERED_Id = check | ||
| 728 | + REFERENCEREGISTERED_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 729 | + REFERENCEREGISTERED_SizeOfLibid = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 730 | + REFERENCEREGISTERED_Libid = dir_stream.read(REFERENCEREGISTERED_SizeOfLibid) | ||
| 731 | + REFERENCEREGISTERED_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 732 | + check_value('REFERENCEREGISTERED_Reserved1', 0x0000, REFERENCEREGISTERED_Reserved1) | ||
| 733 | + REFERENCEREGISTERED_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 734 | + check_value('REFERENCEREGISTERED_Reserved2', 0x0000, REFERENCEREGISTERED_Reserved2) | ||
| 735 | + continue | ||
| 736 | + | ||
| 737 | + if check == 0x000E: | ||
| 738 | + # REFERENCEPROJECT | ||
| 739 | + REFERENCEPROJECT_Id = check | ||
| 740 | + REFERENCEPROJECT_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 741 | + REFERENCEPROJECT_SizeOfLibidAbsolute = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 742 | + REFERENCEPROJECT_LibidAbsolute = dir_stream.read(REFERENCEPROJECT_SizeOfLibidAbsolute) | ||
| 743 | + REFERENCEPROJECT_SizeOfLibidRelative = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 744 | + REFERENCEPROJECT_LibidRelative = dir_stream.read(REFERENCEPROJECT_SizeOfLibidRelative) | ||
| 745 | + REFERENCEPROJECT_MajorVersion = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 746 | + REFERENCEPROJECT_MinorVersion = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 747 | + continue | ||
| 748 | + | ||
| 749 | + logging.error('invalid or unknown check Id {0:04X}'.format(check)) | ||
| 750 | + sys.exit(0) | ||
| 751 | + | ||
| 752 | + PROJECTMODULES_Id = check #struct.unpack("<H", dir_stream.read(2))[0] | ||
| 753 | + check_value('PROJECTMODULES_Id', 0x000F, PROJECTMODULES_Id) | ||
| 754 | + PROJECTMODULES_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 755 | + check_value('PROJECTMODULES_Size', 0x0002, PROJECTMODULES_Size) | ||
| 756 | + PROJECTMODULES_Count = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 757 | + PROJECTMODULES_ProjectCookieRecord_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 758 | + check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, PROJECTMODULES_ProjectCookieRecord_Id) | ||
| 759 | + PROJECTMODULES_ProjectCookieRecord_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 760 | + check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, PROJECTMODULES_ProjectCookieRecord_Size) | ||
| 761 | + PROJECTMODULES_ProjectCookieRecord_Cookie = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 762 | + | ||
| 763 | + logging.debug("parsing {0} modules".format(PROJECTMODULES_Count)) | ||
| 764 | + for x in xrange(0, PROJECTMODULES_Count): | ||
| 765 | + MODULENAME_Id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 766 | + check_value('MODULENAME_Id', 0x0019, MODULENAME_Id) | ||
| 767 | + MODULENAME_SizeOfModuleName = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 768 | + MODULENAME_ModuleName = dir_stream.read(MODULENAME_SizeOfModuleName) | ||
| 769 | + # account for optional sections | ||
| 770 | + section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 771 | + if section_id == 0x0047: | ||
| 772 | + MODULENAMEUNICODE_Id = section_id | ||
| 773 | + MODULENAMEUNICODE_SizeOfModuleNameUnicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 774 | + MODULENAMEUNICODE_ModuleNameUnicode = dir_stream.read(MODULENAMEUNICODE_SizeOfModuleNameUnicode) | ||
| 775 | + section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 776 | + if section_id == 0x001A: | ||
| 777 | + MODULESTREAMNAME_id = section_id | ||
| 778 | + MODULESTREAMNAME_SizeOfStreamName = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 779 | + MODULESTREAMNAME_StreamName = dir_stream.read(MODULESTREAMNAME_SizeOfStreamName) | ||
| 780 | + MODULESTREAMNAME_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 781 | + check_value('MODULESTREAMNAME_Reserved', 0x0032, MODULESTREAMNAME_Reserved) | ||
| 782 | + MODULESTREAMNAME_SizeOfStreamNameUnicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 783 | + MODULESTREAMNAME_StreamNameUnicode = dir_stream.read(MODULESTREAMNAME_SizeOfStreamNameUnicode) | ||
| 784 | + section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 785 | + if section_id == 0x001C: | ||
| 786 | + MODULEDOCSTRING_Id = section_id | ||
| 787 | + check_value('MODULEDOCSTRING_Id', 0x001C, MODULEDOCSTRING_Id) | ||
| 788 | + MODULEDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 789 | + MODULEDOCSTRING_DocString = dir_stream.read(MODULEDOCSTRING_SizeOfDocString) | ||
| 790 | + MODULEDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 791 | + check_value('MODULEDOCSTRING_Reserved', 0x0048, MODULEDOCSTRING_Reserved) | ||
| 792 | + MODULEDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 793 | + MODULEDOCSTRING_DocStringUnicode = dir_stream.read(MODULEDOCSTRING_SizeOfDocStringUnicode) | ||
| 794 | + section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 795 | + if section_id == 0x0031: | ||
| 796 | + MODULEOFFSET_Id = section_id | ||
| 797 | + check_value('MODULEOFFSET_Id', 0x0031, MODULEOFFSET_Id) | ||
| 798 | + MODULEOFFSET_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 799 | + check_value('MODULEOFFSET_Size', 0x0004, MODULEOFFSET_Size) | ||
| 800 | + MODULEOFFSET_TextOffset = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 801 | + section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 802 | + if section_id == 0x001E: | ||
| 803 | + MODULEHELPCONTEXT_Id = section_id | ||
| 804 | + check_value('MODULEHELPCONTEXT_Id', 0x001E, MODULEHELPCONTEXT_Id) | ||
| 805 | + MODULEHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 806 | + check_value('MODULEHELPCONTEXT_Size', 0x0004, MODULEHELPCONTEXT_Size) | ||
| 807 | + MODULEHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 808 | + section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 809 | + if section_id == 0x002C: | ||
| 810 | + MODULECOOKIE_Id = section_id | ||
| 811 | + check_value('MODULECOOKIE_Id', 0x002C, MODULECOOKIE_Id) | ||
| 812 | + MODULECOOKIE_Size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 813 | + check_value('MODULECOOKIE_Size', 0x0002, MODULECOOKIE_Size) | ||
| 814 | + MODULECOOKIE_Cookie = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 815 | + section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 816 | + if section_id == 0x0021 or section_id == 0x0022: | ||
| 817 | + MODULETYPE_Id = section_id | ||
| 818 | + MODULETYPE_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 819 | + section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 820 | + if section_id == 0x0025: | ||
| 821 | + MODULEREADONLY_Id = section_id | ||
| 822 | + check_value('MODULEREADONLY_Id', 0x0025, MODULEREADONLY_Id) | ||
| 823 | + MODULEREADONLY_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 824 | + check_value('MODULEREADONLY_Reserved', 0x0000, MODULEREADONLY_Reserved) | ||
| 825 | + section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 826 | + if section_id == 0x0028: | ||
| 827 | + MODULEPRIVATE_Id = section_id | ||
| 828 | + check_value('MODULEPRIVATE_Id', 0x0028, MODULEPRIVATE_Id) | ||
| 829 | + MODULEPRIVATE_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 830 | + check_value('MODULEPRIVATE_Reserved', 0x0000, MODULEPRIVATE_Reserved) | ||
| 831 | + section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 832 | + if section_id == 0x002B: # TERMINATOR | ||
| 833 | + MODULE_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 834 | + check_value('MODULE_Reserved', 0x0000, MODULE_Reserved) | ||
| 835 | + section_id = None | ||
| 836 | + if section_id != None: | ||
| 837 | + logging.warning('unknown or invalid module section id {0:04X}'.format(section_id)) | ||
| 838 | + | ||
| 839 | + logging.debug('Project CodePage = %d' % PROJECTCODEPAGE_CodePage) | ||
| 840 | + vba_codec = 'cp%d' % PROJECTCODEPAGE_CodePage | ||
| 841 | + logging.debug("ModuleName = {0}".format(MODULENAME_ModuleName)) | ||
| 842 | + logging.debug("StreamName = {0}".format(repr(MODULESTREAMNAME_StreamName))) | ||
| 843 | + streamname_unicode = MODULESTREAMNAME_StreamName.decode(vba_codec) | ||
| 844 | + logging.debug("StreamName.decode('%s') = %s" % (vba_codec, repr(streamname_unicode))) | ||
| 845 | + logging.debug("StreamNameUnicode = {0}".format(repr(MODULESTREAMNAME_StreamNameUnicode))) | ||
| 846 | + logging.debug("TextOffset = {0}".format(MODULEOFFSET_TextOffset)) | ||
| 847 | + | ||
| 848 | + code_path = vba_root + u'VBA/' + streamname_unicode | ||
| 849 | + #TODO: test if stream exists | ||
| 850 | + logging.debug('opening VBA code stream %s' % repr(code_path)) | ||
| 851 | + code_data = ole.openstream(code_path).read() | ||
| 852 | + logging.debug("length of code_data = {0}".format(len(code_data))) | ||
| 853 | + logging.debug("offset of code_data = {0}".format(MODULEOFFSET_TextOffset)) | ||
| 854 | + code_data = code_data[MODULEOFFSET_TextOffset:] | ||
| 855 | + if len(code_data) > 0: | ||
| 856 | + code_data = decompress_stream(code_data) | ||
| 857 | + # case-insensitive search in the code_modules dict to find the file extension: | ||
| 858 | + filext = code_modules.get(MODULENAME_ModuleName.lower(), 'bin') | ||
| 859 | + filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext) | ||
| 860 | + #TODO: also yield the codepage so that callers can decode it properly | ||
| 861 | + yield (code_path, filename, code_data) | ||
| 862 | + # print '-'*79 | ||
| 863 | + # print filename | ||
| 864 | + # print '' | ||
| 865 | + # print code_data | ||
| 866 | + # print '' | ||
| 867 | + logging.debug('extracted file {0}'.format(filename)) | ||
| 868 | + else: | ||
| 869 | + logging.warning("module stream {0} has code data length 0".format(MODULESTREAMNAME_StreamName)) | ||
| 870 | + return | ||
| 871 | + | ||
| 872 | + | ||
def filter_vba(vba_code):
    """
    Strip the leading "Attribute VB_" lines from VBA source code.

    Only the uninterrupted run of such lines at the very top of the code is
    dropped; the first line that does not qualify stops the filtering and
    everything from there on is kept. A line containing a colon never
    qualifies, because a colon could be used to hide malicious instructions
    on an otherwise harmless-looking attribute line.

    The text is split with splitlines() and re-joined with '\\n', so the
    original line endings and any trailing newline are not preserved.
    This should only be used when displaying source code for human analysis.

    :param vba_code: str, VBA source code
    :return: str, filtered VBA source code
    """
    lines = vba_code.splitlines()
    # index of the first line to keep; stays at len(lines) when every line
    # is a removable attribute line
    first_kept = len(lines)
    for index, text in enumerate(lines):
        if ':' in text or not text.startswith("Attribute VB_"):
            first_kept = index
            break
    #TODO: also remove empty lines?
    return '\n'.join(lines[first_kept:])
| 895 | + | ||
| 896 | + | ||
def detect_autoexec(vba_code, obfuscation=None):
    """
    Look for keywords of macros that run automatically when triggered by
    specific actions (e.g. when a document is opened or closed).

    Every keyword listed in AUTOEXEC_KEYWORDS is inserted as-is into a
    case-insensitive regular expression delimited by word boundaries, and
    searched in the VBA code.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword, description)
    """
    #TODO: merge code with detect_suspicious
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    matches = []
    for description, keywords in AUTOEXEC_KEYWORDS.items():
        for keyword in keywords:
            #TODO: if keyword is already a compiled regex, use it as-is
            # word boundaries avoid matching the keyword inside a longer identifier:
            pattern = r'(?i)\b' + keyword + r'\b'
            if not re.search(pattern, vba_code):
                continue
            matches.append((keyword, description + suffix))
    return matches
| 922 | + | ||
| 923 | + | ||
def detect_suspicious(vba_code, obfuscation=None):
    """
    Look for suspicious keywords corresponding to potential malware
    behaviour in the VBA code.

    Every keyword listed in SUSPICIOUS_KEYWORDS is inserted as-is into a
    case-insensitive regular expression delimited by word boundaries, and
    searched in the VBA code.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword, description)
    """
    if obfuscation:
        suffix = ' (obfuscation: %s)' % obfuscation
    else:
        suffix = ''
    hits = []
    for description, keywords in SUSPICIOUS_KEYWORDS.items():
        for keyword in keywords:
            # word boundaries avoid matching the keyword inside a longer identifier:
            found_keyword = re.search(r'(?i)\b' + keyword + r'\b', vba_code)
            if found_keyword:
                hits.append((keyword, description + suffix))
    return hits
| 946 | + | ||
| 947 | + | ||
def detect_patterns(vba_code, obfuscation=None):
    """
    Detect if the VBA code contains specific patterns such as IP addresses,
    URLs, e-mail addresses, executable file names, etc.

    Each (pattern type, compiled regex) pair of RE_PATTERNS is applied to the
    code. A matched value is reported only once, under the first pattern type
    that matched it: duplicates are tracked across all pattern types.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to the pattern type
    :return: list of str tuples (pattern type, value)
    """
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    seen = set()
    hits = []
    for pattern_type, pattern_re in RE_PATTERNS:
        for occurrence in pattern_re.finditer(vba_code):
            text = occurrence.group()
            if text in seen:
                continue
            seen.add(text)
            hits.append((pattern_type + suffix, text))
    return hits
| 968 | + | ||
| 969 | + | ||
def detect_hex_strings(vba_code):
    """
    Detect if the VBA code contains strings encoded in hexadecimal.

    Every distinct match of re_hex_string is decoded with binascii.unhexlify
    and reported once, in order of first appearance.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    seen = set()
    pairs = []
    for occurrence in re_hex_string.finditer(vba_code):
        encoded = occurrence.group()
        if encoded in seen:
            continue
        pairs.append((encoded, binascii.unhexlify(encoded)))
        seen.add(encoded)
    return pairs
| 986 | + | ||
| 987 | + | ||
def detect_base64_strings(vba_code):
    """
    Detect if the VBA code contains strings encoded in base64.

    Candidates are the matches of re_base64_string, with the surrounding
    double quotes stripped. A candidate is ignored when re_nothex_check finds
    nothing in it, when it was already reported, when its lowercase form is
    listed in BASE64_WHITELIST, or when it cannot be decoded.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    #TODO: avoid matching simple hex strings as base64?
    results = []
    found = set()
    for match in re_base64_string.finditer(vba_code):
        # extract the base64 string without quotes:
        value = match.group().strip('"')
        # check it is not just a hex string:
        if not re_nothex_check.search(value):
            continue
        # only keep new values and not in the whitelist:
        if value in found or value.lower() in BASE64_WHITELIST:
            continue
        try:
            decoded = base64.b64decode(value)
        except (TypeError, ValueError):
            # Decoding failed, so it is likely not a base64-encoded string.
            # b64decode raises TypeError on Python 2 and binascii.Error
            # (a ValueError subclass) on Python 3. Only these are caught, so
            # KeyboardInterrupt and SystemExit are no longer swallowed.
            continue
        results.append((value, decoded))
        found.add(value)
    return results
| 1014 | + | ||
| 1015 | + | ||
def detect_dridex_strings(vba_code):
    """
    Detect if the VBA code contains strings obfuscated with a specific algorithm found in Dridex samples.

    Candidates are the matches of re_dridex_string, with their first and last
    characters removed. A candidate is ignored when re_nothex_check finds
    nothing in it, when it was already reported, or when the decoder fails.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    # imported here rather than at module level, as in the original code
    from thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode
    results = []
    found = set()
    for match in re_dridex_string.finditer(vba_code):
        # drop the first and last characters of the match:
        value = match.group()[1:-1]
        # check it is not just a hex string:
        if not re_nothex_check.search(value):
            continue
        if value in found:
            continue
        try:
            decoded = DridexUrlDecode(value)
        except Exception:
            # Any decoder error means it is likely not a dridex-encoded
            # string. Catching Exception instead of a bare "except" keeps
            # this best-effort behaviour while letting KeyboardInterrupt
            # and SystemExit propagate.
            continue
        results.append((value, decoded))
        found.add(value)
    return results
| 1040 | + | ||
| 1041 | + | ||
| 1042 | +class VBA_Scanner (object): | ||
| 1043 | + """ | ||
| 1044 | + Class to scan the source code of a VBA module to find obfuscated strings, | ||
| 1045 | + suspicious keywords, IOCs, auto-executable macros, etc. | ||
| 1046 | + """ | ||
| 1047 | + | ||
| 1048 | + def __init__(self, vba_code): | ||
| 1049 | + """ | ||
| 1050 | + VBA_Scanner constructor | ||
| 1051 | + | ||
| 1052 | + :param vba_code: str, VBA source code to be analyzed | ||
| 1053 | + """ | ||
| 1054 | + self.code = vba_code | ||
| 1055 | + self.code_hex = '' | ||
| 1056 | + self.code_hex_rev = '' | ||
| 1057 | + self.code_rev_hex = '' | ||
| 1058 | + self.code_base64 = '' | ||
| 1059 | + self.code_dridex = '' | ||
| 1060 | + | ||
| 1061 | + | ||
def scan(self, include_decoded_strings=False):
    """
    Analyze the provided VBA code to detect suspicious keywords,
    auto-executable macros, IOC patterns, obfuscation patterns
    such as hex-encoded strings.

    The decoded-string buffers (code_hex, code_hex_rev, code_rev_hex,
    code_base64, code_dridex) are rebuilt on every call, so calling scan()
    several times on the same object does not accumulate duplicates.

    :param include_decoded_strings: bool, if True, all encoded strings will be included with their decoded content.
    :return: list of tuples (type, keyword, description)
    (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex string')
    """
    # First, detect and extract hex-encoded strings:
    self.hex_strings = detect_hex_strings(self.code)
    # detect if the code contains StrReverse:
    self.strReverse = 'strreverse' in self.code.lower()
    # Then collect the decoded strings, to detect obfuscated IOCs and keywords.
    # The pieces are gathered in lists and joined once, which also resets
    # the buffers left by a previous call:
    hex_decoded = []
    hex_then_reversed = []
    reversed_then_hex = []
    for encoded, decoded in self.hex_strings:
        hex_decoded.append('\n' + decoded)
        # if the code contains "StrReverse", also keep the hex strings in reverse order:
        if self.strReverse:
            # StrReverse after hex decoding:
            hex_then_reversed.append('\n' + decoded[::-1])
            # StrReverse before hex decoding:
            reversed_then_hex.append('\n' + binascii.unhexlify(encoded[::-1]))
            #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/
    #TODO: also append the full code reversed if StrReverse? (risk of false positives?)
    self.code_hex = ''.join(hex_decoded)
    self.code_hex_rev = ''.join(hex_then_reversed)
    self.code_rev_hex = ''.join(reversed_then_hex)
    # Detect Base64-encoded strings
    self.base64_strings = detect_base64_strings(self.code)
    self.code_base64 = ''.join(['\n' + decoded for encoded, decoded in self.base64_strings])
    # Detect Dridex-encoded strings
    self.dridex_strings = detect_dridex_strings(self.code)
    self.code_dridex = ''.join(['\n' + decoded for encoded, decoded in self.dridex_strings])

    results = []
    self.autoexec_keywords = []
    self.suspicious_keywords = []
    self.iocs = []

    # Run every detector on the raw code and on each decoded buffer; the
    # obfuscation label is passed through to the detectors:
    for code, obfuscation in (
            (self.code, None),
            (self.code_hex, 'Hex'),
            (self.code_hex_rev, 'Hex+StrReverse'),
            (self.code_rev_hex, 'StrReverse+Hex'),
            (self.code_base64, 'Base64'),
            (self.code_dridex, 'Dridex'),
    ):
        self.autoexec_keywords += detect_autoexec(code, obfuscation)
        self.suspicious_keywords += detect_suspicious(code, obfuscation)
        self.iocs += detect_patterns(code, obfuscation)

    # If encoded strings were discovered, add an item to suspicious keywords:
    if self.hex_strings:
        self.suspicious_keywords.append(('Hex Strings',
            'Hex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
    if self.base64_strings:
        self.suspicious_keywords.append(('Base64 Strings',
            'Base64-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
    if self.dridex_strings:
        self.suspicious_keywords.append(('Dridex Strings',
            'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))

    for keyword, description in self.autoexec_keywords:
        results.append(('AutoExec', keyword, description))
    for keyword, description in self.suspicious_keywords:
        results.append(('Suspicious', keyword, description))
    for pattern_type, value in self.iocs:
        results.append(('IOC', value, pattern_type))
    if include_decoded_strings:
        for encoded, decoded in self.hex_strings:
            results.append(('Hex String', repr(decoded), encoded))
        for encoded, decoded in self.base64_strings:
            results.append(('Base64 String', repr(decoded), encoded))
        for encoded, decoded in self.dridex_strings:
            # NOTE: this label is spelled with a lowercase "string"; it is
            # kept as is because callers may compare against it.
            results.append(('Dridex string', repr(decoded), encoded))
    return results
| 1137 | + | ||
def scan_summary(self):
    """
    Run a full scan of the VBA code and report only how many items were
    found in each category.

    :return: tuple with the number of items found for each category:
    (autoexec, suspicious, IOCs, hex, base64, dridex)
    """
    # scan() fills in all the attributes counted below:
    self.scan()
    categories = (
        self.autoexec_keywords,
        self.suspicious_keywords,
        self.iocs,
        self.hex_strings,
        self.base64_strings,
        self.dridex_strings,
    )
    return tuple([len(found) for found in categories])
| 1151 | + | ||
| 1152 | + | ||
| 1153 | + | ||
def scan_vba(vba_code, include_decoded_strings=False):
    """
    Analyze the provided VBA code to detect suspicious keywords,
    auto-executable macros, IOC patterns, obfuscation patterns
    such as hex-encoded strings.
    (shortcut for VBA_Scanner(vba_code).scan())

    :param vba_code: str, VBA source code to be analyzed
    :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content.
        Defaults to False, like VBA_Scanner.scan().
    :return: list of tuples (type, keyword, description)
    (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex string')
    """
    return VBA_Scanner(vba_code).scan(include_decoded_strings)
| 1167 | + | ||
| 1168 | + | ||
| 1169 | +#=== CLASSES ================================================================= | ||
| 1170 | + | ||
class VBA_Parser(object):
    """
    Class to parse MS Office files, to detect VBA macros and extract VBA source code
    Supported file formats:
    - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
    - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
    - PowerPoint 2007+ (.pptm, .ppsm)
    - Word 2003 XML (.xml)
    """

    def __init__(self, filename, data=None):
        """
        Constructor for VBA_Parser

        :param filename: filename or path of file to parse, or file-like object

        :param data: None or bytes str, if None the file will be read from disk (or from the file-like object).
        If data is provided as a bytes string, it will be parsed as the content of the file in memory,
        and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb').

        :raises TypeError: if the file is not OLE, zip/OpenXML nor Word 2003 XML.
        """
        #TODO: filename should only be a string, data should be used for the file-like object
        #TODO: filename should be mandatory, optional data is a string or file-like object
        #TODO: also support olefile and zipfile as input
        if data is None:
            # open file from disk:
            _file = filename
        else:
            # file already read in memory, make it a file-like object for zipfile:
            _file = cStringIO.StringIO(data)
        self.ole_file = None
        self.ole_subfiles = []
        self.filename = filename
        self.type = None
        self.vba_projects = None
        if olefile.isOleFile(_file):
            # This looks like an OLE file
            logging.info('Parsing OLE file %s' % self.filename)
            # Open and parse the OLE file, using unicode for path names:
            self.ole_file = olefile.OleFileIO(_file, path_encoding=None)
            self.type = TYPE_OLE
            #TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet
        elif zipfile.is_zipfile(_file):
            # This looks like a zip file, need to look for vbaProject.bin inside
            # It can be any OLE file inside the archive
            #...because vbaProject.bin can be renamed:
            # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
            logging.info('Opening ZIP/OpenXML file %s' % self.filename)
            self.type = TYPE_OpenXML
            z = zipfile.ZipFile(_file)
            #TODO: check if this is actually an OpenXML file
            #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically?
            try:
                # check each file within the zip if it is an OLE file, by reading its magic:
                for subfile in z.namelist():
                    member = z.open(subfile)
                    try:
                        magic = member.read(len(olefile.MAGIC))
                    finally:
                        # do not leave one open handle per archive member:
                        member.close()
                    if magic != olefile.MAGIC:
                        continue
                    logging.debug('Opening OLE file %s within zip' % subfile)
                    ole_data = z.read(subfile)
                    try:
                        self.ole_subfiles.append(VBA_Parser(filename=subfile, data=ole_data))
                    except Exception:
                        # best effort: skip members that only look like OLE,
                        # but let KeyboardInterrupt/SystemExit go through
                        logging.debug('%s is not a valid OLE file' % subfile)
            finally:
                # close the archive even if a member could not be read:
                z.close()
        else:
            # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML,
            # or a plain text file containing VBA code
            if data is None:
                if hasattr(filename, 'read'):
                    # file-like object: the format checks above may have moved
                    # the current position, so rewind before reading
                    if hasattr(filename, 'seek'):
                        filename.seek(0)
                    data = filename.read()
                else:
                    with open(filename, 'rb') as f:
                        data = f.read()
            # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
            if 'http://schemas.microsoft.com/office/word/2003/wordml' in data:
                logging.info('Opening Word 2003 XML file %s' % self.filename)
                self.type = TYPE_Word2003_XML
                # parse the XML content
                et = ET.fromstring(data)
                # find all the binData elements:
                for bindata in et.getiterator(TAG_BINDATA):
                    # the binData content is an OLE container for the VBA project, compressed
                    # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
                    # get the filename:
                    fname = bindata.get(ATTR_NAME, 'noname.mso')
                    # decode the base64 activemime
                    activemime = binascii.a2b_base64(bindata.text)
                    # decompress the zlib data starting at offset 0x32, which is the OLE container:
                    ole_data = zlib.decompress(activemime[0x32:])
                    try:
                        self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))
                    except Exception:
                        logging.debug('%s is not a valid OLE file' % fname)
                #TODO: handle exceptions (base64/zlib decoding errors are not caught)
            #TODO: Excel 2003 XML
            #TODO: plain text VBA file
            else:
                msg = '%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % self.filename
                logging.error(msg)
                raise TypeError(msg)

    def find_vba_projects(self):
        """
        Finds all the VBA projects stored in an OLE file.

        Return None if the file is not OLE but OpenXML.
        Return a list of tuples (vba_root, project_path, dir_path) for each VBA project.
        vba_root is the path of the root OLE storage containing the VBA project,
        including a trailing slash unless it is the root of the OLE file.
        project_path is the path of the OLE stream named "PROJECT" within the VBA project.
        dir_path is the path of the OLE stream named "VBA/dir" within the VBA project.

        If this function returns an empty list for one of the supported formats
        (i.e. Word, Excel, Powerpoint except Powerpoint 97-2003), then the
        file does not contain VBA macros.

        The result is cached: later calls return the same list.

        :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path)
        for each VBA project found if OLE file
        """
        # if the file is not OLE but OpenXML, return None:
        if self.ole_file is None:
            return None

        # if this method has already been called, return previous result:
        if self.vba_projects is not None:
            return self.vba_projects

        # Find the VBA project root (different in MS Word, Excel, etc):
        # - Word 97-2003: Macros
        # - Excel 97-2003: _VBA_PROJECT_CUR
        # - PowerPoint 97-2003: not supported yet (different file structure)
        # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin.
        # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word
        # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word
        # - Visio 2007: not supported yet (different file structure)

        # According to MS-OVBA section 2.2.1:
        # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream
        # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream
        # - all names are case-insensitive

        ole = self.ole_file

        def check_vba_stream(vba_root, stream_path):
            # Return the full path of the stream if it exists under vba_root, False otherwise
            full_path = vba_root + stream_path
            if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM:
                logging.debug('Found %s stream: %s' % (stream_path, full_path))
                return full_path
            logging.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path)
            return False

        # Build the list locally and only cache it once complete, so that an
        # error halfway through does not leave a partial result behind:
        projects = []
        # Look for any storage containing those storage/streams:
        for storage in ole.listdir(streams=False, storages=True):
            # Look for a storage ending with "VBA":
            if storage[-1].upper() != 'VBA':
                continue
            logging.debug('Found VBA storage: %s' % ('/'.join(storage)))
            vba_root = '/'.join(storage[:-1])
            # Add a trailing slash to vba_root, unless it is the root of the OLE file:
            # (used later to append all the child streams/storages)
            if vba_root != '':
                vba_root += '/'
            logging.debug('Checking vba_root="%s"' % vba_root)
            # Check if the VBA root storage also contains a PROJECT stream:
            project_path = check_vba_stream(vba_root, 'PROJECT')
            if not project_path:
                continue
            # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream:
            vba_project_path = check_vba_stream(vba_root, 'VBA/_VBA_PROJECT')
            if not vba_project_path:
                continue
            # Check if the VBA root storage also contains a VBA/dir stream:
            dir_path = check_vba_stream(vba_root, 'VBA/dir')
            if not dir_path:
                continue
            # Now we are pretty sure it is a VBA project structure
            logging.debug('VBA root storage: "%s"' % vba_root)
            # append the results to the list as a tuple for later use:
            projects.append((vba_root, project_path, dir_path))
        self.vba_projects = projects
        return self.vba_projects

    def detect_vba_macros(self):
        """
        Detect the potential presence of VBA macros in the file, by checking
        if it contains VBA projects. Both OLE and OpenXML files are supported.

        Important: for now, results are accurate only for Word, Excel and PowerPoint
        EXCEPT Powerpoint 97-2003, which has a different structure for VBA.

        Note: this method does NOT attempt to check the actual presence or validity
        of VBA macro source code, so there might be false positives.
        It may also detect VBA macros in files embedded within the main file,
        for example an Excel workbook with macros embedded into a Word
        document without macros may be detected, without distinction.

        :return: bool, True if at least one VBA project has been found, False otherwise
        """
        #TODO: return None or raise exception if format not supported like PPT 97-2003
        #TODO: return the number of VBA projects found instead of True/False?
        # if OpenXML, check all the OLE subfiles:
        if self.ole_file is None:
            for ole_subfile in self.ole_subfiles:
                if ole_subfile.detect_vba_macros():
                    return True
            return False
        # otherwise it's an OLE file, find VBA projects:
        return len(self.find_vba_projects()) > 0

    def extract_macros(self):
        """
        Extract and decompress source code for each VBA macro found in the file

        Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found
        If the file is OLE, filename is the path of the file.
        If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros
        within the zip archive, e.g. word/vbaProject.bin.
        """
        if self.ole_file is None:
            for ole_subfile in self.ole_subfiles:
                for results in ole_subfile.extract_macros():
                    yield results
        else:
            # make sure self.vba_projects has been filled in:
            self.find_vba_projects()
            for vba_root, project_path, dir_path in self.vba_projects:
                # extract all VBA macros from that VBA root storage:
                for stream_path, vba_filename, vba_code in _extract_vba(self.ole_file, vba_root, project_path, dir_path):
                    yield (self.filename, stream_path, vba_filename, vba_code)

    def close(self):
        """
        Close all the open files. This method must be called after usage, if
        the application is opening many files.
        """
        if self.ole_file is None:
            for ole_subfile in self.ole_subfiles:
                ole_subfile.close()
        else:
            self.ole_file.close()
| 1418 | + | ||
| 1419 | + | ||
def print_analysis(vba_code, show_decoded_strings=False):
    """
    Scan the provided VBA code and print the findings as a table, or a
    one-line message when nothing was found.

    :param vba_code: str, VBA source code to be analyzed
    :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
    :return: None
    """
    findings = scan_vba(vba_code, show_decoded_strings)
    if not findings:
        print('No suspicious keyword or IOC found.')
        return
    columns = ('Type', 'Keyword', 'Description')
    table = prettytable.PrettyTable(columns)
    table.align = 'l'
    # maximum width of each column, in the same order as the column names:
    for column, width in zip(columns, (10, 20, 39)):
        table.max_width[column] = width
    for finding_type, finding_keyword, finding_description in findings:
        table.add_row((finding_type, finding_keyword, finding_description))
    print(table)
| 1440 | + | ||
| 1441 | + | ||
| 1442 | + | ||
def process_file(container, filename, data, show_decoded_strings=False):
    """
    Process a single file: print each VBA macro found with its analysis.

    Errors raised while parsing or analyzing the file are printed with their
    stack trace and do not stop the caller. KeyboardInterrupt and SystemExit
    are not intercepted. The parser is always closed before returning.

    :param container: str, path and filename of container if the file is within
    a zip archive, None otherwise.
    :param filename: str, path and filename of file on disk, or within the container.
    :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
    :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
    """
    #TODO: replace print by writing to a provided output file (sys.stdout by default)
    if container:
        display_filename = '%s in %s' % (filename, container)
    else:
        display_filename = filename
    print('='*79)
    print('FILE: %s' % display_filename)
    vba = None
    try:
        #TODO: handle olefile errors, when an OLE file is malformed
        vba = VBA_Parser(filename, data)
        print('Type: %s' % vba.type)
        if vba.detect_vba_macros():
            for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
                # hide attribute lines:
                #TODO: option to disable attribute filtering
                vba_code_filtered = filter_vba(vba_code)
                print('-'*79)
                print('VBA MACRO %s ' % vba_filename)
                print('in file: %s - OLE stream: %s' % (subfilename, repr(stream_path)))
                print('- '*39)
                # detect empty macros:
                if vba_code_filtered.strip() == '':
                    print('(empty macro)')
                else:
                    print(vba_code_filtered)
                    print('- '*39)
                    print('ANALYSIS:')
                    # analyse the whole code, filtered to avoid false positives:
                    print_analysis(vba_code_filtered, show_decoded_strings)
        else:
            print('No VBA macros found.')
    except Exception:
        #TODO: print more info if debug mode
        # display the exception with full stack trace for debugging, but do not stop.
        # (Exception rather than a bare except, so that Ctrl-C and sys.exit() still work)
        traceback.print_exc()
    finally:
        # release the open OLE file(s), otherwise handles pile up when many files are processed:
        if vba is not None:
            try:
                vba.close()
            except Exception:
                traceback.print_exc()
    print('')
| 1492 | + | ||
| 1493 | + | ||
def process_file_triage(container, filename, data):
    """
    Process a single file in triage mode: print one summary line made of
    flags followed by the filename, and an error message if any.

    The flags start with the file type (OLE:, OpX: or XML:) followed by one
    character per category, either its letter or '-' when nothing was found:
    M=Macros, A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings,
    B=Base64 strings, D=Dridex strings.
    '?' means the file format is not supported, '!ERROR' that another error occurred.

    :param container: str, path and filename of container if the file is within
    a zip archive, None otherwise (not used by this function).
    :param filename: str, path and filename of file on disk, or within the container.
    :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
    """
    #TODO: replace print by writing to a provided output file (sys.stdout by default)
    nb_macros = 0
    nb_autoexec = 0
    nb_suspicious = 0
    nb_iocs = 0
    nb_hexstrings = 0
    nb_base64strings = 0
    nb_dridexstrings = 0
    message = ''
    vba = None
    try:
        #TODO: handle olefile errors, when an OLE file is malformed
        vba = VBA_Parser(filename, data)
        if vba.detect_vba_macros():
            for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
                nb_macros += 1
                if vba_code.strip() != '':
                    # analyse the whole code, filtered to avoid false positives:
                    scanner = VBA_Scanner(filter_vba(vba_code))
                    autoexec, suspicious, iocs, hexstrings, base64strings, dridex = scanner.scan_summary()
                    nb_autoexec += autoexec
                    nb_suspicious += suspicious
                    nb_iocs += iocs
                    nb_hexstrings += hexstrings
                    nb_base64strings += base64strings
                    nb_dridexstrings += dridex
        if vba.type == TYPE_OLE:
            flags = 'OLE:'
        elif vba.type == TYPE_OpenXML:
            flags = 'OpX:'
        elif vba.type == TYPE_Word2003_XML:
            flags = 'XML:'
        else:
            # should not happen, reported as an error with an explicit message:
            raise ValueError('Unexpected file type: %r' % vba.type)
        # one character per category: its letter if something was found, '-' otherwise
        indicators = (
            (nb_macros, 'M'),
            (nb_autoexec, 'A'),
            (nb_suspicious, 'S'),
            (nb_iocs, 'I'),
            (nb_hexstrings, 'H'),
            (nb_base64strings, 'B'),
            (nb_dridexstrings, 'D'),
        )
        for number, letter in indicators:
            if number:
                flags += letter
            else:
                flags += '-'
    except TypeError:
        # file type not OLE nor OpenXML
        flags = '?'
        message = 'File format not supported'
    except Exception as exc:
        # another error occurred
        # (Exception rather than a bare except, so that Ctrl-C and sys.exit() still work)
        #TODO: print more info if debug mode
        #TODO: distinguish real errors from incorrect file types
        flags = '!ERROR'
        message = exc
    finally:
        # triage mode usually goes through many files: release the open OLE file(s)
        if vba is not None:
            try:
                vba.close()
            except Exception:
                logging.debug('Error while closing %s' % filename)
    line = '%-11s %s' % (flags, filename)
    if message:
        line += ' - %s' % message
    print(line)
| 1569 | + | ||
| 1570 | + # t = prettytable.PrettyTable(('filename', 'type', 'macros', 'autoexec', 'suspicious', 'ioc', 'hexstrings'), | ||
| 1571 | + # header=False, border=False) | ||
| 1572 | + # t.align = 'l' | ||
| 1573 | + # t.max_width['filename'] = 30 | ||
| 1574 | + # t.max_width['type'] = 10 | ||
| 1575 | + # t.max_width['macros'] = 6 | ||
| 1576 | + # t.max_width['autoexec'] = 6 | ||
| 1577 | + # t.max_width['suspicious'] = 6 | ||
| 1578 | + # t.max_width['ioc'] = 6 | ||
| 1579 | + # t.max_width['hexstrings'] = 6 | ||
| 1580 | + # t.add_row((filename, ftype, macros, autoexec, suspicious, iocs, hexstrings)) | ||
| 1581 | + # print t | ||
| 1582 | + | ||
def main_triage_quick():
    """
    Placeholder, not implemented yet: does nothing and returns None.
    """
    return None
| 1585 | + | ||
| 1586 | +#=== MAIN ===================================================================== | ||
| 1587 | + | ||
def main():
    """
    Main function, called when olevba is run from the command line

    Parses the command line options, then processes each file either in
    detailed mode (option -d) or in triage mode (option -t, and default).
    When neither -t nor -d is given and exactly one file was processed, the
    detailed report of that file is printed after the triage summary.
    """
    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)
    # parser.add_option('-o', '--outfile', dest='outfile',
    #     help='output file')
    # parser.add_option('-c', '--csv', dest='csv',
    #     help='export results to a CSV file')
    parser.add_option("-r", action="store_true", dest="recursive",
        help='find files recursively in subdirectories.')
    parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
        help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)')
    parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
        help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
    parser.add_option("-t", action="store_true", dest="triage_mode",
        help='triage mode, display results as a summary table (default for multiple files)')
    parser.add_option("-d", action="store_true", dest="detailed_mode",
        help='detailed mode, display full results (default for single file)')
    parser.add_option("-i", "--input", dest='input', type='str', default=None,
        help='input file containing VBA source code to be analyzed (no parsing)')
    parser.add_option("--decode", action="store_true", dest="show_decoded_strings",
        help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex).')

    (options, args) = parser.parse_args()

    # Print help if no arguments are passed
    if len(args) == 0 and not options.input:
        print(__doc__)
        parser.print_help()
        sys.exit()

    # print banner with version
    print('olevba %s - http://decalage.info/python/oletools' % __version__)

    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) #INFO)
    # For now, all logging is disabled:
    logging.disable(logging.CRITICAL)

    if options.input:
        # input file provided with VBA source code to be analyzed directly:
        print('Analysis of VBA source code from %s:' % options.input)
        with open(options.input) as input_file:
            vba_code = input_file.read()
        print_analysis(vba_code, show_decoded_strings=options.show_decoded_strings)
        sys.exit()

    # detailed output only if -d was given without -t, triage output otherwise:
    detailed_only = bool(options.detailed_mode and not options.triage_mode)

    if not detailed_only:
        print('%-11s %-65s' % ('Flags', 'Filename'))
        print('%-11s %-65s' % ('-'*11, '-'*65))
    previous_container = None
    count = 0
    # Last entry actually processed. The loop variables cannot be used for
    # this after the loop, because they may refer to a skipped directory entry.
    last_processed = None
    for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
            zip_password=options.zip_password, zip_fname=options.zip_fname):
        # ignore directory names stored in zip files:
        if container and filename.endswith('/'):
            continue
        if detailed_only:
            # fully detailed output
            process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings)
        else:
            # print container name when it changes:
            if container != previous_container:
                if container is not None:
                    print('\nFiles in %s:' % container)
                previous_container = container
            # summarized output for triage:
            process_file_triage(container, filename, data)
        last_processed = (container, filename, data)
        count += 1
    if not detailed_only:
        print('\n(Flags: OpX=OpenXML, XML=Word2003XML, M=Macros, A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, B=Base64 strings, D=Dridex strings, ?=Unknown)\n')

    if count == 1 and not options.triage_mode and not options.detailed_mode:
        # if options -t and -d were not specified and it's a single file, print details:
        #TODO: avoid doing the analysis twice by storing results
        container, filename, data = last_processed
        process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings)
| 1667 | + | ||
# Run the command line interface only when this file is executed as a script,
# not when it is imported as a module:
if __name__ == '__main__':
    main()
| 1670 | + | ||
| 1671 | # This was coded while listening to "Dust" from I Love You But I've Chosen Darkness | 1671 | # This was coded while listening to "Dust" from I Love You But I've Chosen Darkness |
| 1672 | \ No newline at end of file | 1672 | \ No newline at end of file |