Commit a4ffb743f926d59e022f10313ca70d6af9f8c8b7
1 parent
41896bcf
olevba: changed line endings from CRLF to LF
Showing
1 changed file
with
1670 additions
and
1670 deletions
oletools/olevba.py
100644 → 100755
| 1 | -#!/usr/bin/env python | |
| 2 | -""" | |
| 3 | -olevba.py | |
| 4 | - | |
| 5 | -olevba is a script to parse OLE and OpenXML files such as MS Office documents | |
| 6 | -(e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate | |
| 7 | -and analyze malicious macros. | |
| 8 | - | |
| 9 | -Supported formats: | |
| 10 | -- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) | |
| 11 | -- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) | |
| 12 | -- PowerPoint 2007+ (.pptm, .ppsm) | |
| 13 | -- Word 2003 XML (.xml) | |
| 14 | - | |
| 15 | -Author: Philippe Lagadec - http://www.decalage.info | |
| 16 | -License: BSD, see source code or documentation | |
| 17 | - | |
| 18 | -olevba is part of the python-oletools package: | |
| 19 | -http://www.decalage.info/python/oletools | |
| 20 | - | |
| 21 | -olevba is based on source code from officeparser by John William Davison | |
| 22 | -https://github.com/unixfreak0037/officeparser | |
| 23 | -""" | |
| 24 | - | |
| 25 | -#=== LICENSE ================================================================== | |
| 26 | - | |
| 27 | -# olevba is copyright (c) 2014-2015 Philippe Lagadec (http://www.decalage.info) | |
| 28 | -# All rights reserved. | |
| 29 | -# | |
| 30 | -# Redistribution and use in source and binary forms, with or without modification, | |
| 31 | -# are permitted provided that the following conditions are met: | |
| 32 | -# | |
| 33 | -# * Redistributions of source code must retain the above copyright notice, this | |
| 34 | -# list of conditions and the following disclaimer. | |
| 35 | -# * Redistributions in binary form must reproduce the above copyright notice, | |
| 36 | -# this list of conditions and the following disclaimer in the documentation | |
| 37 | -# and/or other materials provided with the distribution. | |
| 38 | -# | |
| 39 | -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
| 40 | -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| 41 | -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| 42 | -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | |
| 43 | -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 44 | -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
| 45 | -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
| 46 | -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
| 47 | -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| 48 | -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 49 | - | |
| 50 | - | |
| 51 | -# olevba contains modified source code from the officeparser project, published | |
| 52 | -# under the following MIT License (MIT): | |
| 53 | -# | |
| 54 | -# officeparser is copyright (c) 2014 John William Davison | |
| 55 | -# | |
| 56 | -# Permission is hereby granted, free of charge, to any person obtaining a copy | |
| 57 | -# of this software and associated documentation files (the "Software"), to deal | |
| 58 | -# in the Software without restriction, including without limitation the rights | |
| 59 | -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| 60 | -# copies of the Software, and to permit persons to whom the Software is | |
| 61 | -# furnished to do so, subject to the following conditions: | |
| 62 | -# | |
| 63 | -# The above copyright notice and this permission notice shall be included in all | |
| 64 | -# copies or substantial portions of the Software. | |
| 65 | -# | |
| 66 | -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 67 | -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 68 | -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| 69 | -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 70 | -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| 71 | -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| 72 | -# SOFTWARE. | |
| 73 | - | |
| 74 | -#------------------------------------------------------------------------------ | |
| 75 | -# CHANGELOG: | |
| 76 | -# 2014-08-05 v0.01 PL: - first version based on officeparser code | |
| 77 | -# 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser | |
| 78 | -# 2014-08-15 PL: - fixed incorrect value check in PROJECTHELPFILEPATH Record | |
| 79 | -# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats | |
| 80 | -# and to find the VBA project root anywhere in the file | |
| 81 | -# 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL | |
| 82 | -# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API | |
| 83 | -# - added detect_vba_macros | |
| 84 | -# 2014-12-10 v0.06 PL: - hide first lines with VB attributes | |
| 85 | -# - detect auto-executable macros | |
| 86 | -# - ignore empty macros | |
| 87 | -# 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive | |
| 88 | -# 2014-12-15 v0.08 PL: - improved display for empty macros | |
| 89 | -# - added pattern extraction | |
| 90 | -# 2014-12-25 v0.09 PL: - added suspicious keywords detection | |
| 91 | -# 2014-12-27 v0.10 PL: - added OptionParser, main and process_file | |
| 92 | -# - uses xglob to scan several files with wildcards | |
| 93 | -# - option -r to recurse subdirectories | |
| 94 | -# - option -z to scan files in password-protected zips | |
| 95 | -# 2015-01-02 v0.11 PL: - improved filter_vba to detect colons | |
| 96 | -# 2015-01-03 v0.12 PL: - fixed detect_patterns to detect all patterns | |
| 97 | -# - process_file: improved display, shows container file | |
| 98 | -# - improved list of executable file extensions | |
| 99 | -# 2015-01-04 v0.13 PL: - added several suspicious keywords, improved display | |
| 100 | -# 2015-01-08 v0.14 PL: - added hex strings detection and decoding | |
| 101 | -# - fixed issue #2, decoding VBA stream names using | |
| 102 | -# specified codepage and unicode stream names | |
| 103 | -# 2015-01-11 v0.15 PL: - added new triage mode, options -t and -d | |
| 104 | -# 2015-01-16 v0.16 PL: - fix for issue #3 (exception when module name="text") | |
| 105 | -# - added several suspicious keywords | |
| 106 | -# - added option -i to analyze VBA source code directly | |
| 107 | -# 2015-01-17 v0.17 PL: - removed .com from the list of executable extensions | |
| 108 | -# - added scan_vba to run all detection algorithms | |
| 109 | -# - decoded hex strings are now also scanned + reversed | |
| 110 | -# 2015-01-23 v0.18 PL: - fixed issue #3, case-insensitive search in code_modules | |
| 111 | -# 2015-01-24 v0.19 PL: - improved the detection of IOCs obfuscated with hex | |
| 112 | -# strings and StrReverse | |
| 113 | -# 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded | |
| 114 | -# 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding | |
| 115 | -# - improved display, shows obfuscation name | |
| 116 | -# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename | |
| 117 | -# - added Base64 obfuscation decoding (contribution from | |
| 118 | -# @JamesHabben) | |
| 119 | -# 2015-02-03 v0.23 PL: - triage now uses VBA_Scanner results, shows Base64 and | |
| 120 | -# Dridex strings | |
| 121 | -# - exception handling in detect_base64_strings | |
| 122 | -# 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display | |
| 123 | -# - display exceptions with stack trace | |
| 124 | -# - added several suspicious keywords | |
| 125 | -# - improved Base64 detection and decoding | |
| 126 | -# - fixed triage mode not to scan attrib lines | |
| 127 | -# 2015-03-04 v0.25 PL: - added support for Word 2003 XML | |
| 128 | - | |
| 129 | -__version__ = '0.25' | |
| 130 | - | |
| 131 | -#------------------------------------------------------------------------------ | |
| 132 | -# TODO: | |
| 133 | -# + do not use logging, but a provided logger (null logger by default) | |
| 134 | -# + setup logging (common with other oletools) | |
| 135 | -# + add xor bruteforcing like bbharvest | |
| 136 | -# + add chr() decoding | |
| 137 | - | |
| 138 | -# TODO later: | |
| 139 | -# + performance improvement: instead of searching each keyword separately, | |
| 140 | -# first split vba code into a list of words (per line), then check each | |
| 141 | -# word against a dict. (or put vba words into a set/dict?) | |
| 142 | -# + for regex, maybe combine them into a single re with named groups? | |
| 143 | -# + add Yara support, include sample rules? plugins like balbuzard? | |
| 144 | -# + add balbuzard support | |
| 145 | -# + output to file (replace print by file.write, sys.stdout by default) | |
| 146 | -# + look for VBA in embedded documents (e.g. Excel in Word) | |
| 147 | -# + support SRP streams (see Lenny's article + links and sample) | |
| 148 | -# - python 3.x support | |
| 149 | -# - add support for PowerPoint macros (see libclamav, libgsf), use oledump heuristic? | |
| 150 | -# - check VBA macros in Visio, Access, Project, etc | |
| 151 | -# - extract_macros: convert to a class, split long function into smaller methods | |
| 152 | -# - extract_macros: read bytes from stream file objects instead of strings | |
| 153 | -# - extract_macros: use combined struct.unpack instead of many calls | |
| 154 | - | |
| 155 | -#------------------------------------------------------------------------------ | |
| 156 | -# REFERENCES: | |
| 157 | -# - [MS-OVBA]: Microsoft Office VBA File Format Structure | |
| 158 | -# http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx | |
| 159 | -# - officeparser: https://github.com/unixfreak0037/officeparser | |
| 160 | - | |
| 161 | - | |
| 162 | -#--- IMPORTS ------------------------------------------------------------------ | |
| 163 | - | |
| 164 | -import sys, logging | |
| 165 | -import struct | |
| 166 | -import cStringIO | |
| 167 | -import math | |
| 168 | -import zipfile | |
| 169 | -import re | |
| 170 | -import optparse | |
| 171 | -import os.path | |
| 172 | -import binascii | |
| 173 | -import base64 | |
| 174 | -import traceback | |
| 175 | -import zlib | |
| 176 | - | |
| 177 | -# import lxml or ElementTree for XML parsing: | |
| 178 | -try: | |
| 179 | - # lxml: best performance for XML processing | |
| 180 | - import lxml.etree as ET | |
| 181 | -except ImportError: | |
| 182 | - try: | |
| 183 | - # Python 2.5+: batteries included | |
| 184 | - import xml.etree.cElementTree as ET | |
| 185 | - except ImportError: | |
| 186 | - try: | |
| 187 | - # Python <2.5: standalone ElementTree install | |
| 188 | - import elementtree.cElementTree as ET | |
| 189 | - except ImportError: | |
| 190 | - raise ImportError, "lxml or ElementTree are not installed, "\ | |
| 191 | - +"see http://codespeak.net/lxml "\ | |
| 192 | - +"or http://effbot.org/zone/element-index.htm" | |
| 193 | - | |
| 194 | -import thirdparty.olefile as olefile | |
| 195 | -from thirdparty.prettytable import prettytable | |
| 196 | -from thirdparty.xglob import xglob | |
| 197 | - | |
| 198 | -#--- CONSTANTS ---------------------------------------------------------------- | |
| 199 | - | |
# Identifiers for the supported container formats:
TYPE_OLE = 'OLE'
TYPE_OpenXML = 'OpenXML'
TYPE_Word2003_XML = 'Word2003_XML'

# File extensions associated with each kind of VBA code module:
MODULE_EXTENSION, CLASS_EXTENSION, FORM_EXTENSION = "bas", "cls", "frm"

# Word2003 XML parsing: namespace prefix, then the qualified tag and attribute
# names derived from it.
NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}'
# <w:binData w:name="editdata.mso"> is the element holding the VBA macro code:
TAG_BINDATA = '%sbinData' % NS_W
ATTR_NAME = '%sname' % NS_W
| 213 | - | |
# Names of macros that run automatically on a document event.
# Maps a human-readable description of the trigger to the tuple of macro
# names associated with it.
AUTOEXEC_KEYWORDS = dict((
    # MS Word:
    ('Runs when the Word document is opened',
     ('AutoExec', 'AutoOpen', 'Document_Open', 'DocumentOpen')),
    ('Runs when the Word document is closed',
     ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose')),
    ('Runs when the Word document is modified',
     ('DocumentChange',)),
    ('Runs when a new Word document is created',
     ('AutoNew', 'Document_New', 'NewDocument')),

    # MS Excel:
    ('Runs when the Excel Workbook is opened',
     ('Auto_Open', 'Workbook_Open')),
    ('Runs when the Excel Workbook is closed',
     ('Auto_Close', 'Workbook_Close')),

    #TODO: full list in MS specs??
))
| 234 | - | |
# Keywords that may indicate malicious behaviour in VBA code.
# Maps a human-readable description of the behaviour to the tuple of keywords
# associated with it.
# See VBA language reference: http://msdn.microsoft.com/en-us/library/office/jj692818%28v=office.15%29.aspx
#TODO: use regex to support variable whitespaces
SUSPICIOUS_KEYWORDS = dict((
    ('May read system environment variables',
     ('Environ',)),
    ('May open a file',
     ('Open',)),
    #TODO: regex to find Open+Write on same line
    ('May write to a file (if combined with Open)',
     ('Write', 'Put', 'Output', 'Print #')),
    #TODO: regex to find Open+Binary on same line
    ('May read or write a binary file (if combined with Open)',
     ('Binary',)),
    #FileCopy: http://msdn.microsoft.com/en-us/library/office/gg264390%28v=office.15%29.aspx
    #CopyFile: http://msdn.microsoft.com/en-us/library/office/gg264089%28v=office.15%29.aspx
    ('May copy a file',
     ('FileCopy', 'CopyFile')),
    ('May delete a file',
     ('Kill',)),
    #CreateTextFile: http://msdn.microsoft.com/en-us/library/office/gg264617%28v=office.15%29.aspx
    #ADODB.Stream sample: http://pastebin.com/Z4TMyuq6
    ('May create a text file',
     ('CreateTextFile', 'ADODB.Stream', 'WriteText', 'SaveToFile')),
    #Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx
    #WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6
    ('May run an executable file or a system command',
     ('Shell', 'vbNormal', 'vbNormalFocus', 'vbHide', 'vbMinimizedFocus',
      'vbMaximizedFocus', 'vbNormalNoFocus', 'vbMinimizedNoFocus',
      'WScript.Shell', 'Run')),
    ('May hide the application',
     ('Application.Visible', 'ShowWindow', 'SW_HIDE')),
    ('May create a directory',
     ('MkDir',)),
    ('May save the current workbook',
     ('ActiveWorkbook.SaveAs',)),
    #TODO: confirm the actual effect
    ('May change which directory contains files to open at startup',
     ('Application.AltStartupPath',)),
    ('May create an OLE object',
     ('CreateObject',)),
    ('May run an application (if combined with CreateObject)',
     ('Shell.Application',)),
    ('May enumerate application windows (if combined with Shell.Application object)',
     ('Windows', 'FindWindow')),
    #TODO: regex to find declare+lib on same line
    ('May run code from a DLL',
     ('Lib',)),
    #TODO: regex to find urlmon+URLDownloadToFileA on same line
    ('May download files from the Internet',
     ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP')),
    #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx
    ('May control another application by simulating user keystrokes',
     ('SendKeys', 'AppActivate')),
    #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx
    ('May attempt to obfuscate malicious function calls',
     ('CallByName',)),
    #TODO: regex to find several Chr*, not just one
    #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx
    ('May attempt to obfuscate specific strings',
     ('Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor')),
))
| 296 | - | |
# Regular Expression for a URL:
# http://en.wikipedia.org/wiki/Uniform_resource_locator
# http://www.w3.org/Addressing/URL/uri-spec.html
#TODO: also support username:password@server
#TODO: other protocols (file, gopher, wais, ...?)
SCHEME = r'\b(?:http|ftp)s?'
# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})'
DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + ')'
#TODO: IPv6 - see https://www.debuggex.com/
# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. [db8:0cec::99:123a]
NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'
IPv4 = r'(?:'+NUMBER_0_255+r'\.){3}'+NUMBER_0_255
# IPv4 must come before the DNS name because it is more specific
SERVER = r'(?:' + IPv4 + '|' + DNS_NAME + ')'
PORT = r'(?:\:[0-9]{1,5})?'
SERVER_PORT = SERVER + PORT
URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?' # [^\.\,\)\(\s"]
URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH
re_url = re.compile(URL_RE)


# Patterns to be extracted (IP addresses, URLs, etc)
# From patterns.py in balbuzard
# Each entry is a (pattern name, compiled regex) pair.
RE_PATTERNS = (
    ('URL', re.compile(URL_RE)),
    ('IPv4 address', re.compile(IPv4)),
    # NOTE: the trailing word boundary must be a raw string: a plain '\b' is
    # the backspace character (0x08), which made this pattern unmatchable.
    ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@' + SERVER + r'\b')),
    # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')),
    # Executable file name with known extensions (except .com which is present in many URLs, and .application):
    ("Executable file name", re.compile(r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b")),
    # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
    #TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types
    #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')),
    )
| 332 | - | |
# regex to detect strings encoded in hexadecimal
# (four or more consecutive pairs of hex digits)
re_hex_string = re.compile(
    r'(?:[0-9A-Fa-f]{2}){4,}'
)

# regex to detect strings encoded in base64
#re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?"')
# better version from balbuzard, fewer false positives:
re_base64_string = re.compile(
    r'"(?:[A-Za-z0-9+/]{4}){1,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)?"'
)
# white list of common strings matching the base64 regex, but which are not base64 strings (all lowercase):
BASE64_WHITELIST = set((
    'thisdocument',
    'thisworkbook',
    'test',
    'temp',
    'http',
    'open',
    'exit',
))

# regex to detect strings encoded with a specific Dridex algorithm
# (see https://github.com/JamesHabben/MalwareStuff)
re_dridex_string = re.compile(
    r'"[0-9A-Za-z]{20,}"'
)
# regex to check that it is not just a hex string:
re_nothex_check = re.compile(
    r'[G-Zg-z]'
)
| 348 | - | |
| 349 | -#--- FUNCTIONS ---------------------------------------------------------------- | |
| 350 | - | |
def copytoken_help(decompressed_current, decompressed_chunk_start):
    """
    compute bit masks to decode a CopyToken according to MS-OVBA 2.4.1.3.19.1 CopyToken Help

    decompressed_current: number of decompressed bytes so far, i.e. len(decompressed_container)
    decompressed_chunk_start: offset of the current chunk in the decompressed container
    return length_mask, offset_mask, bit_count, maximum_length
    raise ValueError if decompressed_current is not greater than
        decompressed_chunk_start (the logarithm of the difference is undefined)
    """
    difference = decompressed_current - decompressed_chunk_start
    if difference < 1:
        # log2 is undefined for values <= 0: report it explicitly instead of
        # relying on an opaque "math domain error"
        raise ValueError(
            'invalid CopyToken position: decompressed_current ({0}) must be greater '
            'than decompressed_chunk_start ({1})'.format(decompressed_current, decompressed_chunk_start))
    # bit_count = ceil(log2(difference)), computed with exact integer
    # arithmetic to avoid floating-point rounding, and never less than 4
    bit_count = max((difference - 1).bit_length(), 4)
    length_mask = 0xFFFF >> bit_count
    # NOTE: this is a negative Python integer; it is meant to be ANDed with a
    # 16-bit CopyToken, which yields the expected non-negative offset bits
    offset_mask = ~length_mask
    maximum_length = length_mask + 3
    return length_mask, offset_mask, bit_count, maximum_length
| 366 | - | |
| 367 | - | |
def decompress_stream(compressed_container):
    """
    Decompress a stream according to MS-OVBA section 2.4.1

    compressed_container: string (bytes) compressed according to the MS-OVBA 2.4.1.3.6 Compression algorithm
    return the decompressed container as a string (bytes)
    raise ValueError if the container is empty, has an invalid signature byte
        or chunk header, is truncated in the middle of a chunk header or of a
        CopyToken, or contains a CopyToken that cannot be resolved
    """
    # 2.4.1.2 State Variables

    # The following state is maintained for the CompressedContainer (section 2.4.1.1.1):
    # CompressedRecordEnd: The location of the byte after the last byte in the CompressedContainer (section 2.4.1.1.1).
    # CompressedCurrent: The location of the next byte in the CompressedContainer (section 2.4.1.1.1) to be read by
    #                    decompression or to be written by compression.

    # The following state is maintained for the current CompressedChunk (section 2.4.1.1.4):
    # CompressedChunkStart: The location of the first byte of the CompressedChunk (section 2.4.1.1.4) within the
    #                       CompressedContainer (section 2.4.1.1.1).

    # The following state is maintained for a DecompressedBuffer (section 2.4.1.1.2):
    # DecompressedCurrent: The location of the next byte in the DecompressedBuffer (section 2.4.1.1.2) to be written by
    #                      decompression or to be read by compression.
    # DecompressedBufferEnd: The location of the byte after the last byte in the DecompressedBuffer (section 2.4.1.1.2).

    # The following state is maintained for the current DecompressedChunk (section 2.4.1.1.3):
    # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the
    #                         DecompressedBuffer (section 2.4.1.1.2).

    # Work on bytearrays: items are integers on both Python 2 and 3 (no ord()
    # needed), and the output grows in place instead of building a new
    # immutable string for every decompressed byte.
    compressed = bytearray(compressed_container)
    #NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that
    # CompressedRecordEnd = len(compressed_container)
    compressed_length = len(compressed)
    decompressed = bytearray()  # result

    if compressed_length == 0:
        raise ValueError('empty compressed container')
    sig_byte = compressed[0]
    if sig_byte != 0x01:
        raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))
    compressed_current = 1

    while compressed_current < compressed_length:
        # 2.4.1.1.5
        compressed_chunk_start = compressed_current
        # chunk header = first 16 bits (little endian)
        if compressed_chunk_start + 2 > compressed_length:
            raise ValueError('Truncated CompressedChunkHeader in VBA compressed stream')
        compressed_chunk_header = (compressed[compressed_chunk_start]
                                   | (compressed[compressed_chunk_start + 1] << 8))
        # chunk size = 12 first bits of header + 3
        chunk_size = (compressed_chunk_header & 0x0FFF) + 3
        # chunk signature = 3 next bits - should always be 0b011
        chunk_signature = (compressed_chunk_header >> 12) & 0x07
        if chunk_signature != 0b011:
            raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream')
        # chunk flag = next bit - 1 == compressed, 0 == uncompressed
        chunk_flag = (compressed_chunk_header >> 15) & 0x01
        logging.debug("chunk size = {0}, compressed flag = {1}".format(chunk_size, chunk_flag))

        #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096)
        # The minimum size is 3 bytes
        # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value
        # in chunk header before adding 3).
        # Also the first test is not useful since a 12 bits value cannot be larger than 4095.
        if chunk_flag == 1 and chunk_size > 4098:
            raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1')
        if chunk_flag == 0 and chunk_size != 4098:
            raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0')

        # check if chunk_size goes beyond the compressed data, instead of silently cutting it:
        #TODO: raise an exception?
        if compressed_chunk_start + chunk_size > compressed_length:
            logging.warning('Chunk size is larger than remaining compressed data')
        compressed_end = min(compressed_length, compressed_chunk_start + chunk_size)
        # read after chunk header:
        compressed_current = compressed_chunk_start + 2

        if chunk_flag == 0:
            # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
            # uncompressed chunk: read the next 4096 bytes as-is
            # (best effort: if fewer bytes are left, the warning above has
            # been logged and whatever remains is copied)
            decompressed += compressed[compressed_current:compressed_current + 4096]
            compressed_current += 4096
            continue

        # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
        decompressed_chunk_start = len(decompressed)
        while compressed_current < compressed_end:
            # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence
            # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
            # copy tokens (reference to a previous literal token)
            flag_byte = compressed[compressed_current]
            compressed_current += 1
            for bit_index in range(8):
                if compressed_current >= compressed_end:
                    break
                # MS-OVBA 2.4.1.3.5 Decompressing a Token
                # MS-OVBA 2.4.1.3.17 Extract FlagBit
                flag_bit = (flag_byte >> bit_index) & 1
                if flag_bit == 0:  # LiteralToken
                    # copy one byte directly to output
                    decompressed.append(compressed[compressed_current])
                    compressed_current += 1
                    continue

                # CopyToken
                # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
                if compressed_current + 2 > compressed_end:
                    raise ValueError('Truncated CopyToken in VBA compressed stream')
                decompressed_current = len(decompressed)
                if decompressed_current == decompressed_chunk_start:
                    # nothing has been decompressed in this chunk yet, so
                    # there is nothing a CopyToken could refer to
                    raise ValueError('CopyToken found at the start of a chunk in VBA compressed stream')
                copy_token = compressed[compressed_current] | (compressed[compressed_current + 1] << 8)
                length_mask, offset_mask, bit_count, maximum_length = copytoken_help(
                    decompressed_current, decompressed_chunk_start)
                length = (copy_token & length_mask) + 3
                offset = ((copy_token & offset_mask) >> (16 - bit_count)) + 1
                copy_source = decompressed_current - offset
                if copy_source < 0:
                    # a negative index would silently wrap around and copy
                    # unrelated bytes from the end of the output
                    raise ValueError('CopyToken offset points before the start of the decompressed data')
                # The source and destination ranges may overlap (this is how
                # runs of repeated bytes are encoded), so the copy has to be
                # done one byte at a time.
                for index in range(copy_source, copy_source + length):
                    decompressed.append(decompressed[index])
                compressed_current += 2
    return bytes(decompressed)
| 484 | - | |
| 485 | - | |
| 486 | -def _extract_vba (ole, vba_root, project_path, dir_path): | |
| 487 | - """ | |
| 488 | - Extract VBA macros from an OleFileIO object. | |
| 489 | - Internal function, do not call directly. | |
| 490 | - | |
| 491 | - vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream | |
| 492 | - vba_project: path to the PROJECT stream | |
| 493 | - This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream | |
| 494 | - """ | |
| 495 | - # Open the PROJECT stream: | |
| 496 | - project = ole.openstream(project_path) | |
| 497 | - | |
| 498 | - # sample content of the PROJECT stream: | |
| 499 | - | |
| 500 | - ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}" | |
| 501 | - ## Document=ThisDocument/&H00000000 | |
| 502 | - ## Module=NewMacros | |
| 503 | - ## Name="Project" | |
| 504 | - ## HelpContextID="0" | |
| 505 | - ## VersionCompatible32="393222000" | |
| 506 | - ## CMG="F1F301E705E705E705E705" | |
| 507 | - ## DPB="8F8D7FE3831F2020202020" | |
| 508 | - ## GC="2D2FDD81E51EE61EE6E1" | |
| 509 | - ## | |
| 510 | - ## [Host Extender Info] | |
| 511 | - ## &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000 | |
| 512 | - ## &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000 | |
| 513 | - ## | |
| 514 | - ## [Workspace] | |
| 515 | - ## ThisDocument=22, 29, 339, 477, Z | |
| 516 | - ## NewMacros=-4, 42, 832, 510, C | |
| 517 | - | |
| 518 | - code_modules = {} | |
| 519 | - | |
| 520 | - for line in project: | |
| 521 | - line = line.strip() | |
| 522 | - if '=' in line: | |
| 523 | - # split line at the 1st equal sign: | |
| 524 | - name, value = line.split('=', 1) | |
| 525 | - # looking for code modules | |
| 526 | - # add the code module as a key in the dictionary | |
| 527 | - # the value will be the extension needed later | |
| 528 | - # The value is converted to lowercase, to allow case-insensitive matching (issue #3) | |
| 529 | - value = value.lower() | |
| 530 | - if name == 'Document': | |
| 531 | - # split value at the 1st slash, keep 1st part: | |
| 532 | - value = value.split('/', 1)[0] | |
| 533 | - code_modules[value] = CLASS_EXTENSION | |
| 534 | - elif name == 'Module': | |
| 535 | - code_modules[value] = MODULE_EXTENSION | |
| 536 | - elif name == 'Class': | |
| 537 | - code_modules[value] = CLASS_EXTENSION | |
| 538 | - elif name == 'BaseClass': | |
| 539 | - code_modules[value] = FORM_EXTENSION | |
| 540 | - | |
| 541 | - # read data from dir stream (compressed) | |
| 542 | - dir_compressed = ole.openstream(dir_path).read() | |
| 543 | - | |
| 544 | - def check_value(name, expected, value): | |
| 545 | - if expected != value: | |
| 546 | - logging.error("invalid value for {0} expected {1:04X} got {2:04X}".format(name, expected, value)) | |
| 547 | - | |
| 548 | - dir_stream = cStringIO.StringIO(decompress_stream(dir_compressed)) | |
| 549 | - | |
| 550 | - # PROJECTSYSKIND Record | |
| 551 | - PROJECTSYSKIND_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 552 | - check_value('PROJECTSYSKIND_Id', 0x0001, PROJECTSYSKIND_Id) | |
| 553 | - PROJECTSYSKIND_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 554 | - check_value('PROJECTSYSKIND_Size', 0x0004, PROJECTSYSKIND_Size) | |
| 555 | - PROJECTSYSKIND_SysKind = struct.unpack("<L", dir_stream.read(4))[0] | |
| 556 | - if PROJECTSYSKIND_SysKind == 0x00: | |
| 557 | - logging.debug("16-bit Windows") | |
| 558 | - elif PROJECTSYSKIND_SysKind == 0x01: | |
| 559 | - logging.debug("32-bit Windows") | |
| 560 | - elif PROJECTSYSKIND_SysKind == 0x02: | |
| 561 | - logging.debug("Macintosh") | |
| 562 | - elif PROJECTSYSKIND_SysKind == 0x03: | |
| 563 | - logging.debug("64-bit Windows") | |
| 564 | - else: | |
| 565 | - logging.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(PROJECTSYSKIND_SysKind)) | |
| 566 | - | |
| 567 | - # PROJECTLCID Record | |
| 568 | - PROJECTLCID_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 569 | - check_value('PROJECTLCID_Id', 0x0002, PROJECTLCID_Id) | |
| 570 | - PROJECTLCID_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 571 | - check_value('PROJECTLCID_Size', 0x0004, PROJECTLCID_Size) | |
| 572 | - PROJECTLCID_Lcid = struct.unpack("<L", dir_stream.read(4))[0] | |
| 573 | - check_value('PROJECTLCID_Lcid', 0x409, PROJECTLCID_Lcid) | |
| 574 | - | |
| 575 | - # PROJECTLCIDINVOKE Record | |
| 576 | - PROJECTLCIDINVOKE_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 577 | - check_value('PROJECTLCIDINVOKE_Id', 0x0014, PROJECTLCIDINVOKE_Id) | |
| 578 | - PROJECTLCIDINVOKE_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 579 | - check_value('PROJECTLCIDINVOKE_Size', 0x0004, PROJECTLCIDINVOKE_Size) | |
| 580 | - PROJECTLCIDINVOKE_LcidInvoke = struct.unpack("<L", dir_stream.read(4))[0] | |
| 581 | - check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, PROJECTLCIDINVOKE_LcidInvoke) | |
| 582 | - | |
| 583 | - # PROJECTCODEPAGE Record | |
| 584 | - PROJECTCODEPAGE_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 585 | - check_value('PROJECTCODEPAGE_Id', 0x0003, PROJECTCODEPAGE_Id) | |
| 586 | - PROJECTCODEPAGE_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 587 | - check_value('PROJECTCODEPAGE_Size', 0x0002, PROJECTCODEPAGE_Size) | |
| 588 | - PROJECTCODEPAGE_CodePage = struct.unpack("<H", dir_stream.read(2))[0] | |
| 589 | - | |
| 590 | - # PROJECTNAME Record | |
| 591 | - PROJECTNAME_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 592 | - check_value('PROJECTNAME_Id', 0x0004, PROJECTNAME_Id) | |
| 593 | - PROJECTNAME_SizeOfProjectName = struct.unpack("<L", dir_stream.read(4))[0] | |
| 594 | - if PROJECTNAME_SizeOfProjectName < 1 or PROJECTNAME_SizeOfProjectName > 128: | |
| 595 | - logging.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(PROJECTNAME_SizeOfProjectName)) | |
| 596 | - PROJECTNAME_ProjectName = dir_stream.read(PROJECTNAME_SizeOfProjectName) | |
| 597 | - | |
| 598 | - # PROJECTDOCSTRING Record | |
| 599 | - PROJECTDOCSTRING_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 600 | - check_value('PROJECTDOCSTRING_Id', 0x0005, PROJECTDOCSTRING_Id) | |
| 601 | - PROJECTDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0] | |
| 602 | - if PROJECTNAME_SizeOfProjectName > 2000: | |
| 603 | - logging.error("PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(PROJECTDOCSTRING_SizeOfDocString)) | |
| 604 | - PROJECTDOCSTRING_DocString = dir_stream.read(PROJECTDOCSTRING_SizeOfDocString) | |
| 605 | - PROJECTDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 606 | - check_value('PROJECTDOCSTRING_Reserved', 0x0040, PROJECTDOCSTRING_Reserved) | |
| 607 | - PROJECTDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 608 | - if PROJECTDOCSTRING_SizeOfDocStringUnicode % 2 != 0: | |
| 609 | - logging.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even") | |
| 610 | - PROJECTDOCSTRING_DocStringUnicode = dir_stream.read(PROJECTDOCSTRING_SizeOfDocStringUnicode) | |
| 611 | - | |
| 612 | - # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7 | |
| 613 | - PROJECTHELPFILEPATH_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 614 | - check_value('PROJECTHELPFILEPATH_Id', 0x0006, PROJECTHELPFILEPATH_Id) | |
| 615 | - PROJECTHELPFILEPATH_SizeOfHelpFile1 = struct.unpack("<L", dir_stream.read(4))[0] | |
| 616 | - if PROJECTHELPFILEPATH_SizeOfHelpFile1 > 260: | |
| 617 | - logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(PROJECTHELPFILEPATH_SizeOfHelpFile1)) | |
| 618 | - PROJECTHELPFILEPATH_HelpFile1 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile1) | |
| 619 | - PROJECTHELPFILEPATH_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 620 | - check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, PROJECTHELPFILEPATH_Reserved) | |
| 621 | - PROJECTHELPFILEPATH_SizeOfHelpFile2 = struct.unpack("<L", dir_stream.read(4))[0] | |
| 622 | - if PROJECTHELPFILEPATH_SizeOfHelpFile2 != PROJECTHELPFILEPATH_SizeOfHelpFile1: | |
| 623 | - logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2") | |
| 624 | - PROJECTHELPFILEPATH_HelpFile2 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile2) | |
| 625 | - if PROJECTHELPFILEPATH_HelpFile2 != PROJECTHELPFILEPATH_HelpFile1: | |
| 626 | - logging.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2") | |
| 627 | - | |
| 628 | - # PROJECTHELPCONTEXT Record | |
| 629 | - PROJECTHELPCONTEXT_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 630 | - check_value('PROJECTHELPCONTEXT_Id', 0x0007, PROJECTHELPCONTEXT_Id) | |
| 631 | - PROJECTHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 632 | - check_value('PROJECTHELPCONTEXT_Size', 0x0004, PROJECTHELPCONTEXT_Size) | |
| 633 | - PROJECTHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0] | |
| 634 | - | |
| 635 | - # PROJECTLIBFLAGS Record | |
| 636 | - PROJECTLIBFLAGS_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 637 | - check_value('PROJECTLIBFLAGS_Id', 0x0008, PROJECTLIBFLAGS_Id) | |
| 638 | - PROJECTLIBFLAGS_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 639 | - check_value('PROJECTLIBFLAGS_Size', 0x0004, PROJECTLIBFLAGS_Size) | |
| 640 | - PROJECTLIBFLAGS_ProjectLibFlags = struct.unpack("<L", dir_stream.read(4))[0] | |
| 641 | - check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, PROJECTLIBFLAGS_ProjectLibFlags) | |
| 642 | - | |
| 643 | - # PROJECTVERSION Record | |
| 644 | - PROJECTVERSION_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 645 | - check_value('PROJECTVERSION_Id', 0x0009, PROJECTVERSION_Id) | |
| 646 | - PROJECTVERSION_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 647 | - check_value('PROJECTVERSION_Reserved', 0x0004, PROJECTVERSION_Reserved) | |
| 648 | - PROJECTVERSION_VersionMajor = struct.unpack("<L", dir_stream.read(4))[0] | |
| 649 | - PROJECTVERSION_VersionMinor = struct.unpack("<H", dir_stream.read(2))[0] | |
| 650 | - | |
| 651 | - # PROJECTCONSTANTS Record | |
| 652 | - PROJECTCONSTANTS_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 653 | - check_value('PROJECTCONSTANTS_Id', 0x000C, PROJECTCONSTANTS_Id) | |
| 654 | - PROJECTCONSTANTS_SizeOfConstants = struct.unpack("<L", dir_stream.read(4))[0] | |
| 655 | - if PROJECTCONSTANTS_SizeOfConstants > 1015: | |
| 656 | - logging.error("PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(PROJECTCONSTANTS_SizeOfConstants)) | |
| 657 | - PROJECTCONSTANTS_Constants = dir_stream.read(PROJECTCONSTANTS_SizeOfConstants) | |
| 658 | - PROJECTCONSTANTS_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 659 | - check_value('PROJECTCONSTANTS_Reserved', 0x003C, PROJECTCONSTANTS_Reserved) | |
| 660 | - PROJECTCONSTANTS_SizeOfConstantsUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 661 | - if PROJECTCONSTANTS_SizeOfConstantsUnicode % 2 != 0: | |
| 662 | - logging.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even") | |
| 663 | - PROJECTCONSTANTS_ConstantsUnicode = dir_stream.read(PROJECTCONSTANTS_SizeOfConstantsUnicode) | |
| 664 | - | |
| 665 | - # array of REFERENCE records | |
| 666 | - check = None | |
| 667 | - while True: | |
| 668 | - check = struct.unpack("<H", dir_stream.read(2))[0] | |
| 669 | - logging.debug("reference type = {0:04X}".format(check)) | |
| 670 | - if check == 0x000F: | |
| 671 | - break | |
| 672 | - | |
| 673 | - if check == 0x0016: | |
| 674 | - # REFERENCENAME | |
| 675 | - REFERENCE_Id = check | |
| 676 | - REFERENCE_SizeOfName = struct.unpack("<L", dir_stream.read(4))[0] | |
| 677 | - REFERENCE_Name = dir_stream.read(REFERENCE_SizeOfName) | |
| 678 | - REFERENCE_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 679 | - check_value('REFERENCE_Reserved', 0x003E, REFERENCE_Reserved) | |
| 680 | - REFERENCE_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 681 | - REFERENCE_NameUnicode = dir_stream.read(REFERENCE_SizeOfNameUnicode) | |
| 682 | - continue | |
| 683 | - | |
| 684 | - if check == 0x0033: | |
| 685 | - # REFERENCEORIGINAL (followed by REFERENCECONTROL) | |
| 686 | - REFERENCEORIGINAL_Id = check | |
| 687 | - REFERENCEORIGINAL_SizeOfLibidOriginal = struct.unpack("<L", dir_stream.read(4))[0] | |
| 688 | - REFERENCEORIGINAL_LibidOriginal = dir_stream.read(REFERENCEORIGINAL_SizeOfLibidOriginal) | |
| 689 | - continue | |
| 690 | - | |
| 691 | - if check == 0x002F: | |
| 692 | - # REFERENCECONTROL | |
| 693 | - REFERENCECONTROL_Id = check | |
| 694 | - REFERENCECONTROL_SizeTwiddled = struct.unpack("<L", dir_stream.read(4))[0] # ignore | |
| 695 | - REFERENCECONTROL_SizeOfLibidTwiddled = struct.unpack("<L", dir_stream.read(4))[0] | |
| 696 | - REFERENCECONTROL_LibidTwiddled = dir_stream.read(REFERENCECONTROL_SizeOfLibidTwiddled) | |
| 697 | - REFERENCECONTROL_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0] # ignore | |
| 698 | - check_value('REFERENCECONTROL_Reserved1', 0x0000, REFERENCECONTROL_Reserved1) | |
| 699 | - REFERENCECONTROL_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0] # ignore | |
| 700 | - check_value('REFERENCECONTROL_Reserved2', 0x0000, REFERENCECONTROL_Reserved2) | |
| 701 | - # optional field | |
| 702 | - check2 = struct.unpack("<H", dir_stream.read(2))[0] | |
| 703 | - if check2 == 0x0016: | |
| 704 | - REFERENCECONTROL_NameRecordExtended_Id = check | |
| 705 | - REFERENCECONTROL_NameRecordExtended_SizeofName = struct.unpack("<L", dir_stream.read(4))[0] | |
| 706 | - REFERENCECONTROL_NameRecordExtended_Name = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeofName) | |
| 707 | - REFERENCECONTROL_NameRecordExtended_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 708 | - check_value('REFERENCECONTROL_NameRecordExtended_Reserved', 0x003E, REFERENCECONTROL_NameRecordExtended_Reserved) | |
| 709 | - REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 710 | - REFERENCECONTROL_NameRecordExtended_NameUnicode = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode) | |
| 711 | - REFERENCECONTROL_Reserved3 = struct.unpack("<H", dir_stream.read(2))[0] | |
| 712 | - else: | |
| 713 | - REFERENCECONTROL_Reserved3 = check2 | |
| 714 | - | |
| 715 | - check_value('REFERENCECONTROL_Reserved3', 0x0030, REFERENCECONTROL_Reserved3) | |
| 716 | - REFERENCECONTROL_SizeExtended = struct.unpack("<L", dir_stream.read(4))[0] | |
| 717 | - REFERENCECONTROL_SizeOfLibidExtended = struct.unpack("<L", dir_stream.read(4))[0] | |
| 718 | - REFERENCECONTROL_LibidExtended = dir_stream.read(REFERENCECONTROL_SizeOfLibidExtended) | |
| 719 | - REFERENCECONTROL_Reserved4 = struct.unpack("<L", dir_stream.read(4))[0] | |
| 720 | - REFERENCECONTROL_Reserved5 = struct.unpack("<H", dir_stream.read(2))[0] | |
| 721 | - REFERENCECONTROL_OriginalTypeLib = dir_stream.read(16) | |
| 722 | - REFERENCECONTROL_Cookie = struct.unpack("<L", dir_stream.read(4))[0] | |
| 723 | - continue | |
| 724 | - | |
| 725 | - if check == 0x000D: | |
| 726 | - # REFERENCEREGISTERED | |
| 727 | - REFERENCEREGISTERED_Id = check | |
| 728 | - REFERENCEREGISTERED_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 729 | - REFERENCEREGISTERED_SizeOfLibid = struct.unpack("<L", dir_stream.read(4))[0] | |
| 730 | - REFERENCEREGISTERED_Libid = dir_stream.read(REFERENCEREGISTERED_SizeOfLibid) | |
| 731 | - REFERENCEREGISTERED_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0] | |
| 732 | - check_value('REFERENCEREGISTERED_Reserved1', 0x0000, REFERENCEREGISTERED_Reserved1) | |
| 733 | - REFERENCEREGISTERED_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0] | |
| 734 | - check_value('REFERENCEREGISTERED_Reserved2', 0x0000, REFERENCEREGISTERED_Reserved2) | |
| 735 | - continue | |
| 736 | - | |
| 737 | - if check == 0x000E: | |
| 738 | - # REFERENCEPROJECT | |
| 739 | - REFERENCEPROJECT_Id = check | |
| 740 | - REFERENCEPROJECT_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 741 | - REFERENCEPROJECT_SizeOfLibidAbsolute = struct.unpack("<L", dir_stream.read(4))[0] | |
| 742 | - REFERENCEPROJECT_LibidAbsolute = dir_stream.read(REFERENCEPROJECT_SizeOfLibidAbsolute) | |
| 743 | - REFERENCEPROJECT_SizeOfLibidRelative = struct.unpack("<L", dir_stream.read(4))[0] | |
| 744 | - REFERENCEPROJECT_LibidRelative = dir_stream.read(REFERENCEPROJECT_SizeOfLibidRelative) | |
| 745 | - REFERENCEPROJECT_MajorVersion = struct.unpack("<L", dir_stream.read(4))[0] | |
| 746 | - REFERENCEPROJECT_MinorVersion = struct.unpack("<H", dir_stream.read(2))[0] | |
| 747 | - continue | |
| 748 | - | |
| 749 | - logging.error('invalid or unknown check Id {0:04X}'.format(check)) | |
| 750 | - sys.exit(0) | |
| 751 | - | |
| 752 | - PROJECTMODULES_Id = check #struct.unpack("<H", dir_stream.read(2))[0] | |
| 753 | - check_value('PROJECTMODULES_Id', 0x000F, PROJECTMODULES_Id) | |
| 754 | - PROJECTMODULES_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 755 | - check_value('PROJECTMODULES_Size', 0x0002, PROJECTMODULES_Size) | |
| 756 | - PROJECTMODULES_Count = struct.unpack("<H", dir_stream.read(2))[0] | |
| 757 | - PROJECTMODULES_ProjectCookieRecord_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 758 | - check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, PROJECTMODULES_ProjectCookieRecord_Id) | |
| 759 | - PROJECTMODULES_ProjectCookieRecord_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 760 | - check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, PROJECTMODULES_ProjectCookieRecord_Size) | |
| 761 | - PROJECTMODULES_ProjectCookieRecord_Cookie = struct.unpack("<H", dir_stream.read(2))[0] | |
| 762 | - | |
| 763 | - logging.debug("parsing {0} modules".format(PROJECTMODULES_Count)) | |
| 764 | - for x in xrange(0, PROJECTMODULES_Count): | |
| 765 | - MODULENAME_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 766 | - check_value('MODULENAME_Id', 0x0019, MODULENAME_Id) | |
| 767 | - MODULENAME_SizeOfModuleName = struct.unpack("<L", dir_stream.read(4))[0] | |
| 768 | - MODULENAME_ModuleName = dir_stream.read(MODULENAME_SizeOfModuleName) | |
| 769 | - # account for optional sections | |
| 770 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 771 | - if section_id == 0x0047: | |
| 772 | - MODULENAMEUNICODE_Id = section_id | |
| 773 | - MODULENAMEUNICODE_SizeOfModuleNameUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 774 | - MODULENAMEUNICODE_ModuleNameUnicode = dir_stream.read(MODULENAMEUNICODE_SizeOfModuleNameUnicode) | |
| 775 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 776 | - if section_id == 0x001A: | |
| 777 | - MODULESTREAMNAME_id = section_id | |
| 778 | - MODULESTREAMNAME_SizeOfStreamName = struct.unpack("<L", dir_stream.read(4))[0] | |
| 779 | - MODULESTREAMNAME_StreamName = dir_stream.read(MODULESTREAMNAME_SizeOfStreamName) | |
| 780 | - MODULESTREAMNAME_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 781 | - check_value('MODULESTREAMNAME_Reserved', 0x0032, MODULESTREAMNAME_Reserved) | |
| 782 | - MODULESTREAMNAME_SizeOfStreamNameUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 783 | - MODULESTREAMNAME_StreamNameUnicode = dir_stream.read(MODULESTREAMNAME_SizeOfStreamNameUnicode) | |
| 784 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 785 | - if section_id == 0x001C: | |
| 786 | - MODULEDOCSTRING_Id = section_id | |
| 787 | - check_value('MODULEDOCSTRING_Id', 0x001C, MODULEDOCSTRING_Id) | |
| 788 | - MODULEDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0] | |
| 789 | - MODULEDOCSTRING_DocString = dir_stream.read(MODULEDOCSTRING_SizeOfDocString) | |
| 790 | - MODULEDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 791 | - check_value('MODULEDOCSTRING_Reserved', 0x0048, MODULEDOCSTRING_Reserved) | |
| 792 | - MODULEDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 793 | - MODULEDOCSTRING_DocStringUnicode = dir_stream.read(MODULEDOCSTRING_SizeOfDocStringUnicode) | |
| 794 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 795 | - if section_id == 0x0031: | |
| 796 | - MODULEOFFSET_Id = section_id | |
| 797 | - check_value('MODULEOFFSET_Id', 0x0031, MODULEOFFSET_Id) | |
| 798 | - MODULEOFFSET_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 799 | - check_value('MODULEOFFSET_Size', 0x0004, MODULEOFFSET_Size) | |
| 800 | - MODULEOFFSET_TextOffset = struct.unpack("<L", dir_stream.read(4))[0] | |
| 801 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 802 | - if section_id == 0x001E: | |
| 803 | - MODULEHELPCONTEXT_Id = section_id | |
| 804 | - check_value('MODULEHELPCONTEXT_Id', 0x001E, MODULEHELPCONTEXT_Id) | |
| 805 | - MODULEHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 806 | - check_value('MODULEHELPCONTEXT_Size', 0x0004, MODULEHELPCONTEXT_Size) | |
| 807 | - MODULEHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0] | |
| 808 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 809 | - if section_id == 0x002C: | |
| 810 | - MODULECOOKIE_Id = section_id | |
| 811 | - check_value('MODULECOOKIE_Id', 0x002C, MODULECOOKIE_Id) | |
| 812 | - MODULECOOKIE_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 813 | - check_value('MODULECOOKIE_Size', 0x0002, MODULECOOKIE_Size) | |
| 814 | - MODULECOOKIE_Cookie = struct.unpack("<H", dir_stream.read(2))[0] | |
| 815 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 816 | - if section_id == 0x0021 or section_id == 0x0022: | |
| 817 | - MODULETYPE_Id = section_id | |
| 818 | - MODULETYPE_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 819 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 820 | - if section_id == 0x0025: | |
| 821 | - MODULEREADONLY_Id = section_id | |
| 822 | - check_value('MODULEREADONLY_Id', 0x0025, MODULEREADONLY_Id) | |
| 823 | - MODULEREADONLY_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 824 | - check_value('MODULEREADONLY_Reserved', 0x0000, MODULEREADONLY_Reserved) | |
| 825 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 826 | - if section_id == 0x0028: | |
| 827 | - MODULEPRIVATE_Id = section_id | |
| 828 | - check_value('MODULEPRIVATE_Id', 0x0028, MODULEPRIVATE_Id) | |
| 829 | - MODULEPRIVATE_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 830 | - check_value('MODULEPRIVATE_Reserved', 0x0000, MODULEPRIVATE_Reserved) | |
| 831 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 832 | - if section_id == 0x002B: # TERMINATOR | |
| 833 | - MODULE_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 834 | - check_value('MODULE_Reserved', 0x0000, MODULE_Reserved) | |
| 835 | - section_id = None | |
| 836 | - if section_id != None: | |
| 837 | - logging.warning('unknown or invalid module section id {0:04X}'.format(section_id)) | |
| 838 | - | |
| 839 | - logging.debug('Project CodePage = %d' % PROJECTCODEPAGE_CodePage) | |
| 840 | - vba_codec = 'cp%d' % PROJECTCODEPAGE_CodePage | |
| 841 | - logging.debug("ModuleName = {0}".format(MODULENAME_ModuleName)) | |
| 842 | - logging.debug("StreamName = {0}".format(repr(MODULESTREAMNAME_StreamName))) | |
| 843 | - streamname_unicode = MODULESTREAMNAME_StreamName.decode(vba_codec) | |
| 844 | - logging.debug("StreamName.decode('%s') = %s" % (vba_codec, repr(streamname_unicode))) | |
| 845 | - logging.debug("StreamNameUnicode = {0}".format(repr(MODULESTREAMNAME_StreamNameUnicode))) | |
| 846 | - logging.debug("TextOffset = {0}".format(MODULEOFFSET_TextOffset)) | |
| 847 | - | |
| 848 | - code_path = vba_root + u'VBA/' + streamname_unicode | |
| 849 | - #TODO: test if stream exists | |
| 850 | - logging.debug('opening VBA code stream %s' % repr(code_path)) | |
| 851 | - code_data = ole.openstream(code_path).read() | |
| 852 | - logging.debug("length of code_data = {0}".format(len(code_data))) | |
| 853 | - logging.debug("offset of code_data = {0}".format(MODULEOFFSET_TextOffset)) | |
| 854 | - code_data = code_data[MODULEOFFSET_TextOffset:] | |
| 855 | - if len(code_data) > 0: | |
| 856 | - code_data = decompress_stream(code_data) | |
| 857 | - # case-insensitive search in the code_modules dict to find the file extension: | |
| 858 | - filext = code_modules.get(MODULENAME_ModuleName.lower(), 'bin') | |
| 859 | - filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext) | |
| 860 | - #TODO: also yield the codepage so that callers can decode it properly | |
| 861 | - yield (code_path, filename, code_data) | |
| 862 | - # print '-'*79 | |
| 863 | - # print filename | |
| 864 | - # print '' | |
| 865 | - # print code_data | |
| 866 | - # print '' | |
| 867 | - logging.debug('extracted file {0}'.format(filename)) | |
| 868 | - else: | |
| 869 | - logging.warning("module stream {0} has code data length 0".format(MODULESTREAMNAME_StreamName)) | |
| 870 | - return | |
| 871 | - | |
| 872 | - | |
def filter_vba(vba_code):
    """
    Strip the leading "Attribute VB_" lines from VBA source code.

    MS Office adds these lines automatically at the top of a module and the
    VBA Editor does not display them. This is only meant for showing source
    code to a human analyst.

    Note: a line containing a colon is never stripped (and it ends the
    filtering), because a colon could be used to hide malicious instructions.

    :param vba_code: str, VBA source code
    :return: str, filtered VBA source code (remaining lines joined with LF)
    """
    lines = vba_code.splitlines()
    # index of the first line to keep; stays at len(lines) when every line
    # is an "Attribute VB_" line (or when there are no lines at all)
    first_kept = len(lines)
    for index, text in enumerate(lines):
        if ':' in text or not text.startswith("Attribute VB_"):
            first_kept = index
            break
    #TODO: also remove empty lines?
    return '\n'.join(lines[first_kept:])
| 895 | - | |
| 896 | - | |
def detect_autoexec(vba_code, obfuscation=None):
    """
    Look for keywords of macros that run automatically when triggered by
    specific actions (e.g. when a document is opened or closed).

    Each keyword from AUTOEXEC_KEYWORDS is searched case-insensitively and
    must be delimited by word boundaries.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword, description)
    """
    #TODO: merge code with detect_suspicious
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    hits = []
    for description, keywords in AUTOEXEC_KEYWORDS.items():
        for keyword in keywords:
            #TODO: if keyword is already a compiled regex, use it as-is
            # the keyword is inserted as-is into the regex (it is not escaped)
            word_regex = r'(?i)\b' + keyword + r'\b'
            if not re.search(word_regex, vba_code):
                continue
            hits.append((keyword, description + suffix))
    return hits
| 922 | - | |
| 923 | - | |
def detect_suspicious(vba_code, obfuscation=None):
    """
    Look for suspicious keywords corresponding to potential malware
    behaviour in the VBA code.

    Each keyword from SUSPICIOUS_KEYWORDS is searched case-insensitively and
    must be delimited by word boundaries.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword, description)
    """
    if obfuscation:
        note = ' (obfuscation: %s)' % obfuscation
    else:
        note = ''

    def is_present(keyword):
        # whole-word, case-insensitive match; the keyword is used unescaped
        return re.search(r'(?i)\b' + keyword + r'\b', vba_code)

    return [
        (keyword, description + note)
        for description, keywords in SUSPICIOUS_KEYWORDS.items()
        for keyword in keywords
        if is_present(keyword)
    ]
| 946 | - | |
| 947 | - | |
def detect_patterns(vba_code, obfuscation=None):
    """
    Look for specific patterns in the VBA code, such as IP addresses, URLs,
    e-mail addresses, executable file names, etc.

    Every (pattern type, compiled regex) pair of RE_PATTERNS is applied in
    order. A matched value is reported only once, under the first pattern
    type that found it.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to the pattern type
    :return: list of str tuples (pattern type, value)
    """
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    # values already reported, shared by all pattern types
    seen = set()
    iocs = []
    for pattern_type, pattern_re in RE_PATTERNS:
        for match in pattern_re.finditer(vba_code):
            value = match.group()
            if value in seen:
                continue
            seen.add(value)
            iocs.append((pattern_type + suffix, value))
    return iocs
| 968 | - | |
| 969 | - | |
def detect_hex_strings(vba_code):
    """
    Look for strings encoded in hexadecimal in the VBA code.

    Each distinct match of re_hex_string is decoded once with
    binascii.unhexlify; later duplicates are ignored.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    seen = set()
    decoded_strings = []
    for match in re_hex_string.finditer(vba_code):
        encoded = match.group()
        if encoded in seen:
            continue
        decoded_strings.append((encoded, binascii.unhexlify(encoded)))
        seen.add(encoded)
    return decoded_strings
| 986 | - | |
| 987 | - | |
def detect_base64_strings(vba_code):
    """
    Detect if the VBA code contains strings encoded in base64.

    Candidates are the matches of re_base64_string, with surrounding double
    quotes removed. A candidate is skipped when re_nothex_check finds nothing
    in it (it is just a hex string), when it was already reported, when its
    lowercase form is in BASE64_WHITELIST, or when it cannot be decoded.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    #TODO: avoid matching simple hex strings as base64?
    results = []
    found = set()
    for match in re_base64_string.finditer(vba_code):
        # extract the base64 string without quotes:
        value = match.group().strip('"')
        # check it is not just a hex string:
        if not re_nothex_check.search(value):
            continue
        # only keep new values and not in the whitelist:
        if value in found or value.lower() in BASE64_WHITELIST:
            continue
        try:
            decoded = base64.b64decode(value)
        except (TypeError, ValueError):
            # b64decode raises TypeError (Python 2) or binascii.Error, a
            # ValueError subclass (Python 3), on malformed input: the value
            # is then likely not a base64-encoded string. Any other
            # exception (e.g. KeyboardInterrupt) is deliberately not
            # swallowed here.
            continue
        results.append((value, decoded))
        found.add(value)
    return results
| 1014 | - | |
| 1015 | - | |
def detect_dridex_strings(vba_code):
    """
    Detect if the VBA code contains strings obfuscated with a specific algorithm found in Dridex samples.

    Candidates are the matches of re_dridex_string with their first and last
    characters removed. A candidate is skipped when re_nothex_check finds
    nothing in it (it is just a hex string), when it was already reported,
    or when the decoder raises an exception.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    # imported here so that the decoder is only loaded when this function is called
    from thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode
    results = []
    found = set()
    for match in re_dridex_string.finditer(vba_code):
        # drop the first and last characters of the match
        # (presumably the surrounding quotes - confirm against re_dridex_string)
        value = match.group()[1:-1]
        # check it is not just a hex string:
        if not re_nothex_check.search(value):
            continue
        if value in found:
            continue
        try:
            decoded = DridexUrlDecode(value)
        except Exception:
            # if the decoder fails, it is likely not a dridex-encoded string.
            # Exception (not a bare except) keeps this best-effort while
            # letting KeyboardInterrupt and SystemExit propagate.
            continue
        results.append((value, decoded))
        found.add(value)
    return results
| 1040 | - | |
| 1041 | - | |
| 1042 | -class VBA_Scanner (object): | |
| 1043 | - """ | |
| 1044 | - Class to scan the source code of a VBA module to find obfuscated strings, | |
| 1045 | - suspicious keywords, IOCs, auto-executable macros, etc. | |
| 1046 | - """ | |
| 1047 | - | |
| 1048 | - def __init__(self, vba_code): | |
| 1049 | - """ | |
| 1050 | - VBA_Scanner constructor | |
| 1051 | - | |
| 1052 | - :param vba_code: str, VBA source code to be analyzed | |
| 1053 | - """ | |
| 1054 | - self.code = vba_code | |
| 1055 | - self.code_hex = '' | |
| 1056 | - self.code_hex_rev = '' | |
| 1057 | - self.code_rev_hex = '' | |
| 1058 | - self.code_base64 = '' | |
| 1059 | - self.code_dridex = '' | |
| 1060 | - | |
| 1061 | - | |
| 1062 | - def scan(self, include_decoded_strings=False): | |
| 1063 | - """ | |
| 1064 | - Analyze the provided VBA code to detect suspicious keywords, | |
| 1065 | - auto-executable macros, IOC patterns, obfuscation patterns | |
| 1066 | - such as hex-encoded strings. | |
| 1067 | - | |
| 1068 | - :param include_decoded_strings: bool, if True, all encoded strings will be included with their decoded content. | |
| 1069 | - :return: list of tuples (type, keyword, description) | |
| 1070 | - (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String') | |
| 1071 | - """ | |
| 1072 | - # First, detect and extract hex-encoded strings: | |
| 1073 | - self.hex_strings = detect_hex_strings(self.code) | |
| 1074 | - # detect if the code contains StrReverse: | |
| 1075 | - self.strReverse = False | |
| 1076 | - if 'strreverse' in self.code.lower(): self.strReverse = True | |
| 1077 | - # Then append the decoded strings to the VBA code, to detect obfuscated IOCs and keywords: | |
| 1078 | - for encoded, decoded in self.hex_strings: | |
| 1079 | - self.code_hex += '\n'+decoded | |
| 1080 | - # if the code contains "StrReverse", also append the hex strings in reverse order: | |
| 1081 | - if self.strReverse: | |
| 1082 | - # StrReverse after hex decoding: | |
| 1083 | - self.code_hex_rev += '\n'+decoded[::-1] | |
| 1084 | - # StrReverse before hex decoding: | |
| 1085 | - self.code_rev_hex += '\n'+binascii.unhexlify(encoded[::-1]) | |
| 1086 | - #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/ | |
| 1087 | - #TODO: also append the full code reversed if StrReverse? (risk of false positives?) | |
| 1088 | - # Detect Base64-encoded strings | |
| 1089 | - self.base64_strings = detect_base64_strings(self.code) | |
| 1090 | - for encoded, decoded in self.base64_strings: | |
| 1091 | - self.code_base64 += '\n'+decoded | |
| 1092 | - # Detect Dridex-encoded strings | |
| 1093 | - self.dridex_strings = detect_dridex_strings(self.code) | |
| 1094 | - for encoded, decoded in self.dridex_strings: | |
| 1095 | - self.code_dridex += '\n'+decoded | |
| 1096 | - results = [] | |
| 1097 | - self.autoexec_keywords = [] | |
| 1098 | - self.suspicious_keywords = [] | |
| 1099 | - self.iocs = [] | |
| 1100 | - | |
| 1101 | - for code, obfuscation in ( | |
| 1102 | - (self.code, None), | |
| 1103 | - (self.code_hex, 'Hex'), | |
| 1104 | - (self.code_hex_rev, 'Hex+StrReverse'), | |
| 1105 | - (self.code_rev_hex, 'StrReverse+Hex'), | |
| 1106 | - (self.code_base64, 'Base64'), | |
| 1107 | - (self.code_dridex, 'Dridex'), | |
| 1108 | - ): | |
| 1109 | - self.autoexec_keywords += detect_autoexec(code, obfuscation) | |
| 1110 | - self.suspicious_keywords += detect_suspicious(code, obfuscation) | |
| 1111 | - self.iocs += detect_patterns(code, obfuscation) | |
| 1112 | - | |
| 1113 | - # If hex-encoded strings were discovered, add an item to suspicious keywords: | |
| 1114 | - if self.hex_strings: | |
| 1115 | - self.suspicious_keywords.append(('Hex Strings', | |
| 1116 | - 'Hex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) | |
| 1117 | - if self.base64_strings: | |
| 1118 | - self.suspicious_keywords.append(('Base64 Strings', | |
| 1119 | - 'Base64-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) | |
| 1120 | - if self.dridex_strings: | |
| 1121 | - self.suspicious_keywords.append(('Dridex Strings', | |
| 1122 | - 'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) | |
| 1123 | - for keyword, description in self.autoexec_keywords: | |
| 1124 | - results.append(('AutoExec', keyword, description)) | |
| 1125 | - for keyword, description in self.suspicious_keywords: | |
| 1126 | - results.append(('Suspicious', keyword, description)) | |
| 1127 | - for pattern_type, value in self.iocs: | |
| 1128 | - results.append(('IOC', value, pattern_type)) | |
| 1129 | - if include_decoded_strings: | |
| 1130 | - for encoded, decoded in self.hex_strings: | |
| 1131 | - results.append(('Hex String', repr(decoded), encoded)) | |
| 1132 | - for encoded, decoded in self.base64_strings: | |
| 1133 | - results.append(('Base64 String', repr(decoded), encoded)) | |
| 1134 | - for encoded, decoded in self.dridex_strings: | |
| 1135 | - results.append(('Dridex string', repr(decoded), encoded)) | |
| 1136 | - return results | |
| 1137 | - | |
| 1138 | - def scan_summary(self): | |
| 1139 | - """ | |
| 1140 | - Analyze the provided VBA code to detect suspicious keywords, | |
| 1141 | - auto-executable macros, IOC patterns, obfuscation patterns | |
| 1142 | - such as hex-encoded strings. | |
| 1143 | - | |
| 1144 | - :return: tuple with the number of items found for each category: | |
| 1145 | - (autoexec, suspicious, IOCs, hex, base64, dridex) | |
| 1146 | - """ | |
| 1147 | - self.scan() | |
| 1148 | - return (len(self.autoexec_keywords), len(self.suspicious_keywords), | |
| 1149 | - len(self.iocs), len(self.hex_strings), len(self.base64_strings), | |
| 1150 | - len(self.dridex_strings)) | |
| 1151 | - | |
| 1152 | - | |
| 1153 | - | |
| 1154 | -def scan_vba(vba_code, include_decoded_strings): | |
| 1155 | - """ | |
| 1156 | - Analyze the provided VBA code to detect suspicious keywords, | |
| 1157 | - auto-executable macros, IOC patterns, obfuscation patterns | |
| 1158 | - such as hex-encoded strings. | |
| 1159 | - (shortcut for VBA_Scanner(vba_code).scan()) | |
| 1160 | - | |
| 1161 | - :param vba_code: str, VBA source code to be analyzed | |
| 1162 | - :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content. | |
| 1163 | - :return: list of tuples (type, keyword, description) | |
| 1164 | - (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String') | |
| 1165 | - """ | |
| 1166 | - return VBA_Scanner(vba_code).scan(include_decoded_strings) | |
| 1167 | - | |
| 1168 | - | |
| 1169 | -#=== CLASSES ================================================================= | |
| 1170 | - | |
| 1171 | -class VBA_Parser(object): | |
| 1172 | - """ | |
| 1173 | - Class to parse MS Office files, to detect VBA macros and extract VBA source code | |
| 1174 | - Supported file formats: | |
| 1175 | - - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) | |
| 1176 | - - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) | |
| 1177 | - - PowerPoint 2007+ (.pptm, .ppsm) | |
| 1178 | - """ | |
| 1179 | - | |
| 1180 | - def __init__(self, filename, data=None): | |
| 1181 | - """ | |
| 1182 | - Constructor for VBA_Parser | |
| 1183 | - | |
| 1184 | - :param filename: filename or path of file to parse, or file-like object | |
| 1185 | - | |
| 1186 | - :param data: None or bytes str, if None the file will be read from disk (or from the file-like object). | |
| 1187 | - If data is provided as a bytes string, it will be parsed as the content of the file in memory, | |
| 1188 | - and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb'). | |
| 1189 | - """ | |
| 1190 | - #TODO: filename should only be a string, data should be used for the file-like object | |
| 1191 | - #TODO: filename should be mandatory, optional data is a string or file-like object | |
| 1192 | - #TODO: also support olefile and zipfile as input | |
| 1193 | - if data is None: | |
| 1194 | - # open file from disk: | |
| 1195 | - _file = filename | |
| 1196 | - else: | |
| 1197 | - # file already read in memory, make it a file-like object for zipfile: | |
| 1198 | - _file = cStringIO.StringIO(data) | |
| 1199 | - #self.file = _file | |
| 1200 | - self.ole_file = None | |
| 1201 | - self.ole_subfiles = [] | |
| 1202 | - self.filename = filename | |
| 1203 | - self.type = None | |
| 1204 | - self.vba_projects = None | |
| 1205 | - # if filename is None: | |
| 1206 | - # if isinstance(_file, basestring): | |
| 1207 | - # if len(_file) < olefile.MINIMAL_OLEFILE_SIZE: | |
| 1208 | - # self.filename = _file | |
| 1209 | - # else: | |
| 1210 | - # self.filename = '<file in bytes string>' | |
| 1211 | - # else: | |
| 1212 | - # self.filename = '<file-like object>' | |
| 1213 | - if olefile.isOleFile(_file): | |
| 1214 | - # This looks like an OLE file | |
| 1215 | - logging.info('Parsing OLE file %s' % self.filename) | |
| 1216 | - # Open and parse the OLE file, using unicode for path names: | |
| 1217 | - self.ole_file = olefile.OleFileIO(_file, path_encoding=None) | |
| 1218 | - self.type = TYPE_OLE | |
| 1219 | - #TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet | |
| 1220 | - elif zipfile.is_zipfile(_file): | |
| 1221 | - # This looks like a zip file, need to look for vbaProject.bin inside | |
| 1222 | - # It can be any OLE file inside the archive | |
| 1223 | - #...because vbaProject.bin can be renamed: | |
| 1224 | - # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18 | |
| 1225 | - logging.info('Opening ZIP/OpenXML file %s' % self.filename) | |
| 1226 | - self.type = TYPE_OpenXML | |
| 1227 | - z = zipfile.ZipFile(_file) | |
| 1228 | - #TODO: check if this is actually an OpenXML file | |
| 1229 | - #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically? | |
| 1230 | - # check each file within the zip if it is an OLE file, by reading its magic: | |
| 1231 | - for subfile in z.namelist(): | |
| 1232 | - magic = z.open(subfile).read(len(olefile.MAGIC)) | |
| 1233 | - if magic == olefile.MAGIC: | |
| 1234 | - logging.debug('Opening OLE file %s within zip' % subfile) | |
| 1235 | - ole_data = z.open(subfile).read() | |
| 1236 | - try: | |
| 1237 | - self.ole_subfiles.append(VBA_Parser(filename=subfile, data=ole_data)) | |
| 1238 | - except: | |
| 1239 | - logging.debug('%s is not a valid OLE file' % subfile) | |
| 1240 | - continue | |
| 1241 | - z.close() | |
| 1242 | - else: | |
| 1243 | - # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML, | |
| 1244 | - # or a plain text file containing VBA code | |
| 1245 | - if data is None: | |
| 1246 | - data = open(filename, 'rb').read() | |
| 1247 | - # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace | |
| 1248 | - if 'http://schemas.microsoft.com/office/word/2003/wordml' in data: | |
| 1249 | - logging.info('Opening Word 2003 XML file %s' % self.filename) | |
| 1250 | - self.type = TYPE_Word2003_XML | |
| 1251 | - # parse the XML content | |
| 1252 | - et = ET.fromstring(data) | |
| 1253 | - # find all the binData elements: | |
| 1254 | - for bindata in et.getiterator(TAG_BINDATA): | |
| 1255 | - # the binData content is an OLE container for the VBA project, compressed | |
| 1256 | - # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded. | |
| 1257 | - # get the filename: | |
| 1258 | - fname = bindata.get(ATTR_NAME, 'noname.mso') | |
| 1259 | - # decode the base64 activemime | |
| 1260 | - activemime = binascii.a2b_base64(bindata.text) | |
| 1261 | - # decompress the zlib data starting at offset 0x32, which is the OLE container: | |
| 1262 | - ole_data = zlib.decompress(activemime[0x32:]) | |
| 1263 | - try: | |
| 1264 | - self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data)) | |
| 1265 | - except: | |
| 1266 | - logging.debug('%s is not a valid OLE file' % fname) | |
| 1267 | - continue | |
| 1268 | - #TODO: handle exceptions | |
| 1269 | - #TODO: Excel 2003 XML | |
| 1270 | - #TODO: plain text VBA file | |
| 1271 | - else: | |
| 1272 | - msg = '%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % self.filename | |
| 1273 | - logging.error(msg) | |
| 1274 | - raise TypeError(msg) | |
| 1275 | - | |
| 1276 | - def find_vba_projects (self): | |
| 1277 | - """ | |
| 1278 | - Finds all the VBA projects stored in an OLE file. | |
| 1279 | - | |
| 1280 | - Return None if the file is not OLE but OpenXML. | |
| 1281 | - Return a list of tuples (vba_root, project_path, dir_path) for each VBA project. | |
| 1282 | - vba_root is the path of the root OLE storage containing the VBA project, | |
| 1283 | - including a trailing slash unless it is the root of the OLE file. | |
| 1284 | - project_path is the path of the OLE stream named "PROJECT" within the VBA project. | |
| 1285 | - dir_path is the path of the OLE stream named "VBA/dir" within the VBA project. | |
| 1286 | - | |
| 1287 | - If this function returns an empty list for one of the supported formats | |
| 1288 | - (i.e. Word, Excel, Powerpoint except Powerpoint 97-2003), then the | |
| 1289 | - file does not contain VBA macros. | |
| 1290 | - | |
| 1291 | - :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path) | |
| 1292 | - for each VBA project found if OLE file | |
| 1293 | - """ | |
| 1294 | - # if the file is not OLE but OpenXML, return None: | |
| 1295 | - if self.ole_file is None: | |
| 1296 | - return None | |
| 1297 | - | |
| 1298 | - # if this method has already been called, return previous result: | |
| 1299 | - if self.vba_projects is not None: | |
| 1300 | - return self.vba_projects | |
| 1301 | - | |
| 1302 | - # Find the VBA project root (different in MS Word, Excel, etc): | |
| 1303 | - # - Word 97-2003: Macros | |
| 1304 | - # - Excel 97-2003: _VBA_PROJECT_CUR | |
| 1305 | - # - PowerPoint 97-2003: not supported yet (different file structure) | |
| 1306 | - # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin. | |
| 1307 | - # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word | |
| 1308 | - # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word | |
| 1309 | - # - Visio 2007: not supported yet (different file structure) | |
| 1310 | - | |
| 1311 | - # According to MS-OVBA section 2.2.1: | |
| 1312 | - # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream | |
| 1313 | - # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream | |
| 1314 | - # - all names are case-insensitive | |
| 1315 | - | |
| 1316 | - # start with an empty list: | |
| 1317 | - self.vba_projects = [] | |
| 1318 | - # Look for any storage containing those storage/streams: | |
| 1319 | - ole = self.ole_file | |
| 1320 | - for storage in ole.listdir(streams=False, storages=True): | |
| 1321 | - # Look for a storage ending with "VBA": | |
| 1322 | - if storage[-1].upper() == 'VBA': | |
| 1323 | - logging.debug('Found VBA storage: %s' % ('/'.join(storage))) | |
| 1324 | - vba_root = '/'.join(storage[:-1]) | |
| 1325 | - # Add a trailing slash to vba_root, unless it is the root of the OLE file: | |
| 1326 | - # (used later to append all the child streams/storages) | |
| 1327 | - if vba_root != '': | |
| 1328 | - vba_root += '/' | |
| 1329 | - logging.debug('Checking vba_root="%s"' % vba_root) | |
| 1330 | - | |
| 1331 | - def check_vba_stream(ole, vba_root, stream_path): | |
| 1332 | - full_path = vba_root + stream_path | |
| 1333 | - if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM: | |
| 1334 | - logging.debug('Found %s stream: %s' % (stream_path, full_path)) | |
| 1335 | - return full_path | |
| 1336 | - else: | |
| 1337 | - logging.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path) | |
| 1338 | - return False | |
| 1339 | - | |
| 1340 | - # Check if the VBA root storage also contains a PROJECT stream: | |
| 1341 | - project_path = check_vba_stream(ole, vba_root, 'PROJECT') | |
| 1342 | - if not project_path: continue | |
| 1343 | - # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream: | |
| 1344 | - vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT') | |
| 1345 | - if not vba_project_path: continue | |
| 1346 | - # Check if the VBA root storage also contains a VBA/dir stream: | |
| 1347 | - dir_path = check_vba_stream(ole, vba_root, 'VBA/dir') | |
| 1348 | - if not dir_path: continue | |
| 1349 | - # Now we are pretty sure it is a VBA project structure | |
| 1350 | - logging.debug('VBA root storage: "%s"' % vba_root) | |
| 1351 | - # append the results to the list as a tuple for later use: | |
| 1352 | - self.vba_projects.append((vba_root, project_path, dir_path)) | |
| 1353 | - return self.vba_projects | |
| 1354 | - | |
| 1355 | - def detect_vba_macros(self): | |
| 1356 | - """ | |
| 1357 | - Detect the potential presence of VBA macros in the file, by checking | |
| 1358 | - if it contains VBA projects. Both OLE and OpenXML files are supported. | |
| 1359 | - | |
| 1360 | - Important: for now, results are accurate only for Word, Excel and PowerPoint | |
| 1361 | - EXCEPT Powerpoint 97-2003, which has a different structure for VBA. | |
| 1362 | - | |
| 1363 | - Note: this method does NOT attempt to check the actual presence or validity | |
| 1364 | - of VBA macro source code, so there might be false positives. | |
| 1365 | - It may also detect VBA macros in files embedded within the main file, | |
| 1366 | - for example an Excel workbook with macros embedded into a Word | |
| 1367 | - document without macros may be detected, without distinction. | |
| 1368 | - | |
| 1369 | - :return: bool, True if at least one VBA project has been found, False otherwise | |
| 1370 | - """ | |
| 1371 | - #TODO: return None or raise exception if format not supported like PPT 97-2003 | |
| 1372 | - #TODO: return the number of VBA projects found instead of True/False? | |
| 1373 | - # if OpenXML, check all the OLE subfiles: | |
| 1374 | - if self.ole_file is None: | |
| 1375 | - for ole_subfile in self.ole_subfiles: | |
| 1376 | - if ole_subfile.detect_vba_macros(): | |
| 1377 | - return True | |
| 1378 | - return False | |
| 1379 | - # otherwise it's an OLE file, find VBA projects: | |
| 1380 | - vba_projects = self.find_vba_projects() | |
| 1381 | - if len(vba_projects) == 0: | |
| 1382 | - return False | |
| 1383 | - else: | |
| 1384 | - return True | |
| 1385 | - | |
| 1386 | - | |
| 1387 | - def extract_macros (self): | |
| 1388 | - """ | |
| 1389 | - Extract and decompress source code for each VBA macro found in the file | |
| 1390 | - | |
| 1391 | - Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found | |
| 1392 | - If the file is OLE, filename is the path of the file. | |
| 1393 | - If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros | |
| 1394 | - within the zip archive, e.g. word/vbaProject.bin. | |
| 1395 | - """ | |
| 1396 | - if self.ole_file is None: | |
| 1397 | - for ole_subfile in self.ole_subfiles: | |
| 1398 | - for results in ole_subfile.extract_macros(): | |
| 1399 | - yield results | |
| 1400 | - else: | |
| 1401 | - self.find_vba_projects() | |
| 1402 | - for vba_root, project_path, dir_path in self.vba_projects: | |
| 1403 | - # extract all VBA macros from that VBA root storage: | |
| 1404 | - for stream_path, vba_filename, vba_code in _extract_vba(self.ole_file, vba_root, project_path, dir_path): | |
| 1405 | - yield (self.filename, stream_path, vba_filename, vba_code) | |
| 1406 | - | |
| 1407 | - | |
| 1408 | - def close(self): | |
| 1409 | - """ | |
| 1410 | - Close all the open files. This method must be called after usage, if | |
| 1411 | - the application is opening many files. | |
| 1412 | - """ | |
| 1413 | - if self.ole_file is None: | |
| 1414 | - for ole_subfile in self.ole_subfiles: | |
| 1415 | - ole_subfile.close() | |
| 1416 | - else: | |
| 1417 | - self.ole_file.close() | |
| 1418 | - | |
| 1419 | - | |
| 1420 | -def print_analysis(vba_code, show_decoded_strings=False): | |
| 1421 | - """ | |
| 1422 | - Analyze the provided VBA code, and print the results in a table | |
| 1423 | - | |
| 1424 | - :param vba_code: str, VBA source code to be analyzed | |
| 1425 | - :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content. | |
| 1426 | - :return: None | |
| 1427 | - """ | |
| 1428 | - results = scan_vba(vba_code, show_decoded_strings) | |
| 1429 | - if results: | |
| 1430 | - t = prettytable.PrettyTable(('Type', 'Keyword', 'Description')) | |
| 1431 | - t.align = 'l' | |
| 1432 | - t.max_width['Type'] = 10 | |
| 1433 | - t.max_width['Keyword'] = 20 | |
| 1434 | - t.max_width['Description'] = 39 | |
| 1435 | - for kw_type, keyword, description in results: | |
| 1436 | - t.add_row((kw_type, keyword, description)) | |
| 1437 | - print t | |
| 1438 | - else: | |
| 1439 | - print 'No suspicious keyword or IOC found.' | |
| 1440 | - | |
| 1441 | - | |
| 1442 | - | |
| 1443 | -def process_file (container, filename, data, show_decoded_strings=False): | |
| 1444 | - """ | |
| 1445 | - Process a single file | |
| 1446 | - | |
| 1447 | - :param container: str, path and filename of container if the file is within | |
| 1448 | - a zip archive, None otherwise. | |
| 1449 | - :param filename: str, path and filename of file on disk, or within the container. | |
| 1450 | - :param data: bytes, content of the file if it is in a container, None if it is a file on disk. | |
| 1451 | - :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content. | |
| 1452 | - """ | |
| 1453 | - #TODO: replace print by writing to a provided output file (sys.stdout by default) | |
| 1454 | - if container: | |
| 1455 | - display_filename = '%s in %s' % (filename, container) | |
| 1456 | - else: | |
| 1457 | - display_filename = filename | |
| 1458 | - print '='*79 | |
| 1459 | - print 'FILE:', display_filename | |
| 1460 | - try: | |
| 1461 | - #TODO: handle olefile errors, when an OLE file is malformed | |
| 1462 | - vba = VBA_Parser(filename, data) | |
| 1463 | - print 'Type:', vba.type | |
| 1464 | - if vba.detect_vba_macros(): | |
| 1465 | - #print 'Contains VBA Macros:' | |
| 1466 | - for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros(): | |
| 1467 | - # hide attribute lines: | |
| 1468 | - #TODO: option to disable attribute filtering | |
| 1469 | - vba_code_filtered = filter_vba(vba_code) | |
| 1470 | - print '-'*79 | |
| 1471 | - print 'VBA MACRO %s ' % vba_filename | |
| 1472 | - print 'in file: %s - OLE stream: %s' % (subfilename, repr(stream_path)) | |
| 1473 | - print '- '*39 | |
| 1474 | - # detect empty macros: | |
| 1475 | - if vba_code_filtered.strip() == '': | |
| 1476 | - print '(empty macro)' | |
| 1477 | - else: | |
| 1478 | - print vba_code_filtered | |
| 1479 | - print '- '*39 | |
| 1480 | - print 'ANALYSIS:' | |
| 1481 | - # analyse the whole code, filtered to avoid false positives: | |
| 1482 | - print_analysis(vba_code_filtered, show_decoded_strings) | |
| 1483 | - else: | |
| 1484 | - print 'No VBA macros found.' | |
| 1485 | - except: #TypeError: | |
| 1486 | - #raise | |
| 1487 | - #TODO: print more info if debug mode | |
| 1488 | - #print sys.exc_value | |
| 1489 | - # display the exception with full stack trace for debugging, but do not stop: | |
| 1490 | - traceback.print_exc() | |
| 1491 | - print '' | |
| 1492 | - | |
| 1493 | - | |
| 1494 | -def process_file_triage (container, filename, data): | |
| 1495 | - """ | |
| 1496 | - Process a single file | |
| 1497 | - | |
| 1498 | - :param container: str, path and filename of container if the file is within | |
| 1499 | - a zip archive, None otherwise. | |
| 1500 | - :param filename: str, path and filename of file on disk, or within the container. | |
| 1501 | - :param data: bytes, content of the file if it is in a container, None if it is a file on disk. | |
| 1502 | - """ | |
| 1503 | - #TODO: replace print by writing to a provided output file (sys.stdout by default) | |
| 1504 | - nb_macros = 0 | |
| 1505 | - nb_autoexec = 0 | |
| 1506 | - nb_suspicious = 0 | |
| 1507 | - nb_iocs = 0 | |
| 1508 | - nb_hexstrings = 0 | |
| 1509 | - nb_base64strings = 0 | |
| 1510 | - nb_dridexstrings = 0 | |
| 1511 | - # ftype = 'Other' | |
| 1512 | - message = '' | |
| 1513 | - try: | |
| 1514 | - #TODO: handle olefile errors, when an OLE file is malformed | |
| 1515 | - vba = VBA_Parser(filename, data) | |
| 1516 | - if vba.detect_vba_macros(): | |
| 1517 | - for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros(): | |
| 1518 | - nb_macros += 1 | |
| 1519 | - if vba_code.strip() != '': | |
| 1520 | - # analyse the whole code, filtered to avoid false positives: | |
| 1521 | - scanner = VBA_Scanner(filter_vba(vba_code)) | |
| 1522 | - autoexec, suspicious, iocs, hexstrings, base64strings, dridex = scanner.scan_summary() | |
| 1523 | - nb_autoexec += autoexec | |
| 1524 | - nb_suspicious += suspicious | |
| 1525 | - nb_iocs += iocs | |
| 1526 | - nb_hexstrings += hexstrings | |
| 1527 | - nb_base64strings += base64strings | |
| 1528 | - nb_dridexstrings += dridex | |
| 1529 | - if vba.type == TYPE_OLE: | |
| 1530 | - flags = 'OLE:' | |
| 1531 | - elif vba.type == TYPE_OpenXML: | |
| 1532 | - flags = 'OpX:' | |
| 1533 | - elif vba.type == TYPE_Word2003_XML: | |
| 1534 | - flags = 'XML:' | |
| 1535 | - macros = autoexec = suspicious = iocs = hexstrings = base64obf = dridex = '-' | |
| 1536 | - if nb_macros: macros = 'M' | |
| 1537 | - if nb_autoexec: autoexec = 'A' | |
| 1538 | - if nb_suspicious: suspicious = 'S' | |
| 1539 | - if nb_iocs: iocs = 'I' | |
| 1540 | - if nb_hexstrings: hexstrings = 'H' | |
| 1541 | - if nb_base64strings: base64obf = 'B' | |
| 1542 | - if nb_dridexstrings: dridex = 'D' | |
| 1543 | - flags += '%s%s%s%s%s%s%s' % (macros, autoexec, suspicious, iocs, hexstrings, | |
| 1544 | - base64obf, dridex) | |
| 1545 | - | |
| 1546 | - # macros = autoexec = suspicious = iocs = hexstrings = 'no' | |
| 1547 | - # if nb_macros: macros = 'YES:%d' % nb_macros | |
| 1548 | - # if nb_autoexec: autoexec = 'YES:%d' % nb_autoexec | |
| 1549 | - # if nb_suspicious: suspicious = 'YES:%d' % nb_suspicious | |
| 1550 | - # if nb_iocs: iocs = 'YES:%d' % nb_iocs | |
| 1551 | - # if nb_hexstrings: hexstrings = 'YES:%d' % nb_hexstrings | |
| 1552 | - # # 2nd line = info | |
| 1553 | - # print '%-8s %-7s %-7s %-7s %-7s %-7s' % (vba.type, macros, autoexec, suspicious, iocs, hexstrings) | |
| 1554 | - except TypeError: | |
| 1555 | - # file type not OLE nor OpenXML | |
| 1556 | - flags = '?' | |
| 1557 | - message = 'File format not supported' | |
| 1558 | - except: | |
| 1559 | - # another error occurred | |
| 1560 | - #raise | |
| 1561 | - #TODO: print more info if debug mode | |
| 1562 | - #TODO: distinguish real errors from incorrect file types | |
| 1563 | - flags = '!ERROR' | |
| 1564 | - message = sys.exc_value | |
| 1565 | - line = '%-11s %s' % (flags, filename) | |
| 1566 | - if message: | |
| 1567 | - line += ' - %s' % message | |
| 1568 | - print line | |
| 1569 | - | |
| 1570 | - # t = prettytable.PrettyTable(('filename', 'type', 'macros', 'autoexec', 'suspicious', 'ioc', 'hexstrings'), | |
| 1571 | - # header=False, border=False) | |
| 1572 | - # t.align = 'l' | |
| 1573 | - # t.max_width['filename'] = 30 | |
| 1574 | - # t.max_width['type'] = 10 | |
| 1575 | - # t.max_width['macros'] = 6 | |
| 1576 | - # t.max_width['autoexec'] = 6 | |
| 1577 | - # t.max_width['suspicious'] = 6 | |
| 1578 | - # t.max_width['ioc'] = 6 | |
| 1579 | - # t.max_width['hexstrings'] = 6 | |
| 1580 | - # t.add_row((filename, ftype, macros, autoexec, suspicious, iocs, hexstrings)) | |
| 1581 | - # print t | |
| 1582 | - | |
| 1583 | -def main_triage_quick(): | |
| 1584 | - pass | |
| 1585 | - | |
| 1586 | -#=== MAIN ===================================================================== | |
| 1587 | - | |
| 1588 | -def main(): | |
| 1589 | - """ | |
| 1590 | - Main function, called when olevba is run from the command line | |
| 1591 | - """ | |
| 1592 | - usage = 'usage: %prog [options] <filename> [filename2 ...]' | |
| 1593 | - parser = optparse.OptionParser(usage=usage) | |
| 1594 | - # parser.add_option('-o', '--outfile', dest='outfile', | |
| 1595 | - # help='output file') | |
| 1596 | - # parser.add_option('-c', '--csv', dest='csv', | |
| 1597 | - # help='export results to a CSV file') | |
| 1598 | - parser.add_option("-r", action="store_true", dest="recursive", | |
| 1599 | - help='find files recursively in subdirectories.') | |
| 1600 | - parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None, | |
| 1601 | - help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)') | |
| 1602 | - parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*', | |
| 1603 | - help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)') | |
| 1604 | - parser.add_option("-t", action="store_true", dest="triage_mode", | |
| 1605 | - help='triage mode, display results as a summary table (default for multiple files)') | |
| 1606 | - parser.add_option("-d", action="store_true", dest="detailed_mode", | |
| 1607 | - help='detailed mode, display full results (default for single file)') | |
| 1608 | - parser.add_option("-i", "--input", dest='input', type='str', default=None, | |
| 1609 | - help='input file containing VBA source code to be analyzed (no parsing)') | |
| 1610 | - parser.add_option("--decode", action="store_true", dest="show_decoded_strings", | |
| 1611 | - help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex).') | |
| 1612 | - | |
| 1613 | - (options, args) = parser.parse_args() | |
| 1614 | - | |
| 1615 | - # Print help if no arguments are passed | |
| 1616 | - if len(args) == 0 and not options.input: | |
| 1617 | - print __doc__ | |
| 1618 | - parser.print_help() | |
| 1619 | - sys.exit() | |
| 1620 | - | |
| 1621 | - # print banner with version | |
| 1622 | - print 'olevba %s - http://decalage.info/python/oletools' % __version__ | |
| 1623 | - | |
| 1624 | - logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) #INFO) | |
| 1625 | - # For now, all logging is disabled: | |
| 1626 | - logging.disable(logging.CRITICAL) | |
| 1627 | - | |
| 1628 | - if options.input: | |
| 1629 | - # input file provided with VBA source code to be analyzed directly: | |
| 1630 | - print 'Analysis of VBA source code from %s:' % options.input | |
| 1631 | - vba_code = open(options.input).read() | |
| 1632 | - print_analysis(vba_code, show_decoded_strings=options.show_decoded_strings) | |
| 1633 | - sys.exit() | |
| 1634 | - | |
| 1635 | - # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('Type', 'Macros', 'AutoEx', 'Susp.', 'IOCs', 'HexStr') | |
| 1636 | - # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('-'*8, '-'*7, '-'*7, '-'*7, '-'*7, '-'*7) | |
| 1637 | - if not options.detailed_mode or options.triage_mode: | |
| 1638 | - print '%-11s %-65s' % ('Flags', 'Filename') | |
| 1639 | - print '%-11s %-65s' % ('-'*11, '-'*65) | |
| 1640 | - previous_container = None | |
| 1641 | - count = 0 | |
| 1642 | - container = filename = data = None | |
| 1643 | - for container, filename, data in xglob.iter_files(args, recursive=options.recursive, | |
| 1644 | - zip_password=options.zip_password, zip_fname=options.zip_fname): | |
| 1645 | - # ignore directory names stored in zip files: | |
| 1646 | - if container and filename.endswith('/'): | |
| 1647 | - continue | |
| 1648 | - if options.detailed_mode and not options.triage_mode: | |
| 1649 | - # fully detailed output | |
| 1650 | - process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings) | |
| 1651 | - else: | |
| 1652 | - # print container name when it changes: | |
| 1653 | - if container != previous_container: | |
| 1654 | - if container is not None: | |
| 1655 | - print '\nFiles in %s:' % container | |
| 1656 | - previous_container = container | |
| 1657 | - # summarized output for triage: | |
| 1658 | - process_file_triage(container, filename, data) | |
| 1659 | - count += 1 | |
| 1660 | - if not options.detailed_mode or options.triage_mode: | |
| 1661 | - print '\n(Flags: OpX=OpenXML, XML=Word2003XML, M=Macros, A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, B=Base64 strings, D=Dridex strings, ?=Unknown)\n' | |
| 1662 | - | |
| 1663 | - if count == 1 and not options.triage_mode and not options.detailed_mode: | |
| 1664 | - # if options -t and -d were not specified and it's a single file, print details: | |
| 1665 | - #TODO: avoid doing the analysis twice by storing results | |
| 1666 | - process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings) | |
| 1667 | - | |
| 1668 | -if __name__ == '__main__': | |
| 1669 | - main() | |
| 1670 | - | |
| 1 | +#!/usr/bin/env python | |
| 2 | +""" | |
| 3 | +olevba.py | |
| 4 | + | |
| 5 | +olevba is a script to parse OLE and OpenXML files such as MS Office documents | |
| 6 | +(e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate | |
| 7 | +and analyze malicious macros. | |
| 8 | + | |
| 9 | +Supported formats: | |
| 10 | +- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) | |
| 11 | +- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) | |
| 12 | +- PowerPoint 2007+ (.pptm, .ppsm) | |
| 13 | +- Word 2003 XML (.xml) | |
| 14 | + | |
| 15 | +Author: Philippe Lagadec - http://www.decalage.info | |
| 16 | +License: BSD, see source code or documentation | |
| 17 | + | |
| 18 | +olevba is part of the python-oletools package: | |
| 19 | +http://www.decalage.info/python/oletools | |
| 20 | + | |
| 21 | +olevba is based on source code from officeparser by John William Davison | |
| 22 | +https://github.com/unixfreak0037/officeparser | |
| 23 | +""" | |
| 24 | + | |
| 25 | +#=== LICENSE ================================================================== | |
| 26 | + | |
| 27 | +# olevba is copyright (c) 2014-2015 Philippe Lagadec (http://www.decalage.info) | |
| 28 | +# All rights reserved. | |
| 29 | +# | |
| 30 | +# Redistribution and use in source and binary forms, with or without modification, | |
| 31 | +# are permitted provided that the following conditions are met: | |
| 32 | +# | |
| 33 | +# * Redistributions of source code must retain the above copyright notice, this | |
| 34 | +# list of conditions and the following disclaimer. | |
| 35 | +# * Redistributions in binary form must reproduce the above copyright notice, | |
| 36 | +# this list of conditions and the following disclaimer in the documentation | |
| 37 | +# and/or other materials provided with the distribution. | |
| 38 | +# | |
| 39 | +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
| 40 | +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| 41 | +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| 42 | +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | |
| 43 | +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 44 | +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
| 45 | +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
| 46 | +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
| 47 | +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| 48 | +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 49 | + | |
| 50 | + | |
| 51 | +# olevba contains modified source code from the officeparser project, published | |
| 52 | +# under the following MIT License (MIT): | |
| 53 | +# | |
| 54 | +# officeparser is copyright (c) 2014 John William Davison | |
| 55 | +# | |
| 56 | +# Permission is hereby granted, free of charge, to any person obtaining a copy | |
| 57 | +# of this software and associated documentation files (the "Software"), to deal | |
| 58 | +# in the Software without restriction, including without limitation the rights | |
| 59 | +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| 60 | +# copies of the Software, and to permit persons to whom the Software is | |
| 61 | +# furnished to do so, subject to the following conditions: | |
| 62 | +# | |
| 63 | +# The above copyright notice and this permission notice shall be included in all | |
| 64 | +# copies or substantial portions of the Software. | |
| 65 | +# | |
| 66 | +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 67 | +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 68 | +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| 69 | +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 70 | +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| 71 | +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| 72 | +# SOFTWARE. | |
| 73 | + | |
| 74 | +#------------------------------------------------------------------------------ | |
| 75 | +# CHANGELOG: | |
| 76 | +# 2014-08-05 v0.01 PL: - first version based on officeparser code | |
| 77 | +# 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser | |
| 78 | +# 2014-08-15 PL: - fixed incorrect value check in PROJECTHELPFILEPATH Record | |
| 79 | +# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats | |
| 80 | +# and to find the VBA project root anywhere in the file | |
| 81 | +# 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL | |
| 82 | +# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API | |
| 83 | +# - added detect_vba_macros | |
| 84 | +# 2014-12-10 v0.06 PL: - hide first lines with VB attributes | |
| 85 | +# - detect auto-executable macros | |
| 86 | +# - ignore empty macros | |
| 87 | +# 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive | |
| 88 | +# 2014-12-15 v0.08 PL: - improved display for empty macros | |
| 89 | +# - added pattern extraction | |
| 90 | +# 2014-12-25 v0.09 PL: - added suspicious keywords detection | |
| 91 | +# 2014-12-27 v0.10 PL: - added OptionParser, main and process_file | |
| 92 | +# - uses xglob to scan several files with wildcards | |
| 93 | +# - option -r to recurse subdirectories | |
| 94 | +# - option -z to scan files in password-protected zips | |
| 95 | +# 2015-01-02 v0.11 PL: - improved filter_vba to detect colons | |
| 96 | +# 2015-01-03 v0.12 PL: - fixed detect_patterns to detect all patterns | |
| 97 | +# - process_file: improved display, shows container file | |
| 98 | +# - improved list of executable file extensions | |
| 99 | +# 2015-01-04 v0.13 PL: - added several suspicious keywords, improved display | |
| 100 | +# 2015-01-08 v0.14 PL: - added hex strings detection and decoding | |
| 101 | +# - fixed issue #2, decoding VBA stream names using | |
| 102 | +# specified codepage and unicode stream names | |
| 103 | +# 2015-01-11 v0.15 PL: - added new triage mode, options -t and -d | |
| 104 | +# 2015-01-16 v0.16 PL: - fix for issue #3 (exception when module name="text") | |
| 105 | +# - added several suspicious keywords | |
| 106 | +# - added option -i to analyze VBA source code directly | |
| 107 | +# 2015-01-17 v0.17 PL: - removed .com from the list of executable extensions | |
| 108 | +# - added scan_vba to run all detection algorithms | |
| 109 | +# - decoded hex strings are now also scanned + reversed | |
| 110 | +# 2015-01-23 v0.18 PL: - fixed issue #3, case-insensitive search in code_modules | |
| 111 | +# 2015-01-24 v0.19 PL: - improved the detection of IOCs obfuscated with hex | |
| 112 | +# strings and StrReverse | |
| 113 | +# 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded | |
| 114 | +# 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding | |
| 115 | +# - improved display, shows obfuscation name | |
| 116 | +# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename | |
| 117 | +# - added Base64 obfuscation decoding (contribution from | |
| 118 | +# @JamesHabben) | |
| 119 | +# 2015-02-03 v0.23 PL: - triage now uses VBA_Scanner results, shows Base64 and | |
| 120 | +# Dridex strings | |
| 121 | +# - exception handling in detect_base64_strings | |
| 122 | +# 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display | |
| 123 | +# - display exceptions with stack trace | |
| 124 | +# - added several suspicious keywords | |
| 125 | +# - improved Base64 detection and decoding | |
| 126 | +# - fixed triage mode not to scan attrib lines | |
| 127 | +# 2015-03-04 v0.25 PL: - added support for Word 2003 XML | |
| 128 | + | |
| 129 | +__version__ = '0.25' | |
| 130 | + | |
| 131 | +#------------------------------------------------------------------------------ | |
| 132 | +# TODO: | |
| 133 | +# + do not use logging, but a provided logger (null logger by default) | |
| 134 | +# + setup logging (common with other oletools) | |
| 135 | +# + add xor bruteforcing like bbharvest | |
| 136 | +# + add chr() decoding | |
| 137 | + | |
| 138 | +# TODO later: | |
| 139 | +# + performance improvement: instead of searching each keyword separately, | |
| 140 | +# first split vba code into a list of words (per line), then check each | |
| 141 | +# word against a dict. (or put vba words into a set/dict?) | |
| 142 | +# + for regex, maybe combine them into a single re with named groups? | |
| 143 | +# + add Yara support, include sample rules? plugins like balbuzard? | |
| 144 | +# + add balbuzard support | |
| 145 | +# + output to file (replace print by file.write, sys.stdout by default) | |
| 146 | +# + look for VBA in embedded documents (e.g. Excel in Word) | |
| 147 | +# + support SRP streams (see Lenny's article + links and sample) | |
| 148 | +# - python 3.x support | |
| 149 | +# - add support for PowerPoint macros (see libclamav, libgsf), use oledump heuristic? | |
| 150 | +# - check VBA macros in Visio, Access, Project, etc | |
| 151 | +# - extract_macros: convert to a class, split long function into smaller methods | |
| 152 | +# - extract_macros: read bytes from stream file objects instead of strings | |
| 153 | +# - extract_macros: use combined struct.unpack instead of many calls | |
| 154 | + | |
| 155 | +#------------------------------------------------------------------------------ | |
| 156 | +# REFERENCES: | |
| 157 | +# - [MS-OVBA]: Microsoft Office VBA File Format Structure | |
| 158 | +# http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx | |
| 159 | +# - officeparser: https://github.com/unixfreak0037/officeparser | |
| 160 | + | |
| 161 | + | |
| 162 | +#--- IMPORTS ------------------------------------------------------------------ | |
| 163 | + | |
| 164 | +import sys, logging | |
| 165 | +import struct | |
| 166 | +import cStringIO | |
| 167 | +import math | |
| 168 | +import zipfile | |
| 169 | +import re | |
| 170 | +import optparse | |
| 171 | +import os.path | |
| 172 | +import binascii | |
| 173 | +import base64 | |
| 174 | +import traceback | |
| 175 | +import zlib | |
| 176 | + | |
| 177 | +# import lxml or ElementTree for XML parsing: | |
| 178 | +try: | |
| 179 | + # lxml: best performance for XML processing | |
| 180 | + import lxml.etree as ET | |
| 181 | +except ImportError: | |
| 182 | + try: | |
| 183 | + # Python 2.5+: batteries included | |
| 184 | + import xml.etree.cElementTree as ET | |
| 185 | + except ImportError: | |
| 186 | + try: | |
| 187 | + # Python <2.5: standalone ElementTree install | |
| 188 | + import elementtree.cElementTree as ET | |
| 189 | + except ImportError: | |
| 190 | + raise ImportError, "lxml or ElementTree are not installed, "\ | |
| 191 | + +"see http://codespeak.net/lxml "\ | |
| 192 | + +"or http://effbot.org/zone/element-index.htm" | |
| 193 | + | |
| 194 | +import thirdparty.olefile as olefile | |
| 195 | +from thirdparty.prettytable import prettytable | |
| 196 | +from thirdparty.xglob import xglob | |
| 197 | + | |
| 198 | +#--- CONSTANTS ---------------------------------------------------------------- | |
| 199 | + | |
# Identifiers for the container file types (see "Supported formats" in the
# module docstring):
TYPE_OLE = 'OLE'
TYPE_OpenXML = 'OpenXML'
TYPE_Word2003_XML = 'Word2003_XML'

# File extensions associated with VBA code modules. _extract_vba maps the
# entries of the PROJECT stream to them: "Module" -> bas,
# "Document" and "Class" -> cls, "BaseClass" -> frm.
MODULE_EXTENSION = "bas"
CLASS_EXTENSION = "cls"
FORM_EXTENSION = "frm"

# Namespaces and tags for Word2003 XML parsing:
# (the namespace is wrapped in braces so that it can be prepended directly
# to a tag or attribute name)
NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}'
# the tag <w:binData w:name="editdata.mso"> contains the VBA macro code:
TAG_BINDATA = NS_W + 'binData'
ATTR_NAME = NS_W + 'name'
| 213 | + | |
# Keywords to detect auto-executable macros
# Dictionary: key = description of when the macro runs,
# value = tuple of the macro names that are triggered that way.
AUTOEXEC_KEYWORDS = {
    # MS Word:
    'Runs when the Word document is opened':
        ('AutoExec', 'AutoOpen', 'Document_Open', 'DocumentOpen'),
    'Runs when the Word document is closed':
        ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose'),
    'Runs when the Word document is modified':
        ('DocumentChange',),
    'Runs when a new Word document is created':
        ('AutoNew', 'Document_New', 'NewDocument'),

    # MS Excel:
    'Runs when the Excel Workbook is opened':
        ('Auto_Open', 'Workbook_Open'),
    'Runs when the Excel Workbook is closed':
        ('Auto_Close', 'Workbook_Close'),

    #TODO: full list in MS specs??
}
| 234 | + | |
# Suspicious Keywords that may be used by malware
# See VBA language reference: http://msdn.microsoft.com/en-us/library/office/jj692818%28v=office.15%29.aspx
# Dictionary: key = description of what the keyword may indicate,
# value = tuple of the VBA keywords associated with that description.
SUSPICIOUS_KEYWORDS = {
    #TODO: use regex to support variable whitespaces
    'May read system environment variables':
        ('Environ',),
    'May open a file':
        ('Open',),
    'May write to a file (if combined with Open)':
        #TODO: regex to find Open+Write on same line
        ('Write', 'Put', 'Output', 'Print #'),
    'May read or write a binary file (if combined with Open)':
        #TODO: regex to find Open+Binary on same line
        ('Binary',),
    'May copy a file':
        ('FileCopy', 'CopyFile'),
        #FileCopy: http://msdn.microsoft.com/en-us/library/office/gg264390%28v=office.15%29.aspx
        #CopyFile: http://msdn.microsoft.com/en-us/library/office/gg264089%28v=office.15%29.aspx
    'May delete a file':
        ('Kill',),
    'May create a text file':
        ('CreateTextFile','ADODB.Stream', 'WriteText', 'SaveToFile'),
        #CreateTextFile: http://msdn.microsoft.com/en-us/library/office/gg264617%28v=office.15%29.aspx
        #ADODB.Stream sample: http://pastebin.com/Z4TMyuq6
    'May run an executable file or a system command':
        ('Shell', 'vbNormal', 'vbNormalFocus', 'vbHide', 'vbMinimizedFocus', 'vbMaximizedFocus', 'vbNormalNoFocus',
         'vbMinimizedNoFocus', 'WScript.Shell', 'Run'),
        #Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx
        #WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6
    'May hide the application':
        ('Application.Visible', 'ShowWindow', 'SW_HIDE'),
    'May create a directory':
        ('MkDir',),
    'May save the current workbook':
        ('ActiveWorkbook.SaveAs',),
    'May change which directory contains files to open at startup':
        #TODO: confirm the actual effect
        ('Application.AltStartupPath',),
    'May create an OLE object':
        ('CreateObject',),
    'May run an application (if combined with CreateObject)':
        ('Shell.Application',),
    'May enumerate application windows (if combined with Shell.Application object)':
        ('Windows', 'FindWindow'),
    'May run code from a DLL':
        #TODO: regex to find declare+lib on same line
        ('Lib',),
    'May download files from the Internet':
        #TODO: regex to find urlmon+URLDownloadToFileA on same line
        ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP'),
    'May control another application by simulating user keystrokes':
        ('SendKeys', 'AppActivate'),
        #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx
    'May attempt to obfuscate malicious function calls':
        ('CallByName',),
        #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx
    'May attempt to obfuscate specific strings':
        #TODO: regex to find several Chr*, not just one
        ('Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor'),
        #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx
}
| 296 | + | |
# Regular Expression for a URL:
# http://en.wikipedia.org/wiki/Uniform_resource_locator
# http://www.w3.org/Addressing/URL/uri-spec.html
# The expression is assembled from the smaller building blocks below, which
# are also reused for the IPv4 and e-mail patterns.
#TODO: also support username:password@server
#TODO: other protocols (file, gopher, wais, ...?)
SCHEME = r'\b(?:http|ftp)s?'
# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})'
DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + ')'
#TODO: IPv6 - see https://www.debuggex.com/
# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. [db8:0cec::99:123a]
NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'
IPv4 = r'(?:'+NUMBER_0_255+r'\.){3}'+NUMBER_0_255
# IPv4 must come before the DNS name because it is more specific
SERVER = r'(?:' + IPv4 + '|' + DNS_NAME + ')'
PORT = r'(?:\:[0-9]{1,5})?'
SERVER_PORT = SERVER + PORT
URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?'  # [^\.\,\)\(\s"]
URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH
re_url = re.compile(URL_RE)


# Patterns to be extracted (IP addresses, URLs, etc)
# From patterns.py in balbuzard
# Each entry is a tuple (pattern name, compiled regular expression).
RE_PATTERNS = (
    # the URL regex is already compiled above, reuse it:
    ('URL', re_url),
    ('IPv4 address', re.compile(IPv4)),
    # NOTE: the trailing \b must be a raw string: in a normal string literal
    # '\b' is a backspace character (0x08), not a regex word boundary, and
    # the pattern would then never match a normal e-mail address.
    ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@' + SERVER + r'\b')),
    # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')),
    # Executable file name with known extensions (except .com which is present in many URLs, and .application):
    ("Executable file name", re.compile(r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b")),
    # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
    #TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types
    #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')),
    )
| 332 | + | |
# regex to detect strings encoded in hexadecimal
# (matches a run of at least 4 pairs of hexadecimal digits)
re_hex_string = re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')

# regex to detect strings encoded in base64
# (the match includes the surrounding double quotes)
#re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?"')
# better version from balbuzard, less false positives:
re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4}){1,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)?"')
# white list of common strings matching the base64 regex, but which are not base64 strings (all lowercase):
BASE64_WHITELIST = set(['thisdocument', 'thisworkbook', 'test', 'temp', 'http', 'open', 'exit'])

# regex to detect strings encoded with a specific Dridex algorithm
# (see https://github.com/JamesHabben/MalwareStuff)
# (matches a double-quoted run of at least 20 alphanumeric characters)
re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"')
# regex to check that it is not just a hex string:
# (matches any letter that is not a hexadecimal digit)
re_nothex_check = re.compile(r'[G-Zg-z]')
| 348 | + | |
| 349 | +#--- FUNCTIONS ---------------------------------------------------------------- | |
| 350 | + | |
def copytoken_help(decompressed_current, decompressed_chunk_start):
    """
    compute bit masks to decode a CopyToken according to MS-OVBA 2.4.1.3.19.1 CopyToken Help

    decompressed_current: number of decompressed bytes so far, i.e. len(decompressed_container)
    decompressed_chunk_start: offset of the current chunk in the decompressed container
    return length_mask, offset_mask, bit_count, maximum_length
    raise ValueError if no byte of the current chunk has been decompressed yet
    (a CopyToken cannot refer to previous data in that case)
    """
    difference = decompressed_current - decompressed_chunk_start
    if difference < 1:
        raise ValueError('CopyToken without any decompressed data in the current chunk '
                         '(difference = {0})'.format(difference))
    # bit_count = max(ceil(log2(difference)), 4), computed with integers only
    # so that the result is exact (a floating point log may be slightly off
    # for exact powers of two):
    bit_count = 4
    while (1 << bit_count) < difference:
        bit_count += 1
    length_mask = 0xFFFF >> bit_count
    # NOTE: offset_mask is a negative integer (all the bits above the length
    # bits are set). This is harmless because it is only meant to be combined
    # with a 16 bits unsigned value using a bitwise AND.
    offset_mask = ~length_mask
    maximum_length = length_mask + 3
    return length_mask, offset_mask, bit_count, maximum_length
| 366 | + | |
| 367 | + | |
def decompress_stream (compressed_container):
    """
    Decompress a stream according to MS-OVBA section 2.4.1

    compressed_container: string (bytes) compressed according to the MS-OVBA 2.4.1.3.6 Compression algorithm
    return the decompressed container as a string (bytes)
    raise ValueError if the container is empty, if the signature byte or a
    chunk header is invalid, or if a CopyToken refers to data located before
    the beginning of the decompressed container
    """
    # 2.4.1.2 State Variables

    # The following state is maintained for the CompressedContainer (section 2.4.1.1.1):
    # CompressedRecordEnd: The location of the byte after the last byte in the CompressedContainer (section 2.4.1.1.1).
    # CompressedCurrent: The location of the next byte in the CompressedContainer (section 2.4.1.1.1) to be read by
    #                    decompression or to be written by compression.

    # The following state is maintained for the current CompressedChunk (section 2.4.1.1.4):
    # CompressedChunkStart: The location of the first byte of the CompressedChunk (section 2.4.1.1.4) within the
    #                       CompressedContainer (section 2.4.1.1.1).

    # The following state is maintained for a DecompressedBuffer (section 2.4.1.1.2):
    # DecompressedCurrent: The location of the next byte in the DecompressedBuffer (section 2.4.1.1.2) to be written by
    #                      decompression or to be read by compression.
    # DecompressedBufferEnd: The location of the byte after the last byte in the DecompressedBuffer (section 2.4.1.1.2).

    # The following state is maintained for the current DecompressedChunk (section 2.4.1.1.3):
    # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the
    #                         DecompressedBuffer (section 2.4.1.1.2).

    # Single bytes are read through a bytearray: indexing it gives an integer
    # on both Python 2 and 3, so ord() is not needed.
    compressed = bytearray(compressed_container)
    compressed_length = len(compressed)
    if compressed_length == 0:
        raise ValueError('empty compressed container')

    # The result is also built in a bytearray: appending to it is cheap,
    # whereas growing an immutable string one byte at a time copies it again
    # and again.
    decompressed_container = bytearray()
    compressed_current = 0

    sig_byte = compressed[compressed_current]
    if sig_byte != 0x01:
        raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))

    compressed_current += 1

    #NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that
    # CompressedRecordEnd = len(compressed_container)
    while compressed_current < compressed_length:
        # 2.4.1.1.5
        compressed_chunk_start = compressed_current
        # chunk header = first 16 bits
        # (struct.unpack is applied to a slice of the original string, it
        # raises struct.error if the header is truncated)
        compressed_chunk_header = struct.unpack("<H", compressed_container[compressed_chunk_start:compressed_chunk_start + 2])[0]
        # chunk size = 12 first bits of header + 3
        chunk_size = (compressed_chunk_header & 0x0FFF) + 3
        # chunk signature = 3 next bits - should always be 0b011
        chunk_signature = (compressed_chunk_header >> 12) & 0x07
        if chunk_signature != 0b011:
            raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream')
        # chunk flag = next bit - 1 == compressed, 0 == uncompressed
        chunk_flag = (compressed_chunk_header >> 15) & 0x01
        logging.debug("chunk size = {0}, compressed flag = {1}".format(chunk_size, chunk_flag))

        #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096)
        # The minimum size is 3 bytes
        # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value
        # in chunk header before adding 3).
        # Also the first test is not useful since a 12 bits value cannot be larger than 4095.
        if chunk_flag == 1 and chunk_size > 4098:
            raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1')
        if chunk_flag == 0 and chunk_size != 4098:
            raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0')

        # check if chunk_size goes beyond the compressed data, instead of silently cutting it:
        #TODO: raise an exception?
        if compressed_chunk_start + chunk_size > compressed_length:
            logging.warning('Chunk size is larger than remaining compressed data')
        compressed_end = min([compressed_length, compressed_chunk_start + chunk_size])
        # read after chunk header:
        compressed_current = compressed_chunk_start + 2

        if chunk_flag == 0:
            # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
            # uncompressed chunk: read the next 4096 bytes as-is
            #TODO: check if there are at least 4096 bytes left
            decompressed_container += compressed[compressed_current:compressed_current + 4096]
            compressed_current += 4096
        else:
            # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
            # compressed chunk
            decompressed_chunk_start = len(decompressed_container)
            while compressed_current < compressed_end:
                # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence
                # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
                # copy tokens (reference to a previous literal token)
                flag_byte = compressed[compressed_current]
                compressed_current += 1
                for bit_index in range(0, 8):
                    if compressed_current >= compressed_end:
                        break
                    # MS-OVBA 2.4.1.3.5 Decompressing a Token
                    # MS-OVBA 2.4.1.3.17 Extract FlagBit
                    flag_bit = (flag_byte >> bit_index) & 1
                    if flag_bit == 0:  # LiteralToken
                        # copy one byte directly to output
                        decompressed_container.append(compressed[compressed_current])
                        compressed_current += 1
                    else:  # CopyToken
                        # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
                        copy_token = struct.unpack("<H", compressed_container[compressed_current:compressed_current + 2])[0]
                        #TODO: check this
                        length_mask, offset_mask, bit_count, maximum_length = copytoken_help(
                            len(decompressed_container), decompressed_chunk_start)
                        length = (copy_token & length_mask) + 3
                        temp1 = copy_token & offset_mask
                        temp2 = 16 - bit_count
                        offset = (temp1 >> temp2) + 1
                        copy_source = len(decompressed_container) - offset
                        # A negative source index would silently wrap around
                        # and copy unrelated bytes from the end of the output:
                        if copy_source < 0:
                            raise ValueError('Invalid CopyToken in VBA compressed stream: '
                                             'offset {0} points before the start of the decompressed data'.format(offset))
                        # The copy must be done byte by byte because source
                        # and destination may overlap (a token may copy bytes
                        # that it has just written):
                        for index in range(copy_source, copy_source + length):
                            decompressed_container.append(decompressed_container[index])
                        compressed_current += 2
    # convert the result back to an immutable string (str on Python 2, bytes on Python 3):
    return bytes(decompressed_container)
| 484 | + | |
| 485 | + | |
| 486 | +def _extract_vba (ole, vba_root, project_path, dir_path): | |
| 487 | + """ | |
| 488 | + Extract VBA macros from an OleFileIO object. | |
| 489 | + Internal function, do not call directly. | |
| 490 | + | |
| 491 | + vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream | |
| 492 | + vba_project: path to the PROJECT stream | |
| 493 | + This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream | |
| 494 | + """ | |
| 495 | + # Open the PROJECT stream: | |
| 496 | + project = ole.openstream(project_path) | |
| 497 | + | |
| 498 | + # sample content of the PROJECT stream: | |
| 499 | + | |
| 500 | + ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}" | |
| 501 | + ## Document=ThisDocument/&H00000000 | |
| 502 | + ## Module=NewMacros | |
| 503 | + ## Name="Project" | |
| 504 | + ## HelpContextID="0" | |
| 505 | + ## VersionCompatible32="393222000" | |
| 506 | + ## CMG="F1F301E705E705E705E705" | |
| 507 | + ## DPB="8F8D7FE3831F2020202020" | |
| 508 | + ## GC="2D2FDD81E51EE61EE6E1" | |
| 509 | + ## | |
| 510 | + ## [Host Extender Info] | |
| 511 | + ## &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000 | |
| 512 | + ## &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000 | |
| 513 | + ## | |
| 514 | + ## [Workspace] | |
| 515 | + ## ThisDocument=22, 29, 339, 477, Z | |
| 516 | + ## NewMacros=-4, 42, 832, 510, C | |
| 517 | + | |
| 518 | + code_modules = {} | |
| 519 | + | |
| 520 | + for line in project: | |
| 521 | + line = line.strip() | |
| 522 | + if '=' in line: | |
| 523 | + # split line at the 1st equal sign: | |
| 524 | + name, value = line.split('=', 1) | |
| 525 | + # looking for code modules | |
| 526 | + # add the code module as a key in the dictionary | |
| 527 | + # the value will be the extension needed later | |
| 528 | + # The value is converted to lowercase, to allow case-insensitive matching (issue #3) | |
| 529 | + value = value.lower() | |
| 530 | + if name == 'Document': | |
| 531 | + # split value at the 1st slash, keep 1st part: | |
| 532 | + value = value.split('/', 1)[0] | |
| 533 | + code_modules[value] = CLASS_EXTENSION | |
| 534 | + elif name == 'Module': | |
| 535 | + code_modules[value] = MODULE_EXTENSION | |
| 536 | + elif name == 'Class': | |
| 537 | + code_modules[value] = CLASS_EXTENSION | |
| 538 | + elif name == 'BaseClass': | |
| 539 | + code_modules[value] = FORM_EXTENSION | |
| 540 | + | |
| 541 | + # read data from dir stream (compressed) | |
| 542 | + dir_compressed = ole.openstream(dir_path).read() | |
| 543 | + | |
| 544 | + def check_value(name, expected, value): | |
| 545 | + if expected != value: | |
| 546 | + logging.error("invalid value for {0} expected {1:04X} got {2:04X}".format(name, expected, value)) | |
| 547 | + | |
| 548 | + dir_stream = cStringIO.StringIO(decompress_stream(dir_compressed)) | |
| 549 | + | |
| 550 | + # PROJECTSYSKIND Record | |
| 551 | + PROJECTSYSKIND_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 552 | + check_value('PROJECTSYSKIND_Id', 0x0001, PROJECTSYSKIND_Id) | |
| 553 | + PROJECTSYSKIND_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 554 | + check_value('PROJECTSYSKIND_Size', 0x0004, PROJECTSYSKIND_Size) | |
| 555 | + PROJECTSYSKIND_SysKind = struct.unpack("<L", dir_stream.read(4))[0] | |
| 556 | + if PROJECTSYSKIND_SysKind == 0x00: | |
| 557 | + logging.debug("16-bit Windows") | |
| 558 | + elif PROJECTSYSKIND_SysKind == 0x01: | |
| 559 | + logging.debug("32-bit Windows") | |
| 560 | + elif PROJECTSYSKIND_SysKind == 0x02: | |
| 561 | + logging.debug("Macintosh") | |
| 562 | + elif PROJECTSYSKIND_SysKind == 0x03: | |
| 563 | + logging.debug("64-bit Windows") | |
| 564 | + else: | |
| 565 | + logging.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(PROJECTSYSKIND_SysKind)) | |
| 566 | + | |
| 567 | + # PROJECTLCID Record | |
| 568 | + PROJECTLCID_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 569 | + check_value('PROJECTLCID_Id', 0x0002, PROJECTLCID_Id) | |
| 570 | + PROJECTLCID_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 571 | + check_value('PROJECTLCID_Size', 0x0004, PROJECTLCID_Size) | |
| 572 | + PROJECTLCID_Lcid = struct.unpack("<L", dir_stream.read(4))[0] | |
| 573 | + check_value('PROJECTLCID_Lcid', 0x409, PROJECTLCID_Lcid) | |
| 574 | + | |
| 575 | + # PROJECTLCIDINVOKE Record | |
| 576 | + PROJECTLCIDINVOKE_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 577 | + check_value('PROJECTLCIDINVOKE_Id', 0x0014, PROJECTLCIDINVOKE_Id) | |
| 578 | + PROJECTLCIDINVOKE_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 579 | + check_value('PROJECTLCIDINVOKE_Size', 0x0004, PROJECTLCIDINVOKE_Size) | |
| 580 | + PROJECTLCIDINVOKE_LcidInvoke = struct.unpack("<L", dir_stream.read(4))[0] | |
| 581 | + check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, PROJECTLCIDINVOKE_LcidInvoke) | |
| 582 | + | |
| 583 | + # PROJECTCODEPAGE Record | |
| 584 | + PROJECTCODEPAGE_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 585 | + check_value('PROJECTCODEPAGE_Id', 0x0003, PROJECTCODEPAGE_Id) | |
| 586 | + PROJECTCODEPAGE_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 587 | + check_value('PROJECTCODEPAGE_Size', 0x0002, PROJECTCODEPAGE_Size) | |
| 588 | + PROJECTCODEPAGE_CodePage = struct.unpack("<H", dir_stream.read(2))[0] | |
| 589 | + | |
| 590 | + # PROJECTNAME Record | |
| 591 | + PROJECTNAME_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 592 | + check_value('PROJECTNAME_Id', 0x0004, PROJECTNAME_Id) | |
| 593 | + PROJECTNAME_SizeOfProjectName = struct.unpack("<L", dir_stream.read(4))[0] | |
| 594 | + if PROJECTNAME_SizeOfProjectName < 1 or PROJECTNAME_SizeOfProjectName > 128: | |
| 595 | + logging.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(PROJECTNAME_SizeOfProjectName)) | |
| 596 | + PROJECTNAME_ProjectName = dir_stream.read(PROJECTNAME_SizeOfProjectName) | |
| 597 | + | |
| 598 | + # PROJECTDOCSTRING Record | |
| 599 | + PROJECTDOCSTRING_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 600 | + check_value('PROJECTDOCSTRING_Id', 0x0005, PROJECTDOCSTRING_Id) | |
| 601 | + PROJECTDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0] | |
| 602 | + if PROJECTNAME_SizeOfProjectName > 2000: | |
| 603 | + logging.error("PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(PROJECTDOCSTRING_SizeOfDocString)) | |
| 604 | + PROJECTDOCSTRING_DocString = dir_stream.read(PROJECTDOCSTRING_SizeOfDocString) | |
| 605 | + PROJECTDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 606 | + check_value('PROJECTDOCSTRING_Reserved', 0x0040, PROJECTDOCSTRING_Reserved) | |
| 607 | + PROJECTDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 608 | + if PROJECTDOCSTRING_SizeOfDocStringUnicode % 2 != 0: | |
| 609 | + logging.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even") | |
| 610 | + PROJECTDOCSTRING_DocStringUnicode = dir_stream.read(PROJECTDOCSTRING_SizeOfDocStringUnicode) | |
| 611 | + | |
| 612 | + # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7 | |
| 613 | + PROJECTHELPFILEPATH_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 614 | + check_value('PROJECTHELPFILEPATH_Id', 0x0006, PROJECTHELPFILEPATH_Id) | |
| 615 | + PROJECTHELPFILEPATH_SizeOfHelpFile1 = struct.unpack("<L", dir_stream.read(4))[0] | |
| 616 | + if PROJECTHELPFILEPATH_SizeOfHelpFile1 > 260: | |
| 617 | + logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(PROJECTHELPFILEPATH_SizeOfHelpFile1)) | |
| 618 | + PROJECTHELPFILEPATH_HelpFile1 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile1) | |
| 619 | + PROJECTHELPFILEPATH_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 620 | + check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, PROJECTHELPFILEPATH_Reserved) | |
| 621 | + PROJECTHELPFILEPATH_SizeOfHelpFile2 = struct.unpack("<L", dir_stream.read(4))[0] | |
| 622 | + if PROJECTHELPFILEPATH_SizeOfHelpFile2 != PROJECTHELPFILEPATH_SizeOfHelpFile1: | |
| 623 | + logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2") | |
| 624 | + PROJECTHELPFILEPATH_HelpFile2 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile2) | |
| 625 | + if PROJECTHELPFILEPATH_HelpFile2 != PROJECTHELPFILEPATH_HelpFile1: | |
| 626 | + logging.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2") | |
| 627 | + | |
| 628 | + # PROJECTHELPCONTEXT Record | |
| 629 | + PROJECTHELPCONTEXT_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 630 | + check_value('PROJECTHELPCONTEXT_Id', 0x0007, PROJECTHELPCONTEXT_Id) | |
| 631 | + PROJECTHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 632 | + check_value('PROJECTHELPCONTEXT_Size', 0x0004, PROJECTHELPCONTEXT_Size) | |
| 633 | + PROJECTHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0] | |
| 634 | + | |
| 635 | + # PROJECTLIBFLAGS Record | |
| 636 | + PROJECTLIBFLAGS_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 637 | + check_value('PROJECTLIBFLAGS_Id', 0x0008, PROJECTLIBFLAGS_Id) | |
| 638 | + PROJECTLIBFLAGS_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 639 | + check_value('PROJECTLIBFLAGS_Size', 0x0004, PROJECTLIBFLAGS_Size) | |
| 640 | + PROJECTLIBFLAGS_ProjectLibFlags = struct.unpack("<L", dir_stream.read(4))[0] | |
| 641 | + check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, PROJECTLIBFLAGS_ProjectLibFlags) | |
| 642 | + | |
| 643 | + # PROJECTVERSION Record | |
| 644 | + PROJECTVERSION_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 645 | + check_value('PROJECTVERSION_Id', 0x0009, PROJECTVERSION_Id) | |
| 646 | + PROJECTVERSION_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 647 | + check_value('PROJECTVERSION_Reserved', 0x0004, PROJECTVERSION_Reserved) | |
| 648 | + PROJECTVERSION_VersionMajor = struct.unpack("<L", dir_stream.read(4))[0] | |
| 649 | + PROJECTVERSION_VersionMinor = struct.unpack("<H", dir_stream.read(2))[0] | |
| 650 | + | |
| 651 | + # PROJECTCONSTANTS Record | |
| 652 | + PROJECTCONSTANTS_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 653 | + check_value('PROJECTCONSTANTS_Id', 0x000C, PROJECTCONSTANTS_Id) | |
| 654 | + PROJECTCONSTANTS_SizeOfConstants = struct.unpack("<L", dir_stream.read(4))[0] | |
| 655 | + if PROJECTCONSTANTS_SizeOfConstants > 1015: | |
| 656 | + logging.error("PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(PROJECTCONSTANTS_SizeOfConstants)) | |
| 657 | + PROJECTCONSTANTS_Constants = dir_stream.read(PROJECTCONSTANTS_SizeOfConstants) | |
| 658 | + PROJECTCONSTANTS_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 659 | + check_value('PROJECTCONSTANTS_Reserved', 0x003C, PROJECTCONSTANTS_Reserved) | |
| 660 | + PROJECTCONSTANTS_SizeOfConstantsUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 661 | + if PROJECTCONSTANTS_SizeOfConstantsUnicode % 2 != 0: | |
| 662 | + logging.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even") | |
| 663 | + PROJECTCONSTANTS_ConstantsUnicode = dir_stream.read(PROJECTCONSTANTS_SizeOfConstantsUnicode) | |
| 664 | + | |
| 665 | + # array of REFERENCE records | |
| 666 | + check = None | |
| 667 | + while True: | |
| 668 | + check = struct.unpack("<H", dir_stream.read(2))[0] | |
| 669 | + logging.debug("reference type = {0:04X}".format(check)) | |
| 670 | + if check == 0x000F: | |
| 671 | + break | |
| 672 | + | |
| 673 | + if check == 0x0016: | |
| 674 | + # REFERENCENAME | |
| 675 | + REFERENCE_Id = check | |
| 676 | + REFERENCE_SizeOfName = struct.unpack("<L", dir_stream.read(4))[0] | |
| 677 | + REFERENCE_Name = dir_stream.read(REFERENCE_SizeOfName) | |
| 678 | + REFERENCE_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 679 | + check_value('REFERENCE_Reserved', 0x003E, REFERENCE_Reserved) | |
| 680 | + REFERENCE_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 681 | + REFERENCE_NameUnicode = dir_stream.read(REFERENCE_SizeOfNameUnicode) | |
| 682 | + continue | |
| 683 | + | |
| 684 | + if check == 0x0033: | |
| 685 | + # REFERENCEORIGINAL (followed by REFERENCECONTROL) | |
| 686 | + REFERENCEORIGINAL_Id = check | |
| 687 | + REFERENCEORIGINAL_SizeOfLibidOriginal = struct.unpack("<L", dir_stream.read(4))[0] | |
| 688 | + REFERENCEORIGINAL_LibidOriginal = dir_stream.read(REFERENCEORIGINAL_SizeOfLibidOriginal) | |
| 689 | + continue | |
| 690 | + | |
| 691 | + if check == 0x002F: | |
| 692 | + # REFERENCECONTROL | |
| 693 | + REFERENCECONTROL_Id = check | |
| 694 | + REFERENCECONTROL_SizeTwiddled = struct.unpack("<L", dir_stream.read(4))[0] # ignore | |
| 695 | + REFERENCECONTROL_SizeOfLibidTwiddled = struct.unpack("<L", dir_stream.read(4))[0] | |
| 696 | + REFERENCECONTROL_LibidTwiddled = dir_stream.read(REFERENCECONTROL_SizeOfLibidTwiddled) | |
| 697 | + REFERENCECONTROL_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0] # ignore | |
| 698 | + check_value('REFERENCECONTROL_Reserved1', 0x0000, REFERENCECONTROL_Reserved1) | |
| 699 | + REFERENCECONTROL_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0] # ignore | |
| 700 | + check_value('REFERENCECONTROL_Reserved2', 0x0000, REFERENCECONTROL_Reserved2) | |
| 701 | + # optional field | |
| 702 | + check2 = struct.unpack("<H", dir_stream.read(2))[0] | |
| 703 | + if check2 == 0x0016: | |
| 704 | + REFERENCECONTROL_NameRecordExtended_Id = check | |
| 705 | + REFERENCECONTROL_NameRecordExtended_SizeofName = struct.unpack("<L", dir_stream.read(4))[0] | |
| 706 | + REFERENCECONTROL_NameRecordExtended_Name = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeofName) | |
| 707 | + REFERENCECONTROL_NameRecordExtended_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 708 | + check_value('REFERENCECONTROL_NameRecordExtended_Reserved', 0x003E, REFERENCECONTROL_NameRecordExtended_Reserved) | |
| 709 | + REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 710 | + REFERENCECONTROL_NameRecordExtended_NameUnicode = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode) | |
| 711 | + REFERENCECONTROL_Reserved3 = struct.unpack("<H", dir_stream.read(2))[0] | |
| 712 | + else: | |
| 713 | + REFERENCECONTROL_Reserved3 = check2 | |
| 714 | + | |
| 715 | + check_value('REFERENCECONTROL_Reserved3', 0x0030, REFERENCECONTROL_Reserved3) | |
| 716 | + REFERENCECONTROL_SizeExtended = struct.unpack("<L", dir_stream.read(4))[0] | |
| 717 | + REFERENCECONTROL_SizeOfLibidExtended = struct.unpack("<L", dir_stream.read(4))[0] | |
| 718 | + REFERENCECONTROL_LibidExtended = dir_stream.read(REFERENCECONTROL_SizeOfLibidExtended) | |
| 719 | + REFERENCECONTROL_Reserved4 = struct.unpack("<L", dir_stream.read(4))[0] | |
| 720 | + REFERENCECONTROL_Reserved5 = struct.unpack("<H", dir_stream.read(2))[0] | |
| 721 | + REFERENCECONTROL_OriginalTypeLib = dir_stream.read(16) | |
| 722 | + REFERENCECONTROL_Cookie = struct.unpack("<L", dir_stream.read(4))[0] | |
| 723 | + continue | |
| 724 | + | |
| 725 | + if check == 0x000D: | |
| 726 | + # REFERENCEREGISTERED | |
| 727 | + REFERENCEREGISTERED_Id = check | |
| 728 | + REFERENCEREGISTERED_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 729 | + REFERENCEREGISTERED_SizeOfLibid = struct.unpack("<L", dir_stream.read(4))[0] | |
| 730 | + REFERENCEREGISTERED_Libid = dir_stream.read(REFERENCEREGISTERED_SizeOfLibid) | |
| 731 | + REFERENCEREGISTERED_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0] | |
| 732 | + check_value('REFERENCEREGISTERED_Reserved1', 0x0000, REFERENCEREGISTERED_Reserved1) | |
| 733 | + REFERENCEREGISTERED_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0] | |
| 734 | + check_value('REFERENCEREGISTERED_Reserved2', 0x0000, REFERENCEREGISTERED_Reserved2) | |
| 735 | + continue | |
| 736 | + | |
| 737 | + if check == 0x000E: | |
| 738 | + # REFERENCEPROJECT | |
| 739 | + REFERENCEPROJECT_Id = check | |
| 740 | + REFERENCEPROJECT_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 741 | + REFERENCEPROJECT_SizeOfLibidAbsolute = struct.unpack("<L", dir_stream.read(4))[0] | |
| 742 | + REFERENCEPROJECT_LibidAbsolute = dir_stream.read(REFERENCEPROJECT_SizeOfLibidAbsolute) | |
| 743 | + REFERENCEPROJECT_SizeOfLibidRelative = struct.unpack("<L", dir_stream.read(4))[0] | |
| 744 | + REFERENCEPROJECT_LibidRelative = dir_stream.read(REFERENCEPROJECT_SizeOfLibidRelative) | |
| 745 | + REFERENCEPROJECT_MajorVersion = struct.unpack("<L", dir_stream.read(4))[0] | |
| 746 | + REFERENCEPROJECT_MinorVersion = struct.unpack("<H", dir_stream.read(2))[0] | |
| 747 | + continue | |
| 748 | + | |
| 749 | + logging.error('invalid or unknown check Id {0:04X}'.format(check)) | |
| 750 | + sys.exit(0) | |
| 751 | + | |
| 752 | + PROJECTMODULES_Id = check #struct.unpack("<H", dir_stream.read(2))[0] | |
| 753 | + check_value('PROJECTMODULES_Id', 0x000F, PROJECTMODULES_Id) | |
| 754 | + PROJECTMODULES_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 755 | + check_value('PROJECTMODULES_Size', 0x0002, PROJECTMODULES_Size) | |
| 756 | + PROJECTMODULES_Count = struct.unpack("<H", dir_stream.read(2))[0] | |
| 757 | + PROJECTMODULES_ProjectCookieRecord_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 758 | + check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, PROJECTMODULES_ProjectCookieRecord_Id) | |
| 759 | + PROJECTMODULES_ProjectCookieRecord_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 760 | + check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, PROJECTMODULES_ProjectCookieRecord_Size) | |
| 761 | + PROJECTMODULES_ProjectCookieRecord_Cookie = struct.unpack("<H", dir_stream.read(2))[0] | |
| 762 | + | |
| 763 | + logging.debug("parsing {0} modules".format(PROJECTMODULES_Count)) | |
| 764 | + for x in xrange(0, PROJECTMODULES_Count): | |
| 765 | + MODULENAME_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 766 | + check_value('MODULENAME_Id', 0x0019, MODULENAME_Id) | |
| 767 | + MODULENAME_SizeOfModuleName = struct.unpack("<L", dir_stream.read(4))[0] | |
| 768 | + MODULENAME_ModuleName = dir_stream.read(MODULENAME_SizeOfModuleName) | |
| 769 | + # account for optional sections | |
| 770 | + section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 771 | + if section_id == 0x0047: | |
| 772 | + MODULENAMEUNICODE_Id = section_id | |
| 773 | + MODULENAMEUNICODE_SizeOfModuleNameUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 774 | + MODULENAMEUNICODE_ModuleNameUnicode = dir_stream.read(MODULENAMEUNICODE_SizeOfModuleNameUnicode) | |
| 775 | + section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 776 | + if section_id == 0x001A: | |
| 777 | + MODULESTREAMNAME_id = section_id | |
| 778 | + MODULESTREAMNAME_SizeOfStreamName = struct.unpack("<L", dir_stream.read(4))[0] | |
| 779 | + MODULESTREAMNAME_StreamName = dir_stream.read(MODULESTREAMNAME_SizeOfStreamName) | |
| 780 | + MODULESTREAMNAME_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 781 | + check_value('MODULESTREAMNAME_Reserved', 0x0032, MODULESTREAMNAME_Reserved) | |
| 782 | + MODULESTREAMNAME_SizeOfStreamNameUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 783 | + MODULESTREAMNAME_StreamNameUnicode = dir_stream.read(MODULESTREAMNAME_SizeOfStreamNameUnicode) | |
| 784 | + section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 785 | + if section_id == 0x001C: | |
| 786 | + MODULEDOCSTRING_Id = section_id | |
| 787 | + check_value('MODULEDOCSTRING_Id', 0x001C, MODULEDOCSTRING_Id) | |
| 788 | + MODULEDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0] | |
| 789 | + MODULEDOCSTRING_DocString = dir_stream.read(MODULEDOCSTRING_SizeOfDocString) | |
| 790 | + MODULEDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 791 | + check_value('MODULEDOCSTRING_Reserved', 0x0048, MODULEDOCSTRING_Reserved) | |
| 792 | + MODULEDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 793 | + MODULEDOCSTRING_DocStringUnicode = dir_stream.read(MODULEDOCSTRING_SizeOfDocStringUnicode) | |
| 794 | + section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 795 | + if section_id == 0x0031: | |
| 796 | + MODULEOFFSET_Id = section_id | |
| 797 | + check_value('MODULEOFFSET_Id', 0x0031, MODULEOFFSET_Id) | |
| 798 | + MODULEOFFSET_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 799 | + check_value('MODULEOFFSET_Size', 0x0004, MODULEOFFSET_Size) | |
| 800 | + MODULEOFFSET_TextOffset = struct.unpack("<L", dir_stream.read(4))[0] | |
| 801 | + section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 802 | + if section_id == 0x001E: | |
| 803 | + MODULEHELPCONTEXT_Id = section_id | |
| 804 | + check_value('MODULEHELPCONTEXT_Id', 0x001E, MODULEHELPCONTEXT_Id) | |
| 805 | + MODULEHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 806 | + check_value('MODULEHELPCONTEXT_Size', 0x0004, MODULEHELPCONTEXT_Size) | |
| 807 | + MODULEHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0] | |
| 808 | + section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 809 | + if section_id == 0x002C: | |
| 810 | + MODULECOOKIE_Id = section_id | |
| 811 | + check_value('MODULECOOKIE_Id', 0x002C, MODULECOOKIE_Id) | |
| 812 | + MODULECOOKIE_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 813 | + check_value('MODULECOOKIE_Size', 0x0002, MODULECOOKIE_Size) | |
| 814 | + MODULECOOKIE_Cookie = struct.unpack("<H", dir_stream.read(2))[0] | |
| 815 | + section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 816 | + if section_id == 0x0021 or section_id == 0x0022: | |
| 817 | + MODULETYPE_Id = section_id | |
| 818 | + MODULETYPE_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 819 | + section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 820 | + if section_id == 0x0025: | |
| 821 | + MODULEREADONLY_Id = section_id | |
| 822 | + check_value('MODULEREADONLY_Id', 0x0025, MODULEREADONLY_Id) | |
| 823 | + MODULEREADONLY_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 824 | + check_value('MODULEREADONLY_Reserved', 0x0000, MODULEREADONLY_Reserved) | |
| 825 | + section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 826 | + if section_id == 0x0028: | |
| 827 | + MODULEPRIVATE_Id = section_id | |
| 828 | + check_value('MODULEPRIVATE_Id', 0x0028, MODULEPRIVATE_Id) | |
| 829 | + MODULEPRIVATE_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 830 | + check_value('MODULEPRIVATE_Reserved', 0x0000, MODULEPRIVATE_Reserved) | |
| 831 | + section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 832 | + if section_id == 0x002B: # TERMINATOR | |
| 833 | + MODULE_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 834 | + check_value('MODULE_Reserved', 0x0000, MODULE_Reserved) | |
| 835 | + section_id = None | |
| 836 | + if section_id != None: | |
| 837 | + logging.warning('unknown or invalid module section id {0:04X}'.format(section_id)) | |
| 838 | + | |
| 839 | + logging.debug('Project CodePage = %d' % PROJECTCODEPAGE_CodePage) | |
| 840 | + vba_codec = 'cp%d' % PROJECTCODEPAGE_CodePage | |
| 841 | + logging.debug("ModuleName = {0}".format(MODULENAME_ModuleName)) | |
| 842 | + logging.debug("StreamName = {0}".format(repr(MODULESTREAMNAME_StreamName))) | |
| 843 | + streamname_unicode = MODULESTREAMNAME_StreamName.decode(vba_codec) | |
| 844 | + logging.debug("StreamName.decode('%s') = %s" % (vba_codec, repr(streamname_unicode))) | |
| 845 | + logging.debug("StreamNameUnicode = {0}".format(repr(MODULESTREAMNAME_StreamNameUnicode))) | |
| 846 | + logging.debug("TextOffset = {0}".format(MODULEOFFSET_TextOffset)) | |
| 847 | + | |
| 848 | + code_path = vba_root + u'VBA/' + streamname_unicode | |
| 849 | + #TODO: test if stream exists | |
| 850 | + logging.debug('opening VBA code stream %s' % repr(code_path)) | |
| 851 | + code_data = ole.openstream(code_path).read() | |
| 852 | + logging.debug("length of code_data = {0}".format(len(code_data))) | |
| 853 | + logging.debug("offset of code_data = {0}".format(MODULEOFFSET_TextOffset)) | |
| 854 | + code_data = code_data[MODULEOFFSET_TextOffset:] | |
| 855 | + if len(code_data) > 0: | |
| 856 | + code_data = decompress_stream(code_data) | |
| 857 | + # case-insensitive search in the code_modules dict to find the file extension: | |
| 858 | + filext = code_modules.get(MODULENAME_ModuleName.lower(), 'bin') | |
| 859 | + filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext) | |
| 860 | + #TODO: also yield the codepage so that callers can decode it properly | |
| 861 | + yield (code_path, filename, code_data) | |
| 862 | + # print '-'*79 | |
| 863 | + # print filename | |
| 864 | + # print '' | |
| 865 | + # print code_data | |
| 866 | + # print '' | |
| 867 | + logging.debug('extracted file {0}'.format(filename)) | |
| 868 | + else: | |
| 869 | + logging.warning("module stream {0} has code data length 0".format(MODULESTREAMNAME_StreamName)) | |
| 870 | + return | |
| 871 | + | |
| 872 | + | |
def filter_vba(vba_code):
    """
    Strip the leading block of "Attribute VB_" lines from VBA source code.

    Only the consecutive lines at the very top of the code are removed; the
    first line that does not qualify ends the filtering. A line containing a
    colon is never removed, since a colon could be used to append (and hide)
    other instructions on an attribute line.

    The remaining lines are joined with '\n'.

    :param vba_code: str, VBA source code
    :return: str, filtered VBA source code
    """
    lines = vba_code.splitlines()
    # index of the first line to keep; stays at len(lines) if every line is
    # a removable attribute line
    first_kept = len(lines)
    for index, text in enumerate(lines):
        removable = text.startswith("Attribute VB_") and ':' not in text
        if not removable:
            first_kept = index
            break
    return '\n'.join(lines[first_kept:])
| 895 | + | |
| 896 | + | |
def detect_autoexec(vba_code, obfuscation=None):
    """
    Look for keywords of macros that run automatically (entries of
    AUTOEXEC_KEYWORDS) in the VBA code.

    Each keyword is searched case-insensitively, surrounded by word
    boundaries. The keyword is inserted into the regex as-is (not escaped).

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword, description)
    """
    #TODO: merge code with detect_suspicious
    #TODO: if keyword is already a compiled regex, use it as-is
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    return [
        (keyword, description + suffix)
        for description, keywords in AUTOEXEC_KEYWORDS.items()
        for keyword in keywords
        # search using regex to detect word boundaries:
        if re.search(r'(?i)\b' + keyword + r'\b', vba_code)
    ]
| 922 | + | |
| 923 | + | |
def detect_suspicious(vba_code, obfuscation=None):
    """
    Look for suspicious keywords (entries of SUSPICIOUS_KEYWORDS) in the
    VBA code.

    Each keyword is searched case-insensitively, surrounded by word
    boundaries. The keyword is inserted into the regex as-is (not escaped).

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword, description)
    """
    if obfuscation:
        label_suffix = ' (obfuscation: %s)' % obfuscation
    else:
        label_suffix = ''
    hits = []
    for description, keywords in SUSPICIOUS_KEYWORDS.items():
        for keyword in keywords:
            # word boundaries avoid matching a keyword inside a longer name
            word_regex = r'(?i)\b' + keyword + r'\b'
            if not re.search(word_regex, vba_code):
                continue
            hits.append((keyword, description + label_suffix))
    return hits
| 946 | + | |
| 947 | + | |
def detect_patterns(vba_code, obfuscation=None):
    """
    Run every (pattern type, compiled regex) pair of RE_PATTERNS over the
    VBA code and collect the matched values.

    A given value is reported only once per call, under the first pattern
    type that matched it.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to the pattern type
    :return: list of str tuples (pattern type, value)
    """
    if obfuscation:
        suffix = ' (obfuscation: %s)' % obfuscation
    else:
        suffix = ''
    seen = set()
    matches = []
    for pattern_type, pattern_re in RE_PATTERNS:
        for hit in pattern_re.finditer(vba_code):
            text = hit.group()
            if text in seen:
                continue
            matches.append((pattern_type + suffix, text))
            seen.add(text)
    return matches
| 968 | + | |
| 969 | + | |
def detect_hex_strings(vba_code):
    """
    Find the strings matched by re_hex_string in the VBA code and decode
    them from hexadecimal.

    Each distinct encoded string is reported once, in order of first
    appearance.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    seen = set()
    pairs = []
    for hit in re_hex_string.finditer(vba_code):
        hex_text = hit.group()
        if hex_text in seen:
            continue
        plain = binascii.unhexlify(hex_text)
        pairs.append((hex_text, plain))
        seen.add(hex_text)
    return pairs
| 986 | + | |
| 987 | + | |
def detect_base64_strings(vba_code):
    """
    Detect if the VBA code contains strings encoded in base64.

    Candidates are the matches of re_base64_string, with surrounding double
    quotes removed. A candidate is skipped when re_nothex_check finds nothing
    in it (i.e. it is just a hex string), when it was already reported, or
    when its lowercase form is in BASE64_WHITELIST. Candidates that fail to
    decode are silently ignored.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    #TODO: avoid matching simple hex strings as base64?
    results = []
    found = set()
    for match in re_base64_string.finditer(vba_code):
        # extract the base64 string without quotes:
        value = match.group().strip('"')
        # check it is not just a hex string:
        if not re_nothex_check.search(value):
            continue
        # only keep new values and not in the whitelist:
        if value in found or value.lower() in BASE64_WHITELIST:
            continue
        try:
            decoded = base64.b64decode(value)
        except (TypeError, ValueError):
            # Decoding failed, so it is likely not a base64-encoded string.
            # b64decode raises TypeError on Python 2 and binascii.Error (a
            # ValueError subclass) on Python 3. Catching only these keeps
            # KeyboardInterrupt/SystemExit and real bugs from being hidden.
            continue
        results.append((value, decoded))
        found.add(value)
    return results
| 1014 | + | |
| 1015 | + | |
def detect_dridex_strings(vba_code):
    """
    Detect if the VBA code contains strings obfuscated with a specific algorithm found in Dridex samples.

    Candidates are the matches of re_dridex_string with their first and last
    characters removed. A candidate is skipped when re_nothex_check finds
    nothing in it (i.e. it is just a hex string) or when it was already
    reported. Candidates that the decoder rejects are silently ignored.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    # imported here so that the decoder is only loaded when this detection runs
    from thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode
    results = []
    found = set()
    for match in re_dridex_string.finditer(vba_code):
        value = match.group()[1:-1]
        # check it is not just a hex string:
        if not re_nothex_check.search(value):
            continue
        if value in found:
            continue
        try:
            decoded = DridexUrlDecode(value)
        except Exception:
            # If an exception occurs, it is likely not a dridex-encoded string.
            # "except Exception" (rather than a bare except) keeps this
            # best-effort while letting KeyboardInterrupt/SystemExit through.
            continue
        results.append((value, decoded))
        found.add(value)
    return results
| 1040 | + | |
| 1041 | + | |
| 1042 | +class VBA_Scanner (object): | |
| 1043 | + """ | |
| 1044 | + Class to scan the source code of a VBA module to find obfuscated strings, | |
| 1045 | + suspicious keywords, IOCs, auto-executable macros, etc. | |
| 1046 | + """ | |
| 1047 | + | |
| 1048 | + def __init__(self, vba_code): | |
| 1049 | + """ | |
| 1050 | + VBA_Scanner constructor | |
| 1051 | + | |
| 1052 | + :param vba_code: str, VBA source code to be analyzed | |
| 1053 | + """ | |
| 1054 | + self.code = vba_code | |
| 1055 | + self.code_hex = '' | |
| 1056 | + self.code_hex_rev = '' | |
| 1057 | + self.code_rev_hex = '' | |
| 1058 | + self.code_base64 = '' | |
| 1059 | + self.code_dridex = '' | |
| 1060 | + | |
| 1061 | + | |
| 1062 | + def scan(self, include_decoded_strings=False): | |
| 1063 | + """ | |
| 1064 | + Analyze the provided VBA code to detect suspicious keywords, | |
| 1065 | + auto-executable macros, IOC patterns, obfuscation patterns | |
| 1066 | + such as hex-encoded strings. | |
| 1067 | + | |
| 1068 | + :param include_decoded_strings: bool, if True, all encoded strings will be included with their decoded content. | |
| 1069 | + :return: list of tuples (type, keyword, description) | |
| 1070 | + (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String') | |
| 1071 | + """ | |
| 1072 | + # First, detect and extract hex-encoded strings: | |
| 1073 | + self.hex_strings = detect_hex_strings(self.code) | |
| 1074 | + # detect if the code contains StrReverse: | |
| 1075 | + self.strReverse = False | |
| 1076 | + if 'strreverse' in self.code.lower(): self.strReverse = True | |
| 1077 | + # Then append the decoded strings to the VBA code, to detect obfuscated IOCs and keywords: | |
| 1078 | + for encoded, decoded in self.hex_strings: | |
| 1079 | + self.code_hex += '\n'+decoded | |
| 1080 | + # if the code contains "StrReverse", also append the hex strings in reverse order: | |
| 1081 | + if self.strReverse: | |
| 1082 | + # StrReverse after hex decoding: | |
| 1083 | + self.code_hex_rev += '\n'+decoded[::-1] | |
| 1084 | + # StrReverse before hex decoding: | |
| 1085 | + self.code_rev_hex += '\n'+binascii.unhexlify(encoded[::-1]) | |
| 1086 | + #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/ | |
| 1087 | + #TODO: also append the full code reversed if StrReverse? (risk of false positives?) | |
| 1088 | + # Detect Base64-encoded strings | |
| 1089 | + self.base64_strings = detect_base64_strings(self.code) | |
| 1090 | + for encoded, decoded in self.base64_strings: | |
| 1091 | + self.code_base64 += '\n'+decoded | |
| 1092 | + # Detect Dridex-encoded strings | |
| 1093 | + self.dridex_strings = detect_dridex_strings(self.code) | |
| 1094 | + for encoded, decoded in self.dridex_strings: | |
| 1095 | + self.code_dridex += '\n'+decoded | |
| 1096 | + results = [] | |
| 1097 | + self.autoexec_keywords = [] | |
| 1098 | + self.suspicious_keywords = [] | |
| 1099 | + self.iocs = [] | |
| 1100 | + | |
| 1101 | + for code, obfuscation in ( | |
| 1102 | + (self.code, None), | |
| 1103 | + (self.code_hex, 'Hex'), | |
| 1104 | + (self.code_hex_rev, 'Hex+StrReverse'), | |
| 1105 | + (self.code_rev_hex, 'StrReverse+Hex'), | |
| 1106 | + (self.code_base64, 'Base64'), | |
| 1107 | + (self.code_dridex, 'Dridex'), | |
| 1108 | + ): | |
| 1109 | + self.autoexec_keywords += detect_autoexec(code, obfuscation) | |
| 1110 | + self.suspicious_keywords += detect_suspicious(code, obfuscation) | |
| 1111 | + self.iocs += detect_patterns(code, obfuscation) | |
| 1112 | + | |
| 1113 | + # If hex-encoded strings were discovered, add an item to suspicious keywords: | |
| 1114 | + if self.hex_strings: | |
| 1115 | + self.suspicious_keywords.append(('Hex Strings', | |
| 1116 | + 'Hex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) | |
| 1117 | + if self.base64_strings: | |
| 1118 | + self.suspicious_keywords.append(('Base64 Strings', | |
| 1119 | + 'Base64-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) | |
| 1120 | + if self.dridex_strings: | |
| 1121 | + self.suspicious_keywords.append(('Dridex Strings', | |
| 1122 | + 'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) | |
| 1123 | + for keyword, description in self.autoexec_keywords: | |
| 1124 | + results.append(('AutoExec', keyword, description)) | |
| 1125 | + for keyword, description in self.suspicious_keywords: | |
| 1126 | + results.append(('Suspicious', keyword, description)) | |
| 1127 | + for pattern_type, value in self.iocs: | |
| 1128 | + results.append(('IOC', value, pattern_type)) | |
| 1129 | + if include_decoded_strings: | |
| 1130 | + for encoded, decoded in self.hex_strings: | |
| 1131 | + results.append(('Hex String', repr(decoded), encoded)) | |
| 1132 | + for encoded, decoded in self.base64_strings: | |
| 1133 | + results.append(('Base64 String', repr(decoded), encoded)) | |
| 1134 | + for encoded, decoded in self.dridex_strings: | |
| 1135 | + results.append(('Dridex string', repr(decoded), encoded)) | |
| 1136 | + return results | |
| 1137 | + | |
def scan_summary(self):
    """
    Run a full scan of the VBA code and report how many items were found
    in each category, instead of the items themselves.

    Calls self.scan() first, then measures the lists that the scan has
    populated on the scanner.

    :return: tuple of six integers, the number of items found for each category:
        (autoexec, suspicious, IOCs, hex, base64, dridex)
    """
    # scan() is called for its side effect of filling the result lists;
    # its return value is not used here.
    self.scan()
    categories = (
        self.autoexec_keywords,
        self.suspicious_keywords,
        self.iocs,
        self.hex_strings,
        self.base64_strings,
        self.dridex_strings,
    )
    return tuple(len(items) for items in categories)
| 1151 | + | |
| 1152 | + | |
| 1153 | + | |
def scan_vba(vba_code, include_decoded_strings):
    """
    Convenience wrapper to scan a piece of VBA source code in a single call:
    builds a VBA_Scanner for the code and returns the result of its scan()
    method (suspicious keywords, auto-executable macros, IOC patterns and
    obfuscation patterns such as hex-encoded strings).

    :param vba_code: str, VBA source code to be analyzed
    :param include_decoded_strings: bool, passed on to VBA_Scanner.scan(); if True
        all encoded strings will be included with their decoded content.
    :return: list of tuples (type, keyword, description)
        (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex string')
    """
    scanner = VBA_Scanner(vba_code)
    return scanner.scan(include_decoded_strings)
| 1167 | + | |
| 1168 | + | |
| 1169 | +#=== CLASSES ================================================================= | |
| 1170 | + | |
class VBA_Parser(object):
    """
    Class to parse MS Office files, to detect VBA macros and extract VBA source code
    Supported file formats:
    - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
    - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
    - PowerPoint 2007+ (.pptm, .ppsm)
    - Word 2003 XML (.xml)
    """

    def __init__(self, filename, data=None):
        """
        Constructor for VBA_Parser

        The container format is detected from the content: OLE, then
        ZIP/OpenXML, then Word 2003 XML. For the last two, each embedded OLE
        container is parsed by a nested VBA_Parser stored in self.ole_subfiles.

        :param filename: filename or path of file to parse, or file-like object

        :param data: None or bytes str, if None the file will be read from disk (or from the file-like object).
            If data is provided as a bytes string, it will be parsed as the content of the file in memory,
            and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb').

        :raises TypeError: if the content is not OLE, ZIP/OpenXML nor Word 2003 XML.
        """
        #TODO: filename should only be a string, data should be used for the file-like object
        #TODO: filename should be mandatory, optional data is a string or file-like object
        #TODO: also support olefile and zipfile as input
        if data is None:
            # open file from disk:
            _file = filename
        else:
            # file already read in memory, make it a file-like object for zipfile:
            _file = cStringIO.StringIO(data)
        # OleFileIO object when the file itself is OLE, None otherwise:
        self.ole_file = None
        # nested VBA_Parser objects, one per OLE container embedded in a
        # ZIP/OpenXML or Word 2003 XML file:
        self.ole_subfiles = []
        self.filename = filename
        # one of the TYPE_* constants, set once the format is identified:
        self.type = None
        # cache for find_vba_projects(), None until its first call:
        self.vba_projects = None
        if olefile.isOleFile(_file):
            # This looks like an OLE file
            logging.info('Parsing OLE file %s' % self.filename)
            # Open and parse the OLE file, using unicode for path names:
            self.ole_file = olefile.OleFileIO(_file, path_encoding=None)
            self.type = TYPE_OLE
            #TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet
        elif zipfile.is_zipfile(_file):
            # This looks like a zip file, need to look for vbaProject.bin inside
            # It can be any OLE file inside the archive
            #...because vbaProject.bin can be renamed:
            # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
            logging.info('Opening ZIP/OpenXML file %s' % self.filename)
            self.type = TYPE_OpenXML
            z = zipfile.ZipFile(_file)
            #TODO: check if this is actually an OpenXML file
            #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically?
            try:
                # check each file within the zip if it is an OLE file, by reading its magic:
                for subfile in z.namelist():
                    member = z.open(subfile)
                    try:
                        magic = member.read(len(olefile.MAGIC))
                    finally:
                        # do not leave one open handle per archive member
                        member.close()
                    if magic != olefile.MAGIC:
                        continue
                    logging.debug('Opening OLE file %s within zip' % subfile)
                    ole_data = z.read(subfile)
                    try:
                        self.ole_subfiles.append(VBA_Parser(filename=subfile, data=ole_data))
                    except Exception:
                        # best effort: a member with the OLE magic that cannot be
                        # parsed is skipped, the other members are still examined
                        logging.debug('%s is not a valid OLE file' % subfile)
            finally:
                # close the archive even if reading a member failed
                z.close()
        else:
            # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML,
            # or a plain text file containing VBA code
            if data is None:
                if hasattr(filename, 'read'):
                    # file-like object: rewind it, since the format checks
                    # above may have moved its current position
                    filename.seek(0)
                    data = filename.read()
                else:
                    with open(filename, 'rb') as f:
                        data = f.read()
            # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
            if 'http://schemas.microsoft.com/office/word/2003/wordml' in data:
                logging.info('Opening Word 2003 XML file %s' % self.filename)
                self.type = TYPE_Word2003_XML
                # parse the XML content
                et = ET.fromstring(data)
                # find all the binData elements:
                # NOTE(review): getiterator() is the older ElementTree spelling of
                # iter(); kept as-is for compatibility with old Python 2 releases.
                for bindata in et.getiterator(TAG_BINDATA):
                    # the binData content is an OLE container for the VBA project, compressed
                    # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
                    # get the filename:
                    fname = bindata.get(ATTR_NAME, 'noname.mso')
                    try:
                        # decode the base64 activemime
                        activemime = binascii.a2b_base64(bindata.text)
                        # decompress the zlib data starting at offset 0x32, which is the OLE container:
                        ole_data = zlib.decompress(activemime[0x32:])
                        self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))
                    except Exception:
                        # decoding is inside the try block so that a binData element
                        # which is not an ActiveMime/OLE container is skipped instead
                        # of aborting the parsing of the whole document
                        logging.debug('%s is not a valid OLE file' % fname)
            #TODO: Excel 2003 XML
            #TODO: plain text VBA file
            else:
                msg = '%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % self.filename
                logging.error(msg)
                raise TypeError(msg)

    def find_vba_projects(self):
        """
        Finds all the VBA projects stored in an OLE file.

        Return None if the file is not OLE but OpenXML.
        Return a list of tuples (vba_root, project_path, dir_path) for each VBA project.
        vba_root is the path of the root OLE storage containing the VBA project,
        including a trailing slash unless it is the root of the OLE file.
        project_path is the path of the OLE stream named "PROJECT" within the VBA project.
        dir_path is the path of the OLE stream named "VBA/dir" within the VBA project.

        If this function returns an empty list for one of the supported formats
        (i.e. Word, Excel, Powerpoint except Powerpoint 97-2003), then the
        file does not contain VBA macros.

        The result is cached: later calls return the list built by the first call.

        :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path)
            for each VBA project found if OLE file
        """
        # if the file is not OLE but OpenXML, return None:
        if self.ole_file is None:
            return None

        # if this method has already been called, return previous result:
        if self.vba_projects is not None:
            return self.vba_projects

        # Find the VBA project root (different in MS Word, Excel, etc):
        # - Word 97-2003: Macros
        # - Excel 97-2003: _VBA_PROJECT_CUR
        # - PowerPoint 97-2003: not supported yet (different file structure)
        # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin.
        # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word
        # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word
        # - Visio 2007: not supported yet (different file structure)

        # According to MS-OVBA section 2.2.1:
        # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream
        # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream
        # - all names are case-insensitive

        ole = self.ole_file

        def check_vba_stream(vba_root, stream_path):
            # Return the full path of stream_path under vba_root if it exists
            # and is a stream, None otherwise.
            full_path = vba_root + stream_path
            if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM:
                logging.debug('Found %s stream: %s' % (stream_path, full_path))
                return full_path
            logging.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path)
            return None

        # start with an empty list:
        self.vba_projects = []
        # Look for any storage containing those storage/streams:
        for storage in ole.listdir(streams=False, storages=True):
            # Look for a storage ending with "VBA":
            if storage[-1].upper() != 'VBA':
                continue
            logging.debug('Found VBA storage: %s' % ('/'.join(storage)))
            vba_root = '/'.join(storage[:-1])
            # Add a trailing slash to vba_root, unless it is the root of the OLE file:
            # (used later to append all the child streams/storages)
            if vba_root != '':
                vba_root += '/'
            logging.debug('Checking vba_root="%s"' % vba_root)

            # Check if the VBA root storage also contains a PROJECT stream:
            project_path = check_vba_stream(vba_root, 'PROJECT')
            if not project_path:
                continue
            # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream:
            if not check_vba_stream(vba_root, 'VBA/_VBA_PROJECT'):
                continue
            # Check if the VBA root storage also contains a VBA/dir stream:
            dir_path = check_vba_stream(vba_root, 'VBA/dir')
            if not dir_path:
                continue
            # Now we are pretty sure it is a VBA project structure
            logging.debug('VBA root storage: "%s"' % vba_root)
            # append the results to the list as a tuple for later use:
            self.vba_projects.append((vba_root, project_path, dir_path))
        return self.vba_projects

    def detect_vba_macros(self):
        """
        Detect the potential presence of VBA macros in the file, by checking
        if it contains VBA projects. Both OLE and OpenXML files are supported.

        Important: for now, results are accurate only for Word, Excel and PowerPoint
        EXCEPT Powerpoint 97-2003, which has a different structure for VBA.

        Note: this method does NOT attempt to check the actual presence or validity
        of VBA macro source code, so there might be false positives.
        It may also detect VBA macros in files embedded within the main file,
        for example an Excel workbook with macros embedded into a Word
        document without macros may be detected, without distinction.

        :return: bool, True if at least one VBA project has been found, False otherwise
        """
        #TODO: return None or raise exception if format not supported like PPT 97-2003
        #TODO: return the number of VBA projects found instead of True/False?
        # if OpenXML, check all the OLE subfiles:
        if self.ole_file is None:
            return any(ole_subfile.detect_vba_macros() for ole_subfile in self.ole_subfiles)
        # otherwise it's an OLE file, find VBA projects:
        return bool(self.find_vba_projects())

    def extract_macros(self):
        """
        Extract and decompress source code for each VBA macro found in the file

        Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found
        If the file is OLE, filename is the path of the file.
        If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros
        within the zip archive, e.g. word/vbaProject.bin.
        """
        if self.ole_file is None:
            # not an OLE file: delegate to the parser of each embedded OLE container
            for ole_subfile in self.ole_subfiles:
                for results in ole_subfile.extract_macros():
                    yield results
        else:
            # make sure self.vba_projects has been populated:
            self.find_vba_projects()
            for vba_root, project_path, dir_path in self.vba_projects:
                # extract all VBA macros from that VBA root storage:
                for stream_path, vba_filename, vba_code in _extract_vba(self.ole_file, vba_root, project_path, dir_path):
                    yield (self.filename, stream_path, vba_filename, vba_code)

    def close(self):
        """
        Close all the open files. This method must be called after usage, if
        the application is opening many files.
        """
        if self.ole_file is None:
            for ole_subfile in self.ole_subfiles:
                ole_subfile.close()
        else:
            self.ole_file.close()
| 1418 | + | |
| 1419 | + | |
def print_analysis(vba_code, show_decoded_strings=False):
    """
    Scan the provided VBA code with scan_vba() and print what was found as
    a three-column table (Type, Keyword, Description), or a one-line notice
    when the scan returned nothing.

    :param vba_code: str, VBA source code to be analyzed
    :param show_decoded_strings: bool, passed to scan_vba(); if True encoded
        strings will be displayed with their decoded content.
    :return: None
    """
    findings = scan_vba(vba_code, show_decoded_strings)
    if not findings:
        print('No suspicious keyword or IOC found.')
        return
    table = prettytable.PrettyTable(('Type', 'Keyword', 'Description'))
    # left-align every column:
    table.align = 'l'
    # cap the width of each column:
    for column, width in (('Type', 10), ('Keyword', 20), ('Description', 39)):
        table.max_width[column] = width
    for category, keyword, description in findings:
        table.add_row((category, keyword, description))
    print(table)
| 1440 | + | |
| 1441 | + | |
| 1442 | + | |
def process_file(container, filename, data, show_decoded_strings=False):
    """
    Process a single file in detailed mode: print its type, the source code
    of each VBA macro found and the analysis of that code.

    Errors raised while parsing or analysing the file are printed with their
    full stack trace and do not stop the program, so that the remaining files
    can still be processed. KeyboardInterrupt and SystemExit are not
    intercepted.

    :param container: str, path and filename of container if the file is within
        a zip archive, None otherwise.
    :param filename: str, path and filename of file on disk, or within the container.
    :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
    :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
    """
    #TODO: replace print by writing to a provided output file (sys.stdout by default)
    if container:
        display_filename = '%s in %s' % (filename, container)
    else:
        display_filename = filename
    print('=' * 79)
    print('FILE: %s' % (display_filename,))
    try:
        #TODO: handle olefile errors, when an OLE file is malformed
        vba = VBA_Parser(filename, data)
        try:
            print('Type: %s' % (vba.type,))
            if vba.detect_vba_macros():
                for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
                    # hide attribute lines:
                    #TODO: option to disable attribute filtering
                    vba_code_filtered = filter_vba(vba_code)
                    print('-' * 79)
                    print('VBA MACRO %s ' % vba_filename)
                    print('in file: %s - OLE stream: %s' % (subfilename, repr(stream_path)))
                    print('- ' * 39)
                    # detect empty macros:
                    if vba_code_filtered.strip() == '':
                        print('(empty macro)')
                    else:
                        print(vba_code_filtered)
                    print('- ' * 39)
                    print('ANALYSIS:')
                    # analyse the whole code, filtered to avoid false positives:
                    print_analysis(vba_code_filtered, show_decoded_strings)
            else:
                print('No VBA macros found.')
        finally:
            # release the files held by the parser, whatever happened above
            vba.close()
    except Exception:
        #TODO: print more info if debug mode
        # display the exception with full stack trace for debugging, but do not stop:
        traceback.print_exc()
    print('')
| 1492 | + | |
| 1493 | + | |
def process_file_triage(container, filename, data):
    """
    Process a single file in triage mode: print one summary line made of a
    flags field followed by the filename, and by an error message if any.

    The flags field starts with the file type ('OLE:', 'OpX:' or 'XML:'),
    followed by one character per indicator, which is '-' when nothing was
    found: M (macros), A (auto-executable), S (suspicious keywords), I (IOCs),
    H (hex strings), B (Base64 strings), D (Dridex strings).
    It is '?' when the file format is not supported, and '!ERROR' when
    another error occurred.

    :param container: str, path and filename of container if the file is within
        a zip archive, None otherwise (not used by this function).
    :param filename: str, path and filename of file on disk, or within the container.
    :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
    """
    #TODO: replace print by writing to a provided output file (sys.stdout by default)
    nb_macros = 0
    nb_autoexec = 0
    nb_suspicious = 0
    nb_iocs = 0
    nb_hexstrings = 0
    nb_base64strings = 0
    nb_dridexstrings = 0
    message = ''
    try:
        #TODO: handle olefile errors, when an OLE file is malformed
        vba = VBA_Parser(filename, data)
        try:
            if vba.detect_vba_macros():
                for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
                    nb_macros += 1
                    if vba_code.strip() != '':
                        # analyse the whole code, filtered to avoid false positives:
                        scanner = VBA_Scanner(filter_vba(vba_code))
                        autoexec, suspicious, iocs, hexstrings, base64strings, dridex = scanner.scan_summary()
                        nb_autoexec += autoexec
                        nb_suspicious += suspicious
                        nb_iocs += iocs
                        nb_hexstrings += hexstrings
                        nb_base64strings += base64strings
                        nb_dridexstrings += dridex
            # prefix of the flags field for each file type; an unexpected type
            # raises KeyError, which is reported as '!ERROR' below:
            type_prefixes = {
                TYPE_OLE: 'OLE:',
                TYPE_OpenXML: 'OpX:',
                TYPE_Word2003_XML: 'XML:',
            }
            flags = type_prefixes[vba.type]
        finally:
            # triage mode typically goes through many files: release the
            # files held by the parser as soon as this one is done
            vba.close()
        indicators = (
            (nb_macros, 'M'),
            (nb_autoexec, 'A'),
            (nb_suspicious, 'S'),
            (nb_iocs, 'I'),
            (nb_hexstrings, 'H'),
            (nb_base64strings, 'B'),
            (nb_dridexstrings, 'D'),
        )
        # one letter per indicator found, '-' otherwise:
        flags += ''.join([letter if count else '-' for count, letter in indicators])
    except TypeError:
        # file type not OLE nor OpenXML
        flags = '?'
        message = 'File format not supported'
    except Exception as exc:
        # another error occurred
        #TODO: print more info if debug mode
        #TODO: distinguish real errors from incorrect file types
        flags = '!ERROR'
        message = exc
    line = '%-11s %s' % (flags, filename)
    if message:
        line += ' - %s' % (message,)
    print(line)
| 1569 | + | |
| 1570 | + # t = prettytable.PrettyTable(('filename', 'type', 'macros', 'autoexec', 'suspicious', 'ioc', 'hexstrings'), | |
| 1571 | + # header=False, border=False) | |
| 1572 | + # t.align = 'l' | |
| 1573 | + # t.max_width['filename'] = 30 | |
| 1574 | + # t.max_width['type'] = 10 | |
| 1575 | + # t.max_width['macros'] = 6 | |
| 1576 | + # t.max_width['autoexec'] = 6 | |
| 1577 | + # t.max_width['suspicious'] = 6 | |
| 1578 | + # t.max_width['ioc'] = 6 | |
| 1579 | + # t.max_width['hexstrings'] = 6 | |
| 1580 | + # t.add_row((filename, ftype, macros, autoexec, suspicious, iocs, hexstrings)) | |
| 1581 | + # print t | |
| 1582 | + | |
def main_triage_quick():
    """
    Placeholder: not implemented, does nothing and returns None.
    """
    return None
| 1585 | + | |
| 1586 | +#=== MAIN ===================================================================== | |
| 1587 | + | |
def main():
    """
    Main function, called when olevba is run from the command line

    Parses the command line options, then either analyses a file of VBA
    source code directly (option -i), or processes each file matching the
    arguments in triage mode (one summary line per file) or in detailed mode
    (full results). When neither -t nor -d is given, triage mode is used,
    and if exactly one file was processed its detailed results are printed
    afterwards.
    """
    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("-r", action="store_true", dest="recursive",
        help='find files recursively in subdirectories.')
    parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
        help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)')
    parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
        help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
    parser.add_option("-t", action="store_true", dest="triage_mode",
        help='triage mode, display results as a summary table (default for multiple files)')
    parser.add_option("-d", action="store_true", dest="detailed_mode",
        help='detailed mode, display full results (default for single file)')
    parser.add_option("-i", "--input", dest='input', type='str', default=None,
        help='input file containing VBA source code to be analyzed (no parsing)')
    parser.add_option("--decode", action="store_true", dest="show_decoded_strings",
        help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex).')

    (options, args) = parser.parse_args()

    # Print help if no arguments are passed
    if not args and not options.input:
        print(__doc__)
        parser.print_help()
        sys.exit()

    # print banner with version
    print('olevba %s - http://decalage.info/python/oletools' % __version__)

    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) #INFO)
    # For now, all logging is disabled:
    logging.disable(logging.CRITICAL)

    if options.input:
        # input file provided with VBA source code to be analyzed directly:
        print('Analysis of VBA source code from %s:' % options.input)
        # the with statement closes the input file as soon as it has been read
        with open(options.input) as input_file:
            vba_code = input_file.read()
        print_analysis(vba_code, show_decoded_strings=options.show_decoded_strings)
        sys.exit()

    # triage mode is used unless -d is given alone (-t takes precedence over -d):
    triage = bool(not options.detailed_mode or options.triage_mode)

    if triage:
        print('%-11s %-65s' % ('Flags', 'Filename'))
        print('%-11s %-65s' % ('-'*11, '-'*65))
    previous_container = None
    count = 0
    # initialised here because the last file is used again after the loop:
    container = filename = data = None
    for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
        zip_password=options.zip_password, zip_fname=options.zip_fname):
        # ignore directory names stored in zip files:
        if container and filename.endswith('/'):
            continue
        if not triage:
            # fully detailed output
            process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings)
        else:
            # print container name when it changes:
            if container != previous_container:
                if container is not None:
                    print('\nFiles in %s:' % container)
                previous_container = container
            # summarized output for triage:
            process_file_triage(container, filename, data)
        count += 1
    if triage:
        print('\n(Flags: OpX=OpenXML, XML=Word2003XML, M=Macros, A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, B=Base64 strings, D=Dridex strings, ?=Unknown)\n')

    if count == 1 and not options.triage_mode and not options.detailed_mode:
        # if options -t and -d were not specified and it's a single file, print details:
        #TODO: avoid doing the analysis twice by storing results
        process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings)
| 1667 | + | |
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
| 1670 | + | |
| 1671 | 1671 | # This was coded while listening to "Dust" from I Love You But I've Chosen Darkness |
| 1672 | 1672 | \ No newline at end of file | ... | ... |