Commit 8e1d03d7a18b0779ea73c1d4b13914c07220c37d
1 parent: a7309e59
olevba3: replaced by a redirection to olevba + deprecation warning (issue #106)
Showing 1 changed file with 6 additions and 3651 deletions
oletools/olevba3.py
| 1 | 1 | #!/usr/bin/env python |
| 2 | -""" | |
| 3 | -olevba3.py | |
| 4 | 2 | |
| 5 | -olevba is a script to parse OLE and OpenXML files such as MS Office documents | |
| 6 | -(e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate | |
| 7 | -and analyze malicious macros. | |
| 3 | +# olevba3 is a stub that redirects to olevba.py, for backwards compatibility | |
| 8 | 4 | |
| 9 | -olevba3 is the version of olevba that runs on Python 3.x. | |
| 10 | - | |
| 11 | -Supported formats: | |
| 12 | -- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) | |
| 13 | -- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) | |
| 14 | -- PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) | |
| 15 | -- Word/PowerPoint 2007+ XML (aka Flat OPC) | |
| 16 | -- Word 2003 XML (.xml) | |
| 17 | -- Word/Excel Single File Web Page / MHTML (.mht) | |
| 18 | -- Publisher (.pub) | |
| 19 | -- raises an error if run with files encrypted using MS Crypto API RC4 | |
| 20 | - | |
| 21 | -Author: Philippe Lagadec - http://www.decalage.info | |
| 22 | -License: BSD, see source code or documentation | |
| 23 | - | |
| 24 | -olevba is part of the python-oletools package: | |
| 25 | -http://www.decalage.info/python/oletools | |
| 26 | - | |
| 27 | -olevba is based on source code from officeparser by John William Davison | |
| 28 | -https://github.com/unixfreak0037/officeparser | |
| 29 | -""" | |
| 30 | - | |
| 31 | -# === LICENSE ================================================================== | |
| 32 | - | |
| 33 | -# olevba is copyright (c) 2014-2018 Philippe Lagadec (http://www.decalage.info) | |
| 34 | -# All rights reserved. | |
| 35 | -# | |
| 36 | -# Redistribution and use in source and binary forms, with or without modification, | |
| 37 | -# are permitted provided that the following conditions are met: | |
| 38 | -# | |
| 39 | -# * Redistributions of source code must retain the above copyright notice, this | |
| 40 | -# list of conditions and the following disclaimer. | |
| 41 | -# * Redistributions in binary form must reproduce the above copyright notice, | |
| 42 | -# this list of conditions and the following disclaimer in the documentation | |
| 43 | -# and/or other materials provided with the distribution. | |
| 44 | -# | |
| 45 | -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
| 46 | -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| 47 | -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| 48 | -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | |
| 49 | -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 50 | -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
| 51 | -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
| 52 | -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
| 53 | -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| 54 | -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 55 | - | |
| 56 | - | |
| 57 | -# olevba contains modified source code from the officeparser project, published | |
| 58 | -# under the following MIT License (MIT): | |
| 59 | -# | |
| 60 | -# officeparser is copyright (c) 2014 John William Davison | |
| 61 | -# | |
| 62 | -# Permission is hereby granted, free of charge, to any person obtaining a copy | |
| 63 | -# of this software and associated documentation files (the "Software"), to deal | |
| 64 | -# in the Software without restriction, including without limitation the rights | |
| 65 | -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| 66 | -# copies of the Software, and to permit persons to whom the Software is | |
| 67 | -# furnished to do so, subject to the following conditions: | |
| 68 | -# | |
| 69 | -# The above copyright notice and this permission notice shall be included in all | |
| 70 | -# copies or substantial portions of the Software. | |
| 71 | -# | |
| 72 | -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 73 | -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 74 | -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| 75 | -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 76 | -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| 77 | -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| 78 | -# SOFTWARE. | |
| 79 | - | |
| 80 | -from __future__ import print_function | |
| 81 | - | |
| 82 | -#------------------------------------------------------------------------------ | |
| 83 | -# CHANGELOG: | |
| 84 | -# 2014-08-05 v0.01 PL: - first version based on officeparser code | |
| 85 | -# 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser | |
| 86 | -# 2014-08-15 PL: - fixed incorrect value check in projecthelpfilepath Record | |
| 87 | -# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats | |
| 88 | -# and to find the VBA project root anywhere in the file | |
| 89 | -# 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL | |
| 90 | -# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API | |
| 91 | -# - added detect_vba_macros | |
| 92 | -# 2014-12-10 v0.06 PL: - hide first lines with VB attributes | |
| 93 | -# - detect auto-executable macros | |
| 94 | -# - ignore empty macros | |
| 95 | -# 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive | |
| 96 | -# 2014-12-15 v0.08 PL: - improved display for empty macros | |
| 97 | -# - added pattern extraction | |
| 98 | -# 2014-12-25 v0.09 PL: - added suspicious keywords detection | |
| 99 | -# 2014-12-27 v0.10 PL: - added OptionParser, main and process_file | |
| 100 | -# - uses xglob to scan several files with wildcards | |
| 101 | -# - option -r to recurse subdirectories | |
| 102 | -# - option -z to scan files in password-protected zips | |
| 103 | -# 2015-01-02 v0.11 PL: - improved filter_vba to detect colons | |
| 104 | -# 2015-01-03 v0.12 PL: - fixed detect_patterns to detect all patterns | |
| 105 | -# - process_file: improved display, shows container file | |
| 106 | -# - improved list of executable file extensions | |
| 107 | -# 2015-01-04 v0.13 PL: - added several suspicious keywords, improved display | |
| 108 | -# 2015-01-08 v0.14 PL: - added hex strings detection and decoding | |
| 109 | -# - fixed issue #2, decoding VBA stream names using | |
| 110 | -# specified codepage and unicode stream names | |
| 111 | -# 2015-01-11 v0.15 PL: - added new triage mode, options -t and -d | |
| 112 | -# 2015-01-16 v0.16 PL: - fix for issue #3 (exception when module name="text") | |
| 113 | -# - added several suspicious keywords | |
| 114 | -# - added option -i to analyze VBA source code directly | |
| 115 | -# 2015-01-17 v0.17 PL: - removed .com from the list of executable extensions | |
| 116 | -# - added scan_vba to run all detection algorithms | |
| 117 | -# - decoded hex strings are now also scanned + reversed | |
| 118 | -# 2015-01-23 v0.18 PL: - fixed issue #3, case-insensitive search in code_modules | |
| 119 | -# 2015-01-24 v0.19 PL: - improved the detection of IOCs obfuscated with hex | |
| 120 | -# strings and StrReverse | |
| 121 | -# 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded | |
| 122 | -# 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding | |
| 123 | -# - improved display, shows obfuscation name | |
| 124 | -# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename | |
| 125 | -# - added Base64 obfuscation decoding (contribution from | |
| 126 | -# @JamesHabben) | |
| 127 | -# 2015-02-03 v0.23 PL: - triage now uses VBA_Scanner results, shows Base64 and | |
| 128 | -# Dridex strings | |
| 129 | -# - exception handling in detect_base64_strings | |
| 130 | -# 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display | |
| 131 | -# - display exceptions with stack trace | |
| 132 | -# - added several suspicious keywords | |
| 133 | -# - improved Base64 detection and decoding | |
| 134 | -# - fixed triage mode not to scan attrib lines | |
| 135 | -# 2015-03-04 v0.25 PL: - added support for Word 2003 XML | |
| 136 | -# 2015-03-22 v0.26 PL: - added suspicious keywords for sandboxing and | |
| 137 | -# virtualisation detection | |
| 138 | -# 2015-05-06 v0.27 PL: - added support for MHTML files with VBA macros | |
| 139 | -# (issue #10 reported by Greg from SpamStopsHere) | |
| 140 | -# 2015-05-24 v0.28 PL: - improved support for MHTML files with modified header | |
| 141 | -# (issue #11 reported by Thomas Chopitea) | |
| 142 | -# 2015-05-26 v0.29 PL: - improved MSO files parsing, taking into account | |
| 143 | -# various data offsets (issue #12) | |
| 144 | -# - improved detection of MSO files, avoiding incorrect | |
| 145 | -# parsing errors (issue #7) | |
| 146 | -# 2015-05-29 v0.30 PL: - added suspicious keywords suggested by @ozhermit, | |
| 147 | -# Davy Douhine (issue #9), issue #13 | |
| 148 | -# 2015-06-16 v0.31 PL: - added generic VBA expression deobfuscation (chr,asc,etc) | |
| 149 | -# 2015-06-19 PL: - added options -a, -c, --each, --attr | |
| 150 | -# 2015-06-21 v0.32 PL: - always display decoded strings which are printable | |
| 151 | -# - fix VBA_Scanner.scan to return raw strings, not repr() | |
| 152 | -# 2015-07-09 v0.40 PL: - removed usage of sys.stderr which causes issues | |
| 153 | -# 2015-07-12 PL: - added Hex function decoding to VBA Parser | |
| 154 | -# 2015-07-13 PL: - added Base64 function decoding to VBA Parser | |
| 155 | -# 2015-09-06 PL: - improved VBA_Parser, refactored the main functions | |
| 156 | -# 2015-09-13 PL: - moved main functions to a class VBA_Parser_CLI | |
| 157 | -# - fixed issue when analysis was done twice | |
| 158 | -# 2015-09-15 PL: - remove duplicate IOCs from results | |
| 159 | -# 2015-09-16 PL: - join long VBA lines ending with underscore before scan | |
| 160 | -# - disabled unused option --each | |
| 161 | -# 2015-09-22 v0.41 PL: - added new option --reveal | |
| 162 | -# - added suspicious strings for PowerShell.exe options | |
| 163 | -# 2015-10-09 v0.42 PL: - VBA_Parser: split each format into a separate method | |
| 164 | -# 2015-10-10 PL: - added support for text files with VBA source code | |
| 165 | -# 2015-11-17 PL: - fixed bug with --decode option | |
| 166 | -# 2015-12-16 PL: - fixed bug in main (no options input anymore) | |
| 167 | -# - improved logging, added -l option | |
| 168 | -# 2016-01-31 PL: - fixed issue #31 in VBA_Parser.open_mht | |
| 169 | -# - fixed issue #32 by monkeypatching email.feedparser | |
| 170 | -# 2016-02-07 PL: - KeyboardInterrupt is now raised properly | |
| 171 | -# 2016-02-20 v0.43 PL: - fixed issue #34 in the VBA parser and vba_chr | |
| 172 | -# 2016-02-29 PL: - added Workbook_Activate to suspicious keywords | |
| 173 | -# 2016-03-08 v0.44 PL: - added VBA Form strings extraction and analysis | |
| 174 | -# 2016-03-04 v0.45 CH: - added JSON output (by Christian Herdtweck) | |
| 175 | -# 2016-03-16 CH: - added option --no-deobfuscate (temporary) | |
| 176 | -# 2016-04-19 v0.46 PL: - new option --deobf instead of --no-deobfuscate | |
| 177 | -# - updated suspicious keywords | |
| 178 | -# 2016-05-04 v0.47 PL: - look for VBA code in any stream including orphans | |
| 179 | -# 2016-04-28 CH: - return an exit code depending on the results | |
| 180 | -# - improved error and exception handling | |
| 181 | -# - improved JSON output | |
| 182 | -# 2016-05-12 CH: - added support for PowerPoint 97-2003 files | |
| 183 | -# 2016-06-06 CH: - improved handling of unicode VBA module names | |
| 184 | -# 2016-06-07 CH: - added option --relaxed, stricter parsing by default | |
| 185 | -# 2016-06-12 v0.50 PL: - fixed small bugs in VBA parsing code | |
| 186 | -# 2016-07-01 PL: - fixed issue #58 with format() to support Python 2.6 | |
| 187 | -# 2016-07-29 CH: - fixed several bugs including #73 (Mac Roman encoding) | |
| 188 | -# 2016-08-31 PL: - added autoexec keyword InkPicture_Painted | |
| 189 | -# - detect_autoexec now returns the exact keyword found | |
| 190 | -# 2016-09-05 PL: - added autoexec keywords for MS Publisher (.pub) | |
| 191 | -# 2016-09-06 PL: - fixed issue #20, is_zipfile on Python 2.6 | |
| 192 | -# 2016-09-12 PL: - enabled packrat to improve pyparsing performance | |
| 193 | -# 2016-10-25 PL: - fixed raise and print statements for Python 3 | |
| 194 | -# 2016-11-03 v0.51 PL: - added EnumDateFormats and EnumSystemLanguageGroupsW | |
| 195 | -# 2017-02-07 PL: - temporary fix for issue #132 | |
| 196 | -# - added keywords for Mac-specific macros (issue #130) | |
| 197 | -# 2017-03-08 PL: - fixed absolute imports | |
| 198 | -# 2017-03-16 PL: - fixed issues #148 and #149 for option --reveal | |
| 199 | -# 2017-05-19 PL: - added enable_logging to fix issue #154 | |
| 200 | -# 2017-05-31 c1fe: - PR #135 fixing issue #132 for some Mac files | |
| 201 | -# 2017-06-08 PL: - fixed issue #122 Chr() with negative numbers | |
| 202 | -# 2017-06-15 PL: - deobfuscation line by line to handle large files | |
| 203 | -# 2017-07-11 v0.52 PL: - raise exception instead of sys.exit (issue #180) | |
| 204 | -# 2017-11-20 PL: - fixed issue #219, do not close the file too early | |
| 205 | -# 2017-11-24 PL: - added keywords to detect self-modifying macros and | |
| 206 | -# attempts to disable macro security (issue #221) | |
| 207 | -# 2018-03-19 PL: - removed pyparsing from the thirdparty subfolder | |
| 208 | -# 2018-05-13 v0.53 PL: - added support for Word/PowerPoint 2007+ XML (FlatOPC) | |
| 209 | -# (issue #283) | |
| 210 | -# 2018-06-11 v0.53.1 MHW: - fixed #320: chr instead of unichr on python 3 | |
| 211 | -# 2018-06-12 MHW: - fixed #322: import reduce from functools | |
| 212 | -# 2018-09-11 v0.54 PL: - olefile is now a dependency | |
| 213 | -# 2018-10-25 CH: - detect encryption and raise error if detected | |
| 214 | - | |
| 215 | -__version__ = '0.54dev4' | |
| 216 | - | |
| 217 | -#------------------------------------------------------------------------------ | |
| 218 | -# TODO: | |
| 219 | -# + setup logging (common with other oletools) | |
| 220 | -# + add xor bruteforcing like bbharvest | |
| 221 | -# + options -a and -c should imply -d | |
| 222 | - | |
| 223 | -# TODO later: | |
| 224 | -# + performance improvement: instead of searching each keyword separately, | |
| 225 | -# first split vba code into a list of words (per line), then check each | |
| 226 | -# word against a dict. (or put vba words into a set/dict?) | |
| 227 | -# + for regex, maybe combine them into a single re with named groups? | |
| 228 | -# + add Yara support, include sample rules? plugins like balbuzard? | |
| 229 | -# + add balbuzard support | |
| 230 | -# + output to file (replace print by file.write, sys.stdout by default) | |
| 231 | -# + look for VBA in embedded documents (e.g. Excel in Word) | |
| 232 | -# + support SRP streams (see Lenny's article + links and sample) | |
| 233 | -# - python 3.x support | |
| 234 | -# - check VBA macros in Visio, Access, Project, etc | |
| 235 | -# - extract_macros: convert to a class, split long function into smaller methods | |
| 236 | -# - extract_macros: read bytes from stream file objects instead of strings | |
| 237 | -# - extract_macros: use combined struct.unpack instead of many calls | |
| 238 | -# - all except clauses should target specific exceptions | |
| 239 | - | |
| 240 | -#------------------------------------------------------------------------------ | |
| 241 | -# REFERENCES: | |
| 242 | -# - [MS-OVBA]: Microsoft Office VBA File Format Structure | |
| 243 | -# http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx | |
| 244 | -# - officeparser: https://github.com/unixfreak0037/officeparser | |
| 245 | - | |
| 246 | - | |
| 247 | -#--- IMPORTS ------------------------------------------------------------------ | |
| 248 | - | |
| 249 | -import sys | |
| 250 | -import os | |
| 251 | -import logging | |
| 252 | -import struct | |
| 253 | -from io import BytesIO | |
| 254 | -import math | |
| 255 | -import zipfile | |
| 256 | -import re | |
| 257 | -import optparse | |
| 258 | -import binascii | |
| 259 | -import base64 | |
| 260 | -import zlib | |
| 261 | -import email # for MHTML parsing | |
| 262 | -import string # for printable | |
| 263 | -import json # for json output mode (argument --json) | |
| 264 | - | |
| 265 | -# import lxml or ElementTree for XML parsing: | |
| 266 | -try: | |
| 267 | - # lxml: best performance for XML processing | |
| 268 | - import lxml.etree as ET | |
| 269 | -except ImportError: | |
| 270 | - try: | |
| 271 | - # Python 2.5+: batteries included | |
| 272 | - import xml.etree.cElementTree as ET | |
| 273 | - except ImportError: | |
| 274 | - try: | |
| 275 | - # Python <2.5: standalone ElementTree install | |
| 276 | - import elementtree.cElementTree as ET | |
| 277 | - except ImportError: | |
| 278 | - raise ImportError("lxml or ElementTree are not installed, " \ | |
| 279 | - + "see http://codespeak.net/lxml " \ | |
| 280 | - + "or http://effbot.org/zone/element-index.htm") | |
| 281 | - | |
| 282 | -import colorclass | |
| 283 | - | |
| 284 | -# On Windows, colorclass needs to be enabled: | |
| 285 | -if os.name == 'nt': | |
| 286 | - colorclass.Windows.enable(auto_colors=True) | |
| 5 | +import sys, os, warnings | |
| 287 | 6 | |
| 7 | +warnings.warn('olevba3 is deprecated, olevba should be used instead.', DeprecationWarning) | |
| 288 | 8 | |
| 289 | 9 | # IMPORTANT: it should be possible to run oletools directly as scripts |
| 290 | 10 | # in any directory without installing them with pip or setup.py. |
| ... | ... | @@ -292,3378 +12,13 @@ if os.name == 'nt': |
| 292 | 12 | # And to enable Python 2+3 compatibility, we need to use absolute imports, |
| 293 | 13 | # so we add the oletools parent folder to sys.path (absolute+normalized path): |
| 294 | 14 | _thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) |
| 295 | -# print('_thismodule_dir = %r' % _thismodule_dir) | |
| 296 | 15 | _parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) |
| 297 | -# print('_parent_dir = %r' % _thirdparty_dir) | |
| 298 | -if not _parent_dir in sys.path: | |
| 16 | +if _parent_dir not in sys.path: | |
| 299 | 17 | sys.path.insert(0, _parent_dir) |
| 300 | 18 | |
| 301 | -import olefile | |
| 302 | -from oletools.thirdparty.prettytable import prettytable | |
| 303 | -from oletools.thirdparty.xglob import xglob, PathNotFoundException | |
| 304 | -from pyparsing import \ | |
| 305 | - CaselessKeyword, CaselessLiteral, Combine, Forward, Literal, \ | |
| 306 | - Optional, QuotedString,Regex, Suppress, Word, WordStart, \ | |
| 307 | - alphanums, alphas, hexnums,nums, opAssoc, srange, \ | |
| 308 | - infixNotation, ParserElement | |
| 309 | -import oletools.ppt_parser as ppt_parser | |
| 310 | -from oletools import rtfobj | |
| 311 | -from oletools import oleid | |
| 312 | -from oletools.common.errors import FileIsEncryptedError | |
| 313 | - | |
| 314 | -# monkeypatch email to fix issue #32: | |
| 315 | -# allow header lines without ":" | |
| 316 | -import email.feedparser | |
| 317 | -email.feedparser.headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:?|[\t ])') | |
| 318 | - | |
| 319 | -# === PYTHON 2+3 SUPPORT ====================================================== | |
| 320 | - | |
| 321 | -if sys.version_info[0] <= 2: | |
| 322 | - # Python 2.x | |
| 323 | - if sys.version_info[1] <= 6: | |
| 324 | - # Python 2.6 | |
| 325 | - # use is_zipfile backported from Python 2.7: | |
| 326 | - from thirdparty.zipfile27 import is_zipfile | |
| 327 | - else: | |
| 328 | - # Python 2.7 | |
| 329 | - from zipfile import is_zipfile | |
| 330 | -else: | |
| 331 | - # Python 3.x+ | |
| 332 | - from zipfile import is_zipfile | |
| 333 | - # xrange is now called range: | |
| 334 | - xrange = range | |
| 335 | - # unichr does not exist anymore, only chr: | |
| 336 | - unichr = chr | |
| 337 | - from functools import reduce | |
| 338 | - | |
| 339 | - | |
| 340 | -# === PYTHON 3.0 - 3.4 SUPPORT ====================================================== | |
| 341 | - | |
| 342 | -# From https://gist.github.com/ynkdir/867347/c5e188a4886bc2dd71876c7e069a7b00b6c16c61 | |
| 343 | - | |
| 344 | -if sys.version_info >= (3, 0) and sys.version_info < (3, 5): | |
| 345 | - import codecs | |
| 346 | - | |
| 347 | - _backslashreplace_errors = codecs.lookup_error("backslashreplace") | |
| 348 | - | |
| 349 | - def backslashreplace_errors(exc): | |
| 350 | - if isinstance(exc, UnicodeDecodeError): | |
| 351 | - u = "".join("\\x{0:02x}".format(c) for c in exc.object[exc.start:exc.end]) | |
| 352 | - return (u, exc.end) | |
| 353 | - return _backslashreplace_errors(exc) | |
| 354 | - | |
| 355 | - codecs.register_error("backslashreplace", backslashreplace_errors) | |
| 356 | - | |
| 357 | - | |
| 358 | -# === LOGGING ================================================================= | |
| 359 | - | |
| 360 | -class NullHandler(logging.Handler): | |
| 361 | - """ | |
| 362 | - Log Handler without output, to avoid printing messages if logging is not | |
| 363 | - configured by the main application. | |
| 364 | - Python 2.7 has logging.NullHandler, but this is necessary for 2.6: | |
| 365 | - see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library | |
| 366 | - """ | |
| 367 | - def emit(self, record): | |
| 368 | - pass | |
| 369 | - | |
| 370 | -def get_logger(name, level=logging.CRITICAL+1): | |
| 371 | - """ | |
| 372 | - Create a suitable logger object for this module. | |
| 373 | - The goal is not to change settings of the root logger, to avoid getting | |
| 374 | - other modules' logs on the screen. | |
| 375 | - If a logger exists with same name, reuse it. (Else it would have duplicate | |
| 376 | - handlers and messages would be doubled.) | |
| 377 | - The level is set to CRITICAL+1 by default, to avoid any logging. | |
| 378 | - """ | |
| 379 | - # First, test if there is already a logger with the same name, else it | |
| 380 | - # will generate duplicate messages (due to duplicate handlers): | |
| 381 | - if name in logging.Logger.manager.loggerDict: | |
| 382 | - #NOTE: another less intrusive but more "hackish" solution would be to | |
| 383 | - # use getLogger then test if its effective level is not default. | |
| 384 | - logger = logging.getLogger(name) | |
| 385 | - # make sure level is OK: | |
| 386 | - logger.setLevel(level) | |
| 387 | - return logger | |
| 388 | - # get a new logger: | |
| 389 | - logger = logging.getLogger(name) | |
| 390 | - # only add a NullHandler for this logger, it is up to the application | |
| 391 | - # to configure its own logging: | |
| 392 | - logger.addHandler(NullHandler()) | |
| 393 | - logger.setLevel(level) | |
| 394 | - return logger | |
| 395 | - | |
| 396 | -# a global logger object used for debugging: | |
| 397 | -log = get_logger('olevba') | |
| 398 | - | |
| 399 | - | |
| 400 | -def enable_logging(): | |
| 401 | - """ | |
| 402 | - Enable logging for this module (disabled by default). | |
| 403 | - This will set the module-specific logger level to NOTSET, which | |
| 404 | - means the main application controls the actual logging level. | |
| 405 | - """ | |
| 406 | - log.setLevel(logging.NOTSET) | |
| 407 | - # Also enable logging in the ppt_parser module: | |
| 408 | - ppt_parser.enable_logging() | |
| 409 | - | |
| 410 | - | |
| 411 | - | |
| 412 | -#=== EXCEPTIONS ============================================================== | |
| 413 | - | |
| 414 | -class OlevbaBaseException(Exception): | |
| 415 | - """ Base class for exceptions produced here for simpler except clauses """ | |
| 416 | - def __init__(self, msg, filename=None, orig_exc=None, **kwargs): | |
| 417 | - if orig_exc: | |
| 418 | - super(OlevbaBaseException, self).__init__(msg + | |
| 419 | - ' ({0})'.format(orig_exc), | |
| 420 | - **kwargs) | |
| 421 | - else: | |
| 422 | - super(OlevbaBaseException, self).__init__(msg, **kwargs) | |
| 423 | - self.msg = msg | |
| 424 | - self.filename = filename | |
| 425 | - self.orig_exc = orig_exc | |
| 426 | - | |
| 427 | - | |
| 428 | -class FileOpenError(OlevbaBaseException): | |
| 429 | - """ raised by VBA_Parser constructor if all open_... attempts failed | |
| 430 | - | |
| 431 | - probably means the file type is not supported | |
| 432 | - """ | |
| 433 | - | |
| 434 | - def __init__(self, filename, orig_exc=None): | |
| 435 | - super(FileOpenError, self).__init__( | |
| 436 | - 'Failed to open file %s' % filename, filename, orig_exc) | |
| 437 | - | |
| 438 | - | |
| 439 | -class ProcessingError(OlevbaBaseException): | |
| 440 | - """ raised by VBA_Parser.process_file* functions """ | |
| 441 | - | |
| 442 | - def __init__(self, filename, orig_exc): | |
| 443 | - super(ProcessingError, self).__init__( | |
| 444 | - 'Error processing file %s' % filename, filename, orig_exc) | |
| 445 | - | |
| 446 | - | |
| 447 | -class MsoExtractionError(RuntimeError, OlevbaBaseException): | |
| 448 | - """ raised by mso_file_extract if parsing MSO/ActiveMIME data failed """ | |
| 449 | - | |
| 450 | - def __init__(self, msg): | |
| 451 | - MsoExtractionError.__init__(self, msg) | |
| 452 | - OlevbaBaseException.__init__(self, msg) | |
| 453 | - | |
| 454 | - | |
| 455 | -class SubstreamOpenError(FileOpenError): | |
| 456 | - """ special kind of FileOpenError: file is a substream of original file """ | |
| 457 | - | |
| 458 | - def __init__(self, filename, subfilename, orig_exc=None): | |
| 459 | - super(SubstreamOpenError, self).__init__( | |
| 460 | - str(filename) + '/' + str(subfilename), orig_exc) | |
| 461 | - self.filename = filename # overwrite setting in OlevbaBaseException | |
| 462 | - self.subfilename = subfilename | |
| 463 | - | |
| 464 | - | |
| 465 | -class UnexpectedDataError(OlevbaBaseException): | |
| 466 | - """ raised when parsing is strict (=not relaxed) and data is unexpected """ | |
| 467 | - | |
| 468 | - def __init__(self, stream_path, variable, expected, value): | |
| 469 | - if isinstance(expected, int): | |
| 470 | - es = '{0:04X}'.format(expected) | |
| 471 | - elif isinstance(expected, tuple): | |
| 472 | - es = ','.join('{0:04X}'.format(e) for e in expected) | |
| 473 | - es = '({0})'.format(es) | |
| 474 | - else: | |
| 475 | - raise ValueError('Unknown type encountered: {0}'.format(type(expected))) | |
| 476 | - super(UnexpectedDataError, self).__init__( | |
| 477 | - 'Unexpected value in {0} for variable {1}: ' | |
| 478 | - 'expected {2} but found {3:04X}!' | |
| 479 | - .format(stream_path, variable, es, value)) | |
| 480 | - self.stream_path = stream_path | |
| 481 | - self.variable = variable | |
| 482 | - self.expected = expected | |
| 483 | - self.value = value | |
| 484 | - | |
| 485 | -#--- CONSTANTS ---------------------------------------------------------------- | |
| 486 | - | |
| 487 | -# return codes | |
| 488 | -RETURN_OK = 0 | |
| 489 | -RETURN_WARNINGS = 1 # (reserved, not used yet) | |
| 490 | -RETURN_WRONG_ARGS = 2 # (fixed, built into optparse) | |
| 491 | -RETURN_FILE_NOT_FOUND = 3 | |
| 492 | -RETURN_XGLOB_ERR = 4 | |
| 493 | -RETURN_OPEN_ERROR = 5 | |
| 494 | -RETURN_PARSE_ERROR = 6 | |
| 495 | -RETURN_SEVERAL_ERRS = 7 | |
| 496 | -RETURN_UNEXPECTED = 8 | |
| 497 | -RETURN_ENCRYPTED = 9 | |
| 498 | - | |
| 499 | -# MAC codepages (from http://stackoverflow.com/questions/1592925/decoding-mac-os-text-in-python) | |
| 500 | -MAC_CODEPAGES = { | |
| 501 | - 10000: 'mac-roman', | |
| 502 | - 10001: 'shiftjis', # not found: 'mac-shift-jis', | |
| 503 | - 10003: 'ascii', # nothing appropriate found: 'mac-hangul', | |
| 504 | - 10008: 'gb2321', # not found: 'mac-gb2312', | |
| 505 | - 10002: 'big5', # not found: 'mac-big5', | |
| 506 | - 10005: 'hebrew', # not found: 'mac-hebrew', | |
| 507 | - 10004: 'mac-arabic', | |
| 508 | - 10006: 'mac-greek', | |
| 509 | - 10081: 'mac-turkish', | |
| 510 | - 10021: 'thai', # not found: mac-thai', | |
| 511 | - 10029: 'maccentraleurope', # not found: 'mac-east europe', | |
| 512 | - 10007: 'ascii', # nothing appropriate found: 'mac-russian', | |
| 513 | -} | |
| 514 | - | |
| 515 | -# URL and message to report issues: | |
| 516 | -URL_OLEVBA_ISSUES = 'https://github.com/decalage2/oletools/issues' | |
| 517 | -MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES | |
| 518 | - | |
| 519 | -# Container types: | |
| 520 | -TYPE_OLE = 'OLE' | |
| 521 | -TYPE_OpenXML = 'OpenXML' | |
| 522 | -TYPE_FlatOPC_XML = 'FlatOPC_XML' | |
| 523 | -TYPE_Word2003_XML = 'Word2003_XML' | |
| 524 | -TYPE_MHTML = 'MHTML' | |
| 525 | -TYPE_TEXT = 'Text' | |
| 526 | -TYPE_PPT = 'PPT' | |
| 527 | - | |
| 528 | -# short tag to display file types in triage mode: | |
| 529 | -TYPE2TAG = { | |
| 530 | - TYPE_OLE: 'OLE:', | |
| 531 | - TYPE_OpenXML: 'OpX:', | |
| 532 | - TYPE_FlatOPC_XML: 'FlX:', | |
| 533 | - TYPE_Word2003_XML: 'XML:', | |
| 534 | - TYPE_MHTML: 'MHT:', | |
| 535 | - TYPE_TEXT: 'TXT:', | |
| 536 | - TYPE_PPT: 'PPT', | |
| 537 | -} | |
| 538 | - | |
| 539 | - | |
| 540 | -# MSO files ActiveMime header magic | |
| 541 | -MSO_ACTIVEMIME_HEADER = b'ActiveMime' | |
| 542 | - | |
| 543 | -MODULE_EXTENSION = "bas" | |
| 544 | -CLASS_EXTENSION = "cls" | |
| 545 | -FORM_EXTENSION = "frm" | |
| 546 | - | |
| 547 | -# Namespaces and tags for Word2003 XML parsing: | |
| 548 | -NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}' | |
| 549 | -# the tag <w:binData w:name="editdata.mso"> contains the VBA macro code: | |
| 550 | -TAG_BINDATA = NS_W + 'binData' | |
| 551 | -ATTR_NAME = NS_W + 'name' | |
| 552 | - | |
| 553 | -# Namespaces and tags for Word/PowerPoint 2007+ XML parsing: | |
| 554 | -# root: <pkg:package xmlns:pkg="http://schemas.microsoft.com/office/2006/xmlPackage"> | |
| 555 | -NS_XMLPACKAGE = '{http://schemas.microsoft.com/office/2006/xmlPackage}' | |
| 556 | -TAG_PACKAGE = NS_XMLPACKAGE + 'package' | |
| 557 | -# the tag <pkg:part> includes <pkg:binaryData> that contains the VBA macro code in Base64: | |
| 558 | -# <pkg:part pkg:name="/word/vbaProject.bin" pkg:contentType="application/vnd.ms-office.vbaProject"><pkg:binaryData> | |
| 559 | -TAG_PKGPART = NS_XMLPACKAGE + 'part' | |
| 560 | -ATTR_PKG_NAME = NS_XMLPACKAGE + 'name' | |
| 561 | -ATTR_PKG_CONTENTTYPE = NS_XMLPACKAGE + 'contentType' | |
| 562 | -CTYPE_VBAPROJECT = "application/vnd.ms-office.vbaProject" | |
| 563 | -TAG_PKGBINDATA = NS_XMLPACKAGE + 'binaryData' | |
| 564 | - | |
# Keywords to detect auto-executable macros
# Mapping: human-readable description -> tuple of macro/event names.
# Some entries are regular expressions (see the r'\w+_...' patterns at the
# end), so presumably every entry is matched as a regex by the consumer
# - TODO confirm against the detection code.
AUTOEXEC_KEYWORDS = {
    # MS Word:
    'Runs when the Word document is opened':
        ('AutoExec', 'AutoOpen', 'DocumentOpen'),
    'Runs when the Word document is closed':
        ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose'),
    'Runs when the Word document is modified':
        ('DocumentChange',),
    'Runs when a new Word document is created':
        ('AutoNew', 'Document_New', 'NewDocument'),

    # MS Word and Publisher:
    'Runs when the Word or Publisher document is opened':
        ('Document_Open',),
    'Runs when the Publisher document is closed':
        ('Document_BeforeClose',),

    # MS Excel:
    'Runs when the Excel Workbook is opened':
        ('Auto_Open', 'Workbook_Open', 'Workbook_Activate'),
    'Runs when the Excel Workbook is closed':
        ('Auto_Close', 'Workbook_Close'),

    # any MS Office application:
    'Runs when the file is opened (using InkPicture ActiveX object)':
        # ref:https://twitter.com/joe4security/status/770691099988025345
        (r'\w+_Painted',),
    'Runs when the file is opened and ActiveX objects trigger events':
        (r'\w+_(?:GotFocus|LostFocus|MouseHover)',),
}
| 596 | - | |
# Suspicious Keywords that may be used by malware
# See VBA language reference: http://msdn.microsoft.com/en-us/library/office/jj692818%28v=office.15%29.aspx
# Mapping: human-readable description -> tuple of keywords.
# Several entries are written as regular expressions (e.g. r'exec[lv][ep]?',
# 'EnumSystemLanguageGroupsW?').
# NOTE(review): if every entry is compiled as a regex without escaping, the
# raw Windows paths below (e.g. r'C:\exec\exec.exe', r'C:\file.exe') contain
# backslash+letter sequences that the re module either rejects or interprets
# as character classes - confirm how the consumer builds its patterns.
SUSPICIOUS_KEYWORDS = {
    #TODO: use regex to support variable whitespaces
    'May read system environment variables':
        ('Environ',),
    'May open a file':
        ('Open',),
    'May write to a file (if combined with Open)':
        #TODO: regex to find Open+Write on same line
        ('Write', 'Put', 'Output', 'Print #'),
    'May read or write a binary file (if combined with Open)':
        #TODO: regex to find Open+Binary on same line
        ('Binary',),
    'May copy a file':
        ('FileCopy', 'CopyFile'),
    #FileCopy: http://msdn.microsoft.com/en-us/library/office/gg264390%28v=office.15%29.aspx
    #CopyFile: http://msdn.microsoft.com/en-us/library/office/gg264089%28v=office.15%29.aspx
    'May delete a file':
        ('Kill',),
    'May create a text file':
        ('CreateTextFile', 'ADODB.Stream', 'WriteText', 'SaveToFile'),
    #CreateTextFile: http://msdn.microsoft.com/en-us/library/office/gg264617%28v=office.15%29.aspx
    #ADODB.Stream sample: http://pastebin.com/Z4TMyuq6
    'May run an executable file or a system command':
        ('Shell', 'vbNormal', 'vbNormalFocus', 'vbHide', 'vbMinimizedFocus', 'vbMaximizedFocus', 'vbNormalNoFocus',
         'vbMinimizedNoFocus', 'WScript.Shell', 'Run', 'ShellExecute'),
    # MacScript: see https://msdn.microsoft.com/en-us/library/office/gg264812.aspx
    'May run an executable file or a system command on a Mac':
        ('MacScript',),
    'May run an executable file or a system command on a Mac (if combined with libc.dylib)':
        ('system', 'popen', r'exec[lv][ep]?'),
    #Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx
    #WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6
    'May run PowerShell commands':
        #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
        #also: https://bitbucket.org/decalage/oletools/issues/14/olevba-library-update-ioc
        # ref: https://blog.netspi.com/15-ways-to-bypass-the-powershell-execution-policy/
        # TODO: add support for keywords starting with a non-alpha character, such as "-noexit"
        # TODO: '-command', '-EncodedCommand', '-scriptblock'
        ('PowerShell', 'noexit', 'ExecutionPolicy', 'noprofile', 'command', 'EncodedCommand',
         'invoke-command', 'scriptblock', 'Invoke-Expression', 'AuthorizationManager'),
    'May run an executable file or a system command using PowerShell':
        ('Start-Process',),
    'May hide the application':
        ('Application.Visible', 'ShowWindow', 'SW_HIDE'),
    'May create a directory':
        ('MkDir',),
    'May save the current workbook':
        ('ActiveWorkbook.SaveAs',),
    'May change which directory contains files to open at startup':
        #TODO: confirm the actual effect
        ('Application.AltStartupPath',),
    'May create an OLE object':
        ('CreateObject',),
    'May create an OLE object using PowerShell':
        ('New-Object',),
    'May run an application (if combined with CreateObject)':
        ('Shell.Application',),
    'May enumerate application windows (if combined with Shell.Application object)':
        ('Windows', 'FindWindow'),
    'May run code from a DLL':
        #TODO: regex to find declare+lib on same line - see mraptor
        ('Lib',),
    'May run code from a library on a Mac':
        #TODO: regex to find declare+lib on same line - see mraptor
        ('libc.dylib', 'dylib'),
    'May inject code into another process':
        ('CreateThread', 'VirtualAlloc', # (issue #9) suggested by Davy Douhine - used by MSF payload
         'VirtualAllocEx', 'RtlMoveMemory',
         ),
    'May run a shellcode in memory':
        ('EnumSystemLanguageGroupsW?', # Used by Hancitor in Oct 2016
         'EnumDateFormats(?:W|(?:Ex){1,2})?'), # see https://msdn.microsoft.com/en-us/library/windows/desktop/dd317810(v=vs.85).aspx
    'May download files from the Internet':
        #TODO: regex to find urlmon+URLDownloadToFileA on same line
        ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP',
         'MSXML2.ServerXMLHTTP', # suggested in issue #13
         'User-Agent', # sample from @ozhermit: http://pastebin.com/MPc3iV6z
         ),
    'May download files from the Internet using PowerShell':
        #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
        ('Net.WebClient', 'DownloadFile', 'DownloadString'),
    'May control another application by simulating user keystrokes':
        ('SendKeys', 'AppActivate'),
    #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx
    'May attempt to obfuscate malicious function calls':
        ('CallByName',),
    #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx
    'May attempt to obfuscate specific strings (use option --deobf to deobfuscate)':
        #TODO: regex to find several Chr*, not just one
        ('Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor'),
    #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx
    'May read or write registry keys':
        #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
        ('RegOpenKeyExA', 'RegOpenKeyEx', 'RegCloseKey'),
    'May read registry keys':
        #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
        ('RegQueryValueExA', 'RegQueryValueEx',
         'RegRead', #with Wscript.Shell
         ),
    'May detect virtualization':
        # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
        (r'SYSTEM\ControlSet001\Services\Disk\Enum', 'VIRTUAL', 'VMWARE', 'VBOX'),
    'May detect Anubis Sandbox':
        # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
        # NOTES: this sample also checks App.EXEName but that seems to be a bug, it works in VB6 but not in VBA
        # ref: http://www.syssec-project.eu/m/page-media/3/disarm-raid11.pdf
        ('GetVolumeInformationA', 'GetVolumeInformation', # with kernel32.dll
         '1824245000', r'HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows NT\CurrentVersion\ProductId',
         '76487-337-8429955-22614', 'andy', 'sample', r'C:\exec\exec.exe', 'popupkiller'
         ),
    'May detect Sandboxie':
        # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
        # ref: http://www.cplusplus.com/forum/windows/96874/
        ('SbieDll.dll', 'SandboxieControlWndClass'),
    'May detect Sunbelt Sandbox':
        # ref: http://www.cplusplus.com/forum/windows/96874/
        (r'C:\file.exe',),
    'May detect Norman Sandbox':
        # ref: http://www.cplusplus.com/forum/windows/96874/
        ('currentuser',),
    'May detect CW Sandbox':
        # ref: http://www.cplusplus.com/forum/windows/96874/
        ('Schmidti',),
    'May detect WinJail Sandbox':
        # ref: http://www.cplusplus.com/forum/windows/96874/
        ('Afx:400000:0',),
    'May attempt to disable VBA macro security and Protected View':
        # ref: http://blog.trendmicro.com/trendlabs-security-intelligence/qkg-filecoder-self-replicating-document-encrypting-ransomware/
        # ref: https://thehackernews.com/2017/11/ms-office-macro-malware.html
        ('AccessVBOM', 'VBAWarnings', 'ProtectedView', 'DisableAttachementsInPV', 'DisableInternetFilesInPV',
         'DisableUnsafeLocationsInPV', 'blockcontentexecutionfrominternet'),
    'May attempt to modify the VBA code (self-modification)':
        ('VBProject', 'VBComponents', 'CodeModule', 'AddFromString'),
}
| 733 | - | |
# Suspicious Keywords to be searched for directly as strings, without regex
# Mapping: human-readable description -> tuple of literal strings.
# NOTE: '\b' below is a NON-raw string on purpose: it is the single
# backspace character (0x08), not the regex word-boundary escape.
SUSPICIOUS_KEYWORDS_NOREGEX = {
    'May use special characters such as backspace to obfuscate code when printed on the console':
        ('\b',),
}
| 739 | - | |
# Regular Expression for a URL:
# http://en.wikipedia.org/wiki/Uniform_resource_locator
# http://www.w3.org/Addressing/URL/uri-spec.html
# The final pattern is assembled from the string fragments below; every
# fragment is a non-capturing group so that they can be concatenated safely.
#TODO: also support username:password@server
#TODO: other protocols (file, gopher, wais, ...?)
SCHEME = r'\b(?:http|ftp)s?'
# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})'
DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + ')'
#TODO: IPv6 - see https://www.debuggex.com/
# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. [db8:0cec::99:123a]
NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'
IPv4 = r'(?:' + NUMBER_0_255 + r'\.){3}' + NUMBER_0_255
# IPv4 must come before the DNS name because it is more specific
SERVER = r'(?:' + IPv4 + '|' + DNS_NAME + ')'
PORT = r'(?:\:[0-9]{1,5})?'
SERVER_PORT = SERVER + PORT
URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?'  # [^\.\,\)\(\s"]
URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH
re_url = re.compile(URL_RE)


# Patterns to be extracted (IP addresses, URLs, etc)
# From patterns.py in balbuzard
# Each entry is a (pattern name, compiled regex) pair.
RE_PATTERNS = (
    ('URL', re.compile(URL_RE)),
    ('IPv4 address', re.compile(IPv4)),
    # TODO: add IPv6
    # NOTE: the trailing word boundary must be a RAW string: a plain '\b' is
    # the backspace character (0x08), which made this pattern unmatchable.
    ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@' + SERVER + r'\b')),
    # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')),
    # Executable file name with known extensions (except .com which is present in many URLs, and .application):
    ("Executable file name", re.compile(
        r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b")),
    # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
    # TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types
    # TODO: add win & unix file paths
    #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')),
)
| 778 | - | |
# regex to detect strings encoded in hexadecimal
# (at least four consecutive pairs of hex digits)
re_hex_string = re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')

# regex to detect strings encoded in base64
#re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?"')
# better version from balbuzard, less false positives:
# (plain version without double quotes, used also below in quoted_base64_string)
# One or more 4-character groups, optionally followed by a padded final
# group whose last data character is restricted to a fixed subset.
BASE64_RE = r'(?:[A-Za-z0-9+/]{4}){1,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)?'
# same pattern, but only when enclosed in double quotes:
re_base64_string = re.compile('"' + BASE64_RE + '"')
# white list of common strings matching the base64 regex, but which are not base64 strings (all lowercase):
BASE64_WHITELIST = set(['thisdocument', 'thisworkbook', 'test', 'temp', 'http', 'open', 'exit'])

# regex to detect strings encoded with a specific Dridex algorithm
# (see https://github.com/JamesHabben/MalwareStuff)
# (double-quoted run of at least 20 alphanumeric characters)
re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"')
# regex to check that it is not just a hex string:
# (matches any letter that is not a hexadecimal digit)
re_nothex_check = re.compile(r'[G-Zg-z]')

# regex to extract printable strings (at least 5 chars) from VBA Forms:
# (must be bytes for Python 3)
# The doubled backslashes keep the escapes literal so that the regex engine,
# not the Python bytes literal, interprets \t, \r, \n and the \x20-\xFF range.
re_printable_string = re.compile(b'[\\t\\r\\n\\x20-\\xFF]{5,}')
| 800 | - | |
| 801 | - | |
| 802 | -# === PARTIAL VBA GRAMMAR ==================================================== | |
| 803 | - | |
| 804 | -# REFERENCES: | |
| 805 | -# - [MS-VBAL]: VBA Language Specification | |
| 806 | -# https://msdn.microsoft.com/en-us/library/dd361851.aspx | |
| 807 | -# - pyparsing: http://pyparsing.wikispaces.com/ | |
| 808 | - | |
| 809 | -# TODO: set whitespaces according to VBA | |
| 810 | -# TODO: merge extended lines before parsing | |
| 811 | - | |
# Enable PackRat for better performance:
# (see https://pythonhosted.org/pyparsing/pyparsing.ParserElement-class.html#enablePackrat)
# NOTE: this is a module-level call, so it takes effect as soon as this
# module is imported.
ParserElement.enablePackrat()

# VBA identifier chars (from MS-VBAL 3.3.5)
# (letters, digits and underscore; used with WordStart below to reject
# matches that begin in the middle of an identifier)
vba_identifier_chars = alphanums + '_'
| 818 | - | |
class VbaExpressionString(str):
    """
    Marker subclass of str: behaves exactly like a plain string.

    It only exists so that strings produced by evaluating a VBA expression
    (Chr, StrReverse, etc) can be told apart from ordinary string literals.
    Every parse action that evaluates a VBA expression should wrap its
    result in VbaExpressionString, so that
    isinstance(s, VbaExpressionString) is True only for such results.
    (see detect_vba_strings)
    """
    # TODO: use Unicode everywhere instead of str
| 830 | - | |
| 831 | - | |
| 832 | -# --- NUMBER TOKENS ---------------------------------------------------------- | |
| 833 | - | |
| 834 | -# 3.3.2 Number Tokens | |
| 835 | -# INTEGER = integer-literal ["%" / "&" / "^"] | |
| 836 | -# integer-literal = decimal-literal / octal-literal / hex-literal | |
| 837 | -# decimal-literal = 1*decimal-digit | |
| 838 | -# octal-literal = "&" [%x004F / %x006F] 1*octal-digit | |
| 839 | -# ; & or &o or &O | |
| 840 | -# hex-literal = "&" (%x0048 / %x0068) 1*hex-digit | |
| 841 | -# ; &h or &H | |
| 842 | -# octal-digit = "0" / "1" / "2" / "3" / "4" / "5" / "6" / "7" | |
| 843 | -# decimal-digit = octal-digit / "8" / "9" | |
| 844 | -# hex-digit = decimal-digit / %x0041-0046 / %x0061-0066 ;A-F / a-f | |
| 845 | - | |
# NOTE: here Combine() is required to avoid spaces between elements
# NOTE: here WordStart is necessary to avoid matching a number preceded by
#       letters or underscore (e.g. "VBT1" or "ABC_34"), when using scanString
# Decimal literal: optional minus sign, digits, and an optional type suffix
# (%, & or ^) which is suppressed; the parse action yields a Python int.
decimal_literal = Combine(Optional('-') + WordStart(vba_identifier_chars) + Word(nums)
                          + Suppress(Optional(Word('%&^', exact=1))))
decimal_literal.setParseAction(lambda t: int(t[0]))

# Octal literal: '&' optionally followed by 'o'/'O' (both suppressed), then
# octal digits; the parse action converts from base 8.
octal_literal = Combine(Suppress(Literal('&') + Optional((CaselessLiteral('o')))) + Word(srange('[0-7]'))
                        + Suppress(Optional(Word('%&^', exact=1))))
octal_literal.setParseAction(lambda t: int(t[0], base=8))

# Hex literal: '&h'/'&H' (suppressed) then hex digits; the parse action
# converts from base 16.
hex_literal = Combine(Suppress(CaselessLiteral('&h')) + Word(srange('[0-9a-fA-F]'))
                      + Suppress(Optional(Word('%&^', exact=1))))
hex_literal.setParseAction(lambda t: int(t[0], base=16))

# any integer literal; alternatives are tried in this order
integer = decimal_literal | octal_literal | hex_literal
| 862 | - | |
| 863 | - | |
| 864 | -# --- QUOTED STRINGS --------------------------------------------------------- | |
| 865 | - | |
| 866 | -# 3.3.4 String Tokens | |
| 867 | -# STRING = double-quote *string-character (double-quote / line-continuation / LINE-END) | |
| 868 | -# double-quote = %x0022 ; " | |
| 869 | -# string-character = NO-LINE-CONTINUATION ((double-quote double-quote) termination-character) | |
| 870 | - | |
# Double-quoted VBA string literal; a doubled quote ("") inside the literal
# is the escape sequence for one quote character.
quoted_string = QuotedString('"', escQuote='""')
# the parse action converts the matched token to a plain str
quoted_string.setParseAction(lambda t: str(t[0]))
| 873 | - | |
| 874 | - | |
| 875 | -#--- VBA Expressions --------------------------------------------------------- | |
| 876 | - | |
| 877 | -# See MS-VBAL 5.6 Expressions | |
| 878 | - | |
# need to pre-declare using Forward() because it is recursive
# VBA string expression and integer expression
# (both placeholders are given their actual grammar later with "<<=",
# after the items that refer to them have been defined)
vba_expr_str = Forward()
vba_expr_int = Forward()
| 883 | - | |
| 884 | -# --- CHR -------------------------------------------------------------------- | |
| 885 | - | |
| 886 | -# MS-VBAL 6.1.2.11.1.4 Chr / Chr$ | |
| 887 | -# Function Chr(CharCode As Long) As Variant | |
| 888 | -# Function Chr$(CharCode As Long) As String | |
| 889 | -# Parameter Description | |
| 890 | -# CharCode Long whose value is a code point. | |
| 891 | -# Returns a String data value consisting of a single character containing the character whose code | |
| 892 | -# point is the data value of the argument. | |
| 893 | -# - If the argument is not in the range 0 to 255, Error Number 5 ("Invalid procedure call or | |
| 894 | -# argument") is raised unless the implementation supports a character set with a larger code point | |
| 895 | -# range. | |
| 896 | -# - If the argument value is in the range of 0 to 127, it is interpreted as a 7-bit ASCII code point. | |
| 897 | -# - If the argument value is in the range of 128 to 255, the code point interpretation of the value is | |
| 898 | -# implementation defined. | |
| 899 | -# - Chr$ has the same runtime semantics as Chr, however the declared type of its function result is | |
| 900 | -# String rather than Variant. | |
| 901 | - | |
| 902 | -# 6.1.2.11.1.5 ChrB / ChrB$ | |
| 903 | -# Function ChrB(CharCode As Long) As Variant | |
| 904 | -# Function ChrB$(CharCode As Long) As String | |
| 905 | -# CharCode Long whose value is a code point. | |
| 906 | -# Returns a String data value consisting of a single byte character whose code point value is the | |
| 907 | -# data value of the argument. | |
| 908 | -# - If the argument is not in the range 0 to 255, Error Number 6 ("Overflow") is raised. | |
| 909 | -# - ChrB$ has the same runtime semantics as ChrB however the declared type of its function result | |
| 910 | -# is String rather than Variant. | |
| 911 | -# - Note: the ChrB function is used with byte data contained in a String. Instead of returning a | |
| 912 | -# character, which may be one or two bytes, ChrB always returns a single byte. The ChrW function | |
| 913 | -# returns a String containing the Unicode character except on platforms where Unicode is not | |
| 914 | -# supported, in which case, the behavior is identical to the Chr function. | |
| 915 | - | |
| 916 | -# 6.1.2.11.1.6 ChrW/ ChrW$ | |
| 917 | -# Function ChrW(CharCode As Long) As Variant | |
| 918 | -# Function ChrW$(CharCode As Long) As String | |
| 919 | -# CharCode Long whose value is a code point. | |
| 920 | -# Returns a String data value consisting of a single character containing the character whose code | |
| 921 | -# point is the data value of the argument. | |
| 922 | -# - If the argument is not in the range -32,767 to 65,535 then Error Number 5 ("Invalid procedure | |
| 923 | -# call or argument") is raised. | |
| 924 | -# - If the argument is a negative value it is treated as if it was the value: CharCode + 65,536. | |
| 925 | -# - If the implemented uses 16-bit Unicode code points argument, data value is interpreted as a 16- | |
| 926 | -# bit Unicode code point. | |
| 927 | -# - If the implementation does not support Unicode, ChrW has the same semantics as Chr. | |
| 928 | -# - ChrW$ has the same runtime semantics as ChrW, however the declared type of its function result | |
| 929 | -# is String rather than Variant. | |
| 930 | - | |
# Chr, Chr$, ChrB, ChrW(int) => char
# The function name (with optional B/W variant and optional '$') and both
# parentheses are suppressed, so the only token left for the parse action is
# the evaluated integer argument. As a consequence the parse action cannot
# tell which Chr variant was used.
vba_chr = Suppress(
    Combine(WordStart(vba_identifier_chars) + CaselessLiteral('Chr')
            + Optional(CaselessLiteral('B') | CaselessLiteral('W')) + Optional('$'))
    + '(') + vba_expr_int + Suppress(')')
| 936 | - | |
def vba_chr_tostr(t):
    """
    Parse action for vba_chr: convert the integer argument of a
    Chr/ChrB/ChrW call into the corresponding one-character string.

    :param t: parse results, t[0] is the evaluated integer argument
    :return: VbaExpressionString containing the character, or the text
        'Chr(<value>)' if the value cannot be converted to a character
    """
    # read the argument outside of the try block, so that it is always
    # defined when the error handler needs to report it:
    i = t[0]
    try:
        # On Python 3, chr() covers the whole Unicode range and returns a
        # str, so there is no need for the Python 2 distinction between
        # chr() and unichr(): unichr does not exist on Python 3, and
        # encoding the result to bytes would end up storing the "b'...'"
        # representation in the resulting string.
        return VbaExpressionString(chr(i))
    except (ValueError, OverflowError, TypeError):
        # negative, too large or non-integer value: keep the expression as text
        log.exception('ERROR: incorrect parameter value for chr(): %r' % i)
        return VbaExpressionString('Chr(%r)' % i)

vba_chr.setParseAction(vba_chr_tostr)
| 953 | - | |
| 954 | - | |
| 955 | -# --- ASC -------------------------------------------------------------------- | |
| 956 | - | |
# Asc(char) => int
#TODO: see MS-VBAL 6.1.2.11.1.1 page 240 => AscB, AscW
# The keyword and parentheses are suppressed; the parse action receives the
# evaluated string argument as t[0].
# NOTE(review): ord() raises TypeError unless t[0] is exactly one character
# long - confirm whether longer or empty arguments need to be handled.
vba_asc = Suppress(CaselessKeyword('Asc') + '(') + vba_expr_str + Suppress(')')
vba_asc.setParseAction(lambda t: ord(t[0]))


# --- VAL --------------------------------------------------------------------

# Val(string) => int
# TODO: make sure the behavior of VBA's val is fully covered
# NOTE(review): int() raises ValueError when the stripped string is not a
# plain integer - confirm whether such arguments can reach this action.
vba_val = Suppress(CaselessKeyword('Val') + '(') + vba_expr_str + Suppress(')')
vba_val.setParseAction(lambda t: int(t[0].strip()))


# --- StrReverse() --------------------------------------------------------------------

# StrReverse(string) => string
# (the argument is reversed with a [::-1] slice)
strReverse = Suppress(CaselessKeyword('StrReverse') + '(') + vba_expr_str + Suppress(')')
strReverse.setParseAction(lambda t: VbaExpressionString(str(t[0])[::-1]))


# --- ENVIRON() --------------------------------------------------------------------

# Environ("name") => just translated to "%name%", that is enough for malware analysis
# (in the format string below, each '%%' is a literal percent sign)
environ = Suppress(CaselessKeyword('Environ') + '(') + vba_expr_str + Suppress(')')
environ.setParseAction(lambda t: VbaExpressionString('%%%s%%' % t[0]))


# --- IDENTIFIER -------------------------------------------------------------

#TODO: see MS-VBAL 3.3.5 page 33
# 3.3.5 Identifier Tokens
# Latin-identifier = first-Latin-identifier-character *subsequent-Latin-identifier-character
# first-Latin-identifier-character = (%x0041-005A / %x0061-007A) ; A-Z / a-z
# subsequent-Latin-identifier-character = first-Latin-identifier-character / DIGIT / %x5F ; underscore
# i.e. a letter followed by any number of letters, digits or underscores:
latin_identifier = Word(initChars=alphas, bodyChars=alphanums + '_')
| 993 | - | |
# --- HEX FUNCTION -----------------------------------------------------------

# match any custom function name with a hex string as argument:
# TODO: accept vba_expr_str_item as argument, check if it is a hex or base64 string at runtime

# quoted string of at least two hexadecimal numbers of two digits:
quoted_hex_string = Suppress('"') + Combine(Word(hexnums, exact=2) * (2, None)) + Suppress('"')
quoted_hex_string.setParseAction(lambda t: str(t[0]))

# The function name and parentheses are suppressed, only the hex string is
# kept (as the named result 'hex_string').
hex_function_call = Suppress(latin_identifier) + Suppress('(') + \
                    quoted_hex_string('hex_string') + Suppress(')')
# binascii returns bytes on Python 3: decode them as latin-1 (one character
# per byte, never fails) so that the result is real text. Passing the bytes
# object directly to a str subclass would store its "b'...'" representation.
hex_function_call.setParseAction(
    lambda t: VbaExpressionString(binascii.a2b_hex(t.hex_string).decode('latin-1')))


# --- BASE64 FUNCTION -----------------------------------------------------------

# match any custom function name with a Base64 string as argument:
# TODO: accept vba_expr_str_item as argument, check if it is a hex or base64 string at runtime

# quoted string matching the Base64 regex (BASE64_RE):
quoted_base64_string = Suppress('"') + Regex(BASE64_RE) + Suppress('"')
quoted_base64_string.setParseAction(lambda t: str(t[0]))

# The function name and parentheses are suppressed, only the Base64 string is
# kept (as the named result 'base64_string').
base64_function_call = Suppress(latin_identifier) + Suppress('(') + \
                       quoted_base64_string('base64_string') + Suppress(')')
# same as for hex strings: decode the bytes returned by binascii as latin-1
base64_function_call.setParseAction(
    lambda t: VbaExpressionString(binascii.a2b_base64(t.base64_string).decode('latin-1')))
| 1020 | - | |
| 1021 | - | |
| 1022 | -# ---STRING EXPRESSION ------------------------------------------------------- | |
| 1023 | - | |
def concat_strings_list(tokens):
    """
    Parse action joining the operands of a VBA string concatenation
    (operators '+' or '&') into one string.

    :param tokens: parse results; tokens[0] is a sequence alternating
        operands and operators, such as [a, '&', b, '&', c, ...]
    :return: VbaExpressionString, all the operands joined together
    """
    # operands are at even positions, operators at odd positions:
    operands = (item for position, item in enumerate(tokens[0])
                if position % 2 == 0)
    joined = ''.join(operands)
    return VbaExpressionString(joined)
| 1032 | - | |
| 1033 | - | |
# A single operand of a string expression; alternatives are tried in this order.
vba_expr_str_item = (vba_chr | strReverse | environ | quoted_string | hex_function_call | base64_function_call)

# Give the forward-declared vba_expr_str its grammar: operands combined with
# the binary operators '+' and '&', both evaluated by concat_strings_list.
vba_expr_str <<= infixNotation(vba_expr_str_item,
    [
        ("+", 2, opAssoc.LEFT, concat_strings_list),
        ("&", 2, opAssoc.LEFT, concat_strings_list),
    ])
| 1041 | - | |
| 1042 | - | |
| 1043 | -# --- INTEGER EXPRESSION ------------------------------------------------------- | |
| 1044 | - | |
def sum_ints_list(tokens):
    """
    Parse action adding up the operands of a VBA integer expression
    using the operator '+'.

    :param tokens: parse results; tokens[0] is a sequence alternating
        integers and operators, such as [a, '+', b, '+', c, ...]
    :return: sum of all the integer operands
    """
    total = 0
    # operands are at even positions, operators at odd positions:
    for position, item in enumerate(tokens[0]):
        if position % 2 == 0:
            total = total + item
    return total
| 1053 | - | |
| 1054 | - | |
def subtract_ints_list(tokens):
    """
    Parse action evaluating a chain of subtractions in a VBA integer
    expression (operator '-'), from left to right.

    :param tokens: parse results; tokens[0] is a sequence alternating
        integers and operators, such as [a, '-', b, '-', c, ...]
    :return: a - b - c - ...
    """
    def _minus(left, right):
        return left - right

    # operands are at even positions, operators at odd positions:
    operands = tokens[0][0::2]
    return reduce(_minus, operands)
| 1063 | - | |
| 1064 | - | |
def multiply_ints_list(tokens):
    """
    Parse action evaluating a chain of multiplications in a VBA integer
    expression (operator '*').

    :param tokens: parse results; tokens[0] is a sequence alternating
        integers and operators, such as [a, '*', b, '*', c, ...]
    :return: a * b * c * ...
    """
    def _times(left, right):
        return left * right

    # operands are at even positions, operators at odd positions:
    operands = tokens[0][0::2]
    return reduce(_times, operands)
| 1073 | - | |
| 1074 | - | |
def divide_ints_list(tokens):
    """
    Parse action evaluating a chain of divisions in a VBA integer
    expression (operator '/'), from left to right.

    The result is always an integer: integer (floor) division is used, so
    that the value can be passed on to functions expecting an int, such as
    chr(). On Python 3 the plain '/' operator would return a float.

    :param tokens: parse results; tokens[0] is a sequence alternating
        integers and operators, such as [a, '/', b, '/', c, ...]
    :return: int, result of a / b / c / ... using integer division
    :raises ZeroDivisionError: if one of the divisors is zero
    """
    # reduce is not a builtin on Python 3, import it locally so that this
    # function does not depend on the module-level imports:
    from functools import reduce
    # operands are at even positions, operators at odd positions:
    integers = tokens[0][::2]
    return reduce(lambda x, y: x // y, integers)
| 1083 | - | |
| 1084 | - | |
# A single operand of an integer expression; alternatives are tried in this order.
vba_expr_int_item = (vba_asc | vba_val | integer)

# operators associativity:
# https://en.wikipedia.org/wiki/Operator_associativity

# Give the forward-declared vba_expr_int its grammar: operands combined with
# the four binary arithmetic operators, each with its own parse action.
# NOTE(review): each entry of the list is a separate precedence level
# (first entry binds tightest in pyparsing's infixNotation), so '*' and '/'
# do not share a level here, nor do '-' and '+'. Presumably "8 / 2 * 2" is
# grouped as 8 / (2 * 2) - confirm whether this is intended.
vba_expr_int <<= infixNotation(vba_expr_int_item,
    [
        ("*", 2, opAssoc.LEFT, multiply_ints_list),
        ("/", 2, opAssoc.LEFT, divide_ints_list),
        ("-", 2, opAssoc.LEFT, subtract_ints_list),
        ("+", 2, opAssoc.LEFT, sum_ints_list),
    ])
| 1097 | - | |
| 1098 | - | |
| 1099 | -# see detect_vba_strings for the deobfuscation code using this grammar | |
| 1100 | - | |
| 1101 | -# === MSO/ActiveMime files parsing =========================================== | |
| 1102 | - | |
def is_mso_file(data):
    """
    Tell whether data looks like the content of a MSO/ActiveMime file.

    Such files are created by Outlook in some cases, or by Word/Excel when
    saving a file with the MHTML format or the Word 2003 XML format.
    Only the ActiveMime magic at the very beginning of data is checked.

    :param data: bytes string, MSO/ActiveMime file content
    :return: bool, True if the file is MSO, False otherwise
    """
    has_activemime_magic = data.startswith(MSO_ACTIVEMIME_HEADER)
    return has_activemime_magic
| 1113 | - | |
| 1114 | - | |
# regex to find zlib block headers, starting with byte 0x78 = 'x'
# NOTE: this must be a bytes pattern, because it is applied to the raw bytes
# content of MSO files: on Python 3 a str pattern cannot be used to search
# a bytes object (TypeError).
re_zlib_header = re.compile(b'x')
| 1117 | - | |
| 1118 | - | |
def mso_file_extract(data):
    """
    Decompress and return the payload stored in a MSO/ActiveMime file, such
    as the ones created by Outlook in some cases, or Word/Excel when saving
    a file with the MHTML format or the Word 2003 XML format.

    Three strategies are tried in turn: the offset read from the header,
    then two fixed offsets, then every position found by re_zlib_header.

    :param data: bytes string, MSO/ActiveMime file content
    :return: bytes string, extracted data (uncompressed)

    raise a MsoExtractionError if the data cannot be extracted
    """
    # check the magic:
    assert is_mso_file(data)

    # First, attempt to get the compressed data offset from the header
    # According to my tests, it should be an unsigned 16 bits integer,
    # at offset 0x1E (little endian) + add 46:
    try:
        (header_value,) = struct.unpack_from('<H', data, offset=0x1E)
        header_offset = header_value + 46
        log.debug('Parsing MSO file: data offset = 0x%X' % header_offset)
    except struct.error as exc:
        log.info('Unable to parse MSO/ActiveMime file header (%s)' % exc)
        log.debug('Trace:', exc_info=True)
        raise MsoExtractionError('Unable to parse MSO/ActiveMime file header')

    # In all the samples seen so far, Word always uses an offset of 0x32,
    # and Excel 0x22A. The offset read from the header is tried first to be
    # more generic, then those two known values.
    candidate_offsets = [header_offset, 0x32, 0x22A]
    for start in candidate_offsets:
        log.debug('Attempting zlib decompression from MSO file offset 0x%X' % start)
        try:
            return zlib.decompress(data[start:])
        except zlib.error as exc:
            log.info('zlib decompression failed for offset %s (%s)'
                     % (start, exc))
            log.debug('Trace:', exc_info=True)

    # None of the guessed offsets worked, let's try brute-forcing by looking
    # for potential zlib-compressed blocks starting with 0x78:
    log.debug('Looking for potential zlib-compressed blocks in MSO file')
    for header_match in re_zlib_header.finditer(data):
        start = header_match.start()
        log.debug('Attempting zlib decompression from MSO file offset 0x%X' % start)
        try:
            return zlib.decompress(data[start:])
        except zlib.error as exc:
            log.info('zlib decompression failed (%s)' % exc)
            log.debug('Trace:', exc_info=True)

    # every attempt failed:
    raise MsoExtractionError('Unable to decompress data from a MSO/ActiveMime file')
| 1172 | - | |
| 1173 | - | |
| 1174 | -#--- FUNCTIONS ---------------------------------------------------------------- | |
| 1175 | - | |
# reference set for is_printable: all the characters of string.printable
_PRINTABLE_SET = set(string.printable)

def is_printable(s):
    """
    Tell whether string s is made only of printable ASCII characters,
    i.e. characters that all belong to string.printable.
    An empty string is reported as printable.

    :param s: str
    :return: bool, True if no character of s is outside string.printable
    """
    # approach inspired from
    # http://stackoverflow.com/questions/3636928/test-if-a-python-string-is-printable
    # the printable set must contain every distinct character of s:
    return _PRINTABLE_SET.issuperset(s)
| 1190 | - | |
| 1191 | - | |
def copytoken_help(decompressed_current, decompressed_chunk_start):
    """
    compute bit masks to decode a CopyToken according to MS-OVBA 2.4.1.3.19.1 CopyToken Help

    decompressed_current: number of decompressed bytes so far, i.e. len(decompressed_container)
    decompressed_chunk_start: offset of the current chunk in the decompressed container
    return length_mask, offset_mask, bit_count, maximum_length

    raise a ValueError if no byte of the current chunk has been decompressed
    yet (decompressed_current <= decompressed_chunk_start): a CopyToken
    cannot reference anything in that case, so the data is malformed.
    """
    difference = decompressed_current - decompressed_chunk_start
    if difference < 1:
        # math.log used to fail here with an obscure "math domain error"
        raise ValueError('invalid CopyToken: no decompressed data to copy from in the current chunk')
    # bit_count = ceil(log2(difference)), with a minimum of 4.
    # (difference - 1).bit_length() gives that value with exact integer
    # arithmetic, avoiding the rounding errors of floating point logarithms.
    bit_count = max((difference - 1).bit_length(), 4)
    # the low (16 - bit_count) bits of a CopyToken hold the length,
    # the remaining high bits hold the offset:
    length_mask = 0xFFFF >> bit_count
    offset_mask = ~length_mask
    maximum_length = length_mask + 3
    return length_mask, offset_mask, bit_count, maximum_length
| 1207 | - | |
| 1208 | - | |
def decompress_stream(compressed_container):
    """
    Decompress a stream according to MS-OVBA section 2.4.1

    :param compressed_container: bytearray, data compressed according to the
        MS-OVBA 2.4.1.3.6 Compression algorithm
    :return: bytes, the decompressed container

    raise a TypeError if compressed_container is not a bytearray
    raise a ValueError if the signature byte or a chunk header is invalid
    """
    # State variables, named after MS-OVBA 2.4.1.2:
    # - compressed_current: location of the next byte to be read in the
    #   CompressedContainer (section 2.4.1.1.1)
    # - compressed_chunk_start: location of the first byte of the current
    #   CompressedChunk (section 2.4.1.1.4) within the CompressedContainer
    # - compressed_end: location of the byte after the last byte of the
    #   current CompressedChunk, capped to the end of the container
    # - len(decompressed_container): DecompressedCurrent, location of the next
    #   byte to be written in the DecompressedBuffer (section 2.4.1.1.2)
    # - decompressed_chunk_start: location of the first byte of the current
    #   DecompressedChunk (section 2.4.1.1.3) within the DecompressedBuffer

    # Check the input is a bytearray:
    if not isinstance(compressed_container, bytearray):
        raise TypeError('decompress_stream requires a bytearray as input')
    decompressed_container = bytearray()  # result
    compressed_current = 0

    # the container starts with a signature byte, which must be 0x01
    sig_byte = compressed_container[compressed_current]
    if sig_byte != 0x01:
        raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))

    compressed_current += 1

    #NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that
    # CompressedRecordEnd = len(compressed_container)
    while compressed_current < len(compressed_container):
        # 2.4.1.1.5
        compressed_chunk_start = compressed_current
        # chunk header = first 16 bits
        compressed_chunk_header = \
            struct.unpack("<H", compressed_container[compressed_chunk_start:compressed_chunk_start + 2])[0]
        # chunk size = 12 first bits of header + 3
        chunk_size = (compressed_chunk_header & 0x0FFF) + 3
        # chunk signature = 3 next bits - should always be 0b011
        chunk_signature = (compressed_chunk_header >> 12) & 0x07
        if chunk_signature != 0b011:
            raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream')
        # chunk flag = next bit - 1 == compressed, 0 == uncompressed
        chunk_flag = (compressed_chunk_header >> 15) & 0x01
        log.debug("chunk size = {0}, compressed flag = {1}".format(chunk_size, chunk_flag))

        #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096)
        # The minimum size is 3 bytes
        # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value
        # in chunk header before adding 3.
        # Also the first test is not useful since a 12 bits value cannot be larger than 4095.
        if chunk_flag == 1 and chunk_size > 4098:
            raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1')
        if chunk_flag == 0 and chunk_size != 4098:
            raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0')

        # check if chunk_size goes beyond the compressed data, instead of silently cutting it:
        #TODO: raise an exception?
        if compressed_chunk_start + chunk_size > len(compressed_container):
            log.warning('Chunk size is larger than remaining compressed data')
        compressed_end = min([len(compressed_container), compressed_chunk_start + chunk_size])
        # read after chunk header:
        compressed_current = compressed_chunk_start + 2

        if chunk_flag == 0:
            # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
            # uncompressed chunk: read the next 4096 bytes as-is
            #TODO: check if there are at least 4096 bytes left
            # The slice itself is passed to extend: wrapping it in a list
            # would make extend() choke on a bytearray item (TypeError).
            decompressed_container.extend(compressed_container[compressed_current:compressed_current + 4096])
            compressed_current += 4096
        else:
            # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
            # compressed chunk
            decompressed_chunk_start = len(decompressed_container)
            while compressed_current < compressed_end:
                # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence
                # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
                # copy tokens (reference to a previous literal token)
                flag_byte = compressed_container[compressed_current]
                compressed_current += 1
                for bit_index in range(8):
                    # the last token sequence of a chunk may hold less than 8 tokens
                    if compressed_current >= compressed_end:
                        break
                    # MS-OVBA 2.4.1.3.5 Decompressing a Token
                    # MS-OVBA 2.4.1.3.17 Extract FlagBit
                    flag_bit = (flag_byte >> bit_index) & 1
                    if flag_bit == 0:  # LiteralToken
                        # copy one byte directly to output
                        decompressed_container.append(compressed_container[compressed_current])
                        compressed_current += 1
                    else:  # CopyToken
                        # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
                        copy_token = \
                            struct.unpack("<H", compressed_container[compressed_current:compressed_current + 2])[0]
                        #TODO: check this
                        length_mask, offset_mask, bit_count, _ = copytoken_help(
                            len(decompressed_container), decompressed_chunk_start)
                        length = (copy_token & length_mask) + 3
                        temp1 = copy_token & offset_mask
                        temp2 = 16 - bit_count
                        offset = (temp1 >> temp2) + 1
                        copy_source = len(decompressed_container) - offset
                        # Copy byte by byte: when length > offset the source
                        # range runs into bytes appended by this very loop.
                        for index in range(copy_source, copy_source + length):
                            decompressed_container.append(decompressed_container[index])
                        compressed_current += 2
    return bytes(decompressed_container)
| 1330 | - | |
| 1331 | - | |
| 1332 | -def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): | |
| 1333 | - """ | |
| 1334 | - Extract VBA macros from an OleFileIO object. | |
| 1335 | - Internal function, do not call directly. | |
| 1336 | - | |
| 1337 | - vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream | |
| 1338 | - vba_project: path to the PROJECT stream | |
| 1339 | - :param relaxed: If True, only create info/debug log entry if data is not as expected | |
| 1340 | - (e.g. opening substream fails); if False, raise an error in this case | |
| 1341 | - This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream | |
| 1342 | - """ | |
| 1343 | - # Open the PROJECT stream: | |
| 1344 | - project = ole.openstream(project_path) | |
| 1345 | - log.debug('relaxed is %s' % relaxed) | |
| 1346 | - | |
| 1347 | - # sample content of the PROJECT stream: | |
| 1348 | - | |
| 1349 | - ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}" | |
| 1350 | - ## Document=ThisDocument/&H00000000 | |
| 1351 | - ## Module=NewMacros | |
| 1352 | - ## Name="Project" | |
| 1353 | - ## HelpContextID="0" | |
| 1354 | - ## VersionCompatible32="393222000" | |
| 1355 | - ## CMG="F1F301E705E705E705E705" | |
| 1356 | - ## DPB="8F8D7FE3831F2020202020" | |
| 1357 | - ## GC="2D2FDD81E51EE61EE6E1" | |
| 1358 | - ## | |
| 1359 | - ## [Host Extender Info] | |
| 1360 | - ## &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000 | |
| 1361 | - ## &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000 | |
| 1362 | - ## | |
| 1363 | - ## [Workspace] | |
| 1364 | - ## ThisDocument=22, 29, 339, 477, Z | |
| 1365 | - ## NewMacros=-4, 42, 832, 510, C | |
| 1366 | - | |
| 1367 | - code_modules = {} | |
| 1368 | - | |
| 1369 | - for line in project: | |
| 1370 | - line = line.strip().decode('utf-8','ignore') | |
| 1371 | - if '=' in line: | |
| 1372 | - # split line at the 1st equal sign: | |
| 1373 | - name, value = line.split('=', 1) | |
| 1374 | - # looking for code modules | |
| 1375 | - # add the code module as a key in the dictionary | |
| 1376 | - # the value will be the extension needed later | |
| 1377 | - # The value is converted to lowercase, to allow case-insensitive matching (issue #3) | |
| 1378 | - value = value.lower() | |
| 1379 | - if name == 'Document': | |
| 1380 | - # split value at the 1st slash, keep 1st part: | |
| 1381 | - value = value.split('/', 1)[0] | |
| 1382 | - code_modules[value] = CLASS_EXTENSION | |
| 1383 | - elif name == 'Module': | |
| 1384 | - code_modules[value] = MODULE_EXTENSION | |
| 1385 | - elif name == 'Class': | |
| 1386 | - code_modules[value] = CLASS_EXTENSION | |
| 1387 | - elif name == 'BaseClass': | |
| 1388 | - code_modules[value] = FORM_EXTENSION | |
| 1389 | - | |
| 1390 | - # read data from dir stream (compressed) | |
| 1391 | - dir_compressed = ole.openstream(dir_path).read() | |
| 1392 | - | |
| 1393 | - def check_value(name, expected, value): | |
| 1394 | - if expected != value: | |
| 1395 | - if relaxed: | |
| 1396 | - log.error("invalid value for {0} expected {1:04X} got {2:04X}" | |
| 1397 | - .format(name, expected, value)) | |
| 1398 | - else: | |
| 1399 | - raise UnexpectedDataError(dir_path, name, expected, value) | |
| 1400 | - | |
| 1401 | - dir_stream = BytesIO(decompress_stream(bytearray(dir_compressed))) | |
| 1402 | - | |
| 1403 | - # PROJECTSYSKIND Record | |
| 1404 | - projectsyskind_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1405 | - check_value('PROJECTSYSKIND_Id', 0x0001, projectsyskind_id) | |
| 1406 | - projectsyskind_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1407 | - check_value('PROJECTSYSKIND_Size', 0x0004, projectsyskind_size) | |
| 1408 | - projectsyskind_syskind = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1409 | - if projectsyskind_syskind == 0x00: | |
| 1410 | - log.debug("16-bit Windows") | |
| 1411 | - elif projectsyskind_syskind == 0x01: | |
| 1412 | - log.debug("32-bit Windows") | |
| 1413 | - elif projectsyskind_syskind == 0x02: | |
| 1414 | - log.debug("Macintosh") | |
| 1415 | - elif projectsyskind_syskind == 0x03: | |
| 1416 | - log.debug("64-bit Windows") | |
| 1417 | - else: | |
| 1418 | - log.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(projectsyskind_syskind)) | |
| 1419 | - | |
| 1420 | - # PROJECTLCID Record | |
| 1421 | - projectlcid_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1422 | - check_value('PROJECTLCID_Id', 0x0002, projectlcid_id) | |
| 1423 | - projectlcid_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1424 | - check_value('PROJECTLCID_Size', 0x0004, projectlcid_size) | |
| 1425 | - projectlcid_lcid = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1426 | - check_value('PROJECTLCID_Lcid', 0x409, projectlcid_lcid) | |
| 1427 | - | |
| 1428 | - # PROJECTLCIDINVOKE Record | |
| 1429 | - projectlcidinvoke_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1430 | - check_value('PROJECTLCIDINVOKE_Id', 0x0014, projectlcidinvoke_id) | |
| 1431 | - projectlcidinvoke_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1432 | - check_value('PROJECTLCIDINVOKE_Size', 0x0004, projectlcidinvoke_size) | |
| 1433 | - projectlcidinvoke_lcidinvoke = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1434 | - check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, projectlcidinvoke_lcidinvoke) | |
| 1435 | - | |
| 1436 | - # PROJECTCODEPAGE Record | |
| 1437 | - projectcodepage_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1438 | - check_value('PROJECTCODEPAGE_Id', 0x0003, projectcodepage_id) | |
| 1439 | - projectcodepage_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1440 | - check_value('PROJECTCODEPAGE_Size', 0x0002, projectcodepage_size) | |
| 1441 | - projectcodepage_codepage = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1442 | - | |
| 1443 | - # PROJECTNAME Record | |
| 1444 | - projectname_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1445 | - check_value('PROJECTNAME_Id', 0x0004, projectname_id) | |
| 1446 | - projectname_sizeof_projectname = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1447 | - if projectname_sizeof_projectname < 1 or projectname_sizeof_projectname > 128: | |
| 1448 | - log.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(projectname_sizeof_projectname)) | |
| 1449 | - projectname_projectname = dir_stream.read(projectname_sizeof_projectname) | |
| 1450 | - unused = projectname_projectname | |
| 1451 | - | |
| 1452 | - # PROJECTDOCSTRING Record | |
| 1453 | - projectdocstring_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1454 | - check_value('PROJECTDOCSTRING_Id', 0x0005, projectdocstring_id) | |
| 1455 | - projectdocstring_sizeof_docstring = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1456 | - if projectdocstring_sizeof_docstring > 2000: | |
| 1457 | - log.error( | |
| 1458 | - "PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(projectdocstring_sizeof_docstring)) | |
| 1459 | - projectdocstring_docstring = dir_stream.read(projectdocstring_sizeof_docstring) | |
| 1460 | - projectdocstring_reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1461 | - check_value('PROJECTDOCSTRING_Reserved', 0x0040, projectdocstring_reserved) | |
| 1462 | - projectdocstring_sizeof_docstring_unicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1463 | - if projectdocstring_sizeof_docstring_unicode % 2 != 0: | |
| 1464 | - log.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even") | |
| 1465 | - projectdocstring_docstring_unicode = dir_stream.read(projectdocstring_sizeof_docstring_unicode) | |
| 1466 | - unused = projectdocstring_docstring | |
| 1467 | - unused = projectdocstring_docstring_unicode | |
| 1468 | - | |
| 1469 | - # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7 | |
| 1470 | - projecthelpfilepath_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1471 | - check_value('PROJECTHELPFILEPATH_Id', 0x0006, projecthelpfilepath_id) | |
| 1472 | - projecthelpfilepath_sizeof_helpfile1 = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1473 | - if projecthelpfilepath_sizeof_helpfile1 > 260: | |
| 1474 | - log.error( | |
| 1475 | - "PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(projecthelpfilepath_sizeof_helpfile1)) | |
| 1476 | - projecthelpfilepath_helpfile1 = dir_stream.read(projecthelpfilepath_sizeof_helpfile1) | |
| 1477 | - projecthelpfilepath_reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1478 | - check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, projecthelpfilepath_reserved) | |
| 1479 | - projecthelpfilepath_sizeof_helpfile2 = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1480 | - if projecthelpfilepath_sizeof_helpfile2 != projecthelpfilepath_sizeof_helpfile1: | |
| 1481 | - log.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2") | |
| 1482 | - projecthelpfilepath_helpfile2 = dir_stream.read(projecthelpfilepath_sizeof_helpfile2) | |
| 1483 | - if projecthelpfilepath_helpfile2 != projecthelpfilepath_helpfile1: | |
| 1484 | - log.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2") | |
| 1485 | - | |
| 1486 | - # PROJECTHELPCONTEXT Record | |
| 1487 | - projecthelpcontext_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1488 | - check_value('PROJECTHELPCONTEXT_Id', 0x0007, projecthelpcontext_id) | |
| 1489 | - projecthelpcontext_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1490 | - check_value('PROJECTHELPCONTEXT_Size', 0x0004, projecthelpcontext_size) | |
| 1491 | - projecthelpcontext_helpcontext = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1492 | - unused = projecthelpcontext_helpcontext | |
| 1493 | - | |
| 1494 | - # PROJECTLIBFLAGS Record | |
| 1495 | - projectlibflags_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1496 | - check_value('PROJECTLIBFLAGS_Id', 0x0008, projectlibflags_id) | |
| 1497 | - projectlibflags_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1498 | - check_value('PROJECTLIBFLAGS_Size', 0x0004, projectlibflags_size) | |
| 1499 | - projectlibflags_projectlibflags = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1500 | - check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, projectlibflags_projectlibflags) | |
| 1501 | - | |
| 1502 | - # PROJECTVERSION Record | |
| 1503 | - projectversion_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1504 | - check_value('PROJECTVERSION_Id', 0x0009, projectversion_id) | |
| 1505 | - projectversion_reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1506 | - check_value('PROJECTVERSION_Reserved', 0x0004, projectversion_reserved) | |
| 1507 | - projectversion_versionmajor = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1508 | - projectversion_versionminor = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1509 | - unused = projectversion_versionmajor | |
| 1510 | - unused = projectversion_versionminor | |
| 1511 | - | |
| 1512 | - # PROJECTCONSTANTS Record | |
| 1513 | - projectconstants_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1514 | - check_value('PROJECTCONSTANTS_Id', 0x000C, projectconstants_id) | |
| 1515 | - projectconstants_sizeof_constants = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1516 | - if projectconstants_sizeof_constants > 1015: | |
| 1517 | - log.error( | |
| 1518 | - "PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(projectconstants_sizeof_constants)) | |
| 1519 | - projectconstants_constants = dir_stream.read(projectconstants_sizeof_constants) | |
| 1520 | - projectconstants_reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1521 | - check_value('PROJECTCONSTANTS_Reserved', 0x003C, projectconstants_reserved) | |
| 1522 | - projectconstants_sizeof_constants_unicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1523 | - if projectconstants_sizeof_constants_unicode % 2 != 0: | |
| 1524 | - log.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even") | |
| 1525 | - projectconstants_constants_unicode = dir_stream.read(projectconstants_sizeof_constants_unicode) | |
| 1526 | - unused = projectconstants_constants | |
| 1527 | - unused = projectconstants_constants_unicode | |
| 1528 | - | |
| 1529 | - # array of REFERENCE records | |
| 1530 | - check = None | |
| 1531 | - while True: | |
| 1532 | - check = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1533 | - log.debug("reference type = {0:04X}".format(check)) | |
| 1534 | - if check == 0x000F: | |
| 1535 | - break | |
| 1536 | - | |
| 1537 | - if check == 0x0016: | |
| 1538 | - # REFERENCENAME | |
| 1539 | - reference_id = check | |
| 1540 | - reference_sizeof_name = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1541 | - reference_name = dir_stream.read(reference_sizeof_name) | |
| 1542 | - reference_reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1543 | - # According to [MS-OVBA] 2.3.4.2.2.2 REFERENCENAME Record: | |
| 1544 | - # "Reserved (2 bytes): MUST be 0x003E. MUST be ignored." | |
| 1545 | - # So let's ignore it, otherwise it crashes on some files (issue #132) | |
| 1546 | - # PR #135 by @c1fe: | |
| 1547 | - # contrary to the specification I think that the unicode name | |
| 1548 | - # is optional. if reference_reserved is not 0x003E I think it | |
| 1549 | - # is actually the start of another REFERENCE record | |
| 1550 | - # at least when projectsyskind_syskind == 0x02 (Macintosh) | |
| 1551 | - if reference_reserved == 0x003E: | |
| 1552 | - #if reference_reserved not in (0x003E, 0x000D): | |
| 1553 | - # raise UnexpectedDataError(dir_path, 'REFERENCE_Reserved', | |
| 1554 | - # 0x0003E, reference_reserved) | |
| 1555 | - reference_sizeof_name_unicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1556 | - reference_name_unicode = dir_stream.read(reference_sizeof_name_unicode) | |
| 1557 | - unused = reference_id | |
| 1558 | - unused = reference_name | |
| 1559 | - unused = reference_name_unicode | |
| 1560 | - continue | |
| 1561 | - else: | |
| 1562 | - check = reference_reserved | |
| 1563 | - log.debug("reference type = {0:04X}".format(check)) | |
| 1564 | - | |
| 1565 | - if check == 0x0033: | |
| 1566 | - # REFERENCEORIGINAL (followed by REFERENCECONTROL) | |
| 1567 | - referenceoriginal_id = check | |
| 1568 | - referenceoriginal_sizeof_libidoriginal = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1569 | - referenceoriginal_libidoriginal = dir_stream.read(referenceoriginal_sizeof_libidoriginal) | |
| 1570 | - unused = referenceoriginal_id | |
| 1571 | - unused = referenceoriginal_libidoriginal | |
| 1572 | - continue | |
| 1573 | - | |
| 1574 | - if check == 0x002F: | |
| 1575 | - # REFERENCECONTROL | |
| 1576 | - referencecontrol_id = check | |
| 1577 | - referencecontrol_sizetwiddled = struct.unpack("<L", dir_stream.read(4))[0] # ignore | |
| 1578 | - referencecontrol_sizeof_libidtwiddled = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1579 | - referencecontrol_libidtwiddled = dir_stream.read(referencecontrol_sizeof_libidtwiddled) | |
| 1580 | - referencecontrol_reserved1 = struct.unpack("<L", dir_stream.read(4))[0] # ignore | |
| 1581 | - check_value('REFERENCECONTROL_Reserved1', 0x0000, referencecontrol_reserved1) | |
| 1582 | - referencecontrol_reserved2 = struct.unpack("<H", dir_stream.read(2))[0] # ignore | |
| 1583 | - check_value('REFERENCECONTROL_Reserved2', 0x0000, referencecontrol_reserved2) | |
| 1584 | - unused = referencecontrol_id | |
| 1585 | - unused = referencecontrol_sizetwiddled | |
| 1586 | - unused = referencecontrol_libidtwiddled | |
| 1587 | - # optional field | |
| 1588 | - check2 = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1589 | - if check2 == 0x0016: | |
| 1590 | - referencecontrol_namerecordextended_id = check | |
| 1591 | - referencecontrol_namerecordextended_sizeof_name = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1592 | - referencecontrol_namerecordextended_name = dir_stream.read( | |
| 1593 | - referencecontrol_namerecordextended_sizeof_name) | |
| 1594 | - referencecontrol_namerecordextended_reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1595 | - if referencecontrol_namerecordextended_reserved == 0x003E: | |
| 1596 | - referencecontrol_namerecordextended_sizeof_name_unicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1597 | - referencecontrol_namerecordextended_name_unicode = dir_stream.read( | |
| 1598 | - referencecontrol_namerecordextended_sizeof_name_unicode) | |
| 1599 | - referencecontrol_reserved3 = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1600 | - unused = referencecontrol_namerecordextended_id | |
| 1601 | - unused = referencecontrol_namerecordextended_name | |
| 1602 | - unused = referencecontrol_namerecordextended_name_unicode | |
| 1603 | - else: | |
| 1604 | - referencecontrol_reserved3 = referencecontrol_namerecordextended_reserved | |
| 1605 | - else: | |
| 1606 | - referencecontrol_reserved3 = check2 | |
| 1607 | - | |
| 1608 | - check_value('REFERENCECONTROL_Reserved3', 0x0030, referencecontrol_reserved3) | |
| 1609 | - referencecontrol_sizeextended = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1610 | - referencecontrol_sizeof_libidextended = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1611 | - referencecontrol_libidextended = dir_stream.read(referencecontrol_sizeof_libidextended) | |
| 1612 | - referencecontrol_reserved4 = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1613 | - referencecontrol_reserved5 = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1614 | - referencecontrol_originaltypelib = dir_stream.read(16) | |
| 1615 | - referencecontrol_cookie = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1616 | - unused = referencecontrol_sizeextended | |
| 1617 | - unused = referencecontrol_libidextended | |
| 1618 | - unused = referencecontrol_reserved4 | |
| 1619 | - unused = referencecontrol_reserved5 | |
| 1620 | - unused = referencecontrol_originaltypelib | |
| 1621 | - unused = referencecontrol_cookie | |
| 1622 | - continue | |
| 1623 | - | |
| 1624 | - if check == 0x000D: | |
| 1625 | - # REFERENCEREGISTERED | |
| 1626 | - referenceregistered_id = check | |
| 1627 | - referenceregistered_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1628 | - referenceregistered_sizeof_libid = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1629 | - referenceregistered_libid = dir_stream.read(referenceregistered_sizeof_libid) | |
| 1630 | - referenceregistered_reserved1 = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1631 | - check_value('REFERENCEREGISTERED_Reserved1', 0x0000, referenceregistered_reserved1) | |
| 1632 | - referenceregistered_reserved2 = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1633 | - check_value('REFERENCEREGISTERED_Reserved2', 0x0000, referenceregistered_reserved2) | |
| 1634 | - unused = referenceregistered_id | |
| 1635 | - unused = referenceregistered_size | |
| 1636 | - unused = referenceregistered_libid | |
| 1637 | - continue | |
| 1638 | - | |
| 1639 | - if check == 0x000E: | |
| 1640 | - # REFERENCEPROJECT | |
| 1641 | - referenceproject_id = check | |
| 1642 | - referenceproject_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1643 | - referenceproject_sizeof_libidabsolute = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1644 | - referenceproject_libidabsolute = dir_stream.read(referenceproject_sizeof_libidabsolute) | |
| 1645 | - referenceproject_sizeof_libidrelative = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1646 | - referenceproject_libidrelative = dir_stream.read(referenceproject_sizeof_libidrelative) | |
| 1647 | - referenceproject_majorversion = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1648 | - referenceproject_minorversion = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1649 | - unused = referenceproject_id | |
| 1650 | - unused = referenceproject_size | |
| 1651 | - unused = referenceproject_libidabsolute | |
| 1652 | - unused = referenceproject_libidrelative | |
| 1653 | - unused = referenceproject_majorversion | |
| 1654 | - unused = referenceproject_minorversion | |
| 1655 | - continue | |
| 1656 | - | |
| 1657 | - log.error('invalid or unknown check Id {0:04X}'.format(check)) | |
| 1658 | - # raise an exception instead of stopping abruptly (issue #180) | |
| 1659 | - raise UnexpectedDataError(dir_path, 'reference type', (0x0F, 0x16, 0x33, 0x2F, 0x0D, 0x0E), check) | |
| 1660 | - #sys.exit(0) | |
| 1661 | - | |
| 1662 | - projectmodules_id = check #struct.unpack("<H", dir_stream.read(2))[0] | |
| 1663 | - check_value('PROJECTMODULES_Id', 0x000F, projectmodules_id) | |
| 1664 | - projectmodules_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1665 | - check_value('PROJECTMODULES_Size', 0x0002, projectmodules_size) | |
| 1666 | - projectmodules_count = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1667 | - projectmodules_projectcookierecord_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1668 | - check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, projectmodules_projectcookierecord_id) | |
| 1669 | - projectmodules_projectcookierecord_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1670 | - check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, projectmodules_projectcookierecord_size) | |
| 1671 | - projectmodules_projectcookierecord_cookie = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1672 | - unused = projectmodules_projectcookierecord_cookie | |
| 1673 | - | |
| 1674 | - # short function to simplify unicode text output | |
| 1675 | - uni_out = lambda unicode_text: unicode_text.encode('utf-8', 'replace') | |
| 1676 | - | |
| 1677 | - log.debug("parsing {0} modules".format(projectmodules_count)) | |
| 1678 | - for projectmodule_index in xrange(0, projectmodules_count): | |
| 1679 | - try: | |
| 1680 | - modulename_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1681 | - check_value('MODULENAME_Id', 0x0019, modulename_id) | |
| 1682 | - modulename_sizeof_modulename = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1683 | - modulename_modulename = dir_stream.read(modulename_sizeof_modulename).decode('utf-8', 'backslashreplace') | |
| 1684 | - # TODO: preset variables to avoid "referenced before assignment" errors | |
| 1685 | - modulename_unicode_modulename_unicode = '' | |
| 1686 | - # account for optional sections | |
| 1687 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1688 | - if section_id == 0x0047: | |
| 1689 | - modulename_unicode_id = section_id | |
| 1690 | - modulename_unicode_sizeof_modulename_unicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1691 | - modulename_unicode_modulename_unicode = dir_stream.read( | |
| 1692 | - modulename_unicode_sizeof_modulename_unicode).decode('UTF-16LE', 'replace') | |
| 1693 | - # just guessing that this is the same encoding as used in OleFileIO | |
| 1694 | - unused = modulename_unicode_id | |
| 1695 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1696 | - if section_id == 0x001A: | |
| 1697 | - modulestreamname_id = section_id | |
| 1698 | - modulestreamname_sizeof_streamname = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1699 | - modulestreamname_streamname = dir_stream.read(modulestreamname_sizeof_streamname) | |
| 1700 | - modulestreamname_reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1701 | - check_value('MODULESTREAMNAME_Reserved', 0x0032, modulestreamname_reserved) | |
| 1702 | - modulestreamname_sizeof_streamname_unicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1703 | - modulestreamname_streamname_unicode = dir_stream.read( | |
| 1704 | - modulestreamname_sizeof_streamname_unicode).decode('UTF-16LE', 'replace') | |
| 1705 | - # just guessing that this is the same encoding as used in OleFileIO | |
| 1706 | - unused = modulestreamname_id | |
| 1707 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1708 | - if section_id == 0x001C: | |
| 1709 | - moduledocstring_id = section_id | |
| 1710 | - check_value('MODULEDOCSTRING_Id', 0x001C, moduledocstring_id) | |
| 1711 | - moduledocstring_sizeof_docstring = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1712 | - moduledocstring_docstring = dir_stream.read(moduledocstring_sizeof_docstring) | |
| 1713 | - moduledocstring_reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1714 | - check_value('MODULEDOCSTRING_Reserved', 0x0048, moduledocstring_reserved) | |
| 1715 | - moduledocstring_sizeof_docstring_unicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1716 | - moduledocstring_docstring_unicode = dir_stream.read(moduledocstring_sizeof_docstring_unicode) | |
| 1717 | - unused = moduledocstring_docstring | |
| 1718 | - unused = moduledocstring_docstring_unicode | |
| 1719 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1720 | - if section_id == 0x0031: | |
| 1721 | - moduleoffset_id = section_id | |
| 1722 | - check_value('MODULEOFFSET_Id', 0x0031, moduleoffset_id) | |
| 1723 | - moduleoffset_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1724 | - check_value('MODULEOFFSET_Size', 0x0004, moduleoffset_size) | |
| 1725 | - moduleoffset_textoffset = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1726 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1727 | - if section_id == 0x001E: | |
| 1728 | - modulehelpcontext_id = section_id | |
| 1729 | - check_value('MODULEHELPCONTEXT_Id', 0x001E, modulehelpcontext_id) | |
| 1730 | - modulehelpcontext_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1731 | - check_value('MODULEHELPCONTEXT_Size', 0x0004, modulehelpcontext_size) | |
| 1732 | - modulehelpcontext_helpcontext = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1733 | - unused = modulehelpcontext_helpcontext | |
| 1734 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1735 | - if section_id == 0x002C: | |
| 1736 | - modulecookie_id = section_id | |
| 1737 | - check_value('MODULECOOKIE_Id', 0x002C, modulecookie_id) | |
| 1738 | - modulecookie_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1739 | - check_value('MODULECOOKIE_Size', 0x0002, modulecookie_size) | |
| 1740 | - modulecookie_cookie = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1741 | - unused = modulecookie_cookie | |
| 1742 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1743 | - if section_id == 0x0021 or section_id == 0x0022: | |
| 1744 | - moduletype_id = section_id | |
| 1745 | - moduletype_reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1746 | - unused = moduletype_id | |
| 1747 | - unused = moduletype_reserved | |
| 1748 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1749 | - if section_id == 0x0025: | |
| 1750 | - modulereadonly_id = section_id | |
| 1751 | - check_value('MODULEREADONLY_Id', 0x0025, modulereadonly_id) | |
| 1752 | - modulereadonly_reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1753 | - check_value('MODULEREADONLY_Reserved', 0x0000, modulereadonly_reserved) | |
| 1754 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1755 | - if section_id == 0x0028: | |
| 1756 | - moduleprivate_id = section_id | |
| 1757 | - check_value('MODULEPRIVATE_Id', 0x0028, moduleprivate_id) | |
| 1758 | - moduleprivate_reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1759 | - check_value('MODULEPRIVATE_Reserved', 0x0000, moduleprivate_reserved) | |
| 1760 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1761 | - if section_id == 0x002B: # TERMINATOR | |
| 1762 | - module_reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1763 | - check_value('MODULE_Reserved', 0x0000, module_reserved) | |
| 1764 | - section_id = None | |
| 1765 | - if section_id != None: | |
| 1766 | - log.warning('unknown or invalid module section id {0:04X}'.format(section_id)) | |
| 1767 | - | |
| 1768 | - log.debug('Project CodePage = %d' % projectcodepage_codepage) | |
| 1769 | - if projectcodepage_codepage in MAC_CODEPAGES: | |
| 1770 | - vba_codec = MAC_CODEPAGES[projectcodepage_codepage] | |
| 1771 | - else: | |
| 1772 | - vba_codec = 'cp%d' % projectcodepage_codepage | |
| 1773 | - log.debug("ModuleName = {0}".format(modulename_modulename)) | |
| 1774 | - log.debug("ModuleNameUnicode = {0}".format(uni_out(modulename_unicode_modulename_unicode))) | |
| 1775 | - log.debug("StreamName = {0}".format(modulestreamname_streamname)) | |
| 1776 | - try: | |
| 1777 | - streamname_unicode = modulestreamname_streamname.decode(vba_codec) | |
| 1778 | - except UnicodeError as ue: | |
| 1779 | - log.debug('failed to decode stream name {0!r} with codec {1}' | |
| 1780 | - .format(uni_out(streamname_unicode), vba_codec)) | |
| 1781 | - streamname_unicode = modulestreamname_streamname.decode(vba_codec, errors='replace') | |
| 1782 | - log.debug("StreamName.decode('%s') = %s" % (vba_codec, uni_out(streamname_unicode))) | |
| 1783 | - log.debug("StreamNameUnicode = {0}".format(uni_out(modulestreamname_streamname_unicode))) | |
| 1784 | - log.debug("TextOffset = {0}".format(moduleoffset_textoffset)) | |
| 1785 | - | |
| 1786 | - code_data = None | |
| 1787 | - try_names = streamname_unicode, \ | |
| 1788 | - modulename_unicode_modulename_unicode, \ | |
| 1789 | - modulestreamname_streamname_unicode | |
| 1790 | - for stream_name in try_names: | |
| 1791 | - # TODO: if olefile._find were less private, could replace this | |
| 1792 | - # try-except with calls to it | |
| 1793 | - try: | |
| 1794 | - code_path = vba_root + u'VBA/' + stream_name | |
| 1795 | - log.debug('opening VBA code stream %s' % uni_out(code_path)) | |
| 1796 | - code_data = ole.openstream(code_path).read() | |
| 1797 | - break | |
| 1798 | - except IOError as ioe: | |
| 1799 | - log.debug('failed to open stream VBA/%r (%r), try other name' | |
| 1800 | - % (uni_out(stream_name), ioe)) | |
| 1801 | - | |
| 1802 | - if code_data is None: | |
| 1803 | - log.info("Could not open stream %d of %d ('VBA/' + one of %r)!" | |
| 1804 | - % (projectmodule_index, projectmodules_count, | |
| 1805 | - '/'.join("'" + uni_out(stream_name) + "'" | |
| 1806 | - for stream_name in try_names))) | |
| 1807 | - if relaxed: | |
| 1808 | - continue # ... with next submodule | |
| 1809 | - else: | |
| 1810 | - raise SubstreamOpenError('[BASE]', 'VBA/' + | |
| 1811 | - uni_out(modulename_unicode_modulename_unicode)) | |
| 1812 | - | |
| 1813 | - log.debug("length of code_data = {0}".format(len(code_data))) | |
| 1814 | - log.debug("offset of code_data = {0}".format(moduleoffset_textoffset)) | |
| 1815 | - code_data = code_data[moduleoffset_textoffset:] | |
| 1816 | - if len(code_data) > 0: | |
| 1817 | - code_data = decompress_stream(bytearray(code_data)) | |
| 1818 | - # case-insensitive search in the code_modules dict to find the file extension: | |
| 1819 | - filext = code_modules.get(modulename_modulename.lower(), 'bin') | |
| 1820 | - filename = '{0}.{1}'.format(modulename_modulename, filext) | |
| 1821 | - #TODO: also yield the codepage so that callers can decode it properly | |
| 1822 | - yield (code_path, filename, code_data) | |
| 1823 | - # print '-'*79 | |
| 1824 | - # print filename | |
| 1825 | - # print '' | |
| 1826 | - # print code_data | |
| 1827 | - # print '' | |
| 1828 | - log.debug('extracted file {0}'.format(filename)) | |
| 1829 | - else: | |
| 1830 | - log.warning("module stream {0} has code data length 0".format(modulestreamname_streamname)) | |
| 1831 | - except (UnexpectedDataError, SubstreamOpenError): | |
| 1832 | - raise | |
| 1833 | - except Exception as exc: | |
| 1834 | - log.info('Error parsing module {0} of {1} in _extract_vba:' | |
| 1835 | - .format(projectmodule_index, projectmodules_count), | |
| 1836 | - exc_info=True) | |
| 1837 | - if not relaxed: | |
| 1838 | - raise | |
| 1839 | - _ = unused # make pylint happy: now variable "unused" is being used ;-) | |
| 1840 | - return | |
| 1841 | - | |
| 1842 | - | |
def vba_collapse_long_lines(vba_code):
    """
    Join VBA statements that were split over several physical lines.

    A continuation marker is a space followed by an underscore placed right
    before a line break. Each marker, together with the line break that
    follows it, is replaced by a single space.

    :param vba_code: str, VBA module code
    :return: str, VBA module code with continued lines merged
    """
    # TODO: use a regex instead, to allow whitespaces after the underscore?
    # CRLF must be handled before a lone CR, otherwise the CR of a CRLF pair
    # would be consumed on its own and a stray LF would be left behind.
    for line_break in ('\r\n', '\r', '\n'):
        vba_code = vba_code.replace(' _' + line_break, ' ')
    return vba_code
| 1856 | - | |
| 1857 | - | |
def filter_vba(vba_code):
    """
    Strip the leading "Attribute VB_" header lines from VBA source code.

    These lines are automatically added by MS Office and not displayed in the
    VBA Editor. This should only be used when displaying source code for
    human analysis.

    Note: a header line that contains a colon is kept (and ends the header),
    because a colon could be used to hide malicious instructions on the
    same line.

    :param vba_code: str, VBA source code
    :return: str, filtered VBA source code (lines joined with a newline)
    """
    lines = vba_code.splitlines()
    # Index of the first line to keep; stays at len(lines) when every line
    # is a plain attribute header.
    kept_from = len(lines)
    for position, text in enumerate(lines):
        if ':' in text or not text.startswith("Attribute VB_"):
            kept_from = position
            break
    #TODO: also remove empty lines?
    return '\n'.join(lines[kept_from:])
| 1880 | - | |
| 1881 | - | |
def detect_autoexec(vba_code, obfuscation=None):
    """
    Look for keywords of macros that run automatically when triggered by
    specific actions (e.g. when a document is opened or closed).

    Every keyword listed in AUTOEXEC_KEYWORDS is inserted as-is (without
    escaping) into a case-insensitive regular expression bounded by word
    boundaries; at most one hit (the first match) is reported per keyword.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword as found in the code, description)
    """
    #TODO: merge code with detect_suspicious
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    hits = []
    for description, keywords in AUTOEXEC_KEYWORDS.items():
        for keyword in keywords:
            #TODO: if keyword is already a compiled regex, use it as-is
            # search using regex to detect word boundaries:
            found = re.search(r'(?i)\b' + keyword + r'\b', vba_code)
            if found is None:
                continue
            hits.append((found.group(), description + suffix))
    return hits
| 1909 | - | |
| 1910 | - | |
def detect_suspicious(vba_code, obfuscation=None):
    """
    Look for suspicious keywords corresponding to potential malware
    behaviour.

    Every keyword listed in SUSPICIOUS_KEYWORDS is escaped and searched
    literally, case-insensitively and bounded by word boundaries; at most one
    hit (the first match) is reported per keyword.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword as found in the code, description)
    """
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    hits = []
    for description, keywords in SUSPICIOUS_KEYWORDS.items():
        for keyword in keywords:
            # search using regex to detect word boundaries:
            found = re.search(r'(?i)\b' + re.escape(keyword) + r'\b', vba_code)
            if found is None:
                continue
            hits.append((found.group(), description + suffix))
    return hits
| 1935 | - | |
| 1936 | - | |
def detect_patterns(vba_code, obfuscation=None):
    """
    Look for specific patterns in the VBA code such as IP addresses, URLs,
    e-mail addresses, executable file names, etc.

    Each (pattern type, compiled regex) pair of RE_PATTERNS is applied to the
    code. A given matched value is reported only once, for the first pattern
    that produced it.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to the pattern type
    :return: list of str tuples (pattern type, value)
    """
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    seen = set()
    hits = []
    for pattern_type, pattern_re in RE_PATTERNS:
        for found in pattern_re.finditer(vba_code):
            value = found.group()
            if value in seen:
                continue
            hits.append((pattern_type + suffix, value))
            seen.add(value)
    return hits
| 1957 | - | |
| 1958 | - | |
def detect_hex_strings(vba_code):
    """
    Look for strings encoded in hexadecimal within the VBA code.

    Every distinct match of re_hex_string is hex-decoded, then turned into
    text as UTF-8 (undecodable bytes are rendered as backslash escapes).

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    seen = set()
    pairs = []
    for hit in re_hex_string.finditer(vba_code):
        encoded = hit.group()
        if encoded in seen:
            continue
        raw = binascii.unhexlify(encoded)
        pairs.append((encoded, raw.decode('utf-8', 'backslashreplace')))
        seen.add(encoded)
    return pairs
| 1975 | - | |
| 1976 | - | |
def detect_base64_strings(vba_code):
    """
    Look for strings encoded in base64 within the VBA code.

    Candidates come from re_base64_string. A candidate is ignored when
    re_nothex_check finds nothing in it (it is then just a hex string), when
    it was already reported, or when its lowercase form is in BASE64_WHITELIST.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    #TODO: avoid matching simple hex strings as base64?
    seen = set()
    pairs = []
    for hit in re_base64_string.finditer(vba_code):
        # extract the base64 string without quotes:
        candidate = hit.group().strip('"')
        # check it is not just a hex string:
        if re_nothex_check.search(candidate) is None:
            continue
        # only keep new values and not in the whitelist:
        if candidate in seen or candidate.lower() in BASE64_WHITELIST:
            continue
        try:
            raw = base64.b64decode(candidate)
            pairs.append((candidate, raw.decode('utf-8','replace')))
            seen.add(candidate)
        except (TypeError, ValueError) as exc:
            # if an exception occurs, it is likely not a base64-encoded string
            log.debug('Failed to base64-decode (%s)' % exc)
    return pairs
| 2003 | - | |
| 2004 | - | |
def detect_dridex_strings(vba_code):
    """
    Look for strings obfuscated with a specific algorithm found in Dridex
    samples.

    Candidates come from re_dridex_string, with their first and last
    characters removed (presumably the surrounding quotes). A candidate is
    ignored when re_nothex_check finds nothing in it, or when it was already
    reported. Candidates that the Dridex decoder rejects are skipped.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    # TODO: move this at the beginning of script
    from oletools.thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode

    seen = set()
    pairs = []
    for hit in re_dridex_string.finditer(vba_code):
        candidate = hit.group()[1:-1]
        # check it is not just a hex string:
        if re_nothex_check.search(candidate) is None:
            continue
        if candidate in seen:
            continue
        try:
            decoded = DridexUrlDecode(candidate)
            pairs.append((candidate, decoded))
            seen.add(candidate)
        except Exception as exc:
            # if an exception occurs, it is likely not a dridex-encoded string
            log.debug('Failed to Dridex-decode (%s)' % exc)
    return pairs
| 2031 | - | |
| 2032 | - | |
def detect_vba_strings(vba_code):
    """
    Look for strings obfuscated with VBA expressions using keywords such as
    Chr, Asc, Val, StrReverse, etc.

    The code is scanned line by line with the vba_expr_str grammar. Only
    tokens that are VbaExpressionString instances are reported, once per
    distinct source expression, and only when the decoded value differs from
    the source text.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    # TODO: handle exceptions
    seen = set()
    pairs = []
    # IMPORTANT: tabs must be expanded so that the text sliced below is the
    # same string pyparsing worked on; otherwise the start and end offsets
    # it reports would be incorrect.
    expanded = vba_code.expandtabs()
    # Work line by line to avoid MemoryError on large scripts:
    for line in expanded.splitlines():
        for tokens, start, end in vba_expr_str.scanString(line):
            decoded = tokens[0]
            if not isinstance(decoded, VbaExpressionString):
                # a simple string, not a VBA expression: nothing to report
                continue
            encoded = line[start:end]
            # avoid duplicates and simple strings:
            if encoded not in seen and decoded != encoded:
                pairs.append((encoded, decoded))
                seen.add(encoded)
    return pairs
| 2068 | - | |
| 2069 | - | |
def json2ascii(json_obj, encoding='utf8', errors='replace'):
    """ ensure every bytes value in a json-like structure is turned into str

    Works recursively through dicts, lists and tuples so that the result can
    be passed to json.dumps without trouble:

    - None, bool, int, float and str values are returned unchanged
    - bytes values are decoded using the given encoding and error handler
    - dict values are converted in place (keys are left untouched) and the
      same dict object is returned
    - list items are converted in place and the same list object is returned
    - tuples are immutable, so a new plain tuple with converted items is
      returned
    - any other type is logged and returned as is

    :param json_obj: object to convert
    :param str encoding: codec used to decode bytes values
    :param str errors: error handler used when decoding bytes values
    :return: the converted object
    """
    if json_obj is None or isinstance(json_obj, (bool, int, float, str)):
        return json_obj
    if isinstance(json_obj, bytes):
        decoded = json_obj.decode(encoding, errors)
        # cannot put original into logger
        log.debug('json2ascii: encode unicode: {0}'.format(decoded))
        return decoded
    if isinstance(json_obj, dict):
        # only existing keys are reassigned, so iterating while updating is safe
        for key in json_obj:
            json_obj[key] = json2ascii(json_obj[key], encoding, errors)
        return json_obj
    if isinstance(json_obj, list):
        # write back by index: rebinding the loop variable would discard
        # the converted value
        for index, item in enumerate(json_obj):
            json_obj[index] = json2ascii(item, encoding, errors)
        return json_obj
    if isinstance(json_obj, tuple):
        return tuple(json2ascii(item, encoding, errors) for item in json_obj)
    log.debug('unexpected type in json2ascii: {0} -- leave as is'
              .format(type(json_obj)))
    return json_obj
| 2105 | - | |
| 2106 | - | |
def print_json(json_dict=None, _json_is_first=False, _json_is_last=False,
               **json_parts):
    """ print one entry of the top-level json list, line by line

    The entry is serialized with json.dumps(json2ascii(..)) using indent=4,
    and each resulting line is printed with an extra prefix.

    can use in two ways:
    (1) print_json(some_dict)
    (2) print_json(key1=value1, key2=value2, ...)

    :param dict json_dict: the entry to print, when not given as key=value parts
    :param bool _json_is_first: set to True only for very first entry to open
                                the top-level json-list
    :param bool _json_is_last: set to True only for very last entry to close
                               the top-level json-list
    :raises ValueError: if both a dict and key=value parts are given, or if
                        json_dict is neither None nor a dict
    """
    if json_dict and json_parts:
        raise ValueError('Invalid json argument: want either single dict or '
                         'key=value parts but got both)')
    if json_dict is not None and not isinstance(json_dict, dict):
        raise ValueError('Invalid json argument: want either single dict or '
                         'key=value parts but got {0} instead of dict)'
                         .format(type(json_dict)))
    payload = json_parts if json_parts else json_dict

    if _json_is_first:
        print('[')

    rendered = json.dumps(json2ascii(payload), check_circular=False,
                          indent=4, ensure_ascii=False)
    lines = rendered.splitlines()
    for line in lines[:-1]:
        print(' {0}'.format(line))
    closing = lines[-1]
    if _json_is_last:
        # last entry of the list: no trailing comma, then close the list
        print(' {0}'.format(closing))
        print(']')
    else:
        # more entries will follow: terminate this one with a comma
        print(' {0},'.format(closing))
| 2142 | - | |
| 2143 | - | |
| 2144 | -class VBA_Scanner(object): | |
| 2145 | - """ | |
| 2146 | - Class to scan the source code of a VBA module to find obfuscated strings, | |
| 2147 | - suspicious keywords, IOCs, auto-executable macros, etc. | |
| 2148 | - """ | |
| 2149 | - | |
| 2150 | - def __init__(self, vba_code): | |
| 2151 | - """ | |
| 2152 | - VBA_Scanner constructor | |
| 2153 | - | |
| 2154 | - :param vba_code: str, VBA source code to be analyzed | |
| 2155 | - """ | |
| 2156 | - if isinstance(vba_code, bytes): | |
| 2157 | - vba_code = vba_code.decode('utf-8', 'backslashreplace') | |
| 2158 | - # join long lines ending with " _": | |
| 2159 | - self.code = vba_collapse_long_lines(vba_code) | |
| 2160 | - self.code_hex = '' | |
| 2161 | - self.code_hex_rev = '' | |
| 2162 | - self.code_rev_hex = '' | |
| 2163 | - self.code_base64 = '' | |
| 2164 | - self.code_dridex = '' | |
| 2165 | - self.code_vba = '' | |
| 2166 | - self.strReverse = None | |
| 2167 | - # results = None before scanning, then a list of tuples after scanning | |
| 2168 | - self.results = None | |
| 2169 | - self.autoexec_keywords = None | |
| 2170 | - self.suspicious_keywords = None | |
| 2171 | - self.iocs = None | |
| 2172 | - self.hex_strings = None | |
| 2173 | - self.base64_strings = None | |
| 2174 | - self.dridex_strings = None | |
| 2175 | - self.vba_strings = None | |
| 2176 | - | |
| 2177 | - | |
| 2178 | - def scan(self, include_decoded_strings=False, deobfuscate=False): | |
| 2179 | - """ | |
| 2180 | - Analyze the provided VBA code to detect suspicious keywords, | |
| 2181 | - auto-executable macros, IOC patterns, obfuscation patterns | |
| 2182 | - such as hex-encoded strings. | |
| 2183 | - | |
| 2184 | - :param include_decoded_strings: bool, if True, all encoded strings will be included with their decoded content. | |
| 2185 | - :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) | |
| 2186 | - :return: list of tuples (type, keyword, description) | |
| 2187 | - (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String') | |
| 2188 | - """ | |
| 2189 | - # First, detect and extract hex-encoded strings: | |
| 2190 | - self.hex_strings = detect_hex_strings(self.code) | |
| 2191 | - # detect if the code contains StrReverse: | |
| 2192 | - self.strReverse = False | |
| 2193 | - if 'strreverse' in self.code.lower(): self.strReverse = True | |
| 2194 | - # Then append the decoded strings to the VBA code, to detect obfuscated IOCs and keywords: | |
| 2195 | - for encoded, decoded in self.hex_strings: | |
| 2196 | - self.code_hex += '\n' + decoded | |
| 2197 | - # if the code contains "StrReverse", also append the hex strings in reverse order: | |
| 2198 | - if self.strReverse: | |
| 2199 | - # StrReverse after hex decoding: | |
| 2200 | - self.code_hex_rev += '\n' + decoded[::-1] | |
| 2201 | - # StrReverse before hex decoding: | |
| 2202 | - self.code_rev_hex += '\n' + str(binascii.unhexlify(encoded[::-1])) | |
| 2203 | - #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/ | |
| 2204 | - #TODO: also append the full code reversed if StrReverse? (risk of false positives?) | |
| 2205 | - # Detect Base64-encoded strings | |
| 2206 | - self.base64_strings = detect_base64_strings(self.code) | |
| 2207 | - for encoded, decoded in self.base64_strings: | |
| 2208 | - self.code_base64 += '\n' + decoded | |
| 2209 | - # Detect Dridex-encoded strings | |
| 2210 | - self.dridex_strings = detect_dridex_strings(self.code) | |
| 2211 | - for encoded, decoded in self.dridex_strings: | |
| 2212 | - self.code_dridex += '\n' + decoded | |
| 2213 | - # Detect obfuscated strings in VBA expressions | |
| 2214 | - if deobfuscate: | |
| 2215 | - self.vba_strings = detect_vba_strings(self.code) | |
| 2216 | - else: | |
| 2217 | - self.vba_strings = [] | |
| 2218 | - for encoded, decoded in self.vba_strings: | |
| 2219 | - self.code_vba += '\n' + decoded | |
| 2220 | - results = [] | |
| 2221 | - self.autoexec_keywords = [] | |
| 2222 | - self.suspicious_keywords = [] | |
| 2223 | - self.iocs = [] | |
| 2224 | - | |
| 2225 | - for code, obfuscation in ( | |
| 2226 | - (self.code, None), | |
| 2227 | - (self.code_hex, 'Hex'), | |
| 2228 | - (self.code_hex_rev, 'Hex+StrReverse'), | |
| 2229 | - (self.code_rev_hex, 'StrReverse+Hex'), | |
| 2230 | - (self.code_base64, 'Base64'), | |
| 2231 | - (self.code_dridex, 'Dridex'), | |
| 2232 | - (self.code_vba, 'VBA expression'), | |
| 2233 | - ): | |
| 2234 | - if isinstance(code,bytes): | |
| 2235 | - code=code.decode('utf-8','backslashreplace') | |
| 2236 | - self.autoexec_keywords += detect_autoexec(code, obfuscation) | |
| 2237 | - self.suspicious_keywords += detect_suspicious(code, obfuscation) | |
| 2238 | - self.iocs += detect_patterns(code, obfuscation) | |
| 2239 | - | |
| 2240 | - # If hex-encoded strings were discovered, add an item to suspicious keywords: | |
| 2241 | - if self.hex_strings: | |
| 2242 | - self.suspicious_keywords.append(('Hex Strings', | |
| 2243 | - 'Hex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) | |
| 2244 | - if self.base64_strings: | |
| 2245 | - self.suspicious_keywords.append(('Base64 Strings', | |
| 2246 | - 'Base64-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) | |
| 2247 | - if self.dridex_strings: | |
| 2248 | - self.suspicious_keywords.append(('Dridex Strings', | |
| 2249 | - 'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) | |
| 2250 | - if self.vba_strings: | |
| 2251 | - self.suspicious_keywords.append(('VBA obfuscated Strings', | |
| 2252 | - 'VBA string expressions were detected, may be used to obfuscate strings (option --decode to see all)')) | |
| 2253 | - # use a set to avoid duplicate keywords | |
| 2254 | - keyword_set = set() | |
| 2255 | - for keyword, description in self.autoexec_keywords: | |
| 2256 | - if keyword not in keyword_set: | |
| 2257 | - results.append(('AutoExec', keyword, description)) | |
| 2258 | - keyword_set.add(keyword) | |
| 2259 | - keyword_set = set() | |
| 2260 | - for keyword, description in self.suspicious_keywords: | |
| 2261 | - if keyword not in keyword_set: | |
| 2262 | - results.append(('Suspicious', keyword, description)) | |
| 2263 | - keyword_set.add(keyword) | |
| 2264 | - keyword_set = set() | |
| 2265 | - for pattern_type, value in self.iocs: | |
| 2266 | - if value not in keyword_set: | |
| 2267 | - results.append(('IOC', value, pattern_type)) | |
| 2268 | - keyword_set.add(value) | |
| 2269 | - | |
| 2270 | - # include decoded strings only if they are printable or if --decode option: | |
| 2271 | - for encoded, decoded in self.hex_strings: | |
| 2272 | - if include_decoded_strings or is_printable(decoded): | |
| 2273 | - results.append(('Hex String', decoded, encoded)) | |
| 2274 | - for encoded, decoded in self.base64_strings: | |
| 2275 | - if include_decoded_strings or is_printable(decoded): | |
| 2276 | - results.append(('Base64 String', decoded, encoded)) | |
| 2277 | - for encoded, decoded in self.dridex_strings: | |
| 2278 | - if include_decoded_strings or is_printable(decoded): | |
| 2279 | - results.append(('Dridex string', decoded, encoded)) | |
| 2280 | - for encoded, decoded in self.vba_strings: | |
| 2281 | - if include_decoded_strings or is_printable(decoded): | |
| 2282 | - results.append(('VBA string', decoded, encoded)) | |
| 2283 | - self.results = results | |
| 2284 | - return results | |
| 2285 | - | |
def scan_summary(self):
    """
    Report how many items the scan found in each category.

    If no results have been stored yet (self.results is None), self.scan()
    is called first; otherwise the existing findings are counted as-is.

    :return: tuple with the number of items found for each category:
        (autoexec, suspicious, IOCs, hex, base64, dridex, vba)
    """
    # only trigger a scan when none has been run before:
    already_scanned = self.results is not None
    if not already_scanned:
        self.scan()
    # one entry per category, in the order documented above:
    categories = (
        self.autoexec_keywords,
        self.suspicious_keywords,
        self.iocs,
        self.hex_strings,
        self.base64_strings,
        self.dridex_strings,
        self.vba_strings,
    )
    return tuple(len(found) for found in categories)
| 2301 | - | |
| 2302 | - | |
def scan_vba(vba_code, include_decoded_strings, deobfuscate=False):
    """
    Scan VBA source code in one call.

    Convenience wrapper which builds a VBA_Scanner for vba_code and returns
    the result of its scan() method, called with the two other arguments.

    :param vba_code: str, VBA source code to be analyzed
    :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content.
    :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
    :return: list of tuples (type, keyword, description)
        (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String',
        'Dridex string' or 'VBA string')
    """
    scanner = VBA_Scanner(vba_code)
    return scanner.scan(include_decoded_strings, deobfuscate)
| 2317 | - | |
| 2318 | - | |
| 2319 | -#=== CLASSES ================================================================= | |
| 2320 | - | |
| 2321 | -class VBA_Parser(object): | |
| 2322 | - """ | |
| 2323 | - Class to parse MS Office files, to detect VBA macros and extract VBA source code | |
| 2324 | - Supported file formats: | |
| 2325 | - - Word 97-2003 (.doc, .dot) | |
| 2326 | - - Word 2007+ (.docm, .dotm) | |
| 2327 | - - Word 2003 XML (.xml) | |
| 2328 | - - Word MHT - Single File Web Page / MHTML (.mht) | |
| 2329 | - - Excel 97-2003 (.xls) | |
| 2330 | - - Excel 2007+ (.xlsm, .xlsb) | |
| 2331 | - - PowerPoint 97-2003 (.ppt) | |
| 2332 | - - PowerPoint 2007+ (.pptm, .ppsm) | |
| 2333 | - """ | |
| 2334 | - | |
def __init__(self, filename, data=None, container=None, relaxed=False):
    """
    Constructor for VBA_Parser

    The input is probed in this order: OLE (including PowerPoint 97-2003),
    Zip/OpenXML, Word 2003 XML, Flat OPC XML, MHTML, and finally plain text.

    :param filename: filename or path of file to parse, or file-like object

    :param data: None or bytes str, if None the file will be read from disk (or from the file-like object).
                 If data is provided as a bytes string, it will be parsed as the content of the file in memory,
                 and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb').

    :param container: str, path and filename of container if the file is within
        a zip archive, None otherwise.

    :param relaxed: if True, treat mal-formed documents and missing streams more like MS office:
        do nothing; if False (default), raise errors in these cases

    raises a FileOpenError if all attempts to interpret the data header failed,
    or if the file is RTF
    raises a FileIsEncryptedError if the OLE file is reported as encrypted
    """
    #TODO: filename should only be a string, data should be used for the file-like object
    #TODO: filename should be mandatory, optional data is a string or file-like object
    #TODO: also support olefile and zipfile as input
    if data is None:
        # open file from disk (or use the file-like object directly):
        _file = filename
    else:
        # file already read in memory, make it a file-like object for zipfile:
        _file = BytesIO(data)
    self.ole_file = None
    self.ole_subfiles = []
    self.filename = filename
    self.container = container
    self.relaxed = relaxed
    self.type = None
    self.vba_projects = None
    self.vba_forms = None
    self.contains_macros = None  # will be set to True or False by detect_macros
    self.vba_code_all_modules = None  # to store the source code of all modules
    # list of tuples for each module: (subfilename, stream_path, vba_filename, vba_code)
    self.modules = None
    # Analysis results: list of tuples (type, keyword, description) - See VBA_Scanner
    self.analysis_results = None
    # statistics for the scan summary and flags
    self.nb_macros = 0
    self.nb_autoexec = 0
    self.nb_suspicious = 0
    self.nb_iocs = 0
    self.nb_hexstrings = 0
    self.nb_base64strings = 0
    self.nb_dridexstrings = 0
    self.nb_vbastrings = 0

    if olefile.isOleFile(_file):
        # This looks like an OLE file
        self.open_ole(_file)

        # check whether file is encrypted (need to do this before try ppt)
        log.debug('Check encryption of ole file')
        crypt_indicator = oleid.OleID(self.ole_file).check_encrypted()
        if crypt_indicator.value:
            raise FileIsEncryptedError(filename)

        # if this worked, try whether it is a ppt file (special ole file)
        self.open_ppt()
    if self.type is None and is_zipfile(_file):
        # Zip file, which may be an OpenXML document
        self.open_openxml(_file)
    if self.type is None:
        # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML,
        # or a plain text file containing VBA code
        if data is None:
            if hasattr(filename, 'read'):
                # file-like object: open() would fail on it, so read it directly.
                # Rewind first, assuming the object is seekable, because the
                # format probes above may have moved the position.
                filename.seek(0)
                data = filename.read()
            else:
                with open(filename, 'rb') as file_handle:
                    data = file_handle.read()
        # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
        if b'http://schemas.microsoft.com/office/word/2003/wordml' in data:
            self.open_word2003xml(data)
        # check if it is a Word/PowerPoint 2007+ XML file (Flat OPC): must contain the namespace
        if b'http://schemas.microsoft.com/office/2006/xmlPackage' in data:
            self.open_flatopc(data)
        # store a lowercase version for the next tests:
        data_lowercase = data.lower()
        # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"):
        # According to my tests, these files usually start with "MIME-Version: 1.0" on the 1st line
        # BUT Word accepts a blank line or other MIME headers inserted before,
        # and even whitespaces in between "MIME", "-", "Version" and ":". The version number is ignored.
        # And the line is case insensitive.
        # so we'll just check the presence of mime, version and multipart anywhere:
        if self.type is None and b'mime' in data_lowercase and b'version' in data_lowercase \
                and b'multipart' in data_lowercase:
            self.open_mht(data)
        #TODO: handle exceptions
        #TODO: Excel 2003 XML
        # Check whether this is rtf
        if rtfobj.is_rtf(data, treat_str_as_data=True):
            # Ignore RTF since it contains no macros and methods in here will not find macros
            # in embedded objects. run rtfobj and repeat on its output.
            msg = '%s is RTF, need to run rtfobj.py and find VBA Macros in its output.' % self.filename
            log.info(msg)
            raise FileOpenError(msg)
        # Check if this is a plain text VBA or VBScript file:
        # To avoid scanning binary files, we simply check for some control chars:
        if self.type is None and b'\x00' not in data:
            self.open_text(data)
    if self.type is None:
        # At this stage, could not match a known format:
        msg = '%s is not a supported file type, cannot extract VBA Macros.' % self.filename
        log.info(msg)
        raise FileOpenError(msg)
| 2451 | - | |
def open_ole(self, _file):
    """
    Try to parse the given input as an OLE file.

    On success the olefile.OleFileIO object is stored in self.ole_file and
    self.type is set to TYPE_OLE. IOError, TypeError and ValueError raised
    while parsing are only logged, leaving self.ole_file and self.type
    untouched.

    :param _file: filename or file contents in a file object
    :return: nothing
    """
    log.info('Opening OLE file %s' % self.filename)
    try:
        # path_encoding=None: use unicode for path names
        parsed_ole = olefile.OleFileIO(_file, path_encoding=None)
    except (IOError, TypeError, ValueError) as exc:
        # TODO: handle OLE parsing exceptions
        log.info('Failed OLE parsing for file %r (%s)' % (self.filename, exc))
        log.debug('Trace:', exc_info=True)
    else:
        # parsing succeeded: keep the parser and record the type
        self.ole_file = parsed_ole
        self.type = TYPE_OLE
| 2468 | - | |
| 2469 | - | |
def open_openxml(self, _file):
    """
    Open an OpenXML file (zip archive) and collect the OLE files it contains.

    Every archive member starting with the OLE magic is parsed with a new
    VBA_Parser, which is appended to self.ole_subfiles. self.type is set to
    TYPE_OpenXML only if the whole archive could be processed.

    :param _file: filename or file contents in a file object
    :return: nothing
    :raises SubstreamOpenError: if a member cannot be parsed and self.relaxed is False
    """
    # This looks like a zip file, need to look for vbaProject.bin inside
    # It can be any OLE file inside the archive
    #...because vbaProject.bin can be renamed:
    # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
    log.info('Opening ZIP/OpenXML file %s' % self.filename)
    try:
        # the context manager closes the archive even when parsing fails
        with zipfile.ZipFile(_file) as z:
            #TODO: check if this is actually an OpenXML file
            #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically
            # check each file within the zip if it is an OLE file, by reading its magic:
            for subfile in z.namelist():
                with z.open(subfile) as file_handle:
                    magic = file_handle.read(len(olefile.MAGIC))
                    if magic != olefile.MAGIC:
                        continue
                    log.debug('Opening OLE file %s within zip' % subfile)
                    # reuse the open member: magic + remainder is the full content
                    ole_data = magic + file_handle.read()
                try:
                    self.ole_subfiles.append(
                        VBA_Parser(filename=subfile, data=ole_data,
                                   relaxed=self.relaxed))
                except OlevbaBaseException as exc:
                    if self.relaxed:
                        log.info('%s is not a valid OLE file (%s)' % (subfile, exc))
                        log.debug('Trace:', exc_info=True)
                        continue
                    else:
                        raise SubstreamOpenError(self.filename, subfile,
                                                 exc)
        # set type only if parsing succeeds
        self.type = TYPE_OpenXML
    except OlevbaBaseException as exc:
        if self.relaxed:
            log.info('Error {0} caught in Zip/OpenXML parsing for file {1}'
                     .format(exc, self.filename))
            log.debug('Trace:', exc_info=True)
        else:
            raise
    except (RuntimeError, zipfile.BadZipfile, zipfile.LargeZipFile, IOError) as exc:
        # TODO: handle parsing exceptions
        log.info('Failed Zip/OpenXML parsing for file %r (%s)'
                 % (self.filename, exc))
        log.debug('Trace:', exc_info=True)
| 2520 | - | |
def open_word2003xml(self, data):
    """
    Open a Word 2003 XML file and collect the OLE containers it embeds.

    Each binData element is Base64-decoded; when the result is an MSO
    (ActiveMime) file, the OLE data extracted from it is parsed with a new
    VBA_Parser, which is appended to self.ole_subfiles. self.type is set to
    TYPE_Word2003_XML only if all elements could be processed.

    :param data: file contents in a string or bytes
    :return: nothing
    :raises SubstreamOpenError: if a subfile cannot be parsed and self.relaxed is False
    """
    log.info('Opening Word 2003 XML file %s' % self.filename)
    try:
        # parse the XML content
        # TODO: handle XML parsing exceptions
        et = ET.fromstring(data)
        # find all the binData elements:
        # (iter() instead of getiterator(), which was removed in Python 3.9)
        for bindata in et.iter(TAG_BINDATA):
            # the binData content is an OLE container for the VBA project, compressed
            # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
            # get the filename:
            fname = bindata.get(ATTR_NAME, 'noname.mso')
            # decode the base64 activemime
            mso_data = binascii.a2b_base64(bindata.text)
            if is_mso_file(mso_data):
                # decompress the zlib data stored in the MSO file, which is the OLE container:
                # TODO: handle different offsets => separate function
                try:
                    ole_data = mso_file_extract(mso_data)
                    self.ole_subfiles.append(
                        VBA_Parser(filename=fname, data=ole_data,
                                   relaxed=self.relaxed))
                except OlevbaBaseException as exc:
                    if self.relaxed:
                        log.info('Error parsing subfile {0}: {1}'
                                 .format(fname, exc))
                        log.debug('Trace:', exc_info=True)
                    else:
                        raise SubstreamOpenError(self.filename, fname, exc)
            else:
                log.info('%s is not a valid MSO file' % fname)
        # set type only if parsing succeeds
        self.type = TYPE_Word2003_XML
    except OlevbaBaseException as exc:
        if self.relaxed:
            log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
            log.debug('Trace:', exc_info=True)
        else:
            raise
    except Exception as exc:
        # TODO: differentiate exceptions for each parsing stage
        # (but ET is different libs, no good exception description in API)
        # found: XMLSyntaxError
        log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
        log.debug('Trace:', exc_info=True)
| 2571 | - | |
def open_flatopc(self, data):
    """
    Open a Word or PowerPoint 2007+ XML file, aka "Flat OPC".

    For each pkg:part element whose content type is CTYPE_VBAPROJECT, the
    Base64 content of its binary data elements is decoded and parsed with a
    new VBA_Parser, which is appended to self.ole_subfiles. self.type is set
    to TYPE_FlatOPC_XML only if all parts could be processed.

    :param data: file contents in a string or bytes
    :return: nothing
    """
    log.info('Opening Flat OPC Word/PowerPoint XML file %s' % self.filename)
    try:
        # parse the XML content
        # TODO: handle XML parsing exceptions
        root = ET.fromstring(data)
        # TODO: check root node namespace and tag
        # walk through all the pkg:part elements:
        for part in root.iter(TAG_PKGPART):
            fname = part.get(ATTR_PKG_NAME, 'unknown')
            content_type = part.get(ATTR_PKG_CONTENTTYPE, 'unknown')
            # only parts declared as VBA projects are of interest:
            if content_type != CTYPE_VBAPROJECT:
                continue
            for bin_node in part.iterfind(TAG_PKGBINDATA):
                try:
                    ole_data = binascii.a2b_base64(bin_node.text)
                    subparser = VBA_Parser(filename=fname, data=ole_data,
                                           relaxed=self.relaxed)
                    self.ole_subfiles.append(subparser)
                except OlevbaBaseException as exc:
                    if not self.relaxed:
                        raise SubstreamOpenError(self.filename, fname, exc)
                    log.info('Error parsing subfile {0}: {1}'
                             .format(fname, exc))
                    log.debug('Trace:', exc_info=True)
        # set type only if parsing succeeds
        self.type = TYPE_FlatOPC_XML
    except OlevbaBaseException as exc:
        if not self.relaxed:
            raise
        log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
        log.debug('Trace:', exc_info=True)
    except Exception as exc:
        # TODO: differentiate exceptions for each parsing stage
        # (but ET is different libs, no good exception description in API)
        # found: XMLSyntaxError
        log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
        log.debug('Trace:', exc_info=True)
| 2616 | - | |
def open_mht(self, data):
    """
    Open a MHTML file and collect the OLE containers stored in its MIME parts.

    Each MIME part whose decoded payload is an MSO (ActiveMime) file is
    extracted and parsed with a new VBA_Parser, which is appended to
    self.ole_subfiles. self.type is set to TYPE_MHTML only if all parts
    could be processed.

    :param data: file contents in a string or bytes
    :return: nothing
    """
    log.info('Opening MHTML file %s' % self.filename)
    try:
        text = data.decode('utf8', 'backslashreplace') if isinstance(data, bytes) else data
        # parse the MIME content
        # remove any leading whitespace or newline (workaround for issue in email package)
        text = text.lstrip('\r\n\t ')
        # strip any junk from the beginning of the file
        # (issue #31 fix by Greg C - gdigreg)
        # TODO: improve keywords to avoid false positives
        mime_pos = text.find('MIME')
        content_pos = text.find('Content')
        if 0 <= mime_pos <= content_pos:
            # "MIME" is found, and located before "Content"
            text = text[mime_pos:]
        elif content_pos >= 0:
            # "Content" is found, and before "MIME"
            # TODO: can it work without "MIME" at all?
            text = text[content_pos:]
        # TODO: quick and dirty fix: insert a standard line with MIME-Version header?
        message = email.message_from_string(text)
        # go through all the attached files:
        for part in message.walk():
            content_type = part.get_content_type()  # always returns a value
            fname = part.get_filename(None)  # returns None if it fails
            # TODO: get content-location if no filename
            log.debug('MHTML part: filename=%r, content-type=%r' % (fname, content_type))
            part_data = part.get_payload(decode=True)
            # VBA macros are stored in a binary file named "editdata.mso".
            # the data content is an OLE container for the VBA project, compressed
            # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
            # check ActiveMime header:
            looks_like_mso = isinstance(part_data, (str, bytes)) and is_mso_file(part_data)
            if not looks_like_mso:
                # not an MSO container: only log what was found
                log.debug('type(part_data) = %s' % type(part_data))
                try:
                    log.debug('part_data[0:20] = %r' % part_data[0:20])
                except TypeError:
                    log.debug('part_data has no __getitem__')
                continue
            log.debug('Found ActiveMime header, decompressing MSO container')
            try:
                ole_data = mso_file_extract(part_data)
                # TODO: check if it is actually an OLE file
                # TODO: get the MSO filename from content_location?
                subparser = VBA_Parser(filename=fname, data=ole_data,
                                       relaxed=self.relaxed)
                self.ole_subfiles.append(subparser)
            except OlevbaBaseException as exc:
                if not self.relaxed:
                    raise SubstreamOpenError(self.filename, fname, exc)
                log.info('%s does not contain a valid OLE file (%s)'
                         % (fname, exc))
                log.debug('Trace:', exc_info=True)
                # TODO: bug here - need to split in smaller functions/classes?
        # set type only if parsing succeeds
        self.type = TYPE_MHTML
    except OlevbaBaseException:
        # errors from this package are passed on to the caller unchanged
        raise
    except Exception:
        log.info('Failed MIME parsing for file %r - %s'
                 % (self.filename, MSG_OLEVBA_ISSUES))
        log.debug('Trace:', exc_info=True)
| 2689 | - | |
def open_ppt(self):
    """ Try to interpret self.ole_file as PowerPoint 97-2003 using PptParser.

    When this succeeds, every VBA data blob yielded by the PptParser is
    wrapped in a new VBA_Parser appended to self.ole_subfiles, then
    self.ole_file is closed and reset to None and self.type is set to
    TYPE_PPT. Resetting self.ole_file makes most of the other methods treat
    this like an OpenXML file and only look at the ole_subfiles (except
    find_vba_* which needs to explicitly check for self.type).

    Any exception is taken to mean "not a ppt file" and is only logged at
    debug level.
    """

    log.info('Check whether OLE file is PPT')
    try:
        parser = ppt_parser.PptParser(self.ole_file, fast_fail=True)
        for vba_blob in parser.iter_vba_data():
            subparser = VBA_Parser(None, vba_blob, container='PptParser')
            self.ole_subfiles.append(subparser)
        log.info('File is PPT')
        self.ole_file.close()  # just in case
        self.ole_file = None   # required to make other methods look at ole_subfiles
        self.type = TYPE_PPT
    except Exception as exc:
        if self.container != 'PptParser':
            log.debug("File appears not to be a ppt file (%s)" % exc)
        else:
            # this is a subfile of a ppt --> to be expected that is no ppt
            log.debug('PPT subfile is not a PPT file')
| 2717 | - | |
| 2718 | - | |
def open_text(self, data):
    """
    Open a text file containing VBA or VBScript source code.

    The content is stored as-is in self.vba_code_all_modules (bytes are
    first decoded as UTF-8 with the 'backslashreplace' error handler),
    self.contains_macros is set to True and self.type to TYPE_TEXT.

    :param data: file contents in a string or bytes
    :return: nothing
    """
    log.info('Opening text file %s' % self.filename)
    # the whole text is the source code; decode it only when given as bytes:
    source = data.decode('utf8', 'backslashreplace') if isinstance(data, bytes) else data
    self.vba_code_all_modules = source
    self.contains_macros = True
    # set type only if parsing succeeds
    self.type = TYPE_TEXT
| 2733 | - | |
| 2734 | - | |
def find_vba_projects(self):
    """
    Finds all the VBA projects stored in an OLE file.

    Return None if the file is not OLE but OpenXML.
    Return a list of tuples (vba_root, project_path, dir_path) for each VBA project.
    vba_root is the path of the root OLE storage containing the VBA project,
    including a trailing slash unless it is the root of the OLE file.
    project_path is the path of the OLE stream named "PROJECT" within the VBA project.
    dir_path is the path of the OLE stream named "VBA/dir" within the VBA project.

    If this function returns an empty list for one of the supported formats
    (i.e. Word, Excel, Powerpoint), then the file does not contain VBA macros.

    The result is cached in self.vba_projects and returned directly on later calls.

    :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path)
             for each VBA project found if OLE file
    """
    log.debug('VBA_Parser.find_vba_projects')

    # if the file is not OLE but OpenXML, return None:
    if self.ole_file is None and self.type != TYPE_PPT:
        return None

    # if this method has already been called, return previous result:
    if self.vba_projects is not None:
        return self.vba_projects

    # if this is a ppt file (PowerPoint 97-2003):
    # self.ole_file is None but the ole_subfiles do contain vba_projects
    # (like for OpenXML files).
    if self.type == TYPE_PPT:
        # TODO: so far, this function is never called for PPT files, but
        # if that happens, the information is lost which ole file contains
        # which storage!
        log.warning('Returned info is not complete for PPT types!')
        self.vba_projects = []
        for subfile in self.ole_subfiles:
            # a subfile which is not OLE returns None: treat it as "no project"
            # instead of letting extend() fail with a TypeError
            self.vba_projects.extend(subfile.find_vba_projects() or [])
        return self.vba_projects

    # Find the VBA project root (different in MS Word, Excel, etc):
    # - Word 97-2003: Macros
    # - Excel 97-2003: _VBA_PROJECT_CUR
    # - PowerPoint 97-2003: PptParser has identified ole_subfiles
    # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin.
    # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word
    # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word
    # - Visio 2007: not supported yet (different file structure)

    # According to MS-OVBA section 2.2.1:
    # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream
    # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream
    # - all names are case-insensitive

    def check_vba_stream(ole, vba_root, stream_path):
        # return the full path of the stream if it exists, False otherwise
        full_path = vba_root + stream_path
        if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM:
            log.debug('Found %s stream: %s' % (stream_path, full_path))
            return full_path
        else:
            log.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path)
            return False

    # start with an empty list:
    self.vba_projects = []
    # Look for any storage containing those storage/streams:
    ole = self.ole_file
    for storage in ole.listdir(streams=False, storages=True):
        log.debug('Checking storage %r' % storage)
        # Look for a storage ending with "VBA":
        if storage[-1].upper() == 'VBA':
            log.debug('Found VBA storage: %s' % ('/'.join(storage)))
            vba_root = '/'.join(storage[:-1])
            # Add a trailing slash to vba_root, unless it is the root of the OLE file:
            # (used later to append all the child streams/storages)
            if vba_root != '':
                vba_root += '/'
            log.debug('Checking vba_root="%s"' % vba_root)

            # Check if the VBA root storage also contains a PROJECT stream:
            project_path = check_vba_stream(ole, vba_root, 'PROJECT')
            if not project_path:
                continue
            # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream:
            vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT')
            if not vba_project_path:
                continue
            # Check if the VBA root storage also contains a VBA/dir stream:
            dir_path = check_vba_stream(ole, vba_root, 'VBA/dir')
            if not dir_path:
                continue
            # Now we are pretty sure it is a VBA project structure
            log.debug('VBA root storage: "%s"' % vba_root)
            # append the results to the list as a tuple for later use:
            self.vba_projects.append((vba_root, project_path, dir_path))
    return self.vba_projects
| 2828 | - | |
def detect_vba_macros(self):
    """
    Detect the potential presence of VBA macros in the file, by checking
    if it contains VBA projects. Both OLE and OpenXML files are supported.

    Important: for now, results are accurate only for Word, Excel and PowerPoint

    Note: this method does NOT attempt to check the actual presence or validity
    of VBA macro source code, so there might be false positives.
    It may also detect VBA macros in files embedded within the main file,
    for example an Excel workbook with macros embedded into a Word
    document without macros may be detected, without distinction.

    The result is cached in self.contains_macros and returned directly on later calls.

    :return: bool, True if at least one VBA project has been found, False otherwise
    :raises SubstreamOpenError: if an OLE stream cannot be read and self.relaxed is False
    """
    #TODO: return None or raise exception if format not supported
    #TODO: return the number of VBA projects found instead of True/False?
    # if this method was already called, return the previous result:
    if self.contains_macros is not None:
        return self.contains_macros
    # if OpenXML/PPT, check all the OLE subfiles (stop at the first one with macros):
    if self.ole_file is None:
        self.contains_macros = any(
            ole_subfile.detect_vba_macros() for ole_subfile in self.ole_subfiles)
        return self.contains_macros
    # otherwise it's an OLE file, find VBA projects:
    vba_projects = self.find_vba_projects()
    self.contains_macros = bool(vba_projects)
    # Also look for VBA code in any stream including orphans
    # (happens in some malformed files)
    ole = self.ole_file
    for sid, d in enumerate(ole.direntries):
        log.debug('Checking DirEntry #%d' % sid)
        if d is None:
            # this direntry is not part of the tree: either unused or an orphan
            d = ole._load_direntry(sid)
            log.debug('This DirEntry is an orphan or unused')
        if d.entry_type == olefile.STGTY_STREAM:
            # read data
            log.debug('Reading data from stream %r - size: %d bytes' % (d.name, d.size))
            try:
                data = ole._open(d.isectStart, d.size).read()
                log.debug('Read %d bytes' % len(data))
                if len(data) > 200:
                    log.debug('%r...[much more data]...%r' % (data[:100], data[-50:]))
                else:
                    log.debug(repr(data))
                # search the raw bytes: no need to decode the whole stream
                # to look for an ASCII marker
                if b'Attribut\x00' in data:
                    log.debug('Found VBA compressed code')
                    self.contains_macros = True
            except IOError as exc:
                if self.relaxed:
                    log.info('Error when reading OLE Stream %r' % d.name)
                    # exc_info (not exc_trace) is the keyword accepted by logging
                    log.debug('Trace:', exc_info=True)
                else:
                    raise SubstreamOpenError(self.filename, d.name, exc)
    return self.contains_macros
| 2895 | - | |
| 2896 | - def extract_macros(self): | |
| 2897 | - """ | |
| 2898 | - Extract and decompress source code for each VBA macro found in the file | |
| 2899 | - | |
| 2900 | - Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found | |
| 2901 | - If the file is OLE, filename is the path of the file. | |
| 2902 | - If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros | |
| 2903 | - within the zip archive, e.g. word/vbaProject.bin. | |
| 2904 | - If the file is PPT, result is as for OpenXML but filename is useless | |
| 2905 | - """ | |
| 2906 | - log.debug('extract_macros:') | |
| 2907 | - if self.ole_file is None: | |
| 2908 | - # This may be either an OpenXML/PPT or a text file: | |
| 2909 | - if self.type == TYPE_TEXT: | |
| 2910 | - # This is a text file, yield the full code: | |
| 2911 | - yield (self.filename, '', self.filename, self.vba_code_all_modules) | |
| 2912 | - else: | |
| 2913 | - # OpenXML/PPT: recursively yield results from each OLE subfile: | |
| 2914 | - for ole_subfile in self.ole_subfiles: | |
| 2915 | - for results in ole_subfile.extract_macros(): | |
| 2916 | - yield results | |
| 2917 | - else: | |
| 2918 | - # This is an OLE file: | |
| 2919 | - self.find_vba_projects() | |
| 2920 | - # set of stream ids | |
| 2921 | - vba_stream_ids = set() | |
| 2922 | - for vba_root, project_path, dir_path in self.vba_projects: | |
| 2923 | - # extract all VBA macros from that VBA root storage: | |
| 2924 | - # The function _extract_vba may fail on some files (issue #132) | |
| 2925 | - try: | |
| 2926 | - for stream_path, vba_filename, vba_code in \ | |
| 2927 | - _extract_vba(self.ole_file, vba_root, project_path, | |
| 2928 | - dir_path, self.relaxed): | |
| 2929 | - # store direntry ids in a set: | |
| 2930 | - vba_stream_ids.add(self.ole_file._find(stream_path)) | |
| 2931 | - yield (self.filename, stream_path, vba_filename, vba_code) | |
| 2932 | - except Exception as e: | |
| 2933 | - log.exception('Error in _extract_vba') | |
| 2934 | - # Also look for VBA code in any stream including orphans | |
| 2935 | - # (happens in some malformed files) | |
| 2936 | - ole = self.ole_file | |
| 2937 | - for sid in xrange(len(ole.direntries)): | |
| 2938 | - # check if id is already done above: | |
| 2939 | - log.debug('Checking DirEntry #%d' % sid) | |
| 2940 | - if sid in vba_stream_ids: | |
| 2941 | - log.debug('Already extracted') | |
| 2942 | - continue | |
| 2943 | - d = ole.direntries[sid] | |
| 2944 | - if d is None: | |
| 2945 | - # this direntry is not part of the tree: either unused or an orphan | |
| 2946 | - d = ole._load_direntry(sid) | |
| 2947 | - log.debug('This DirEntry is an orphan or unused') | |
| 2948 | - if d.entry_type == olefile.STGTY_STREAM: | |
| 2949 | - # read data | |
| 2950 | - log.debug('Reading data from stream %r' % d.name) | |
| 2951 | - data = ole._open(d.isectStart, d.size).read() | |
| 2952 | - for match in re.finditer(b'\\x00Attribut[^e]', data, flags=re.IGNORECASE): | |
| 2953 | - start = match.start() - 3 | |
| 2954 | - log.debug('Found VBA compressed code at index %X' % start) | |
| 2955 | - compressed_code = data[start:] | |
| 2956 | - try: | |
| 2957 | - vba_code = decompress_stream(bytearray(compressed_code)) | |
| 2958 | - yield (self.filename, d.name, d.name, vba_code) | |
| 2959 | - except Exception as exc: | |
| 2960 | - # display the exception with full stack trace for debugging | |
| 2961 | - log.debug('Error processing stream %r in file %r (%s)' % (d.name, self.filename, exc)) | |
| 2962 | - log.debug('Traceback:', exc_info=True) | |
| 2963 | - # do not raise the error, as it is unlikely to be a compressed macro stream | |
| 2964 | - | |
| 2965 | - def extract_all_macros(self): | |
| 2966 | - """ | |
| 2967 | - Extract and decompress source code for each VBA macro found in the file | |
| 2968 | - by calling extract_macros(), store the results as a list of tuples | |
| 2969 | - (filename, stream_path, vba_filename, vba_code) in self.modules. | |
| 2970 | - See extract_macros for details. | |
| 2971 | - """ | |
| 2972 | - if self.modules is None: | |
| 2973 | - self.modules = [] | |
| 2974 | - for (subfilename, stream_path, vba_filename, vba_code) in self.extract_macros(): | |
| 2975 | - self.modules.append((subfilename, stream_path, vba_filename, vba_code)) | |
| 2976 | - self.nb_macros = len(self.modules) | |
| 2977 | - return self.modules | |
| 2978 | - | |
| 2979 | - | |
| 2980 | - | |
| 2981 | - def analyze_macros(self, show_decoded_strings=False, deobfuscate=False): | |
| 2982 | - """ | |
| 2983 | - runs extract_macros and analyze the source code of all VBA macros | |
| 2984 | - found in the file. | |
| 2985 | - All results are stored in self.analysis_results. | |
| 2986 | - If called more than once, simply returns the previous results. | |
| 2987 | - """ | |
| 2988 | - if self.detect_vba_macros(): | |
| 2989 | - # if the analysis was already done, avoid doing it twice: | |
| 2990 | - if self.analysis_results is not None: | |
| 2991 | - return self.analysis_results | |
| 2992 | - # variable to merge source code from all modules: | |
| 2993 | - if self.vba_code_all_modules is None: | |
| 2994 | - self.vba_code_all_modules = '' | |
| 2995 | - for (_, _, _, vba_code) in self.extract_all_macros(): | |
| 2996 | - #TODO: filter code? (each module) | |
| 2997 | - if isinstance(vba_code, bytes): | |
| 2998 | - vba_code = vba_code.decode('utf-8', 'ignore') | |
| 2999 | - self.vba_code_all_modules += vba_code + '\n' | |
| 3000 | - for (_, _, form_string) in self.extract_form_strings(): | |
| 3001 | - self.vba_code_all_modules += form_string.decode('utf-8', 'ignore') + '\n' | |
| 3002 | - # Analyze the whole code at once: | |
| 3003 | - scanner = VBA_Scanner(self.vba_code_all_modules) | |
| 3004 | - self.analysis_results = scanner.scan(show_decoded_strings, deobfuscate) | |
| 3005 | - autoexec, suspicious, iocs, hexstrings, base64strings, dridex, vbastrings = scanner.scan_summary() | |
| 3006 | - self.nb_autoexec += autoexec | |
| 3007 | - self.nb_suspicious += suspicious | |
| 3008 | - self.nb_iocs += iocs | |
| 3009 | - self.nb_hexstrings += hexstrings | |
| 3010 | - self.nb_base64strings += base64strings | |
| 3011 | - self.nb_dridexstrings += dridex | |
| 3012 | - self.nb_vbastrings += vbastrings | |
| 3013 | - | |
| 3014 | - return self.analysis_results | |
| 3015 | - | |
| 3016 | - | |
| 3017 | - def reveal(self): | |
| 3018 | - # we only want printable strings: | |
| 3019 | - analysis = self.analyze_macros(show_decoded_strings=False) | |
| 3020 | - # to avoid replacing short strings contained into longer strings, we sort the analysis results | |
| 3021 | - # based on the length of the encoded string, in reverse order: | |
| 3022 | - analysis = sorted(analysis, key=lambda type_decoded_encoded: len(type_decoded_encoded[2]), reverse=True) | |
| 3023 | - # normally now self.vba_code_all_modules contains source code from all modules | |
| 3024 | - # Need to collapse long lines: | |
| 3025 | - deobf_code = vba_collapse_long_lines(self.vba_code_all_modules) | |
| 3026 | - deobf_code = filter_vba(deobf_code) | |
| 3027 | - for kw_type, decoded, encoded in analysis: | |
| 3028 | - if kw_type == 'VBA string': | |
| 3029 | - #print '%3d occurences: %r => %r' % (deobf_code.count(encoded), encoded, decoded) | |
| 3030 | - # need to add double quotes around the decoded strings | |
| 3031 | - # after escaping double-quotes as double-double-quotes for VBA: | |
| 3032 | - decoded = decoded.replace('"', '""') | |
| 3033 | - decoded = '"%s"' % decoded | |
| 3034 | - # if the encoded string is enclosed in parentheses, | |
| 3035 | - # keep them in the decoded version: | |
| 3036 | - if encoded.startswith('(') and encoded.endswith(')'): | |
| 3037 | - decoded = '(%s)' % decoded | |
| 3038 | - deobf_code = deobf_code.replace(encoded, decoded) | |
| 3039 | - # # TODO: there is a bug somewhere which creates double returns '\r\r' | |
| 3040 | - # deobf_code = deobf_code.replace('\r\r', '\r') | |
| 3041 | - return deobf_code | |
| 3042 | - #TODO: repasser l'analyse plusieurs fois si des chaines hex ou base64 sont revelees | |
| 3043 | - | |
| 3044 | - | |
| 3045 | - def find_vba_forms(self): | |
| 3046 | - """ | |
| 3047 | - Finds all the VBA forms stored in an OLE file. | |
| 3048 | - | |
| 3049 | - Return None if the file is not OLE but OpenXML. | |
| 3050 | - Return a list of tuples (vba_root, project_path, dir_path) for each VBA project. | |
| 3051 | - vba_root is the path of the root OLE storage containing the VBA project, | |
| 3052 | - including a trailing slash unless it is the root of the OLE file. | |
| 3053 | - project_path is the path of the OLE stream named "PROJECT" within the VBA project. | |
| 3054 | - dir_path is the path of the OLE stream named "VBA/dir" within the VBA project. | |
| 3055 | - | |
| 3056 | - If this function returns an empty list for one of the supported formats | |
| 3057 | - (i.e. Word, Excel, Powerpoint), then the file does not contain VBA forms. | |
| 3058 | - | |
| 3059 | - :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path) | |
| 3060 | - for each VBA project found if OLE file | |
| 3061 | - """ | |
| 3062 | - log.debug('VBA_Parser.find_vba_forms') | |
| 3063 | - | |
| 3064 | - # if the file is not OLE but OpenXML, return None: | |
| 3065 | - if self.ole_file is None and self.type != TYPE_PPT: | |
| 3066 | - return None | |
| 3067 | - | |
| 3068 | - # if this method has already been called, return previous result: | |
| 3069 | - # if self.vba_projects is not None: | |
| 3070 | - # return self.vba_projects | |
| 3071 | - | |
| 3072 | - # According to MS-OFORMS section 2.1.2 Control Streams: | |
| 3073 | - # - A parent control, that is, a control that can contain embedded controls, | |
| 3074 | - # MUST be persisted as a storage that contains multiple streams. | |
| 3075 | - # - All parent controls MUST contain a FormControl. The FormControl | |
| 3076 | - # properties are persisted to a stream (1) as specified in section 2.1.1.2. | |
| 3077 | - # The name of this stream (1) MUST be "f". | |
| 3078 | - # - Embedded controls that cannot themselves contain other embedded | |
| 3079 | - # controls are persisted sequentially as FormEmbeddedActiveXControls | |
| 3080 | - # to a stream (1) contained in the same storage as the parent control. | |
| 3081 | - # The name of this stream (1) MUST be "o". | |
| 3082 | - # - all names are case-insensitive | |
| 3083 | - | |
| 3084 | - if self.type == TYPE_PPT: | |
| 3085 | - # TODO: so far, this function is never called for PPT files, but | |
| 3086 | - # if that happens, the information is lost which ole file contains | |
| 3087 | - # which storage! | |
| 3088 | - ole_files = self.ole_subfiles | |
| 3089 | - log.warning('Returned info is not complete for PPT types!') | |
| 3090 | - else: | |
| 3091 | - ole_files = [self.ole_file, ] | |
| 3092 | - | |
| 3093 | - # start with an empty list: | |
| 3094 | - self.vba_forms = [] | |
| 3095 | - | |
| 3096 | - # Loop over ole streams | |
| 3097 | - for ole in ole_files: | |
| 3098 | - # Look for any storage containing those storage/streams: | |
| 3099 | - for storage in ole.listdir(streams=False, storages=True): | |
| 3100 | - log.debug('Checking storage %r' % storage) | |
| 3101 | - # Look for two streams named 'o' and 'f': | |
| 3102 | - o_stream = storage + ['o'] | |
| 3103 | - f_stream = storage + ['f'] | |
| 3104 | - log.debug('Checking if streams %r and %r exist' % (f_stream, o_stream)) | |
| 3105 | - if ole.exists(o_stream) and ole.get_type(o_stream) == olefile.STGTY_STREAM \ | |
| 3106 | - and ole.exists(f_stream) and ole.get_type(f_stream) == olefile.STGTY_STREAM: | |
| 3107 | - form_path = '/'.join(storage) | |
| 3108 | - log.debug('Found VBA Form: %r' % form_path) | |
| 3109 | - self.vba_forms.append(storage) | |
| 3110 | - return self.vba_forms | |
| 3111 | - | |
| 3112 | - def extract_form_strings(self): | |
| 3113 | - """ | |
| 3114 | - Extract printable strings from each VBA Form found in the file | |
| 3115 | - | |
| 3116 | - Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found | |
| 3117 | - If the file is OLE, filename is the path of the file. | |
| 3118 | - If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros | |
| 3119 | - within the zip archive, e.g. word/vbaProject.bin. | |
| 3120 | - If the file is PPT, result is as for OpenXML but filename is useless | |
| 3121 | - """ | |
| 3122 | - if self.ole_file is None: | |
| 3123 | - # This may be either an OpenXML/PPT or a text file: | |
| 3124 | - if self.type == TYPE_TEXT: | |
| 3125 | - # This is a text file, return no results: | |
| 3126 | - return | |
| 3127 | - else: | |
| 3128 | - # OpenXML/PPT: recursively yield results from each OLE subfile: | |
| 3129 | - for ole_subfile in self.ole_subfiles: | |
| 3130 | - for results in ole_subfile.extract_form_strings(): | |
| 3131 | - yield results | |
| 3132 | - else: | |
| 3133 | - # This is an OLE file: | |
| 3134 | - self.find_vba_forms() | |
| 3135 | - ole = self.ole_file | |
| 3136 | - for form_storage in self.vba_forms: | |
| 3137 | - o_stream = form_storage + ['o'] | |
| 3138 | - log.debug('Opening form object stream %r' % '/'.join(o_stream)) | |
| 3139 | - form_data = ole.openstream(o_stream).read() | |
| 3140 | - # Extract printable strings from the form object stream "o": | |
| 3141 | - for m in re_printable_string.finditer(form_data): | |
| 3142 | - log.debug('Printable string found in form: %r' % m.group()) | |
| 3143 | - yield (self.filename, '/'.join(o_stream), m.group()) | |
| 3144 | - | |
| 3145 | - | |
| 3146 | - def close(self): | |
| 3147 | - """ | |
| 3148 | - Close all the open files. This method must be called after usage, if | |
| 3149 | - the application is opening many files. | |
| 3150 | - """ | |
| 3151 | - if self.ole_file is None: | |
| 3152 | - if self.ole_subfiles is not None: | |
| 3153 | - for ole_subfile in self.ole_subfiles: | |
| 3154 | - ole_subfile.close() | |
| 3155 | - else: | |
| 3156 | - self.ole_file.close() | |
| 3157 | - | |
| 3158 | - | |
| 3159 | - | |
| 3160 | -class VBA_Parser_CLI(VBA_Parser): | |
| 3161 | - """ | |
| 3162 | - VBA parser and analyzer, adding methods for the command line interface | |
| 3163 | - of olevba. (see VBA_Parser) | |
| 3164 | - """ | |
| 3165 | - | |
| 3166 | - def __init__(self, *args, **kwargs): | |
| 3167 | - """ | |
| 3168 | - Constructor for VBA_Parser_CLI. | |
| 3169 | - Calls __init__ from VBA_Parser with all arguments --> see doc there | |
| 3170 | - """ | |
| 3171 | - super(VBA_Parser_CLI, self).__init__(*args, **kwargs) | |
| 3172 | - | |
| 3173 | - | |
| 3174 | - def print_analysis(self, show_decoded_strings=False, deobfuscate=False): | |
| 3175 | - """ | |
| 3176 | - Analyze the provided VBA code, and print the results in a table | |
| 3177 | - | |
| 3178 | - :param vba_code: str, VBA source code to be analyzed | |
| 3179 | - :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content. | |
| 3180 | - :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) | |
| 3181 | - :return: None | |
| 3182 | - """ | |
| 3183 | - # print a waiting message only if the output is not redirected to a file: | |
| 3184 | - if sys.stdout.isatty(): | |
| 3185 | - print('Analysis...\r', end='') | |
| 3186 | - sys.stdout.flush() | |
| 3187 | - results = self.analyze_macros(show_decoded_strings, deobfuscate) | |
| 3188 | - if results: | |
| 3189 | - t = prettytable.PrettyTable(('Type', 'Keyword', 'Description')) | |
| 3190 | - t.align = 'l' | |
| 3191 | - t.max_width['Type'] = 10 | |
| 3192 | - t.max_width['Keyword'] = 20 | |
| 3193 | - t.max_width['Description'] = 39 | |
| 3194 | - for kw_type, keyword, description in results: | |
| 3195 | - # handle non printable strings: | |
| 3196 | - if not is_printable(keyword): | |
| 3197 | - keyword = repr(keyword) | |
| 3198 | - if not is_printable(description): | |
| 3199 | - description = repr(description) | |
| 3200 | - t.add_row((kw_type, keyword, description)) | |
| 3201 | - print(t) | |
| 3202 | - else: | |
| 3203 | - print('No suspicious keyword or IOC found.') | |
| 3204 | - | |
| 3205 | - def print_analysis_json(self, show_decoded_strings=False, deobfuscate=False): | |
| 3206 | - """ | |
| 3207 | - Analyze the provided VBA code, and return the results in json format | |
| 3208 | - | |
| 3209 | - :param vba_code: str, VBA source code to be analyzed | |
| 3210 | - :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content. | |
| 3211 | - :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) | |
| 3212 | - | |
| 3213 | - :return: dict | |
| 3214 | - """ | |
| 3215 | - # print a waiting message only if the output is not redirected to a file: | |
| 3216 | - if sys.stdout.isatty(): | |
| 3217 | - print('Analysis...\r', end='') | |
| 3218 | - sys.stdout.flush() | |
| 3219 | - return [dict(type=kw_type, keyword=keyword, description=description) | |
| 3220 | - for kw_type, keyword, description in self.analyze_macros(show_decoded_strings, deobfuscate)] | |
| 3221 | - | |
| 3222 | - def process_file(self, show_decoded_strings=False, | |
| 3223 | - display_code=True, hide_attributes=True, | |
| 3224 | - vba_code_only=False, show_deobfuscated_code=False, | |
| 3225 | - deobfuscate=False): | |
| 3226 | - """ | |
| 3227 | - Process a single file | |
| 3228 | - | |
| 3229 | - :param filename: str, path and filename of file on disk, or within the container. | |
| 3230 | - :param data: bytes, content of the file if it is in a container, None if it is a file on disk. | |
| 3231 | - :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content. | |
| 3232 | - :param display_code: bool, if False VBA source code is not displayed (default True) | |
| 3233 | - :param global_analysis: bool, if True all modules are merged for a single analysis (default), | |
| 3234 | - otherwise each module is analyzed separately (old behaviour) | |
| 3235 | - :param hide_attributes: bool, if True the first lines starting with "Attribute VB" are hidden (default) | |
| 3236 | - :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) | |
| 3237 | - """ | |
| 3238 | - #TODO: replace print by writing to a provided output file (sys.stdout by default) | |
| 3239 | - # fix conflicting parameters: | |
| 3240 | - if vba_code_only and not display_code: | |
| 3241 | - display_code = True | |
| 3242 | - if self.container: | |
| 3243 | - display_filename = '%s in %s' % (self.filename, self.container) | |
| 3244 | - else: | |
| 3245 | - display_filename = self.filename | |
| 3246 | - print('=' * 79) | |
| 3247 | - print('FILE: %s' % display_filename) | |
| 3248 | - try: | |
| 3249 | - #TODO: handle olefile errors, when an OLE file is malformed | |
| 3250 | - print('Type: %s'% self.type) | |
| 3251 | - if self.detect_vba_macros(): | |
| 3252 | - #print 'Contains VBA Macros:' | |
| 3253 | - for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros(): | |
| 3254 | - if hide_attributes: | |
| 3255 | - # hide attribute lines: | |
| 3256 | - if isinstance(vba_code,bytes): | |
| 3257 | - vba_code =vba_code.decode('utf-8','backslashreplace') | |
| 3258 | - vba_code_filtered = filter_vba(vba_code) | |
| 3259 | - else: | |
| 3260 | - vba_code_filtered = vba_code | |
| 3261 | - print('-' * 79) | |
| 3262 | - print('VBA MACRO %s ' % vba_filename) | |
| 3263 | - print('in file: %s - OLE stream: %s' % (subfilename, repr(stream_path))) | |
| 3264 | - if display_code: | |
| 3265 | - print('- ' * 39) | |
| 3266 | - # detect empty macros: | |
| 3267 | - if vba_code_filtered.strip() == '': | |
| 3268 | - print('(empty macro)') | |
| 3269 | - else: | |
| 3270 | - print(vba_code_filtered) | |
| 3271 | - for (subfilename, stream_path, form_string) in self.extract_form_strings(): | |
| 3272 | - print('-' * 79) | |
| 3273 | - print('VBA FORM STRING IN %r - OLE stream: %r' % (subfilename, stream_path)) | |
| 3274 | - print('- ' * 39) | |
| 3275 | - print(form_string.decode('utf-8', 'ignore')) | |
| 3276 | - if not vba_code_only: | |
| 3277 | - # analyse the code from all modules at once: | |
| 3278 | - self.print_analysis(show_decoded_strings, deobfuscate) | |
| 3279 | - if show_deobfuscated_code: | |
| 3280 | - print('MACRO SOURCE CODE WITH DEOBFUSCATED VBA STRINGS (EXPERIMENTAL):\n\n') | |
| 3281 | - print(self.reveal()) | |
| 3282 | - else: | |
| 3283 | - print('No VBA macros found.') | |
| 3284 | - except OlevbaBaseException: | |
| 3285 | - raise | |
| 3286 | - except Exception as exc: | |
| 3287 | - # display the exception with full stack trace for debugging | |
| 3288 | - log.info('Error processing file %s (%s)' % (self.filename, exc)) | |
| 3289 | - log.debug('Traceback:', exc_info=True) | |
| 3290 | - raise ProcessingError(self.filename, exc) | |
| 3291 | - print('') | |
| 3292 | - | |
| 3293 | - | |
| 3294 | - def process_file_json(self, show_decoded_strings=False, | |
| 3295 | - display_code=True, hide_attributes=True, | |
| 3296 | - vba_code_only=False, show_deobfuscated_code=False, | |
| 3297 | - deobfuscate=False): | |
| 3298 | - """ | |
| 3299 | - Process a single file | |
| 3300 | - | |
| 3301 | - every "show" or "print" here is to be translated as "add to json" | |
| 3302 | - | |
| 3303 | - :param filename: str, path and filename of file on disk, or within the container. | |
| 3304 | - :param data: bytes, content of the file if it is in a container, None if it is a file on disk. | |
| 3305 | - :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content. | |
| 3306 | - :param display_code: bool, if False VBA source code is not displayed (default True) | |
| 3307 | - :param global_analysis: bool, if True all modules are merged for a single analysis (default), | |
| 3308 | - otherwise each module is analyzed separately (old behaviour) | |
| 3309 | - :param hide_attributes: bool, if True the first lines starting with "Attribute VB" are hidden (default) | |
| 3310 | - :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) | |
| 3311 | - """ | |
| 3312 | - #TODO: fix conflicting parameters (?) | |
| 3313 | - | |
| 3314 | - if vba_code_only and not display_code: | |
| 3315 | - display_code = True | |
| 3316 | - | |
| 3317 | - result = {} | |
| 3318 | - | |
| 3319 | - if self.container: | |
| 3320 | - result['container'] = self.container | |
| 3321 | - else: | |
| 3322 | - result['container'] = None | |
| 3323 | - result['file'] = self.filename | |
| 3324 | - result['json_conversion_successful'] = False | |
| 3325 | - result['analysis'] = None | |
| 3326 | - result['code_deobfuscated'] = None | |
| 3327 | - result['do_deobfuscate'] = deobfuscate | |
| 3328 | - | |
| 3329 | - try: | |
| 3330 | - #TODO: handle olefile errors, when an OLE file is malformed | |
| 3331 | - result['type'] = self.type | |
| 3332 | - macros = [] | |
| 3333 | - if self.detect_vba_macros(): | |
| 3334 | - for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros(): | |
| 3335 | - curr_macro = {} | |
| 3336 | - if isinstance(vba_code, bytes): | |
| 3337 | - vba_code = vba_code.decode('utf-8', 'backslashreplace') | |
| 3338 | - | |
| 3339 | - if hide_attributes: | |
| 3340 | - # hide attribute lines: | |
| 3341 | - vba_code_filtered = filter_vba(vba_code) | |
| 3342 | - else: | |
| 3343 | - vba_code_filtered = vba_code | |
| 3344 | - | |
| 3345 | - curr_macro['vba_filename'] = vba_filename | |
| 3346 | - curr_macro['subfilename'] = subfilename | |
| 3347 | - curr_macro['ole_stream'] = stream_path | |
| 3348 | - if display_code: | |
| 3349 | - curr_macro['code'] = vba_code_filtered.strip() | |
| 3350 | - else: | |
| 3351 | - curr_macro['code'] = None | |
| 3352 | - macros.append(curr_macro) | |
| 3353 | - if not vba_code_only: | |
| 3354 | - # analyse the code from all modules at once: | |
| 3355 | - result['analysis'] = self.print_analysis_json(show_decoded_strings, | |
| 3356 | - deobfuscate) | |
| 3357 | - if show_deobfuscated_code: | |
| 3358 | - result['code_deobfuscated'] = self.reveal() | |
| 3359 | - result['macros'] = macros | |
| 3360 | - result['json_conversion_successful'] = True | |
| 3361 | - except Exception as exc: | |
| 3362 | - # display the exception with full stack trace for debugging | |
| 3363 | - log.info('Error processing file %s (%s)' % (self.filename, exc)) | |
| 3364 | - log.debug('Traceback:', exc_info=True) | |
| 3365 | - raise ProcessingError(self.filename, exc) | |
| 3366 | - | |
| 3367 | - return result | |
| 3368 | - | |
| 3369 | - | |
| 3370 | - def process_file_triage(self, show_decoded_strings=False, deobfuscate=False): | |
| 3371 | - """ | |
| 3372 | - Process a file in triage mode, showing only summary results on one line. | |
| 3373 | - """ | |
| 3374 | - #TODO: replace print by writing to a provided output file (sys.stdout by default) | |
| 3375 | - try: | |
| 3376 | - #TODO: handle olefile errors, when an OLE file is malformed | |
| 3377 | - if self.detect_vba_macros(): | |
| 3378 | - # print a waiting message only if the output is not redirected to a file: | |
| 3379 | - if sys.stdout.isatty(): | |
| 3380 | - print('Analysis...\r', end='') | |
| 3381 | - sys.stdout.flush() | |
| 3382 | - self.analyze_macros(show_decoded_strings=show_decoded_strings, | |
| 3383 | - deobfuscate=deobfuscate) | |
| 3384 | - flags = TYPE2TAG[self.type] | |
| 3385 | - macros = autoexec = suspicious = iocs = hexstrings = base64obf = dridex = vba_obf = '-' | |
| 3386 | - if self.contains_macros: macros = 'M' | |
| 3387 | - if self.nb_autoexec: autoexec = 'A' | |
| 3388 | - if self.nb_suspicious: suspicious = 'S' | |
| 3389 | - if self.nb_iocs: iocs = 'I' | |
| 3390 | - if self.nb_hexstrings: hexstrings = 'H' | |
| 3391 | - if self.nb_base64strings: base64obf = 'B' | |
| 3392 | - if self.nb_dridexstrings: dridex = 'D' | |
| 3393 | - if self.nb_vbastrings: vba_obf = 'V' | |
| 3394 | - flags += '%s%s%s%s%s%s%s%s' % (macros, autoexec, suspicious, iocs, hexstrings, | |
| 3395 | - base64obf, dridex, vba_obf) | |
| 3396 | - | |
| 3397 | - line = '%-12s %s' % (flags, self.filename) | |
| 3398 | - print(line) | |
| 3399 | - except Exception as exc: | |
| 3400 | - # display the exception with full stack trace for debugging only | |
| 3401 | - log.debug('Error processing file %s (%s)' % (self.filename, exc), | |
| 3402 | - exc_info=True) | |
| 3403 | - raise ProcessingError(self.filename, exc) | |
| 3404 | - | |
| 3405 | - | |
| 3406 | -#=== MAIN ===================================================================== | |
| 3407 | - | |
| 3408 | -def parse_args(cmd_line_args=None): | |
| 3409 | - """ parse command line arguments (given ones or per default sys.argv) """ | |
| 3410 | - | |
| 3411 | - DEFAULT_LOG_LEVEL = "warning" # Default log level | |
| 3412 | - LOG_LEVELS = { | |
| 3413 | - 'debug': logging.DEBUG, | |
| 3414 | - 'info': logging.INFO, | |
| 3415 | - 'warning': logging.WARNING, | |
| 3416 | - 'error': logging.ERROR, | |
| 3417 | - 'critical': logging.CRITICAL | |
| 3418 | - } | |
| 3419 | - | |
| 3420 | - usage = 'usage: olevba [options] <filename> [filename2 ...]' | |
| 3421 | - parser = optparse.OptionParser(usage=usage) | |
| 3422 | - # parser.add_option('-o', '--outfile', dest='outfile', | |
| 3423 | - # help='output file') | |
| 3424 | - # parser.add_option('-c', '--csv', dest='csv', | |
| 3425 | - # help='export results to a CSV file') | |
| 3426 | - parser.add_option("-r", action="store_true", dest="recursive", | |
| 3427 | - help='find files recursively in subdirectories.') | |
| 3428 | - parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None, | |
| 3429 | - help='if the file is a zip archive, open all files from it, using the provided password (requires Python 2.6+)') | |
| 3430 | - parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*', | |
| 3431 | - help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)') | |
| 3432 | - # output mode; could make this even simpler with add_option(type='choice') but that would make | |
| 3433 | - # cmd line interface incompatible... | |
| 3434 | - modes = optparse.OptionGroup(parser, title='Output mode (mutually exclusive)') | |
| 3435 | - modes.add_option("-t", '--triage', action="store_const", dest="output_mode", | |
| 3436 | - const='triage', default='unspecified', | |
| 3437 | - help='triage mode, display results as a summary table (default for multiple files)') | |
| 3438 | - modes.add_option("-d", '--detailed', action="store_const", dest="output_mode", | |
| 3439 | - const='detailed', default='unspecified', | |
| 3440 | - help='detailed mode, display full results (default for single file)') | |
| 3441 | - modes.add_option("-j", '--json', action="store_const", dest="output_mode", | |
| 3442 | - const='json', default='unspecified', | |
| 3443 | - help='json mode, detailed in json format (never default)') | |
| 3444 | - parser.add_option_group(modes) | |
| 3445 | - parser.add_option("-a", '--analysis', action="store_false", dest="display_code", default=True, | |
| 3446 | - help='display only analysis results, not the macro source code') | |
| 3447 | - parser.add_option("-c", '--code', action="store_true", dest="vba_code_only", default=False, | |
| 3448 | - help='display only VBA source code, do not analyze it') | |
| 3449 | - parser.add_option("--decode", action="store_true", dest="show_decoded_strings", | |
| 3450 | - help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex, VBA).') | |
| 3451 | - parser.add_option("--attr", action="store_false", dest="hide_attributes", default=True, | |
| 3452 | - help='display the attribute lines at the beginning of VBA source code') | |
| 3453 | - parser.add_option("--reveal", action="store_true", dest="show_deobfuscated_code", | |
| 3454 | - help='display the macro source code after replacing all the obfuscated strings by their decoded content.') | |
| 3455 | - parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, | |
| 3456 | - help="logging level debug/info/warning/error/critical (default=%default)") | |
| 3457 | - parser.add_option('--deobf', dest="deobfuscate", action="store_true", default=False, | |
| 3458 | - help="Attempt to deobfuscate VBA expressions (slow)") | |
| 3459 | - parser.add_option('--relaxed', dest="relaxed", action="store_true", default=False, | |
| 3460 | - help="Do not raise errors if opening of substream fails") | |
| 3461 | - | |
| 3462 | - (options, args) = parser.parse_args(cmd_line_args) | |
| 3463 | - | |
| 3464 | - # Print help if no arguments are passed | |
| 3465 | - if len(args) == 0: | |
| 3466 | - # print banner with version | |
| 3467 | - python_version = '%d.%d.%d' % sys.version_info[0:3] | |
| 3468 | - print('olevba3 %s on Python %s - http://decalage.info/python/oletools' % | |
| 3469 | - (__version__, python_version)) | |
| 3470 | - print(__doc__) | |
| 3471 | - parser.print_help() | |
| 3472 | - sys.exit(RETURN_WRONG_ARGS) | |
| 3473 | - | |
| 3474 | - options.loglevel = LOG_LEVELS[options.loglevel] | |
| 3475 | - | |
| 3476 | - return options, args | |
| 3477 | - | |
| 3478 | - | |
| 3479 | -def main(cmd_line_args=None): | |
| 3480 | - """ | |
| 3481 | - Main function, called when olevba is run from the command line. | |
| 3482 | - | |
| 3483 | - Optional argument: list of command line arguments, forwarded as-is to | |
| 3484 | - parse_args. Per default (cmd_line_args=None), sys.argv is used. Option | |
| 3485 | - mainly added for unit-testing. Does not return normally: finishes with sys.exit(return_code). | |
| 3486 | - """ | |
| 3487 | - | |
| 3488 | - options, args = parse_args(cmd_line_args) | |
| 3489 | - | |
| 3490 | - # provide info about tool and its version | |
| 3491 | - if options.output_mode == 'json': | |
| 3492 | - # print first json entry with meta info and opening '[' | |
| 3493 | - print_json(script_name='olevba', version=__version__, | |
| 3494 | - url='http://decalage.info/python/oletools', | |
| 3495 | - type='MetaInformation', _json_is_first=True) | |
| 3496 | - else: | |
| 3497 | - # print banner with version | |
| 3498 | - python_version = '%d.%d.%d' % sys.version_info[0:3] | |
| 3499 | - print('olevba3 %s on Python %s - http://decalage.info/python/oletools' % | |
| 3500 | - (__version__, python_version)) | |
| 3501 | - | |
| 3502 | - logging.basicConfig(level=options.loglevel, format='%(levelname)-8s %(message)s') | |
| 3503 | - # enable logging in the modules: | |
| 3504 | - enable_logging() | |
| 3505 | - | |
| 3506 | - # with the option --reveal, make sure --deobf is also enabled: | |
| 3507 | - if options.show_deobfuscated_code and not options.deobfuscate: | |
| 3508 | - log.info('set --deobf because --reveal was set') | |
| 3509 | - options.deobfuscate = True | |
| 3510 | - if options.output_mode == 'triage' and options.show_deobfuscated_code: | |
| 3511 | - log.info('ignoring option --reveal in triage output mode') | |
| 3512 | - | |
| 3513 | - # Column headers (do not know how many files there will be yet, so if no output_mode | |
| 3514 | - # was specified, we will print triage for first file --> need these headers) | |
| 3515 | - if options.output_mode in ('triage', 'unspecified'): | |
| 3516 | - print('%-12s %-65s' % ('Flags', 'Filename')) | |
| 3517 | - print('%-12s %-65s' % ('-' * 11, '-' * 65)) | |
| 3518 | - | |
| 3519 | - previous_container = None | |
| 3520 | - count = 0 | |
| 3521 | - container = filename = data = None | |
| 3522 | - vba_parser = None | |
| 3523 | - return_code = RETURN_OK | |
| 3524 | - try: | |
| 3525 | - for container, filename, data in xglob.iter_files(args, recursive=options.recursive, | |
| 3526 | - zip_password=options.zip_password, zip_fname=options.zip_fname): | |
| 3527 | - # ignore directory names stored in zip files: | |
| 3528 | - if container and filename.endswith('/'): | |
| 3529 | - continue | |
| 3530 | - | |
| 3531 | - # handle errors from xglob | |
| 3532 | - if isinstance(data, Exception): | |
| 3533 | - if isinstance(data, PathNotFoundException): | |
| 3534 | - if options.output_mode in ('triage', 'unspecified'): | |
| 3535 | - print('%-12s %s - File not found' % ('?', filename)) | |
| 3536 | - elif options.output_mode != 'json': | |
| 3537 | - log.error('Given path %r does not exist!' % filename) | |
| 3538 | - return_code = RETURN_FILE_NOT_FOUND if return_code == 0 \ | |
| 3539 | - else RETURN_SEVERAL_ERRS | |
| 3540 | - else: | |
| 3541 | - if options.output_mode in ('triage', 'unspecified'): | |
| 3542 | - print('%-12s %s - Failed to read from zip file %s' % ('?', filename, container)) | |
| 3543 | - elif options.output_mode != 'json': | |
| 3544 | - log.error('Exception opening/reading %r from zip file %r: %s' | |
| 3545 | - % (filename, container, data)) | |
| 3546 | - return_code = RETURN_XGLOB_ERR if return_code == 0 \ | |
| 3547 | - else RETURN_SEVERAL_ERRS | |
| 3548 | - if options.output_mode == 'json': | |
| 3549 | - print_json(file=filename, type='error', | |
| 3550 | - error=type(data).__name__, message=str(data)) | |
| 3551 | - continue | |
| 3552 | - | |
| 3553 | - try: | |
| 3554 | - # close the previous file if analyzing several: | |
| 3555 | - # (this must be done here to avoid closing the file if there is only 1, | |
| 3556 | - # to fix issue #219) | |
| 3557 | - if vba_parser is not None: | |
| 3558 | - vba_parser.close() | |
| 3559 | - # Open the file | |
| 3560 | - vba_parser = VBA_Parser_CLI(filename, data=data, container=container, | |
| 3561 | - relaxed=options.relaxed) | |
| 3562 | - | |
| 3563 | - if options.output_mode == 'detailed': | |
| 3564 | - # fully detailed output | |
| 3565 | - vba_parser.process_file(show_decoded_strings=options.show_decoded_strings, | |
| 3566 | - display_code=options.display_code, | |
| 3567 | - hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, | |
| 3568 | - show_deobfuscated_code=options.show_deobfuscated_code, | |
| 3569 | - deobfuscate=options.deobfuscate) | |
| 3570 | - elif options.output_mode in ('triage', 'unspecified'): | |
| 3571 | - # print container name when it changes: | |
| 3572 | - if container != previous_container: | |
| 3573 | - if container is not None: | |
| 3574 | - print('\nFiles in %s:' % container) | |
| 3575 | - previous_container = container | |
| 3576 | - # summarized output for triage: | |
| 3577 | - vba_parser.process_file_triage(show_decoded_strings=options.show_decoded_strings, | |
| 3578 | - deobfuscate=options.deobfuscate) | |
| 3579 | - elif options.output_mode == 'json': | |
| 3580 | - print_json( | |
| 3581 | - vba_parser.process_file_json(show_decoded_strings=options.show_decoded_strings, | |
| 3582 | - display_code=options.display_code, | |
| 3583 | - hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, | |
| 3584 | - show_deobfuscated_code=options.show_deobfuscated_code, | |
| 3585 | - deobfuscate=options.deobfuscate)) | |
| 3586 | - else: # (should be impossible) | |
| 3587 | - raise ValueError('unexpected output mode: "{0}"!'.format(options.output_mode)) | |
| 3588 | - count += 1 | |
| 3589 | - | |
| 3590 | - except (SubstreamOpenError, UnexpectedDataError) as exc: | |
| 3591 | - if options.output_mode in ('triage', 'unspecified'):  # NOTE(review): the message printed below misspells "unexpected" as "uenxpected" | |
| 3592 | - print('%-12s %s - Error opening substream or uenxpected ' \ | |
| 3593 | - 'content' % ('?', filename)) | |
| 3594 | - elif options.output_mode == 'json': | |
| 3595 | - print_json(file=filename, type='error', | |
| 3596 | - error=type(exc).__name__, message=str(exc)) | |
| 3597 | - else: | |
| 3598 | - log.exception('Error opening substream or unexpected ' | |
| 3599 | - 'content in %s' % filename) | |
| 3600 | - return_code = RETURN_OPEN_ERROR if return_code == 0 \ | |
| 3601 | - else RETURN_SEVERAL_ERRS | |
| 3602 | - except FileOpenError as exc: | |
| 3603 | - if options.output_mode in ('triage', 'unspecified'): | |
| 3604 | - print('%-12s %s - File format not supported' % ('?', filename)) | |
| 3605 | - elif options.output_mode == 'json': | |
| 3606 | - print_json(file=filename, type='error', | |
| 3607 | - error=type(exc).__name__, message=str(exc)) | |
| 3608 | - else: | |
| 3609 | - log.exception('Failed to open %s -- probably not supported!' % filename) | |
| 3610 | - return_code = RETURN_OPEN_ERROR if return_code == 0 \ | |
| 3611 | - else RETURN_SEVERAL_ERRS | |
| 3612 | - except ProcessingError as exc: | |
| 3613 | - if options.output_mode in ('triage', 'unspecified'): | |
| 3614 | - print('%-12s %s - %s' % ('!ERROR', filename, exc.orig_exc)) | |
| 3615 | - elif options.output_mode == 'json': | |
| 3616 | - print_json(file=filename, type='error', | |
| 3617 | - error=type(exc).__name__, | |
| 3618 | - message=str(exc.orig_exc)) | |
| 3619 | - else: | |
| 3620 | - log.exception('Error processing file %s (%s)!' | |
| 3621 | - % (filename, exc.orig_exc)) | |
| 3622 | - return_code = RETURN_PARSE_ERROR if return_code == 0 \ | |
| 3623 | - else RETURN_SEVERAL_ERRS | |
| 3624 | - except FileIsEncryptedError as exc: | |
| 3625 | - if options.output_mode in ('triage', 'unspecified'): | |
| 3626 | - print('%-12s %s - File is encrypted' % ('!ERROR', filename)) | |
| 3627 | - elif options.output_mode == 'json': | |
| 3628 | - print_json(file=filename, type='error', | |
| 3629 | - error=type(exc).__name__, message=str(exc)) | |
| 3630 | - else: | |
| 3631 | - log.exception('File %s is encrypted!' % (filename)) | |
| 3632 | - return_code = RETURN_ENCRYPTED if return_code == 0 \ | |
| 3633 | - else RETURN_SEVERAL_ERRS | |
| 3634 | - # Here we do not close the vba_parser, because process_file may need it below. | |
| 3635 | - | |
| 3636 | - if options.output_mode == 'triage': | |
| 3637 | - print('\n(Flags: OpX=OpenXML, XML=Word2003XML, FlX=FlatOPC XML, MHT=MHTML, TXT=Text, M=Macros, ' \ | |
| 3638 | - 'A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, ' \ | |
| 3639 | - 'B=Base64 strings, D=Dridex strings, V=VBA strings, ?=Unknown)\n') | |
| 3640 | - | |
| 3641 | - if count == 1 and options.output_mode == 'unspecified': | |
| 3642 | - # if options -t, -d and -j were not specified and it's a single file, print details: | |
| 3643 | - vba_parser.process_file(show_decoded_strings=options.show_decoded_strings, | |
| 3644 | - display_code=options.display_code, | |
| 3645 | - hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, | |
| 3646 | - show_deobfuscated_code=options.show_deobfuscated_code, | |
| 3647 | - deobfuscate=options.deobfuscate) | |
| 3648 | - | |
| 3649 | - if options.output_mode == 'json': | |
| 3650 | - # print last json entry (a last one without a comma) and closing ] | |
| 3651 | - print_json(type='MetaInformation', return_code=return_code, | |
| 3652 | - n_processed=count, _json_is_last=True) | |
| 3653 | - | |
| 3654 | - except Exception as exc: | |
| 3655 | - # some unexpected error, maybe some of the types caught in except clauses | |
| 3656 | - # above were not sufficient. This is very bad, so log complete trace at exception level | |
| 3657 | - # and do not care about output mode | |
| 3658 | - log.exception('Unhandled exception in main: %s' % exc, exc_info=True) | |
| 3659 | - return_code = RETURN_UNEXPECTED # even if there were others before -- this is more important | |
| 3660 | - # TODO: print msg with URL to report issues (except in JSON mode) | |
| 3661 | - | |
| 3662 | - # done. exit | |
| 3663 | - log.debug('will exit now with code %s' % return_code) | |
| 3664 | - sys.exit(return_code) | |
| 19 | +from oletools.olevba import * | |
| 20 | +from oletools.olevba import __doc__, __version__ | |
| 3665 | 21 | |
| 3666 | 22 | if __name__ == '__main__':  # script entry point; main is no longer defined in this file and presumably comes from the oletools.olevba star-import above - confirm |
| 3667 | 23 | main() |
| 3668 | 24 | |
| 3669 | -# This was coded while listening to "Dust" from I Love You But I've Chosen Darkness | ... | ... |