Commit 8e1d03d7a18b0779ea73c1d4b13914c07220c37d
1 parent: a7309e59
olevba3: replaced by a redirection to olevba + deprecation warning (issue #106)
Showing 1 changed file with 6 additions and 3651 deletions
oletools/olevba3.py
| 1 | #!/usr/bin/env python | 1 | #!/usr/bin/env python |
| 2 | -""" | ||
| 3 | -olevba3.py | ||
| 4 | 2 | ||
| 5 | -olevba is a script to parse OLE and OpenXML files such as MS Office documents | ||
| 6 | -(e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate | ||
| 7 | -and analyze malicious macros. | 3 | +# olevba3 is a stub that redirects to olevba.py, for backwards compatibility |
| 8 | 4 | ||
| 9 | -olevba3 is the version of olevba that runs on Python 3.x. | ||
| 10 | - | ||
| 11 | -Supported formats: | ||
| 12 | -- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) | ||
| 13 | -- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) | ||
| 14 | -- PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) | ||
| 15 | -- Word/PowerPoint 2007+ XML (aka Flat OPC) | ||
| 16 | -- Word 2003 XML (.xml) | ||
| 17 | -- Word/Excel Single File Web Page / MHTML (.mht) | ||
| 18 | -- Publisher (.pub) | ||
| 19 | -- raises an error if run with files encrypted using MS Crypto API RC4 | ||
| 20 | - | ||
| 21 | -Author: Philippe Lagadec - http://www.decalage.info | ||
| 22 | -License: BSD, see source code or documentation | ||
| 23 | - | ||
| 24 | -olevba is part of the python-oletools package: | ||
| 25 | -http://www.decalage.info/python/oletools | ||
| 26 | - | ||
| 27 | -olevba is based on source code from officeparser by John William Davison | ||
| 28 | -https://github.com/unixfreak0037/officeparser | ||
| 29 | -""" | ||
| 30 | - | ||
| 31 | -# === LICENSE ================================================================== | ||
| 32 | - | ||
| 33 | -# olevba is copyright (c) 2014-2018 Philippe Lagadec (http://www.decalage.info) | ||
| 34 | -# All rights reserved. | ||
| 35 | -# | ||
| 36 | -# Redistribution and use in source and binary forms, with or without modification, | ||
| 37 | -# are permitted provided that the following conditions are met: | ||
| 38 | -# | ||
| 39 | -# * Redistributions of source code must retain the above copyright notice, this | ||
| 40 | -# list of conditions and the following disclaimer. | ||
| 41 | -# * Redistributions in binary form must reproduce the above copyright notice, | ||
| 42 | -# this list of conditions and the following disclaimer in the documentation | ||
| 43 | -# and/or other materials provided with the distribution. | ||
| 44 | -# | ||
| 45 | -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | ||
| 46 | -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||
| 47 | -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||
| 48 | -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | ||
| 49 | -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 50 | -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||
| 51 | -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | ||
| 52 | -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||
| 53 | -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
| 54 | -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 55 | - | ||
| 56 | - | ||
| 57 | -# olevba contains modified source code from the officeparser project, published | ||
| 58 | -# under the following MIT License (MIT): | ||
| 59 | -# | ||
| 60 | -# officeparser is copyright (c) 2014 John William Davison | ||
| 61 | -# | ||
| 62 | -# Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| 63 | -# of this software and associated documentation files (the "Software"), to deal | ||
| 64 | -# in the Software without restriction, including without limitation the rights | ||
| 65 | -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| 66 | -# copies of the Software, and to permit persons to whom the Software is | ||
| 67 | -# furnished to do so, subject to the following conditions: | ||
| 68 | -# | ||
| 69 | -# The above copyright notice and this permission notice shall be included in all | ||
| 70 | -# copies or substantial portions of the Software. | ||
| 71 | -# | ||
| 72 | -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| 73 | -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| 74 | -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
| 75 | -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| 76 | -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| 77 | -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| 78 | -# SOFTWARE. | ||
| 79 | - | ||
| 80 | -from __future__ import print_function | ||
| 81 | - | ||
| 82 | -#------------------------------------------------------------------------------ | ||
| 83 | -# CHANGELOG: | ||
| 84 | -# 2014-08-05 v0.01 PL: - first version based on officeparser code | ||
| 85 | -# 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser | ||
| 86 | -# 2014-08-15 PL: - fixed incorrect value check in projecthelpfilepath Record | ||
| 87 | -# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats | ||
| 88 | -# and to find the VBA project root anywhere in the file | ||
| 89 | -# 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL | ||
| 90 | -# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API | ||
| 91 | -# - added detect_vba_macros | ||
| 92 | -# 2014-12-10 v0.06 PL: - hide first lines with VB attributes | ||
| 93 | -# - detect auto-executable macros | ||
| 94 | -# - ignore empty macros | ||
| 95 | -# 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive | ||
| 96 | -# 2014-12-15 v0.08 PL: - improved display for empty macros | ||
| 97 | -# - added pattern extraction | ||
| 98 | -# 2014-12-25 v0.09 PL: - added suspicious keywords detection | ||
| 99 | -# 2014-12-27 v0.10 PL: - added OptionParser, main and process_file | ||
| 100 | -# - uses xglob to scan several files with wildcards | ||
| 101 | -# - option -r to recurse subdirectories | ||
| 102 | -# - option -z to scan files in password-protected zips | ||
| 103 | -# 2015-01-02 v0.11 PL: - improved filter_vba to detect colons | ||
| 104 | -# 2015-01-03 v0.12 PL: - fixed detect_patterns to detect all patterns | ||
| 105 | -# - process_file: improved display, shows container file | ||
| 106 | -# - improved list of executable file extensions | ||
| 107 | -# 2015-01-04 v0.13 PL: - added several suspicious keywords, improved display | ||
| 108 | -# 2015-01-08 v0.14 PL: - added hex strings detection and decoding | ||
| 109 | -# - fixed issue #2, decoding VBA stream names using | ||
| 110 | -# specified codepage and unicode stream names | ||
| 111 | -# 2015-01-11 v0.15 PL: - added new triage mode, options -t and -d | ||
| 112 | -# 2015-01-16 v0.16 PL: - fix for issue #3 (exception when module name="text") | ||
| 113 | -# - added several suspicious keywords | ||
| 114 | -# - added option -i to analyze VBA source code directly | ||
| 115 | -# 2015-01-17 v0.17 PL: - removed .com from the list of executable extensions | ||
| 116 | -# - added scan_vba to run all detection algorithms | ||
| 117 | -# - decoded hex strings are now also scanned + reversed | ||
| 118 | -# 2015-01-23 v0.18 PL: - fixed issue #3, case-insensitive search in code_modules | ||
| 119 | -# 2015-01-24 v0.19 PL: - improved the detection of IOCs obfuscated with hex | ||
| 120 | -# strings and StrReverse | ||
| 121 | -# 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded | ||
| 122 | -# 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding | ||
| 123 | -# - improved display, shows obfuscation name | ||
| 124 | -# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename | ||
| 125 | -# - added Base64 obfuscation decoding (contribution from | ||
| 126 | -# @JamesHabben) | ||
| 127 | -# 2015-02-03 v0.23 PL: - triage now uses VBA_Scanner results, shows Base64 and | ||
| 128 | -# Dridex strings | ||
| 129 | -# - exception handling in detect_base64_strings | ||
| 130 | -# 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display | ||
| 131 | -# - display exceptions with stack trace | ||
| 132 | -# - added several suspicious keywords | ||
| 133 | -# - improved Base64 detection and decoding | ||
| 134 | -# - fixed triage mode not to scan attrib lines | ||
| 135 | -# 2015-03-04 v0.25 PL: - added support for Word 2003 XML | ||
| 136 | -# 2015-03-22 v0.26 PL: - added suspicious keywords for sandboxing and | ||
| 137 | -# virtualisation detection | ||
| 138 | -# 2015-05-06 v0.27 PL: - added support for MHTML files with VBA macros | ||
| 139 | -# (issue #10 reported by Greg from SpamStopsHere) | ||
| 140 | -# 2015-05-24 v0.28 PL: - improved support for MHTML files with modified header | ||
| 141 | -# (issue #11 reported by Thomas Chopitea) | ||
| 142 | -# 2015-05-26 v0.29 PL: - improved MSO files parsing, taking into account | ||
| 143 | -# various data offsets (issue #12) | ||
| 144 | -# - improved detection of MSO files, avoiding incorrect | ||
| 145 | -# parsing errors (issue #7) | ||
| 146 | -# 2015-05-29 v0.30 PL: - added suspicious keywords suggested by @ozhermit, | ||
| 147 | -# Davy Douhine (issue #9), issue #13 | ||
| 148 | -# 2015-06-16 v0.31 PL: - added generic VBA expression deobfuscation (chr,asc,etc) | ||
| 149 | -# 2015-06-19 PL: - added options -a, -c, --each, --attr | ||
| 150 | -# 2015-06-21 v0.32 PL: - always display decoded strings which are printable | ||
| 151 | -# - fix VBA_Scanner.scan to return raw strings, not repr() | ||
| 152 | -# 2015-07-09 v0.40 PL: - removed usage of sys.stderr which causes issues | ||
| 153 | -# 2015-07-12 PL: - added Hex function decoding to VBA Parser | ||
| 154 | -# 2015-07-13 PL: - added Base64 function decoding to VBA Parser | ||
| 155 | -# 2015-09-06 PL: - improved VBA_Parser, refactored the main functions | ||
| 156 | -# 2015-09-13 PL: - moved main functions to a class VBA_Parser_CLI | ||
| 157 | -# - fixed issue when analysis was done twice | ||
| 158 | -# 2015-09-15 PL: - remove duplicate IOCs from results | ||
| 159 | -# 2015-09-16 PL: - join long VBA lines ending with underscore before scan | ||
| 160 | -# - disabled unused option --each | ||
| 161 | -# 2015-09-22 v0.41 PL: - added new option --reveal | ||
| 162 | -# - added suspicious strings for PowerShell.exe options | ||
| 163 | -# 2015-10-09 v0.42 PL: - VBA_Parser: split each format into a separate method | ||
| 164 | -# 2015-10-10 PL: - added support for text files with VBA source code | ||
| 165 | -# 2015-11-17 PL: - fixed bug with --decode option | ||
| 166 | -# 2015-12-16 PL: - fixed bug in main (no options input anymore) | ||
| 167 | -# - improved logging, added -l option | ||
| 168 | -# 2016-01-31 PL: - fixed issue #31 in VBA_Parser.open_mht | ||
| 169 | -# - fixed issue #32 by monkeypatching email.feedparser | ||
| 170 | -# 2016-02-07 PL: - KeyboardInterrupt is now raised properly | ||
| 171 | -# 2016-02-20 v0.43 PL: - fixed issue #34 in the VBA parser and vba_chr | ||
| 172 | -# 2016-02-29 PL: - added Workbook_Activate to suspicious keywords | ||
| 173 | -# 2016-03-08 v0.44 PL: - added VBA Form strings extraction and analysis | ||
| 174 | -# 2016-03-04 v0.45 CH: - added JSON output (by Christian Herdtweck) | ||
| 175 | -# 2016-03-16 CH: - added option --no-deobfuscate (temporary) | ||
| 176 | -# 2016-04-19 v0.46 PL: - new option --deobf instead of --no-deobfuscate | ||
| 177 | -# - updated suspicious keywords | ||
| 178 | -# 2016-05-04 v0.47 PL: - look for VBA code in any stream including orphans | ||
| 179 | -# 2016-04-28 CH: - return an exit code depending on the results | ||
| 180 | -# - improved error and exception handling | ||
| 181 | -# - improved JSON output | ||
| 182 | -# 2016-05-12 CH: - added support for PowerPoint 97-2003 files | ||
| 183 | -# 2016-06-06 CH: - improved handling of unicode VBA module names | ||
| 184 | -# 2016-06-07 CH: - added option --relaxed, stricter parsing by default | ||
| 185 | -# 2016-06-12 v0.50 PL: - fixed small bugs in VBA parsing code | ||
| 186 | -# 2016-07-01 PL: - fixed issue #58 with format() to support Python 2.6 | ||
| 187 | -# 2016-07-29 CH: - fixed several bugs including #73 (Mac Roman encoding) | ||
| 188 | -# 2016-08-31 PL: - added autoexec keyword InkPicture_Painted | ||
| 189 | -# - detect_autoexec now returns the exact keyword found | ||
| 190 | -# 2016-09-05 PL: - added autoexec keywords for MS Publisher (.pub) | ||
| 191 | -# 2016-09-06 PL: - fixed issue #20, is_zipfile on Python 2.6 | ||
| 192 | -# 2016-09-12 PL: - enabled packrat to improve pyparsing performance | ||
| 193 | -# 2016-10-25 PL: - fixed raise and print statements for Python 3 | ||
| 194 | -# 2016-11-03 v0.51 PL: - added EnumDateFormats and EnumSystemLanguageGroupsW | ||
| 195 | -# 2017-02-07 PL: - temporary fix for issue #132 | ||
| 196 | -# - added keywords for Mac-specific macros (issue #130) | ||
| 197 | -# 2017-03-08 PL: - fixed absolute imports | ||
| 198 | -# 2017-03-16 PL: - fixed issues #148 and #149 for option --reveal | ||
| 199 | -# 2017-05-19 PL: - added enable_logging to fix issue #154 | ||
| 200 | -# 2017-05-31 c1fe: - PR #135 fixing issue #132 for some Mac files | ||
| 201 | -# 2017-06-08 PL: - fixed issue #122 Chr() with negative numbers | ||
| 202 | -# 2017-06-15 PL: - deobfuscation line by line to handle large files | ||
| 203 | -# 2017-07-11 v0.52 PL: - raise exception instead of sys.exit (issue #180) | ||
| 204 | -# 2017-11-20 PL: - fixed issue #219, do not close the file too early | ||
| 205 | -# 2017-11-24 PL: - added keywords to detect self-modifying macros and | ||
| 206 | -# attempts to disable macro security (issue #221) | ||
| 207 | -# 2018-03-19 PL: - removed pyparsing from the thirdparty subfolder | ||
| 208 | -# 2018-05-13 v0.53 PL: - added support for Word/PowerPoint 2007+ XML (FlatOPC) | ||
| 209 | -# (issue #283) | ||
| 210 | -# 2018-06-11 v0.53.1 MHW: - fixed #320: chr instead of unichr on python 3 | ||
| 211 | -# 2018-06-12 MHW: - fixed #322: import reduce from functools | ||
| 212 | -# 2018-09-11 v0.54 PL: - olefile is now a dependency | ||
| 213 | -# 2018-10-25 CH: - detect encryption and raise error if detected | ||
| 214 | - | ||
| 215 | -__version__ = '0.54dev4' | ||
| 216 | - | ||
| 217 | -#------------------------------------------------------------------------------ | ||
| 218 | -# TODO: | ||
| 219 | -# + setup logging (common with other oletools) | ||
| 220 | -# + add xor bruteforcing like bbharvest | ||
| 221 | -# + options -a and -c should imply -d | ||
| 222 | - | ||
| 223 | -# TODO later: | ||
| 224 | -# + performance improvement: instead of searching each keyword separately, | ||
| 225 | -# first split vba code into a list of words (per line), then check each | ||
| 226 | -# word against a dict. (or put vba words into a set/dict?) | ||
| 227 | -# + for regex, maybe combine them into a single re with named groups? | ||
| 228 | -# + add Yara support, include sample rules? plugins like balbuzard? | ||
| 229 | -# + add balbuzard support | ||
| 230 | -# + output to file (replace print by file.write, sys.stdout by default) | ||
| 231 | -# + look for VBA in embedded documents (e.g. Excel in Word) | ||
| 232 | -# + support SRP streams (see Lenny's article + links and sample) | ||
| 233 | -# - python 3.x support | ||
| 234 | -# - check VBA macros in Visio, Access, Project, etc | ||
| 235 | -# - extract_macros: convert to a class, split long function into smaller methods | ||
| 236 | -# - extract_macros: read bytes from stream file objects instead of strings | ||
| 237 | -# - extract_macros: use combined struct.unpack instead of many calls | ||
| 238 | -# - all except clauses should target specific exceptions | ||
| 239 | - | ||
| 240 | -#------------------------------------------------------------------------------ | ||
| 241 | -# REFERENCES: | ||
| 242 | -# - [MS-OVBA]: Microsoft Office VBA File Format Structure | ||
| 243 | -# http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx | ||
| 244 | -# - officeparser: https://github.com/unixfreak0037/officeparser | ||
| 245 | - | ||
| 246 | - | ||
| 247 | -#--- IMPORTS ------------------------------------------------------------------ | ||
| 248 | - | ||
| 249 | -import sys | ||
| 250 | -import os | ||
| 251 | -import logging | ||
| 252 | -import struct | ||
| 253 | -from io import BytesIO | ||
| 254 | -import math | ||
| 255 | -import zipfile | ||
| 256 | -import re | ||
| 257 | -import optparse | ||
| 258 | -import binascii | ||
| 259 | -import base64 | ||
| 260 | -import zlib | ||
| 261 | -import email # for MHTML parsing | ||
| 262 | -import string # for printable | ||
| 263 | -import json # for json output mode (argument --json) | ||
| 264 | - | ||
| 265 | -# import lxml or ElementTree for XML parsing: | ||
| 266 | -try: | ||
| 267 | - # lxml: best performance for XML processing | ||
| 268 | - import lxml.etree as ET | ||
| 269 | -except ImportError: | ||
| 270 | - try: | ||
| 271 | - # Python 2.5+: batteries included | ||
| 272 | - import xml.etree.cElementTree as ET | ||
| 273 | - except ImportError: | ||
| 274 | - try: | ||
| 275 | - # Python <2.5: standalone ElementTree install | ||
| 276 | - import elementtree.cElementTree as ET | ||
| 277 | - except ImportError: | ||
| 278 | - raise ImportError("lxml or ElementTree are not installed, " \ | ||
| 279 | - + "see http://codespeak.net/lxml " \ | ||
| 280 | - + "or http://effbot.org/zone/element-index.htm") | ||
| 281 | - | ||
| 282 | -import colorclass | ||
| 283 | - | ||
| 284 | -# On Windows, colorclass needs to be enabled: | ||
| 285 | -if os.name == 'nt': | ||
| 286 | - colorclass.Windows.enable(auto_colors=True) | 5 | +import sys, os, warnings |
| 287 | 6 | ||
| 7 | +warnings.warn('olevba3 is deprecated, olevba should be used instead.', DeprecationWarning) | ||
| 288 | 8 | ||
| 289 | # IMPORTANT: it should be possible to run oletools directly as scripts | 9 | # IMPORTANT: it should be possible to run oletools directly as scripts |
| 290 | # in any directory without installing them with pip or setup.py. | 10 | # in any directory without installing them with pip or setup.py. |
| @@ -292,3378 +12,13 @@ if os.name == 'nt': | @@ -292,3378 +12,13 @@ if os.name == 'nt': | ||
| 292 | # And to enable Python 2+3 compatibility, we need to use absolute imports, | 12 | # And to enable Python 2+3 compatibility, we need to use absolute imports, |
| 293 | # so we add the oletools parent folder to sys.path (absolute+normalized path): | 13 | # so we add the oletools parent folder to sys.path (absolute+normalized path): |
| 294 | _thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) | 14 | _thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) |
| 295 | -# print('_thismodule_dir = %r' % _thismodule_dir) | ||
| 296 | _parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) | 15 | _parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) |
| 297 | -# print('_parent_dir = %r' % _thirdparty_dir) | ||
| 298 | -if not _parent_dir in sys.path: | 16 | +if _parent_dir not in sys.path: |
| 299 | sys.path.insert(0, _parent_dir) | 17 | sys.path.insert(0, _parent_dir) |
| 300 | 18 | ||
| 301 | -import olefile | ||
| 302 | -from oletools.thirdparty.prettytable import prettytable | ||
| 303 | -from oletools.thirdparty.xglob import xglob, PathNotFoundException | ||
| 304 | -from pyparsing import \ | ||
| 305 | - CaselessKeyword, CaselessLiteral, Combine, Forward, Literal, \ | ||
| 306 | - Optional, QuotedString,Regex, Suppress, Word, WordStart, \ | ||
| 307 | - alphanums, alphas, hexnums,nums, opAssoc, srange, \ | ||
| 308 | - infixNotation, ParserElement | ||
| 309 | -import oletools.ppt_parser as ppt_parser | ||
| 310 | -from oletools import rtfobj | ||
| 311 | -from oletools import oleid | ||
| 312 | -from oletools.common.errors import FileIsEncryptedError | ||
| 313 | - | ||
| 314 | -# monkeypatch email to fix issue #32: | ||
| 315 | -# allow header lines without ":" | ||
| 316 | -import email.feedparser | ||
| 317 | -email.feedparser.headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:?|[\t ])') | ||
| 318 | - | ||
| 319 | -# === PYTHON 2+3 SUPPORT ====================================================== | ||
| 320 | - | ||
| 321 | -if sys.version_info[0] <= 2: | ||
| 322 | - # Python 2.x | ||
| 323 | - if sys.version_info[1] <= 6: | ||
| 324 | - # Python 2.6 | ||
| 325 | - # use is_zipfile backported from Python 2.7: | ||
| 326 | - from thirdparty.zipfile27 import is_zipfile | ||
| 327 | - else: | ||
| 328 | - # Python 2.7 | ||
| 329 | - from zipfile import is_zipfile | ||
| 330 | -else: | ||
| 331 | - # Python 3.x+ | ||
| 332 | - from zipfile import is_zipfile | ||
| 333 | - # xrange is now called range: | ||
| 334 | - xrange = range | ||
| 335 | - # unichr does not exist anymore, only chr: | ||
| 336 | - unichr = chr | ||
| 337 | - from functools import reduce | ||
| 338 | - | ||
| 339 | - | ||
| 340 | -# === PYTHON 3.0 - 3.4 SUPPORT ====================================================== | ||
| 341 | - | ||
| 342 | -# From https://gist.github.com/ynkdir/867347/c5e188a4886bc2dd71876c7e069a7b00b6c16c61 | ||
| 343 | - | ||
| 344 | -if sys.version_info >= (3, 0) and sys.version_info < (3, 5): | ||
| 345 | - import codecs | ||
| 346 | - | ||
| 347 | - _backslashreplace_errors = codecs.lookup_error("backslashreplace") | ||
| 348 | - | ||
| 349 | - def backslashreplace_errors(exc): | ||
| 350 | - if isinstance(exc, UnicodeDecodeError): | ||
| 351 | - u = "".join("\\x{0:02x}".format(c) for c in exc.object[exc.start:exc.end]) | ||
| 352 | - return (u, exc.end) | ||
| 353 | - return _backslashreplace_errors(exc) | ||
| 354 | - | ||
| 355 | - codecs.register_error("backslashreplace", backslashreplace_errors) | ||
| 356 | - | ||
| 357 | - | ||
| 358 | -# === LOGGING ================================================================= | ||
| 359 | - | ||
| 360 | -class NullHandler(logging.Handler): | ||
| 361 | - """ | ||
| 362 | - Log Handler without output, to avoid printing messages if logging is not | ||
| 363 | - configured by the main application. | ||
| 364 | - Python 2.7 has logging.NullHandler, but this is necessary for 2.6: | ||
| 365 | - see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library | ||
| 366 | - """ | ||
| 367 | - def emit(self, record): | ||
| 368 | - pass | ||
| 369 | - | ||
| 370 | -def get_logger(name, level=logging.CRITICAL+1): | ||
| 371 | - """ | ||
| 372 | - Create a suitable logger object for this module. | ||
| 373 | - The goal is not to change settings of the root logger, to avoid getting | ||
| 374 | - other modules' logs on the screen. | ||
| 375 | - If a logger exists with same name, reuse it. (Else it would have duplicate | ||
| 376 | - handlers and messages would be doubled.) | ||
| 377 | - The level is set to CRITICAL+1 by default, to avoid any logging. | ||
| 378 | - """ | ||
| 379 | - # First, test if there is already a logger with the same name, else it | ||
| 380 | - # will generate duplicate messages (due to duplicate handlers): | ||
| 381 | - if name in logging.Logger.manager.loggerDict: | ||
| 382 | - #NOTE: another less intrusive but more "hackish" solution would be to | ||
| 383 | - # use getLogger then test if its effective level is not default. | ||
| 384 | - logger = logging.getLogger(name) | ||
| 385 | - # make sure level is OK: | ||
| 386 | - logger.setLevel(level) | ||
| 387 | - return logger | ||
| 388 | - # get a new logger: | ||
| 389 | - logger = logging.getLogger(name) | ||
| 390 | - # only add a NullHandler for this logger, it is up to the application | ||
| 391 | - # to configure its own logging: | ||
| 392 | - logger.addHandler(NullHandler()) | ||
| 393 | - logger.setLevel(level) | ||
| 394 | - return logger | ||
| 395 | - | ||
| 396 | -# a global logger object used for debugging: | ||
| 397 | -log = get_logger('olevba') | ||
| 398 | - | ||
| 399 | - | ||
| 400 | -def enable_logging(): | ||
| 401 | - """ | ||
| 402 | - Enable logging for this module (disabled by default). | ||
| 403 | - This will set the module-specific logger level to NOTSET, which | ||
| 404 | - means the main application controls the actual logging level. | ||
| 405 | - """ | ||
| 406 | - log.setLevel(logging.NOTSET) | ||
| 407 | - # Also enable logging in the ppt_parser module: | ||
| 408 | - ppt_parser.enable_logging() | ||
| 409 | - | ||
| 410 | - | ||
| 411 | - | ||
| 412 | -#=== EXCEPTIONS ============================================================== | ||
| 413 | - | ||
| 414 | -class OlevbaBaseException(Exception): | ||
| 415 | - """ Base class for exceptions produced here for simpler except clauses """ | ||
| 416 | - def __init__(self, msg, filename=None, orig_exc=None, **kwargs): | ||
| 417 | - if orig_exc: | ||
| 418 | - super(OlevbaBaseException, self).__init__(msg + | ||
| 419 | - ' ({0})'.format(orig_exc), | ||
| 420 | - **kwargs) | ||
| 421 | - else: | ||
| 422 | - super(OlevbaBaseException, self).__init__(msg, **kwargs) | ||
| 423 | - self.msg = msg | ||
| 424 | - self.filename = filename | ||
| 425 | - self.orig_exc = orig_exc | ||
| 426 | - | ||
| 427 | - | ||
| 428 | -class FileOpenError(OlevbaBaseException): | ||
| 429 | - """ raised by VBA_Parser constructor if all open_... attempts failed | ||
| 430 | - | ||
| 431 | - probably means the file type is not supported | ||
| 432 | - """ | ||
| 433 | - | ||
| 434 | - def __init__(self, filename, orig_exc=None): | ||
| 435 | - super(FileOpenError, self).__init__( | ||
| 436 | - 'Failed to open file %s' % filename, filename, orig_exc) | ||
| 437 | - | ||
| 438 | - | ||
| 439 | -class ProcessingError(OlevbaBaseException): | ||
| 440 | - """ raised by VBA_Parser.process_file* functions """ | ||
| 441 | - | ||
| 442 | - def __init__(self, filename, orig_exc): | ||
| 443 | - super(ProcessingError, self).__init__( | ||
| 444 | - 'Error processing file %s' % filename, filename, orig_exc) | ||
| 445 | - | ||
| 446 | - | ||
| 447 | -class MsoExtractionError(RuntimeError, OlevbaBaseException): | ||
| 448 | - """ raised by mso_file_extract if parsing MSO/ActiveMIME data failed """ | ||
| 449 | - | ||
| 450 | - def __init__(self, msg): | ||
| 451 | - MsoExtractionError.__init__(self, msg) | ||
| 452 | - OlevbaBaseException.__init__(self, msg) | ||
| 453 | - | ||
| 454 | - | ||
| 455 | -class SubstreamOpenError(FileOpenError): | ||
| 456 | - """ special kind of FileOpenError: file is a substream of original file """ | ||
| 457 | - | ||
| 458 | - def __init__(self, filename, subfilename, orig_exc=None): | ||
| 459 | - super(SubstreamOpenError, self).__init__( | ||
| 460 | - str(filename) + '/' + str(subfilename), orig_exc) | ||
| 461 | - self.filename = filename # overwrite setting in OlevbaBaseException | ||
| 462 | - self.subfilename = subfilename | ||
| 463 | - | ||
| 464 | - | ||
| 465 | -class UnexpectedDataError(OlevbaBaseException): | ||
| 466 | - """ raised when parsing is strict (=not relaxed) and data is unexpected """ | ||
| 467 | - | ||
| 468 | - def __init__(self, stream_path, variable, expected, value): | ||
| 469 | - if isinstance(expected, int): | ||
| 470 | - es = '{0:04X}'.format(expected) | ||
| 471 | - elif isinstance(expected, tuple): | ||
| 472 | - es = ','.join('{0:04X}'.format(e) for e in expected) | ||
| 473 | - es = '({0})'.format(es) | ||
| 474 | - else: | ||
| 475 | - raise ValueError('Unknown type encountered: {0}'.format(type(expected))) | ||
| 476 | - super(UnexpectedDataError, self).__init__( | ||
| 477 | - 'Unexpected value in {0} for variable {1}: ' | ||
| 478 | - 'expected {2} but found {3:04X}!' | ||
| 479 | - .format(stream_path, variable, es, value)) | ||
| 480 | - self.stream_path = stream_path | ||
| 481 | - self.variable = variable | ||
| 482 | - self.expected = expected | ||
| 483 | - self.value = value | ||
| 484 | - | ||
| 485 | -#--- CONSTANTS ---------------------------------------------------------------- | ||
| 486 | - | ||
| 487 | -# return codes | ||
| 488 | -RETURN_OK = 0 | ||
| 489 | -RETURN_WARNINGS = 1 # (reserved, not used yet) | ||
| 490 | -RETURN_WRONG_ARGS = 2 # (fixed, built into optparse) | ||
| 491 | -RETURN_FILE_NOT_FOUND = 3 | ||
| 492 | -RETURN_XGLOB_ERR = 4 | ||
| 493 | -RETURN_OPEN_ERROR = 5 | ||
| 494 | -RETURN_PARSE_ERROR = 6 | ||
| 495 | -RETURN_SEVERAL_ERRS = 7 | ||
| 496 | -RETURN_UNEXPECTED = 8 | ||
| 497 | -RETURN_ENCRYPTED = 9 | ||
| 498 | - | ||
| 499 | -# MAC codepages (from http://stackoverflow.com/questions/1592925/decoding-mac-os-text-in-python) | ||
| 500 | -MAC_CODEPAGES = { | ||
| 501 | - 10000: 'mac-roman', | ||
| 502 | - 10001: 'shiftjis', # not found: 'mac-shift-jis', | ||
| 503 | - 10003: 'ascii', # nothing appropriate found: 'mac-hangul', | ||
| 504 | - 10008: 'gb2321', # not found: 'mac-gb2312', | ||
| 505 | - 10002: 'big5', # not found: 'mac-big5', | ||
| 506 | - 10005: 'hebrew', # not found: 'mac-hebrew', | ||
| 507 | - 10004: 'mac-arabic', | ||
| 508 | - 10006: 'mac-greek', | ||
| 509 | - 10081: 'mac-turkish', | ||
| 510 | - 10021: 'thai', # not found: mac-thai', | ||
| 511 | - 10029: 'maccentraleurope', # not found: 'mac-east europe', | ||
| 512 | - 10007: 'ascii', # nothing appropriate found: 'mac-russian', | ||
| 513 | -} | ||
| 514 | - | ||
| 515 | -# URL and message to report issues: | ||
| 516 | -URL_OLEVBA_ISSUES = 'https://github.com/decalage2/oletools/issues' | ||
| 517 | -MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES | ||
| 518 | - | ||
| 519 | -# Container types: | ||
| 520 | -TYPE_OLE = 'OLE' | ||
| 521 | -TYPE_OpenXML = 'OpenXML' | ||
| 522 | -TYPE_FlatOPC_XML = 'FlatOPC_XML' | ||
| 523 | -TYPE_Word2003_XML = 'Word2003_XML' | ||
| 524 | -TYPE_MHTML = 'MHTML' | ||
| 525 | -TYPE_TEXT = 'Text' | ||
| 526 | -TYPE_PPT = 'PPT' | ||
| 527 | - | ||
| 528 | -# short tag to display file types in triage mode: | ||
| 529 | -TYPE2TAG = { | ||
| 530 | - TYPE_OLE: 'OLE:', | ||
| 531 | - TYPE_OpenXML: 'OpX:', | ||
| 532 | - TYPE_FlatOPC_XML: 'FlX:', | ||
| 533 | - TYPE_Word2003_XML: 'XML:', | ||
| 534 | - TYPE_MHTML: 'MHT:', | ||
| 535 | - TYPE_TEXT: 'TXT:', | ||
| 536 | - TYPE_PPT: 'PPT', | ||
| 537 | -} | ||
| 538 | - | ||
| 539 | - | ||
| 540 | -# MSO files ActiveMime header magic | ||
| 541 | -MSO_ACTIVEMIME_HEADER = b'ActiveMime' | ||
| 542 | - | ||
| 543 | -MODULE_EXTENSION = "bas" | ||
| 544 | -CLASS_EXTENSION = "cls" | ||
| 545 | -FORM_EXTENSION = "frm" | ||
| 546 | - | ||
| 547 | -# Namespaces and tags for Word2003 XML parsing: | ||
| 548 | -NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}' | ||
| 549 | -# the tag <w:binData w:name="editdata.mso"> contains the VBA macro code: | ||
| 550 | -TAG_BINDATA = NS_W + 'binData' | ||
| 551 | -ATTR_NAME = NS_W + 'name' | ||
| 552 | - | ||
| 553 | -# Namespaces and tags for Word/PowerPoint 2007+ XML parsing: | ||
| 554 | -# root: <pkg:package xmlns:pkg="http://schemas.microsoft.com/office/2006/xmlPackage"> | ||
| 555 | -NS_XMLPACKAGE = '{http://schemas.microsoft.com/office/2006/xmlPackage}' | ||
| 556 | -TAG_PACKAGE = NS_XMLPACKAGE + 'package' | ||
| 557 | -# the tag <pkg:part> includes <pkg:binaryData> that contains the VBA macro code in Base64: | ||
| 558 | -# <pkg:part pkg:name="/word/vbaProject.bin" pkg:contentType="application/vnd.ms-office.vbaProject"><pkg:binaryData> | ||
| 559 | -TAG_PKGPART = NS_XMLPACKAGE + 'part' | ||
| 560 | -ATTR_PKG_NAME = NS_XMLPACKAGE + 'name' | ||
| 561 | -ATTR_PKG_CONTENTTYPE = NS_XMLPACKAGE + 'contentType' | ||
| 562 | -CTYPE_VBAPROJECT = "application/vnd.ms-office.vbaProject" | ||
| 563 | -TAG_PKGBINDATA = NS_XMLPACKAGE + 'binaryData' | ||
| 564 | - | ||
| 565 | -# Keywords to detect auto-executable macros | ||
| 566 | -AUTOEXEC_KEYWORDS = { | ||
| 567 | - # MS Word: | ||
| 568 | - 'Runs when the Word document is opened': | ||
| 569 | - ('AutoExec', 'AutoOpen', 'DocumentOpen'), | ||
| 570 | - 'Runs when the Word document is closed': | ||
| 571 | - ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose'), | ||
| 572 | - 'Runs when the Word document is modified': | ||
| 573 | - ('DocumentChange',), | ||
| 574 | - 'Runs when a new Word document is created': | ||
| 575 | - ('AutoNew', 'Document_New', 'NewDocument'), | ||
| 576 | - | ||
| 577 | - # MS Word and Publisher: | ||
| 578 | - 'Runs when the Word or Publisher document is opened': | ||
| 579 | - ('Document_Open',), | ||
| 580 | - 'Runs when the Publisher document is closed': | ||
| 581 | - ('Document_BeforeClose',), | ||
| 582 | - | ||
| 583 | - # MS Excel: | ||
| 584 | - 'Runs when the Excel Workbook is opened': | ||
| 585 | - ('Auto_Open', 'Workbook_Open', 'Workbook_Activate'), | ||
| 586 | - 'Runs when the Excel Workbook is closed': | ||
| 587 | - ('Auto_Close', 'Workbook_Close'), | ||
| 588 | - | ||
| 589 | - # any MS Office application: | ||
| 590 | - 'Runs when the file is opened (using InkPicture ActiveX object)': | ||
| 591 | - # ref:https://twitter.com/joe4security/status/770691099988025345 | ||
| 592 | - (r'\w+_Painted',), | ||
| 593 | - 'Runs when the file is opened and ActiveX objects trigger events': | ||
| 594 | - (r'\w+_(?:GotFocus|LostFocus|MouseHover)',), | ||
| 595 | -} | ||
| 596 | - | ||
| 597 | -# Suspicious Keywords that may be used by malware | ||
| 598 | -# See VBA language reference: http://msdn.microsoft.com/en-us/library/office/jj692818%28v=office.15%29.aspx | ||
| 599 | -SUSPICIOUS_KEYWORDS = { | ||
| 600 | - #TODO: use regex to support variable whitespace | ||
| 601 | - 'May read system environment variables': | ||
| 602 | - ('Environ',), | ||
| 603 | - 'May open a file': | ||
| 604 | - ('Open',), | ||
| 605 | - 'May write to a file (if combined with Open)': | ||
| 606 | - #TODO: regex to find Open+Write on same line | ||
| 607 | - ('Write', 'Put', 'Output', 'Print #'), | ||
| 608 | - 'May read or write a binary file (if combined with Open)': | ||
| 609 | - #TODO: regex to find Open+Binary on same line | ||
| 610 | - ('Binary',), | ||
| 611 | - 'May copy a file': | ||
| 612 | - ('FileCopy', 'CopyFile'), | ||
| 613 | - #FileCopy: http://msdn.microsoft.com/en-us/library/office/gg264390%28v=office.15%29.aspx | ||
| 614 | - #CopyFile: http://msdn.microsoft.com/en-us/library/office/gg264089%28v=office.15%29.aspx | ||
| 615 | - 'May delete a file': | ||
| 616 | - ('Kill',), | ||
| 617 | - 'May create a text file': | ||
| 618 | - ('CreateTextFile', 'ADODB.Stream', 'WriteText', 'SaveToFile'), | ||
| 619 | - #CreateTextFile: http://msdn.microsoft.com/en-us/library/office/gg264617%28v=office.15%29.aspx | ||
| 620 | - #ADODB.Stream sample: http://pastebin.com/Z4TMyuq6 | ||
| 621 | - 'May run an executable file or a system command': | ||
| 622 | - ('Shell', 'vbNormal', 'vbNormalFocus', 'vbHide', 'vbMinimizedFocus', 'vbMaximizedFocus', 'vbNormalNoFocus', | ||
| 623 | - 'vbMinimizedNoFocus', 'WScript.Shell', 'Run', 'ShellExecute'), | ||
| 624 | - # MacScript: see https://msdn.microsoft.com/en-us/library/office/gg264812.aspx | ||
| 625 | - 'May run an executable file or a system command on a Mac': | ||
| 626 | - ('MacScript',), | ||
| 627 | - 'May run an executable file or a system command on a Mac (if combined with libc.dylib)': | ||
| 628 | - ('system', 'popen', r'exec[lv][ep]?'), | ||
| 629 | - #Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx | ||
| 630 | - #WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6 | ||
| 631 | - 'May run PowerShell commands': | ||
| 632 | - #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ | ||
| 633 | - #also: https://bitbucket.org/decalage/oletools/issues/14/olevba-library-update-ioc | ||
| 634 | - # ref: https://blog.netspi.com/15-ways-to-bypass-the-powershell-execution-policy/ | ||
| 635 | - # TODO: add support for keywords starting with a non-alpha character, such as "-noexit" | ||
| 636 | - # TODO: '-command', '-EncodedCommand', '-scriptblock' | ||
| 637 | - ('PowerShell', 'noexit', 'ExecutionPolicy', 'noprofile', 'command', 'EncodedCommand', | ||
| 638 | - 'invoke-command', 'scriptblock', 'Invoke-Expression', 'AuthorizationManager'), | ||
| 639 | - 'May run an executable file or a system command using PowerShell': | ||
| 640 | - ('Start-Process',), | ||
| 641 | - 'May hide the application': | ||
| 642 | - ('Application.Visible', 'ShowWindow', 'SW_HIDE'), | ||
| 643 | - 'May create a directory': | ||
| 644 | - ('MkDir',), | ||
| 645 | - 'May save the current workbook': | ||
| 646 | - ('ActiveWorkbook.SaveAs',), | ||
| 647 | - 'May change which directory contains files to open at startup': | ||
| 648 | - #TODO: confirm the actual effect | ||
| 649 | - ('Application.AltStartupPath',), | ||
| 650 | - 'May create an OLE object': | ||
| 651 | - ('CreateObject',), | ||
| 652 | - 'May create an OLE object using PowerShell': | ||
| 653 | - ('New-Object',), | ||
| 654 | - 'May run an application (if combined with CreateObject)': | ||
| 655 | - ('Shell.Application',), | ||
| 656 | - 'May enumerate application windows (if combined with Shell.Application object)': | ||
| 657 | - ('Windows', 'FindWindow'), | ||
| 658 | - 'May run code from a DLL': | ||
| 659 | - #TODO: regex to find declare+lib on same line - see mraptor | ||
| 660 | - ('Lib',), | ||
| 661 | - 'May run code from a library on a Mac': | ||
| 662 | - #TODO: regex to find declare+lib on same line - see mraptor | ||
| 663 | - ('libc.dylib', 'dylib'), | ||
| 664 | - 'May inject code into another process': | ||
| 665 | - ('CreateThread', 'VirtualAlloc', # (issue #9) suggested by Davy Douhine - used by MSF payload | ||
| 666 | - 'VirtualAllocEx', 'RtlMoveMemory', | ||
| 667 | - ), | ||
| 668 | - 'May run a shellcode in memory': | ||
| 669 | - ('EnumSystemLanguageGroupsW?', # Used by Hancitor in Oct 2016 | ||
| 670 | - 'EnumDateFormats(?:W|(?:Ex){1,2})?'), # see https://msdn.microsoft.com/en-us/library/windows/desktop/dd317810(v=vs.85).aspx | ||
| 671 | - 'May download files from the Internet': | ||
| 672 | - #TODO: regex to find urlmon+URLDownloadToFileA on same line | ||
| 673 | - ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP', | ||
| 674 | - 'MSXML2.ServerXMLHTTP', # suggested in issue #13 | ||
| 675 | - 'User-Agent', # sample from @ozhermit: http://pastebin.com/MPc3iV6z | ||
| 676 | - ), | ||
| 677 | - 'May download files from the Internet using PowerShell': | ||
| 678 | - #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ | ||
| 679 | - ('Net.WebClient', 'DownloadFile', 'DownloadString'), | ||
| 680 | - 'May control another application by simulating user keystrokes': | ||
| 681 | - ('SendKeys', 'AppActivate'), | ||
| 682 | - #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx | ||
| 683 | - 'May attempt to obfuscate malicious function calls': | ||
| 684 | - ('CallByName',), | ||
| 685 | - #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx | ||
| 686 | - 'May attempt to obfuscate specific strings (use option --deobf to deobfuscate)': | ||
| 687 | - #TODO: regex to find several Chr*, not just one | ||
| 688 | - ('Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor'), | ||
| 689 | - #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx | ||
| 690 | - 'May read or write registry keys': | ||
| 691 | - #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ | ||
| 692 | - ('RegOpenKeyExA', 'RegOpenKeyEx', 'RegCloseKey'), | ||
| 693 | - 'May read registry keys': | ||
| 694 | - #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ | ||
| 695 | - ('RegQueryValueExA', 'RegQueryValueEx', | ||
| 696 | - 'RegRead', #with Wscript.Shell | ||
| 697 | - ), | ||
| 698 | - 'May detect virtualization': | ||
| 699 | - # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ | ||
| 700 | - (r'SYSTEM\ControlSet001\Services\Disk\Enum', 'VIRTUAL', 'VMWARE', 'VBOX'), | ||
| 701 | - 'May detect Anubis Sandbox': | ||
| 702 | - # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ | ||
| 703 | - # NOTES: this sample also checks App.EXEName but that seems to be a bug, it works in VB6 but not in VBA | ||
| 704 | - # ref: http://www.syssec-project.eu/m/page-media/3/disarm-raid11.pdf | ||
| 705 | - ('GetVolumeInformationA', 'GetVolumeInformation', # with kernel32.dll | ||
| 706 | - '1824245000', r'HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows NT\CurrentVersion\ProductId', | ||
| 707 | - '76487-337-8429955-22614', 'andy', 'sample', r'C:\exec\exec.exe', 'popupkiller' | ||
| 708 | - ), | ||
| 709 | - 'May detect Sandboxie': | ||
| 710 | - # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ | ||
| 711 | - # ref: http://www.cplusplus.com/forum/windows/96874/ | ||
| 712 | - ('SbieDll.dll', 'SandboxieControlWndClass'), | ||
| 713 | - 'May detect Sunbelt Sandbox': | ||
| 714 | - # ref: http://www.cplusplus.com/forum/windows/96874/ | ||
| 715 | - (r'C:\file.exe',), | ||
| 716 | - 'May detect Norman Sandbox': | ||
| 717 | - # ref: http://www.cplusplus.com/forum/windows/96874/ | ||
| 718 | - ('currentuser',), | ||
| 719 | - 'May detect CW Sandbox': | ||
| 720 | - # ref: http://www.cplusplus.com/forum/windows/96874/ | ||
| 721 | - ('Schmidti',), | ||
| 722 | - 'May detect WinJail Sandbox': | ||
| 723 | - # ref: http://www.cplusplus.com/forum/windows/96874/ | ||
| 724 | - ('Afx:400000:0',), | ||
| 725 | - 'May attempt to disable VBA macro security and Protected View': | ||
| 726 | - # ref: http://blog.trendmicro.com/trendlabs-security-intelligence/qkg-filecoder-self-replicating-document-encrypting-ransomware/ | ||
| 727 | - # ref: https://thehackernews.com/2017/11/ms-office-macro-malware.html | ||
| 728 | - ('AccessVBOM', 'VBAWarnings', 'ProtectedView', 'DisableAttachementsInPV', 'DisableInternetFilesInPV', | ||
| 729 | - 'DisableUnsafeLocationsInPV', 'blockcontentexecutionfrominternet'), | ||
| 730 | - 'May attempt to modify the VBA code (self-modification)': | ||
| 731 | - ('VBProject', 'VBComponents', 'CodeModule', 'AddFromString'), | ||
| 732 | -} | ||
| 733 | - | ||
| 734 | -# Suspicious Keywords to be searched for directly as strings, without regex | ||
| 735 | -SUSPICIOUS_KEYWORDS_NOREGEX = { | ||
| 736 | - 'May use special characters such as backspace to obfuscate code when printed on the console': | ||
| 737 | - ('\b',), | ||
| 738 | -} | ||
| 739 | - | ||
| 740 | -# Regular Expression for a URL: | ||
| 741 | -# http://en.wikipedia.org/wiki/Uniform_resource_locator | ||
| 742 | -# http://www.w3.org/Addressing/URL/uri-spec.html | ||
| 743 | -#TODO: also support username:password@server | ||
| 744 | -#TODO: other protocols (file, gopher, wais, ...?) | ||
| 745 | -SCHEME = r'\b(?:http|ftp)s?' | ||
| 746 | -# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains | ||
| 747 | -TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})' | ||
| 748 | -DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + ')' | ||
| 749 | -#TODO: IPv6 - see https://www.debuggex.com/ | ||
| 750 | -# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. [db8:0cec::99:123a] | ||
| 751 | -NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])' | ||
| 752 | -IPv4 = r'(?:' + NUMBER_0_255 + r'\.){3}' + NUMBER_0_255 | ||
| 753 | -# IPv4 must come before the DNS name because it is more specific | ||
| 754 | -SERVER = r'(?:' + IPv4 + '|' + DNS_NAME + ')' | ||
| 755 | -PORT = r'(?:\:[0-9]{1,5})?' | ||
| 756 | -SERVER_PORT = SERVER + PORT | ||
| 757 | -URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?' # [^\.\,\)\(\s"] | ||
| 758 | -URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH | ||
| 759 | -re_url = re.compile(URL_RE) | ||
| 760 | - | ||
| 761 | - | ||
| 762 | -# Patterns to be extracted (IP addresses, URLs, etc) | ||
| 763 | -# From patterns.py in balbuzard | ||
| 764 | -RE_PATTERNS = ( | ||
| 765 | - ('URL', re.compile(URL_RE)), | ||
| 766 | - ('IPv4 address', re.compile(IPv4)), | ||
| 767 | - # TODO: add IPv6 | ||
| 768 | - ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@' + SERVER + '\b')), | ||
| 769 | - # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')), | ||
| 770 | - # Executable file name with known extensions (except .com which is present in many URLs, and .application): | ||
| 771 | - ("Executable file name", re.compile( | ||
| 772 | - r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b")), | ||
| 773 | - # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/ | ||
| 774 | - # TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types | ||
| 775 | - # TODO: add win & unix file paths | ||
| 776 | - #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')), | ||
| 777 | -) | ||
| 778 | - | ||
| 779 | -# regex to detect strings encoded in hexadecimal | ||
| 780 | -re_hex_string = re.compile(r'(?:[0-9A-Fa-f]{2}){4,}') | ||
| 781 | - | ||
| 782 | -# regex to detect strings encoded in base64 | ||
| 783 | -#re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?"') | ||
| 784 | -# better version from balbuzard, fewer false positives: | ||
| 785 | -# (plain version without double quotes, used also below in quoted_base64_string) | ||
| 786 | -BASE64_RE = r'(?:[A-Za-z0-9+/]{4}){1,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)?' | ||
| 787 | -re_base64_string = re.compile('"' + BASE64_RE + '"') | ||
| 788 | -# white list of common strings matching the base64 regex, but which are not base64 strings (all lowercase): | ||
| 789 | -BASE64_WHITELIST = set(['thisdocument', 'thisworkbook', 'test', 'temp', 'http', 'open', 'exit']) | ||
| 790 | - | ||
| 791 | -# regex to detect strings encoded with a specific Dridex algorithm | ||
| 792 | -# (see https://github.com/JamesHabben/MalwareStuff) | ||
| 793 | -re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"') | ||
| 794 | -# regex to check that it is not just a hex string: | ||
| 795 | -re_nothex_check = re.compile(r'[G-Zg-z]') | ||
| 796 | - | ||
| 797 | -# regex to extract printable strings (at least 5 chars) from VBA Forms: | ||
| 798 | -# (must be bytes for Python 3) | ||
| 799 | -re_printable_string = re.compile(b'[\\t\\r\\n\\x20-\\xFF]{5,}') | ||
| 800 | - | ||
| 801 | - | ||
| 802 | -# === PARTIAL VBA GRAMMAR ==================================================== | ||
| 803 | - | ||
| 804 | -# REFERENCES: | ||
| 805 | -# - [MS-VBAL]: VBA Language Specification | ||
| 806 | -# https://msdn.microsoft.com/en-us/library/dd361851.aspx | ||
| 807 | -# - pyparsing: http://pyparsing.wikispaces.com/ | ||
| 808 | - | ||
| 809 | -# TODO: set whitespaces according to VBA | ||
| 810 | -# TODO: merge extended lines before parsing | ||
| 811 | - | ||
| 812 | -# Enable PackRat for better performance: | ||
| 813 | -# (see https://pythonhosted.org/pyparsing/pyparsing.ParserElement-class.html#enablePackrat) | ||
| 814 | -ParserElement.enablePackrat() | ||
| 815 | - | ||
| 816 | -# VBA identifier chars (from MS-VBAL 3.3.5) | ||
| 817 | -vba_identifier_chars = alphanums + '_' | ||
| 818 | - | ||
| 819 | -class VbaExpressionString(str): | ||
| 820 | - """ | ||
| 821 | - Class identical to str, used to distinguish plain strings from strings | ||
| 822 | - obfuscated using VBA expressions (Chr, StrReverse, etc) | ||
| 823 | - Usage: each VBA expression parse action should convert strings to | ||
| 824 | - VbaExpressionString. | ||
| 825 | - Then isinstance(s, VbaExpressionString) is True only for VBA expressions. | ||
| 826 | - (see detect_vba_strings) | ||
| 827 | - """ | ||
| 828 | - # TODO: use Unicode everywhere instead of str | ||
| 829 | - pass | ||
| 830 | - | ||
| 831 | - | ||
| 832 | -# --- NUMBER TOKENS ---------------------------------------------------------- | ||
| 833 | - | ||
| 834 | -# 3.3.2 Number Tokens | ||
| 835 | -# INTEGER = integer-literal ["%" / "&" / "^"] | ||
| 836 | -# integer-literal = decimal-literal / octal-literal / hex-literal | ||
| 837 | -# decimal-literal = 1*decimal-digit | ||
| 838 | -# octal-literal = "&" [%x004F / %x006F] 1*octal-digit | ||
| 839 | -# ; & or &o or &O | ||
| 840 | -# hex-literal = "&" (%x0048 / %x0068) 1*hex-digit | ||
| 841 | -# ; &h or &H | ||
| 842 | -# octal-digit = "0" / "1" / "2" / "3" / "4" / "5" / "6" / "7" | ||
| 843 | -# decimal-digit = octal-digit / "8" / "9" | ||
| 844 | -# hex-digit = decimal-digit / %x0041-0046 / %x0061-0066 ;A-F / a-f | ||
| 845 | - | ||
| 846 | -# NOTE: here Combine() is required to avoid spaces between elements | ||
| 847 | -# NOTE: here WordStart is necessary to avoid matching a number preceded by | ||
| 848 | -# letters or underscore (e.g. "VBT1" or "ABC_34"), when using scanString | ||
| 849 | -decimal_literal = Combine(Optional('-') + WordStart(vba_identifier_chars) + Word(nums) | ||
| 850 | - + Suppress(Optional(Word('%&^', exact=1)))) | ||
| 851 | -decimal_literal.setParseAction(lambda t: int(t[0])) | ||
| 852 | - | ||
| 853 | -octal_literal = Combine(Suppress(Literal('&') + Optional((CaselessLiteral('o')))) + Word(srange('[0-7]')) | ||
| 854 | - + Suppress(Optional(Word('%&^', exact=1)))) | ||
| 855 | -octal_literal.setParseAction(lambda t: int(t[0], base=8)) | ||
| 856 | - | ||
| 857 | -hex_literal = Combine(Suppress(CaselessLiteral('&h')) + Word(srange('[0-9a-fA-F]')) | ||
| 858 | - + Suppress(Optional(Word('%&^', exact=1)))) | ||
| 859 | -hex_literal.setParseAction(lambda t: int(t[0], base=16)) | ||
| 860 | - | ||
| 861 | -integer = decimal_literal | octal_literal | hex_literal | ||
| 862 | - | ||
| 863 | - | ||
| 864 | -# --- QUOTED STRINGS --------------------------------------------------------- | ||
| 865 | - | ||
| 866 | -# 3.3.4 String Tokens | ||
| 867 | -# STRING = double-quote *string-character (double-quote / line-continuation / LINE-END) | ||
| 868 | -# double-quote = %x0022 ; " | ||
| 869 | -# string-character = NO-LINE-CONTINUATION ((double-quote double-quote) termination-character) | ||
| 870 | - | ||
| 871 | -quoted_string = QuotedString('"', escQuote='""') | ||
| 872 | -quoted_string.setParseAction(lambda t: str(t[0])) | ||
| 873 | - | ||
| 874 | - | ||
| 875 | -#--- VBA Expressions --------------------------------------------------------- | ||
| 876 | - | ||
| 877 | -# See MS-VBAL 5.6 Expressions | ||
| 878 | - | ||
| 879 | -# need to pre-declare using Forward() because it is recursive | ||
| 880 | -# VBA string expression and integer expression | ||
| 881 | -vba_expr_str = Forward() | ||
| 882 | -vba_expr_int = Forward() | ||
| 883 | - | ||
| 884 | -# --- CHR -------------------------------------------------------------------- | ||
| 885 | - | ||
| 886 | -# MS-VBAL 6.1.2.11.1.4 Chr / Chr$ | ||
| 887 | -# Function Chr(CharCode As Long) As Variant | ||
| 888 | -# Function Chr$(CharCode As Long) As String | ||
| 889 | -# Parameter Description | ||
| 890 | -# CharCode Long whose value is a code point. | ||
| 891 | -# Returns a String data value consisting of a single character containing the character whose code | ||
| 892 | -# point is the data value of the argument. | ||
| 893 | -# - If the argument is not in the range 0 to 255, Error Number 5 ("Invalid procedure call or | ||
| 894 | -# argument") is raised unless the implementation supports a character set with a larger code point | ||
| 895 | -# range. | ||
| 896 | -# - If the argument value is in the range of 0 to 127, it is interpreted as a 7-bit ASCII code point. | ||
| 897 | -# - If the argument value is in the range of 128 to 255, the code point interpretation of the value is | ||
| 898 | -# implementation defined. | ||
| 899 | -# - Chr$ has the same runtime semantics as Chr, however the declared type of its function result is | ||
| 900 | -# String rather than Variant. | ||
| 901 | - | ||
| 902 | -# 6.1.2.11.1.5 ChrB / ChrB$ | ||
| 903 | -# Function ChrB(CharCode As Long) As Variant | ||
| 904 | -# Function ChrB$(CharCode As Long) As String | ||
| 905 | -# CharCode Long whose value is a code point. | ||
| 906 | -# Returns a String data value consisting of a single byte character whose code point value is the | ||
| 907 | -# data value of the argument. | ||
| 908 | -# - If the argument is not in the range 0 to 255, Error Number 6 ("Overflow") is raised. | ||
| 909 | -# - ChrB$ has the same runtime semantics as ChrB however the declared type of its function result | ||
| 910 | -# is String rather than Variant. | ||
| 911 | -# - Note: the ChrB function is used with byte data contained in a String. Instead of returning a | ||
| 912 | -# character, which may be one or two bytes, ChrB always returns a single byte. The ChrW function | ||
| 913 | -# returns a String containing the Unicode character except on platforms where Unicode is not | ||
| 914 | -# supported, in which case, the behavior is identical to the Chr function. | ||
| 915 | - | ||
| 916 | -# 6.1.2.11.1.6 ChrW/ ChrW$ | ||
| 917 | -# Function ChrW(CharCode As Long) As Variant | ||
| 918 | -# Function ChrW$(CharCode As Long) As String | ||
| 919 | -# CharCode Long whose value is a code point. | ||
| 920 | -# Returns a String data value consisting of a single character containing the character whose code | ||
| 921 | -# point is the data value of the argument. | ||
| 922 | -# - If the argument is not in the range -32,767 to 65,535 then Error Number 5 ("Invalid procedure | ||
| 923 | -# call or argument") is raised. | ||
| 924 | -# - If the argument is a negative value it is treated as if it was the value: CharCode + 65,536. | ||
| 925 | -# - If the implementation uses 16-bit Unicode code points, the argument data value is interpreted as a | ||
| 926 | -# 16-bit Unicode code point. | ||
| 927 | -# - If the implementation does not support Unicode, ChrW has the same semantics as Chr. | ||
| 928 | -# - ChrW$ has the same runtime semantics as ChrW, however the declared type of its function result | ||
| 929 | -# is String rather than Variant. | ||
| 930 | - | ||
| 931 | -# Chr, Chr$, ChrB, ChrW(int) => char | ||
| 932 | -vba_chr = Suppress( | ||
| 933 | - Combine(WordStart(vba_identifier_chars) + CaselessLiteral('Chr') | ||
| 934 | - + Optional(CaselessLiteral('B') | CaselessLiteral('W')) + Optional('$')) | ||
| 935 | - + '(') + vba_expr_int + Suppress(')') | ||
| 936 | - | ||
| 937 | -def vba_chr_tostr(t): | ||
| 938 | - try: | ||
| 939 | - i = t[0] | ||
| 940 | - if i>=0 and i<=255: | ||
| 941 | - # normal, non-unicode character: | ||
| 942 | - # TODO: check if it needs to be converted to bytes for Python 3 | ||
| 943 | - return VbaExpressionString(chr(i)) | ||
| 944 | - else: | ||
| 945 | - # unicode character | ||
| 946 | - # Note: this distinction is only needed for Python 2 | ||
| 947 | - return VbaExpressionString(unichr(i).encode('utf-8', 'backslashreplace')) | ||
| 948 | - except ValueError: | ||
| 949 | - log.exception('ERROR: incorrect parameter value for chr(): %r' % i) | ||
| 950 | - return VbaExpressionString('Chr(%r)' % i) | ||
| 951 | - | ||
| 952 | -vba_chr.setParseAction(vba_chr_tostr) | ||
| 953 | - | ||
| 954 | - | ||
| 955 | -# --- ASC -------------------------------------------------------------------- | ||
| 956 | - | ||
| 957 | -# Asc(char) => int | ||
| 958 | -#TODO: see MS-VBAL 6.1.2.11.1.1 page 240 => AscB, AscW | ||
| 959 | -vba_asc = Suppress(CaselessKeyword('Asc') + '(') + vba_expr_str + Suppress(')') | ||
| 960 | -vba_asc.setParseAction(lambda t: ord(t[0])) | ||
| 961 | - | ||
| 962 | - | ||
| 963 | -# --- VAL -------------------------------------------------------------------- | ||
| 964 | - | ||
| 965 | -# Val(string) => int | ||
| 966 | -# TODO: make sure the behavior of VBA's val is fully covered | ||
| 967 | -vba_val = Suppress(CaselessKeyword('Val') + '(') + vba_expr_str + Suppress(')') | ||
| 968 | -vba_val.setParseAction(lambda t: int(t[0].strip())) | ||
| 969 | - | ||
| 970 | - | ||
| 971 | -# --- StrReverse() -------------------------------------------------------------------- | ||
| 972 | - | ||
| 973 | -# StrReverse(string) => string | ||
| 974 | -strReverse = Suppress(CaselessKeyword('StrReverse') + '(') + vba_expr_str + Suppress(')') | ||
| 975 | -strReverse.setParseAction(lambda t: VbaExpressionString(str(t[0])[::-1])) | ||
| 976 | - | ||
| 977 | - | ||
| 978 | -# --- ENVIRON() -------------------------------------------------------------------- | ||
| 979 | - | ||
| 980 | -# Environ("name") => just translated to "%name%", that is enough for malware analysis | ||
| 981 | -environ = Suppress(CaselessKeyword('Environ') + '(') + vba_expr_str + Suppress(')') | ||
| 982 | -environ.setParseAction(lambda t: VbaExpressionString('%%%s%%' % t[0])) | ||
| 983 | - | ||
| 984 | - | ||
| 985 | -# --- IDENTIFIER ------------------------------------------------------------- | ||
| 986 | - | ||
| 987 | -#TODO: see MS-VBAL 3.3.5 page 33 | ||
| 988 | -# 3.3.5 Identifier Tokens | ||
| 989 | -# Latin-identifier = first-Latin-identifier-character *subsequent-Latin-identifier-character | ||
| 990 | -# first-Latin-identifier-character = (%x0041-005A / %x0061-007A) ; A-Z / a-z | ||
| 991 | -# subsequent-Latin-identifier-character = first-Latin-identifier-character / DIGIT / %x5F ; underscore | ||
| 992 | -latin_identifier = Word(initChars=alphas, bodyChars=alphanums + '_') | ||
| 993 | - | ||
| 994 | -# --- HEX FUNCTION ----------------------------------------------------------- | ||
| 995 | - | ||
| 996 | -# match any custom function name with a hex string as argument: | ||
| 997 | -# TODO: accept vba_expr_str_item as argument, check if it is a hex or base64 string at runtime | ||
| 998 | - | ||
| 999 | -# quoted string of at least two hexadecimal numbers of two digits: | ||
| 1000 | -quoted_hex_string = Suppress('"') + Combine(Word(hexnums, exact=2) * (2, None)) + Suppress('"') | ||
| 1001 | -quoted_hex_string.setParseAction(lambda t: str(t[0])) | ||
| 1002 | - | ||
| 1003 | -hex_function_call = Suppress(latin_identifier) + Suppress('(') + \ | ||
| 1004 | - quoted_hex_string('hex_string') + Suppress(')') | ||
| 1005 | -hex_function_call.setParseAction(lambda t: VbaExpressionString(binascii.a2b_hex(t.hex_string))) | ||
| 1006 | - | ||
| 1007 | - | ||
| 1008 | -# --- BASE64 FUNCTION ----------------------------------------------------------- | ||
| 1009 | - | ||
| 1010 | -# match any custom function name with a Base64 string as argument: | ||
| 1011 | -# TODO: accept vba_expr_str_item as argument, check if it is a hex or base64 string at runtime | ||
| 1012 | - | ||
| 1013 | -# quoted string matching the Base64 pattern (BASE64_RE): | ||
| 1014 | -quoted_base64_string = Suppress('"') + Regex(BASE64_RE) + Suppress('"') | ||
| 1015 | -quoted_base64_string.setParseAction(lambda t: str(t[0])) | ||
| 1016 | - | ||
| 1017 | -base64_function_call = Suppress(latin_identifier) + Suppress('(') + \ | ||
| 1018 | - quoted_base64_string('base64_string') + Suppress(')') | ||
| 1019 | -base64_function_call.setParseAction(lambda t: VbaExpressionString(binascii.a2b_base64(t.base64_string))) | ||
| 1020 | - | ||
| 1021 | - | ||
| 1022 | -# ---STRING EXPRESSION ------------------------------------------------------- | ||
| 1023 | - | ||
| 1024 | -def concat_strings_list(tokens): | ||
| 1025 | - """ | ||
| 1026 | - parse action to concatenate strings in a VBA expression with operators '+' or '&' | ||
| 1027 | - """ | ||
| 1028 | - # extract argument from the tokens: | ||
| 1029 | - # expected to be a tuple containing a list of strings such as [a,'&',b,'&',c,...] | ||
| 1030 | - strings = tokens[0][::2] | ||
| 1031 | - return VbaExpressionString(''.join(strings)) | ||
| 1032 | - | ||
| 1033 | - | ||
| 1034 | -vba_expr_str_item = (vba_chr | strReverse | environ | quoted_string | hex_function_call | base64_function_call) | ||
| 1035 | - | ||
| 1036 | -vba_expr_str <<= infixNotation(vba_expr_str_item, | ||
| 1037 | - [ | ||
| 1038 | - ("+", 2, opAssoc.LEFT, concat_strings_list), | ||
| 1039 | - ("&", 2, opAssoc.LEFT, concat_strings_list), | ||
| 1040 | - ]) | ||
| 1041 | - | ||
| 1042 | - | ||
| 1043 | -# --- INTEGER EXPRESSION ------------------------------------------------------- | ||
| 1044 | - | ||
| 1045 | -def sum_ints_list(tokens): | ||
| 1046 | - """ | ||
| 1047 | - parse action to sum integers in a VBA expression with operator '+' | ||
| 1048 | - """ | ||
| 1049 | - # extract argument from the tokens: | ||
| 1050 | - # expected to be a tuple containing a list of integers such as [a,'+',b,'+',c,...] | ||
| 1051 | - integers = tokens[0][::2] | ||
| 1052 | - return sum(integers) | ||
| 1053 | - | ||
| 1054 | - | ||
| 1055 | -def subtract_ints_list(tokens): | ||
| 1056 | - """ | ||
| 1057 | - parse action to subtract integers in a VBA expression with operator '-' | ||
| 1058 | - """ | ||
| 1059 | - # extract argument from the tokens: | ||
| 1060 | - # expected to be a tuple containing a list of integers such as [a,'-',b,'-',c,...] | ||
| 1061 | - integers = tokens[0][::2] | ||
| 1062 | - return reduce(lambda x,y:x-y, integers) | ||
| 1063 | - | ||
| 1064 | - | ||
| 1065 | -def multiply_ints_list(tokens): | ||
| 1066 | - """ | ||
| 1067 | - parse action to multiply integers in a VBA expression with operator '*' | ||
| 1068 | - """ | ||
| 1069 | - # extract argument from the tokens: | ||
| 1070 | - # expected to be a tuple containing a list of integers such as [a,'*',b,'*',c,...] | ||
| 1071 | - integers = tokens[0][::2] | ||
| 1072 | - return reduce(lambda x,y:x*y, integers) | ||
| 1073 | - | ||
| 1074 | - | ||
| 1075 | -def divide_ints_list(tokens): | ||
| 1076 | - """ | ||
| 1077 | - parse action to divide integers in a VBA expression with operator '/' | ||
| 1078 | - """ | ||
| 1079 | - # extract argument from the tokens: | ||
| 1080 | - # expected to be a tuple containing a list of integers such as [a,'/',b,'/',c,...] | ||
| 1081 | - integers = tokens[0][::2] | ||
| 1082 | - return reduce(lambda x,y:x/y, integers) | ||
| 1083 | - | ||
| 1084 | - | ||
| 1085 | -vba_expr_int_item = (vba_asc | vba_val | integer) | ||
| 1086 | - | ||
| 1087 | -# operators associativity: | ||
| 1088 | -# https://en.wikipedia.org/wiki/Operator_associativity | ||
| 1089 | - | ||
| 1090 | -vba_expr_int <<= infixNotation(vba_expr_int_item, | ||
| 1091 | - [ | ||
| 1092 | - ("*", 2, opAssoc.LEFT, multiply_ints_list), | ||
| 1093 | - ("/", 2, opAssoc.LEFT, divide_ints_list), | ||
| 1094 | - ("-", 2, opAssoc.LEFT, subtract_ints_list), | ||
| 1095 | - ("+", 2, opAssoc.LEFT, sum_ints_list), | ||
| 1096 | - ]) | ||
| 1097 | - | ||
| 1098 | - | ||
| 1099 | -# see detect_vba_strings for the deobfuscation code using this grammar | ||
| 1100 | - | ||
| 1101 | -# === MSO/ActiveMime files parsing =========================================== | ||
| 1102 | - | ||
| 1103 | -def is_mso_file(data): | ||
| 1104 | - """ | ||
| 1105 | - Check if the provided data is the content of a MSO/ActiveMime file, such as | ||
| 1106 | - the ones created by Outlook in some cases, or Word/Excel when saving a | ||
| 1107 | - file with the MHTML format or the Word 2003 XML format. | ||
| 1108 | - This function only checks the ActiveMime magic at the beginning of data. | ||
| 1109 | - :param data: bytes string, MSO/ActiveMime file content | ||
| 1110 | - :return: bool, True if the file is MSO, False otherwise | ||
| 1111 | - """ | ||
| 1112 | - return data.startswith(MSO_ACTIVEMIME_HEADER) | ||
| 1113 | - | ||
| 1114 | - | ||
# regex to find zlib block headers, starting with byte 0x78 = 'x'
# The pattern is a bytes literal because it is applied to raw file content
# (bytes): on Python 3 a str pattern cannot be used on a bytes-like object.
re_zlib_header = re.compile(b'x')
| 1117 | - | ||
| 1118 | - | ||
def mso_file_extract(data):
    """
    Extract the data stored into a MSO/ActiveMime file, such as
    the ones created by Outlook in some cases, or Word/Excel when saving a
    file with the MHTML format or the Word 2003 XML format.

    The compressed payload is looked for, in this order: at the offset read
    from the file header, at the two offsets observed in known samples, and
    finally at every position holding a potential zlib header byte.

    :param data: bytes string, MSO/ActiveMime file content
    :return: bytes string, extracted data (uncompressed)

    raise a MsoExtractionError if the data cannot be extracted
    """
    # check the magic with an explicit test instead of an assert statement:
    # asserts are removed when Python runs with -O, and the documented error
    # for unusable data is MsoExtractionError.
    if not is_mso_file(data):
        raise MsoExtractionError('Data is not a MSO/ActiveMime file (magic not found)')

    # In all the samples seen so far, Word always uses an offset of 0x32,
    # and Excel 0x22A. But we read the offset from the header to be more
    # generic.
    offsets = [0x32, 0x22A]

    # First, attempt to get the compressed data offset from the header
    # According to my tests, it should be an unsigned 16 bits integer,
    # at offset 0x1E (little endian) + add 46:
    try:
        offset = struct.unpack_from('<H', data, offset=0x1E)[0] + 46
        log.debug('Parsing MSO file: data offset = 0x%X' % offset)
        offsets.insert(0, offset)  # insert at beginning of offsets
    except struct.error as exc:
        # data is too short to contain the offset field
        log.info('Unable to parse MSO/ActiveMime file header (%s)' % exc)
        log.debug('Trace:', exc_info=True)
        raise MsoExtractionError('Unable to parse MSO/ActiveMime file header')
    # now try offsets
    for start in offsets:
        try:
            log.debug('Attempting zlib decompression from MSO file offset 0x%X' % start)
            extracted_data = zlib.decompress(data[start:])
            return extracted_data
        except zlib.error as exc:
            # best effort: a failure here only means this offset was wrong
            log.info('zlib decompression failed for offset %s (%s)'
                     % (start, exc))
            log.debug('Trace:', exc_info=True)
    # None of the guessed offsets worked, let's try brute-forcing by looking
    # for potential zlib-compressed blocks starting with 0x78:
    log.debug('Looking for potential zlib-compressed blocks in MSO file')
    for match in re_zlib_header.finditer(data):
        start = match.start()
        try:
            log.debug('Attempting zlib decompression from MSO file offset 0x%X' % start)
            extracted_data = zlib.decompress(data[start:])
            return extracted_data
        except zlib.error as exc:
            log.info('zlib decompression failed (%s)' % exc)
            log.debug('Trace:', exc_info=True)
    raise MsoExtractionError('Unable to decompress data from a MSO/ActiveMime file')
| 1172 | - | ||
| 1173 | - | ||
| 1174 | -#--- FUNCTIONS ---------------------------------------------------------------- | ||
| 1175 | - | ||
# characters accepted by is_printable (built once at import time)
_PRINTABLE_SET = set(string.printable)

def is_printable(s):
    """
    Check that string s is only made of printable ASCII characters,
    i.e. characters listed in string.printable.
    This plays the role of Python 3's str.isprintable on Python 2.x.

    :param s: str, string to be checked
    :return: bool, True if every character of s is printable (also for an
        empty string), False otherwise
    """
    # inspired from http://stackoverflow.com/questions/3636928/test-if-a-python-string-is-printable
    # the printable set must contain every character found in s:
    return _PRINTABLE_SET.issuperset(s)
| 1190 | - | ||
| 1191 | - | ||
def copytoken_help(decompressed_current, decompressed_chunk_start):
    """
    Compute the bit masks used to decode a CopyToken, according to
    MS-OVBA 2.4.1.3.19.1 CopyToken Help.

    :param decompressed_current: number of decompressed bytes so far,
        i.e. len(decompressed_container)
    :param decompressed_chunk_start: offset of the current chunk in the
        decompressed container
    :return: tuple (length_mask, offset_mask, bit_count, maximum_length)
    :raises ValueError: if no byte of the current chunk has been decompressed
        yet (decompressed_current <= decompressed_chunk_start), in which case
        a CopyToken has nothing to refer to
    """
    difference = decompressed_current - decompressed_chunk_start
    if difference < 1:
        raise ValueError('CopyToken found before any decompressed data in the '
                         'current chunk (difference = {0})'.format(difference))
    # bit_count = ceil(log2(difference)), with a minimum of 4.
    # (difference - 1).bit_length() gives that value with exact integer
    # arithmetic, without the rounding risk of floating point logarithms.
    bit_count = max((difference - 1).bit_length(), 4)
    length_mask = 0xFFFF >> bit_count
    # complement of the length bits (a negative int in Python); it is only
    # meant to be AND-ed with a 16-bit token to keep the offset bits
    offset_mask = ~length_mask
    maximum_length = length_mask + 3
    return length_mask, offset_mask, bit_count, maximum_length
| 1207 | - | ||
| 1208 | - | ||
def decompress_stream(compressed_container):
    """
    Decompress a stream according to MS-OVBA section 2.4.1

    :param compressed_container: bytearray, data compressed according to the
        MS-OVBA 2.4.1.3.6 Compression algorithm
    :return: the decompressed container as a string (bytes)
    :raises TypeError: if compressed_container is not a bytearray
    :raises ValueError: if the signature byte, a chunk signature or a chunk
        size/flag combination is invalid
    """
    # 2.4.1.2 State Variables

    # The following state is maintained for the CompressedContainer (section 2.4.1.1.1):
    # CompressedRecordEnd: The location of the byte after the last byte in the CompressedContainer (section 2.4.1.1.1).
    # CompressedCurrent: The location of the next byte in the CompressedContainer (section 2.4.1.1.1) to be read by
    #                    decompression or to be written by compression.

    # The following state is maintained for the current CompressedChunk (section 2.4.1.1.4):
    # CompressedChunkStart: The location of the first byte of the CompressedChunk (section 2.4.1.1.4) within the
    #                       CompressedContainer (section 2.4.1.1.1).

    # The following state is maintained for a DecompressedBuffer (section 2.4.1.1.2):
    # DecompressedCurrent: The location of the next byte in the DecompressedBuffer (section 2.4.1.1.2) to be written by
    #                      decompression or to be read by compression.
    # DecompressedBufferEnd: The location of the byte after the last byte in the DecompressedBuffer (section 2.4.1.1.2).

    # The following state is maintained for the current DecompressedChunk (section 2.4.1.1.3):
    # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the
    #                         DecompressedBuffer (section 2.4.1.1.2).

    # Check the input is a bytearray:
    if not isinstance(compressed_container, bytearray):
        raise TypeError('decompress_stream requires a bytearray as input')
    decompressed_container = bytearray()  # result
    compressed_current = 0

    sig_byte = compressed_container[compressed_current]
    if sig_byte != 0x01:
        raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))

    compressed_current += 1

    #NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that
    # CompressedRecordEnd = len(compressed_container)
    while compressed_current < len(compressed_container):
        # 2.4.1.1.5
        compressed_chunk_start = compressed_current
        # chunk header = first 16 bits
        compressed_chunk_header = \
            struct.unpack_from("<H", compressed_container, compressed_chunk_start)[0]
        # chunk size = 12 first bits of header + 3
        chunk_size = (compressed_chunk_header & 0x0FFF) + 3
        # chunk signature = 3 next bits - should always be 0b011
        chunk_signature = (compressed_chunk_header >> 12) & 0x07
        if chunk_signature != 0b011:
            raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream')
        # chunk flag = next bit - 1 == compressed, 0 == uncompressed
        chunk_flag = (compressed_chunk_header >> 15) & 0x01
        log.debug("chunk size = {0}, compressed flag = {1}".format(chunk_size, chunk_flag))

        #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096)
        # The minimum size is 3 bytes
        # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value
        # in chunk header before adding 3).
        # Also the first test is not useful since a 12 bits value cannot be larger than 4095.
        if chunk_flag == 1 and chunk_size > 4098:
            raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1')
        if chunk_flag == 0 and chunk_size != 4098:
            raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0')

        # check if chunk_size goes beyond the compressed data, instead of silently cutting it:
        #TODO: raise an exception?
        if compressed_chunk_start + chunk_size > len(compressed_container):
            log.warning('Chunk size is larger than remaining compressed data')
        compressed_end = min(len(compressed_container), compressed_chunk_start + chunk_size)
        # read after chunk header:
        compressed_current = compressed_chunk_start + 2

        if chunk_flag == 0:
            # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
            # uncompressed chunk: read the next 4096 bytes as-is
            #TODO: check if there are at least 4096 bytes left
            # The slice itself is passed to extend(): wrapping it into a list
            # would make extend() reject the bytearray item with a TypeError.
            decompressed_container.extend(
                compressed_container[compressed_current:compressed_current + 4096])
            compressed_current += 4096
        else:
            # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
            # compressed chunk
            decompressed_chunk_start = len(decompressed_container)
            while compressed_current < compressed_end:
                # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence
                # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
                # copy tokens (reference to a previous literal token)
                flag_byte = compressed_container[compressed_current]
                compressed_current += 1
                for bit_index in range(8):
                    # the last token sequence of a chunk may hold less than 8 tokens
                    if compressed_current >= compressed_end:
                        break
                    # MS-OVBA 2.4.1.3.5 Decompressing a Token
                    # MS-OVBA 2.4.1.3.17 Extract FlagBit
                    flag_bit = (flag_byte >> bit_index) & 1
                    if flag_bit == 0:  # LiteralToken
                        # copy one byte directly to output
                        decompressed_container.append(compressed_container[compressed_current])
                        compressed_current += 1
                    else:  # CopyToken
                        # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
                        copy_token = \
                            struct.unpack_from("<H", compressed_container, compressed_current)[0]
                        #TODO: check this
                        length_mask, offset_mask, bit_count, _ = copytoken_help(
                            len(decompressed_container), decompressed_chunk_start)
                        length = (copy_token & length_mask) + 3
                        temp1 = copy_token & offset_mask
                        temp2 = 16 - bit_count
                        offset = (temp1 >> temp2) + 1
                        copy_source = len(decompressed_container) - offset
                        # bytes are copied one at a time on purpose: source and
                        # destination may overlap when length > offset, so a byte
                        # appended here can be read again later in the same loop
                        for index in range(copy_source, copy_source + length):
                            decompressed_container.append(decompressed_container[index])
                        compressed_current += 2
    return bytes(decompressed_container)
| 1330 | - | ||
| 1331 | - | ||
| 1332 | -def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): | ||
| 1333 | - """ | ||
| 1334 | - Extract VBA macros from an OleFileIO object. | ||
| 1335 | - Internal function, do not call directly. | ||
| 1336 | - | ||
| 1337 | - vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream | ||
| 1338 | - vba_project: path to the PROJECT stream | ||
| 1339 | - :param relaxed: If True, only create info/debug log entry if data is not as expected | ||
| 1340 | - (e.g. opening substream fails); if False, raise an error in this case | ||
| 1341 | - This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream | ||
| 1342 | - """ | ||
| 1343 | - # Open the PROJECT stream: | ||
| 1344 | - project = ole.openstream(project_path) | ||
| 1345 | - log.debug('relaxed is %s' % relaxed) | ||
| 1346 | - | ||
| 1347 | - # sample content of the PROJECT stream: | ||
| 1348 | - | ||
| 1349 | - ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}" | ||
| 1350 | - ## Document=ThisDocument/&H00000000 | ||
| 1351 | - ## Module=NewMacros | ||
| 1352 | - ## Name="Project" | ||
| 1353 | - ## HelpContextID="0" | ||
| 1354 | - ## VersionCompatible32="393222000" | ||
| 1355 | - ## CMG="F1F301E705E705E705E705" | ||
| 1356 | - ## DPB="8F8D7FE3831F2020202020" | ||
| 1357 | - ## GC="2D2FDD81E51EE61EE6E1" | ||
| 1358 | - ## | ||
| 1359 | - ## [Host Extender Info] | ||
| 1360 | - ## &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000 | ||
| 1361 | - ## &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000 | ||
| 1362 | - ## | ||
| 1363 | - ## [Workspace] | ||
| 1364 | - ## ThisDocument=22, 29, 339, 477, Z | ||
| 1365 | - ## NewMacros=-4, 42, 832, 510, C | ||
| 1366 | - | ||
| 1367 | - code_modules = {} | ||
| 1368 | - | ||
| 1369 | - for line in project: | ||
| 1370 | - line = line.strip().decode('utf-8','ignore') | ||
| 1371 | - if '=' in line: | ||
| 1372 | - # split line at the 1st equal sign: | ||
| 1373 | - name, value = line.split('=', 1) | ||
| 1374 | - # looking for code modules | ||
| 1375 | - # add the code module as a key in the dictionary | ||
| 1376 | - # the value will be the extension needed later | ||
| 1377 | - # The value is converted to lowercase, to allow case-insensitive matching (issue #3) | ||
| 1378 | - value = value.lower() | ||
| 1379 | - if name == 'Document': | ||
| 1380 | - # split value at the 1st slash, keep 1st part: | ||
| 1381 | - value = value.split('/', 1)[0] | ||
| 1382 | - code_modules[value] = CLASS_EXTENSION | ||
| 1383 | - elif name == 'Module': | ||
| 1384 | - code_modules[value] = MODULE_EXTENSION | ||
| 1385 | - elif name == 'Class': | ||
| 1386 | - code_modules[value] = CLASS_EXTENSION | ||
| 1387 | - elif name == 'BaseClass': | ||
| 1388 | - code_modules[value] = FORM_EXTENSION | ||
| 1389 | - | ||
| 1390 | - # read data from dir stream (compressed) | ||
| 1391 | - dir_compressed = ole.openstream(dir_path).read() | ||
| 1392 | - | ||
| 1393 | - def check_value(name, expected, value): | ||
| 1394 | - if expected != value: | ||
| 1395 | - if relaxed: | ||
| 1396 | - log.error("invalid value for {0} expected {1:04X} got {2:04X}" | ||
| 1397 | - .format(name, expected, value)) | ||
| 1398 | - else: | ||
| 1399 | - raise UnexpectedDataError(dir_path, name, expected, value) | ||
| 1400 | - | ||
| 1401 | - dir_stream = BytesIO(decompress_stream(bytearray(dir_compressed))) | ||
| 1402 | - | ||
| 1403 | - # PROJECTSYSKIND Record | ||
| 1404 | - projectsyskind_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1405 | - check_value('PROJECTSYSKIND_Id', 0x0001, projectsyskind_id) | ||
| 1406 | - projectsyskind_size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1407 | - check_value('PROJECTSYSKIND_Size', 0x0004, projectsyskind_size) | ||
| 1408 | - projectsyskind_syskind = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1409 | - if projectsyskind_syskind == 0x00: | ||
| 1410 | - log.debug("16-bit Windows") | ||
| 1411 | - elif projectsyskind_syskind == 0x01: | ||
| 1412 | - log.debug("32-bit Windows") | ||
| 1413 | - elif projectsyskind_syskind == 0x02: | ||
| 1414 | - log.debug("Macintosh") | ||
| 1415 | - elif projectsyskind_syskind == 0x03: | ||
| 1416 | - log.debug("64-bit Windows") | ||
| 1417 | - else: | ||
| 1418 | - log.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(projectsyskind_syskind)) | ||
| 1419 | - | ||
| 1420 | - # PROJECTLCID Record | ||
| 1421 | - projectlcid_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1422 | - check_value('PROJECTLCID_Id', 0x0002, projectlcid_id) | ||
| 1423 | - projectlcid_size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1424 | - check_value('PROJECTLCID_Size', 0x0004, projectlcid_size) | ||
| 1425 | - projectlcid_lcid = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1426 | - check_value('PROJECTLCID_Lcid', 0x409, projectlcid_lcid) | ||
| 1427 | - | ||
| 1428 | - # PROJECTLCIDINVOKE Record | ||
| 1429 | - projectlcidinvoke_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1430 | - check_value('PROJECTLCIDINVOKE_Id', 0x0014, projectlcidinvoke_id) | ||
| 1431 | - projectlcidinvoke_size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1432 | - check_value('PROJECTLCIDINVOKE_Size', 0x0004, projectlcidinvoke_size) | ||
| 1433 | - projectlcidinvoke_lcidinvoke = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1434 | - check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, projectlcidinvoke_lcidinvoke) | ||
| 1435 | - | ||
| 1436 | - # PROJECTCODEPAGE Record | ||
| 1437 | - projectcodepage_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1438 | - check_value('PROJECTCODEPAGE_Id', 0x0003, projectcodepage_id) | ||
| 1439 | - projectcodepage_size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1440 | - check_value('PROJECTCODEPAGE_Size', 0x0002, projectcodepage_size) | ||
| 1441 | - projectcodepage_codepage = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1442 | - | ||
| 1443 | - # PROJECTNAME Record | ||
| 1444 | - projectname_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1445 | - check_value('PROJECTNAME_Id', 0x0004, projectname_id) | ||
| 1446 | - projectname_sizeof_projectname = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1447 | - if projectname_sizeof_projectname < 1 or projectname_sizeof_projectname > 128: | ||
| 1448 | - log.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(projectname_sizeof_projectname)) | ||
| 1449 | - projectname_projectname = dir_stream.read(projectname_sizeof_projectname) | ||
| 1450 | - unused = projectname_projectname | ||
| 1451 | - | ||
| 1452 | - # PROJECTDOCSTRING Record | ||
| 1453 | - projectdocstring_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1454 | - check_value('PROJECTDOCSTRING_Id', 0x0005, projectdocstring_id) | ||
| 1455 | - projectdocstring_sizeof_docstring = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1456 | - if projectdocstring_sizeof_docstring > 2000: | ||
| 1457 | - log.error( | ||
| 1458 | - "PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(projectdocstring_sizeof_docstring)) | ||
| 1459 | - projectdocstring_docstring = dir_stream.read(projectdocstring_sizeof_docstring) | ||
| 1460 | - projectdocstring_reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1461 | - check_value('PROJECTDOCSTRING_Reserved', 0x0040, projectdocstring_reserved) | ||
| 1462 | - projectdocstring_sizeof_docstring_unicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1463 | - if projectdocstring_sizeof_docstring_unicode % 2 != 0: | ||
| 1464 | - log.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even") | ||
| 1465 | - projectdocstring_docstring_unicode = dir_stream.read(projectdocstring_sizeof_docstring_unicode) | ||
| 1466 | - unused = projectdocstring_docstring | ||
| 1467 | - unused = projectdocstring_docstring_unicode | ||
| 1468 | - | ||
| 1469 | - # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7 | ||
| 1470 | - projecthelpfilepath_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1471 | - check_value('PROJECTHELPFILEPATH_Id', 0x0006, projecthelpfilepath_id) | ||
| 1472 | - projecthelpfilepath_sizeof_helpfile1 = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1473 | - if projecthelpfilepath_sizeof_helpfile1 > 260: | ||
| 1474 | - log.error( | ||
| 1475 | - "PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(projecthelpfilepath_sizeof_helpfile1)) | ||
| 1476 | - projecthelpfilepath_helpfile1 = dir_stream.read(projecthelpfilepath_sizeof_helpfile1) | ||
| 1477 | - projecthelpfilepath_reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1478 | - check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, projecthelpfilepath_reserved) | ||
| 1479 | - projecthelpfilepath_sizeof_helpfile2 = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1480 | - if projecthelpfilepath_sizeof_helpfile2 != projecthelpfilepath_sizeof_helpfile1: | ||
| 1481 | - log.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2") | ||
| 1482 | - projecthelpfilepath_helpfile2 = dir_stream.read(projecthelpfilepath_sizeof_helpfile2) | ||
| 1483 | - if projecthelpfilepath_helpfile2 != projecthelpfilepath_helpfile1: | ||
| 1484 | - log.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2") | ||
| 1485 | - | ||
| 1486 | - # PROJECTHELPCONTEXT Record | ||
| 1487 | - projecthelpcontext_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1488 | - check_value('PROJECTHELPCONTEXT_Id', 0x0007, projecthelpcontext_id) | ||
| 1489 | - projecthelpcontext_size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1490 | - check_value('PROJECTHELPCONTEXT_Size', 0x0004, projecthelpcontext_size) | ||
| 1491 | - projecthelpcontext_helpcontext = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1492 | - unused = projecthelpcontext_helpcontext | ||
| 1493 | - | ||
| 1494 | - # PROJECTLIBFLAGS Record | ||
| 1495 | - projectlibflags_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1496 | - check_value('PROJECTLIBFLAGS_Id', 0x0008, projectlibflags_id) | ||
| 1497 | - projectlibflags_size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1498 | - check_value('PROJECTLIBFLAGS_Size', 0x0004, projectlibflags_size) | ||
| 1499 | - projectlibflags_projectlibflags = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1500 | - check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, projectlibflags_projectlibflags) | ||
| 1501 | - | ||
| 1502 | - # PROJECTVERSION Record | ||
| 1503 | - projectversion_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1504 | - check_value('PROJECTVERSION_Id', 0x0009, projectversion_id) | ||
| 1505 | - projectversion_reserved = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1506 | - check_value('PROJECTVERSION_Reserved', 0x0004, projectversion_reserved) | ||
| 1507 | - projectversion_versionmajor = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1508 | - projectversion_versionminor = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1509 | - unused = projectversion_versionmajor | ||
| 1510 | - unused = projectversion_versionminor | ||
| 1511 | - | ||
| 1512 | - # PROJECTCONSTANTS Record | ||
| 1513 | - projectconstants_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1514 | - check_value('PROJECTCONSTANTS_Id', 0x000C, projectconstants_id) | ||
| 1515 | - projectconstants_sizeof_constants = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1516 | - if projectconstants_sizeof_constants > 1015: | ||
| 1517 | - log.error( | ||
| 1518 | - "PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(projectconstants_sizeof_constants)) | ||
| 1519 | - projectconstants_constants = dir_stream.read(projectconstants_sizeof_constants) | ||
| 1520 | - projectconstants_reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1521 | - check_value('PROJECTCONSTANTS_Reserved', 0x003C, projectconstants_reserved) | ||
| 1522 | - projectconstants_sizeof_constants_unicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1523 | - if projectconstants_sizeof_constants_unicode % 2 != 0: | ||
| 1524 | - log.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even") | ||
| 1525 | - projectconstants_constants_unicode = dir_stream.read(projectconstants_sizeof_constants_unicode) | ||
| 1526 | - unused = projectconstants_constants | ||
| 1527 | - unused = projectconstants_constants_unicode | ||
| 1528 | - | ||
| 1529 | - # array of REFERENCE records | ||
| 1530 | - check = None | ||
| 1531 | - while True: | ||
| 1532 | - check = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1533 | - log.debug("reference type = {0:04X}".format(check)) | ||
| 1534 | - if check == 0x000F: | ||
| 1535 | - break | ||
| 1536 | - | ||
| 1537 | - if check == 0x0016: | ||
| 1538 | - # REFERENCENAME | ||
| 1539 | - reference_id = check | ||
| 1540 | - reference_sizeof_name = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1541 | - reference_name = dir_stream.read(reference_sizeof_name) | ||
| 1542 | - reference_reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1543 | - # According to [MS-OVBA] 2.3.4.2.2.2 REFERENCENAME Record: | ||
| 1544 | - # "Reserved (2 bytes): MUST be 0x003E. MUST be ignored." | ||
| 1545 | - # So let's ignore it, otherwise it crashes on some files (issue #132) | ||
| 1546 | - # PR #135 by @c1fe: | ||
| 1547 | - # contrary to the specification I think that the unicode name | ||
| 1548 | - # is optional. if reference_reserved is not 0x003E I think it | ||
| 1549 | - # is actually the start of another REFERENCE record | ||
| 1550 | - # at least when projectsyskind_syskind == 0x02 (Macintosh) | ||
| 1551 | - if reference_reserved == 0x003E: | ||
| 1552 | - #if reference_reserved not in (0x003E, 0x000D): | ||
| 1553 | - # raise UnexpectedDataError(dir_path, 'REFERENCE_Reserved', | ||
| 1554 | - # 0x0003E, reference_reserved) | ||
| 1555 | - reference_sizeof_name_unicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1556 | - reference_name_unicode = dir_stream.read(reference_sizeof_name_unicode) | ||
| 1557 | - unused = reference_id | ||
| 1558 | - unused = reference_name | ||
| 1559 | - unused = reference_name_unicode | ||
| 1560 | - continue | ||
| 1561 | - else: | ||
| 1562 | - check = reference_reserved | ||
| 1563 | - log.debug("reference type = {0:04X}".format(check)) | ||
| 1564 | - | ||
| 1565 | - if check == 0x0033: | ||
| 1566 | - # REFERENCEORIGINAL (followed by REFERENCECONTROL) | ||
| 1567 | - referenceoriginal_id = check | ||
| 1568 | - referenceoriginal_sizeof_libidoriginal = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1569 | - referenceoriginal_libidoriginal = dir_stream.read(referenceoriginal_sizeof_libidoriginal) | ||
| 1570 | - unused = referenceoriginal_id | ||
| 1571 | - unused = referenceoriginal_libidoriginal | ||
| 1572 | - continue | ||
| 1573 | - | ||
| 1574 | - if check == 0x002F: | ||
| 1575 | - # REFERENCECONTROL | ||
| 1576 | - referencecontrol_id = check | ||
| 1577 | - referencecontrol_sizetwiddled = struct.unpack("<L", dir_stream.read(4))[0] # ignore | ||
| 1578 | - referencecontrol_sizeof_libidtwiddled = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1579 | - referencecontrol_libidtwiddled = dir_stream.read(referencecontrol_sizeof_libidtwiddled) | ||
| 1580 | - referencecontrol_reserved1 = struct.unpack("<L", dir_stream.read(4))[0] # ignore | ||
| 1581 | - check_value('REFERENCECONTROL_Reserved1', 0x0000, referencecontrol_reserved1) | ||
| 1582 | - referencecontrol_reserved2 = struct.unpack("<H", dir_stream.read(2))[0] # ignore | ||
| 1583 | - check_value('REFERENCECONTROL_Reserved2', 0x0000, referencecontrol_reserved2) | ||
| 1584 | - unused = referencecontrol_id | ||
| 1585 | - unused = referencecontrol_sizetwiddled | ||
| 1586 | - unused = referencecontrol_libidtwiddled | ||
| 1587 | - # optional field | ||
| 1588 | - check2 = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1589 | - if check2 == 0x0016: | ||
| 1590 | - referencecontrol_namerecordextended_id = check | ||
| 1591 | - referencecontrol_namerecordextended_sizeof_name = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1592 | - referencecontrol_namerecordextended_name = dir_stream.read( | ||
| 1593 | - referencecontrol_namerecordextended_sizeof_name) | ||
| 1594 | - referencecontrol_namerecordextended_reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1595 | - if referencecontrol_namerecordextended_reserved == 0x003E: | ||
| 1596 | - referencecontrol_namerecordextended_sizeof_name_unicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1597 | - referencecontrol_namerecordextended_name_unicode = dir_stream.read( | ||
| 1598 | - referencecontrol_namerecordextended_sizeof_name_unicode) | ||
| 1599 | - referencecontrol_reserved3 = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1600 | - unused = referencecontrol_namerecordextended_id | ||
| 1601 | - unused = referencecontrol_namerecordextended_name | ||
| 1602 | - unused = referencecontrol_namerecordextended_name_unicode | ||
| 1603 | - else: | ||
| 1604 | - referencecontrol_reserved3 = referencecontrol_namerecordextended_reserved | ||
| 1605 | - else: | ||
| 1606 | - referencecontrol_reserved3 = check2 | ||
| 1607 | - | ||
| 1608 | - check_value('REFERENCECONTROL_Reserved3', 0x0030, referencecontrol_reserved3) | ||
| 1609 | - referencecontrol_sizeextended = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1610 | - referencecontrol_sizeof_libidextended = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1611 | - referencecontrol_libidextended = dir_stream.read(referencecontrol_sizeof_libidextended) | ||
| 1612 | - referencecontrol_reserved4 = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1613 | - referencecontrol_reserved5 = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1614 | - referencecontrol_originaltypelib = dir_stream.read(16) | ||
| 1615 | - referencecontrol_cookie = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1616 | - unused = referencecontrol_sizeextended | ||
| 1617 | - unused = referencecontrol_libidextended | ||
| 1618 | - unused = referencecontrol_reserved4 | ||
| 1619 | - unused = referencecontrol_reserved5 | ||
| 1620 | - unused = referencecontrol_originaltypelib | ||
| 1621 | - unused = referencecontrol_cookie | ||
| 1622 | - continue | ||
| 1623 | - | ||
| 1624 | - if check == 0x000D: | ||
| 1625 | - # REFERENCEREGISTERED | ||
| 1626 | - referenceregistered_id = check | ||
| 1627 | - referenceregistered_size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1628 | - referenceregistered_sizeof_libid = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1629 | - referenceregistered_libid = dir_stream.read(referenceregistered_sizeof_libid) | ||
| 1630 | - referenceregistered_reserved1 = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1631 | - check_value('REFERENCEREGISTERED_Reserved1', 0x0000, referenceregistered_reserved1) | ||
| 1632 | - referenceregistered_reserved2 = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1633 | - check_value('REFERENCEREGISTERED_Reserved2', 0x0000, referenceregistered_reserved2) | ||
| 1634 | - unused = referenceregistered_id | ||
| 1635 | - unused = referenceregistered_size | ||
| 1636 | - unused = referenceregistered_libid | ||
| 1637 | - continue | ||
| 1638 | - | ||
| 1639 | - if check == 0x000E: | ||
| 1640 | - # REFERENCEPROJECT | ||
| 1641 | - referenceproject_id = check | ||
| 1642 | - referenceproject_size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1643 | - referenceproject_sizeof_libidabsolute = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1644 | - referenceproject_libidabsolute = dir_stream.read(referenceproject_sizeof_libidabsolute) | ||
| 1645 | - referenceproject_sizeof_libidrelative = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1646 | - referenceproject_libidrelative = dir_stream.read(referenceproject_sizeof_libidrelative) | ||
| 1647 | - referenceproject_majorversion = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1648 | - referenceproject_minorversion = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1649 | - unused = referenceproject_id | ||
| 1650 | - unused = referenceproject_size | ||
| 1651 | - unused = referenceproject_libidabsolute | ||
| 1652 | - unused = referenceproject_libidrelative | ||
| 1653 | - unused = referenceproject_majorversion | ||
| 1654 | - unused = referenceproject_minorversion | ||
| 1655 | - continue | ||
| 1656 | - | ||
| 1657 | - log.error('invalid or unknown check Id {0:04X}'.format(check)) | ||
| 1658 | - # raise an exception instead of stopping abruptly (issue #180) | ||
| 1659 | - raise UnexpectedDataError(dir_path, 'reference type', (0x0F, 0x16, 0x33, 0x2F, 0x0D, 0x0E), check) | ||
| 1660 | - #sys.exit(0) | ||
| 1661 | - | ||
| 1662 | - projectmodules_id = check #struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1663 | - check_value('PROJECTMODULES_Id', 0x000F, projectmodules_id) | ||
| 1664 | - projectmodules_size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1665 | - check_value('PROJECTMODULES_Size', 0x0002, projectmodules_size) | ||
| 1666 | - projectmodules_count = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1667 | - projectmodules_projectcookierecord_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1668 | - check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, projectmodules_projectcookierecord_id) | ||
| 1669 | - projectmodules_projectcookierecord_size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1670 | - check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, projectmodules_projectcookierecord_size) | ||
| 1671 | - projectmodules_projectcookierecord_cookie = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1672 | - unused = projectmodules_projectcookierecord_cookie | ||
| 1673 | - | ||
| 1674 | - # short function to simplify unicode text output | ||
| 1675 | - uni_out = lambda unicode_text: unicode_text.encode('utf-8', 'replace') | ||
| 1676 | - | ||
| 1677 | - log.debug("parsing {0} modules".format(projectmodules_count)) | ||
| 1678 | - for projectmodule_index in xrange(0, projectmodules_count): | ||
| 1679 | - try: | ||
| 1680 | - modulename_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1681 | - check_value('MODULENAME_Id', 0x0019, modulename_id) | ||
| 1682 | - modulename_sizeof_modulename = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1683 | - modulename_modulename = dir_stream.read(modulename_sizeof_modulename).decode('utf-8', 'backslashreplace') | ||
| 1684 | - # TODO: preset variables to avoid "referenced before assignment" errors | ||
| 1685 | - modulename_unicode_modulename_unicode = '' | ||
| 1686 | - # account for optional sections | ||
| 1687 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1688 | - if section_id == 0x0047: | ||
| 1689 | - modulename_unicode_id = section_id | ||
| 1690 | - modulename_unicode_sizeof_modulename_unicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1691 | - modulename_unicode_modulename_unicode = dir_stream.read( | ||
| 1692 | - modulename_unicode_sizeof_modulename_unicode).decode('UTF-16LE', 'replace') | ||
| 1693 | - # just guessing that this is the same encoding as used in OleFileIO | ||
| 1694 | - unused = modulename_unicode_id | ||
| 1695 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1696 | - if section_id == 0x001A: | ||
| 1697 | - modulestreamname_id = section_id | ||
| 1698 | - modulestreamname_sizeof_streamname = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1699 | - modulestreamname_streamname = dir_stream.read(modulestreamname_sizeof_streamname) | ||
| 1700 | - modulestreamname_reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1701 | - check_value('MODULESTREAMNAME_Reserved', 0x0032, modulestreamname_reserved) | ||
| 1702 | - modulestreamname_sizeof_streamname_unicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1703 | - modulestreamname_streamname_unicode = dir_stream.read( | ||
| 1704 | - modulestreamname_sizeof_streamname_unicode).decode('UTF-16LE', 'replace') | ||
| 1705 | - # just guessing that this is the same encoding as used in OleFileIO | ||
| 1706 | - unused = modulestreamname_id | ||
| 1707 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1708 | - if section_id == 0x001C: | ||
| 1709 | - moduledocstring_id = section_id | ||
| 1710 | - check_value('MODULEDOCSTRING_Id', 0x001C, moduledocstring_id) | ||
| 1711 | - moduledocstring_sizeof_docstring = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1712 | - moduledocstring_docstring = dir_stream.read(moduledocstring_sizeof_docstring) | ||
| 1713 | - moduledocstring_reserved = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1714 | - check_value('MODULEDOCSTRING_Reserved', 0x0048, moduledocstring_reserved) | ||
| 1715 | - moduledocstring_sizeof_docstring_unicode = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1716 | - moduledocstring_docstring_unicode = dir_stream.read(moduledocstring_sizeof_docstring_unicode) | ||
| 1717 | - unused = moduledocstring_docstring | ||
| 1718 | - unused = moduledocstring_docstring_unicode | ||
| 1719 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1720 | - if section_id == 0x0031: | ||
| 1721 | - moduleoffset_id = section_id | ||
| 1722 | - check_value('MODULEOFFSET_Id', 0x0031, moduleoffset_id) | ||
| 1723 | - moduleoffset_size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1724 | - check_value('MODULEOFFSET_Size', 0x0004, moduleoffset_size) | ||
| 1725 | - moduleoffset_textoffset = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1726 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1727 | - if section_id == 0x001E: | ||
| 1728 | - modulehelpcontext_id = section_id | ||
| 1729 | - check_value('MODULEHELPCONTEXT_Id', 0x001E, modulehelpcontext_id) | ||
| 1730 | - modulehelpcontext_size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1731 | - check_value('MODULEHELPCONTEXT_Size', 0x0004, modulehelpcontext_size) | ||
| 1732 | - modulehelpcontext_helpcontext = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1733 | - unused = modulehelpcontext_helpcontext | ||
| 1734 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1735 | - if section_id == 0x002C: | ||
| 1736 | - modulecookie_id = section_id | ||
| 1737 | - check_value('MODULECOOKIE_Id', 0x002C, modulecookie_id) | ||
| 1738 | - modulecookie_size = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1739 | - check_value('MODULECOOKIE_Size', 0x0002, modulecookie_size) | ||
| 1740 | - modulecookie_cookie = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1741 | - unused = modulecookie_cookie | ||
| 1742 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1743 | - if section_id == 0x0021 or section_id == 0x0022: | ||
| 1744 | - moduletype_id = section_id | ||
| 1745 | - moduletype_reserved = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1746 | - unused = moduletype_id | ||
| 1747 | - unused = moduletype_reserved | ||
| 1748 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1749 | - if section_id == 0x0025: | ||
| 1750 | - modulereadonly_id = section_id | ||
| 1751 | - check_value('MODULEREADONLY_Id', 0x0025, modulereadonly_id) | ||
| 1752 | - modulereadonly_reserved = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1753 | - check_value('MODULEREADONLY_Reserved', 0x0000, modulereadonly_reserved) | ||
| 1754 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1755 | - if section_id == 0x0028: | ||
| 1756 | - moduleprivate_id = section_id | ||
| 1757 | - check_value('MODULEPRIVATE_Id', 0x0028, moduleprivate_id) | ||
| 1758 | - moduleprivate_reserved = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1759 | - check_value('MODULEPRIVATE_Reserved', 0x0000, moduleprivate_reserved) | ||
| 1760 | - section_id = struct.unpack("<H", dir_stream.read(2))[0] | ||
| 1761 | - if section_id == 0x002B: # TERMINATOR | ||
| 1762 | - module_reserved = struct.unpack("<L", dir_stream.read(4))[0] | ||
| 1763 | - check_value('MODULE_Reserved', 0x0000, module_reserved) | ||
| 1764 | - section_id = None | ||
| 1765 | - if section_id != None: | ||
| 1766 | - log.warning('unknown or invalid module section id {0:04X}'.format(section_id)) | ||
| 1767 | - | ||
| 1768 | - log.debug('Project CodePage = %d' % projectcodepage_codepage) | ||
| 1769 | - if projectcodepage_codepage in MAC_CODEPAGES: | ||
| 1770 | - vba_codec = MAC_CODEPAGES[projectcodepage_codepage] | ||
| 1771 | - else: | ||
| 1772 | - vba_codec = 'cp%d' % projectcodepage_codepage | ||
| 1773 | - log.debug("ModuleName = {0}".format(modulename_modulename)) | ||
| 1774 | - log.debug("ModuleNameUnicode = {0}".format(uni_out(modulename_unicode_modulename_unicode))) | ||
| 1775 | - log.debug("StreamName = {0}".format(modulestreamname_streamname)) | ||
| 1776 | - try: | ||
| 1777 | - streamname_unicode = modulestreamname_streamname.decode(vba_codec) | ||
| 1778 | - except UnicodeError as ue: | ||
| 1779 | - log.debug('failed to decode stream name {0!r} with codec {1}' | ||
| 1780 | - .format(uni_out(streamname_unicode), vba_codec)) | ||
| 1781 | - streamname_unicode = modulestreamname_streamname.decode(vba_codec, errors='replace') | ||
| 1782 | - log.debug("StreamName.decode('%s') = %s" % (vba_codec, uni_out(streamname_unicode))) | ||
| 1783 | - log.debug("StreamNameUnicode = {0}".format(uni_out(modulestreamname_streamname_unicode))) | ||
| 1784 | - log.debug("TextOffset = {0}".format(moduleoffset_textoffset)) | ||
| 1785 | - | ||
| 1786 | - code_data = None | ||
| 1787 | - try_names = streamname_unicode, \ | ||
| 1788 | - modulename_unicode_modulename_unicode, \ | ||
| 1789 | - modulestreamname_streamname_unicode | ||
| 1790 | - for stream_name in try_names: | ||
| 1791 | - # TODO: if olefile._find were less private, could replace this | ||
| 1792 | - # try-except with calls to it | ||
| 1793 | - try: | ||
| 1794 | - code_path = vba_root + u'VBA/' + stream_name | ||
| 1795 | - log.debug('opening VBA code stream %s' % uni_out(code_path)) | ||
| 1796 | - code_data = ole.openstream(code_path).read() | ||
| 1797 | - break | ||
| 1798 | - except IOError as ioe: | ||
| 1799 | - log.debug('failed to open stream VBA/%r (%r), try other name' | ||
| 1800 | - % (uni_out(stream_name), ioe)) | ||
| 1801 | - | ||
| 1802 | - if code_data is None: | ||
| 1803 | - log.info("Could not open stream %d of %d ('VBA/' + one of %r)!" | ||
| 1804 | - % (projectmodule_index, projectmodules_count, | ||
| 1805 | - '/'.join("'" + uni_out(stream_name) + "'" | ||
| 1806 | - for stream_name in try_names))) | ||
| 1807 | - if relaxed: | ||
| 1808 | - continue # ... with next submodule | ||
| 1809 | - else: | ||
| 1810 | - raise SubstreamOpenError('[BASE]', 'VBA/' + | ||
| 1811 | - uni_out(modulename_unicode_modulename_unicode)) | ||
| 1812 | - | ||
| 1813 | - log.debug("length of code_data = {0}".format(len(code_data))) | ||
| 1814 | - log.debug("offset of code_data = {0}".format(moduleoffset_textoffset)) | ||
| 1815 | - code_data = code_data[moduleoffset_textoffset:] | ||
| 1816 | - if len(code_data) > 0: | ||
| 1817 | - code_data = decompress_stream(bytearray(code_data)) | ||
| 1818 | - # case-insensitive search in the code_modules dict to find the file extension: | ||
| 1819 | - filext = code_modules.get(modulename_modulename.lower(), 'bin') | ||
| 1820 | - filename = '{0}.{1}'.format(modulename_modulename, filext) | ||
| 1821 | - #TODO: also yield the codepage so that callers can decode it properly | ||
| 1822 | - yield (code_path, filename, code_data) | ||
| 1823 | - # print '-'*79 | ||
| 1824 | - # print filename | ||
| 1825 | - # print '' | ||
| 1826 | - # print code_data | ||
| 1827 | - # print '' | ||
| 1828 | - log.debug('extracted file {0}'.format(filename)) | ||
| 1829 | - else: | ||
| 1830 | - log.warning("module stream {0} has code data length 0".format(modulestreamname_streamname)) | ||
| 1831 | - except (UnexpectedDataError, SubstreamOpenError): | ||
| 1832 | - raise | ||
| 1833 | - except Exception as exc: | ||
| 1834 | - log.info('Error parsing module {0} of {1} in _extract_vba:' | ||
| 1835 | - .format(projectmodule_index, projectmodules_count), | ||
| 1836 | - exc_info=True) | ||
| 1837 | - if not relaxed: | ||
| 1838 | - raise | ||
| 1839 | - _ = unused # make pylint happy: now variable "unused" is being used ;-) | ||
| 1840 | - return | ||
| 1841 | - | ||
| 1842 | - | ||
def vba_collapse_long_lines(vba_code):
    """
    Join VBA statements that were split over several lines with the
    line-continuation marker (a space followed by an underscore at the end of
    a line). Each marker, together with the line ending that follows it, is
    replaced by a single space.

    :param vba_code: str, VBA module code
    :return: str, VBA module code with continued lines joined
    """
    # TODO: use a regex instead, to allow whitespaces after the underscore?
    # The order matters: the two-character CRLF ending must be handled before
    # the bare CR and bare LF endings.
    for continuation in (' _\r\n', ' _\r', ' _\n'):
        vba_code = vba_code.replace(continuation, ' ')
    return vba_code
| 1856 | - | ||
| 1857 | - | ||
def filter_vba(vba_code):
    """
    Strip the leading block of "Attribute VB_" lines from VBA source code.
    These lines are added automatically by MS Office and are not displayed in
    the VBA Editor. Intended only for displaying code for human analysis.

    Note: a line containing a colon is never stripped (and ends the leading
    block), because a colon could be used to hide malicious instructions on
    an attribute line.

    :param vba_code: str, VBA source code
    :return: str, filtered VBA source code (remaining lines joined with LF)
    """
    lines = vba_code.splitlines()
    # index of the first line to keep; stays at len(lines) if every line is
    # a plain attribute line
    first_kept = len(lines)
    for index, text in enumerate(lines):
        is_plain_attribute = text.startswith("Attribute VB_") and ':' not in text
        if not is_plain_attribute:
            first_kept = index
            break
    #TODO: also remove empty lines?
    return '\n'.join(lines[first_kept:])
| 1880 | - | ||
| 1881 | - | ||
def detect_autoexec(vba_code, obfuscation=None):
    """
    Look for keywords of macros that run automatically when triggered by
    specific actions (e.g. when a document is opened or closed).

    Every keyword listed in AUTOEXEC_KEYWORDS is searched case-insensitively
    as a whole word. The keyword text is inserted into the regular expression
    as-is (it is not escaped).

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword as found in the code, description)
    """
    #TODO: merge code with detect_suspicious
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    hits = []
    for description, keywords in AUTOEXEC_KEYWORDS.items():
        for keyword in keywords:
            #TODO: if keyword is already a compiled regex, use it as-is
            # word boundaries on both sides avoid matching inside identifiers
            found = re.search(r'(?i)\b' + keyword + r'\b', vba_code)
            if found is None:
                continue
            hits.append((found.group(), description + suffix))
    return hits
| 1909 | - | ||
| 1910 | - | ||
def detect_suspicious(vba_code, obfuscation=None):
    """
    Look for suspicious keywords corresponding to potential malware
    behaviour.

    Every keyword listed in SUSPICIOUS_KEYWORDS is escaped and searched
    case-insensitively as a whole word.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword as found in the code, description)
    """
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    hits = []
    for description, keywords in SUSPICIOUS_KEYWORDS.items():
        for keyword in keywords:
            # literal keyword, delimited by word boundaries
            found = re.search(r'(?i)\b' + re.escape(keyword) + r'\b', vba_code)
            if found is None:
                continue
            hits.append((found.group(), description + suffix))
    return hits
| 1935 | - | ||
| 1936 | - | ||
def detect_patterns(vba_code, obfuscation=None):
    """
    Search the VBA code for specific patterns such as IP addresses, URLs,
    e-mail addresses, executable file names, etc, using the compiled regular
    expressions listed in RE_PATTERNS.

    A given matched value is reported only once, even if it matches several
    patterns: it is attributed to the first pattern that found it.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to the pattern type
    :return: list of str tuples (pattern type, value)
    """
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    seen_values = set()
    matches = []
    for pattern_type, pattern_re in RE_PATTERNS:
        for hit in pattern_re.finditer(vba_code):
            text = hit.group()
            if text in seen_values:
                continue
            matches.append((pattern_type + suffix, text))
            seen_values.add(text)
    return matches
| 1957 | - | ||
| 1958 | - | ||
def detect_hex_strings(vba_code):
    """
    Find strings encoded in hexadecimal in the VBA code and decode them.

    Each distinct hex string matched by re_hex_string is reported once. The
    decoded bytes are converted to text as UTF-8, with undecodable bytes
    shown as backslash escapes.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    pairs = []
    seen = set()
    for hit in re_hex_string.finditer(vba_code):
        hex_text = hit.group()
        if hex_text in seen:
            continue
        raw = binascii.unhexlify(hex_text)
        pairs.append((hex_text, raw.decode('utf-8', 'backslashreplace')))
        seen.add(hex_text)
    return pairs
| 1975 | - | ||
| 1976 | - | ||
def detect_base64_strings(vba_code):
    """
    Find strings encoded in base64 in the VBA code and decode them.

    Candidates matched by re_base64_string are skipped when they fail the
    re_nothex_check test (i.e. they look like plain hex strings), when they
    were already reported, or when they are listed in BASE64_WHITELIST.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    #TODO: avoid matching simple hex strings as base64?
    pairs = []
    seen = set()
    for hit in re_base64_string.finditer(vba_code):
        # the match includes the surrounding double quotes: drop them
        candidate = hit.group().strip('"')
        # ignore candidates that are just hex strings
        if not re_nothex_check.search(candidate):
            continue
        # ignore values already reported and whitelisted values
        if candidate in seen or candidate.lower() in BASE64_WHITELIST:
            continue
        try:
            raw = base64.b64decode(candidate)
            pairs.append((candidate, raw.decode('utf-8','replace')))
            seen.add(candidate)
        except (TypeError, ValueError) as exc:
            # a decoding failure most likely means it was not base64 at all
            log.debug('Failed to base64-decode (%s)' % exc)
    return pairs
| 2003 | - | ||
| 2004 | - | ||
def detect_dridex_strings(vba_code):
    """
    Find strings obfuscated with a specific algorithm found in Dridex
    samples, and decode them with DridexUrlDecode.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    # TODO: move this at the beginning of script
    from oletools.thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode

    pairs = []
    seen = set()
    for hit in re_dridex_string.finditer(vba_code):
        # drop the first and last character of the match
        candidate = hit.group()[1:-1]
        # ignore candidates that are just hex strings
        if not re_nothex_check.search(candidate):
            continue
        if candidate in seen:
            continue
        try:
            plain = DridexUrlDecode(candidate)
            pairs.append((candidate, plain))
            seen.add(candidate)
        except Exception as exc:
            # best effort: a failure most likely means the string was not
            # Dridex-encoded, so it is only logged
            log.debug('Failed to Dridex-decode (%s)' % exc)
    return pairs
| 2031 | - | ||
| 2032 | - | ||
def detect_vba_strings(vba_code):
    """
    Find strings obfuscated with VBA expressions using keywords such as
    Chr, Asc, Val, StrReverse, etc, and return them with their decoded value.

    Only results of type VbaExpressionString are reported; each distinct
    expression is reported once, and only if its decoded value differs from
    the expression text.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    # TODO: handle exceptions
    pairs = []
    seen = set()
    # IMPORTANT: tabs must be expanded so that the text is the same as the one
    # pyparsing works on; otherwise the start and end offsets are incorrect.
    expanded_code = vba_code.expandtabs()
    # Scan line by line to avoid MemoryError on large scripts:
    for line in expanded_code.splitlines():
        for tokens, begin, finish in vba_expr_str.scanString(line):
            value = tokens[0]
            if not isinstance(value, VbaExpressionString):
                # a simple string, not a VBA expression: nothing to report
                continue
            expression = line[begin:finish]
            # avoid duplicates and expressions decoding to themselves:
            if expression not in seen and value != expression:
                pairs.append((expression, value))
                seen.add(expression)
    return pairs
| 2068 | - | ||
| 2069 | - | ||
def json2ascii(json_obj, encoding='utf8', errors='replace'):
    """
    Ensure that a JSON-like structure contains no bytes strings, so that it
    can be dumped with json.dumps without trouble.

    Works recursively:
    - None, bool, int, float and str values are returned unchanged
    - bytes values are decoded to str using the given encoding and error handler
    - dict and list objects are updated IN PLACE and returned
    - tuple objects are immutable, so a new tuple with converted items is returned
    - any other type is logged and returned unchanged

    :param json_obj: object to convert
    :param encoding: str, codec used to decode bytes values
    :param errors: str, error handler used when decoding bytes values
    :return: the converted object
    """
    if json_obj is None or isinstance(json_obj, (bool, int, float, str)):
        return json_obj
    if isinstance(json_obj, bytes):
        decoded = json_obj.decode(encoding, errors)
        log.debug('json2ascii: encode unicode: {0}'.format(decoded))
        return decoded
    if isinstance(json_obj, dict):
        # only values of existing keys are reassigned, so iterating over the
        # dict while updating it is safe
        for key in json_obj:
            json_obj[key] = json2ascii(json_obj[key], encoding, errors)
        return json_obj
    if isinstance(json_obj, list):
        # store each converted item back: rebinding the loop variable alone
        # would silently discard the conversion
        for index, item in enumerate(json_obj):
            json_obj[index] = json2ascii(item, encoding, errors)
        return json_obj
    if isinstance(json_obj, tuple):
        return tuple(json2ascii(item, encoding, errors) for item in json_obj)
    log.debug('unexpected type in json2ascii: {0} -- leave as is'
              .format(type(json_obj)))
    return json_obj
| 2105 | - | ||
| 2106 | - | ||
def print_json(json_dict=None, _json_is_first=False, _json_is_last=False,
               **json_parts):
    """
    Print one entry of the top-level JSON list, line by line, as produced by
    json.dumps(json2ascii(..)) with an indent of 4. Each printed line gets an
    extra leading prefix so that the entry nests inside the top-level list.

    Can be used in two ways:
    (1) print_json(some_dict)
    (2) print_json(key1=value1, key2=value2, ...)

    :param json_dict: None or dict, the entry to print (usage 1)
    :param bool _json_is_first: set to True only for very first entry, to open
                                the top-level json-list with '['
    :param bool _json_is_last: set to True only for very last entry, to close
                               the top-level json-list with ']' instead of
                               appending a comma
    :param json_parts: key=value pairs forming the entry to print (usage 2)
    :raises ValueError: if both a dict and key=value parts are given, or if
                        json_dict is neither None nor a dict
    """
    if json_dict and json_parts:
        raise ValueError('Invalid json argument: want either single dict or '
                         'key=value parts but got both)')
    if json_dict is not None and not isinstance(json_dict, dict):
        raise ValueError('Invalid json argument: want either single dict or '
                         'key=value parts but got {0} instead of dict)'
                         .format(type(json_dict)))
    payload = json_parts if json_parts else json_dict

    if _json_is_first:
        print('[')

    dumped = json.dumps(json2ascii(payload), check_circular=False,
                        indent=4, ensure_ascii=False)
    rendered = dumped.splitlines()
    final_line = rendered[-1]
    for body_line in rendered[:-1]:
        print(' {0}'.format(body_line))
    if not _json_is_last:
        # more entries will follow: terminate this one with a comma
        print(' {0},'.format(final_line))
        return
    # very last entry: no trailing comma, then close the top-level list
    print(' {0}'.format(final_line))
    print(']')
| 2142 | - | ||
| 2143 | - | ||
| 2144 | -class VBA_Scanner(object): | ||
| 2145 | - """ | ||
| 2146 | - Class to scan the source code of a VBA module to find obfuscated strings, | ||
| 2147 | - suspicious keywords, IOCs, auto-executable macros, etc. | ||
| 2148 | - """ | ||
| 2149 | - | ||
| 2150 | - def __init__(self, vba_code): | ||
| 2151 | - """ | ||
| 2152 | - VBA_Scanner constructor | ||
| 2153 | - | ||
| 2154 | - :param vba_code: str, VBA source code to be analyzed | ||
| 2155 | - """ | ||
| 2156 | - if isinstance(vba_code, bytes): | ||
| 2157 | - vba_code = vba_code.decode('utf-8', 'backslashreplace') | ||
| 2158 | - # join long lines ending with " _": | ||
| 2159 | - self.code = vba_collapse_long_lines(vba_code) | ||
| 2160 | - self.code_hex = '' | ||
| 2161 | - self.code_hex_rev = '' | ||
| 2162 | - self.code_rev_hex = '' | ||
| 2163 | - self.code_base64 = '' | ||
| 2164 | - self.code_dridex = '' | ||
| 2165 | - self.code_vba = '' | ||
| 2166 | - self.strReverse = None | ||
| 2167 | - # results = None before scanning, then a list of tuples after scanning | ||
| 2168 | - self.results = None | ||
| 2169 | - self.autoexec_keywords = None | ||
| 2170 | - self.suspicious_keywords = None | ||
| 2171 | - self.iocs = None | ||
| 2172 | - self.hex_strings = None | ||
| 2173 | - self.base64_strings = None | ||
| 2174 | - self.dridex_strings = None | ||
| 2175 | - self.vba_strings = None | ||
| 2176 | - | ||
| 2177 | - | ||
| 2178 | - def scan(self, include_decoded_strings=False, deobfuscate=False): | ||
| 2179 | - """ | ||
| 2180 | - Analyze the provided VBA code to detect suspicious keywords, | ||
| 2181 | - auto-executable macros, IOC patterns, obfuscation patterns | ||
| 2182 | - such as hex-encoded strings. | ||
| 2183 | - | ||
| 2184 | - :param include_decoded_strings: bool, if True, all encoded strings will be included with their decoded content. | ||
| 2185 | - :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) | ||
| 2186 | - :return: list of tuples (type, keyword, description) | ||
| 2187 | - (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String') | ||
| 2188 | - """ | ||
| 2189 | - # First, detect and extract hex-encoded strings: | ||
| 2190 | - self.hex_strings = detect_hex_strings(self.code) | ||
| 2191 | - # detect if the code contains StrReverse: | ||
| 2192 | - self.strReverse = False | ||
| 2193 | - if 'strreverse' in self.code.lower(): self.strReverse = True | ||
| 2194 | - # Then append the decoded strings to the VBA code, to detect obfuscated IOCs and keywords: | ||
| 2195 | - for encoded, decoded in self.hex_strings: | ||
| 2196 | - self.code_hex += '\n' + decoded | ||
| 2197 | - # if the code contains "StrReverse", also append the hex strings in reverse order: | ||
| 2198 | - if self.strReverse: | ||
| 2199 | - # StrReverse after hex decoding: | ||
| 2200 | - self.code_hex_rev += '\n' + decoded[::-1] | ||
| 2201 | - # StrReverse before hex decoding: | ||
| 2202 | - self.code_rev_hex += '\n' + str(binascii.unhexlify(encoded[::-1])) | ||
| 2203 | - #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/ | ||
| 2204 | - #TODO: also append the full code reversed if StrReverse? (risk of false positives?) | ||
| 2205 | - # Detect Base64-encoded strings | ||
| 2206 | - self.base64_strings = detect_base64_strings(self.code) | ||
| 2207 | - for encoded, decoded in self.base64_strings: | ||
| 2208 | - self.code_base64 += '\n' + decoded | ||
| 2209 | - # Detect Dridex-encoded strings | ||
| 2210 | - self.dridex_strings = detect_dridex_strings(self.code) | ||
| 2211 | - for encoded, decoded in self.dridex_strings: | ||
| 2212 | - self.code_dridex += '\n' + decoded | ||
| 2213 | - # Detect obfuscated strings in VBA expressions | ||
| 2214 | - if deobfuscate: | ||
| 2215 | - self.vba_strings = detect_vba_strings(self.code) | ||
| 2216 | - else: | ||
| 2217 | - self.vba_strings = [] | ||
| 2218 | - for encoded, decoded in self.vba_strings: | ||
| 2219 | - self.code_vba += '\n' + decoded | ||
| 2220 | - results = [] | ||
| 2221 | - self.autoexec_keywords = [] | ||
| 2222 | - self.suspicious_keywords = [] | ||
| 2223 | - self.iocs = [] | ||
| 2224 | - | ||
| 2225 | - for code, obfuscation in ( | ||
| 2226 | - (self.code, None), | ||
| 2227 | - (self.code_hex, 'Hex'), | ||
| 2228 | - (self.code_hex_rev, 'Hex+StrReverse'), | ||
| 2229 | - (self.code_rev_hex, 'StrReverse+Hex'), | ||
| 2230 | - (self.code_base64, 'Base64'), | ||
| 2231 | - (self.code_dridex, 'Dridex'), | ||
| 2232 | - (self.code_vba, 'VBA expression'), | ||
| 2233 | - ): | ||
| 2234 | - if isinstance(code,bytes): | ||
| 2235 | - code=code.decode('utf-8','backslashreplace') | ||
| 2236 | - self.autoexec_keywords += detect_autoexec(code, obfuscation) | ||
| 2237 | - self.suspicious_keywords += detect_suspicious(code, obfuscation) | ||
| 2238 | - self.iocs += detect_patterns(code, obfuscation) | ||
| 2239 | - | ||
| 2240 | - # If hex-encoded strings were discovered, add an item to suspicious keywords: | ||
| 2241 | - if self.hex_strings: | ||
| 2242 | - self.suspicious_keywords.append(('Hex Strings', | ||
| 2243 | - 'Hex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) | ||
| 2244 | - if self.base64_strings: | ||
| 2245 | - self.suspicious_keywords.append(('Base64 Strings', | ||
| 2246 | - 'Base64-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) | ||
| 2247 | - if self.dridex_strings: | ||
| 2248 | - self.suspicious_keywords.append(('Dridex Strings', | ||
| 2249 | - 'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) | ||
| 2250 | - if self.vba_strings: | ||
| 2251 | - self.suspicious_keywords.append(('VBA obfuscated Strings', | ||
| 2252 | - 'VBA string expressions were detected, may be used to obfuscate strings (option --decode to see all)')) | ||
| 2253 | - # use a set to avoid duplicate keywords | ||
| 2254 | - keyword_set = set() | ||
| 2255 | - for keyword, description in self.autoexec_keywords: | ||
| 2256 | - if keyword not in keyword_set: | ||
| 2257 | - results.append(('AutoExec', keyword, description)) | ||
| 2258 | - keyword_set.add(keyword) | ||
| 2259 | - keyword_set = set() | ||
| 2260 | - for keyword, description in self.suspicious_keywords: | ||
| 2261 | - if keyword not in keyword_set: | ||
| 2262 | - results.append(('Suspicious', keyword, description)) | ||
| 2263 | - keyword_set.add(keyword) | ||
| 2264 | - keyword_set = set() | ||
| 2265 | - for pattern_type, value in self.iocs: | ||
| 2266 | - if value not in keyword_set: | ||
| 2267 | - results.append(('IOC', value, pattern_type)) | ||
| 2268 | - keyword_set.add(value) | ||
| 2269 | - | ||
| 2270 | - # include decoded strings only if they are printable or if --decode option: | ||
| 2271 | - for encoded, decoded in self.hex_strings: | ||
| 2272 | - if include_decoded_strings or is_printable(decoded): | ||
| 2273 | - results.append(('Hex String', decoded, encoded)) | ||
| 2274 | - for encoded, decoded in self.base64_strings: | ||
| 2275 | - if include_decoded_strings or is_printable(decoded): | ||
| 2276 | - results.append(('Base64 String', decoded, encoded)) | ||
| 2277 | - for encoded, decoded in self.dridex_strings: | ||
| 2278 | - if include_decoded_strings or is_printable(decoded): | ||
| 2279 | - results.append(('Dridex string', decoded, encoded)) | ||
| 2280 | - for encoded, decoded in self.vba_strings: | ||
| 2281 | - if include_decoded_strings or is_printable(decoded): | ||
| 2282 | - results.append(('VBA string', decoded, encoded)) | ||
| 2283 | - self.results = results | ||
| 2284 | - return results | ||
| 2285 | - | ||
| 2286 | - def scan_summary(self): | ||
| 2287 | - """ | ||
| 2288 | - Analyze the provided VBA code to detect suspicious keywords, | ||
| 2289 | - auto-executable macros, IOC patterns, obfuscation patterns | ||
| 2290 | - such as hex-encoded strings. | ||
| 2291 | - | ||
| 2292 | - :return: tuple with the number of items found for each category: | ||
| 2293 | - (autoexec, suspicious, IOCs, hex, base64, dridex, vba) | ||
| 2294 | - """ | ||
| 2295 | - # avoid scanning the same code twice: | ||
| 2296 | - if self.results is None: | ||
| 2297 | - self.scan() | ||
| 2298 | - return (len(self.autoexec_keywords), len(self.suspicious_keywords), | ||
| 2299 | - len(self.iocs), len(self.hex_strings), len(self.base64_strings), | ||
| 2300 | - len(self.dridex_strings), len(self.vba_strings)) | ||
| 2301 | - | ||
| 2302 | - | ||
| 2303 | -def scan_vba(vba_code, include_decoded_strings, deobfuscate=False): | ||
| 2304 | - """ | ||
| 2305 | - Analyze the provided VBA code to detect suspicious keywords, | ||
| 2306 | - auto-executable macros, IOC patterns, obfuscation patterns | ||
| 2307 | - such as hex-encoded strings. | ||
| 2308 | - (shortcut for VBA_Scanner(vba_code).scan()) | ||
| 2309 | - | ||
| 2310 | - :param vba_code: str, VBA source code to be analyzed | ||
| 2311 | - :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content. | ||
| 2312 | - :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) | ||
| 2313 | - :return: list of tuples (type, keyword, description) | ||
| 2314 | - (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String', 'Dridex string' or 'VBA string') | ||
| 2315 | - """ | ||
| 2316 | - return VBA_Scanner(vba_code).scan(include_decoded_strings, deobfuscate) | ||
| 2317 | - | ||
| 2318 | - | ||
| 2319 | -#=== CLASSES ================================================================= | ||
| 2320 | - | ||
| 2321 | -class VBA_Parser(object): | ||
| 2322 | - """ | ||
| 2323 | - Class to parse MS Office files, to detect VBA macros and extract VBA source code | ||
| 2324 | - Supported file formats: | ||
| 2325 | - - Word 97-2003 (.doc, .dot) | ||
| 2326 | - - Word 2007+ (.docm, .dotm) | ||
| 2327 | - - Word 2003 XML (.xml) | ||
| 2328 | - - Word MHT - Single File Web Page / MHTML (.mht) | ||
| 2329 | - - Excel 97-2003 (.xls) | ||
| 2330 | - - Excel 2007+ (.xlsm, .xlsb) | ||
| 2331 | - - PowerPoint 97-2003 (.ppt) | ||
| 2332 | - - PowerPoint 2007+ (.pptm, .ppsm) | ||
| 2333 | - """ | ||
| 2334 | - | ||
| 2335 | - def __init__(self, filename, data=None, container=None, relaxed=False): | ||
| 2336 | - """ | ||
| 2337 | - Constructor for VBA_Parser | ||
| 2338 | - | ||
| 2339 | - :param filename: filename or path of file to parse, or file-like object | ||
| 2340 | - | ||
| 2341 | - :param data: None or bytes str, if None the file will be read from disk (or from the file-like object). | ||
| 2342 | - If data is provided as a bytes string, it will be parsed as the content of the file in memory, | ||
| 2343 | - and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb'). | ||
| 2344 | - | ||
| 2345 | - :param container: str, path and filename of container if the file is within | ||
| 2346 | - a zip archive, None otherwise. | ||
| 2347 | - | ||
| 2348 | - :param relaxed: if True, treat malformed documents and missing streams more like MS Office: | ||
| 2349 | - do nothing; if False (default), raise errors in these cases | ||
| 2350 | - | ||
| 2351 | - raises a FileOpenError if all attempts to interpret the data header failed | ||
| 2352 | - """ | ||
| 2353 | - #TODO: filename should only be a string, data should be used for the file-like object | ||
| 2354 | - #TODO: filename should be mandatory, optional data is a string or file-like object | ||
| 2355 | - #TODO: also support olefile and zipfile as input | ||
| 2356 | - if data is None: | ||
| 2357 | - # open file from disk: | ||
| 2358 | - _file = filename | ||
| 2359 | - else: | ||
| 2360 | - # file already read in memory, make it a file-like object for zipfile: | ||
| 2361 | - _file = BytesIO(data) | ||
| 2362 | - #self.file = _file | ||
| 2363 | - self.ole_file = None | ||
| 2364 | - self.ole_subfiles = [] | ||
| 2365 | - self.filename = filename | ||
| 2366 | - self.container = container | ||
| 2367 | - self.relaxed = relaxed | ||
| 2368 | - self.type = None | ||
| 2369 | - self.vba_projects = None | ||
| 2370 | - self.vba_forms = None | ||
| 2371 | - self.contains_macros = None # will be set to True or False by detect_macros | ||
| 2372 | - self.vba_code_all_modules = None # to store the source code of all modules | ||
| 2373 | - # list of tuples for each module: (subfilename, stream_path, vba_filename, vba_code) | ||
| 2374 | - self.modules = None | ||
| 2375 | - # Analysis results: list of tuples (type, keyword, description) - See VBA_Scanner | ||
| 2376 | - self.analysis_results = None | ||
| 2377 | - # statistics for the scan summary and flags | ||
| 2378 | - self.nb_macros = 0 | ||
| 2379 | - self.nb_autoexec = 0 | ||
| 2380 | - self.nb_suspicious = 0 | ||
| 2381 | - self.nb_iocs = 0 | ||
| 2382 | - self.nb_hexstrings = 0 | ||
| 2383 | - self.nb_base64strings = 0 | ||
| 2384 | - self.nb_dridexstrings = 0 | ||
| 2385 | - self.nb_vbastrings = 0 | ||
| 2386 | - | ||
| 2387 | - # if filename is None: | ||
| 2388 | - # if isinstance(_file, basestring): | ||
| 2389 | - # if len(_file) < olefile.MINIMAL_OLEFILE_SIZE: | ||
| 2390 | - # self.filename = _file | ||
| 2391 | - # else: | ||
| 2392 | - # self.filename = '<file in bytes string>' | ||
| 2393 | - # else: | ||
| 2394 | - # self.filename = '<file-like object>' | ||
| 2395 | - if olefile.isOleFile(_file): | ||
| 2396 | - # This looks like an OLE file | ||
| 2397 | - self.open_ole(_file) | ||
| 2398 | - | ||
| 2399 | - # check whether file is encrypted (need to do this before try ppt) | ||
| 2400 | - log.debug('Check encryption of ole file') | ||
| 2401 | - crypt_indicator = oleid.OleID(self.ole_file).check_encrypted() | ||
| 2402 | - if crypt_indicator.value: | ||
| 2403 | - raise FileIsEncryptedError(filename) | ||
| 2404 | - | ||
| 2405 | - # if this worked, try whether it is a ppt file (special ole file) | ||
| 2406 | - self.open_ppt() | ||
| 2407 | - if self.type is None and is_zipfile(_file): | ||
| 2408 | - # Zip file, which may be an OpenXML document | ||
| 2409 | - self.open_openxml(_file) | ||
| 2410 | - if self.type is None: | ||
| 2411 | - # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML, | ||
| 2412 | - # or a plain text file containing VBA code | ||
| 2413 | - if data is None: | ||
| 2414 | - with open(filename, 'rb') as file_handle: | ||
| 2415 | - data = file_handle.read() | ||
| 2416 | - # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace | ||
| 2417 | - if b'http://schemas.microsoft.com/office/word/2003/wordml' in data: | ||
| 2418 | - self.open_word2003xml(data) | ||
| 2419 | - # check if it is a Word/PowerPoint 2007+ XML file (Flat OPC): must contain the namespace | ||
| 2420 | - if b'http://schemas.microsoft.com/office/2006/xmlPackage' in data: | ||
| 2421 | - self.open_flatopc(data) | ||
| 2422 | - # store a lowercase version for the next tests: | ||
| 2423 | - data_lowercase = data.lower() | ||
| 2424 | - # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"): | ||
| 2425 | - # According to my tests, these files usually start with "MIME-Version: 1.0" on the 1st line | ||
| 2426 | - # BUT Word accepts a blank line or other MIME headers inserted before, | ||
| 2427 | - # and even whitespace between "MIME", "-", "Version" and ":". The version number is ignored. | ||
| 2428 | - # And the line is case insensitive. | ||
| 2429 | - # so we'll just check the presence of mime, version and multipart anywhere: | ||
| 2430 | - if self.type is None and b'mime' in data_lowercase and b'version' in data_lowercase \ | ||
| 2431 | - and b'multipart' in data_lowercase: | ||
| 2432 | - self.open_mht(data) | ||
| 2433 | - #TODO: handle exceptions | ||
| 2434 | - #TODO: Excel 2003 XML | ||
| 2435 | - # Check whether this is rtf | ||
| 2436 | - if rtfobj.is_rtf(data, treat_str_as_data=True): | ||
| 2437 | - # Ignore RTF since it contains no macros and methods in here will not find macros | ||
| 2438 | - # in embedded objects. run rtfobj and repeat on its output. | ||
| 2439 | - msg = '%s is RTF, need to run rtfobj.py and find VBA Macros in its output.' % self.filename | ||
| 2440 | - log.info(msg) | ||
| 2441 | - raise FileOpenError(msg) | ||
| 2442 | - # Check if this is a plain text VBA or VBScript file: | ||
| 2443 | - # To avoid scanning binary files, we simply check for some control chars: | ||
| 2444 | - if self.type is None and b'\x00' not in data: | ||
| 2445 | - self.open_text(data) | ||
| 2446 | - if self.type is None: | ||
| 2447 | - # At this stage, could not match a known format: | ||
| 2448 | - msg = '%s is not a supported file type, cannot extract VBA Macros.' % self.filename | ||
| 2449 | - log.info(msg) | ||
| 2450 | - raise FileOpenError(msg) | ||
| 2451 | - | ||
| 2452 | - def open_ole(self, _file): | ||
| 2453 | - """ | ||
| 2454 | - Open an OLE file | ||
| 2455 | - :param _file: filename or file contents in a file object | ||
| 2456 | - :return: nothing | ||
| 2457 | - """ | ||
| 2458 | - log.info('Opening OLE file %s' % self.filename) | ||
| 2459 | - try: | ||
| 2460 | - # Open and parse the OLE file, using unicode for path names: | ||
| 2461 | - self.ole_file = olefile.OleFileIO(_file, path_encoding=None) | ||
| 2462 | - # set type only if parsing succeeds | ||
| 2463 | - self.type = TYPE_OLE | ||
| 2464 | - except (IOError, TypeError, ValueError) as exc: | ||
| 2465 | - # TODO: handle OLE parsing exceptions | ||
| 2466 | - log.info('Failed OLE parsing for file %r (%s)' % (self.filename, exc)) | ||
| 2467 | - log.debug('Trace:', exc_info=True) | ||
| 2468 | - | ||
| 2469 | - | ||
| 2470 | - def open_openxml(self, _file): | ||
| 2471 | - """ | ||
| 2472 | - Open an OpenXML file | ||
| 2473 | - :param _file: filename or file contents in a file object | ||
| 2474 | - :return: nothing | ||
| 2475 | - """ | ||
| 2476 | - # This looks like a zip file, need to look for vbaProject.bin inside | ||
| 2477 | - # It can be any OLE file inside the archive | ||
| 2478 | - #...because vbaProject.bin can be renamed: | ||
| 2479 | - # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18 | ||
| 2480 | - log.info('Opening ZIP/OpenXML file %s' % self.filename) | ||
| 2481 | - try: | ||
| 2482 | - z = zipfile.ZipFile(_file) | ||
| 2483 | - #TODO: check if this is actually an OpenXML file | ||
| 2484 | - #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically | ||
| 2485 | - # check each file within the zip if it is an OLE file, by reading its magic: | ||
| 2486 | - for subfile in z.namelist(): | ||
| 2487 | - with z.open(subfile) as file_handle: | ||
| 2488 | - magic = file_handle.read(len(olefile.MAGIC)) | ||
| 2489 | - if magic == olefile.MAGIC: | ||
| 2490 | - log.debug('Opening OLE file %s within zip' % subfile) | ||
| 2491 | - with z.open(subfile) as file_handle: | ||
| 2492 | - ole_data = file_handle.read() | ||
| 2493 | - try: | ||
| 2494 | - self.ole_subfiles.append( | ||
| 2495 | - VBA_Parser(filename=subfile, data=ole_data, | ||
| 2496 | - relaxed=self.relaxed)) | ||
| 2497 | - except OlevbaBaseException as exc: | ||
| 2498 | - if self.relaxed: | ||
| 2499 | - log.info('%s is not a valid OLE file (%s)' % (subfile, exc)) | ||
| 2500 | - log.debug('Trace:', exc_info=True) | ||
| 2501 | - continue | ||
| 2502 | - else: | ||
| 2503 | - raise SubstreamOpenError(self.filename, subfile, | ||
| 2504 | - exc) | ||
| 2505 | - z.close() | ||
| 2506 | - # set type only if parsing succeeds | ||
| 2507 | - self.type = TYPE_OpenXML | ||
| 2508 | - except OlevbaBaseException as exc: | ||
| 2509 | - if self.relaxed: | ||
| 2510 | - log.info('Error {0} caught in Zip/OpenXML parsing for file {1}' | ||
| 2511 | - .format(exc, self.filename)) | ||
| 2512 | - log.debug('Trace:', exc_info=True) | ||
| 2513 | - else: | ||
| 2514 | - raise | ||
| 2515 | - except (RuntimeError, zipfile.BadZipfile, zipfile.LargeZipFile, IOError) as exc: | ||
| 2516 | - # TODO: handle parsing exceptions | ||
| 2517 | - log.info('Failed Zip/OpenXML parsing for file %r (%s)' | ||
| 2518 | - % (self.filename, exc)) | ||
| 2519 | - log.debug('Trace:', exc_info=True) | ||
| 2520 | - | ||
| 2521 | - def open_word2003xml(self, data): | ||
| 2522 | - """ | ||
| 2523 | - Open a Word 2003 XML file | ||
| 2524 | - :param data: file contents in a string or bytes | ||
| 2525 | - :return: nothing | ||
| 2526 | - """ | ||
| 2527 | - log.info('Opening Word 2003 XML file %s' % self.filename) | ||
| 2528 | - try: | ||
| 2529 | - # parse the XML content | ||
| 2530 | - # TODO: handle XML parsing exceptions | ||
| 2531 | - et = ET.fromstring(data) | ||
| 2532 | - # find all the binData elements: | ||
| 2533 | - for bindata in et.getiterator(TAG_BINDATA): | ||
| 2534 | - # the binData content is an OLE container for the VBA project, compressed | ||
| 2535 | - # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded. | ||
| 2536 | - # get the filename: | ||
| 2537 | - fname = bindata.get(ATTR_NAME, 'noname.mso') | ||
| 2538 | - # decode the base64 activemime | ||
| 2539 | - mso_data = binascii.a2b_base64(bindata.text) | ||
| 2540 | - if is_mso_file(mso_data): | ||
| 2541 | - # decompress the zlib data stored in the MSO file, which is the OLE container: | ||
| 2542 | - # TODO: handle different offsets => separate function | ||
| 2543 | - try: | ||
| 2544 | - ole_data = mso_file_extract(mso_data) | ||
| 2545 | - self.ole_subfiles.append( | ||
| 2546 | - VBA_Parser(filename=fname, data=ole_data, | ||
| 2547 | - relaxed=self.relaxed)) | ||
| 2548 | - except OlevbaBaseException as exc: | ||
| 2549 | - if self.relaxed: | ||
| 2550 | - log.info('Error parsing subfile {0}: {1}' | ||
| 2551 | - .format(fname, exc)) | ||
| 2552 | - log.debug('Trace:', exc_info=True) | ||
| 2553 | - else: | ||
| 2554 | - raise SubstreamOpenError(self.filename, fname, exc) | ||
| 2555 | - else: | ||
| 2556 | - log.info('%s is not a valid MSO file' % fname) | ||
| 2557 | - # set type only if parsing succeeds | ||
| 2558 | - self.type = TYPE_Word2003_XML | ||
| 2559 | - except OlevbaBaseException as exc: | ||
| 2560 | - if self.relaxed: | ||
| 2561 | - log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc)) | ||
| 2562 | - log.debug('Trace:', exc_info=True) | ||
| 2563 | - else: | ||
| 2564 | - raise | ||
| 2565 | - except Exception as exc: | ||
| 2566 | - # TODO: differentiate exceptions for each parsing stage | ||
| 2567 | - # (but ET is different libs, no good exception description in API) | ||
| 2568 | - # found: XMLSyntaxError | ||
| 2569 | - log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc)) | ||
| 2570 | - log.debug('Trace:', exc_info=True) | ||
| 2571 | - | ||
| 2572 | - def open_flatopc(self, data): | ||
| 2573 | - """ | ||
| 2574 | - Open a Word or PowerPoint 2007+ XML file, aka "Flat OPC" | ||
| 2575 | - :param data: file contents in a string or bytes | ||
| 2576 | - :return: nothing | ||
| 2577 | - """ | ||
| 2578 | - log.info('Opening Flat OPC Word/PowerPoint XML file %s' % self.filename) | ||
| 2579 | - try: | ||
| 2580 | - # parse the XML content | ||
| 2581 | - # TODO: handle XML parsing exceptions | ||
| 2582 | - et = ET.fromstring(data) | ||
| 2583 | - # TODO: check root node namespace and tag | ||
| 2584 | - # find all the pkg:part elements: | ||
| 2585 | - for pkgpart in et.iter(TAG_PKGPART): | ||
| 2586 | - fname = pkgpart.get(ATTR_PKG_NAME, 'unknown') | ||
| 2587 | - content_type = pkgpart.get(ATTR_PKG_CONTENTTYPE, 'unknown') | ||
| 2588 | - if content_type == CTYPE_VBAPROJECT: | ||
| 2589 | - for bindata in pkgpart.iterfind(TAG_PKGBINDATA): | ||
| 2590 | - try: | ||
| 2591 | - ole_data = binascii.a2b_base64(bindata.text) | ||
| 2592 | - self.ole_subfiles.append( | ||
| 2593 | - VBA_Parser(filename=fname, data=ole_data, | ||
| 2594 | - relaxed=self.relaxed)) | ||
| 2595 | - except OlevbaBaseException as exc: | ||
| 2596 | - if self.relaxed: | ||
| 2597 | - log.info('Error parsing subfile {0}: {1}' | ||
| 2598 | - .format(fname, exc)) | ||
| 2599 | - log.debug('Trace:', exc_info=True) | ||
| 2600 | - else: | ||
| 2601 | - raise SubstreamOpenError(self.filename, fname, exc) | ||
| 2602 | - # set type only if parsing succeeds | ||
| 2603 | - self.type = TYPE_FlatOPC_XML | ||
| 2604 | - except OlevbaBaseException as exc: | ||
| 2605 | - if self.relaxed: | ||
| 2606 | - log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc)) | ||
| 2607 | - log.debug('Trace:', exc_info=True) | ||
| 2608 | - else: | ||
| 2609 | - raise | ||
| 2610 | - except Exception as exc: | ||
| 2611 | - # TODO: differentiate exceptions for each parsing stage | ||
| 2612 | - # (but ET is different libs, no good exception description in API) | ||
| 2613 | - # found: XMLSyntaxError | ||
| 2614 | - log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc)) | ||
| 2615 | - log.debug('Trace:', exc_info=True) | ||
| 2616 | - | ||
| 2617 | - def open_mht(self, data): | ||
| 2618 | - """ | ||
| 2619 | - Open a MHTML file | ||
| 2620 | - :param data: file contents in a string or bytes | ||
| 2621 | - :return: nothing | ||
| 2622 | - """ | ||
| 2623 | - log.info('Opening MHTML file %s' % self.filename) | ||
| 2624 | - try: | ||
| 2625 | - if isinstance(data,bytes): | ||
| 2626 | - data = data.decode('utf8', 'backslashreplace') | ||
| 2627 | - # parse the MIME content | ||
| 2628 | - # remove any leading whitespace or newline (workaround for issue in email package) | ||
| 2629 | - stripped_data = data.lstrip('\r\n\t ') | ||
| 2630 | - # strip any junk from the beginning of the file | ||
| 2631 | - # (issue #31 fix by Greg C - gdigreg) | ||
| 2632 | - # TODO: improve keywords to avoid false positives | ||
| 2633 | - mime_offset = stripped_data.find('MIME') | ||
| 2634 | - content_offset = stripped_data.find('Content') | ||
| 2635 | - # if "MIME" is found, and located before "Content": | ||
| 2636 | - if -1 < mime_offset <= content_offset: | ||
| 2637 | - stripped_data = stripped_data[mime_offset:] | ||
| 2638 | - # else if "Content" is found, and before "MIME" | ||
| 2639 | - # TODO: can it work without "MIME" at all? | ||
| 2640 | - elif content_offset > -1: | ||
| 2641 | - stripped_data = stripped_data[content_offset:] | ||
| 2642 | - # TODO: quick and dirty fix: insert a standard line with MIME-Version header? | ||
| 2643 | - mhtml = email.message_from_string(stripped_data) | ||
| 2644 | - # find all the attached files: | ||
| 2645 | - for part in mhtml.walk(): | ||
| 2646 | - content_type = part.get_content_type() # always returns a value | ||
| 2647 | - fname = part.get_filename(None) # returns None if it fails | ||
| 2648 | - # TODO: get content-location if no filename | ||
| 2649 | - log.debug('MHTML part: filename=%r, content-type=%r' % (fname, content_type)) | ||
| 2650 | - part_data = part.get_payload(decode=True) | ||
| 2651 | - # VBA macros are stored in a binary file named "editdata.mso". | ||
| 2652 | - # the data content is an OLE container for the VBA project, compressed | ||
| 2653 | - # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded. | ||
| 2654 | - # decompress the zlib data starting at offset 0x32, which is the OLE container: | ||
| 2655 | - # check ActiveMime header: | ||
| 2656 | - | ||
| 2657 | - if (isinstance(part_data, str) or isinstance(part_data, bytes)) and is_mso_file(part_data): | ||
| 2658 | - log.debug('Found ActiveMime header, decompressing MSO container') | ||
| 2659 | - try: | ||
| 2660 | - ole_data = mso_file_extract(part_data) | ||
| 2661 | - | ||
| 2662 | - # TODO: check if it is actually an OLE file | ||
| 2663 | - # TODO: get the MSO filename from content_location? | ||
| 2664 | - self.ole_subfiles.append( | ||
| 2665 | - VBA_Parser(filename=fname, data=ole_data, | ||
| 2666 | - relaxed=self.relaxed)) | ||
| 2667 | - except OlevbaBaseException as exc: | ||
| 2668 | - if self.relaxed: | ||
| 2669 | - log.info('%s does not contain a valid OLE file (%s)' | ||
| 2670 | - % (fname, exc)) | ||
| 2671 | - log.debug('Trace:', exc_info=True) | ||
| 2672 | - # TODO: bug here - need to split in smaller functions/classes? | ||
| 2673 | - else: | ||
| 2674 | - raise SubstreamOpenError(self.filename, fname, exc) | ||
| 2675 | - else: | ||
| 2676 | - log.debug('type(part_data) = %s' % type(part_data)) | ||
| 2677 | - try: | ||
| 2678 | - log.debug('part_data[0:20] = %r' % part_data[0:20]) | ||
| 2679 | - except TypeError as err: | ||
| 2680 | - log.debug('part_data has no __getitem__') | ||
| 2681 | - # set type only if parsing succeeds | ||
| 2682 | - self.type = TYPE_MHTML | ||
| 2683 | - except OlevbaBaseException: | ||
| 2684 | - raise | ||
| 2685 | - except Exception: | ||
| 2686 | - log.info('Failed MIME parsing for file %r - %s' | ||
| 2687 | - % (self.filename, MSG_OLEVBA_ISSUES)) | ||
| 2688 | - log.debug('Trace:', exc_info=True) | ||
| 2689 | - | ||
| 2690 | - def open_ppt(self): | ||
| 2691 | - """ try to interpret self.ole_file as PowerPoint 97-2003 using PptParser | ||
| 2692 | - | ||
| 2693 | - Although self.ole_file is a valid olefile.OleFileIO, we set | ||
| 2694 | - self.ole_file = None in here and instead set self.ole_subfiles to the | ||
| 2695 | - VBA ole streams found within the main ole file. That makes most of the | ||
| 2696 | - code below treat this like an OpenXML file and only look at the | ||
| 2697 | - ole_subfiles (except find_vba_* which needs to explicitly check for | ||
| 2698 | - self.type) | ||
| 2699 | - """ | ||
| 2700 | - | ||
| 2701 | - log.info('Check whether OLE file is PPT') | ||
| 2702 | - try: | ||
| 2703 | - ppt = ppt_parser.PptParser(self.ole_file, fast_fail=True) | ||
| 2704 | - for vba_data in ppt.iter_vba_data(): | ||
| 2705 | - self.ole_subfiles.append(VBA_Parser(None, vba_data, | ||
| 2706 | - container='PptParser')) | ||
| 2707 | - log.info('File is PPT') | ||
| 2708 | - self.ole_file.close() # just in case | ||
| 2709 | - self.ole_file = None # required to make other methods look at ole_subfiles | ||
| 2710 | - self.type = TYPE_PPT | ||
| 2711 | - except Exception as exc: | ||
| 2712 | - if self.container == 'PptParser': | ||
| 2713 | - # this is a subfile of a ppt, so it is expected not to be a ppt itself | ||
| 2714 | - log.debug('PPT subfile is not a PPT file') | ||
| 2715 | - else: | ||
| 2716 | - log.debug("File appears not to be a ppt file (%s)" % exc) | ||
| 2717 | - | ||
| 2718 | - | ||
| 2719 | - def open_text(self, data): | ||
| 2720 | - """ | ||
| 2721 | - Open a text file containing VBA or VBScript source code | ||
| 2722 | - :param data: file contents in a string or bytes | ||
| 2723 | - :return: nothing | ||
| 2724 | - """ | ||
| 2725 | - log.info('Opening text file %s' % self.filename) | ||
| 2726 | - # directly store the source code: | ||
| 2727 | - if isinstance(data,bytes): | ||
| 2728 | - data=data.decode('utf8','backslashreplace') | ||
| 2729 | - self.vba_code_all_modules = data | ||
| 2730 | - self.contains_macros = True | ||
| 2731 | - # set type only if parsing succeeds | ||
| 2732 | - self.type = TYPE_TEXT | ||
| 2733 | - | ||
| 2734 | - | ||
| 2735 | - def find_vba_projects(self): | ||
| 2736 | - """ | ||
| 2737 | - Finds all the VBA projects stored in an OLE file. | ||
| 2738 | - | ||
| 2739 | - Return None if the file is not OLE but OpenXML. | ||
| 2740 | - Return a list of tuples (vba_root, project_path, dir_path) for each VBA project. | ||
| 2741 | - vba_root is the path of the root OLE storage containing the VBA project, | ||
| 2742 | - including a trailing slash unless it is the root of the OLE file. | ||
| 2743 | - project_path is the path of the OLE stream named "PROJECT" within the VBA project. | ||
| 2744 | - dir_path is the path of the OLE stream named "VBA/dir" within the VBA project. | ||
| 2745 | - | ||
| 2746 | - If this function returns an empty list for one of the supported formats | ||
| 2747 | - (i.e. Word, Excel, Powerpoint), then the file does not contain VBA macros. | ||
| 2748 | - | ||
| 2749 | - :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path) | ||
| 2750 | - for each VBA project found if OLE file | ||
| 2751 | - """ | ||
| 2752 | - log.debug('VBA_Parser.find_vba_projects') | ||
| 2753 | - | ||
| 2754 | - # if the file is not OLE but OpenXML, return None: | ||
| 2755 | - if self.ole_file is None and self.type != TYPE_PPT: | ||
| 2756 | - return None | ||
| 2757 | - | ||
| 2758 | - # if this method has already been called, return previous result: | ||
| 2759 | - if self.vba_projects is not None: | ||
| 2760 | - return self.vba_projects | ||
| 2761 | - | ||
| 2762 | - # if this is a ppt file (PowerPoint 97-2003): | ||
| 2763 | - # self.ole_file is None but the ole_subfiles do contain vba_projects | ||
| 2764 | - # (like for OpenXML files). | ||
| 2765 | - if self.type == TYPE_PPT: | ||
| 2766 | - # TODO: so far, this function is never called for PPT files, but | ||
| 2767 | - # if that happens, the information about which ole file contains | ||
| 2768 | - # which storage is lost! | ||
| 2769 | - log.warning('Returned info is not complete for PPT types!') | ||
| 2770 | - self.vba_projects = [] | ||
| 2771 | - for subfile in self.ole_subfiles: | ||
| 2772 | - self.vba_projects.extend(subfile.find_vba_projects()) | ||
| 2773 | - return self.vba_projects | ||
| 2774 | - | ||
| 2775 | - # Find the VBA project root (different in MS Word, Excel, etc): | ||
| 2776 | - # - Word 97-2003: Macros | ||
| 2777 | - # - Excel 97-2003: _VBA_PROJECT_CUR | ||
| 2778 | - # - PowerPoint 97-2003: PptParser has identified ole_subfiles | ||
| 2779 | - # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin. | ||
| 2780 | - # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word | ||
| 2781 | - # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word | ||
| 2782 | - # - Visio 2007: not supported yet (different file structure) | ||
| 2783 | - | ||
| 2784 | - # According to MS-OVBA section 2.2.1: | ||
| 2785 | - # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream | ||
| 2786 | - # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream | ||
| 2787 | - # - all names are case-insensitive | ||
| 2788 | - | ||
| 2789 | - def check_vba_stream(ole, vba_root, stream_path): | ||
| 2790 | - full_path = vba_root + stream_path | ||
| 2791 | - if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM: | ||
| 2792 | - log.debug('Found %s stream: %s' % (stream_path, full_path)) | ||
| 2793 | - return full_path | ||
| 2794 | - else: | ||
| 2795 | - log.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path) | ||
| 2796 | - return False | ||
| 2797 | - | ||
| 2798 | - # start with an empty list: | ||
| 2799 | - self.vba_projects = [] | ||
| 2800 | - # Look for any storage containing those storage/streams: | ||
| 2801 | - ole = self.ole_file | ||
| 2802 | - for storage in ole.listdir(streams=False, storages=True): | ||
| 2803 | - log.debug('Checking storage %r' % storage) | ||
| 2804 | - # Look for a storage ending with "VBA": | ||
| 2805 | - if storage[-1].upper() == 'VBA': | ||
| 2806 | - log.debug('Found VBA storage: %s' % ('/'.join(storage))) | ||
| 2807 | - vba_root = '/'.join(storage[:-1]) | ||
| 2808 | - # Add a trailing slash to vba_root, unless it is the root of the OLE file: | ||
| 2809 | - # (used later to append all the child streams/storages) | ||
| 2810 | - if vba_root != '': | ||
| 2811 | - vba_root += '/' | ||
| 2812 | - log.debug('Checking vba_root="%s"' % vba_root) | ||
| 2813 | - | ||
| 2814 | - # Check if the VBA root storage also contains a PROJECT stream: | ||
| 2815 | - project_path = check_vba_stream(ole, vba_root, 'PROJECT') | ||
| 2816 | - if not project_path: continue | ||
| 2817 | - # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream: | ||
| 2818 | - vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT') | ||
| 2819 | - if not vba_project_path: continue | ||
| 2820 | - # Check if the VBA root storage also contains a VBA/dir stream: | ||
| 2821 | - dir_path = check_vba_stream(ole, vba_root, 'VBA/dir') | ||
| 2822 | - if not dir_path: continue | ||
| 2823 | - # Now we are pretty sure it is a VBA project structure | ||
| 2824 | - log.debug('VBA root storage: "%s"' % vba_root) | ||
| 2825 | - # append the results to the list as a tuple for later use: | ||
| 2826 | - self.vba_projects.append((vba_root, project_path, dir_path)) | ||
| 2827 | - return self.vba_projects | ||
| 2828 | - | ||
def detect_vba_macros(self):
    """
    Detect the potential presence of VBA macros in the file, by checking
    if it contains VBA projects. Both OLE and OpenXML files are supported.

    Important: for now, results are accurate only for Word, Excel and PowerPoint

    Note: this method does NOT attempt to check the actual presence or validity
    of VBA macro source code, so there might be false positives.
    It may also detect VBA macros in files embedded within the main file,
    for example an Excel workbook with macros embedded into a Word
    document without macros may be detected, without distinction.

    For OLE files, every directory entry (including orphans) is also scanned
    for the 'Attribut\\x00' marker, which flags the file as containing macros
    even when no VBA project structure was found.

    The answer is cached in self.contains_macros: later calls return it
    without reading the file again.

    :return: bool, True if at least one VBA project has been found, False otherwise
    :raises SubstreamOpenError: if reading a stream raises IOError and
        self.relaxed is false
    """
    #TODO: return None or raise exception if format not supported
    #TODO: return the number of VBA projects found instead of True/False?
    # if this method was already called, return the previous result:
    if self.contains_macros is not None:
        return self.contains_macros
    # if OpenXML/PPT, check the OLE subfiles, stopping at the first hit:
    if self.ole_file is None:
        self.contains_macros = any(ole_subfile.detect_vba_macros()
                                   for ole_subfile in self.ole_subfiles)
        return self.contains_macros
    # otherwise it's an OLE file, find VBA projects:
    vba_projects = self.find_vba_projects()
    self.contains_macros = len(vba_projects) > 0
    # Also look for VBA code in any stream including orphans
    # (happens in some malformed files)
    ole = self.ole_file
    for sid, d in enumerate(ole.direntries):
        log.debug('Checking DirEntry #%d' % sid)
        if d is None:
            # this direntry is not part of the tree: either unused or an orphan
            d = ole._load_direntry(sid)
            log.debug('This DirEntry is an orphan or unused')
        if d.entry_type != olefile.STGTY_STREAM:
            continue
        # read data
        log.debug('Reading data from stream %r - size: %d bytes' % (d.name, d.size))
        try:
            data = ole._open(d.isectStart, d.size).read()
            log.debug('Read %d bytes' % len(data))
            if len(data) > 200:
                log.debug('%r...[much more data]...%r' % (data[:100], data[-50:]))
            else:
                log.debug(repr(data))
            if 'Attribut\x00' in data.decode('utf-8', 'ignore'):
                log.debug('Found VBA compressed code')
                self.contains_macros = True
        except IOError as exc:
            if self.relaxed:
                log.info('Error when reading OLE Stream %r' % d.name)
                # exc_info (not "exc_trace") is the keyword the logging
                # module understands for attaching the current traceback
                log.debug('Trace:', exc_info=True)
            else:
                raise SubstreamOpenError(self.filename, d.name, exc)
    return self.contains_macros
| 2895 | - | ||
def extract_macros(self):
    """
    Extract and decompress source code for each VBA macro found in the file

    Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found
    If the file is OLE, filename is the path of the file.
    If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros
    within the zip archive, e.g. word/vbaProject.bin.
    If the file is PPT, result is as for OpenXML but filename is useless
    If the file is plain text, a single tuple is yielded with the whole
    text stored in self.vba_code_all_modules.

    For OLE files, streams that were not reached through a VBA project are
    additionally scanned for the compressed-code marker; for those results
    both stream_path and vba_filename are the directory entry name.
    """
    log.debug('extract_macros:')
    if self.ole_file is None:
        # This may be either an OpenXML/PPT or a text file:
        if self.type == TYPE_TEXT:
            # This is a text file, yield the full code:
            yield (self.filename, '', self.filename, self.vba_code_all_modules)
        else:
            # OpenXML/PPT: recursively yield results from each OLE subfile:
            for ole_subfile in self.ole_subfiles:
                for results in ole_subfile.extract_macros():
                    yield results
        return

    # This is an OLE file:
    self.find_vba_projects()
    # set of stream ids already extracted through a VBA project
    vba_stream_ids = set()
    for vba_root, project_path, dir_path in self.vba_projects:
        # extract all VBA macros from that VBA root storage:
        # The function _extract_vba may fail on some files (issue #132),
        # in which case the error is logged and the next project is tried
        try:
            for stream_path, vba_filename, vba_code in \
                    _extract_vba(self.ole_file, vba_root, project_path,
                                 dir_path, self.relaxed):
                # store direntry ids in a set:
                vba_stream_ids.add(self.ole_file._find(stream_path))
                yield (self.filename, stream_path, vba_filename, vba_code)
        except Exception:
            log.exception('Error in _extract_vba')
    # Also look for VBA code in any stream including orphans
    # (happens in some malformed files)
    ole = self.ole_file
    for sid, d in enumerate(ole.direntries):
        # check if id is already done above:
        log.debug('Checking DirEntry #%d' % sid)
        if sid in vba_stream_ids:
            log.debug('Already extracted')
            continue
        if d is None:
            # this direntry is not part of the tree: either unused or an orphan
            d = ole._load_direntry(sid)
            log.debug('This DirEntry is an orphan or unused')
        if d.entry_type != olefile.STGTY_STREAM:
            continue
        # read data
        log.debug('Reading data from stream %r' % d.name)
        data = ole._open(d.isectStart, d.size).read()
        for match in re.finditer(b'\\x00Attribut[^e]', data, flags=re.IGNORECASE):
            # the compressed data is taken to begin 3 bytes before the marker
            start = match.start() - 3
            if start < 0:
                # fewer than 3 bytes precede the marker: a negative index
                # would silently slice from the END of the stream instead
                log.debug('Marker too close to the start of stream %r, skipping' % d.name)
                continue
            log.debug('Found VBA compressed code at index %X' % start)
            compressed_code = data[start:]
            try:
                vba_code = decompress_stream(bytearray(compressed_code))
                yield (self.filename, d.name, d.name, vba_code)
            except Exception as exc:
                # display the exception with full stack trace for debugging
                log.debug('Error processing stream %r in file %r (%s)' % (d.name, self.filename, exc))
                log.debug('Traceback:', exc_info=True)
                # do not raise the error, as it is unlikely to be a compressed macro stream
| 2964 | - | ||
def extract_all_macros(self):
    """
    Run extract_macros() once and keep its results.

    Each (filename, stream_path, vba_filename, vba_code) tuple produced by
    extract_macros() is appended to self.modules, and self.nb_macros is set
    to the number of tuples collected. When self.modules is already set,
    it is returned as is and nothing is extracted again.

    :return: list of tuples (filename, stream_path, vba_filename, vba_code)
    """
    # already extracted: reuse the stored list
    if self.modules is not None:
        return self.modules
    self.modules = []
    for subfilename, stream_path, vba_filename, vba_code in self.extract_macros():
        entry = (subfilename, stream_path, vba_filename, vba_code)
        self.modules.append(entry)
    self.nb_macros = len(self.modules)
    return self.modules
| 2978 | - | ||
| 2979 | - | ||
| 2980 | - | ||
def analyze_macros(self, show_decoded_strings=False, deobfuscate=False):
    """
    Extract all VBA macros (and form strings) from the file and scan their
    merged source code.

    The merged source is kept in self.vba_code_all_modules, the scan output
    in self.analysis_results, and the nb_* counters are incremented with the
    totals reported by the scanner summary.
    If called more than once, simply returns the previous results.

    :param show_decoded_strings: passed through to VBA_Scanner.scan
    :param deobfuscate: passed through to VBA_Scanner.scan
    :return: self.analysis_results (left untouched when no macro is detected)
    """
    if not self.detect_vba_macros():
        return self.analysis_results
    # if the analysis was already done, avoid doing it twice:
    if self.analysis_results is not None:
        return self.analysis_results
    # merge source code from all modules, unless it is already available:
    if self.vba_code_all_modules is None:
        self.vba_code_all_modules = ''
        for (_subfile, _stream, _module, source) in self.extract_all_macros():
            #TODO: filter code? (each module)
            if isinstance(source, bytes):
                source = source.decode('utf-8', 'ignore')
            self.vba_code_all_modules += source + '\n'
        for (_subfile, _stream, form_text) in self.extract_form_strings():
            self.vba_code_all_modules += form_text.decode('utf-8', 'ignore') + '\n'
    # Analyze the whole code at once:
    scanner = VBA_Scanner(self.vba_code_all_modules)
    self.analysis_results = scanner.scan(show_decoded_strings, deobfuscate)
    (n_autoexec, n_suspicious, n_iocs, n_hex,
     n_base64, n_dridex, n_vba) = scanner.scan_summary()
    self.nb_autoexec += n_autoexec
    self.nb_suspicious += n_suspicious
    self.nb_iocs += n_iocs
    self.nb_hexstrings += n_hex
    self.nb_base64strings += n_base64
    self.nb_dridexstrings += n_dridex
    self.nb_vbastrings += n_vba
    return self.analysis_results
| 3015 | - | ||
| 3016 | - | ||
def reveal(self):
    """
    Return the merged VBA source code in which every encoded expression
    reported as a 'VBA string' by the analysis is replaced by its decoded
    value, written as a double-quoted VBA string literal.

    The source is first passed through vba_collapse_long_lines and
    filter_vba.

    :return: str, the source code after substitution
    """
    # we only want printable strings:
    findings = self.analyze_macros(show_decoded_strings=False)
    # to avoid replacing short strings contained into longer strings, go
    # through the findings from the longest encoded string to the shortest
    # (sorted() builds a new list, the stored analysis results keep their order):
    findings = sorted(findings, key=lambda finding: len(finding[2]), reverse=True)
    # normally now self.vba_code_all_modules contains source code from all modules
    # Need to collapse long lines:
    revealed = filter_vba(vba_collapse_long_lines(self.vba_code_all_modules))
    for kw_type, decoded, encoded in findings:
        if kw_type != 'VBA string':
            continue
        # escape double-quotes as double-double-quotes for VBA, then
        # enclose the decoded string in double quotes:
        literal = '"%s"' % decoded.replace('"', '""')
        # if the encoded string is enclosed in parentheses,
        # keep them in the decoded version:
        if encoded.startswith('(') and encoded.endswith(')'):
            literal = '(%s)' % literal
        revealed = revealed.replace(encoded, literal)
    # # TODO: there is a bug somewhere which creates double returns '\r\r'
    # revealed = revealed.replace('\r\r', '\r')
    #TODO: run the analysis again several times if hex or base64 strings are revealed
    return revealed
| 3043 | - | ||
| 3044 | - | ||
def find_vba_forms(self):
    """
    Find all the VBA forms stored in an OLE file.

    A storage is reported as a form when it directly contains two streams
    named "o" and "f". The list of matches is also stored in self.vba_forms.

    :return: None if the file is neither OLE nor PPT (e.g. OpenXML);
        otherwise a list with one entry per form found, each entry being the
        storage path as produced by listdir() (a list of path components)
    """
    log.debug('VBA_Parser.find_vba_forms')

    # if the file is not OLE but OpenXML, return None:
    if self.ole_file is None and self.type != TYPE_PPT:
        return None

    # According to MS-OFORMS section 2.1.2 Control Streams:
    # - A parent control, that is, a control that can contain embedded controls,
    #   MUST be persisted as a storage that contains multiple streams.
    # - All parent controls MUST contain a FormControl. The FormControl
    #   properties are persisted to a stream (1) as specified in section 2.1.1.2.
    #   The name of this stream (1) MUST be "f".
    # - Embedded controls that cannot themselves contain other embedded
    #   controls are persisted sequentially as FormEmbeddedActiveXControls
    #   to a stream (1) contained in the same storage as the parent control.
    #   The name of this stream (1) MUST be "o".
    # - all names are case-insensitive

    if self.type == TYPE_PPT:
        # TODO: so far, this function is never called for PPT files, but
        #       if that happens, the information is lost which ole file contains
        #       which storage!
        containers = self.ole_subfiles
        log.warning('Returned info is not complete for PPT types!')
    else:
        containers = [self.ole_file, ]

    # start with an empty list:
    self.vba_forms = []

    # Loop over the OLE containers
    for ole in containers:
        # Look at every storage, at any depth:
        for storage in ole.listdir(streams=False, storages=True):
            log.debug('Checking storage %r' % storage)
            # Look for two streams named 'o' and 'f':
            o_stream = storage + ['o']
            f_stream = storage + ['f']
            log.debug('Checking if streams %r and %r exist' % (f_stream, o_stream))
            # "o" is tested before "f", each must exist AND be a stream
            both_present = all(
                ole.exists(candidate)
                and ole.get_type(candidate) == olefile.STGTY_STREAM
                for candidate in (o_stream, f_stream))
            if both_present:
                log.debug('Found VBA Form: %r' % '/'.join(storage))
                self.vba_forms.append(storage)
    return self.vba_forms
| 3111 | - | ||
def extract_form_strings(self):
    """
    Extract printable strings from each VBA Form found in the file

    Iterator: yields (filename, stream_path, form_string) for each printable
    string matched by re_printable_string in the "o" stream of a form.
    stream_path is the "/"-joined path of that "o" stream and form_string is
    the matched text exactly as found in the stream data.
    If the file is OLE, filename is the path of the file.
    If the file is OpenXML/PPT, results come from each OLE subfile.
    If the file is plain text, nothing is yielded.
    """
    if self.ole_file is not None:
        # This is an OLE file:
        self.find_vba_forms()
        ole = self.ole_file
        for form_storage in self.vba_forms:
            o_stream = form_storage + ['o']
            o_stream_path = '/'.join(o_stream)
            log.debug('Opening form object stream %r' % o_stream_path)
            form_data = ole.openstream(o_stream).read()
            # Extract printable strings from the form object stream "o":
            for found in re_printable_string.finditer(form_data):
                log.debug('Printable string found in form: %r' % found.group())
                yield (self.filename, o_stream_path, found.group())
        return
    # No OLE file: this may be either an OpenXML/PPT or a text file
    if self.type == TYPE_TEXT:
        # This is a text file, return no results:
        return
    # OpenXML/PPT: recursively yield results from each OLE subfile:
    for ole_subfile in self.ole_subfiles:
        for results in ole_subfile.extract_form_strings():
            yield results
| 3144 | - | ||
| 3145 | - | ||
def close(self):
    """
    Close all the open files. This method must be called after usage, if
    the application is opening many files.

    Closes self.ole_file when there is one; otherwise closes every entry of
    self.ole_subfiles (when that list is set).
    """
    if self.ole_file is not None:
        self.ole_file.close()
        return
    if self.ole_subfiles is None:
        return
    for ole_subfile in self.ole_subfiles:
        ole_subfile.close()
| 3157 | - | ||
| 3158 | - | ||
| 3159 | - | ||
class VBA_Parser_CLI(VBA_Parser):
    """
    VBA parser and analyzer, adding methods for the command line interface
    of olevba. (see VBA_Parser)
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor for VBA_Parser_CLI.
        Calls __init__ from VBA_Parser with all arguments --> see doc there
        """
        super(VBA_Parser_CLI, self).__init__(*args, **kwargs)

    def print_analysis(self, show_decoded_strings=False, deobfuscate=False):
        """
        Analyze the VBA code of the file, and print the results in a table

        :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
        :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
        :return: None
        """
        # print a waiting message only if the output is not redirected to a file:
        if sys.stdout.isatty():
            print('Analysis...\r', end='')
            sys.stdout.flush()
        results = self.analyze_macros(show_decoded_strings, deobfuscate)
        if not results:
            print('No suspicious keyword or IOC found.')
            return
        t = prettytable.PrettyTable(('Type', 'Keyword', 'Description'))
        t.align = 'l'
        t.max_width['Type'] = 10
        t.max_width['Keyword'] = 20
        t.max_width['Description'] = 39
        for kw_type, keyword, description in results:
            # handle non printable strings:
            if not is_printable(keyword):
                keyword = repr(keyword)
            if not is_printable(description):
                description = repr(description)
            t.add_row((kw_type, keyword, description))
        print(t)

    def print_analysis_json(self, show_decoded_strings=False, deobfuscate=False):
        """
        Analyze the VBA code of the file, and return the results in a
        json-friendly structure

        :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
        :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)

        :return: list of dicts with keys 'type', 'keyword' and 'description',
                 one per analysis result
        """
        # print a waiting message only if the output is not redirected to a file:
        if sys.stdout.isatty():
            print('Analysis...\r', end='')
            sys.stdout.flush()
        return [dict(type=kw_type, keyword=keyword, description=description)
                for kw_type, keyword, description in self.analyze_macros(show_decoded_strings, deobfuscate)]

    def process_file(self, show_decoded_strings=False,
                     display_code=True, hide_attributes=True,
                     vba_code_only=False, show_deobfuscated_code=False,
                     deobfuscate=False):
        """
        Process a single file and print the detailed report on stdout

        :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
        :param display_code: bool, if False VBA source code is not displayed (default True)
        :param hide_attributes: bool, if True the source is passed through filter_vba before display (default)
        :param vba_code_only: bool, if True the analysis table is not printed; forces display_code to True
        :param show_deobfuscated_code: bool, if True also print the output of reveal()
        :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
        :raises ProcessingError: wrapping any exception other than
            OlevbaBaseException, which is re-raised unchanged
        """
        #TODO: replace print by writing to a provided output file (sys.stdout by default)
        # fix conflicting parameters:
        if vba_code_only and not display_code:
            display_code = True
        if self.container:
            display_filename = '%s in %s' % (self.filename, self.container)
        else:
            display_filename = self.filename
        print('=' * 79)
        print('FILE: %s' % display_filename)
        try:
            #TODO: handle olefile errors, when an OLE file is malformed
            print('Type: %s'% self.type)
            if self.detect_vba_macros():
                for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros():
                    # Always turn bytes into text first (same as
                    # process_file_json): otherwise, with hide_attributes
                    # off, the empty-macro test below compares bytes to ''
                    # and the code is printed as a bytes repr.
                    if isinstance(vba_code, bytes):
                        vba_code = vba_code.decode('utf-8', 'backslashreplace')
                    if hide_attributes:
                        # hide attribute lines:
                        vba_code_filtered = filter_vba(vba_code)
                    else:
                        vba_code_filtered = vba_code
                    print('-' * 79)
                    print('VBA MACRO %s ' % vba_filename)
                    print('in file: %s - OLE stream: %s' % (subfilename, repr(stream_path)))
                    if display_code:
                        print('- ' * 39)
                        # detect empty macros:
                        if vba_code_filtered.strip() == '':
                            print('(empty macro)')
                        else:
                            print(vba_code_filtered)
                for (subfilename, stream_path, form_string) in self.extract_form_strings():
                    print('-' * 79)
                    print('VBA FORM STRING IN %r - OLE stream: %r' % (subfilename, stream_path))
                    print('- ' * 39)
                    print(form_string.decode('utf-8', 'ignore'))
                if not vba_code_only:
                    # analyse the code from all modules at once:
                    self.print_analysis(show_decoded_strings, deobfuscate)
                if show_deobfuscated_code:
                    print('MACRO SOURCE CODE WITH DEOBFUSCATED VBA STRINGS (EXPERIMENTAL):\n\n')
                    print(self.reveal())
            else:
                print('No VBA macros found.')
        except OlevbaBaseException:
            raise
        except Exception as exc:
            # display the exception with full stack trace for debugging
            log.info('Error processing file %s (%s)' % (self.filename, exc))
            log.debug('Traceback:', exc_info=True)
            raise ProcessingError(self.filename, exc)
        print('')

    def process_file_json(self, show_decoded_strings=False,
                          display_code=True, hide_attributes=True,
                          vba_code_only=False, show_deobfuscated_code=False,
                          deobfuscate=False):
        """
        Process a single file and return the detailed report as a dict

        every "show" or "print" here is to be translated as "add to json"

        :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
        :param display_code: bool, if False the 'code' entry of each macro is None (default True)
        :param hide_attributes: bool, if True the source is passed through filter_vba (default)
        :param vba_code_only: bool, if True the 'analysis' entry stays None; forces display_code to True
        :param show_deobfuscated_code: bool, if True 'code_deobfuscated' holds the output of reveal()
        :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
        :return: dict with keys 'container', 'file', 'type', 'macros',
            'analysis', 'code_deobfuscated', 'do_deobfuscate' and
            'json_conversion_successful'
        :raises ProcessingError: wrapping any exception raised while processing
        """
        #TODO: fix conflicting parameters (?)

        if vba_code_only and not display_code:
            display_code = True

        result = {}

        if self.container:
            result['container'] = self.container
        else:
            result['container'] = None
        result['file'] = self.filename
        result['json_conversion_successful'] = False
        result['analysis'] = None
        result['code_deobfuscated'] = None
        result['do_deobfuscate'] = deobfuscate

        try:
            #TODO: handle olefile errors, when an OLE file is malformed
            result['type'] = self.type
            macros = []
            if self.detect_vba_macros():
                for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros():
                    curr_macro = {}
                    if isinstance(vba_code, bytes):
                        vba_code = vba_code.decode('utf-8', 'backslashreplace')

                    if hide_attributes:
                        # hide attribute lines:
                        vba_code_filtered = filter_vba(vba_code)
                    else:
                        vba_code_filtered = vba_code

                    curr_macro['vba_filename'] = vba_filename
                    curr_macro['subfilename'] = subfilename
                    curr_macro['ole_stream'] = stream_path
                    if display_code:
                        curr_macro['code'] = vba_code_filtered.strip()
                    else:
                        curr_macro['code'] = None
                    macros.append(curr_macro)
                if not vba_code_only:
                    # analyse the code from all modules at once:
                    result['analysis'] = self.print_analysis_json(show_decoded_strings,
                                                                  deobfuscate)
                if show_deobfuscated_code:
                    result['code_deobfuscated'] = self.reveal()
            result['macros'] = macros
            result['json_conversion_successful'] = True
        except Exception as exc:
            # display the exception with full stack trace for debugging
            log.info('Error processing file %s (%s)' % (self.filename, exc))
            log.debug('Traceback:', exc_info=True)
            raise ProcessingError(self.filename, exc)

        return result

    def process_file_triage(self, show_decoded_strings=False, deobfuscate=False):
        """
        Process a file in triage mode, showing only summary results on one line.

        The line is made of the tag found in TYPE2TAG for the file type,
        followed by one character per indicator (a letter when the indicator
        is set, '-' otherwise) in the order: macros (M), auto-exec (A),
        suspicious (S), IOCs (I), hex strings (H), base64 strings (B),
        Dridex strings (D), VBA strings (V); then the filename.

        :param show_decoded_strings: passed through to analyze_macros
        :param deobfuscate: passed through to analyze_macros
        :raises ProcessingError: wrapping any exception raised while processing
        """
        #TODO: replace print by writing to a provided output file (sys.stdout by default)
        try:
            #TODO: handle olefile errors, when an OLE file is malformed
            if self.detect_vba_macros():
                # print a waiting message only if the output is not redirected to a file:
                if sys.stdout.isatty():
                    print('Analysis...\r', end='')
                    sys.stdout.flush()
                self.analyze_macros(show_decoded_strings=show_decoded_strings,
                                    deobfuscate=deobfuscate)
            flags = TYPE2TAG[self.type]
            indicators = (
                (self.contains_macros, 'M'),
                (self.nb_autoexec, 'A'),
                (self.nb_suspicious, 'S'),
                (self.nb_iocs, 'I'),
                (self.nb_hexstrings, 'H'),
                (self.nb_base64strings, 'B'),
                (self.nb_dridexstrings, 'D'),
                (self.nb_vbastrings, 'V'),
            )
            flags += ''.join(letter if is_set else '-'
                             for is_set, letter in indicators)

            line = '%-12s %s' % (flags, self.filename)
            print(line)
        except Exception as exc:
            # display the exception with full stack trace for debugging only
            log.debug('Error processing file %s (%s)' % (self.filename, exc),
                      exc_info=True)
            raise ProcessingError(self.filename, exc)
| 3404 | - | ||
| 3405 | - | ||
| 3406 | -#=== MAIN ===================================================================== | ||
| 3407 | - | ||
| 3408 | -def parse_args(cmd_line_args=None): | ||
| 3409 | - """ parse command line arguments (given ones or per default sys.argv) """ | ||
| 3410 | - | ||
| 3411 | - DEFAULT_LOG_LEVEL = "warning" # Default log level | ||
| 3412 | - LOG_LEVELS = { | ||
| 3413 | - 'debug': logging.DEBUG, | ||
| 3414 | - 'info': logging.INFO, | ||
| 3415 | - 'warning': logging.WARNING, | ||
| 3416 | - 'error': logging.ERROR, | ||
| 3417 | - 'critical': logging.CRITICAL | ||
| 3418 | - } | ||
| 3419 | - | ||
| 3420 | - usage = 'usage: olevba [options] <filename> [filename2 ...]' | ||
| 3421 | - parser = optparse.OptionParser(usage=usage) | ||
| 3422 | - # parser.add_option('-o', '--outfile', dest='outfile', | ||
| 3423 | - # help='output file') | ||
| 3424 | - # parser.add_option('-c', '--csv', dest='csv', | ||
| 3425 | - # help='export results to a CSV file') | ||
| 3426 | - parser.add_option("-r", action="store_true", dest="recursive", | ||
| 3427 | - help='find files recursively in subdirectories.') | ||
| 3428 | - parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None, | ||
| 3429 | - help='if the file is a zip archive, open all files from it, using the provided password (requires Python 2.6+)') | ||
| 3430 | - parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*', | ||
| 3431 | - help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)') | ||
| 3432 | - # output mode; could make this even simpler with add_option(type='choice') but that would make | ||
| 3433 | - # cmd line interface incompatible... | ||
| 3434 | - modes = optparse.OptionGroup(parser, title='Output mode (mutually exclusive)') | ||
| 3435 | - modes.add_option("-t", '--triage', action="store_const", dest="output_mode", | ||
| 3436 | - const='triage', default='unspecified', | ||
| 3437 | - help='triage mode, display results as a summary table (default for multiple files)') | ||
| 3438 | - modes.add_option("-d", '--detailed', action="store_const", dest="output_mode", | ||
| 3439 | - const='detailed', default='unspecified', | ||
| 3440 | - help='detailed mode, display full results (default for single file)') | ||
| 3441 | - modes.add_option("-j", '--json', action="store_const", dest="output_mode", | ||
| 3442 | - const='json', default='unspecified', | ||
| 3443 | - help='json mode, detailed in json format (never default)') | ||
| 3444 | - parser.add_option_group(modes) | ||
| 3445 | - parser.add_option("-a", '--analysis', action="store_false", dest="display_code", default=True, | ||
| 3446 | - help='display only analysis results, not the macro source code') | ||
| 3447 | - parser.add_option("-c", '--code', action="store_true", dest="vba_code_only", default=False, | ||
| 3448 | - help='display only VBA source code, do not analyze it') | ||
| 3449 | - parser.add_option("--decode", action="store_true", dest="show_decoded_strings", | ||
| 3450 | - help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex, VBA).') | ||
| 3451 | - parser.add_option("--attr", action="store_false", dest="hide_attributes", default=True, | ||
| 3452 | - help='display the attribute lines at the beginning of VBA source code') | ||
| 3453 | - parser.add_option("--reveal", action="store_true", dest="show_deobfuscated_code", | ||
| 3454 | - help='display the macro source code after replacing all the obfuscated strings by their decoded content.') | ||
| 3455 | - parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, | ||
| 3456 | - help="logging level debug/info/warning/error/critical (default=%default)") | ||
| 3457 | - parser.add_option('--deobf', dest="deobfuscate", action="store_true", default=False, | ||
| 3458 | - help="Attempt to deobfuscate VBA expressions (slow)") | ||
| 3459 | - parser.add_option('--relaxed', dest="relaxed", action="store_true", default=False, | ||
| 3460 | - help="Do not raise errors if opening of substream fails") | ||
| 3461 | - | ||
| 3462 | - (options, args) = parser.parse_args(cmd_line_args) | ||
| 3463 | - | ||
| 3464 | - # Print help if no arguments are passed | ||
| 3465 | - if len(args) == 0: | ||
| 3466 | - # print banner with version | ||
| 3467 | - python_version = '%d.%d.%d' % sys.version_info[0:3] | ||
| 3468 | - print('olevba3 %s on Python %s - http://decalage.info/python/oletools' % | ||
| 3469 | - (__version__, python_version)) | ||
| 3470 | - print(__doc__) | ||
| 3471 | - parser.print_help() | ||
| 3472 | - sys.exit(RETURN_WRONG_ARGS) | ||
| 3473 | - | ||
| 3474 | - options.loglevel = LOG_LEVELS[options.loglevel] | ||
| 3475 | - | ||
| 3476 | - return options, args | ||
| 3477 | - | ||
| 3478 | - | ||
| 3479 | -def main(cmd_line_args=None): | ||
| 3480 | - """ | ||
| 3481 | - Main function, called when olevba is run from the command line | ||
| 3482 | - | ||
| 3483 | - Optional argument: command line arguments to be forwarded to ArgumentParser | ||
| 3484 | - in process_args. Per default (cmd_line_args=None), sys.argv is used. Option | ||
| 3485 | - mainly added for unit-testing | ||
| 3486 | - """ | ||
| 3487 | - | ||
| 3488 | - options, args = parse_args(cmd_line_args) | ||
| 3489 | - | ||
| 3490 | - # provide info about tool and its version | ||
| 3491 | - if options.output_mode == 'json': | ||
| 3492 | - # print first json entry with meta info and opening '[' | ||
| 3493 | - print_json(script_name='olevba', version=__version__, | ||
| 3494 | - url='http://decalage.info/python/oletools', | ||
| 3495 | - type='MetaInformation', _json_is_first=True) | ||
| 3496 | - else: | ||
| 3497 | - # print banner with version | ||
| 3498 | - python_version = '%d.%d.%d' % sys.version_info[0:3] | ||
| 3499 | - print('olevba3 %s on Python %s - http://decalage.info/python/oletools' % | ||
| 3500 | - (__version__, python_version)) | ||
| 3501 | - | ||
| 3502 | - logging.basicConfig(level=options.loglevel, format='%(levelname)-8s %(message)s') | ||
| 3503 | - # enable logging in the modules: | ||
| 3504 | - enable_logging() | ||
| 3505 | - | ||
| 3506 | - # with the option --reveal, make sure --deobf is also enabled: | ||
| 3507 | - if options.show_deobfuscated_code and not options.deobfuscate: | ||
| 3508 | - log.info('set --deobf because --reveal was set') | ||
| 3509 | - options.deobfuscate = True | ||
| 3510 | - if options.output_mode == 'triage' and options.show_deobfuscated_code: | ||
| 3511 | - log.info('ignoring option --reveal in triage output mode') | ||
| 3512 | - | ||
| 3513 | - # Column headers (do not know how many files there will be yet, so if no output_mode | ||
| 3514 | - # was specified, we will print triage for first file --> need these headers) | ||
| 3515 | - if options.output_mode in ('triage', 'unspecified'): | ||
| 3516 | - print('%-12s %-65s' % ('Flags', 'Filename')) | ||
| 3517 | - print('%-12s %-65s' % ('-' * 11, '-' * 65)) | ||
| 3518 | - | ||
| 3519 | - previous_container = None | ||
| 3520 | - count = 0 | ||
| 3521 | - container = filename = data = None | ||
| 3522 | - vba_parser = None | ||
| 3523 | - return_code = RETURN_OK | ||
| 3524 | - try: | ||
| 3525 | - for container, filename, data in xglob.iter_files(args, recursive=options.recursive, | ||
| 3526 | - zip_password=options.zip_password, zip_fname=options.zip_fname): | ||
| 3527 | - # ignore directory names stored in zip files: | ||
| 3528 | - if container and filename.endswith('/'): | ||
| 3529 | - continue | ||
| 3530 | - | ||
| 3531 | - # handle errors from xglob | ||
| 3532 | - if isinstance(data, Exception): | ||
| 3533 | - if isinstance(data, PathNotFoundException): | ||
| 3534 | - if options.output_mode in ('triage', 'unspecified'): | ||
| 3535 | - print('%-12s %s - File not found' % ('?', filename)) | ||
| 3536 | - elif options.output_mode != 'json': | ||
| 3537 | - log.error('Given path %r does not exist!' % filename) | ||
| 3538 | - return_code = RETURN_FILE_NOT_FOUND if return_code == 0 \ | ||
| 3539 | - else RETURN_SEVERAL_ERRS | ||
| 3540 | - else: | ||
| 3541 | - if options.output_mode in ('triage', 'unspecified'): | ||
| 3542 | - print('%-12s %s - Failed to read from zip file %s' % ('?', filename, container)) | ||
| 3543 | - elif options.output_mode != 'json': | ||
| 3544 | - log.error('Exception opening/reading %r from zip file %r: %s' | ||
| 3545 | - % (filename, container, data)) | ||
| 3546 | - return_code = RETURN_XGLOB_ERR if return_code == 0 \ | ||
| 3547 | - else RETURN_SEVERAL_ERRS | ||
| 3548 | - if options.output_mode == 'json': | ||
| 3549 | - print_json(file=filename, type='error', | ||
| 3550 | - error=type(data).__name__, message=str(data)) | ||
| 3551 | - continue | ||
| 3552 | - | ||
| 3553 | - try: | ||
| 3554 | - # close the previous file if analyzing several: | ||
| 3555 | - # (this must be done here to avoid closing the file if there is only 1, | ||
| 3556 | - # to fix issue #219) | ||
| 3557 | - if vba_parser is not None: | ||
| 3558 | - vba_parser.close() | ||
| 3559 | - # Open the file | ||
| 3560 | - vba_parser = VBA_Parser_CLI(filename, data=data, container=container, | ||
| 3561 | - relaxed=options.relaxed) | ||
| 3562 | - | ||
| 3563 | - if options.output_mode == 'detailed': | ||
| 3564 | - # fully detailed output | ||
| 3565 | - vba_parser.process_file(show_decoded_strings=options.show_decoded_strings, | ||
| 3566 | - display_code=options.display_code, | ||
| 3567 | - hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, | ||
| 3568 | - show_deobfuscated_code=options.show_deobfuscated_code, | ||
| 3569 | - deobfuscate=options.deobfuscate) | ||
| 3570 | - elif options.output_mode in ('triage', 'unspecified'): | ||
| 3571 | - # print container name when it changes: | ||
| 3572 | - if container != previous_container: | ||
| 3573 | - if container is not None: | ||
| 3574 | - print('\nFiles in %s:' % container) | ||
| 3575 | - previous_container = container | ||
| 3576 | - # summarized output for triage: | ||
| 3577 | - vba_parser.process_file_triage(show_decoded_strings=options.show_decoded_strings, | ||
| 3578 | - deobfuscate=options.deobfuscate) | ||
| 3579 | - elif options.output_mode == 'json': | ||
| 3580 | - print_json( | ||
| 3581 | - vba_parser.process_file_json(show_decoded_strings=options.show_decoded_strings, | ||
| 3582 | - display_code=options.display_code, | ||
| 3583 | - hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, | ||
| 3584 | - show_deobfuscated_code=options.show_deobfuscated_code, | ||
| 3585 | - deobfuscate=options.deobfuscate)) | ||
| 3586 | - else: # (should be impossible) | ||
| 3587 | - raise ValueError('unexpected output mode: "{0}"!'.format(options.output_mode)) | ||
| 3588 | - count += 1 | ||
| 3589 | - | ||
| 3590 | - except (SubstreamOpenError, UnexpectedDataError) as exc: | ||
| 3591 | - if options.output_mode in ('triage', 'unspecified'): | ||
| 3592 | - print('%-12s %s - Error opening substream or uenxpected ' \ | ||
| 3593 | - 'content' % ('?', filename)) | ||
| 3594 | - elif options.output_mode == 'json': | ||
| 3595 | - print_json(file=filename, type='error', | ||
| 3596 | - error=type(exc).__name__, message=str(exc)) | ||
| 3597 | - else: | ||
| 3598 | - log.exception('Error opening substream or unexpected ' | ||
| 3599 | - 'content in %s' % filename) | ||
| 3600 | - return_code = RETURN_OPEN_ERROR if return_code == 0 \ | ||
| 3601 | - else RETURN_SEVERAL_ERRS | ||
| 3602 | - except FileOpenError as exc: | ||
| 3603 | - if options.output_mode in ('triage', 'unspecified'): | ||
| 3604 | - print('%-12s %s - File format not supported' % ('?', filename)) | ||
| 3605 | - elif options.output_mode == 'json': | ||
| 3606 | - print_json(file=filename, type='error', | ||
| 3607 | - error=type(exc).__name__, message=str(exc)) | ||
| 3608 | - else: | ||
| 3609 | - log.exception('Failed to open %s -- probably not supported!' % filename) | ||
| 3610 | - return_code = RETURN_OPEN_ERROR if return_code == 0 \ | ||
| 3611 | - else RETURN_SEVERAL_ERRS | ||
| 3612 | - except ProcessingError as exc: | ||
| 3613 | - if options.output_mode in ('triage', 'unspecified'): | ||
| 3614 | - print('%-12s %s - %s' % ('!ERROR', filename, exc.orig_exc)) | ||
| 3615 | - elif options.output_mode == 'json': | ||
| 3616 | - print_json(file=filename, type='error', | ||
| 3617 | - error=type(exc).__name__, | ||
| 3618 | - message=str(exc.orig_exc)) | ||
| 3619 | - else: | ||
| 3620 | - log.exception('Error processing file %s (%s)!' | ||
| 3621 | - % (filename, exc.orig_exc)) | ||
| 3622 | - return_code = RETURN_PARSE_ERROR if return_code == 0 \ | ||
| 3623 | - else RETURN_SEVERAL_ERRS | ||
| 3624 | - except FileIsEncryptedError as exc: | ||
| 3625 | - if options.output_mode in ('triage', 'unspecified'): | ||
| 3626 | - print('%-12s %s - File is encrypted' % ('!ERROR', filename)) | ||
| 3627 | - elif options.output_mode == 'json': | ||
| 3628 | - print_json(file=filename, type='error', | ||
| 3629 | - error=type(exc).__name__, message=str(exc)) | ||
| 3630 | - else: | ||
| 3631 | - log.exception('File %s is encrypted!' % (filename)) | ||
| 3632 | - return_code = RETURN_ENCRYPTED if return_code == 0 \ | ||
| 3633 | - else RETURN_SEVERAL_ERRS | ||
| 3634 | - # Here we do not close the vba_parser, because process_file may need it below. | ||
| 3635 | - | ||
| 3636 | - if options.output_mode == 'triage': | ||
| 3637 | - print('\n(Flags: OpX=OpenXML, XML=Word2003XML, FlX=FlatOPC XML, MHT=MHTML, TXT=Text, M=Macros, ' \ | ||
| 3638 | - 'A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, ' \ | ||
| 3639 | - 'B=Base64 strings, D=Dridex strings, V=VBA strings, ?=Unknown)\n') | ||
| 3640 | - | ||
| 3641 | - if count == 1 and options.output_mode == 'unspecified': | ||
| 3642 | - # if options -t, -d and -j were not specified and it's a single file, print details: | ||
| 3643 | - vba_parser.process_file(show_decoded_strings=options.show_decoded_strings, | ||
| 3644 | - display_code=options.display_code, | ||
| 3645 | - hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, | ||
| 3646 | - show_deobfuscated_code=options.show_deobfuscated_code, | ||
| 3647 | - deobfuscate=options.deobfuscate) | ||
| 3648 | - | ||
| 3649 | - if options.output_mode == 'json': | ||
| 3650 | - # print last json entry (a last one without a comma) and closing ] | ||
| 3651 | - print_json(type='MetaInformation', return_code=return_code, | ||
| 3652 | - n_processed=count, _json_is_last=True) | ||
| 3653 | - | ||
| 3654 | - except Exception as exc: | ||
| 3655 | - # some unexpected error, maybe some of the types caught in except clauses | ||
| 3656 | - # above were not sufficient. This is very bad, so log complete trace at exception level | ||
| 3657 | - # and do not care about output mode | ||
| 3658 | - log.exception('Unhandled exception in main: %s' % exc, exc_info=True) | ||
| 3659 | - return_code = RETURN_UNEXPECTED # even if there were others before -- this is more important | ||
| 3660 | - # TODO: print msg with URL to report issues (except in JSON mode) | ||
| 3661 | - | ||
| 3662 | - # done. exit | ||
| 3663 | - log.debug('will exit now with code %s' % return_code) | ||
| 3664 | - sys.exit(return_code) | 19 | +from oletools.olevba import * |
| 20 | +from oletools.olevba import __doc__, __version__ | ||
| 3665 | 21 | ||
| 3666 | if __name__ == '__main__': | 22 | if __name__ == '__main__': |
| 3667 | main() | 23 | main() |
| 3668 | 24 | ||
| 3669 | -# This was coded while listening to "Dust" from I Love You But I've Chosen Darkness |