Commit 8e1d03d7a18b0779ea73c1d4b13914c07220c37d

Authored by decalage2
1 parent a7309e59

olevba3: replaced by a redirection to olevba + deprecation warning (issue #106)

Showing 1 changed file with 6 additions and 3651 deletions
oletools/olevba3.py
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 -"""  
3 -olevba3.py  
4 2
5 -olevba is a script to parse OLE and OpenXML files such as MS Office documents  
6 -(e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate  
7 -and analyze malicious macros. 3 +# olevba3 is a stub that redirects to olevba.py, for backwards compatibility
8 4
9 -olevba3 is the version of olevba that runs on Python 3.x.  
10 -  
11 -Supported formats:  
12 -- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)  
13 -- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)  
14 -- PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm)  
15 -- Word/PowerPoint 2007+ XML (aka Flat OPC)  
16 -- Word 2003 XML (.xml)  
17 -- Word/Excel Single File Web Page / MHTML (.mht)  
18 -- Publisher (.pub)  
19 -- raises an error if run with files encrypted using MS Crypto API RC4  
20 -  
21 -Author: Philippe Lagadec - http://www.decalage.info  
22 -License: BSD, see source code or documentation  
23 -  
24 -olevba is part of the python-oletools package:  
25 -http://www.decalage.info/python/oletools  
26 -  
27 -olevba is based on source code from officeparser by John William Davison  
28 -https://github.com/unixfreak0037/officeparser  
29 -"""  
30 -  
31 -# === LICENSE ==================================================================  
32 -  
33 -# olevba is copyright (c) 2014-2018 Philippe Lagadec (http://www.decalage.info)  
34 -# All rights reserved.  
35 -#  
36 -# Redistribution and use in source and binary forms, with or without modification,  
37 -# are permitted provided that the following conditions are met:  
38 -#  
39 -# * Redistributions of source code must retain the above copyright notice, this  
40 -# list of conditions and the following disclaimer.  
41 -# * Redistributions in binary form must reproduce the above copyright notice,  
42 -# this list of conditions and the following disclaimer in the documentation  
43 -# and/or other materials provided with the distribution.  
44 -#  
45 -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND  
46 -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED  
47 -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE  
48 -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE  
49 -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL  
50 -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR  
51 -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER  
52 -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,  
53 -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE  
54 -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
55 -  
56 -  
57 -# olevba contains modified source code from the officeparser project, published  
58 -# under the following MIT License (MIT):  
59 -#  
60 -# officeparser is copyright (c) 2014 John William Davison  
61 -#  
62 -# Permission is hereby granted, free of charge, to any person obtaining a copy  
63 -# of this software and associated documentation files (the "Software"), to deal  
64 -# in the Software without restriction, including without limitation the rights  
65 -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell  
66 -# copies of the Software, and to permit persons to whom the Software is  
67 -# furnished to do so, subject to the following conditions:  
68 -#  
69 -# The above copyright notice and this permission notice shall be included in all  
70 -# copies or substantial portions of the Software.  
71 -#  
72 -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR  
73 -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,  
74 -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE  
75 -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER  
76 -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,  
77 -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE  
78 -# SOFTWARE.  
79 -  
80 -from __future__ import print_function  
81 -  
82 -#------------------------------------------------------------------------------  
83 -# CHANGELOG:  
84 -# 2014-08-05 v0.01 PL: - first version based on officeparser code  
85 -# 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser  
86 -# 2014-08-15 PL: - fixed incorrect value check in projecthelpfilepath Record  
87 -# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats  
88 -# and to find the VBA project root anywhere in the file  
89 -# 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL  
90 -# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API  
91 -# - added detect_vba_macros  
92 -# 2014-12-10 v0.06 PL: - hide first lines with VB attributes  
93 -# - detect auto-executable macros  
94 -# - ignore empty macros  
95 -# 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive  
96 -# 2014-12-15 v0.08 PL: - improved display for empty macros  
97 -# - added pattern extraction  
98 -# 2014-12-25 v0.09 PL: - added suspicious keywords detection  
99 -# 2014-12-27 v0.10 PL: - added OptionParser, main and process_file  
100 -# - uses xglob to scan several files with wildcards  
101 -# - option -r to recurse subdirectories  
102 -# - option -z to scan files in password-protected zips  
103 -# 2015-01-02 v0.11 PL: - improved filter_vba to detect colons  
104 -# 2015-01-03 v0.12 PL: - fixed detect_patterns to detect all patterns  
105 -# - process_file: improved display, shows container file  
106 -# - improved list of executable file extensions  
107 -# 2015-01-04 v0.13 PL: - added several suspicious keywords, improved display  
108 -# 2015-01-08 v0.14 PL: - added hex strings detection and decoding  
109 -# - fixed issue #2, decoding VBA stream names using  
110 -# specified codepage and unicode stream names  
111 -# 2015-01-11 v0.15 PL: - added new triage mode, options -t and -d  
112 -# 2015-01-16 v0.16 PL: - fix for issue #3 (exception when module name="text")  
113 -# - added several suspicious keywords  
114 -# - added option -i to analyze VBA source code directly  
115 -# 2015-01-17 v0.17 PL: - removed .com from the list of executable extensions  
116 -# - added scan_vba to run all detection algorithms  
117 -# - decoded hex strings are now also scanned + reversed  
118 -# 2015-01-23 v0.18 PL: - fixed issue #3, case-insensitive search in code_modules  
119 -# 2015-01-24 v0.19 PL: - improved the detection of IOCs obfuscated with hex  
120 -# strings and StrReverse  
121 -# 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded  
122 -# 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding  
123 -# - improved display, shows obfuscation name  
124 -# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename  
125 -# - added Base64 obfuscation decoding (contribution from  
126 -# @JamesHabben)  
127 -# 2015-02-03 v0.23 PL: - triage now uses VBA_Scanner results, shows Base64 and  
128 -# Dridex strings  
129 -# - exception handling in detect_base64_strings  
130 -# 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display  
131 -# - display exceptions with stack trace  
132 -# - added several suspicious keywords  
133 -# - improved Base64 detection and decoding  
134 -# - fixed triage mode not to scan attrib lines  
135 -# 2015-03-04 v0.25 PL: - added support for Word 2003 XML  
136 -# 2015-03-22 v0.26 PL: - added suspicious keywords for sandboxing and  
137 -# virtualisation detection  
138 -# 2015-05-06 v0.27 PL: - added support for MHTML files with VBA macros  
139 -# (issue #10 reported by Greg from SpamStopsHere)  
140 -# 2015-05-24 v0.28 PL: - improved support for MHTML files with modified header  
141 -# (issue #11 reported by Thomas Chopitea)  
142 -# 2015-05-26 v0.29 PL: - improved MSO files parsing, taking into account  
143 -# various data offsets (issue #12)  
144 -# - improved detection of MSO files, avoiding incorrect  
145 -# parsing errors (issue #7)  
146 -# 2015-05-29 v0.30 PL: - added suspicious keywords suggested by @ozhermit,  
147 -# Davy Douhine (issue #9), issue #13  
148 -# 2015-06-16 v0.31 PL: - added generic VBA expression deobfuscation (chr,asc,etc)  
149 -# 2015-06-19 PL: - added options -a, -c, --each, --attr  
150 -# 2015-06-21 v0.32 PL: - always display decoded strings which are printable  
151 -# - fix VBA_Scanner.scan to return raw strings, not repr()  
152 -# 2015-07-09 v0.40 PL: - removed usage of sys.stderr which causes issues  
153 -# 2015-07-12 PL: - added Hex function decoding to VBA Parser  
154 -# 2015-07-13 PL: - added Base64 function decoding to VBA Parser  
155 -# 2015-09-06 PL: - improved VBA_Parser, refactored the main functions  
156 -# 2015-09-13 PL: - moved main functions to a class VBA_Parser_CLI  
157 -# - fixed issue when analysis was done twice  
158 -# 2015-09-15 PL: - remove duplicate IOCs from results  
159 -# 2015-09-16 PL: - join long VBA lines ending with underscore before scan  
160 -# - disabled unused option --each  
161 -# 2015-09-22 v0.41 PL: - added new option --reveal  
162 -# - added suspicious strings for PowerShell.exe options  
163 -# 2015-10-09 v0.42 PL: - VBA_Parser: split each format into a separate method  
164 -# 2015-10-10 PL: - added support for text files with VBA source code  
165 -# 2015-11-17 PL: - fixed bug with --decode option  
166 -# 2015-12-16 PL: - fixed bug in main (no options input anymore)  
167 -# - improved logging, added -l option  
168 -# 2016-01-31 PL: - fixed issue #31 in VBA_Parser.open_mht  
169 -# - fixed issue #32 by monkeypatching email.feedparser  
170 -# 2016-02-07 PL: - KeyboardInterrupt is now raised properly  
171 -# 2016-02-20 v0.43 PL: - fixed issue #34 in the VBA parser and vba_chr  
172 -# 2016-02-29 PL: - added Workbook_Activate to suspicious keywords  
173 -# 2016-03-08 v0.44 PL: - added VBA Form strings extraction and analysis  
174 -# 2016-03-04 v0.45 CH: - added JSON output (by Christian Herdtweck)  
175 -# 2016-03-16 CH: - added option --no-deobfuscate (temporary)  
176 -# 2016-04-19 v0.46 PL: - new option --deobf instead of --no-deobfuscate  
177 -# - updated suspicious keywords  
178 -# 2016-05-04 v0.47 PL: - look for VBA code in any stream including orphans  
179 -# 2016-04-28 CH: - return an exit code depending on the results  
180 -# - improved error and exception handling  
181 -# - improved JSON output  
182 -# 2016-05-12 CH: - added support for PowerPoint 97-2003 files  
183 -# 2016-06-06 CH: - improved handling of unicode VBA module names  
184 -# 2016-06-07 CH: - added option --relaxed, stricter parsing by default  
185 -# 2016-06-12 v0.50 PL: - fixed small bugs in VBA parsing code  
186 -# 2016-07-01 PL: - fixed issue #58 with format() to support Python 2.6  
187 -# 2016-07-29 CH: - fixed several bugs including #73 (Mac Roman encoding)  
188 -# 2016-08-31 PL: - added autoexec keyword InkPicture_Painted  
189 -# - detect_autoexec now returns the exact keyword found  
190 -# 2016-09-05 PL: - added autoexec keywords for MS Publisher (.pub)  
191 -# 2016-09-06 PL: - fixed issue #20, is_zipfile on Python 2.6  
192 -# 2016-09-12 PL: - enabled packrat to improve pyparsing performance  
193 -# 2016-10-25 PL: - fixed raise and print statements for Python 3  
194 -# 2016-11-03 v0.51 PL: - added EnumDateFormats and EnumSystemLanguageGroupsW  
195 -# 2017-02-07 PL: - temporary fix for issue #132  
196 -# - added keywords for Mac-specific macros (issue #130)  
197 -# 2017-03-08 PL: - fixed absolute imports  
198 -# 2017-03-16 PL: - fixed issues #148 and #149 for option --reveal  
199 -# 2017-05-19 PL: - added enable_logging to fix issue #154  
200 -# 2017-05-31 c1fe: - PR #135 fixing issue #132 for some Mac files  
201 -# 2017-06-08 PL: - fixed issue #122 Chr() with negative numbers  
202 -# 2017-06-15 PL: - deobfuscation line by line to handle large files  
203 -# 2017-07-11 v0.52 PL: - raise exception instead of sys.exit (issue #180)  
204 -# 2017-11-20 PL: - fixed issue #219, do not close the file too early  
205 -# 2017-11-24 PL: - added keywords to detect self-modifying macros and  
206 -# attempts to disable macro security (issue #221)  
207 -# 2018-03-19 PL: - removed pyparsing from the thirdparty subfolder  
208 -# 2018-05-13 v0.53 PL: - added support for Word/PowerPoint 2007+ XML (FlatOPC)  
209 -# (issue #283)  
210 -# 2018-06-11 v0.53.1 MHW: - fixed #320: chr instead of unichr on python 3  
211 -# 2018-06-12 MHW: - fixed #322: import reduce from functools  
212 -# 2018-09-11 v0.54 PL: - olefile is now a dependency  
213 -# 2018-10-25 CH: - detect encryption and raise error if detected  
214 -  
215 -__version__ = '0.54dev4'  
216 -  
217 -#------------------------------------------------------------------------------  
218 -# TODO:  
219 -# + setup logging (common with other oletools)  
220 -# + add xor bruteforcing like bbharvest  
221 -# + options -a and -c should imply -d  
222 -  
223 -# TODO later:  
224 -# + performance improvement: instead of searching each keyword separately,  
225 -# first split vba code into a list of words (per line), then check each  
226 -# word against a dict. (or put vba words into a set/dict?)  
227 -# + for regex, maybe combine them into a single re with named groups?  
228 -# + add Yara support, include sample rules? plugins like balbuzard?  
229 -# + add balbuzard support  
230 -# + output to file (replace print by file.write, sys.stdout by default)  
231 -# + look for VBA in embedded documents (e.g. Excel in Word)  
232 -# + support SRP streams (see Lenny's article + links and sample)  
233 -# - python 3.x support  
234 -# - check VBA macros in Visio, Access, Project, etc  
235 -# - extract_macros: convert to a class, split long function into smaller methods  
236 -# - extract_macros: read bytes from stream file objects instead of strings  
237 -# - extract_macros: use combined struct.unpack instead of many calls  
238 -# - all except clauses should target specific exceptions  
239 -  
240 -#------------------------------------------------------------------------------  
241 -# REFERENCES:  
242 -# - [MS-OVBA]: Microsoft Office VBA File Format Structure  
243 -# http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx  
244 -# - officeparser: https://github.com/unixfreak0037/officeparser  
245 -  
246 -  
247 -#--- IMPORTS ------------------------------------------------------------------  
248 -  
249 -import sys  
250 -import os  
251 -import logging  
252 -import struct  
253 -from io import BytesIO  
254 -import math  
255 -import zipfile  
256 -import re  
257 -import optparse  
258 -import binascii  
259 -import base64  
260 -import zlib  
261 -import email # for MHTML parsing  
262 -import string # for printable  
263 -import json # for json output mode (argument --json)  
264 -  
265 -# import lxml or ElementTree for XML parsing:  
266 -try:  
267 - # lxml: best performance for XML processing  
268 - import lxml.etree as ET  
269 -except ImportError:  
270 - try:  
271 - # Python 2.5+: batteries included  
272 - import xml.etree.cElementTree as ET  
273 - except ImportError:  
274 - try:  
275 - # Python <2.5: standalone ElementTree install  
276 - import elementtree.cElementTree as ET  
277 - except ImportError:  
278 - raise ImportError("lxml or ElementTree are not installed, " \  
279 - + "see http://codespeak.net/lxml " \  
280 - + "or http://effbot.org/zone/element-index.htm")  
281 -  
282 -import colorclass  
283 -  
284 -# On Windows, colorclass needs to be enabled:  
285 -if os.name == 'nt':  
286 - colorclass.Windows.enable(auto_colors=True) 5 +import sys, os, warnings
287 6
  7 +warnings.warn('olevba3 is deprecated, olevba should be used instead.', DeprecationWarning)
288 8
289 # IMPORTANT: it should be possible to run oletools directly as scripts 9 # IMPORTANT: it should be possible to run oletools directly as scripts
290 # in any directory without installing them with pip or setup.py. 10 # in any directory without installing them with pip or setup.py.
@@ -292,3378 +12,13 @@ if os.name == 'nt': @@ -292,3378 +12,13 @@ if os.name == 'nt':
292 # And to enable Python 2+3 compatibility, we need to use absolute imports, 12 # And to enable Python 2+3 compatibility, we need to use absolute imports,
293 # so we add the oletools parent folder to sys.path (absolute+normalized path): 13 # so we add the oletools parent folder to sys.path (absolute+normalized path):
294 _thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) 14 _thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__)))
295 -# print('_thismodule_dir = %r' % _thismodule_dir)  
296 _parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) 15 _parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..'))
297 -# print('_parent_dir = %r' % _thirdparty_dir)  
298 -if not _parent_dir in sys.path: 16 +if _parent_dir not in sys.path:
299 sys.path.insert(0, _parent_dir) 17 sys.path.insert(0, _parent_dir)
300 18
301 -import olefile  
302 -from oletools.thirdparty.prettytable import prettytable  
303 -from oletools.thirdparty.xglob import xglob, PathNotFoundException  
304 -from pyparsing import \  
305 - CaselessKeyword, CaselessLiteral, Combine, Forward, Literal, \  
306 - Optional, QuotedString,Regex, Suppress, Word, WordStart, \  
307 - alphanums, alphas, hexnums,nums, opAssoc, srange, \  
308 - infixNotation, ParserElement  
309 -import oletools.ppt_parser as ppt_parser  
310 -from oletools import rtfobj  
311 -from oletools import oleid  
312 -from oletools.common.errors import FileIsEncryptedError  
313 -  
314 -# monkeypatch email to fix issue #32:  
315 -# allow header lines without ":"  
316 -import email.feedparser  
317 -email.feedparser.headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:?|[\t ])')  
318 -  
319 -# === PYTHON 2+3 SUPPORT ======================================================  
320 -  
321 -if sys.version_info[0] <= 2:  
322 - # Python 2.x  
323 - if sys.version_info[1] <= 6:  
324 - # Python 2.6  
325 - # use is_zipfile backported from Python 2.7:  
326 - from thirdparty.zipfile27 import is_zipfile  
327 - else:  
328 - # Python 2.7  
329 - from zipfile import is_zipfile  
330 -else:  
331 - # Python 3.x+  
332 - from zipfile import is_zipfile  
333 - # xrange is now called range:  
334 - xrange = range  
335 - # unichr does not exist anymore, only chr:  
336 - unichr = chr  
337 - from functools import reduce  
338 -  
339 -  
340 -# === PYTHON 3.0 - 3.4 SUPPORT ======================================================  
341 -  
342 -# From https://gist.github.com/ynkdir/867347/c5e188a4886bc2dd71876c7e069a7b00b6c16c61  
343 -  
344 -if sys.version_info >= (3, 0) and sys.version_info < (3, 5):  
345 - import codecs  
346 -  
347 - _backslashreplace_errors = codecs.lookup_error("backslashreplace")  
348 -  
349 - def backslashreplace_errors(exc):  
350 - if isinstance(exc, UnicodeDecodeError):  
351 - u = "".join("\\x{0:02x}".format(c) for c in exc.object[exc.start:exc.end])  
352 - return (u, exc.end)  
353 - return _backslashreplace_errors(exc)  
354 -  
355 - codecs.register_error("backslashreplace", backslashreplace_errors)  
356 -  
357 -  
358 -# === LOGGING =================================================================  
359 -  
360 -class NullHandler(logging.Handler):  
361 - """  
362 - Log Handler without output, to avoid printing messages if logging is not  
363 - configured by the main application.  
364 - Python 2.7 has logging.NullHandler, but this is necessary for 2.6:  
365 - see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library  
366 - """  
367 - def emit(self, record):  
368 - pass  
369 -  
370 -def get_logger(name, level=logging.CRITICAL+1):  
371 - """  
372 - Create a suitable logger object for this module.  
373 - The goal is not to change settings of the root logger, to avoid getting  
374 - other modules' logs on the screen.  
375 - If a logger exists with same name, reuse it. (Else it would have duplicate  
376 - handlers and messages would be doubled.)  
377 - The level is set to CRITICAL+1 by default, to avoid any logging.  
378 - """  
379 - # First, test if there is already a logger with the same name, else it  
380 - # will generate duplicate messages (due to duplicate handlers):  
381 - if name in logging.Logger.manager.loggerDict:  
382 - #NOTE: another less intrusive but more "hackish" solution would be to  
383 - # use getLogger then test if its effective level is not default.  
384 - logger = logging.getLogger(name)  
385 - # make sure level is OK:  
386 - logger.setLevel(level)  
387 - return logger  
388 - # get a new logger:  
389 - logger = logging.getLogger(name)  
390 - # only add a NullHandler for this logger, it is up to the application  
391 - # to configure its own logging:  
392 - logger.addHandler(NullHandler())  
393 - logger.setLevel(level)  
394 - return logger  
395 -  
396 -# a global logger object used for debugging:  
397 -log = get_logger('olevba')  
398 -  
399 -  
400 -def enable_logging():  
401 - """  
402 - Enable logging for this module (disabled by default).  
403 - This will set the module-specific logger level to NOTSET, which  
404 - means the main application controls the actual logging level.  
405 - """  
406 - log.setLevel(logging.NOTSET)  
407 - # Also enable logging in the ppt_parser module:  
408 - ppt_parser.enable_logging()  
409 -  
410 -  
411 -  
412 -#=== EXCEPTIONS ==============================================================  
413 -  
414 -class OlevbaBaseException(Exception):  
415 - """ Base class for exceptions produced here for simpler except clauses """  
416 - def __init__(self, msg, filename=None, orig_exc=None, **kwargs):  
417 - if orig_exc:  
418 - super(OlevbaBaseException, self).__init__(msg +  
419 - ' ({0})'.format(orig_exc),  
420 - **kwargs)  
421 - else:  
422 - super(OlevbaBaseException, self).__init__(msg, **kwargs)  
423 - self.msg = msg  
424 - self.filename = filename  
425 - self.orig_exc = orig_exc  
426 -  
427 -  
428 -class FileOpenError(OlevbaBaseException):  
429 - """ raised by VBA_Parser constructor if all open_... attempts failed  
430 -  
431 - probably means the file type is not supported  
432 - """  
433 -  
434 - def __init__(self, filename, orig_exc=None):  
435 - super(FileOpenError, self).__init__(  
436 - 'Failed to open file %s' % filename, filename, orig_exc)  
437 -  
438 -  
439 -class ProcessingError(OlevbaBaseException):  
440 - """ raised by VBA_Parser.process_file* functions """  
441 -  
442 - def __init__(self, filename, orig_exc):  
443 - super(ProcessingError, self).__init__(  
444 - 'Error processing file %s' % filename, filename, orig_exc)  
445 -  
446 -  
447 -class MsoExtractionError(RuntimeError, OlevbaBaseException):  
448 - """ raised by mso_file_extract if parsing MSO/ActiveMIME data failed """  
449 -  
450 - def __init__(self, msg):  
451 - MsoExtractionError.__init__(self, msg)  
452 - OlevbaBaseException.__init__(self, msg)  
453 -  
454 -  
455 -class SubstreamOpenError(FileOpenError):  
456 - """ special kind of FileOpenError: file is a substream of original file """  
457 -  
458 - def __init__(self, filename, subfilename, orig_exc=None):  
459 - super(SubstreamOpenError, self).__init__(  
460 - str(filename) + '/' + str(subfilename), orig_exc)  
461 - self.filename = filename # overwrite setting in OlevbaBaseException  
462 - self.subfilename = subfilename  
463 -  
464 -  
465 -class UnexpectedDataError(OlevbaBaseException):  
466 - """ raised when parsing is strict (=not relaxed) and data is unexpected """  
467 -  
468 - def __init__(self, stream_path, variable, expected, value):  
469 - if isinstance(expected, int):  
470 - es = '{0:04X}'.format(expected)  
471 - elif isinstance(expected, tuple):  
472 - es = ','.join('{0:04X}'.format(e) for e in expected)  
473 - es = '({0})'.format(es)  
474 - else:  
475 - raise ValueError('Unknown type encountered: {0}'.format(type(expected)))  
476 - super(UnexpectedDataError, self).__init__(  
477 - 'Unexpected value in {0} for variable {1}: '  
478 - 'expected {2} but found {3:04X}!'  
479 - .format(stream_path, variable, es, value))  
480 - self.stream_path = stream_path  
481 - self.variable = variable  
482 - self.expected = expected  
483 - self.value = value  
484 -  
485 -#--- CONSTANTS ----------------------------------------------------------------  
486 -  
487 -# return codes  
488 -RETURN_OK = 0  
489 -RETURN_WARNINGS = 1 # (reserved, not used yet)  
490 -RETURN_WRONG_ARGS = 2 # (fixed, built into optparse)  
491 -RETURN_FILE_NOT_FOUND = 3  
492 -RETURN_XGLOB_ERR = 4  
493 -RETURN_OPEN_ERROR = 5  
494 -RETURN_PARSE_ERROR = 6  
495 -RETURN_SEVERAL_ERRS = 7  
496 -RETURN_UNEXPECTED = 8  
497 -RETURN_ENCRYPTED = 9  
498 -  
499 -# MAC codepages (from http://stackoverflow.com/questions/1592925/decoding-mac-os-text-in-python)  
500 -MAC_CODEPAGES = {  
501 - 10000: 'mac-roman',  
502 - 10001: 'shiftjis', # not found: 'mac-shift-jis',  
503 - 10003: 'ascii', # nothing appropriate found: 'mac-hangul',  
504 - 10008: 'gb2321', # not found: 'mac-gb2312',  
505 - 10002: 'big5', # not found: 'mac-big5',  
506 - 10005: 'hebrew', # not found: 'mac-hebrew',  
507 - 10004: 'mac-arabic',  
508 - 10006: 'mac-greek',  
509 - 10081: 'mac-turkish',  
510 - 10021: 'thai', # not found: mac-thai',  
511 - 10029: 'maccentraleurope', # not found: 'mac-east europe',  
512 - 10007: 'ascii', # nothing appropriate found: 'mac-russian',  
513 -}  
514 -  
515 -# URL and message to report issues:  
516 -URL_OLEVBA_ISSUES = 'https://github.com/decalage2/oletools/issues'  
517 -MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES  
518 -  
519 -# Container types:  
520 -TYPE_OLE = 'OLE'  
521 -TYPE_OpenXML = 'OpenXML'  
522 -TYPE_FlatOPC_XML = 'FlatOPC_XML'  
523 -TYPE_Word2003_XML = 'Word2003_XML'  
524 -TYPE_MHTML = 'MHTML'  
525 -TYPE_TEXT = 'Text'  
526 -TYPE_PPT = 'PPT'  
527 -  
528 -# short tag to display file types in triage mode:  
529 -TYPE2TAG = {  
530 - TYPE_OLE: 'OLE:',  
531 - TYPE_OpenXML: 'OpX:',  
532 - TYPE_FlatOPC_XML: 'FlX:',  
533 - TYPE_Word2003_XML: 'XML:',  
534 - TYPE_MHTML: 'MHT:',  
535 - TYPE_TEXT: 'TXT:',  
536 - TYPE_PPT: 'PPT',  
537 -}  
538 -  
539 -  
540 -# MSO files ActiveMime header magic  
541 -MSO_ACTIVEMIME_HEADER = b'ActiveMime'  
542 -  
543 -MODULE_EXTENSION = "bas"  
544 -CLASS_EXTENSION = "cls"  
545 -FORM_EXTENSION = "frm"  
546 -  
547 -# Namespaces and tags for Word2003 XML parsing:  
548 -NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}'  
549 -# the tag <w:binData w:name="editdata.mso"> contains the VBA macro code:  
550 -TAG_BINDATA = NS_W + 'binData'  
551 -ATTR_NAME = NS_W + 'name'  
552 -  
553 -# Namespaces and tags for Word/PowerPoint 2007+ XML parsing:  
554 -# root: <pkg:package xmlns:pkg="http://schemas.microsoft.com/office/2006/xmlPackage">  
555 -NS_XMLPACKAGE = '{http://schemas.microsoft.com/office/2006/xmlPackage}'  
556 -TAG_PACKAGE = NS_XMLPACKAGE + 'package'  
557 -# the tag <pkg:part> includes <pkg:binaryData> that contains the VBA macro code in Base64:  
558 -# <pkg:part pkg:name="/word/vbaProject.bin" pkg:contentType="application/vnd.ms-office.vbaProject"><pkg:binaryData>  
559 -TAG_PKGPART = NS_XMLPACKAGE + 'part'  
560 -ATTR_PKG_NAME = NS_XMLPACKAGE + 'name'  
561 -ATTR_PKG_CONTENTTYPE = NS_XMLPACKAGE + 'contentType'  
562 -CTYPE_VBAPROJECT = "application/vnd.ms-office.vbaProject"  
563 -TAG_PKGBINDATA = NS_XMLPACKAGE + 'binaryData'  
564 -  
565 -# Keywords to detect auto-executable macros  
566 -AUTOEXEC_KEYWORDS = {  
567 - # MS Word:  
568 - 'Runs when the Word document is opened':  
569 - ('AutoExec', 'AutoOpen', 'DocumentOpen'),  
570 - 'Runs when the Word document is closed':  
571 - ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose'),  
572 - 'Runs when the Word document is modified':  
573 - ('DocumentChange',),  
574 - 'Runs when a new Word document is created':  
575 - ('AutoNew', 'Document_New', 'NewDocument'),  
576 -  
577 - # MS Word and Publisher:  
578 - 'Runs when the Word or Publisher document is opened':  
579 - ('Document_Open',),  
580 - 'Runs when the Publisher document is closed':  
581 - ('Document_BeforeClose',),  
582 -  
583 - # MS Excel:  
584 - 'Runs when the Excel Workbook is opened':  
585 - ('Auto_Open', 'Workbook_Open', 'Workbook_Activate'),  
586 - 'Runs when the Excel Workbook is closed':  
587 - ('Auto_Close', 'Workbook_Close'),  
588 -  
589 - # any MS Office application:  
590 - 'Runs when the file is opened (using InkPicture ActiveX object)':  
591 - # ref:https://twitter.com/joe4security/status/770691099988025345  
592 - (r'\w+_Painted',),  
593 - 'Runs when the file is opened and ActiveX objects trigger events':  
594 - (r'\w+_(?:GotFocus|LostFocus|MouseHover)',),  
595 -}  
596 -  
597 -# Suspicious Keywords that may be used by malware  
598 -# See VBA language reference: http://msdn.microsoft.com/en-us/library/office/jj692818%28v=office.15%29.aspx  
599 -SUSPICIOUS_KEYWORDS = {  
600 - #TODO: use regex to support variable whitespaces  
601 - 'May read system environment variables':  
602 - ('Environ',),  
603 - 'May open a file':  
604 - ('Open',),  
605 - 'May write to a file (if combined with Open)':  
606 - #TODO: regex to find Open+Write on same line  
607 - ('Write', 'Put', 'Output', 'Print #'),  
608 - 'May read or write a binary file (if combined with Open)':  
609 - #TODO: regex to find Open+Binary on same line  
610 - ('Binary',),  
611 - 'May copy a file':  
612 - ('FileCopy', 'CopyFile'),  
613 - #FileCopy: http://msdn.microsoft.com/en-us/library/office/gg264390%28v=office.15%29.aspx  
614 - #CopyFile: http://msdn.microsoft.com/en-us/library/office/gg264089%28v=office.15%29.aspx  
615 - 'May delete a file':  
616 - ('Kill',),  
617 - 'May create a text file':  
618 - ('CreateTextFile', 'ADODB.Stream', 'WriteText', 'SaveToFile'),  
619 - #CreateTextFile: http://msdn.microsoft.com/en-us/library/office/gg264617%28v=office.15%29.aspx  
620 - #ADODB.Stream sample: http://pastebin.com/Z4TMyuq6  
621 - 'May run an executable file or a system command':  
622 - ('Shell', 'vbNormal', 'vbNormalFocus', 'vbHide', 'vbMinimizedFocus', 'vbMaximizedFocus', 'vbNormalNoFocus',  
623 - 'vbMinimizedNoFocus', 'WScript.Shell', 'Run', 'ShellExecute'),  
624 - # MacScript: see https://msdn.microsoft.com/en-us/library/office/gg264812.aspx  
625 - 'May run an executable file or a system command on a Mac':  
626 - ('MacScript',),  
627 - 'May run an executable file or a system command on a Mac (if combined with libc.dylib)':  
628 - ('system', 'popen', r'exec[lv][ep]?'),  
629 - #Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx  
630 - #WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6  
631 - 'May run PowerShell commands':  
632 - #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/  
633 - #also: https://bitbucket.org/decalage/oletools/issues/14/olevba-library-update-ioc  
634 - # ref: https://blog.netspi.com/15-ways-to-bypass-the-powershell-execution-policy/  
635 - # TODO: add support for keywords starting with a non-alpha character, such as "-noexit"  
636 - # TODO: '-command', '-EncodedCommand', '-scriptblock'  
637 - ('PowerShell', 'noexit', 'ExecutionPolicy', 'noprofile', 'command', 'EncodedCommand',  
638 - 'invoke-command', 'scriptblock', 'Invoke-Expression', 'AuthorizationManager'),  
639 - 'May run an executable file or a system command using PowerShell':  
640 - ('Start-Process',),  
641 - 'May hide the application':  
642 - ('Application.Visible', 'ShowWindow', 'SW_HIDE'),  
643 - 'May create a directory':  
644 - ('MkDir',),  
645 - 'May save the current workbook':  
646 - ('ActiveWorkbook.SaveAs',),  
647 - 'May change which directory contains files to open at startup':  
648 - #TODO: confirm the actual effect  
649 - ('Application.AltStartupPath',),  
650 - 'May create an OLE object':  
651 - ('CreateObject',),  
652 - 'May create an OLE object using PowerShell':  
653 - ('New-Object',),  
654 - 'May run an application (if combined with CreateObject)':  
655 - ('Shell.Application',),  
656 - 'May enumerate application windows (if combined with Shell.Application object)':  
657 - ('Windows', 'FindWindow'),  
658 - 'May run code from a DLL':  
659 - #TODO: regex to find declare+lib on same line - see mraptor  
660 - ('Lib',),  
661 - 'May run code from a library on a Mac':  
662 - #TODO: regex to find declare+lib on same line - see mraptor  
663 - ('libc.dylib', 'dylib'),  
664 - 'May inject code into another process':  
665 - ('CreateThread', 'VirtualAlloc', # (issue #9) suggested by Davy Douhine - used by MSF payload  
666 - 'VirtualAllocEx', 'RtlMoveMemory',  
667 - ),  
668 - 'May run a shellcode in memory':  
669 - ('EnumSystemLanguageGroupsW?', # Used by Hancitor in Oct 2016  
670 - 'EnumDateFormats(?:W|(?:Ex){1,2})?'), # see https://msdn.microsoft.com/en-us/library/windows/desktop/dd317810(v=vs.85).aspx  
671 - 'May download files from the Internet':  
672 - #TODO: regex to find urlmon+URLDownloadToFileA on same line  
673 - ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP',  
674 - 'MSXML2.ServerXMLHTTP', # suggested in issue #13  
675 - 'User-Agent', # sample from @ozhermit: http://pastebin.com/MPc3iV6z  
676 - ),  
677 - 'May download files from the Internet using PowerShell':  
678 - #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/  
679 - ('Net.WebClient', 'DownloadFile', 'DownloadString'),  
680 - 'May control another application by simulating user keystrokes':  
681 - ('SendKeys', 'AppActivate'),  
682 - #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx  
683 - 'May attempt to obfuscate malicious function calls':  
684 - ('CallByName',),  
685 - #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx  
686 - 'May attempt to obfuscate specific strings (use option --deobf to deobfuscate)':  
687 - #TODO: regex to find several Chr*, not just one  
688 - ('Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor'),  
689 - #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx  
690 - 'May read or write registry keys':  
691 - #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/  
692 - ('RegOpenKeyExA', 'RegOpenKeyEx', 'RegCloseKey'),  
693 - 'May read registry keys':  
694 - #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/  
695 - ('RegQueryValueExA', 'RegQueryValueEx',  
696 - 'RegRead', #with Wscript.Shell  
697 - ),  
698 - 'May detect virtualization':  
699 - # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/  
700 - (r'SYSTEM\ControlSet001\Services\Disk\Enum', 'VIRTUAL', 'VMWARE', 'VBOX'),  
701 - 'May detect Anubis Sandbox':  
702 - # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/  
703 - # NOTES: this sample also checks App.EXEName but that seems to be a bug, it works in VB6 but not in VBA  
704 - # ref: http://www.syssec-project.eu/m/page-media/3/disarm-raid11.pdf  
705 - ('GetVolumeInformationA', 'GetVolumeInformation', # with kernel32.dll  
706 - '1824245000', r'HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows NT\CurrentVersion\ProductId',  
707 - '76487-337-8429955-22614', 'andy', 'sample', r'C:\exec\exec.exe', 'popupkiller'  
708 - ),  
709 - 'May detect Sandboxie':  
710 - # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/  
711 - # ref: http://www.cplusplus.com/forum/windows/96874/  
712 - ('SbieDll.dll', 'SandboxieControlWndClass'),  
713 - 'May detect Sunbelt Sandbox':  
714 - # ref: http://www.cplusplus.com/forum/windows/96874/  
715 - (r'C:\file.exe',),  
716 - 'May detect Norman Sandbox':  
717 - # ref: http://www.cplusplus.com/forum/windows/96874/  
718 - ('currentuser',),  
719 - 'May detect CW Sandbox':  
720 - # ref: http://www.cplusplus.com/forum/windows/96874/  
721 - ('Schmidti',),  
722 - 'May detect WinJail Sandbox':  
723 - # ref: http://www.cplusplus.com/forum/windows/96874/  
724 - ('Afx:400000:0',),  
725 - 'May attempt to disable VBA macro security and Protected View':  
726 - # ref: http://blog.trendmicro.com/trendlabs-security-intelligence/qkg-filecoder-self-replicating-document-encrypting-ransomware/  
727 - # ref: https://thehackernews.com/2017/11/ms-office-macro-malware.html  
728 - ('AccessVBOM', 'VBAWarnings', 'ProtectedView', 'DisableAttachementsInPV', 'DisableInternetFilesInPV',  
729 - 'DisableUnsafeLocationsInPV', 'blockcontentexecutionfrominternet'),  
730 - 'May attempt to modify the VBA code (self-modification)':  
731 - ('VBProject', 'VBComponents', 'CodeModule', 'AddFromString'),  
732 -}  
733 -  
# Suspicious Keywords to be searched for directly as strings, without regex
# (maps the description of a suspicious behaviour to a tuple of literal strings)
SUSPICIOUS_KEYWORDS_NOREGEX = {
    'May use special characters such as backspace to obfuscate code when printed on the console':
        # NOTE: this is deliberately NOT a raw string: '\b' is the backspace character (0x08)
        ('\b',),
}
739 -  
# Regular Expression for a URL:
# http://en.wikipedia.org/wiki/Uniform_resource_locator
# http://www.w3.org/Addressing/URL/uri-spec.html
#TODO: also support username:password@server
#TODO: other protocols (file, gopher, wais, ...?)
SCHEME = r'\b(?:http|ftp)s?'
# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})'
DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + ')'
#TODO: IPv6 - see https://www.debuggex.com/
# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. [db8:0cec::99:123a]
NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'
IPv4 = r'(?:' + NUMBER_0_255 + r'\.){3}' + NUMBER_0_255
# IPv4 must come before the DNS name because it is more specific
SERVER = r'(?:' + IPv4 + '|' + DNS_NAME + ')'
PORT = r'(?:\:[0-9]{1,5})?'
SERVER_PORT = SERVER + PORT
URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?'  # [^\.\,\)\(\s"]
URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH
re_url = re.compile(URL_RE)


# Patterns to be extracted (IP addresses, URLs, etc)
# From patterns.py in balbuzard
# Each entry is a tuple (pattern name, compiled regex).
RE_PATTERNS = (
    ('URL', re.compile(URL_RE)),
    ('IPv4 address', re.compile(IPv4)),
    # TODO: add IPv6
    # NOTE: the trailing word boundary must be written as a raw string (r'\b'):
    # a plain '\b' is the backspace character, which made this pattern
    # unable to match ordinary e-mail addresses.
    ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@' + SERVER + r'\b')),
    # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')),
    # Executable file name with known extensions (except .com which is present in many URLs, and .application):
    ("Executable file name", re.compile(
        r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b")),
    # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
    # TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types
    # TODO: add win & unix file paths
    #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')),
)
778 -  
# Detects strings encoded in hexadecimal: four or more pairs of hex digits
re_hex_string = re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')

# Detects strings encoded in base64.
# Older, looser version kept for reference:
#re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?"')
# The version below comes from balbuzard and gives fewer false positives.
# BASE64_RE is the plain form, without double quotes (it is also used further
# down in quoted_base64_string); re_base64_string adds the surrounding quotes.
BASE64_RE = r'(?:[A-Za-z0-9+/]{4}){1,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)?'
re_base64_string = re.compile('"%s"' % BASE64_RE)
# Common strings which match the base64 regex but are not base64 (all lowercase):
BASE64_WHITELIST = {
    'thisdocument',
    'thisworkbook',
    'test',
    'temp',
    'http',
    'open',
    'exit',
}

# Detects strings encoded with a specific Dridex algorithm
# (see https://github.com/JamesHabben/MalwareStuff)
re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"')
# Used to check that a candidate is not just a hex string
# (it matches any letter outside of the hexadecimal range):
re_nothex_check = re.compile(r'[G-Zg-z]')

# Extracts printable strings (at least 5 chars) from VBA Forms.
# The pattern must be bytes for Python 3.
re_printable_string = re.compile(b'[\\t\\r\\n\\x20-\\xFF]{5,}')
800 -  
801 -  
802 -# === PARTIAL VBA GRAMMAR ====================================================  
803 -  
804 -# REFERENCES:  
805 -# - [MS-VBAL]: VBA Language Specification  
806 -# https://msdn.microsoft.com/en-us/library/dd361851.aspx  
807 -# - pyparsing: http://pyparsing.wikispaces.com/  
808 -  
809 -# TODO: set whitespaces according to VBA  
810 -# TODO: merge extended lines before parsing  
811 -  
# Enable PackRat for better performance:
# (see https://pythonhosted.org/pyparsing/pyparsing.ParserElement-class.html#enablePackrat)
# This is a class-level call made once at import time, before the grammar below is built.
ParserElement.enablePackrat()

# VBA identifier chars (from MS-VBAL 3.3.5)
# Passed to WordStart() below so that numbers and keywords glued to an
# identifier (e.g. "VBT1") are not matched.
vba_identifier_chars = alphanums + '_'
818 -  
class VbaExpressionString(str):
    """
    Marker type: a plain str subclass which adds no behaviour of its own.

    It only serves to tell apart ordinary strings from strings which were
    obfuscated with VBA expressions (Chr, StrReverse, etc).
    Every parse action of the VBA expression grammar is expected to wrap the
    string it returns into a VbaExpressionString, so that
    isinstance(s, VbaExpressionString) is True only for strings produced by
    a VBA expression (see detect_vba_strings).
    """
    # TODO: use Unicode everywhere instead of str
830 -  
831 -  
832 -# --- NUMBER TOKENS ----------------------------------------------------------  
833 -  
834 -# 3.3.2 Number Tokens  
835 -# INTEGER = integer-literal ["%" / "&" / "^"]  
836 -# integer-literal = decimal-literal / octal-literal / hex-literal  
837 -# decimal-literal = 1*decimal-digit  
838 -# octal-literal = "&" [%x004F / %x006F] 1*octal-digit  
839 -# ; & or &o or &O  
840 -# hex-literal = "&" (%x0048 / %x0068) 1*hex-digit  
841 -# ; &h or &H  
842 -# octal-digit = "0" / "1" / "2" / "3" / "4" / "5" / "6" / "7"  
843 -# decimal-digit = octal-digit / "8" / "9"  
844 -# hex-digit = decimal-digit / %x0041-0046 / %x0061-0066 ;A-F / a-f  
845 -  
846 -# NOTE: here Combine() is required to avoid spaces between elements  
847 -# NOTE: here WordStart is necessary to avoid matching a number preceded by  
848 -# letters or underscore (e.g. "VBT1" or "ABC_34"), when using scanString  
# Decimal literal: optional leading '-', digits, then an optional type suffix
# ('%', '&' or '^') which is matched but suppressed from the result.
decimal_literal = Combine(Optional('-') + WordStart(vba_identifier_chars) + Word(nums)
                          + Suppress(Optional(Word('%&^', exact=1))))
# parse action: convert the matched text to a Python int
decimal_literal.setParseAction(lambda t: int(t[0]))

# Octal literal: '&' optionally followed by 'o'/'O' (suppressed), octal digits,
# then an optional suppressed type suffix.
octal_literal = Combine(Suppress(Literal('&') + Optional((CaselessLiteral('o')))) + Word(srange('[0-7]'))
                        + Suppress(Optional(Word('%&^', exact=1))))
# parse action: convert the octal digits to a Python int
octal_literal.setParseAction(lambda t: int(t[0], base=8))

# Hex literal: '&h'/'&H' (suppressed), hex digits, then an optional suppressed type suffix.
hex_literal = Combine(Suppress(CaselessLiteral('&h')) + Word(srange('[0-9a-fA-F]'))
                      + Suppress(Optional(Word('%&^', exact=1))))
# parse action: convert the hex digits to a Python int
hex_literal.setParseAction(lambda t: int(t[0], base=16))

# Any integer literal; the alternatives are tried in the order listed.
integer = decimal_literal | octal_literal | hex_literal
862 -  
863 -  
864 -# --- QUOTED STRINGS ---------------------------------------------------------  
865 -  
866 -# 3.3.4 String Tokens  
867 -# STRING = double-quote *string-character (double-quote / line-continuation / LINE-END)  
868 -# double-quote = %x0022 ; "  
869 -# string-character = NO-LINE-CONTINUATION ((double-quote double-quote) termination-character)  
870 -  
# VBA string literal: delimited by double quotes, with "" as the escaped quote
quoted_string = QuotedString('"', escQuote='""')
# parse action: return the content of the literal as a plain str
# (not a VbaExpressionString, since a literal is not an obfuscated expression)
quoted_string.setParseAction(lambda t: str(t[0]))


#--- VBA Expressions ---------------------------------------------------------

# See MS-VBAL 5.6 Expressions

# need to pre-declare using Forward() because it is recursive
# VBA string expression and integer expression
# (both are defined further down with "<<=", once their items are declared)
vba_expr_str = Forward()
vba_expr_int = Forward()
883 -  
884 -# --- CHR --------------------------------------------------------------------  
885 -  
886 -# MS-VBAL 6.1.2.11.1.4 Chr / Chr$  
887 -# Function Chr(CharCode As Long) As Variant  
888 -# Function Chr$(CharCode As Long) As String  
889 -# Parameter Description  
890 -# CharCode Long whose value is a code point.  
891 -# Returns a String data value consisting of a single character containing the character whose code  
892 -# point is the data value of the argument.  
893 -# - If the argument is not in the range 0 to 255, Error Number 5 ("Invalid procedure call or  
894 -# argument") is raised unless the implementation supports a character set with a larger code point  
895 -# range.  
896 -# - If the argument value is in the range of 0 to 127, it is interpreted as a 7-bit ASCII code point.  
897 -# - If the argument value is in the range of 128 to 255, the code point interpretation of the value is  
898 -# implementation defined.  
899 -# - Chr$ has the same runtime semantics as Chr, however the declared type of its function result is  
900 -# String rather than Variant.  
901 -  
902 -# 6.1.2.11.1.5 ChrB / ChrB$  
903 -# Function ChrB(CharCode As Long) As Variant  
904 -# Function ChrB$(CharCode As Long) As String  
905 -# CharCode Long whose value is a code point.  
906 -# Returns a String data value consisting of a single byte character whose code point value is the  
907 -# data value of the argument.  
908 -# - If the argument is not in the range 0 to 255, Error Number 6 ("Overflow") is raised.  
909 -# - ChrB$ has the same runtime semantics as ChrB however the declared type of its function result  
910 -# is String rather than Variant.  
911 -# - Note: the ChrB function is used with byte data contained in a String. Instead of returning a  
912 -# character, which may be one or two bytes, ChrB always returns a single byte. The ChrW function  
913 -# returns a String containing the Unicode character except on platforms where Unicode is not  
914 -# supported, in which case, the behavior is identical to the Chr function.  
915 -  
916 -# 6.1.2.11.1.6 ChrW/ ChrW$  
917 -# Function ChrW(CharCode As Long) As Variant  
918 -# Function ChrW$(CharCode As Long) As String  
919 -# CharCode Long whose value is a code point.  
920 -# Returns a String data value consisting of a single character containing the character whose code  
921 -# point is the data value of the argument.  
922 -# - If the argument is not in the range -32,767 to 65,535 then Error Number 5 ("Invalid procedure  
923 -# call or argument") is raised.  
924 -# - If the argument is a negative value it is treated as if it was the value: CharCode + 65,536.  
925 -# - If the implemented uses 16-bit Unicode code points argument, data value is interpreted as a 16-  
926 -# bit Unicode code point.  
927 -# - If the implementation does not support Unicode, ChrW has the same semantics as Chr.  
928 -# - ChrW$ has the same runtime semantics as ChrW, however the declared type of its function result  
929 -# is String rather than Variant.  
930 -  
931 -# Chr, Chr$, ChrB, ChrW(int) => char  
# Grammar for a call to Chr / Chr$ / ChrB / ChrB$ / ChrW / ChrW$ with an
# integer expression as argument. The function name and the parentheses are
# suppressed, so the only remaining token is the integer argument.
vba_chr = Suppress(
    Combine(WordStart(vba_identifier_chars) + CaselessLiteral('Chr')
            + Optional(CaselessLiteral('B') | CaselessLiteral('W')) + Optional('$'))
    + '(') + vba_expr_int + Suppress(')')


def vba_chr_tostr(t):
    """
    Parse action for vba_chr: convert the integer argument of Chr() to the
    corresponding character.

    On Python 3, chr() handles every Unicode code point and returns str, so
    no distinction is needed between 8-bit and Unicode characters. The former
    code called unichr() (which only exists on Python 2) and encoded the
    result to bytes, which ended up stored as its "b'...'" representation.

    :param t: parse results, t[0] is the integer character code
    :return: VbaExpressionString, the decoded character; if the code cannot
        be converted, the error is logged and the text 'Chr(<code>)' is
        returned instead.
    """
    code = t[0]
    try:
        return VbaExpressionString(chr(code))
    except (ValueError, OverflowError, TypeError):
        # ValueError/OverflowError: code outside of the valid range (including
        # negative values); TypeError: code is not an integer.
        log.exception('ERROR: incorrect parameter value for chr(): %r' % code)
        return VbaExpressionString('Chr(%r)' % code)


vba_chr.setParseAction(vba_chr_tostr)
953 -  
954 -  
955 -# --- ASC --------------------------------------------------------------------  
956 -  
957 -# Asc(char) => int  
958 -#TODO: see MS-VBAL 6.1.2.11.1.1 page 240 => AscB, AscW  
# grammar: Asc( <string expression> ); keyword and parentheses are suppressed
vba_asc = Suppress(CaselessKeyword('Asc') + '(') + vba_expr_str + Suppress(')')
# parse action: ord() of the parsed string
# NOTE(review): ord() only accepts a single character, so a longer argument
# would raise TypeError - confirm whether Asc("abc") must be supported
vba_asc.setParseAction(lambda t: ord(t[0]))


# --- VAL --------------------------------------------------------------------

# Val(string) => int
# TODO: make sure the behavior of VBA's val is fully covered
vba_val = Suppress(CaselessKeyword('Val') + '(') + vba_expr_str + Suppress(')')
# parse action: strip surrounding whitespace, then convert to int
vba_val.setParseAction(lambda t: int(t[0].strip()))


# --- StrReverse() --------------------------------------------------------------------

# StrReverse(string) => string
strReverse = Suppress(CaselessKeyword('StrReverse') + '(') + vba_expr_str + Suppress(')')
# parse action: reverse the string with a [::-1] slice
strReverse.setParseAction(lambda t: VbaExpressionString(str(t[0])[::-1]))


# --- ENVIRON() --------------------------------------------------------------------

# Environ("name") => just translated to "%name%", that is enough for malware analysis
environ = Suppress(CaselessKeyword('Environ') + '(') + vba_expr_str + Suppress(')')
# parse action: '%%' is an escaped percent sign, so the result is '%' + name + '%'
environ.setParseAction(lambda t: VbaExpressionString('%%%s%%' % t[0]))


# --- IDENTIFIER -------------------------------------------------------------

#TODO: see MS-VBAL 3.3.5 page 33
# 3.3.5 Identifier Tokens
# Latin-identifier = first-Latin-identifier-character *subsequent-Latin-identifier-character
# first-Latin-identifier-character = (%x0041-005A / %x0061-007A) ; A-Z / a-z
# subsequent-Latin-identifier-character = first-Latin-identifier-character / DIGIT / %x5F ; underscore
# a letter first, then any number of letters, digits or underscores
latin_identifier = Word(initChars=alphas, bodyChars=alphanums + '_')
993 -  
994 -# --- HEX FUNCTION -----------------------------------------------------------  
995 -  
996 -# match any custom function name with a hex string as argument:  
997 -# TODO: accept vba_expr_str_item as argument, check if it is a hex or base64 string at runtime  
998 -  
# quoted string of at least two hexadecimal numbers of two digits:
quoted_hex_string = Suppress('"') + Combine(Word(hexnums, exact=2) * (2, None)) + Suppress('"')
quoted_hex_string.setParseAction(lambda t: str(t[0]))

# any function name followed by a quoted hex string as its only argument;
# the function name and the parentheses are suppressed
hex_function_call = Suppress(latin_identifier) + Suppress('(') + \
                    quoted_hex_string('hex_string') + Suppress(')')
# parse action: decode the hex string.
# binascii.a2b_hex returns bytes on Python 3, so the result must be decoded to
# text: passing bytes directly to a str subclass would store the "b'...'"
# representation instead of the decoded content. Bytes which are not valid
# UTF-8 are replaced rather than raising an error.
hex_function_call.setParseAction(
    lambda t: VbaExpressionString(binascii.a2b_hex(t.hex_string).decode('utf-8', 'replace')))
1006 -  
1007 -  
1008 -# --- BASE64 FUNCTION -----------------------------------------------------------  
1009 -  
1010 -# match any custom function name with a Base64 string as argument:  
1011 -# TODO: accept vba_expr_str_item as argument, check if it is a hex or base64 string at runtime  
1012 -  
# quoted string matching the Base64 regex (BASE64_RE):
quoted_base64_string = Suppress('"') + Regex(BASE64_RE) + Suppress('"')
quoted_base64_string.setParseAction(lambda t: str(t[0]))

# any function name followed by a quoted Base64 string as its only argument;
# the function name and the parentheses are suppressed
base64_function_call = Suppress(latin_identifier) + Suppress('(') + \
                       quoted_base64_string('base64_string') + Suppress(')')
# parse action: decode the Base64 string.
# binascii.a2b_base64 returns bytes on Python 3, so the result must be decoded
# to text: passing bytes directly to a str subclass would store the "b'...'"
# representation instead of the decoded content. Bytes which are not valid
# UTF-8 are replaced rather than raising an error.
base64_function_call.setParseAction(
    lambda t: VbaExpressionString(binascii.a2b_base64(t.base64_string).decode('utf-8', 'replace')))
1020 -  
1021 -  
1022 -# ---STRING EXPRESSION -------------------------------------------------------  
1023 -  
def concat_strings_list(tokens):
    """
    Parse action joining the operands of a VBA string concatenation
    (operators '+' or '&') into a single VbaExpressionString.
    """
    # tokens[0] is expected to be the flat group [a, '&', b, '&', c, ...]:
    # operands sit at the even positions, operators at the odd ones
    group = tokens[0]
    operands = group[0::2]
    joined = ''.join(operands)
    return VbaExpressionString(joined)
1032 -  
1033 -  
# Atoms of a string expression; the alternatives are tried in the order listed.
vba_expr_str_item = (vba_chr | strReverse | environ | quoted_string | hex_function_call | base64_function_call)

# Actual definition of the string expression pre-declared above with Forward():
# atoms combined with the binary, left-associative operators '+' and '&',
# both handled by concat_strings_list.
vba_expr_str <<= infixNotation(vba_expr_str_item,
    [
        ("+", 2, opAssoc.LEFT, concat_strings_list),
        ("&", 2, opAssoc.LEFT, concat_strings_list),
    ])
1041 -  
1042 -  
1043 -# --- INTEGER EXPRESSION -------------------------------------------------------  
1044 -  
def sum_ints_list(tokens):
    """
    Parse action adding up the operands of a VBA integer expression
    using the operator '+'.
    """
    # tokens[0] is expected to be the flat group [a, '+', b, '+', c, ...]:
    # operands sit at the even positions, operators at the odd ones
    group = tokens[0]
    operands = group[0::2]
    return sum(operands)
1053 -  
1054 -  
def subtract_ints_list(tokens):
    """
    Parse action evaluating a VBA integer expression using the operator '-',
    from left to right.
    """
    def minus(left, right):
        return left - right

    # tokens[0] is expected to be the flat group [a, '-', b, '-', c, ...]:
    # operands sit at the even positions, operators at the odd ones
    group = tokens[0]
    operands = group[0::2]
    return reduce(minus, operands)
1063 -  
1064 -  
def multiply_ints_list(tokens):
    """
    Parse action evaluating a VBA integer expression using the operator '*',
    from left to right.
    """
    def times(left, right):
        return left * right

    # tokens[0] is expected to be the flat group [a, '*', b, '*', c, ...]:
    # operands sit at the even positions, operators at the odd ones
    group = tokens[0]
    operands = group[0::2]
    return reduce(times, operands)
1073 -  
1074 -  
def divide_ints_list(tokens):
    """
    Parse action evaluating a VBA integer expression using the operator '/',
    from left to right.

    Floor division is used on purpose, so that the result stays an integer:
    this is what the original Python 2 code computed with '/' on integers.
    On Python 3, '/' is true division and returns a float, which the other
    parse actions of the integer grammar (e.g. the one for Chr) cannot use.

    :param tokens: parse results, tokens[0] is the flat group
        [a, '/', b, '/', c, ...]
    :return: int, result of the successive divisions
    :raises ZeroDivisionError: if one of the divisors is zero
    """
    # local import: reduce is not a builtin on Python 3
    from functools import reduce

    # operands sit at the even positions, operators at the odd ones
    integers = tokens[0][::2]
    return reduce(lambda x, y: x // y, integers)
1083 -  
1084 -  
# Atoms of an integer expression; the alternatives are tried in the order listed.
vba_expr_int_item = (vba_asc | vba_val | integer)

# operators associativity:
# https://en.wikipedia.org/wiki/Operator_associativity

# Actual definition of the integer expression pre-declared above with Forward():
# atoms combined with binary, left-associative arithmetic operators.
# NOTE(review): each operator is declared as its own precedence level, in the
# order listed. If pyparsing treats earlier entries as binding tighter, then
# "8/2*2" is evaluated as 8/(2*2) rather than (8/2)*2 - confirm against the
# VBA rules, where '*' and '/' share the same precedence.
vba_expr_int <<= infixNotation(vba_expr_int_item,
    [
        ("*", 2, opAssoc.LEFT, multiply_ints_list),
        ("/", 2, opAssoc.LEFT, divide_ints_list),
        ("-", 2, opAssoc.LEFT, subtract_ints_list),
        ("+", 2, opAssoc.LEFT, sum_ints_list),
    ])
1097 -  
1098 -  
1099 -# see detect_vba_strings for the deobfuscation code using this grammar  
1100 -  
1101 -# === MSO/ActiveMime files parsing ===========================================  
1102 -  
def is_mso_file(data):
    """
    Check if the provided data is the content of a MSO/ActiveMime file, such as
    the ones created by Outlook in some cases, or Word/Excel when saving a
    file with the MHTML format or the Word 2003 XML format.
    This function only checks the ActiveMime magic at the beginning of data.
    :param data: bytes string, MSO/ActiveMime file content
    :return: bool, True if the file is MSO, False otherwise
    """
    # prefix test only: nothing beyond the MSO_ACTIVEMIME_HEADER magic is validated
    return data.startswith(MSO_ACTIVEMIME_HEADER)
1113 -  
1114 -  
# regex to find zlib block headers, starting with byte 0x78 = 'x'
# NOTE: this must be a bytes pattern, because it is applied to the raw bytes
# of the MSO file: on Python 3 a str pattern cannot be used on bytes data
# (TypeError), which broke the brute-force fallback in mso_file_extract.
re_zlib_header = re.compile(b'x')


def mso_file_extract(data):
    """
    Extract the data stored into a MSO/ActiveMime file, such as
    the ones created by Outlook in some cases, or Word/Excel when saving a
    file with the MHTML format or the Word 2003 XML format.

    The compressed data offset is first read from the header, then the two
    offsets seen in known samples are tried, and finally every position
    holding a potential zlib header byte is tried.

    :param data: bytes string, MSO/ActiveMime file content
    :return: bytes string, extracted data (uncompressed)

    raise a MsoExtractionError if the data cannot be extracted
    """
    # check the magic:
    assert is_mso_file(data)

    # In all the samples seen so far, Word always uses an offset of 0x32,
    # and Excel 0x22A. But we read the offset from the header to be more
    # generic.
    offsets = [0x32, 0x22A]

    # First, attempt to get the compressed data offset from the header
    # According to my tests, it should be an unsigned 16 bits integer,
    # at offset 0x1E (little endian) + add 46:
    try:
        offset = struct.unpack_from('<H', data, offset=0x1E)[0] + 46
        log.debug('Parsing MSO file: data offset = 0x%X' % offset)
        offsets.insert(0, offset)  # insert at beginning of offsets
    except struct.error as exc:
        log.info('Unable to parse MSO/ActiveMime file header (%s)' % exc)
        log.debug('Trace:', exc_info=True)
        raise MsoExtractionError('Unable to parse MSO/ActiveMime file header')
    # now try offsets
    for start in offsets:
        try:
            log.debug('Attempting zlib decompression from MSO file offset 0x%X' % start)
            extracted_data = zlib.decompress(data[start:])
            return extracted_data
        except zlib.error as exc:
            # not fatal: the next candidate offset is tried
            log.info('zlib decompression failed for offset %s (%s)'
                     % (start, exc))
            log.debug('Trace:', exc_info=True)
    # None of the guessed offsets worked, let's try brute-forcing by looking
    # for potential zlib-compressed blocks starting with 0x78:
    log.debug('Looking for potential zlib-compressed blocks in MSO file')
    for match in re_zlib_header.finditer(data):
        start = match.start()
        try:
            log.debug('Attempting zlib decompression from MSO file offset 0x%X' % start)
            extracted_data = zlib.decompress(data[start:])
            return extracted_data
        except zlib.error as exc:
            # not fatal: the next candidate position is tried
            log.info('zlib decompression failed (%s)' % exc)
            log.debug('Trace:', exc_info=True)
    raise MsoExtractionError('Unable to decompress data from a MSO/ActiveMime file')
1172 -  
1173 -  
1174 -#--- FUNCTIONS ----------------------------------------------------------------  
1175 -  
# Characters regarded as printable by is_printable (built once at import time)
_PRINTABLE_SET = set(string.printable)


def is_printable(s):
    """
    Tell whether every character of s is a printable ASCII character,
    i.e. a character contained in string.printable.
    This plays the role of Python 3's str.isprintable for Python 2.x.

    :param s: str
    :return: bool, True if s only contains printable characters
        (which is also the case for an empty string)
    """
    # inspired from http://stackoverflow.com/questions/3636928/test-if-a-python-string-is-printable
    return all(char in _PRINTABLE_SET for char in s)
1190 -  
1191 -  
def copytoken_help(decompressed_current, decompressed_chunk_start):
    """
    compute bit masks to decode a CopyToken according to MS-OVBA 2.4.1.3.19.1 CopyToken Help

    decompressed_current: number of decompressed bytes so far, i.e. len(decompressed_container)
    decompressed_chunk_start: offset of the current chunk in the decompressed container
    return length_mask, offset_mask, bit_count, maximum_length

    raise ValueError if no byte of the current chunk has been decompressed yet
    (decompressed_current <= decompressed_chunk_start): a CopyToken cannot
    refer to data which does not exist, so the stream is malformed.
    """
    difference = decompressed_current - decompressed_chunk_start
    if difference < 1:
        # the former math.log() call failed here with an obscure
        # "math domain error"; the exception type is unchanged
        raise ValueError('invalid CopyToken: no decompressed data in the current chunk (difference=%r)'
                         % difference)
    # ceil(log2(difference)), computed with exact integer arithmetic instead
    # of floating point logarithms; the bit count is at least 4
    bit_count = max((difference - 1).bit_length(), 4)
    length_mask = 0xFFFF >> bit_count
    # NOTE: offset_mask is a negative Python integer (bitwise complement)
    offset_mask = ~length_mask
    maximum_length = (0xFFFF >> bit_count) + 3
    return length_mask, offset_mask, bit_count, maximum_length
1207 -  
1208 -  
1209 -def decompress_stream(compressed_container):  
1210 - """  
1211 - Decompress a stream according to MS-OVBA section 2.4.1  
1212 -  
1213 - compressed_container: string compressed according to the MS-OVBA 2.4.1.3.6 Compression algorithm  
1214 - return the decompressed container as a string (bytes)  
1215 - """  
1216 - # 2.4.1.2 State Variables  
1217 -  
1218 - # The following state is maintained for the CompressedContainer (section 2.4.1.1.1):  
1219 - # CompressedRecordEnd: The location of the byte after the last byte in the CompressedContainer (section 2.4.1.1.1).  
1220 - # CompressedCurrent: The location of the next byte in the CompressedContainer (section 2.4.1.1.1) to be read by  
1221 - # decompression or to be written by compression.  
1222 -  
1223 - # The following state is maintained for the current CompressedChunk (section 2.4.1.1.4):  
1224 - # CompressedChunkStart: The location of the first byte of the CompressedChunk (section 2.4.1.1.4) within the  
1225 - # CompressedContainer (section 2.4.1.1.1).  
1226 -  
1227 - # The following state is maintained for a DecompressedBuffer (section 2.4.1.1.2):  
1228 - # DecompressedCurrent: The location of the next byte in the DecompressedBuffer (section 2.4.1.1.2) to be written by  
1229 - # decompression or to be read by compression.  
1230 - # DecompressedBufferEnd: The location of the byte after the last byte in the DecompressedBuffer (section 2.4.1.1.2).  
1231 -  
1232 - # The following state is maintained for the current DecompressedChunk (section 2.4.1.1.3):  
1233 - # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the  
1234 - # DecompressedBuffer (section 2.4.1.1.2).  
1235 -  
1236 - # Check the input is a bytearray:  
1237 - if not isinstance(compressed_container, bytearray):  
1238 - raise TypeError('decompress_stream requires a bytearray as input')  
1239 - decompressed_container = bytearray() # result  
1240 - compressed_current = 0  
1241 -  
1242 - sig_byte = compressed_container[compressed_current]  
1243 - if sig_byte != 0x01:  
1244 - raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))  
1245 -  
1246 - compressed_current += 1  
1247 -  
1248 - #NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that  
1249 - # CompressedRecordEnd = len(compressed_container)  
1250 - while compressed_current < len(compressed_container):  
1251 - # 2.4.1.1.5  
1252 - compressed_chunk_start = compressed_current  
1253 - # chunk header = first 16 bits  
1254 - compressed_chunk_header = \  
1255 - struct.unpack("<H", compressed_container[compressed_chunk_start:compressed_chunk_start + 2])[0]  
1256 - # chunk size = 12 first bits of header + 3  
1257 - chunk_size = (compressed_chunk_header & 0x0FFF) + 3  
1258 - # chunk signature = 3 next bits - should always be 0b011  
1259 - chunk_signature = (compressed_chunk_header >> 12) & 0x07  
1260 - if chunk_signature != 0b011:  
1261 - raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream')  
1262 - # chunk flag = next bit - 1 == compressed, 0 == uncompressed  
1263 - chunk_flag = (compressed_chunk_header >> 15) & 0x01  
1264 - log.debug("chunk size = {0}, compressed flag = {1}".format(chunk_size, chunk_flag))  
1265 -  
1266 - #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096)  
1267 - # The minimum size is 3 bytes  
1268 - # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value  
1269 - # in chunk header before adding 3.  
1270 - # Also the first test is not useful since a 12 bits value cannot be larger than 4095.  
1271 - if chunk_flag == 1 and chunk_size > 4098:  
1272 - raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1')  
1273 - if chunk_flag == 0 and chunk_size != 4098:  
1274 - raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0')  
1275 -  
1276 - # check if chunk_size goes beyond the compressed data, instead of silently cutting it:  
1277 - #TODO: raise an exception?  
1278 - if compressed_chunk_start + chunk_size > len(compressed_container):  
1279 - log.warning('Chunk size is larger than remaining compressed data')  
1280 - compressed_end = min([len(compressed_container), compressed_chunk_start + chunk_size])  
1281 - # read after chunk header:  
1282 - compressed_current = compressed_chunk_start + 2  
1283 -  
1284 - if chunk_flag == 0:  
1285 - # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk  
1286 - # uncompressed chunk: read the next 4096 bytes as-is  
1287 - #TODO: check if there are at least 4096 bytes left  
1288 - decompressed_container.extend([compressed_container[compressed_current:compressed_current + 4096]])  
1289 - compressed_current += 4096  
1290 - else:  
1291 - # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk  
1292 - # compressed chunk  
1293 - decompressed_chunk_start = len(decompressed_container)  
1294 - while compressed_current < compressed_end:  
1295 - # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence  
1296 - # log.debug('compressed_current = %d / compressed_end = %d' % (compressed_current, compressed_end))  
1297 - # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or  
1298 - # copy tokens (reference to a previous literal token)  
1299 - flag_byte = compressed_container[compressed_current]  
1300 - compressed_current += 1  
1301 - for bit_index in xrange(0, 8):  
1302 - # log.debug('bit_index=%d / compressed_current=%d / compressed_end=%d' % (bit_index, compressed_current, compressed_end))  
1303 - if compressed_current >= compressed_end:  
1304 - break  
1305 - # MS-OVBA 2.4.1.3.5 Decompressing a Token  
1306 - # MS-OVBA 2.4.1.3.17 Extract FlagBit  
1307 - flag_bit = (flag_byte >> bit_index) & 1  
1308 - #log.debug('bit_index=%d: flag_bit=%d' % (bit_index, flag_bit))  
1309 - if flag_bit == 0: # LiteralToken  
1310 - # copy one byte directly to output  
1311 - decompressed_container.extend([compressed_container[compressed_current]])  
1312 - compressed_current += 1  
1313 - else: # CopyToken  
1314 - # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken  
1315 - copy_token = \  
1316 - struct.unpack("<H", compressed_container[compressed_current:compressed_current + 2])[0]  
1317 - #TODO: check this  
1318 - length_mask, offset_mask, bit_count, _ = copytoken_help(  
1319 - len(decompressed_container), decompressed_chunk_start)  
1320 - length = (copy_token & length_mask) + 3  
1321 - temp1 = copy_token & offset_mask  
1322 - temp2 = 16 - bit_count  
1323 - offset = (temp1 >> temp2) + 1  
1324 - #log.debug('offset=%d length=%d' % (offset, length))  
1325 - copy_source = len(decompressed_container) - offset  
1326 - for index in xrange(copy_source, copy_source + length):  
1327 - decompressed_container.extend([decompressed_container[index]])  
1328 - compressed_current += 2  
1329 - return bytes(decompressed_container)  
1330 -  
1331 -  
def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
    """
    Extract VBA macros from an OleFileIO object.
    Internal function, do not call directly.

    This is a generator, yielding (stream path, VBA filename, VBA code data)
    for each VBA code stream. The code data is the output of decompress_stream;
    it is not decoded here.

    :param ole: OLE container object; only its openstream() method is used
    :param vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream.
        It is used as a plain prefix: code streams are opened as vba_root + 'VBA/' + name
    :param project_path: path to the PROJECT stream
    :param dir_path: path to the (compressed) dir stream
    :param relaxed: If True, only create info/debug log entry if data is not as expected
        (e.g. opening substream fails); if False, raise an error in this case
    :raises UnexpectedDataError: if a checked field has an unexpected value and relaxed is False,
        or if an unknown REFERENCE record id is met (regardless of relaxed)
    :raises SubstreamOpenError: if a module code stream cannot be opened and relaxed is False
    """
    # Open the PROJECT stream:
    project = ole.openstream(project_path)
    log.debug('relaxed is %s' % relaxed)

    # sample content of the PROJECT stream:

    ##    ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}"
    ##    Document=ThisDocument/&H00000000
    ##    Module=NewMacros
    ##    Name="Project"
    ##    HelpContextID="0"
    ##    VersionCompatible32="393222000"
    ##    CMG="F1F301E705E705E705E705"
    ##    DPB="8F8D7FE3831F2020202020"
    ##    GC="2D2FDD81E51EE61EE6E1"
    ##
    ##    [Host Extender Info]
    ##    &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000
    ##    &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000
    ##
    ##    [Workspace]
    ##    ThisDocument=22, 29, 339, 477, Z
    ##    NewMacros=-4, 42, 832, 510, C

    # maps lowercase module name -> file extension to use for the extracted code
    code_modules = {}

    for line in project:
        line = line.strip().decode('utf-8', 'ignore')
        if '=' in line:
            # split line at the 1st equal sign:
            name, value = line.split('=', 1)
            # looking for code modules
            # add the code module as a key in the dictionary
            # the value will be the extension needed later
            # The value is converted to lowercase, to allow case-insensitive matching (issue #3)
            value = value.lower()
            if name == 'Document':
                # split value at the 1st slash, keep 1st part:
                value = value.split('/', 1)[0]
                code_modules[value] = CLASS_EXTENSION
            elif name == 'Module':
                code_modules[value] = MODULE_EXTENSION
            elif name == 'Class':
                code_modules[value] = CLASS_EXTENSION
            elif name == 'BaseClass':
                code_modules[value] = FORM_EXTENSION

    # read data from dir stream (compressed)
    dir_compressed = ole.openstream(dir_path).read()

    def check_value(name, expected, value):
        """Log (relaxed mode) or raise UnexpectedDataError when value != expected."""
        if expected != value:
            if relaxed:
                log.error("invalid value for {0} expected {1:04X} got {2:04X}"
                          .format(name, expected, value))
            else:
                raise UnexpectedDataError(dir_path, name, expected, value)

    dir_stream = BytesIO(decompress_stream(bytearray(dir_compressed)))

    def read_u16():
        """Read a little-endian unsigned 16-bit integer from the dir stream."""
        return struct.unpack("<H", dir_stream.read(2))[0]

    def read_u32():
        """Read a little-endian unsigned 32-bit integer from the dir stream."""
        return struct.unpack("<L", dir_stream.read(4))[0]

    def read_block():
        """Read a 32-bit size followed by that many bytes; return the bytes."""
        return dir_stream.read(read_u32())

    def uni_out(unicode_text):
        """Short function to simplify unicode text output in log messages."""
        return unicode_text.encode('utf-8', 'replace')

    # PROJECTSYSKIND Record
    check_value('PROJECTSYSKIND_Id', 0x0001, read_u16())
    check_value('PROJECTSYSKIND_Size', 0x0004, read_u32())
    projectsyskind_syskind = read_u32()
    syskind_names = {
        0x00: "16-bit Windows",
        0x01: "32-bit Windows",
        0x02: "Macintosh",
        0x03: "64-bit Windows",
    }
    if projectsyskind_syskind in syskind_names:
        log.debug(syskind_names[projectsyskind_syskind])
    else:
        log.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(projectsyskind_syskind))

    # PROJECTLCID Record
    check_value('PROJECTLCID_Id', 0x0002, read_u16())
    check_value('PROJECTLCID_Size', 0x0004, read_u32())
    check_value('PROJECTLCID_Lcid', 0x409, read_u32())

    # PROJECTLCIDINVOKE Record
    check_value('PROJECTLCIDINVOKE_Id', 0x0014, read_u16())
    check_value('PROJECTLCIDINVOKE_Size', 0x0004, read_u32())
    check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, read_u32())

    # PROJECTCODEPAGE Record
    check_value('PROJECTCODEPAGE_Id', 0x0003, read_u16())
    check_value('PROJECTCODEPAGE_Size', 0x0002, read_u32())
    # kept: used further down to pick the codec for module stream names
    projectcodepage_codepage = read_u16()

    # PROJECTNAME Record
    check_value('PROJECTNAME_Id', 0x0004, read_u16())
    projectname_sizeof_projectname = read_u32()
    if projectname_sizeof_projectname < 1 or projectname_sizeof_projectname > 128:
        log.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(projectname_sizeof_projectname))
    dir_stream.read(projectname_sizeof_projectname)  # ProjectName, not used

    # PROJECTDOCSTRING Record
    check_value('PROJECTDOCSTRING_Id', 0x0005, read_u16())
    projectdocstring_sizeof_docstring = read_u32()
    if projectdocstring_sizeof_docstring > 2000:
        log.error(
            "PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(projectdocstring_sizeof_docstring))
    dir_stream.read(projectdocstring_sizeof_docstring)  # DocString, not used
    check_value('PROJECTDOCSTRING_Reserved', 0x0040, read_u16())
    projectdocstring_sizeof_docstring_unicode = read_u32()
    if projectdocstring_sizeof_docstring_unicode % 2 != 0:
        log.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even")
    dir_stream.read(projectdocstring_sizeof_docstring_unicode)  # DocStringUnicode, not used

    # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7
    check_value('PROJECTHELPFILEPATH_Id', 0x0006, read_u16())
    projecthelpfilepath_sizeof_helpfile1 = read_u32()
    if projecthelpfilepath_sizeof_helpfile1 > 260:
        log.error(
            "PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(projecthelpfilepath_sizeof_helpfile1))
    projecthelpfilepath_helpfile1 = dir_stream.read(projecthelpfilepath_sizeof_helpfile1)
    check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, read_u16())
    projecthelpfilepath_sizeof_helpfile2 = read_u32()
    if projecthelpfilepath_sizeof_helpfile2 != projecthelpfilepath_sizeof_helpfile1:
        log.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2")
    projecthelpfilepath_helpfile2 = dir_stream.read(projecthelpfilepath_sizeof_helpfile2)
    if projecthelpfilepath_helpfile2 != projecthelpfilepath_helpfile1:
        log.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2")

    # PROJECTHELPCONTEXT Record
    check_value('PROJECTHELPCONTEXT_Id', 0x0007, read_u16())
    check_value('PROJECTHELPCONTEXT_Size', 0x0004, read_u32())
    read_u32()  # HelpContext, not used

    # PROJECTLIBFLAGS Record
    check_value('PROJECTLIBFLAGS_Id', 0x0008, read_u16())
    check_value('PROJECTLIBFLAGS_Size', 0x0004, read_u32())
    check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, read_u32())

    # PROJECTVERSION Record
    check_value('PROJECTVERSION_Id', 0x0009, read_u16())
    check_value('PROJECTVERSION_Reserved', 0x0004, read_u32())
    read_u32()  # VersionMajor, not used
    read_u16()  # VersionMinor, not used

    # PROJECTCONSTANTS Record
    check_value('PROJECTCONSTANTS_Id', 0x000C, read_u16())
    projectconstants_sizeof_constants = read_u32()
    if projectconstants_sizeof_constants > 1015:
        log.error(
            "PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(projectconstants_sizeof_constants))
    dir_stream.read(projectconstants_sizeof_constants)  # Constants, not used
    check_value('PROJECTCONSTANTS_Reserved', 0x003C, read_u16())
    projectconstants_sizeof_constants_unicode = read_u32()
    if projectconstants_sizeof_constants_unicode % 2 != 0:
        log.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even")
    dir_stream.read(projectconstants_sizeof_constants_unicode)  # ConstantsUnicode, not used

    # array of REFERENCE records, terminated by the PROJECTMODULES id (0x000F)
    check = None
    while True:
        check = read_u16()
        log.debug("reference type = {0:04X}".format(check))
        if check == 0x000F:
            break

        if check == 0x0016:
            # REFERENCENAME
            read_block()  # Name, not used
            reference_reserved = read_u16()
            # According to [MS-OVBA] 2.3.4.2.2.2 REFERENCENAME Record:
            # "Reserved (2 bytes): MUST be 0x003E. MUST be ignored."
            # So let's ignore it, otherwise it crashes on some files (issue #132)
            # PR #135 by @c1fe:
            # contrary to the specification I think that the unicode name
            # is optional. if reference_reserved is not 0x003E I think it
            # is actually the start of another REFERENCE record
            # at least when projectsyskind_syskind == 0x02 (Macintosh)
            if reference_reserved == 0x003E:
                read_block()  # NameUnicode, not used
                continue
            # otherwise treat the value just read as the id of the next record
            # and fall through to the tests below:
            check = reference_reserved
            log.debug("reference type = {0:04X}".format(check))

        if check == 0x0033:
            # REFERENCEORIGINAL (followed by REFERENCECONTROL)
            read_block()  # LibidOriginal, not used
            continue

        if check == 0x002F:
            # REFERENCECONTROL
            read_u32()  # SizeTwiddled, ignored
            read_block()  # LibidTwiddled, not used
            check_value('REFERENCECONTROL_Reserved1', 0x0000, read_u32())
            check_value('REFERENCECONTROL_Reserved2', 0x0000, read_u16())
            # optional field: NameRecordExtended (id 0x0016), else this is Reserved3
            check2 = read_u16()
            if check2 == 0x0016:
                read_block()  # NameRecordExtended.Name, not used
                referencecontrol_namerecordextended_reserved = read_u16()
                if referencecontrol_namerecordextended_reserved == 0x003E:
                    read_block()  # NameRecordExtended.NameUnicode, not used
                    referencecontrol_reserved3 = read_u16()
                else:
                    # no unicode name: the value just read is already Reserved3
                    referencecontrol_reserved3 = referencecontrol_namerecordextended_reserved
            else:
                referencecontrol_reserved3 = check2

            check_value('REFERENCECONTROL_Reserved3', 0x0030, referencecontrol_reserved3)
            read_u32()  # SizeExtended, not used
            read_block()  # LibidExtended, not used
            read_u32()  # Reserved4, not used
            read_u16()  # Reserved5, not used
            dir_stream.read(16)  # OriginalTypeLib, not used
            read_u32()  # Cookie, not used
            continue

        if check == 0x000D:
            # REFERENCEREGISTERED
            read_u32()  # Size, not used
            read_block()  # Libid, not used
            check_value('REFERENCEREGISTERED_Reserved1', 0x0000, read_u32())
            check_value('REFERENCEREGISTERED_Reserved2', 0x0000, read_u16())
            continue

        if check == 0x000E:
            # REFERENCEPROJECT
            read_u32()  # Size, not used
            read_block()  # LibidAbsolute, not used
            read_block()  # LibidRelative, not used
            read_u32()  # MajorVersion, not used
            read_u16()  # MinorVersion, not used
            continue

        log.error('invalid or unknown check Id {0:04X}'.format(check))
        # raise an exception instead of stopping abruptly (issue #180)
        raise UnexpectedDataError(dir_path, 'reference type', (0x0F, 0x16, 0x33, 0x2F, 0x0D, 0x0E), check)

    # PROJECTMODULES Record: its id was already read as the loop terminator above
    check_value('PROJECTMODULES_Id', 0x000F, check)
    check_value('PROJECTMODULES_Size', 0x0002, read_u32())
    projectmodules_count = read_u16()
    check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, read_u16())
    check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, read_u32())
    read_u16()  # ProjectCookieRecord.Cookie, not used

    log.debug("parsing {0} modules".format(projectmodules_count))
    for projectmodule_index in xrange(0, projectmodules_count):
        try:
            check_value('MODULENAME_Id', 0x0019, read_u16())
            modulename_modulename = read_block().decode('utf-8', 'backslashreplace')
            # Preset the values coming from optional sections for every module,
            # so that a missing section neither triggers a "referenced before
            # assignment" error nor silently reuses the previous module's values:
            modulename_unicode_modulename_unicode = ''
            modulestreamname_streamname = b''
            modulestreamname_streamname_unicode = ''
            moduleoffset_textoffset = 0
            # account for optional sections: each branch consumes its section
            # and then reads the id of the following one
            section_id = read_u16()
            if section_id == 0x0047:
                # MODULENAMEUNICODE
                # just guessing that this is the same encoding as used in OleFileIO
                modulename_unicode_modulename_unicode = read_block().decode('UTF-16LE', 'replace')
                section_id = read_u16()
            if section_id == 0x001A:
                # MODULESTREAMNAME
                modulestreamname_streamname = read_block()
                check_value('MODULESTREAMNAME_Reserved', 0x0032, read_u16())
                # just guessing that this is the same encoding as used in OleFileIO
                modulestreamname_streamname_unicode = read_block().decode('UTF-16LE', 'replace')
                section_id = read_u16()
            if section_id == 0x001C:
                # MODULEDOCSTRING
                read_block()  # DocString, not used
                check_value('MODULEDOCSTRING_Reserved', 0x0048, read_u16())
                read_block()  # DocStringUnicode, not used
                section_id = read_u16()
            if section_id == 0x0031:
                # MODULEOFFSET
                check_value('MODULEOFFSET_Size', 0x0004, read_u32())
                moduleoffset_textoffset = read_u32()
                section_id = read_u16()
            if section_id == 0x001E:
                # MODULEHELPCONTEXT
                check_value('MODULEHELPCONTEXT_Size', 0x0004, read_u32())
                read_u32()  # HelpContext, not used
                section_id = read_u16()
            if section_id == 0x002C:
                # MODULECOOKIE
                check_value('MODULECOOKIE_Size', 0x0002, read_u32())
                read_u16()  # Cookie, not used
                section_id = read_u16()
            if section_id == 0x0021 or section_id == 0x0022:
                # MODULETYPE
                read_u32()  # Reserved, not used
                section_id = read_u16()
            if section_id == 0x0025:
                # MODULEREADONLY
                check_value('MODULEREADONLY_Reserved', 0x0000, read_u32())
                section_id = read_u16()
            if section_id == 0x0028:
                # MODULEPRIVATE
                check_value('MODULEPRIVATE_Reserved', 0x0000, read_u32())
                section_id = read_u16()
            if section_id == 0x002B:  # TERMINATOR
                check_value('MODULE_Reserved', 0x0000, read_u32())
                section_id = None
            if section_id is not None:
                log.warning('unknown or invalid module section id {0:04X}'.format(section_id))

            log.debug('Project CodePage = %d' % projectcodepage_codepage)
            if projectcodepage_codepage in MAC_CODEPAGES:
                vba_codec = MAC_CODEPAGES[projectcodepage_codepage]
            else:
                vba_codec = 'cp%d' % projectcodepage_codepage
            log.debug("ModuleName = {0}".format(modulename_modulename))
            log.debug("ModuleNameUnicode = {0}".format(uni_out(modulename_unicode_modulename_unicode)))
            log.debug("StreamName = {0}".format(modulestreamname_streamname))
            try:
                streamname_unicode = modulestreamname_streamname.decode(vba_codec)
            except UnicodeError:
                # log the raw bytes: the decoded name does not exist at this point
                log.debug('failed to decode stream name {0!r} with codec {1}'
                          .format(modulestreamname_streamname, vba_codec))
                streamname_unicode = modulestreamname_streamname.decode(vba_codec, errors='replace')
            log.debug("StreamName.decode('%s') = %s" % (vba_codec, uni_out(streamname_unicode)))
            log.debug("StreamNameUnicode = {0}".format(uni_out(modulestreamname_streamname_unicode)))
            log.debug("TextOffset = {0}".format(moduleoffset_textoffset))

            # try the candidate stream names in turn until one can be opened
            code_data = None
            try_names = (streamname_unicode,
                         modulename_unicode_modulename_unicode,
                         modulestreamname_streamname_unicode)
            for stream_name in try_names:
                # TODO: if olefile._find were less private, could replace this
                #        try-except with calls to it
                try:
                    code_path = vba_root + u'VBA/' + stream_name
                    log.debug('opening VBA code stream %s' % uni_out(code_path))
                    code_data = ole.openstream(code_path).read()
                    break
                except IOError as ioe:
                    log.debug('failed to open stream VBA/%r (%r), try other name'
                              % (uni_out(stream_name), ioe))

            if code_data is None:
                # names are text here: do not mix them with the bytes returned
                # by uni_out, which cannot be concatenated to str on Python 3
                log.info("Could not open stream %d of %d ('VBA/' + one of %r)!"
                         % (projectmodule_index, projectmodules_count,
                            '/'.join("'" + stream_name + "'"
                                     for stream_name in try_names)))
                if relaxed:
                    continue   # ... with next submodule
                raise SubstreamOpenError('[BASE]', 'VBA/' +
                                         modulename_unicode_modulename_unicode)

            log.debug("length of code_data = {0}".format(len(code_data)))
            log.debug("offset of code_data = {0}".format(moduleoffset_textoffset))
            # the compressed source code starts at TextOffset within the module stream
            code_data = code_data[moduleoffset_textoffset:]
            if len(code_data) > 0:
                code_data = decompress_stream(bytearray(code_data))
                # case-insensitive search in the code_modules dict to find the file extension:
                filext = code_modules.get(modulename_modulename.lower(), 'bin')
                filename = '{0}.{1}'.format(modulename_modulename, filext)
                #TODO: also yield the codepage so that callers can decode it properly
                yield (code_path, filename, code_data)
                log.debug('extracted file {0}'.format(filename))
            else:
                log.warning("module stream {0} has code data length 0".format(modulestreamname_streamname))
        except (UnexpectedDataError, SubstreamOpenError):
            raise
        except Exception:
            log.info('Error parsing module {0} of {1} in _extract_vba:'
                     .format(projectmodule_index, projectmodules_count),
                     exc_info=True)
            if not relaxed:
                raise
    return
1841 -  
1842 -  
def vba_collapse_long_lines(vba_code):
    """
    Join VBA statements that were split over several lines with the
    line-continuation marker (a space followed by an underscore at the end
    of a line). Each marker plus its line break is replaced by one space.

    :param vba_code: str, VBA module code
    :return: str, VBA module code with continued lines joined
    """
    # TODO: use a regex instead, to allow whitespaces after the underscore?
    # CRLF has to be handled before the lone CR and LF variants, otherwise
    # the CR pass would leave a stray LF behind.
    for line_break in ('\r\n', '\r', '\n'):
        vba_code = vba_code.replace(' _' + line_break, ' ')
    return vba_code
1856 -  
1857 -  
def filter_vba(vba_code):
    """
    Filter VBA source code to remove the first lines starting with "Attribute VB_",
    which are automatically added by MS Office and not displayed in the VBA Editor.
    This should only be used when displaying source code for human analysis.

    Note: lines are not filtered if they contain a colon, because it could be
    used to hide malicious instructions.

    Only the leading run of attribute lines is dropped: filtering stops at the
    first line that is not a colon-free "Attribute VB_" line. The remaining
    lines are re-joined with '\\n', whatever the original line endings were.

    :param vba_code: str, VBA source code
    :return: str, filtered VBA source code
    """
    vba_lines = vba_code.splitlines()
    start = 0
    for line in vba_lines:
        # stop at the first line that must be kept
        if not line.startswith("Attribute VB_") or ':' in line:
            break
        start += 1
    #TODO: also remove empty lines?
    return '\n'.join(vba_lines[start:])
1880 -  
1881 -  
def detect_autoexec(vba_code, obfuscation=None):
    """
    Look for keywords of macros that run automatically on specific actions
    (e.g. when a document is opened or closed).

    Each keyword of AUTOEXEC_KEYWORDS is inserted as-is (not escaped) into a
    case-insensitive regex delimited by word boundaries; only the first
    occurrence of each keyword is reported.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword, description)
    """
    #TODO: merge code with detect_suspicious
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    hits = []
    for description, keywords in AUTOEXEC_KEYWORDS.items():
        for keyword in keywords:
            #TODO: if keyword is already a compiled regex, use it as-is
            found = re.search(r'(?i)\b' + keyword + r'\b', vba_code)
            if found is None:
                continue
            # report the text as it appears in the code, not the pattern
            hits.append((found.group(), description + suffix))
    return hits
1909 -  
1910 -  
def detect_suspicious(vba_code, obfuscation=None):
    """
    Look for suspicious keywords corresponding to potential malware behaviour.

    Each keyword of SUSPICIOUS_KEYWORDS is regex-escaped and searched
    case-insensitively between word boundaries; only the first occurrence
    of each keyword is reported.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword, description)
    """
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    hits = []
    for description, keywords in SUSPICIOUS_KEYWORDS.items():
        for keyword in keywords:
            found = re.search(r'(?i)\b' + re.escape(keyword) + r'\b', vba_code)
            if found is None:
                continue
            # report the text as it appears in the code
            hits.append((found.group(), description + suffix))
    return hits
1935 -  
1936 -  
def detect_patterns(vba_code, obfuscation=None):
    """
    Run every compiled regex of RE_PATTERNS over the VBA code and collect the
    distinct matched values (IP addresses, URLs, e-mail addresses, executable
    file names, etc.).

    A value is reported once only, under the first pattern type that found it.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to the pattern type
    :return: list of str tuples (pattern type, value)
    """
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    seen = set()
    hits = []
    for pattern_type, pattern_re in RE_PATTERNS:
        label = pattern_type + suffix
        for found in pattern_re.finditer(vba_code):
            value = found.group()
            if value in seen:
                continue
            hits.append((label, value))
            seen.add(value)
    return hits
1957 -  
1958 -  
def detect_hex_strings(vba_code):
    """
    Find strings encoded in hexadecimal in the VBA code (matches of the
    module-level regex re_hex_string) and decode them.

    The decoded bytes are turned into text as UTF-8; bytes that are not
    valid UTF-8 are kept as backslash escapes.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string), one per distinct encoded string
    """
    seen = set()
    pairs = []
    for found in re_hex_string.finditer(vba_code):
        encoded = found.group()
        if encoded in seen:
            continue
        raw = binascii.unhexlify(encoded)
        pairs.append((encoded, raw.decode('utf-8', 'backslashreplace')))
        seen.add(encoded)
    return pairs
1975 -  
1976 -  
def detect_base64_strings(vba_code):
    """
    Find strings encoded in base64 in the VBA code (matches of the
    module-level regex re_base64_string) and decode them.

    Candidates are skipped when re_nothex_check does not match them (they
    look like plain hex), when they were already seen, or when their
    lowercase form is in BASE64_WHITELIST. Candidates that fail to decode
    are logged at debug level and ignored.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    #TODO: avoid matching simple hex strings as base64?
    seen = set()
    pairs = []
    for found in re_base64_string.finditer(vba_code):
        # the regex match includes the surrounding double quotes
        candidate = found.group().strip('"')
        if not re_nothex_check.search(candidate):
            # just a hex string
            continue
        if candidate in seen or candidate.lower() in BASE64_WHITELIST:
            continue
        try:
            raw = base64.b64decode(candidate)
            pairs.append((candidate, raw.decode('utf-8','replace')))
            seen.add(candidate)
        except (TypeError, ValueError) as exc:
            # most likely not a base64-encoded string after all
            log.debug('Failed to base64-decode (%s)' % exc)
    return pairs
2003 -  
2004 -  
def detect_dridex_strings(vba_code):
    """
    Find strings obfuscated with a specific algorithm seen in Dridex samples
    (matches of the module-level regex re_dridex_string) and decode them
    with DridexUrlDecode.

    Candidates are skipped when re_nothex_check does not match them or when
    they were already seen. Any exception raised by the decoder is logged at
    debug level and the candidate is ignored.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    # TODO: move this at the beginning of script
    from oletools.thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode

    seen = set()
    pairs = []
    for found in re_dridex_string.finditer(vba_code):
        # drop the first and last character of the match (the delimiters)
        candidate = found.group()[1:-1]
        if not re_nothex_check.search(candidate):
            # just a hex string
            continue
        if candidate in seen:
            continue
        try:
            plain = DridexUrlDecode(candidate)
            pairs.append((candidate, plain))
            seen.add(candidate)
        except Exception as exc:
            # most likely not a dridex-encoded string after all
            log.debug('Failed to Dridex-decode (%s)' % exc)
    return pairs
2031 -  
2032 -  
def detect_vba_strings(vba_code):
    """
    Find strings obfuscated with VBA expressions (keywords such as Chr, Asc,
    Val, StrReverse, etc.) by running the vba_expr_str grammar on each line.

    Only tokens that are VbaExpressionString instances are reported, and only
    when the decoded value differs from the source text of the expression.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    # TODO: handle exceptions
    seen = set()
    pairs = []
    # IMPORTANT: tabs must be expanded so that the start/end offsets returned
    # by pyparsing index the same string as the one we slice below.
    expanded = vba_code.expandtabs()
    # Scan line by line to avoid MemoryError on large scripts:
    for line in expanded.splitlines():
        for tokens, start, end in vba_expr_str.scanString(line):
            decoded = tokens[0]
            if not isinstance(decoded, VbaExpressionString):
                # a simple string literal, not a VBA expression
                continue
            encoded = line[start:end]
            # avoid duplicates and expressions that decode to themselves:
            if encoded not in seen and decoded != encoded:
                pairs.append((encoded, decoded))
                seen.add(encoded)
    return pairs
2068 -  
2069 -  
def json2ascii(json_obj, encoding='utf8', errors='replace'):
    """ ensure every bytes string in a json-like structure is decoded to str

    Works recursively on dicts, lists and tuples so that json.dumps will have
    no trouble with the result. Dicts and lists are updated in place and
    returned; tuples are immutable, so a new tuple is returned for them.

    :param json_obj: None, bool, int, float, str, bytes, dict, list or tuple
                     (any other type is logged at debug level and returned as is)
    :param encoding: str, codec used to decode bytes strings
    :param errors: str, error handler passed to bytes.decode
    :return: the converted object
    """
    if json_obj is None or isinstance(json_obj, (bool, int, float, str)):
        # nothing to convert
        return json_obj
    if isinstance(json_obj, bytes):
        decoded = json_obj.decode(encoding, errors)
        log.debug('json2ascii: encode unicode: {0}'.format(decoded))
        return decoded
    if isinstance(json_obj, dict):
        for key in json_obj:
            json_obj[key] = json2ascii(json_obj[key], encoding, errors)
        return json_obj
    if isinstance(json_obj, list):
        # slice assignment keeps the caller's list object, like the dict case
        json_obj[:] = [json2ascii(item, encoding, errors) for item in json_obj]
        return json_obj
    if isinstance(json_obj, tuple):
        return tuple(json2ascii(item, encoding, errors) for item in json_obj)
    log.debug('unexpected type in json2ascii: {0} -- leave as is'
              .format(type(json_obj)))
    return json_obj
2105 -  
2106 -  
def print_json(json_dict=None, _json_is_first=False, _json_is_last=False,
               **json_parts):
    """ print json.dumps(json2ascii(..)) line by line, each line prefixed

    can use in two ways:
    (1) print_json(some_dict)
    (2) print_json(key1=value1, key2=value2, ...)

    Successive calls build one top-level json list: the first call opens it
    with '[', the last call closes it with ']', and every entry except the
    last one is followed by a comma.

    :param bool _json_is_first: set to True only for very first entry to complete
                                the top-level json-list
    :param bool _json_is_last: set to True only for very last entry to complete
                               the top-level json-list
    :raises ValueError: if both a dict and key=value parts are given, or if
                        json_dict is neither None nor a dict
    """
    if json_dict and json_parts:
        raise ValueError('Invalid json argument: want either single dict or '
                         'key=value parts but got both)')
    if json_dict is not None and not isinstance(json_dict, dict):
        raise ValueError('Invalid json argument: want either single dict or '
                         'key=value parts but got {0} instead of dict)'
                         .format(type(json_dict)))
    if json_parts:
        json_dict = json_parts

    if _json_is_first:
        print('[')

    dumped = json.dumps(json2ascii(json_dict), check_circular=False,
                        indent=4, ensure_ascii=False)
    lines = dumped.splitlines()
    # NOTE(review): whitespace in this chunk looks collapsed; confirm the
    # width of the prefix in the three format strings below.
    for line in lines[:-1]:
        print(' {0}'.format(line))
    closing_line = lines[-1]
    if _json_is_last:
        print(' {0}'.format(closing_line))  # print last line without comma
        print(']')
    else:
        print(' {0},'.format(closing_line))  # print last line with comma
2142 -  
2143 -  
class VBA_Scanner(object):
    """
    Class to scan the source code of a VBA module to find obfuscated strings,
    suspicious keywords, IOCs, auto-executable macros, etc.
    """

    def __init__(self, vba_code):
        """
        VBA_Scanner constructor

        :param vba_code: str or bytes, VBA source code to be analyzed
            (bytes are decoded as UTF-8, invalid bytes kept as backslash escapes)
        """
        if isinstance(vba_code, bytes):
            vba_code = vba_code.decode('utf-8', 'backslashreplace')
        # join long lines ending with " _":
        self.code = vba_collapse_long_lines(vba_code)
        # decoded strings, concatenated so that they can be scanned like code:
        self.code_hex = ''
        self.code_hex_rev = ''
        self.code_rev_hex = ''
        self.code_base64 = ''
        self.code_dridex = ''
        self.code_vba = ''
        self.strReverse = None
        # results = None before scanning, then a list of tuples after scanning
        self.results = None
        self.autoexec_keywords = None
        self.suspicious_keywords = None
        self.iocs = None
        self.hex_strings = None
        self.base64_strings = None
        self.dridex_strings = None
        self.vba_strings = None


    def scan(self, include_decoded_strings=False, deobfuscate=False):
        """
        Analyze the provided VBA code to detect suspicious keywords,
        auto-executable macros, IOC patterns, obfuscation patterns
        such as hex-encoded strings.

        :param include_decoded_strings: bool, if True, all encoded strings will be included with their decoded content.
        :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
        :return: list of tuples (type, keyword, description)
            (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String',
            'Dridex string' or 'VBA string')
        """
        # Start from empty accumulators so that calling scan() again does not
        # append the decoded strings a second time:
        self.code_hex = ''
        self.code_hex_rev = ''
        self.code_rev_hex = ''
        self.code_base64 = ''
        self.code_dridex = ''
        self.code_vba = ''

        # First, detect and extract hex-encoded strings:
        self.hex_strings = detect_hex_strings(self.code)
        # detect if the code contains StrReverse:
        self.strReverse = 'strreverse' in self.code.lower()
        # Then append the decoded strings to the VBA code, to detect obfuscated IOCs and keywords:
        for encoded, decoded in self.hex_strings:
            self.code_hex += '\n' + decoded
            # if the code contains "StrReverse", also append the hex strings in reverse order:
            if self.strReverse:
                # StrReverse after hex decoding:
                self.code_hex_rev += '\n' + decoded[::-1]
                # StrReverse before hex decoding. unhexlify returns bytes, which
                # must be decoded: str() on bytes would give the "b'...'" repr.
                reversed_raw = binascii.unhexlify(encoded[::-1])
                self.code_rev_hex += '\n' + reversed_raw.decode('utf-8', 'backslashreplace')
                #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/
        #TODO: also append the full code reversed if StrReverse? (risk of false positives?)
        # Detect Base64-encoded strings
        self.base64_strings = detect_base64_strings(self.code)
        for encoded, decoded in self.base64_strings:
            self.code_base64 += '\n' + decoded
        # Detect Dridex-encoded strings
        self.dridex_strings = detect_dridex_strings(self.code)
        for encoded, decoded in self.dridex_strings:
            self.code_dridex += '\n' + decoded
        # Detect obfuscated strings in VBA expressions
        if deobfuscate:
            self.vba_strings = detect_vba_strings(self.code)
        else:
            self.vba_strings = []
        for encoded, decoded in self.vba_strings:
            self.code_vba += '\n' + decoded
        results = []
        self.autoexec_keywords = []
        self.suspicious_keywords = []
        self.iocs = []

        # Scan the original code, then each set of decoded strings:
        for code, obfuscation in (
                (self.code, None),
                (self.code_hex, 'Hex'),
                (self.code_hex_rev, 'Hex+StrReverse'),
                (self.code_rev_hex, 'StrReverse+Hex'),
                (self.code_base64, 'Base64'),
                (self.code_dridex, 'Dridex'),
                (self.code_vba, 'VBA expression'),
        ):
            if isinstance(code, bytes):
                code = code.decode('utf-8', 'backslashreplace')
            self.autoexec_keywords += detect_autoexec(code, obfuscation)
            self.suspicious_keywords += detect_suspicious(code, obfuscation)
            self.iocs += detect_patterns(code, obfuscation)

        # If encoded strings were discovered, add an item to suspicious keywords:
        if self.hex_strings:
            self.suspicious_keywords.append(('Hex Strings',
                                             'Hex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.base64_strings:
            self.suspicious_keywords.append(('Base64 Strings',
                                             'Base64-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.dridex_strings:
            self.suspicious_keywords.append(('Dridex Strings',
                                             'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.vba_strings:
            self.suspicious_keywords.append(('VBA obfuscated Strings',
                                             'VBA string expressions were detected, may be used to obfuscate strings (option --decode to see all)'))
        # use a set to avoid duplicate keywords
        keyword_set = set()
        for keyword, description in self.autoexec_keywords:
            if keyword not in keyword_set:
                results.append(('AutoExec', keyword, description))
                keyword_set.add(keyword)
        keyword_set = set()
        for keyword, description in self.suspicious_keywords:
            if keyword not in keyword_set:
                results.append(('Suspicious', keyword, description))
                keyword_set.add(keyword)
        keyword_set = set()
        for pattern_type, value in self.iocs:
            if value not in keyword_set:
                results.append(('IOC', value, pattern_type))
                keyword_set.add(value)

        # include decoded strings only if they are printable or if --decode option:
        for encoded, decoded in self.hex_strings:
            if include_decoded_strings or is_printable(decoded):
                results.append(('Hex String', decoded, encoded))
        for encoded, decoded in self.base64_strings:
            if include_decoded_strings or is_printable(decoded):
                results.append(('Base64 String', decoded, encoded))
        for encoded, decoded in self.dridex_strings:
            if include_decoded_strings or is_printable(decoded):
                results.append(('Dridex string', decoded, encoded))
        for encoded, decoded in self.vba_strings:
            if include_decoded_strings or is_printable(decoded):
                results.append(('VBA string', decoded, encoded))
        self.results = results
        return results

    def scan_summary(self):
        """
        Return the number of items found for each category, running scan()
        with its default options first if the code was not scanned yet.

        :return: tuple with the number of items found for each category:
            (autoexec, suspicious, IOCs, hex, base64, dridex, vba)
        """
        # avoid scanning the same code twice:
        if self.results is None:
            self.scan()
        return (len(self.autoexec_keywords), len(self.suspicious_keywords),
                len(self.iocs), len(self.hex_strings), len(self.base64_strings),
                len(self.dridex_strings), len(self.vba_strings))
2301 -  
2302 -  
def scan_vba(vba_code, include_decoded_strings, deobfuscate=False):
    """
    Shortcut for VBA_Scanner(vba_code).scan(...): analyze the provided VBA
    code to detect suspicious keywords, auto-executable macros, IOC patterns
    and obfuscation patterns such as hex-encoded strings.

    :param vba_code: str, VBA source code to be analyzed
    :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content.
    :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
    :return: list of tuples (type, keyword, description), as returned by VBA_Scanner.scan
    """
    scanner = VBA_Scanner(vba_code)
    return scanner.scan(include_decoded_strings, deobfuscate)
2317 -  
2318 -  
2319 -#=== CLASSES =================================================================  
2320 -  
2321 -class VBA_Parser(object):  
2322 - """  
2323 - Class to parse MS Office files, to detect VBA macros and extract VBA source code  
2324 - Supported file formats:  
2325 - - Word 97-2003 (.doc, .dot)  
2326 - - Word 2007+ (.docm, .dotm)  
2327 - - Word 2003 XML (.xml)  
2328 - - Word MHT - Single File Web Page / MHTML (.mht)  
2329 - - Excel 97-2003 (.xls)  
2330 - - Excel 2007+ (.xlsm, .xlsb)  
2331 - - PowerPoint 97-2003 (.ppt)  
2332 - - PowerPoint 2007+ (.pptm, .ppsm)  
2333 - """  
2334 -  
def __init__(self, filename, data=None, container=None, relaxed=False):
    """
    Constructor for VBA_Parser (method of class VBA_Parser)

    The file is probed as, in this order: OLE (then PowerPoint), Zip/OpenXML,
    Word 2003 XML, Flat OPC XML, MHT, and finally plain text.

    :param filename: filename or path of file to parse, or file-like object

    :param data: None or bytes str, if None the file will be read from disk (or from the file-like object).
        If data is provided as a bytes string, it will be parsed as the content of the file in memory,
        and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb').

    :param container: str, path and filename of container if the file is within
        a zip archive, None otherwise.

    :param relaxed: if True, treat mal-formed documents and missing streams more like MS office:
        do nothing; if False (default), raise errors in these cases

    raises a FileOpenError if all attempts to interpret the data header failed
    (or if the file is RTF), and a FileIsEncryptedError for encrypted OLE files
    """
    #TODO: filename should only be a string, data should be used for the file-like object
    #TODO: filename should be mandatory, optional data is a string or file-like object
    #TODO: also support olefile and zipfile as input
    # Either open the file from disk, or wrap the in-memory content in a
    # file-like object for olefile/zipfile:
    source = filename if data is None else BytesIO(data)

    self.filename = filename
    self.container = container
    self.relaxed = relaxed
    self.ole_file = None
    self.ole_subfiles = []
    self.type = None
    self.vba_projects = None
    self.vba_forms = None
    self.contains_macros = None  # will be set to True or False by detect_macros
    self.vba_code_all_modules = None  # to store the source code of all modules
    # list of tuples for each module: (subfilename, stream_path, vba_filename, vba_code)
    self.modules = None
    # Analysis results: list of tuples (type, keyword, description) - See VBA_Scanner
    self.analysis_results = None
    # statistics for the scan summary and flags
    self.nb_macros = 0
    self.nb_autoexec = 0
    self.nb_suspicious = 0
    self.nb_iocs = 0
    self.nb_hexstrings = 0
    self.nb_base64strings = 0
    self.nb_dridexstrings = 0
    self.nb_vbastrings = 0

    if olefile.isOleFile(source):
        # This looks like an OLE file
        self.open_ole(source)

        # check whether file is encrypted (need to do this before try ppt)
        log.debug('Check encryption of ole file')
        crypt_indicator = oleid.OleID(self.ole_file).check_encrypted()
        if crypt_indicator.value:
            raise FileIsEncryptedError(filename)

        # if this worked, try whether it is a ppt file (special ole file)
        self.open_ppt()
    if self.type is None and is_zipfile(source):
        # Zip file, which may be an OpenXML document
        self.open_openxml(source)
    if self.type is None:
        # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML,
        # or a plain text file containing VBA code
        if data is None:
            with open(filename, 'rb') as file_handle:
                data = file_handle.read()
        # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
        if b'http://schemas.microsoft.com/office/word/2003/wordml' in data:
            self.open_word2003xml(data)
        # check if it is a Word/PowerPoint 2007+ XML file (Flat OPC): must contain the namespace
        if b'http://schemas.microsoft.com/office/2006/xmlPackage' in data:
            self.open_flatopc(data)
        # store a lowercase version for the next tests:
        data_lowercase = data.lower()
        # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"):
        # According to my tests, these files usually start with "MIME-Version: 1.0" on the 1st line
        # BUT Word accepts a blank line or other MIME headers inserted before,
        # and even whitespaces in between "MIME", "-", "Version" and ":". The version number is ignored.
        # And the line is case insensitive.
        # so we'll just check the presence of mime, version and multipart anywhere:
        if self.type is None and all(
                marker in data_lowercase
                for marker in (b'mime', b'version', b'multipart')):
            self.open_mht(data)
        #TODO: handle exceptions
        #TODO: Excel 2003 XML
        # Check whether this is rtf
        if rtfobj.is_rtf(data, treat_str_as_data=True):
            # Ignore RTF since it contains no macros and methods in here will not find macros
            # in embedded objects. run rtfobj and repeat on its output.
            message = '%s is RTF, need to run rtfobj.py and find VBA Macros in its output.' % self.filename
            log.info(message)
            raise FileOpenError(message)
        # Check if this is a plain text VBA or VBScript file:
        # To avoid scanning binary files, we simply check for some control chars:
        if self.type is None and b'\x00' not in data:
            self.open_text(data)
    if self.type is None:
        # At this stage, could not match a known format:
        message = '%s is not a supported file type, cannot extract VBA Macros.' % self.filename
        log.info(message)
        raise FileOpenError(message)
2451 -  
def open_ole(self, _file):
    """
    Open an OLE file (method of class VBA_Parser)

    On success the olefile object is stored in self.ole_file and self.type is
    set to TYPE_OLE. IOError, TypeError and ValueError raised while parsing
    are logged and swallowed, leaving self.type unchanged.

    :param _file: filename or file contents in a file object
    :return: nothing
    """
    log.info('Opening OLE file %s' % self.filename)
    try:
        # Open and parse the OLE file, using unicode for path names:
        self.ole_file = olefile.OleFileIO(_file, path_encoding=None)
    except (IOError, TypeError, ValueError) as exc:
        # TODO: handle OLE parsing exceptions
        log.info('Failed OLE parsing for file %r (%s)' % (self.filename, exc))
        log.debug('Trace:', exc_info=True)
    else:
        # set type only if parsing succeeds
        self.type = TYPE_OLE
2468 -  
2469 -  
def open_openxml(self, _file):
    """
    Open an OpenXML file (method of class VBA_Parser)

    Every member of the zip archive whose first bytes match the OLE magic is
    parsed with a new VBA_Parser, which is appended to self.ole_subfiles.
    self.type is set to TYPE_OpenXML only if the whole archive was processed.

    :param _file: filename or file contents in a file object
    :return: nothing
    :raises SubstreamOpenError: if an embedded OLE file cannot be parsed and
        self.relaxed is False
    """
    # This looks like a zip file, need to look for vbaProject.bin inside
    # It can be any OLE file inside the archive
    #...because vbaProject.bin can be renamed:
    # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
    log.info('Opening ZIP/OpenXML file %s' % self.filename)
    try:
        # the with-statement closes the archive even when an exception is raised
        with zipfile.ZipFile(_file) as z:
            #TODO: check if this is actually an OpenXML file
            #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically
            # check each file within the zip if it is an OLE file, by reading its magic:
            for subfile in z.namelist():
                with z.open(subfile) as file_handle:
                    magic = file_handle.read(len(olefile.MAGIC))
                if magic != olefile.MAGIC:
                    continue
                log.debug('Opening OLE file %s within zip' % subfile)
                with z.open(subfile) as file_handle:
                    ole_data = file_handle.read()
                try:
                    self.ole_subfiles.append(
                        VBA_Parser(filename=subfile, data=ole_data,
                                   relaxed=self.relaxed))
                except OlevbaBaseException as exc:
                    if not self.relaxed:
                        raise SubstreamOpenError(self.filename, subfile,
                                                 exc)
                    log.info('%s is not a valid OLE file (%s)' % (subfile, exc))
                    log.debug('Trace:', exc_info=True)
        # set type only if parsing succeeds
        self.type = TYPE_OpenXML
    except OlevbaBaseException as exc:
        if self.relaxed:
            log.info('Error {0} caught in Zip/OpenXML parsing for file {1}'
                     .format(exc, self.filename))
            log.debug('Trace:', exc_info=True)
        else:
            raise
    except (RuntimeError, zipfile.BadZipfile, zipfile.LargeZipFile, IOError) as exc:
        # TODO: handle parsing exceptions
        log.info('Failed Zip/OpenXML parsing for file %r (%s)'
                 % (self.filename, exc))
        log.debug('Trace:', exc_info=True)
2520 -  
def open_word2003xml(self, data):
    """
    Open a Word 2003 XML file (method of class VBA_Parser)

    Every binData element that holds a valid MSO (ActiveMime) file is
    decompressed and parsed with a new VBA_Parser, which is appended to
    self.ole_subfiles. self.type is set to TYPE_Word2003_XML only if the
    whole document was processed.

    :param data: file contents in a string or bytes
    :return: nothing
    :raises SubstreamOpenError: if an embedded OLE container cannot be parsed
        and self.relaxed is False
    """
    log.info('Opening Word 2003 XML file %s' % self.filename)
    try:
        # parse the XML content
        # TODO: handle XML parsing exceptions
        et = ET.fromstring(data)
        # find all the binData elements. Element.getiterator() was removed in
        # Python 3.9; the resulting AttributeError would be swallowed by the
        # generic handler below, so iter() is used as in open_flatopc.
        for bindata in et.iter(TAG_BINDATA):
            # the binData content is an OLE container for the VBA project, compressed
            # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
            # get the filename:
            fname = bindata.get(ATTR_NAME, 'noname.mso')
            # decode the base64 activemime
            mso_data = binascii.a2b_base64(bindata.text)
            if is_mso_file(mso_data):
                # decompress the zlib data stored in the MSO file, which is the OLE container:
                # TODO: handle different offsets => separate function
                try:
                    ole_data = mso_file_extract(mso_data)
                    self.ole_subfiles.append(
                        VBA_Parser(filename=fname, data=ole_data,
                                   relaxed=self.relaxed))
                except OlevbaBaseException as exc:
                    if self.relaxed:
                        log.info('Error parsing subfile {0}: {1}'
                                 .format(fname, exc))
                        log.debug('Trace:', exc_info=True)
                    else:
                        raise SubstreamOpenError(self.filename, fname, exc)
            else:
                log.info('%s is not a valid MSO file' % fname)
        # set type only if parsing succeeds
        self.type = TYPE_Word2003_XML
    except OlevbaBaseException as exc:
        if self.relaxed:
            log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
            log.debug('Trace:', exc_info=True)
        else:
            raise
    except Exception as exc:
        # TODO: differentiate exceptions for each parsing stage
        # (but ET is different libs, no good exception description in API)
        # found: XMLSyntaxError
        log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
        log.debug('Trace:', exc_info=True)
2571 -  
def open_flatopc(self, data):
    """
    Open a Word or PowerPoint 2007+ XML file, aka "Flat OPC".

    Every pkg:part element whose content type is CTYPE_VBAPROJECT is
    searched for binary data elements; each one is Base64-decoded and
    wrapped in a VBA_Parser appended to self.ole_subfiles.
    self.type is set to TYPE_FlatOPC_XML only when the whole document has
    been processed without error.

    :param data: file contents in a string or bytes
    :return: nothing
    :raises SubstreamOpenError: if a subfile cannot be parsed and
        self.relaxed is false
    """
    log.info('Opening Flat OPC Word/PowerPoint XML file %s' % self.filename)
    try:
        # parse the XML content
        # TODO: handle XML parsing exceptions
        root = ET.fromstring(data)
        # TODO: check root node namespace and tag
        # walk through all the pkg:part elements:
        for part in root.iter(TAG_PKGPART):
            fname = part.get(ATTR_PKG_NAME, 'unknown')
            content_type = part.get(ATTR_PKG_CONTENTTYPE, 'unknown')
            if content_type != CTYPE_VBAPROJECT:
                # only VBA project parts are of interest
                continue
            for bindata in part.iterfind(TAG_PKGBINDATA):
                try:
                    ole_data = binascii.a2b_base64(bindata.text)
                    subparser = VBA_Parser(filename=fname, data=ole_data,
                                           relaxed=self.relaxed)
                    self.ole_subfiles.append(subparser)
                except OlevbaBaseException as exc:
                    if not self.relaxed:
                        raise SubstreamOpenError(self.filename, fname, exc)
                    # relaxed mode: report the problem and keep going
                    log.info('Error parsing subfile {0}: {1}'
                             .format(fname, exc))
                    log.debug('Trace:', exc_info=True)
        # set type only if parsing succeeds
        self.type = TYPE_FlatOPC_XML
    except OlevbaBaseException as exc:
        if not self.relaxed:
            raise
        log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
        log.debug('Trace:', exc_info=True)
    except Exception as exc:
        # TODO: differentiate exceptions for each parsing stage
        # (but ET is different libs, no good exception description in API)
        # found: XMLSyntaxError
        log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
        log.debug('Trace:', exc_info=True)
2616 -  
def open_mht(self, data):
    """
    Open a MHTML file and collect the VBA projects attached to it.

    The data is parsed as a MIME message; each part whose decoded payload
    is recognised by is_mso_file() is extracted with mso_file_extract() and
    wrapped in a VBA_Parser appended to self.ole_subfiles.
    self.type is set to TYPE_MHTML only when all parts have been processed
    without error.

    :param data: file contents in a string or bytes
    :return: nothing
    :raises SubstreamOpenError: if a subfile cannot be parsed and
        self.relaxed is false
    """
    log.info('Opening MHTML file %s' % self.filename)
    try:
        if isinstance(data,bytes):
            data = data.decode('utf8', 'backslashreplace')
        # parse the MIME content
        # remove any leading whitespace or newline (workaround for issue in email package)
        stripped_data = data.lstrip('\r\n\t ')
        # strip any junk from the beginning of the file
        # (issue #31 fix by Greg C - gdigreg)
        # TODO: improve keywords to avoid false positives
        mime_offset = stripped_data.find('MIME')
        content_offset = stripped_data.find('Content')
        header_start = None
        if -1 < mime_offset <= content_offset:
            # "MIME" is found, and located before "Content"
            header_start = mime_offset
        elif content_offset > -1:
            # "Content" is found, and before "MIME"
            # TODO: can it work without "MIME" at all?
            header_start = content_offset
        if header_start is not None:
            stripped_data = stripped_data[header_start:]
        # TODO: quick and dirty fix: insert a standard line with MIME-Version header?
        mhtml = email.message_from_string(stripped_data)
        # go through all the attached files:
        for part in mhtml.walk():
            content_type = part.get_content_type()  # always returns a value
            fname = part.get_filename(None)  # returns None if it fails
            # TODO: get content-location if no filename
            log.debug('MHTML part: filename=%r, content-type=%r' % (fname, content_type))
            part_data = part.get_payload(decode=True)
            # VBA macros are stored in a binary file named "editdata.mso".
            # the data content is an OLE container for the VBA project, compressed
            # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
            # check ActiveMime header:
            if not (isinstance(part_data, (str, bytes)) and is_mso_file(part_data)):
                # not an MSO container: only log some debugging information
                log.debug('type(part_data) = %s' % type(part_data))
                try:
                    log.debug('part_data[0:20] = %r' % part_data[0:20])
                except TypeError:
                    log.debug('part_data has no __getitem__')
                continue

            log.debug('Found ActiveMime header, decompressing MSO container')
            try:
                ole_data = mso_file_extract(part_data)
                # TODO: check if it is actually an OLE file
                # TODO: get the MSO filename from content_location?
                subparser = VBA_Parser(filename=fname, data=ole_data,
                                       relaxed=self.relaxed)
                self.ole_subfiles.append(subparser)
            except OlevbaBaseException as exc:
                if not self.relaxed:
                    raise SubstreamOpenError(self.filename, fname, exc)
                log.info('%s does not contain a valid OLE file (%s)'
                         % (fname, exc))
                log.debug('Trace:', exc_info=True)
                # TODO: bug here - need to split in smaller functions/classes?
        # set type only if parsing succeeds
        self.type = TYPE_MHTML
    except OlevbaBaseException:
        raise
    except Exception:
        log.info('Failed MIME parsing for file %r - %s'
                 % (self.filename, MSG_OLEVBA_ISSUES))
        log.debug('Trace:', exc_info=True)
2689 -  
def open_ppt(self):
    """
    Try to interpret self.ole_file as PowerPoint 97-2003 using PptParser.

    On success, each VBA data blob returned by the parser is wrapped in a
    VBA_Parser appended to self.ole_subfiles, self.ole_file is closed and
    set to None, and self.type is set to TYPE_PPT. Setting ole_file to None
    makes most other methods treat this object like an OpenXML file and
    only look at ole_subfiles (except find_vba_* which explicitly check
    self.type).

    Any exception is taken as "this is not a PPT file" and only logged.
    """

    log.info('Check whether OLE file is PPT')
    try:
        parser = ppt_parser.PptParser(self.ole_file, fast_fail=True)
        for vba_data in parser.iter_vba_data():
            subparser = VBA_Parser(None, vba_data, container='PptParser')
            self.ole_subfiles.append(subparser)
        log.info('File is PPT')
        self.ole_file.close()  # just in case
        self.ole_file = None   # required to make other methods look at ole_subfiles
        self.type = TYPE_PPT
    except Exception as exc:
        if self.container != 'PptParser':
            log.debug("File appears not to be a ppt file (%s)" % exc)
        else:
            # this is a subfile of a ppt --> to be expected that is no ppt
            log.debug('PPT subfile is not a PPT file')
2717 -  
2718 -  
def open_text(self, data):
    """
    Open a text file containing VBA or VBScript source code.

    The whole text is stored as-is in self.vba_code_all_modules (bytes are
    first decoded as UTF-8 with the 'backslashreplace' error handler), and
    the object is flagged as containing macros.

    :param data: file contents in a string or bytes
    :return: nothing
    """
    log.info('Opening text file %s' % self.filename)
    # the source code is stored directly, always as text:
    source = data.decode('utf8','backslashreplace') if isinstance(data, bytes) else data
    self.vba_code_all_modules = source
    self.contains_macros = True
    # set type only if parsing succeeds
    self.type = TYPE_TEXT
2733 -  
2734 -  
def find_vba_projects(self):
    """
    Find all the VBA projects stored in an OLE file.

    The result is cached in self.vba_projects and returned directly on
    later calls.

    Each project is described by a tuple (vba_root, project_path, dir_path):
    - vba_root is the path of the root OLE storage containing the VBA
      project, including a trailing slash unless it is the root of the
      OLE file.
    - project_path is the path of the OLE stream named "PROJECT" within
      the VBA project.
    - dir_path is the path of the OLE stream named "VBA/dir" within the
      VBA project.

    If this function returns an empty list for one of the supported formats
    (i.e. Word, Excel, Powerpoint), then the file does not contain VBA macros.

    :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path)
        for each VBA project found if OLE file
    """
    log.debug('VBA_Parser.find_vba_projects')

    # if the file is not OLE but OpenXML, return None:
    if self.ole_file is None and self.type != TYPE_PPT:
        return None

    # if this method has already been called, return previous result:
    if self.vba_projects is not None:
        return self.vba_projects

    # if this is a ppt file (PowerPoint 97-2003):
    # self.ole_file is None but the ole_subfiles do contain vba_projects
    # (like for OpenXML files).
    if self.type == TYPE_PPT:
        # TODO: so far, this function is never called for PPT files, but
        #       if that happens, the information is lost which ole file contains
        #       which storage!
        log.warning('Returned info is not complete for PPT types!')
        self.vba_projects = []
        for subparser in self.ole_subfiles:
            self.vba_projects.extend(subparser.find_vba_projects())
        return self.vba_projects

    # Find the VBA project root (different in MS Word, Excel, etc):
    # - Word 97-2003: Macros
    # - Excel 97-2003: _VBA_PROJECT_CUR
    # - PowerPoint 97-2003: PptParser has identified ole_subfiles
    # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin.
    # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word
    # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word
    # - Visio 2007: not supported yet (different file structure)

    # According to MS-OVBA section 2.2.1:
    # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream
    # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream
    # - all names are case-insensitive

    def check_vba_stream(ole, vba_root, stream_path):
        # return the full path of the stream if it exists, False otherwise
        full_path = vba_root + stream_path
        is_stream = ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM
        if not is_stream:
            log.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path)
            return False
        log.debug('Found %s stream: %s' % (stream_path, full_path))
        return full_path

    # streams that must all be present, checked in this order:
    required_streams = ('PROJECT', 'VBA/_VBA_PROJECT', 'VBA/dir')

    # start with an empty list:
    self.vba_projects = []
    # Look for any storage containing those storage/streams:
    ole = self.ole_file
    for storage in ole.listdir(streams=False, storages=True):
        log.debug('Checking storage %r' % storage)
        # Only storages ending with "VBA" are candidates:
        if storage[-1].upper() != 'VBA':
            continue
        log.debug('Found VBA storage: %s' % ('/'.join(storage)))
        vba_root = '/'.join(storage[:-1])
        # Add a trailing slash to vba_root, unless it is the root of the OLE file:
        # (used later to append all the child streams/storages)
        if vba_root != '':
            vba_root += '/'
        log.debug('Checking vba_root="%s"' % vba_root)

        # Check the required streams one by one, stopping at the first
        # one which is missing:
        found_paths = []
        for stream_path in required_streams:
            full_path = check_vba_stream(ole, vba_root, stream_path)
            if not full_path:
                break
            found_paths.append(full_path)
        if len(found_paths) != len(required_streams):
            continue
        project_path, _vba_project_path, dir_path = found_paths
        # Now we are pretty sure it is a VBA project structure
        log.debug('VBA root storage: "%s"' % vba_root)
        # append the results to the list as a tuple for later use:
        self.vba_projects.append((vba_root, project_path, dir_path))
    return self.vba_projects
2828 -  
def detect_vba_macros(self):
    """
    Detect the potential presence of VBA macros in the file, by checking
    if it contains VBA projects. Both OLE and OpenXML files are supported.

    The result is cached in self.contains_macros and returned directly on
    later calls.

    Important: for now, results are accurate only for Word, Excel and PowerPoint

    Note: this method does NOT attempt to check the actual presence or validity
    of VBA macro source code, so there might be false positives.
    It may also detect VBA macros in files embedded within the main file,
    for example an Excel workbook with macros embedded into a Word
    document without macros may be detected, without distinction.

    :return: bool, True if at least one VBA project has been found, False otherwise
    :raises SubstreamOpenError: if an OLE stream cannot be read and
        self.relaxed is false
    """
    #TODO: return None or raise exception if format not supported
    #TODO: return the number of VBA projects found instead of True/False?
    # if this method was already called, return the previous result:
    if self.contains_macros is not None:
        return self.contains_macros
    # if OpenXML/PPT, check all the OLE subfiles:
    if self.ole_file is None:
        for ole_subfile in self.ole_subfiles:
            if ole_subfile.detect_vba_macros():
                self.contains_macros = True
                return True
        # otherwise, no macro found:
        self.contains_macros = False
        return False
    # otherwise it's an OLE file, find VBA projects:
    vba_projects = self.find_vba_projects()
    self.contains_macros = len(vba_projects) != 0
    # Also look for VBA code in any stream including orphans
    # (happens in some malformed files)
    ole = self.ole_file
    for sid, d in enumerate(ole.direntries):
        log.debug('Checking DirEntry #%d' % sid)
        if d is None:
            # this direntry is not part of the tree: either unused or an orphan
            d = ole._load_direntry(sid)
            log.debug('This DirEntry is an orphan or unused')
        if d.entry_type == olefile.STGTY_STREAM:
            # read data
            log.debug('Reading data from stream %r - size: %d bytes' % (d.name, d.size))
            try:
                data = ole._open(d.isectStart, d.size).read()
                log.debug('Read %d bytes' % len(data))
                if len(data) > 200:
                    log.debug('%r...[much more data]...%r' % (data[:100], data[-50:]))
                else:
                    log.debug(repr(data))
                if 'Attribut\x00' in data.decode('utf-8', 'ignore'):
                    log.debug('Found VBA compressed code')
                    self.contains_macros = True
            except IOError as exc:
                if self.relaxed:
                    log.info('Error when reading OLE Stream %r' % d.name)
                    # exc_info (not "exc_trace") is the keyword understood
                    # by the logging module; any other keyword makes the
                    # logging call itself raise a TypeError.
                    log.debug('Trace:', exc_info=True)
                else:
                    raise SubstreamOpenError(self.filename, d.name, exc)
    return self.contains_macros
2895 -  
def extract_macros(self):
    """
    Extract and decompress source code for each VBA macro found in the file.

    Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found
    If the file is OLE, filename is the path of the file.
    If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros
    within the zip archive, e.g. word/vbaProject.bin.
    If the file is PPT, result is as for OpenXML but filename is useless.
    If the file is a text file, a single tuple holding the full text is yielded.
    """
    log.debug('extract_macros:')
    if self.ole_file is None:
        # This may be either an OpenXML/PPT or a text file:
        if self.type == TYPE_TEXT:
            # This is a text file, yield the full code:
            yield (self.filename, '', self.filename, self.vba_code_all_modules)
        else:
            # OpenXML/PPT: recursively yield results from each OLE subfile:
            for subparser in self.ole_subfiles:
                for results in subparser.extract_macros():
                    yield results
        return

    # From here on, this is an OLE file:
    self.find_vba_projects()
    # direntry ids of the streams extracted from the VBA projects:
    extracted_ids = set()
    for vba_root, project_path, dir_path in self.vba_projects:
        # extract all VBA macros from that VBA root storage:
        # The function _extract_vba may fail on some files (issue #132)
        try:
            vba_streams = _extract_vba(self.ole_file, vba_root, project_path,
                                       dir_path, self.relaxed)
            for stream_path, vba_filename, vba_code in vba_streams:
                # remember the direntry id of that stream:
                extracted_ids.add(self.ole_file._find(stream_path))
                yield (self.filename, stream_path, vba_filename, vba_code)
        except Exception:
            log.exception('Error in _extract_vba')
    # Also look for VBA code in any stream including orphans
    # (happens in some malformed files)
    ole = self.ole_file
    nb_direntries = len(ole.direntries)
    for sid in xrange(nb_direntries):
        log.debug('Checking DirEntry #%d' % sid)
        # skip the streams already handled above:
        if sid in extracted_ids:
            log.debug('Already extracted')
            continue
        direntry = ole.direntries[sid]
        if direntry is None:
            # this direntry is not part of the tree: either unused or an orphan
            direntry = ole._load_direntry(sid)
            log.debug('This DirEntry is an orphan or unused')
        if direntry.entry_type != olefile.STGTY_STREAM:
            continue
        # read data
        log.debug('Reading data from stream %r' % direntry.name)
        data = ole._open(direntry.isectStart, direntry.size).read()
        for match in re.finditer(b'\\x00Attribut[^e]', data, flags=re.IGNORECASE):
            # the candidate compressed container starts 3 bytes before the match
            start = match.start() - 3
            log.debug('Found VBA compressed code at index %X' % start)
            compressed_code = data[start:]
            try:
                vba_code = decompress_stream(bytearray(compressed_code))
                yield (self.filename, direntry.name, direntry.name, vba_code)
            except Exception as exc:
                # display the exception with full stack trace for debugging
                log.debug('Error processing stream %r in file %r (%s)'
                          % (direntry.name, self.filename, exc))
                log.debug('Traceback:', exc_info=True)
                # do not raise the error, as it is unlikely to be a compressed macro stream
2964 -  
def extract_all_macros(self):
    """
    Run extract_macros() once and keep its results.

    The tuples (filename, stream_path, vba_filename, vba_code) yielded by
    extract_macros() are stored as a list in self.modules, and their number
    in self.nb_macros. If self.modules is already set, it is returned
    without extracting again. See extract_macros for details.

    :return: list of tuples (filename, stream_path, vba_filename, vba_code)
    """
    if self.modules is not None:
        # already extracted by a previous call
        return self.modules
    collected = []
    # the list is attached before iterating, so that it is in place even
    # if the extraction is interrupted by an exception:
    self.modules = collected
    for entry in self.extract_macros():
        subfilename, stream_path, vba_filename, vba_code = entry
        collected.append((subfilename, stream_path, vba_filename, vba_code))
    self.nb_macros = len(collected)
    return self.modules
2978 -  
2979 -  
2980 -  
def analyze_macros(self, show_decoded_strings=False, deobfuscate=False):
    """
    Run extract_all_macros and analyze the source code of all VBA macros
    found in the file, together with the strings found in VBA forms.

    All results are stored in self.analysis_results, and the nb_* counters
    are updated from the scanner summary.
    If called more than once, simply returns the previous results.
    If detect_vba_macros() reports no macro, self.analysis_results is
    returned unchanged.

    :param show_decoded_strings: bool, passed to VBA_Scanner.scan
    :param deobfuscate: bool, passed to VBA_Scanner.scan
    :return: the content of self.analysis_results
    """
    if not self.detect_vba_macros():
        return self.analysis_results
    # if the analysis was already done, avoid doing it twice:
    if self.analysis_results is not None:
        return self.analysis_results
    # variable to merge source code from all modules:
    if self.vba_code_all_modules is None:
        self.vba_code_all_modules = ''
    for module in self.extract_all_macros():
        vba_code = module[3]
        #TODO: filter code? (each module)
        if isinstance(vba_code, bytes):
            vba_code = vba_code.decode('utf-8', 'ignore')
        self.vba_code_all_modules += vba_code + '\n'
    for form in self.extract_form_strings():
        form_string = form[2]
        self.vba_code_all_modules += form_string.decode('utf-8', 'ignore') + '\n'
    # Analyze the whole code at once:
    scanner = VBA_Scanner(self.vba_code_all_modules)
    self.analysis_results = scanner.scan(show_decoded_strings, deobfuscate)
    (autoexec, suspicious, iocs, hexstrings,
     base64strings, dridex, vbastrings) = scanner.scan_summary()
    # update the counters:
    self.nb_autoexec += autoexec
    self.nb_suspicious += suspicious
    self.nb_iocs += iocs
    self.nb_hexstrings += hexstrings
    self.nb_base64strings += base64strings
    self.nb_dridexstrings += dridex
    self.nb_vbastrings += vbastrings

    return self.analysis_results
3015 -  
3016 -  
def reveal(self):
    """
    Return the VBA source code of all modules, in which each obfuscated
    expression reported by the analysis as a 'VBA string' is replaced by
    its decoded value, written as a VBA string literal.

    The code is taken from self.vba_code_all_modules, after
    vba_collapse_long_lines() and filter_vba() have been applied to it.

    :return: the deobfuscated source code
    """
    # we only want printable strings:
    analysis = self.analyze_macros(show_decoded_strings=False)

    # to avoid replacing short strings contained into longer strings, the
    # analysis results are sorted on the length of the encoded string,
    # longest first:
    def encoded_length(result):
        return len(result[2])
    analysis = sorted(analysis, key=encoded_length, reverse=True)
    # normally now self.vba_code_all_modules contains source code from all modules
    # Need to collapse long lines:
    deobf_code = filter_vba(vba_collapse_long_lines(self.vba_code_all_modules))
    for kw_type, decoded, encoded in analysis:
        if kw_type != 'VBA string':
            continue
        # double quotes are escaped as double-double-quotes for VBA, then
        # the decoded string is enclosed in double quotes:
        replacement = '"%s"' % decoded.replace('"', '""')
        # if the encoded string is enclosed in parentheses,
        # keep them in the decoded version:
        if encoded.startswith('(') and encoded.endswith(')'):
            replacement = '(%s)' % replacement
        deobf_code = deobf_code.replace(encoded, replacement)
    # # TODO: there is a bug somewhere which creates double returns '\r\r'
    # deobf_code = deobf_code.replace('\r\r', '\r')
    #TODO: run the analysis several times if hex or base64 strings are revealed
    return deobf_code
3043 -  
3044 -  
def find_vba_forms(self):
    """
    Find all the VBA forms stored in an OLE file.

    A storage is considered to be a VBA form when it contains both a
    stream named "o" and a stream named "f". The result is stored in
    self.vba_forms, and recomputed on each call.

    If this function returns an empty list for one of the supported formats
    (i.e. Word, Excel, Powerpoint), then the file does not contain VBA forms.

    :return: None if OpenXML file, otherwise list of the storages found,
        each one in the form returned by listdir()
    """
    log.debug('VBA_Parser.find_vba_forms')

    # if the file is not OLE but OpenXML, return None:
    if self.ole_file is None and self.type != TYPE_PPT:
        return None

    # if this method has already been called, return previous result:
    # if self.vba_projects is not None:
    #     return self.vba_projects

    # According to MS-OFORMS section 2.1.2 Control Streams:
    # - A parent control, that is, a control that can contain embedded controls,
    #   MUST be persisted as a storage that contains multiple streams.
    # - All parent controls MUST contain a FormControl. The FormControl
    #   properties are persisted to a stream (1) as specified in section 2.1.1.2.
    #   The name of this stream (1) MUST be "f".
    # - Embedded controls that cannot themselves contain other embedded
    #   controls are persisted sequentially as FormEmbeddedActiveXControls
    #   to a stream (1) contained in the same storage as the parent control.
    #   The name of this stream (1) MUST be "o".
    # - all names are case-insensitive

    if self.type != TYPE_PPT:
        ole_files = [self.ole_file, ]
    else:
        # TODO: so far, this function is never called for PPT files, but
        #       if that happens, the information is lost which ole file contains
        #       which storage!
        ole_files = self.ole_subfiles
        log.warning('Returned info is not complete for PPT types!')

    # start with an empty list:
    self.vba_forms = []

    # Loop over ole streams
    for ole in ole_files:

        def is_stream(path):
            # True if path exists in the current ole file and is a stream
            return ole.exists(path) and ole.get_type(path) == olefile.STGTY_STREAM

        # Look for any storage containing those storage/streams:
        for storage in ole.listdir(streams=False, storages=True):
            log.debug('Checking storage %r' % storage)
            # Look for two streams named 'o' and 'f':
            o_stream = storage + ['o']
            f_stream = storage + ['f']
            log.debug('Checking if streams %r and %r exist' % (f_stream, o_stream))
            if not (is_stream(o_stream) and is_stream(f_stream)):
                continue
            log.debug('Found VBA Form: %r' % '/'.join(storage))
            self.vba_forms.append(storage)
    return self.vba_forms
3111 -  
def extract_form_strings(self):
    """
    Extract printable strings from each VBA Form found in the file.

    Iterator: yields (filename, stream_path, form_string) for each printable
    string matched by re_printable_string in the "o" stream of a VBA form.
    If the file is OLE, filename is the path of the file.
    If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros
    within the zip archive, e.g. word/vbaProject.bin.
    If the file is PPT, result is as for OpenXML but filename is useless.
    If the file is a text file, nothing is yielded.
    """
    if self.ole_file is None:
        # This may be either an OpenXML/PPT or a text file:
        if self.type == TYPE_TEXT:
            # This is a text file, return no results:
            return
        # OpenXML/PPT: recursively yield results from each OLE subfile:
        for subparser in self.ole_subfiles:
            for results in subparser.extract_form_strings():
                yield results
        return

    # From here on, this is an OLE file:
    self.find_vba_forms()
    ole = self.ole_file
    for form_storage in self.vba_forms:
        o_stream = form_storage + ['o']
        o_stream_path = '/'.join(o_stream)
        log.debug('Opening form object stream %r' % o_stream_path)
        form_data = ole.openstream(o_stream).read()
        # Extract printable strings from the form object stream "o":
        for match in re_printable_string.finditer(form_data):
            log.debug('Printable string found in form: %r' % match.group())
            yield (self.filename, '/'.join(o_stream), match.group())
3144 -  
3145 -  
def close(self):
    """
    Close all the open files. This method must be called after usage, if
    the application is opening many files.

    When self.ole_file is set, it is closed; otherwise close() is called
    on each entry of self.ole_subfiles, if any.
    """
    if self.ole_file is not None:
        self.ole_file.close()
        return
    if self.ole_subfiles is None:
        return
    for subparser in self.ole_subfiles:
        subparser.close()
3157 -  
3158 -  
3159 -  
3160 -class VBA_Parser_CLI(VBA_Parser):  
3161 - """  
3162 - VBA parser and analyzer, adding methods for the command line interface  
3163 - of olevba. (see VBA_Parser)  
3164 - """  
3165 -  
def __init__(self, *args, **kwargs):
    """
    Build a VBA_Parser_CLI object.

    All positional and keyword arguments are forwarded unchanged to the
    __init__ of the parent class --> see doc there
    """
    parent = super(VBA_Parser_CLI, self)
    parent.__init__(*args, **kwargs)
3172 -  
3173 -  
def print_analysis(self, show_decoded_strings=False, deobfuscate=False):
    """
    Analyze the VBA code of the file with analyze_macros, and print the
    results in a table, or a short message when there is no result.

    :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
    :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
    :return: None
    """
    # print a waiting message only if the output is not redirected to a file:
    if sys.stdout.isatty():
        print('Analysis...\r', end='')
        sys.stdout.flush()
    results = self.analyze_macros(show_decoded_strings, deobfuscate)
    if not results:
        print('No suspicious keyword or IOC found.')
        return
    table = prettytable.PrettyTable(('Type', 'Keyword', 'Description'))
    table.align = 'l'
    table.max_width['Type'] = 10
    table.max_width['Keyword'] = 20
    table.max_width['Description'] = 39
    for kw_type, keyword, description in results:
        # non printable strings are displayed using their repr:
        if not is_printable(keyword):
            keyword = repr(keyword)
        if not is_printable(description):
            description = repr(description)
        table.add_row((kw_type, keyword, description))
    print(table)
3204 -  
def print_analysis_json(self, show_decoded_strings=False, deobfuscate=False):
    """
    Analyze the VBA code of the file with analyze_macros, and return the
    results in a form suitable for json output.

    :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
    :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)

    :return: list of dict, one per result, with keys type, keyword and description
    """
    # print a waiting message only if the output is not redirected to a file:
    if sys.stdout.isatty():
        print('Analysis...\r', end='')
        sys.stdout.flush()
    json_results = []
    for kw_type, keyword, description in self.analyze_macros(show_decoded_strings, deobfuscate):
        json_results.append({'type': kw_type,
                             'keyword': keyword,
                             'description': description})
    return json_results
3221 -  
def process_file(self, show_decoded_strings=False,
                 display_code=True, hide_attributes=True,
                 vba_code_only=False, show_deobfuscated_code=False,
                 deobfuscate=False):
    """
    Process a single file: print its type, the source code of each VBA
    macro, the strings found in VBA forms and the analysis results.

    :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
    :param display_code: bool, if False VBA source code is not displayed (default True)
    :param hide_attributes: bool, if True the macro code is passed through filter_vba
        before being displayed (default)
    :param vba_code_only: bool, if True the analysis is not printed; this forces
        display_code to True
    :param show_deobfuscated_code: bool, if True the result of reveal() is printed
        after the analysis (ignored if vba_code_only is True)
    :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
    :raises ProcessingError: if any exception other than an
        OlevbaBaseException occurs while processing the file
    """
    #TODO: replace print by writing to a provided output file (sys.stdout by default)
    # fix conflicting parameters:
    if vba_code_only and not display_code:
        display_code = True
    if self.container:
        display_filename = '%s in %s' % (self.filename, self.container)
    else:
        display_filename = self.filename
    print('=' * 79)
    print('FILE: %s' % display_filename)
    try:
        #TODO: handle olefile errors, when an OLE file is malformed
        print('Type: %s'% self.type)
        if self.detect_vba_macros():
            #print 'Contains VBA Macros:'
            for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros():
                # Always work on text: the code must be decoded whether or
                # not attributes are hidden, otherwise the empty macro
                # check below never matches and a bytes repr is printed.
                if isinstance(vba_code, bytes):
                    vba_code = vba_code.decode('utf-8', 'backslashreplace')
                if hide_attributes:
                    # hide attribute lines:
                    vba_code_filtered = filter_vba(vba_code)
                else:
                    vba_code_filtered = vba_code
                print('-' * 79)
                print('VBA MACRO %s ' % vba_filename)
                print('in file: %s - OLE stream: %s' % (subfilename, repr(stream_path)))
                if display_code:
                    print('- ' * 39)
                    # detect empty macros:
                    if vba_code_filtered.strip() == '':
                        print('(empty macro)')
                    else:
                        print(vba_code_filtered)
            for (subfilename, stream_path, form_string) in self.extract_form_strings():
                print('-' * 79)
                print('VBA FORM STRING IN %r - OLE stream: %r' % (subfilename, stream_path))
                print('- ' * 39)
                print(form_string.decode('utf-8', 'ignore'))
            if not vba_code_only:
                # analyse the code from all modules at once:
                self.print_analysis(show_decoded_strings, deobfuscate)
            if show_deobfuscated_code:
                print('MACRO SOURCE CODE WITH DEOBFUSCATED VBA STRINGS (EXPERIMENTAL):\n\n')
                print(self.reveal())
        else:
            print('No VBA macros found.')
    except OlevbaBaseException:
        raise
    except Exception as exc:
        # display the exception with full stack trace for debugging
        log.info('Error processing file %s (%s)' % (self.filename, exc))
        log.debug('Traceback:', exc_info=True)
        raise ProcessingError(self.filename, exc)
    print('')
3292 -  
3293 -  
def process_file_json(self, show_decoded_strings=False,
                      display_code=True, hide_attributes=True,
                      vba_code_only=False, show_deobfuscated_code=False,
                      deobfuscate=False):
    """
    Process a single file and return the results as a dict, ready to be
    serialized to JSON. Nothing is printed here: whatever process_file
    would display is stored in the returned dict instead.

    :param show_decoded_strings: bool, forwarded to print_analysis_json.
    :param display_code: bool, if False the 'code' entry of each macro is None (default True)
    :param hide_attributes: bool, if True the code is passed through filter_vba (default)
    :param vba_code_only: bool, if True no analysis is run; also forces display_code to True
    :param show_deobfuscated_code: bool, if True self.reveal() is stored under 'code_deobfuscated'
    :param deobfuscate: bool, forwarded to print_analysis_json and echoed under 'do_deobfuscate'
    :return: dict
    :raises ProcessingError: wraps any exception raised while processing
    """
    #TODO: fix conflicting parameters (?)
    if not display_code and vba_code_only:
        display_code = True

    # start from the "failed" state; the flag is flipped at the very end
    result = {
        'container': self.container if self.container else None,
        'file': self.filename,
        'json_conversion_successful': False,
        'analysis': None,
        'code_deobfuscated': None,
        'do_deobfuscate': deobfuscate,
    }

    try:
        #TODO: handle olefile errors, when an OLE file is malformed
        result['type'] = self.type
        macros = []
        if self.detect_vba_macros():
            for subfilename, stream_path, vba_filename, vba_code in self.extract_all_macros():
                if isinstance(vba_code, bytes):
                    vba_code = vba_code.decode('utf-8', 'backslashreplace')
                # optionally hide attribute lines:
                shown_code = filter_vba(vba_code) if hide_attributes else vba_code
                macros.append({
                    'vba_filename': vba_filename,
                    'subfilename': subfilename,
                    'ole_stream': stream_path,
                    'code': shown_code.strip() if display_code else None,
                })
            if not vba_code_only:
                # analyse the code from all modules at once:
                result['analysis'] = self.print_analysis_json(show_decoded_strings,
                                                              deobfuscate)
            if show_deobfuscated_code:
                result['code_deobfuscated'] = self.reveal()
        result['macros'] = macros
        result['json_conversion_successful'] = True
    except Exception as exc:
        # display the exception with full stack trace for debugging
        log.info('Error processing file %s (%s)' % (self.filename, exc))
        log.debug('Traceback:', exc_info=True)
        raise ProcessingError(self.filename, exc)

    return result
3368 -  
3369 -  
def process_file_triage(self, show_decoded_strings=False, deobfuscate=False):
    """
    Process a file in triage mode: print a single summary line made of a
    flags column followed by the file name.

    :param show_decoded_strings: bool, forwarded to analyze_macros.
    :param deobfuscate: bool, forwarded to analyze_macros.
    :raises ProcessingError: wraps any exception raised while processing
    """
    #TODO: replace print by writing to a provided output file (sys.stdout by default)
    try:
        #TODO: handle olefile errors, when an OLE file is malformed
        if self.detect_vba_macros():
            # print a waiting message only if the output is not redirected to a file:
            if sys.stdout.isatty():
                print('Analysis...\r', end='')
                sys.stdout.flush()
            self.analyze_macros(show_decoded_strings=show_decoded_strings,
                                deobfuscate=deobfuscate)
        type_tag = TYPE2TAG[self.type]
        # one letter per indicator, in display order; '-' when the indicator is unset
        indicators = (
            ('M', self.contains_macros),
            ('A', self.nb_autoexec),
            ('S', self.nb_suspicious),
            ('I', self.nb_iocs),
            ('H', self.nb_hexstrings),
            ('B', self.nb_base64strings),
            ('D', self.nb_dridexstrings),
            ('V', self.nb_vbastrings),
        )
        flags = type_tag + ''.join(letter if value else '-'
                                   for letter, value in indicators)

        print('%-12s %s' % (flags, self.filename))
    except Exception as exc:
        # display the exception with full stack trace for debugging only
        log.debug('Error processing file %s (%s)' % (self.filename, exc),
                  exc_info=True)
        raise ProcessingError(self.filename, exc)
3404 -  
3405 -  
3406 -#=== MAIN =====================================================================  
3407 -  
def parse_args(cmd_line_args=None):
    """
    Parse command line arguments (given ones or per default sys.argv).

    :param cmd_line_args: list of str to parse, or None to let optparse use sys.argv
    :return: tuple (options, args); options.loglevel is converted to the
        matching constant of the logging module.

    If no positional argument is given, the banner, the module docstring and
    the help are printed and the process exits with RETURN_WRONG_ARGS.
    An unknown log level is reported through parser.error (usage message,
    then exit) instead of crashing with a KeyError.
    """

    DEFAULT_LOG_LEVEL = "warning"  # Default log level
    LOG_LEVELS = {
        'debug':    logging.DEBUG,
        'info':     logging.INFO,
        'warning':  logging.WARNING,
        'error':    logging.ERROR,
        'critical': logging.CRITICAL
    }

    usage = 'usage: olevba [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)
    # parser.add_option('-o', '--outfile', dest='outfile',
    #     help='output file')
    # parser.add_option('-c', '--csv', dest='csv',
    #     help='export results to a CSV file')
    parser.add_option("-r", action="store_true", dest="recursive",
                      help='find files recursively in subdirectories.')
    parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
                      help='if the file is a zip archive, open all files from it, using the provided password (requires Python 2.6+)')
    parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
                      help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
    # output mode; could make this even simpler with add_option(type='choice') but that would make
    # cmd line interface incompatible...
    modes = optparse.OptionGroup(parser, title='Output mode (mutually exclusive)')
    modes.add_option("-t", '--triage', action="store_const", dest="output_mode",
                     const='triage', default='unspecified',
                     help='triage mode, display results as a summary table (default for multiple files)')
    modes.add_option("-d", '--detailed', action="store_const", dest="output_mode",
                     const='detailed', default='unspecified',
                     help='detailed mode, display full results (default for single file)')
    modes.add_option("-j", '--json', action="store_const", dest="output_mode",
                     const='json', default='unspecified',
                     help='json mode, detailed in json format (never default)')
    parser.add_option_group(modes)
    parser.add_option("-a", '--analysis', action="store_false", dest="display_code", default=True,
                      help='display only analysis results, not the macro source code')
    parser.add_option("-c", '--code', action="store_true", dest="vba_code_only", default=False,
                      help='display only VBA source code, do not analyze it')
    parser.add_option("--decode", action="store_true", dest="show_decoded_strings",
                      help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex, VBA).')
    parser.add_option("--attr", action="store_false", dest="hide_attributes", default=True,
                      help='display the attribute lines at the beginning of VBA source code')
    parser.add_option("--reveal", action="store_true", dest="show_deobfuscated_code",
                      help='display the macro source code after replacing all the obfuscated strings by their decoded content.')
    parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
                      help="logging level debug/info/warning/error/critical (default=%default)")
    parser.add_option('--deobf', dest="deobfuscate", action="store_true", default=False,
                      help="Attempt to deobfuscate VBA expressions (slow)")
    parser.add_option('--relaxed', dest="relaxed", action="store_true", default=False,
                      help="Do not raise errors if opening of substream fails")

    (options, args) = parser.parse_args(cmd_line_args)

    # Print help if no arguments are passed
    if len(args) == 0:
        # print banner with version
        python_version = '%d.%d.%d' % sys.version_info[0:3]
        print('olevba3 %s on Python %s - http://decalage.info/python/oletools' %
              (__version__, python_version))
        print(__doc__)
        parser.print_help()
        sys.exit(RETURN_WRONG_ARGS)

    # an unknown level is a usage error, not a crash
    if options.loglevel not in LOG_LEVELS:
        parser.error('invalid log level %r (expected debug, info, warning, '
                     'error or critical)' % options.loglevel)
    options.loglevel = LOG_LEVELS[options.loglevel]

    return options, args
3477 -  
3478 -  
def main(cmd_line_args=None):
    """
    Main function, called when olevba is run from the command line

    Optional argument: command line arguments to be forwarded to parse_args
    (which parses them with optparse). Per default (cmd_line_args=None),
    sys.argv is used. Option mainly added for unit-testing.

    This function does not return: it always ends with sys.exit(return_code).
    The return code is RETURN_OK when nothing failed, the code of the error
    when exactly one kind of failure was recorded first, RETURN_SEVERAL_ERRS
    once a second failure occurs, and RETURN_UNEXPECTED for an unhandled
    exception.
    """

    options, args = parse_args(cmd_line_args)

    # provide info about tool and its version
    if options.output_mode == 'json':
        # print first json entry with meta info and opening '['
        print_json(script_name='olevba', version=__version__,
                   url='http://decalage.info/python/oletools',
                   type='MetaInformation', _json_is_first=True)
    else:
        # print banner with version
        python_version = '%d.%d.%d' % sys.version_info[0:3]
        print('olevba3 %s on Python %s - http://decalage.info/python/oletools' %
              (__version__, python_version))

    logging.basicConfig(level=options.loglevel, format='%(levelname)-8s %(message)s')
    # enable logging in the modules:
    enable_logging()

    # with the option --reveal, make sure --deobf is also enabled:
    if options.show_deobfuscated_code and not options.deobfuscate:
        log.info('set --deobf because --reveal was set')
        options.deobfuscate = True
    if options.output_mode == 'triage' and options.show_deobfuscated_code:
        log.info('ignoring option --reveal in triage output mode')

    # Column headers (do not know how many files there will be yet, so if no output_mode
    # was specified, we will print triage for first file --> need these headers)
    if options.output_mode in ('triage', 'unspecified'):
        print('%-12s %-65s' % ('Flags', 'Filename'))
        print('%-12s %-65s' % ('-' * 11, '-' * 65))

    previous_container = None
    count = 0
    container = filename = data = None
    vba_parser = None
    return_code = RETURN_OK
    try:
        for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
                                                          zip_password=options.zip_password, zip_fname=options.zip_fname):
            # ignore directory names stored in zip files:
            if container and filename.endswith('/'):
                continue

            # handle errors from xglob
            if isinstance(data, Exception):
                if isinstance(data, PathNotFoundException):
                    if options.output_mode in ('triage', 'unspecified'):
                        print('%-12s %s - File not found' % ('?', filename))
                    elif options.output_mode != 'json':
                        log.error('Given path %r does not exist!' % filename)
                    return_code = RETURN_FILE_NOT_FOUND if return_code == 0 \
                        else RETURN_SEVERAL_ERRS
                else:
                    if options.output_mode in ('triage', 'unspecified'):
                        print('%-12s %s - Failed to read from zip file %s' % ('?', filename, container))
                    elif options.output_mode != 'json':
                        log.error('Exception opening/reading %r from zip file %r: %s'
                                  % (filename, container, data))
                    return_code = RETURN_XGLOB_ERR if return_code == 0 \
                        else RETURN_SEVERAL_ERRS
                if options.output_mode == 'json':
                    print_json(file=filename, type='error',
                               error=type(data).__name__, message=str(data))
                continue

            try:
                # close the previous file if analyzing several:
                # (this must be done here to avoid closing the file if there is only 1,
                # to fix issue #219)
                if vba_parser is not None:
                    vba_parser.close()
                # Open the file
                vba_parser = VBA_Parser_CLI(filename, data=data, container=container,
                                            relaxed=options.relaxed)

                if options.output_mode == 'detailed':
                    # fully detailed output
                    vba_parser.process_file(show_decoded_strings=options.show_decoded_strings,
                                            display_code=options.display_code,
                                            hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only,
                                            show_deobfuscated_code=options.show_deobfuscated_code,
                                            deobfuscate=options.deobfuscate)
                elif options.output_mode in ('triage', 'unspecified'):
                    # print container name when it changes:
                    if container != previous_container:
                        if container is not None:
                            print('\nFiles in %s:' % container)
                        previous_container = container
                    # summarized output for triage:
                    vba_parser.process_file_triage(show_decoded_strings=options.show_decoded_strings,
                                                   deobfuscate=options.deobfuscate)
                elif options.output_mode == 'json':
                    print_json(
                        vba_parser.process_file_json(show_decoded_strings=options.show_decoded_strings,
                                                     display_code=options.display_code,
                                                     hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only,
                                                     show_deobfuscated_code=options.show_deobfuscated_code,
                                                     deobfuscate=options.deobfuscate))
                else:  # (should be impossible)
                    raise ValueError('unexpected output mode: "{0}"!'.format(options.output_mode))
                # only files processed without error are counted
                count += 1

            except (SubstreamOpenError, UnexpectedDataError) as exc:
                if options.output_mode in ('triage', 'unspecified'):
                    print('%-12s %s - Error opening substream or unexpected '
                          'content' % ('?', filename))
                elif options.output_mode == 'json':
                    print_json(file=filename, type='error',
                               error=type(exc).__name__, message=str(exc))
                else:
                    log.exception('Error opening substream or unexpected '
                                  'content in %s' % filename)
                return_code = RETURN_OPEN_ERROR if return_code == 0 \
                    else RETURN_SEVERAL_ERRS
            except FileOpenError as exc:
                if options.output_mode in ('triage', 'unspecified'):
                    print('%-12s %s - File format not supported' % ('?', filename))
                elif options.output_mode == 'json':
                    print_json(file=filename, type='error',
                               error=type(exc).__name__, message=str(exc))
                else:
                    log.exception('Failed to open %s -- probably not supported!' % filename)
                return_code = RETURN_OPEN_ERROR if return_code == 0 \
                    else RETURN_SEVERAL_ERRS
            except ProcessingError as exc:
                if options.output_mode in ('triage', 'unspecified'):
                    print('%-12s %s - %s' % ('!ERROR', filename, exc.orig_exc))
                elif options.output_mode == 'json':
                    print_json(file=filename, type='error',
                               error=type(exc).__name__,
                               message=str(exc.orig_exc))
                else:
                    log.exception('Error processing file %s (%s)!'
                                  % (filename, exc.orig_exc))
                return_code = RETURN_PARSE_ERROR if return_code == 0 \
                    else RETURN_SEVERAL_ERRS
            except FileIsEncryptedError as exc:
                if options.output_mode in ('triage', 'unspecified'):
                    print('%-12s %s - File is encrypted' % ('!ERROR', filename))
                elif options.output_mode == 'json':
                    print_json(file=filename, type='error',
                               error=type(exc).__name__, message=str(exc))
                else:
                    log.exception('File %s is encrypted!' % (filename))
                return_code = RETURN_ENCRYPTED if return_code == 0 \
                    else RETURN_SEVERAL_ERRS
            # Here we do not close the vba_parser, because process_file may need it below.

        if options.output_mode == 'triage':
            print('\n(Flags: OpX=OpenXML, XML=Word2003XML, FlX=FlatOPC XML, MHT=MHTML, TXT=Text, M=Macros, '
                  'A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, '
                  'B=Base64 strings, D=Dridex strings, V=VBA strings, ?=Unknown)\n')

        if count == 1 and options.output_mode == 'unspecified':
            # if options -t, -d and -j were not specified and it's a single file, print details:
            vba_parser.process_file(show_decoded_strings=options.show_decoded_strings,
                                    display_code=options.display_code,
                                    hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only,
                                    show_deobfuscated_code=options.show_deobfuscated_code,
                                    deobfuscate=options.deobfuscate)

        if options.output_mode == 'json':
            # print last json entry (a last one without a comma) and closing ]
            print_json(type='MetaInformation', return_code=return_code,
                       n_processed=count, _json_is_last=True)

    except Exception as exc:
        # some unexpected error, maybe some of the types caught in except clauses
        # above were not sufficient. This is very bad, so log complete trace at exception level
        # and do not care about output mode
        log.exception('Unhandled exception in main: %s' % exc, exc_info=True)
        return_code = RETURN_UNEXPECTED  # even if there were others before -- this is more important
        # TODO: print msg with URL to report issues (except in JSON mode)

    # done. exit
    log.debug('will exit now with code %s' % return_code)
    sys.exit(return_code)
  20 +from oletools.olevba import __doc__, __version__
3665 21
3666 if __name__ == '__main__': 22 if __name__ == '__main__':
3667 main() 23 main()
3668 24
3669 -# This was coded while listening to "Dust" from I Love You But I've Chosen Darkness