Commit 8e1d03d7a18b0779ea73c1d4b13914c07220c37d

Authored by decalage2
1 parent a7309e59

olevba3: replaced by a redirection to olevba + deprecation warning (issue #106)

Showing 1 changed file with 6 additions and 3651 deletions
oletools/olevba3.py
1 1 #!/usr/bin/env python
2   -"""
3   -olevba3.py
4 2  
5   -olevba is a script to parse OLE and OpenXML files such as MS Office documents
6   -(e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate
7   -and analyze malicious macros.
  3 +# olevba3 is a stub that redirects to olevba.py, for backwards compatibility
8 4  
9   -olevba3 is the version of olevba that runs on Python 3.x.
10   -
11   -Supported formats:
12   -- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
13   -- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
14   -- PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm)
15   -- Word/PowerPoint 2007+ XML (aka Flat OPC)
16   -- Word 2003 XML (.xml)
17   -- Word/Excel Single File Web Page / MHTML (.mht)
18   -- Publisher (.pub)
19   -- raises an error if run with files encrypted using MS Crypto API RC4
20   -
21   -Author: Philippe Lagadec - http://www.decalage.info
22   -License: BSD, see source code or documentation
23   -
24   -olevba is part of the python-oletools package:
25   -http://www.decalage.info/python/oletools
26   -
27   -olevba is based on source code from officeparser by John William Davison
28   -https://github.com/unixfreak0037/officeparser
29   -"""
30   -
31   -# === LICENSE ==================================================================
32   -
33   -# olevba is copyright (c) 2014-2018 Philippe Lagadec (http://www.decalage.info)
34   -# All rights reserved.
35   -#
36   -# Redistribution and use in source and binary forms, with or without modification,
37   -# are permitted provided that the following conditions are met:
38   -#
39   -# * Redistributions of source code must retain the above copyright notice, this
40   -# list of conditions and the following disclaimer.
41   -# * Redistributions in binary form must reproduce the above copyright notice,
42   -# this list of conditions and the following disclaimer in the documentation
43   -# and/or other materials provided with the distribution.
44   -#
45   -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
46   -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
47   -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
48   -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
49   -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50   -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
51   -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
52   -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
53   -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
54   -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
55   -
56   -
57   -# olevba contains modified source code from the officeparser project, published
58   -# under the following MIT License (MIT):
59   -#
60   -# officeparser is copyright (c) 2014 John William Davison
61   -#
62   -# Permission is hereby granted, free of charge, to any person obtaining a copy
63   -# of this software and associated documentation files (the "Software"), to deal
64   -# in the Software without restriction, including without limitation the rights
65   -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
66   -# copies of the Software, and to permit persons to whom the Software is
67   -# furnished to do so, subject to the following conditions:
68   -#
69   -# The above copyright notice and this permission notice shall be included in all
70   -# copies or substantial portions of the Software.
71   -#
72   -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
73   -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
74   -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
75   -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
76   -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
77   -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
78   -# SOFTWARE.
79   -
80   -from __future__ import print_function
81   -
82   -#------------------------------------------------------------------------------
83   -# CHANGELOG:
84   -# 2014-08-05 v0.01 PL: - first version based on officeparser code
85   -# 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser
86   -# 2014-08-15 PL: - fixed incorrect value check in projecthelpfilepath Record
87   -# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats
88   -# and to find the VBA project root anywhere in the file
89   -# 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL
90   -# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API
91   -# - added detect_vba_macros
92   -# 2014-12-10 v0.06 PL: - hide first lines with VB attributes
93   -# - detect auto-executable macros
94   -# - ignore empty macros
95   -# 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive
96   -# 2014-12-15 v0.08 PL: - improved display for empty macros
97   -# - added pattern extraction
98   -# 2014-12-25 v0.09 PL: - added suspicious keywords detection
99   -# 2014-12-27 v0.10 PL: - added OptionParser, main and process_file
100   -# - uses xglob to scan several files with wildcards
101   -# - option -r to recurse subdirectories
102   -# - option -z to scan files in password-protected zips
103   -# 2015-01-02 v0.11 PL: - improved filter_vba to detect colons
104   -# 2015-01-03 v0.12 PL: - fixed detect_patterns to detect all patterns
105   -# - process_file: improved display, shows container file
106   -# - improved list of executable file extensions
107   -# 2015-01-04 v0.13 PL: - added several suspicious keywords, improved display
108   -# 2015-01-08 v0.14 PL: - added hex strings detection and decoding
109   -# - fixed issue #2, decoding VBA stream names using
110   -# specified codepage and unicode stream names
111   -# 2015-01-11 v0.15 PL: - added new triage mode, options -t and -d
112   -# 2015-01-16 v0.16 PL: - fix for issue #3 (exception when module name="text")
113   -# - added several suspicious keywords
114   -# - added option -i to analyze VBA source code directly
115   -# 2015-01-17 v0.17 PL: - removed .com from the list of executable extensions
116   -# - added scan_vba to run all detection algorithms
117   -# - decoded hex strings are now also scanned + reversed
118   -# 2015-01-23 v0.18 PL: - fixed issue #3, case-insensitive search in code_modules
119   -# 2015-01-24 v0.19 PL: - improved the detection of IOCs obfuscated with hex
120   -# strings and StrReverse
121   -# 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded
122   -# 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding
123   -# - improved display, shows obfuscation name
124   -# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename
125   -# - added Base64 obfuscation decoding (contribution from
126   -# @JamesHabben)
127   -# 2015-02-03 v0.23 PL: - triage now uses VBA_Scanner results, shows Base64 and
128   -# Dridex strings
129   -# - exception handling in detect_base64_strings
130   -# 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display
131   -# - display exceptions with stack trace
132   -# - added several suspicious keywords
133   -# - improved Base64 detection and decoding
134   -# - fixed triage mode not to scan attrib lines
135   -# 2015-03-04 v0.25 PL: - added support for Word 2003 XML
136   -# 2015-03-22 v0.26 PL: - added suspicious keywords for sandboxing and
137   -# virtualisation detection
138   -# 2015-05-06 v0.27 PL: - added support for MHTML files with VBA macros
139   -# (issue #10 reported by Greg from SpamStopsHere)
140   -# 2015-05-24 v0.28 PL: - improved support for MHTML files with modified header
141   -# (issue #11 reported by Thomas Chopitea)
142   -# 2015-05-26 v0.29 PL: - improved MSO files parsing, taking into account
143   -# various data offsets (issue #12)
144   -# - improved detection of MSO files, avoiding incorrect
145   -# parsing errors (issue #7)
146   -# 2015-05-29 v0.30 PL: - added suspicious keywords suggested by @ozhermit,
147   -# Davy Douhine (issue #9), issue #13
148   -# 2015-06-16 v0.31 PL: - added generic VBA expression deobfuscation (chr,asc,etc)
149   -# 2015-06-19 PL: - added options -a, -c, --each, --attr
150   -# 2015-06-21 v0.32 PL: - always display decoded strings which are printable
151   -# - fix VBA_Scanner.scan to return raw strings, not repr()
152   -# 2015-07-09 v0.40 PL: - removed usage of sys.stderr which causes issues
153   -# 2015-07-12 PL: - added Hex function decoding to VBA Parser
154   -# 2015-07-13 PL: - added Base64 function decoding to VBA Parser
155   -# 2015-09-06 PL: - improved VBA_Parser, refactored the main functions
156   -# 2015-09-13 PL: - moved main functions to a class VBA_Parser_CLI
157   -# - fixed issue when analysis was done twice
158   -# 2015-09-15 PL: - remove duplicate IOCs from results
159   -# 2015-09-16 PL: - join long VBA lines ending with underscore before scan
160   -# - disabled unused option --each
161   -# 2015-09-22 v0.41 PL: - added new option --reveal
162   -# - added suspicious strings for PowerShell.exe options
163   -# 2015-10-09 v0.42 PL: - VBA_Parser: split each format into a separate method
164   -# 2015-10-10 PL: - added support for text files with VBA source code
165   -# 2015-11-17 PL: - fixed bug with --decode option
166   -# 2015-12-16 PL: - fixed bug in main (no options input anymore)
167   -# - improved logging, added -l option
168   -# 2016-01-31 PL: - fixed issue #31 in VBA_Parser.open_mht
169   -# - fixed issue #32 by monkeypatching email.feedparser
170   -# 2016-02-07 PL: - KeyboardInterrupt is now raised properly
171   -# 2016-02-20 v0.43 PL: - fixed issue #34 in the VBA parser and vba_chr
172   -# 2016-02-29 PL: - added Workbook_Activate to suspicious keywords
173   -# 2016-03-08 v0.44 PL: - added VBA Form strings extraction and analysis
174   -# 2016-03-04 v0.45 CH: - added JSON output (by Christian Herdtweck)
175   -# 2016-03-16 CH: - added option --no-deobfuscate (temporary)
176   -# 2016-04-19 v0.46 PL: - new option --deobf instead of --no-deobfuscate
177   -# - updated suspicious keywords
178   -# 2016-05-04 v0.47 PL: - look for VBA code in any stream including orphans
179   -# 2016-04-28 CH: - return an exit code depending on the results
180   -# - improved error and exception handling
181   -# - improved JSON output
182   -# 2016-05-12 CH: - added support for PowerPoint 97-2003 files
183   -# 2016-06-06 CH: - improved handling of unicode VBA module names
184   -# 2016-06-07 CH: - added option --relaxed, stricter parsing by default
185   -# 2016-06-12 v0.50 PL: - fixed small bugs in VBA parsing code
186   -# 2016-07-01 PL: - fixed issue #58 with format() to support Python 2.6
187   -# 2016-07-29 CH: - fixed several bugs including #73 (Mac Roman encoding)
188   -# 2016-08-31 PL: - added autoexec keyword InkPicture_Painted
189   -# - detect_autoexec now returns the exact keyword found
190   -# 2016-09-05 PL: - added autoexec keywords for MS Publisher (.pub)
191   -# 2016-09-06 PL: - fixed issue #20, is_zipfile on Python 2.6
192   -# 2016-09-12 PL: - enabled packrat to improve pyparsing performance
193   -# 2016-10-25 PL: - fixed raise and print statements for Python 3
194   -# 2016-11-03 v0.51 PL: - added EnumDateFormats and EnumSystemLanguageGroupsW
195   -# 2017-02-07 PL: - temporary fix for issue #132
196   -# - added keywords for Mac-specific macros (issue #130)
197   -# 2017-03-08 PL: - fixed absolute imports
198   -# 2017-03-16 PL: - fixed issues #148 and #149 for option --reveal
199   -# 2017-05-19 PL: - added enable_logging to fix issue #154
200   -# 2017-05-31 c1fe: - PR #135 fixing issue #132 for some Mac files
201   -# 2017-06-08 PL: - fixed issue #122 Chr() with negative numbers
202   -# 2017-06-15 PL: - deobfuscation line by line to handle large files
203   -# 2017-07-11 v0.52 PL: - raise exception instead of sys.exit (issue #180)
204   -# 2017-11-20 PL: - fixed issue #219, do not close the file too early
205   -# 2017-11-24 PL: - added keywords to detect self-modifying macros and
206   -# attempts to disable macro security (issue #221)
207   -# 2018-03-19 PL: - removed pyparsing from the thirdparty subfolder
208   -# 2018-05-13 v0.53 PL: - added support for Word/PowerPoint 2007+ XML (FlatOPC)
209   -# (issue #283)
210   -# 2018-06-11 v0.53.1 MHW: - fixed #320: chr instead of unichr on python 3
211   -# 2018-06-12 MHW: - fixed #322: import reduce from functools
212   -# 2018-09-11 v0.54 PL: - olefile is now a dependency
213   -# 2018-10-25 CH: - detect encryption and raise error if detected
214   -
215   -__version__ = '0.54dev4'
216   -
217   -#------------------------------------------------------------------------------
218   -# TODO:
219   -# + setup logging (common with other oletools)
220   -# + add xor bruteforcing like bbharvest
221   -# + options -a and -c should imply -d
222   -
223   -# TODO later:
224   -# + performance improvement: instead of searching each keyword separately,
225   -# first split vba code into a list of words (per line), then check each
226   -# word against a dict. (or put vba words into a set/dict?)
227   -# + for regex, maybe combine them into a single re with named groups?
228   -# + add Yara support, include sample rules? plugins like balbuzard?
229   -# + add balbuzard support
230   -# + output to file (replace print by file.write, sys.stdout by default)
231   -# + look for VBA in embedded documents (e.g. Excel in Word)
232   -# + support SRP streams (see Lenny's article + links and sample)
233   -# - python 3.x support
234   -# - check VBA macros in Visio, Access, Project, etc
235   -# - extract_macros: convert to a class, split long function into smaller methods
236   -# - extract_macros: read bytes from stream file objects instead of strings
237   -# - extract_macros: use combined struct.unpack instead of many calls
238   -# - all except clauses should target specific exceptions
239   -
240   -#------------------------------------------------------------------------------
241   -# REFERENCES:
242   -# - [MS-OVBA]: Microsoft Office VBA File Format Structure
243   -# http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx
244   -# - officeparser: https://github.com/unixfreak0037/officeparser
245   -
246   -
247   -#--- IMPORTS ------------------------------------------------------------------
248   -
249   -import sys
250   -import os
251   -import logging
252   -import struct
253   -from io import BytesIO
254   -import math
255   -import zipfile
256   -import re
257   -import optparse
258   -import binascii
259   -import base64
260   -import zlib
261   -import email # for MHTML parsing
262   -import string # for printable
263   -import json # for json output mode (argument --json)
264   -
265   -# import lxml or ElementTree for XML parsing:
266   -try:
267   - # lxml: best performance for XML processing
268   - import lxml.etree as ET
269   -except ImportError:
270   - try:
271   - # Python 2.5+: batteries included
272   - import xml.etree.cElementTree as ET
273   - except ImportError:
274   - try:
275   - # Python <2.5: standalone ElementTree install
276   - import elementtree.cElementTree as ET
277   - except ImportError:
278   - raise ImportError("lxml or ElementTree are not installed, " \
279   - + "see http://codespeak.net/lxml " \
280   - + "or http://effbot.org/zone/element-index.htm")
281   -
282   -import colorclass
283   -
284   -# On Windows, colorclass needs to be enabled:
285   -if os.name == 'nt':
286   - colorclass.Windows.enable(auto_colors=True)
  5 +import sys, os, warnings
287 6  
  7 +warnings.warn('olevba3 is deprecated, olevba should be used instead.', DeprecationWarning)
288 8  
289 9 # IMPORTANT: it should be possible to run oletools directly as scripts
290 10 # in any directory without installing them with pip or setup.py.
... ... @@ -292,3378 +12,13 @@ if os.name == 'nt':
292 12 # And to enable Python 2+3 compatibility, we need to use absolute imports,
293 13 # so we add the oletools parent folder to sys.path (absolute+normalized path):
294 14 _thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__)))
295   -# print('_thismodule_dir = %r' % _thismodule_dir)
296 15 _parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..'))
297   -# print('_parent_dir = %r' % _thirdparty_dir)
298   -if not _parent_dir in sys.path:
  16 +if _parent_dir not in sys.path:
299 17 sys.path.insert(0, _parent_dir)
300 18  
301   -import olefile
302   -from oletools.thirdparty.prettytable import prettytable
303   -from oletools.thirdparty.xglob import xglob, PathNotFoundException
304   -from pyparsing import \
305   - CaselessKeyword, CaselessLiteral, Combine, Forward, Literal, \
306   - Optional, QuotedString,Regex, Suppress, Word, WordStart, \
307   - alphanums, alphas, hexnums,nums, opAssoc, srange, \
308   - infixNotation, ParserElement
309   -import oletools.ppt_parser as ppt_parser
310   -from oletools import rtfobj
311   -from oletools import oleid
312   -from oletools.common.errors import FileIsEncryptedError
313   -
314   -# monkeypatch email to fix issue #32:
315   -# allow header lines without ":"
316   -import email.feedparser
317   -email.feedparser.headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:?|[\t ])')
318   -
319   -# === PYTHON 2+3 SUPPORT ======================================================
320   -
321   -if sys.version_info[0] <= 2:
322   - # Python 2.x
323   - if sys.version_info[1] <= 6:
324   - # Python 2.6
325   - # use is_zipfile backported from Python 2.7:
326   - from thirdparty.zipfile27 import is_zipfile
327   - else:
328   - # Python 2.7
329   - from zipfile import is_zipfile
330   -else:
331   - # Python 3.x+
332   - from zipfile import is_zipfile
333   - # xrange is now called range:
334   - xrange = range
335   - # unichr does not exist anymore, only chr:
336   - unichr = chr
337   - from functools import reduce
338   -
339   -
340   -# === PYTHON 3.0 - 3.4 SUPPORT ======================================================
341   -
342   -# From https://gist.github.com/ynkdir/867347/c5e188a4886bc2dd71876c7e069a7b00b6c16c61
343   -
344   -if sys.version_info >= (3, 0) and sys.version_info < (3, 5):
345   - import codecs
346   -
347   - _backslashreplace_errors = codecs.lookup_error("backslashreplace")
348   -
349   - def backslashreplace_errors(exc):
350   - if isinstance(exc, UnicodeDecodeError):
351   - u = "".join("\\x{0:02x}".format(c) for c in exc.object[exc.start:exc.end])
352   - return (u, exc.end)
353   - return _backslashreplace_errors(exc)
354   -
355   - codecs.register_error("backslashreplace", backslashreplace_errors)
356   -
357   -
358   -# === LOGGING =================================================================
359   -
360   -class NullHandler(logging.Handler):
361   - """
362   - Log Handler without output, to avoid printing messages if logging is not
363   - configured by the main application.
364   - Python 2.7 has logging.NullHandler, but this is necessary for 2.6:
365   - see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library
366   - """
367   - def emit(self, record):
368   - pass
369   -
370   -def get_logger(name, level=logging.CRITICAL+1):
371   - """
372   - Create a suitable logger object for this module.
373   - The goal is not to change settings of the root logger, to avoid getting
374   - other modules' logs on the screen.
375   - If a logger exists with same name, reuse it. (Else it would have duplicate
376   - handlers and messages would be doubled.)
377   - The level is set to CRITICAL+1 by default, to avoid any logging.
378   - """
379   - # First, test if there is already a logger with the same name, else it
380   - # will generate duplicate messages (due to duplicate handlers):
381   - if name in logging.Logger.manager.loggerDict:
382   - #NOTE: another less intrusive but more "hackish" solution would be to
383   - # use getLogger then test if its effective level is not default.
384   - logger = logging.getLogger(name)
385   - # make sure level is OK:
386   - logger.setLevel(level)
387   - return logger
388   - # get a new logger:
389   - logger = logging.getLogger(name)
390   - # only add a NullHandler for this logger, it is up to the application
391   - # to configure its own logging:
392   - logger.addHandler(NullHandler())
393   - logger.setLevel(level)
394   - return logger
395   -
396   -# a global logger object used for debugging:
397   -log = get_logger('olevba')
398   -
399   -
400   -def enable_logging():
401   - """
402   - Enable logging for this module (disabled by default).
403   - This will set the module-specific logger level to NOTSET, which
404   - means the main application controls the actual logging level.
405   - """
406   - log.setLevel(logging.NOTSET)
407   - # Also enable logging in the ppt_parser module:
408   - ppt_parser.enable_logging()
409   -
410   -
411   -
412   -#=== EXCEPTIONS ==============================================================
413   -
414   -class OlevbaBaseException(Exception):
415   - """ Base class for exceptions produced here for simpler except clauses """
416   - def __init__(self, msg, filename=None, orig_exc=None, **kwargs):
417   - if orig_exc:
418   - super(OlevbaBaseException, self).__init__(msg +
419   - ' ({0})'.format(orig_exc),
420   - **kwargs)
421   - else:
422   - super(OlevbaBaseException, self).__init__(msg, **kwargs)
423   - self.msg = msg
424   - self.filename = filename
425   - self.orig_exc = orig_exc
426   -
427   -
428   -class FileOpenError(OlevbaBaseException):
429   - """ raised by VBA_Parser constructor if all open_... attempts failed
430   -
431   - probably means the file type is not supported
432   - """
433   -
434   - def __init__(self, filename, orig_exc=None):
435   - super(FileOpenError, self).__init__(
436   - 'Failed to open file %s' % filename, filename, orig_exc)
437   -
438   -
439   -class ProcessingError(OlevbaBaseException):
440   - """ raised by VBA_Parser.process_file* functions """
441   -
442   - def __init__(self, filename, orig_exc):
443   - super(ProcessingError, self).__init__(
444   - 'Error processing file %s' % filename, filename, orig_exc)
445   -
446   -
447   -class MsoExtractionError(RuntimeError, OlevbaBaseException):
448   - """ raised by mso_file_extract if parsing MSO/ActiveMIME data failed """
449   -
450   - def __init__(self, msg):
451   - MsoExtractionError.__init__(self, msg)
452   - OlevbaBaseException.__init__(self, msg)
453   -
454   -
455   -class SubstreamOpenError(FileOpenError):
456   - """ special kind of FileOpenError: file is a substream of original file """
457   -
458   - def __init__(self, filename, subfilename, orig_exc=None):
459   - super(SubstreamOpenError, self).__init__(
460   - str(filename) + '/' + str(subfilename), orig_exc)
461   - self.filename = filename # overwrite setting in OlevbaBaseException
462   - self.subfilename = subfilename
463   -
464   -
465   -class UnexpectedDataError(OlevbaBaseException):
466   - """ raised when parsing is strict (=not relaxed) and data is unexpected """
467   -
468   - def __init__(self, stream_path, variable, expected, value):
469   - if isinstance(expected, int):
470   - es = '{0:04X}'.format(expected)
471   - elif isinstance(expected, tuple):
472   - es = ','.join('{0:04X}'.format(e) for e in expected)
473   - es = '({0})'.format(es)
474   - else:
475   - raise ValueError('Unknown type encountered: {0}'.format(type(expected)))
476   - super(UnexpectedDataError, self).__init__(
477   - 'Unexpected value in {0} for variable {1}: '
478   - 'expected {2} but found {3:04X}!'
479   - .format(stream_path, variable, es, value))
480   - self.stream_path = stream_path
481   - self.variable = variable
482   - self.expected = expected
483   - self.value = value
484   -
485   -#--- CONSTANTS ----------------------------------------------------------------
486   -
487   -# return codes
488   -RETURN_OK = 0
489   -RETURN_WARNINGS = 1 # (reserved, not used yet)
490   -RETURN_WRONG_ARGS = 2 # (fixed, built into optparse)
491   -RETURN_FILE_NOT_FOUND = 3
492   -RETURN_XGLOB_ERR = 4
493   -RETURN_OPEN_ERROR = 5
494   -RETURN_PARSE_ERROR = 6
495   -RETURN_SEVERAL_ERRS = 7
496   -RETURN_UNEXPECTED = 8
497   -RETURN_ENCRYPTED = 9
498   -
499   -# MAC codepages (from http://stackoverflow.com/questions/1592925/decoding-mac-os-text-in-python)
500   -MAC_CODEPAGES = {
501   - 10000: 'mac-roman',
502   - 10001: 'shiftjis', # not found: 'mac-shift-jis',
503   - 10003: 'ascii', # nothing appropriate found: 'mac-hangul',
504   - 10008: 'gb2321', # not found: 'mac-gb2312',
505   - 10002: 'big5', # not found: 'mac-big5',
506   - 10005: 'hebrew', # not found: 'mac-hebrew',
507   - 10004: 'mac-arabic',
508   - 10006: 'mac-greek',
509   - 10081: 'mac-turkish',
510   - 10021: 'thai', # not found: mac-thai',
511   - 10029: 'maccentraleurope', # not found: 'mac-east europe',
512   - 10007: 'ascii', # nothing appropriate found: 'mac-russian',
513   -}
514   -
515   -# URL and message to report issues:
516   -URL_OLEVBA_ISSUES = 'https://github.com/decalage2/oletools/issues'
517   -MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES
518   -
519   -# Container types:
520   -TYPE_OLE = 'OLE'
521   -TYPE_OpenXML = 'OpenXML'
522   -TYPE_FlatOPC_XML = 'FlatOPC_XML'
523   -TYPE_Word2003_XML = 'Word2003_XML'
524   -TYPE_MHTML = 'MHTML'
525   -TYPE_TEXT = 'Text'
526   -TYPE_PPT = 'PPT'
527   -
528   -# short tag to display file types in triage mode:
529   -TYPE2TAG = {
530   - TYPE_OLE: 'OLE:',
531   - TYPE_OpenXML: 'OpX:',
532   - TYPE_FlatOPC_XML: 'FlX:',
533   - TYPE_Word2003_XML: 'XML:',
534   - TYPE_MHTML: 'MHT:',
535   - TYPE_TEXT: 'TXT:',
536   - TYPE_PPT: 'PPT',
537   -}
538   -
539   -
540   -# MSO files ActiveMime header magic
541   -MSO_ACTIVEMIME_HEADER = b'ActiveMime'
542   -
543   -MODULE_EXTENSION = "bas"
544   -CLASS_EXTENSION = "cls"
545   -FORM_EXTENSION = "frm"
546   -
547   -# Namespaces and tags for Word2003 XML parsing:
548   -NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}'
549   -# the tag <w:binData w:name="editdata.mso"> contains the VBA macro code:
550   -TAG_BINDATA = NS_W + 'binData'
551   -ATTR_NAME = NS_W + 'name'
552   -
553   -# Namespaces and tags for Word/PowerPoint 2007+ XML parsing:
554   -# root: <pkg:package xmlns:pkg="http://schemas.microsoft.com/office/2006/xmlPackage">
555   -NS_XMLPACKAGE = '{http://schemas.microsoft.com/office/2006/xmlPackage}'
556   -TAG_PACKAGE = NS_XMLPACKAGE + 'package'
557   -# the tag <pkg:part> includes <pkg:binaryData> that contains the VBA macro code in Base64:
558   -# <pkg:part pkg:name="/word/vbaProject.bin" pkg:contentType="application/vnd.ms-office.vbaProject"><pkg:binaryData>
559   -TAG_PKGPART = NS_XMLPACKAGE + 'part'
560   -ATTR_PKG_NAME = NS_XMLPACKAGE + 'name'
561   -ATTR_PKG_CONTENTTYPE = NS_XMLPACKAGE + 'contentType'
562   -CTYPE_VBAPROJECT = "application/vnd.ms-office.vbaProject"
563   -TAG_PKGBINDATA = NS_XMLPACKAGE + 'binaryData'
564   -
565   -# Keywords to detect auto-executable macros
566   -AUTOEXEC_KEYWORDS = {
567   - # MS Word:
568   - 'Runs when the Word document is opened':
569   - ('AutoExec', 'AutoOpen', 'DocumentOpen'),
570   - 'Runs when the Word document is closed':
571   - ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose'),
572   - 'Runs when the Word document is modified':
573   - ('DocumentChange',),
574   - 'Runs when a new Word document is created':
575   - ('AutoNew', 'Document_New', 'NewDocument'),
576   -
577   - # MS Word and Publisher:
578   - 'Runs when the Word or Publisher document is opened':
579   - ('Document_Open',),
580   - 'Runs when the Publisher document is closed':
581   - ('Document_BeforeClose',),
582   -
583   - # MS Excel:
584   - 'Runs when the Excel Workbook is opened':
585   - ('Auto_Open', 'Workbook_Open', 'Workbook_Activate'),
586   - 'Runs when the Excel Workbook is closed':
587   - ('Auto_Close', 'Workbook_Close'),
588   -
589   - # any MS Office application:
590   - 'Runs when the file is opened (using InkPicture ActiveX object)':
591   - # ref:https://twitter.com/joe4security/status/770691099988025345
592   - (r'\w+_Painted',),
593   - 'Runs when the file is opened and ActiveX objects trigger events':
594   - (r'\w+_(?:GotFocus|LostFocus|MouseHover)',),
595   -}
596   -
597   -# Suspicious Keywords that may be used by malware
598   -# See VBA language reference: http://msdn.microsoft.com/en-us/library/office/jj692818%28v=office.15%29.aspx
599   -SUSPICIOUS_KEYWORDS = {
600   - #TODO: use regex to support variable whitespaces
601   - 'May read system environment variables':
602   - ('Environ',),
603   - 'May open a file':
604   - ('Open',),
605   - 'May write to a file (if combined with Open)':
606   - #TODO: regex to find Open+Write on same line
607   - ('Write', 'Put', 'Output', 'Print #'),
608   - 'May read or write a binary file (if combined with Open)':
609   - #TODO: regex to find Open+Binary on same line
610   - ('Binary',),
611   - 'May copy a file':
612   - ('FileCopy', 'CopyFile'),
613   - #FileCopy: http://msdn.microsoft.com/en-us/library/office/gg264390%28v=office.15%29.aspx
614   - #CopyFile: http://msdn.microsoft.com/en-us/library/office/gg264089%28v=office.15%29.aspx
615   - 'May delete a file':
616   - ('Kill',),
617   - 'May create a text file':
618   - ('CreateTextFile', 'ADODB.Stream', 'WriteText', 'SaveToFile'),
619   - #CreateTextFile: http://msdn.microsoft.com/en-us/library/office/gg264617%28v=office.15%29.aspx
620   - #ADODB.Stream sample: http://pastebin.com/Z4TMyuq6
621   - 'May run an executable file or a system command':
622   - ('Shell', 'vbNormal', 'vbNormalFocus', 'vbHide', 'vbMinimizedFocus', 'vbMaximizedFocus', 'vbNormalNoFocus',
623   - 'vbMinimizedNoFocus', 'WScript.Shell', 'Run', 'ShellExecute'),
624   - # MacScript: see https://msdn.microsoft.com/en-us/library/office/gg264812.aspx
625   - 'May run an executable file or a system command on a Mac':
626   - ('MacScript',),
627   - 'May run an executable file or a system command on a Mac (if combined with libc.dylib)':
628   - ('system', 'popen', r'exec[lv][ep]?'),
629   - #Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx
630   - #WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6
631   - 'May run PowerShell commands':
632   - #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
633   - #also: https://bitbucket.org/decalage/oletools/issues/14/olevba-library-update-ioc
634   - # ref: https://blog.netspi.com/15-ways-to-bypass-the-powershell-execution-policy/
635   - # TODO: add support for keywords starting with a non-alpha character, such as "-noexit"
636   - # TODO: '-command', '-EncodedCommand', '-scriptblock'
637   - ('PowerShell', 'noexit', 'ExecutionPolicy', 'noprofile', 'command', 'EncodedCommand',
638   - 'invoke-command', 'scriptblock', 'Invoke-Expression', 'AuthorizationManager'),
639   - 'May run an executable file or a system command using PowerShell':
640   - ('Start-Process',),
641   - 'May hide the application':
642   - ('Application.Visible', 'ShowWindow', 'SW_HIDE'),
643   - 'May create a directory':
644   - ('MkDir',),
645   - 'May save the current workbook':
646   - ('ActiveWorkbook.SaveAs',),
647   - 'May change which directory contains files to open at startup':
648   - #TODO: confirm the actual effect
649   - ('Application.AltStartupPath',),
650   - 'May create an OLE object':
651   - ('CreateObject',),
652   - 'May create an OLE object using PowerShell':
653   - ('New-Object',),
654   - 'May run an application (if combined with CreateObject)':
655   - ('Shell.Application',),
656   - 'May enumerate application windows (if combined with Shell.Application object)':
657   - ('Windows', 'FindWindow'),
658   - 'May run code from a DLL':
659   - #TODO: regex to find declare+lib on same line - see mraptor
660   - ('Lib',),
661   - 'May run code from a library on a Mac':
662   - #TODO: regex to find declare+lib on same line - see mraptor
663   - ('libc.dylib', 'dylib'),
664   - 'May inject code into another process':
665   - ('CreateThread', 'VirtualAlloc', # (issue #9) suggested by Davy Douhine - used by MSF payload
666   - 'VirtualAllocEx', 'RtlMoveMemory',
667   - ),
668   - 'May run a shellcode in memory':
669   - ('EnumSystemLanguageGroupsW?', # Used by Hancitor in Oct 2016
670   - 'EnumDateFormats(?:W|(?:Ex){1,2})?'), # see https://msdn.microsoft.com/en-us/library/windows/desktop/dd317810(v=vs.85).aspx
671   - 'May download files from the Internet':
672   - #TODO: regex to find urlmon+URLDownloadToFileA on same line
673   - ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP',
674   - 'MSXML2.ServerXMLHTTP', # suggested in issue #13
675   - 'User-Agent', # sample from @ozhermit: http://pastebin.com/MPc3iV6z
676   - ),
677   - 'May download files from the Internet using PowerShell':
678   - #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
679   - ('Net.WebClient', 'DownloadFile', 'DownloadString'),
680   - 'May control another application by simulating user keystrokes':
681   - ('SendKeys', 'AppActivate'),
682   - #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx
683   - 'May attempt to obfuscate malicious function calls':
684   - ('CallByName',),
685   - #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx
686   - 'May attempt to obfuscate specific strings (use option --deobf to deobfuscate)':
687   - #TODO: regex to find several Chr*, not just one
688   - ('Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor'),
689   - #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx
690   - 'May read or write registry keys':
691   - #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
692   - ('RegOpenKeyExA', 'RegOpenKeyEx', 'RegCloseKey'),
693   - 'May read registry keys':
694   - #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
695   - ('RegQueryValueExA', 'RegQueryValueEx',
696   - 'RegRead', #with Wscript.Shell
697   - ),
698   - 'May detect virtualization':
699   - # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
700   - (r'SYSTEM\ControlSet001\Services\Disk\Enum', 'VIRTUAL', 'VMWARE', 'VBOX'),
701   - 'May detect Anubis Sandbox':
702   - # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
703   - # NOTES: this sample also checks App.EXEName but that seems to be a bug, it works in VB6 but not in VBA
704   - # ref: http://www.syssec-project.eu/m/page-media/3/disarm-raid11.pdf
705   - ('GetVolumeInformationA', 'GetVolumeInformation', # with kernel32.dll
706   - '1824245000', r'HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows NT\CurrentVersion\ProductId',
707   - '76487-337-8429955-22614', 'andy', 'sample', r'C:\exec\exec.exe', 'popupkiller'
708   - ),
709   - 'May detect Sandboxie':
710   - # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/
711   - # ref: http://www.cplusplus.com/forum/windows/96874/
712   - ('SbieDll.dll', 'SandboxieControlWndClass'),
713   - 'May detect Sunbelt Sandbox':
714   - # ref: http://www.cplusplus.com/forum/windows/96874/
715   - (r'C:\file.exe',),
716   - 'May detect Norman Sandbox':
717   - # ref: http://www.cplusplus.com/forum/windows/96874/
718   - ('currentuser',),
719   - 'May detect CW Sandbox':
720   - # ref: http://www.cplusplus.com/forum/windows/96874/
721   - ('Schmidti',),
722   - 'May detect WinJail Sandbox':
723   - # ref: http://www.cplusplus.com/forum/windows/96874/
724   - ('Afx:400000:0',),
725   - 'May attempt to disable VBA macro security and Protected View':
726   - # ref: http://blog.trendmicro.com/trendlabs-security-intelligence/qkg-filecoder-self-replicating-document-encrypting-ransomware/
727   - # ref: https://thehackernews.com/2017/11/ms-office-macro-malware.html
728   - ('AccessVBOM', 'VBAWarnings', 'ProtectedView', 'DisableAttachementsInPV', 'DisableInternetFilesInPV',
729   - 'DisableUnsafeLocationsInPV', 'blockcontentexecutionfrominternet'),
730   - 'May attempt to modify the VBA code (self-modification)':
731   - ('VBProject', 'VBComponents', 'CodeModule', 'AddFromString'),
732   -}
733   -
# Suspicious Keywords to be searched for directly as strings, without regex
# Each key is the description of the suspicious behaviour, each value is a
# tuple of the literal strings associated with that description.
SUSPICIOUS_KEYWORDS_NOREGEX = {
    'May use special characters such as backspace to obfuscate code when printed on the console':
        # NOTE: in a non-raw string literal, '\b' is the backspace character (0x08)
        ('\b',),
}
739   -
# Regular Expression for a URL:
# http://en.wikipedia.org/wiki/Uniform_resource_locator
# http://www.w3.org/Addressing/URL/uri-spec.html
#TODO: also support username:password@server
#TODO: other protocols (file, gopher, wais, ...?)
# The pattern is assembled from the building blocks below, each one being a
# raw string so that regex escapes such as \b reach the re module untouched.
SCHEME = r'\b(?:http|ftp)s?'
# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})'
DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + ')'
#TODO: IPv6 - see https://www.debuggex.com/
# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. [db8:0cec::99:123a]
NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'
IPv4 = r'(?:' + NUMBER_0_255 + r'\.){3}' + NUMBER_0_255
# IPv4 must come before the DNS name because it is more specific
SERVER = r'(?:' + IPv4 + '|' + DNS_NAME + ')'
PORT = r'(?:\:[0-9]{1,5})?'
SERVER_PORT = SERVER + PORT
URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?'  # [^\.\,\)\(\s"]
URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH
re_url = re.compile(URL_RE)


# Patterns to be extracted (IP addresses, URLs, etc)
# From patterns.py in balbuzard
# Each entry is a tuple (pattern description, compiled regex).
RE_PATTERNS = (
    ('URL', re.compile(URL_RE)),
    ('IPv4 address', re.compile(IPv4)),
    # TODO: add IPv6
    # NOTE: the trailing word boundary must be a raw string: a plain '\b' is
    # the backspace character (0x08), which made this pattern require a
    # backspace right after the address and therefore never match normal text.
    ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@' + SERVER + r'\b')),
    # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')),
    # Executable file name with known extensions (except .com which is present in many URLs, and .application):
    ("Executable file name", re.compile(
        r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b")),
    # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
    # TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types
    # TODO: add win & unix file paths
    #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')),
)
778   -
# Strings encoded in hexadecimal: four or more consecutive pairs of hex digits
re_hex_string = re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')

# Strings encoded in base64
#re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?"')
# Stricter version taken from balbuzard, which yields fewer false positives.
# BASE64_RE is the bare pattern (no surrounding double quotes) so that it can
# be reused by quoted_base64_string in the VBA grammar.
BASE64_RE = r'(?:[A-Za-z0-9+/]{4}){1,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)?'
# The compiled form only matches base64 text enclosed in double quotes:
re_base64_string = re.compile(''.join(('"', BASE64_RE, '"')))
# Common words (all lowercase) that match the base64 regex without being base64:
BASE64_WHITELIST = {'thisdocument', 'thisworkbook', 'test', 'temp', 'http', 'open', 'exit'}

# Strings encoded with a specific Dridex algorithm
# (see https://github.com/JamesHabben/MalwareStuff)
re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"')
# Matches any letter that is not a hex digit, to rule out plain hex strings:
re_nothex_check = re.compile(r'[G-Zg-z]')

# Printable strings (at least 5 chars) to be extracted from VBA Forms.
# The pattern is a bytes object, as required to search bytes data on Python 3.
re_printable_string = re.compile(br'[\t\r\n\x20-\xFF]{5,}')
800   -
801   -
802   -# === PARTIAL VBA GRAMMAR ====================================================
803   -
804   -# REFERENCES:
805   -# - [MS-VBAL]: VBA Language Specification
806   -# https://msdn.microsoft.com/en-us/library/dd361851.aspx
807   -# - pyparsing: http://pyparsing.wikispaces.com/
808   -
809   -# TODO: set whitespaces according to VBA
810   -# TODO: merge extended lines before parsing
811   -
# Enable PackRat for better performance:
# (see https://pythonhosted.org/pyparsing/pyparsing.ParserElement-class.html#enablePackrat)
# NOTE: this is called on the ParserElement class itself, when the module is loaded.
ParserElement.enablePackrat()

# VBA identifier chars (from MS-VBAL 3.3.5)
# Passed to WordStart() in the grammar below, so that a token is only matched
# when it is not preceded by one of these characters.
vba_identifier_chars = alphanums + '_'
818   -
class VbaExpressionString(str):
    """
    Marker subclass of str, with no additional behaviour.

    It is used to tell apart plain strings from strings that were obfuscated
    with VBA expressions (Chr, StrReverse, etc) and decoded by the grammar:
    every parse action of a VBA expression is expected to return its result
    as a VbaExpressionString, so that isinstance(s, VbaExpressionString) is
    True only for the result of a VBA expression.
    (see detect_vba_strings)
    """
    # TODO: use Unicode everywhere instead of str
830   -
831   -
832   -# --- NUMBER TOKENS ----------------------------------------------------------
833   -
834   -# 3.3.2 Number Tokens
835   -# INTEGER = integer-literal ["%" / "&" / "^"]
836   -# integer-literal = decimal-literal / octal-literal / hex-literal
837   -# decimal-literal = 1*decimal-digit
838   -# octal-literal = "&" [%x004F / %x006F] 1*octal-digit
839   -# ; & or &o or &O
840   -# hex-literal = "&" (%x0048 / %x0068) 1*hex-digit
841   -# ; &h or &H
842   -# octal-digit = "0" / "1" / "2" / "3" / "4" / "5" / "6" / "7"
843   -# decimal-digit = octal-digit / "8" / "9"
844   -# hex-digit = decimal-digit / %x0041-0046 / %x0061-0066 ;A-F / a-f
845   -
# NOTE: here Combine() is required to avoid spaces between elements
# NOTE: here WordStart is necessary to avoid matching a number preceded by
# letters or underscore (e.g. "VBT1" or "ABC_34"), when using scanString

# Decimal literal: optional minus sign followed by digits. The optional type
# suffix (one of % & ^) is matched but suppressed from the result.
decimal_literal = Combine(Optional('-') + WordStart(vba_identifier_chars) + Word(nums)
                          + Suppress(Optional(Word('%&^', exact=1))))
# the parse action converts the matched text to a Python int (base 10):
decimal_literal.setParseAction(lambda t: int(t[0]))

# Octal literal: "&" or "&o"/"&O" (suppressed), followed by octal digits and
# an optional suppressed type suffix.
octal_literal = Combine(Suppress(Literal('&') + Optional((CaselessLiteral('o')))) + Word(srange('[0-7]'))
                        + Suppress(Optional(Word('%&^', exact=1))))
# only the digits remain in the token, converted to int with base 8:
octal_literal.setParseAction(lambda t: int(t[0], base=8))

# Hex literal: "&h"/"&H" (suppressed), followed by hex digits and an optional
# suppressed type suffix.
hex_literal = Combine(Suppress(CaselessLiteral('&h')) + Word(srange('[0-9a-fA-F]'))
                      + Suppress(Optional(Word('%&^', exact=1))))
# only the digits remain in the token, converted to int with base 16:
hex_literal.setParseAction(lambda t: int(t[0], base=16))

# An integer is any of the three literals above; alternatives are tried in this order.
integer = decimal_literal | octal_literal | hex_literal
862   -
863   -
864   -# --- QUOTED STRINGS ---------------------------------------------------------
865   -
866   -# 3.3.4 String Tokens
867   -# STRING = double-quote *string-character (double-quote / line-continuation / LINE-END)
868   -# double-quote = %x0022 ; "
869   -# string-character = NO-LINE-CONTINUATION ((double-quote double-quote) termination-character)
870   -
# String delimited by double quotes, where a doubled double quote ("") is the
# escape sequence for a double quote inside the string.
quoted_string = QuotedString('"', escQuote='""')
# the parse action returns the matched token converted with str():
quoted_string.setParseAction(lambda t: str(t[0]))
873   -
874   -
875   -#--- VBA Expressions ---------------------------------------------------------
876   -
877   -# See MS-VBAL 5.6 Expressions
878   -
# need to pre-declare using Forward() because it is recursive
# VBA string expression and integer expression
# (these placeholders are referenced by the function grammars below, and only
# receive their actual definition later with the "<<=" operator)
vba_expr_str = Forward()
vba_expr_int = Forward()
883   -
884   -# --- CHR --------------------------------------------------------------------
885   -
886   -# MS-VBAL 6.1.2.11.1.4 Chr / Chr$
887   -# Function Chr(CharCode As Long) As Variant
888   -# Function Chr$(CharCode As Long) As String
889   -# Parameter Description
890   -# CharCode Long whose value is a code point.
891   -# Returns a String data value consisting of a single character containing the character whose code
892   -# point is the data value of the argument.
893   -# - If the argument is not in the range 0 to 255, Error Number 5 ("Invalid procedure call or
894   -# argument") is raised unless the implementation supports a character set with a larger code point
895   -# range.
896   -# - If the argument value is in the range of 0 to 127, it is interpreted as a 7-bit ASCII code point.
897   -# - If the argument value is in the range of 128 to 255, the code point interpretation of the value is
898   -# implementation defined.
899   -# - Chr$ has the same runtime semantics as Chr, however the declared type of its function result is
900   -# String rather than Variant.
901   -
902   -# 6.1.2.11.1.5 ChrB / ChrB$
903   -# Function ChrB(CharCode As Long) As Variant
904   -# Function ChrB$(CharCode As Long) As String
905   -# CharCode Long whose value is a code point.
906   -# Returns a String data value consisting of a single byte character whose code point value is the
907   -# data value of the argument.
908   -# - If the argument is not in the range 0 to 255, Error Number 6 ("Overflow") is raised.
909   -# - ChrB$ has the same runtime semantics as ChrB however the declared type of its function result
910   -# is String rather than Variant.
911   -# - Note: the ChrB function is used with byte data contained in a String. Instead of returning a
912   -# character, which may be one or two bytes, ChrB always returns a single byte. The ChrW function
913   -# returns a String containing the Unicode character except on platforms where Unicode is not
914   -# supported, in which case, the behavior is identical to the Chr function.
915   -
916   -# 6.1.2.11.1.6 ChrW/ ChrW$
917   -# Function ChrW(CharCode As Long) As Variant
918   -# Function ChrW$(CharCode As Long) As String
919   -# CharCode Long whose value is a code point.
920   -# Returns a String data value consisting of a single character containing the character whose code
921   -# point is the data value of the argument.
922   -# - If the argument is not in the range -32,767 to 65,535 then Error Number 5 ("Invalid procedure
923   -# call or argument") is raised.
924   -# - If the argument is a negative value it is treated as if it was the value: CharCode + 65,536.
925   -# - If the implemented uses 16-bit Unicode code points argument, data value is interpreted as a 16-
926   -# bit Unicode code point.
927   -# - If the implementation does not support Unicode, ChrW has the same semantics as Chr.
928   -# - ChrW$ has the same runtime semantics as ChrW, however the declared type of its function result
929   -# is String rather than Variant.
930   -
# Chr, Chr$, ChrB, ChrW(int) => char
# The function name (Chr with optional B or W and optional $, case-insensitive,
# not preceded by an identifier character) and both parentheses are suppressed,
# so only the value of the integer expression argument remains as a token.
vba_chr = Suppress(
    Combine(WordStart(vba_identifier_chars) + CaselessLiteral('Chr')
            + Optional(CaselessLiteral('B') | CaselessLiteral('W')) + Optional('$'))
    + '(') + vba_expr_int + Suppress(')')
936   -
def vba_chr_tostr(t):
    """
    Parse action for vba_chr: convert the argument of a VBA Chr/ChrB/ChrW
    call to the corresponding character.

    :param t: parse results, t[0] is expected to be the character code (int)
    :return: VbaExpressionString containing the single character, or the text
        'Chr(<code>)' when the code is not a valid code point
    """
    code = t[0]
    try:
        # olevba3 only runs on Python 3, where chr() covers the whole Unicode
        # range and returns a str. No separate branch is needed for codes
        # above 255: the former unichr(...).encode(...) branch relied on a
        # Python 2 builtin and produced bytes, which ended up as the text
        # "b'...'" once wrapped in a str subclass.
        return VbaExpressionString(chr(code))
    except (ValueError, OverflowError):
        # ValueError: code outside of the range accepted by chr()
        # OverflowError: code too large to be converted to a C integer
        log.exception('ERROR: incorrect parameter value for chr(): %r' % code)
        return VbaExpressionString('Chr(%r)' % code)
951   -
# attach the conversion function above as the parse action of vba_chr:
vba_chr.setParseAction(vba_chr_tostr)


# --- ASC --------------------------------------------------------------------

# Asc(char) => int
#TODO: see MS-VBAL 6.1.2.11.1.1 page 240 => AscB, AscW
# The function name and parentheses are suppressed, only the string argument remains.
vba_asc = Suppress(CaselessKeyword('Asc') + '(') + vba_expr_str + Suppress(')')
# NOTE(review): ord() only accepts a string of exactly one character - confirm
# how longer or empty arguments should be handled.
vba_asc.setParseAction(lambda t: ord(t[0]))


# --- VAL --------------------------------------------------------------------

# Val(string) => int
# TODO: make sure the behavior of VBA's val is fully covered
vba_val = Suppress(CaselessKeyword('Val') + '(') + vba_expr_str + Suppress(')')
# the string argument is stripped of surrounding whitespace, then converted with int():
vba_val.setParseAction(lambda t: int(t[0].strip()))


# --- StrReverse() --------------------------------------------------------------------

# StrReverse(string) => string
strReverse = Suppress(CaselessKeyword('StrReverse') + '(') + vba_expr_str + Suppress(')')
# the string argument is reversed using a slice with a step of -1:
strReverse.setParseAction(lambda t: VbaExpressionString(str(t[0])[::-1]))


# --- ENVIRON() --------------------------------------------------------------------

# Environ("name") => just translated to "%name%", that is enough for malware analysis
environ = Suppress(CaselessKeyword('Environ') + '(') + vba_expr_str + Suppress(')')
# in the format string below, each '%%' produces one literal percent sign:
environ.setParseAction(lambda t: VbaExpressionString('%%%s%%' % t[0]))
983   -
984   -
985   -# --- IDENTIFIER -------------------------------------------------------------
986   -
#TODO: see MS-VBAL 3.3.5 page 33
# 3.3.5 Identifier Tokens
# Latin-identifier = first-Latin-identifier-character *subsequent-Latin-identifier-character
# first-Latin-identifier-character = (%x0041-005A / %x0061-007A) ; A-Z / a-z
# subsequent-Latin-identifier-character = first-Latin-identifier-character / DIGIT / %x5F ; underscore
# => a letter, followed by any number of letters, digits or underscores:
latin_identifier = Word(initChars=alphas, bodyChars=alphanums + '_')
993   -
994   -# --- HEX FUNCTION -----------------------------------------------------------
995   -
# match any custom function name with a hex string as argument:
# TODO: accept vba_expr_str_item as argument, check if it is a hex or base64 string at runtime

# quoted string of at least two hexadecimal numbers of two digits:
# (the surrounding double quotes are suppressed)
quoted_hex_string = Suppress('"') + Combine(Word(hexnums, exact=2) * (2, None)) + Suppress('"')
quoted_hex_string.setParseAction(lambda t: str(t[0]))

# any identifier followed by a quoted hex string between parentheses; the
# function name and the parentheses are suppressed:
hex_function_call = Suppress(latin_identifier) + Suppress('(') + \
                    quoted_hex_string('hex_string') + Suppress(')')
# binascii.a2b_hex returns bytes on Python 3. It must be decoded before being
# wrapped in a str subclass, otherwise str() produces the text "b'...'"
# instead of the decoded characters. latin-1 maps each byte value 0-255 to the
# character with the same code point, so no byte is lost or rejected.
hex_function_call.setParseAction(
    lambda t: VbaExpressionString(binascii.a2b_hex(t.hex_string).decode('latin-1')))
1006   -
1007   -
1008   -# --- BASE64 FUNCTION -----------------------------------------------------------
1009   -
# match any custom function name with a Base64 string as argument:
# TODO: accept vba_expr_str_item as argument, check if it is a hex or base64 string at runtime

# quoted string matching the Base64 regex BASE64_RE:
# (the surrounding double quotes are suppressed)
quoted_base64_string = Suppress('"') + Regex(BASE64_RE) + Suppress('"')
quoted_base64_string.setParseAction(lambda t: str(t[0]))

# any identifier followed by a quoted Base64 string between parentheses; the
# function name and the parentheses are suppressed:
base64_function_call = Suppress(latin_identifier) + Suppress('(') + \
                       quoted_base64_string('base64_string') + Suppress(')')
# binascii.a2b_base64 returns bytes on Python 3. It must be decoded before
# being wrapped in a str subclass, otherwise str() produces the text "b'...'"
# instead of the decoded characters. latin-1 maps each byte value 0-255 to the
# character with the same code point, so no byte is lost or rejected.
base64_function_call.setParseAction(
    lambda t: VbaExpressionString(binascii.a2b_base64(t.base64_string).decode('latin-1')))
1020   -
1021   -
1022   -# ---STRING EXPRESSION -------------------------------------------------------
1023   -
def concat_strings_list(tokens):
    """
    Parse action joining the operands of a VBA string concatenation
    (operators '+' or '&') into a single string.

    :param tokens: parse results; tokens[0] is expected to be the sequence
        [a, '&', b, '&', c, ...] alternating operands and operators
    :return: VbaExpressionString, all the operands concatenated in order
    """
    # operands are at even positions, operators at odd positions:
    operands = tokens[0][::2]
    joined = ''.join(operands)
    return VbaExpressionString(joined)
1032   -
1033   -
# A single operand of a string expression; alternatives are tried in this order.
vba_expr_str_item = (vba_chr | strReverse | environ | quoted_string | hex_function_call | base64_function_call)

# Actual definition of the string expression declared earlier with Forward():
# operands combined with the binary operators '+' and '&', both
# left-associative and both handled by concat_strings_list.
vba_expr_str <<= infixNotation(vba_expr_str_item,
    [
        ("+", 2, opAssoc.LEFT, concat_strings_list),
        ("&", 2, opAssoc.LEFT, concat_strings_list),
    ])
1041   -
1042   -
1043   -# --- INTEGER EXPRESSION -------------------------------------------------------
1044   -
def sum_ints_list(tokens):
    """
    Parse action adding up the operands of a VBA integer expression using
    the operator '+'.

    :param tokens: parse results; tokens[0] is expected to be the sequence
        [a, '+', b, '+', c, ...] alternating operands and operators
    :return: sum of the operands
    """
    # operands are at even positions, operators at odd positions:
    return sum(tokens[0][::2])
1053   -
1054   -
def subtract_ints_list(tokens):
    """
    Parse action evaluating a VBA integer expression using the operator '-',
    from left to right.

    :param tokens: parse results; tokens[0] is expected to be the sequence
        [a, '-', b, '-', c, ...] alternating operands and operators
    :return: a - b - c - ...
    """
    def _minus(accumulated, operand):
        return accumulated - operand

    # operands are at even positions, operators at odd positions:
    operands = tokens[0][::2]
    return reduce(_minus, operands)
1063   -
1064   -
def multiply_ints_list(tokens):
    """
    Parse action evaluating a VBA integer expression using the operator '*'.

    :param tokens: parse results; tokens[0] is expected to be the sequence
        [a, '*', b, '*', c, ...] alternating operands and operators
    :return: product of the operands
    """
    def _times(accumulated, operand):
        return accumulated * operand

    # operands are at even positions, operators at odd positions:
    operands = tokens[0][::2]
    return reduce(_times, operands)
1073   -
1074   -
def divide_ints_list(tokens):
    """
    Parse action evaluating a VBA integer expression using the operator '/',
    from left to right.

    The result is kept as an integer: on Python 3 the "/" operator returns a
    float even for two ints, and a float cannot be used by the rest of the
    integer grammar (e.g. chr() rejects it). Floor division restores the
    integer behaviour that this code had with Python 2 ints.
    NOTE(review): VBA itself evaluates "/" as a floating-point division and
    rounds when an integer is needed, so results may differ from VBA when the
    division is not exact - confirm whether rounding should be emulated.

    :param tokens: parse results; tokens[0] is expected to be the sequence
        [a, '/', b, '/', c, ...] alternating operands and operators
    :return: int, (a // b) // c ...
    :raise ZeroDivisionError: if one of the divisors is zero
    """
    # operands are at even positions, operators at odd positions:
    integers = tokens[0][::2]
    return reduce(lambda x, y: x // y, integers)
1083   -
1084   -
# A single operand of an integer expression; alternatives are tried in this order.
vba_expr_int_item = (vba_asc | vba_val | integer)

# operators associativity:
# https://en.wikipedia.org/wiki/Operator_associativity

# Actual definition of the integer expression declared earlier with Forward():
# operands combined with the binary operators below, all left-associative,
# each operator being evaluated by its own parse action.
# NOTE(review): each tuple is a separate precedence level, so "*" is listed
# on a different level than "/" (and "-" than "+"). With "*" binding tighter
# than "/", an expression such as 8/2*3 would presumably be grouped as
# 8/(2*3) rather than (8/2)*3 - confirm against the pyparsing documentation
# and merge the levels if this is not intended.
vba_expr_int <<= infixNotation(vba_expr_int_item,
    [
        ("*", 2, opAssoc.LEFT, multiply_ints_list),
        ("/", 2, opAssoc.LEFT, divide_ints_list),
        ("-", 2, opAssoc.LEFT, subtract_ints_list),
        ("+", 2, opAssoc.LEFT, sum_ints_list),
    ])
1097   -
1098   -
1099   -# see detect_vba_strings for the deobfuscation code using this grammar
1100   -
1101   -# === MSO/ActiveMime files parsing ===========================================
1102   -
def is_mso_file(data):
    """
    Tell whether data looks like the content of a MSO/ActiveMime file, i.e.
    the kind of file created by Outlook in some cases, or by Word/Excel when
    a document is saved with the MHTML format or the Word 2003 XML format.
    Only the ActiveMime magic at the very beginning of data is checked.

    :param data: bytes string, MSO/ActiveMime file content
    :return: bool, True if the file is MSO, False otherwise
    """
    has_activemime_magic = data.startswith(MSO_ACTIVEMIME_HEADER)
    return has_activemime_magic
1113   -
1114   -
# regex to find zlib block headers, starting with byte 0x78 = 'x'
# The pattern must be a bytes object because it is applied to the raw bytes
# of a MSO file: on Python 3 a str pattern cannot be used on bytes data
# (finditer raises a TypeError).
re_zlib_header = re.compile(b'x')
1117   -
1118   -
def mso_file_extract(data):
    """
    Extract the data stored into a MSO/ActiveMime file, such as
    the ones created by Outlook in some cases, or Word/Excel when saving a
    file with the MHTML format or the Word 2003 XML format.

    Decompression is attempted from the offset read in the header, then from
    two fixed offsets, and finally from every position of the data matched by
    re_zlib_header. The first successful zlib decompression is returned.

    :param data: bytes string, MSO/ActiveMime file content
    :return: bytes string, extracted data (uncompressed)

    raise a MsoExtractionError if the data cannot be extracted
    raise an AssertionError if data does not start with the ActiveMime magic
    """
    # check the magic:
    # NOTE(review): assert statements are removed when Python runs with -O,
    # in which case this check is skipped - confirm this is acceptable.
    assert is_mso_file(data)

    # In all the samples seen so far, Word always uses an offset of 0x32,
    # and Excel 0x22A. But we read the offset from the header to be more
    # generic.
    offsets = [0x32, 0x22A]

    # First, attempt to get the compressed data offset from the header
    # According to my tests, it should be an unsigned 16 bits integer,
    # at offset 0x1E (little endian) + add 46:
    try:
        offset = struct.unpack_from('<H', data, offset=0x1E)[0] + 46
        log.debug('Parsing MSO file: data offset = 0x%X' % offset)
        offsets.insert(0, offset)  # insert at beginning of offsets
    except struct.error as exc:
        # the header cannot be unpacked at offset 0x1E: give up
        log.info('Unable to parse MSO/ActiveMime file header (%s)' % exc)
        # exc_info=True adds the traceback of the exception being handled
        log.debug('Trace:', exc_info=True)
        raise MsoExtractionError('Unable to parse MSO/ActiveMime file header')
    # now try offsets
    # (the offset from the header first, then the two fixed ones)
    for start in offsets:
        try:
            log.debug('Attempting zlib decompression from MSO file offset 0x%X' % start)
            extracted_data = zlib.decompress(data[start:])
            return extracted_data
        except zlib.error as exc:
            # not fatal: log the failure and move on to the next offset
            log.info('zlib decompression failed for offset %s (%s)'
                     % (start, exc))
            log.debug('Trace:', exc_info=True)
    # None of the guessed offsets worked, let's try brute-forcing by looking
    # for potential zlib-compressed blocks starting with 0x78:
    log.debug('Looking for potential zlib-compressed blocks in MSO file')
    for match in re_zlib_header.finditer(data):
        start = match.start()
        try:
            log.debug('Attempting zlib decompression from MSO file offset 0x%X' % start)
            extracted_data = zlib.decompress(data[start:])
            return extracted_data
        except zlib.error as exc:
            # not fatal: log the failure and move on to the next candidate
            log.info('zlib decompression failed (%s)' % exc)
            log.debug('Trace:', exc_info=True)
    # every attempt failed:
    raise MsoExtractionError('Unable to decompress data from a MSO/ActiveMime file')
1172   -
1173   -
1174   -#--- FUNCTIONS ----------------------------------------------------------------
1175   -
1176   -# set of printable characters, for is_printable
1177   -_PRINTABLE_SET = set(string.printable)
1178   -
1179   -def is_printable(s):
1180   - """
1181   - returns True if string s only contains printable ASCII characters
1182   - (i.e. contained in string.printable)
1183   - This is similar to Python 3's str.isprintable, for Python 2.x.
1184   - :param s: str
1185   - :return: bool
1186   - """
1187   - # inspired from http://stackoverflow.com/questions/3636928/test-if-a-python-string-is-printable
1188   - # check if the set of chars from s is contained into the set of printable chars:
1189   - return set(s).issubset(_PRINTABLE_SET)
1190   -
1191   -
def copytoken_help(decompressed_current, decompressed_chunk_start):
    """
    compute bit masks to decode a CopyToken according to MS-OVBA 2.4.1.3.19.1 CopyToken Help

    decompressed_current: number of decompressed bytes so far, i.e. len(decompressed_container)
    decompressed_chunk_start: offset of the current chunk in the decompressed container
    return length_mask, offset_mask, bit_count, maximum_length
    raise ValueError if decompressed_current is not greater than decompressed_chunk_start
    """
    difference = decompressed_current - decompressed_chunk_start
    if difference <= 0:
        # the logarithm is undefined here; ValueError is the exception that
        # math.log() used to raise in this case, so callers see no change
        raise ValueError('invalid CopyToken: no decompressed data in the current chunk '
                         '(difference = %d)' % difference)
    # bit_count = ceil(log2(difference)), with a minimum of 4.
    # It is computed with integer arithmetic only: for any n >= 1,
    # (n - 1).bit_length() == ceil(log2(n)). The former
    # math.ceil(math.log(n, 2)) depends on floating-point rounding and can be
    # one too high when n is an exact power of two.
    bit_count = max((difference - 1).bit_length(), 4)
    length_mask = 0xFFFF >> bit_count
    # NOTE: in Python ~length_mask is a negative int; it is meant to be
    # combined with a 16-bit token using "&"
    offset_mask = ~length_mask
    maximum_length = (0xFFFF >> bit_count) + 3
    return length_mask, offset_mask, bit_count, maximum_length
1207   -
1208   -
1209   -def decompress_stream(compressed_container):
1210   - """
1211   - Decompress a stream according to MS-OVBA section 2.4.1
1212   -
1213   - compressed_container: string compressed according to the MS-OVBA 2.4.1.3.6 Compression algorithm
1214   - return the decompressed container as a string (bytes)
1215   - """
1216   - # 2.4.1.2 State Variables
1217   -
1218   - # The following state is maintained for the CompressedContainer (section 2.4.1.1.1):
1219   - # CompressedRecordEnd: The location of the byte after the last byte in the CompressedContainer (section 2.4.1.1.1).
1220   - # CompressedCurrent: The location of the next byte in the CompressedContainer (section 2.4.1.1.1) to be read by
1221   - # decompression or to be written by compression.
1222   -
1223   - # The following state is maintained for the current CompressedChunk (section 2.4.1.1.4):
1224   - # CompressedChunkStart: The location of the first byte of the CompressedChunk (section 2.4.1.1.4) within the
1225   - # CompressedContainer (section 2.4.1.1.1).
1226   -
1227   - # The following state is maintained for a DecompressedBuffer (section 2.4.1.1.2):
1228   - # DecompressedCurrent: The location of the next byte in the DecompressedBuffer (section 2.4.1.1.2) to be written by
1229   - # decompression or to be read by compression.
1230   - # DecompressedBufferEnd: The location of the byte after the last byte in the DecompressedBuffer (section 2.4.1.1.2).
1231   -
1232   - # The following state is maintained for the current DecompressedChunk (section 2.4.1.1.3):
1233   - # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the
1234   - # DecompressedBuffer (section 2.4.1.1.2).
1235   -
1236   - # Check the input is a bytearray:
1237   - if not isinstance(compressed_container, bytearray):
1238   - raise TypeError('decompress_stream requires a bytearray as input')
1239   - decompressed_container = bytearray() # result
1240   - compressed_current = 0
1241   -
1242   - sig_byte = compressed_container[compressed_current]
1243   - if sig_byte != 0x01:
1244   - raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))
1245   -
1246   - compressed_current += 1
1247   -
1248   - #NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that
1249   - # CompressedRecordEnd = len(compressed_container)
1250   - while compressed_current < len(compressed_container):
1251   - # 2.4.1.1.5
1252   - compressed_chunk_start = compressed_current
1253   - # chunk header = first 16 bits
1254   - compressed_chunk_header = \
1255   - struct.unpack("<H", compressed_container[compressed_chunk_start:compressed_chunk_start + 2])[0]
1256   - # chunk size = 12 first bits of header + 3
1257   - chunk_size = (compressed_chunk_header & 0x0FFF) + 3
1258   - # chunk signature = 3 next bits - should always be 0b011
1259   - chunk_signature = (compressed_chunk_header >> 12) & 0x07
1260   - if chunk_signature != 0b011:
1261   - raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream')
1262   - # chunk flag = next bit - 1 == compressed, 0 == uncompressed
1263   - chunk_flag = (compressed_chunk_header >> 15) & 0x01
1264   - log.debug("chunk size = {0}, compressed flag = {1}".format(chunk_size, chunk_flag))
1265   -
1266   - #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096)
1267   - # The minimum size is 3 bytes
1268   - # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value
1269   - # in chunk header before adding 3.
1270   - # Also the first test is not useful since a 12 bits value cannot be larger than 4095.
1271   - if chunk_flag == 1 and chunk_size > 4098:
1272   - raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1')
1273   - if chunk_flag == 0 and chunk_size != 4098:
1274   - raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0')
1275   -
1276   - # check if chunk_size goes beyond the compressed data, instead of silently cutting it:
1277   - #TODO: raise an exception?
1278   - if compressed_chunk_start + chunk_size > len(compressed_container):
1279   - log.warning('Chunk size is larger than remaining compressed data')
1280   - compressed_end = min([len(compressed_container), compressed_chunk_start + chunk_size])
1281   - # read after chunk header:
1282   - compressed_current = compressed_chunk_start + 2
1283   -
1284   - if chunk_flag == 0:
1285   - # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
1286   - # uncompressed chunk: read the next 4096 bytes as-is
1287   - #TODO: check if there are at least 4096 bytes left
1288   - decompressed_container.extend([compressed_container[compressed_current:compressed_current + 4096]])
1289   - compressed_current += 4096
1290   - else:
1291   - # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
1292   - # compressed chunk
1293   - decompressed_chunk_start = len(decompressed_container)
1294   - while compressed_current < compressed_end:
1295   - # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence
1296   - # log.debug('compressed_current = %d / compressed_end = %d' % (compressed_current, compressed_end))
1297   - # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
1298   - # copy tokens (reference to a previous literal token)
1299   - flag_byte = compressed_container[compressed_current]
1300   - compressed_current += 1
1301   - for bit_index in xrange(0, 8):
1302   - # log.debug('bit_index=%d / compressed_current=%d / compressed_end=%d' % (bit_index, compressed_current, compressed_end))
1303   - if compressed_current >= compressed_end:
1304   - break
1305   - # MS-OVBA 2.4.1.3.5 Decompressing a Token
1306   - # MS-OVBA 2.4.1.3.17 Extract FlagBit
1307   - flag_bit = (flag_byte >> bit_index) & 1
1308   - #log.debug('bit_index=%d: flag_bit=%d' % (bit_index, flag_bit))
1309   - if flag_bit == 0: # LiteralToken
1310   - # copy one byte directly to output
1311   - decompressed_container.extend([compressed_container[compressed_current]])
1312   - compressed_current += 1
1313   - else: # CopyToken
1314   - # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
1315   - copy_token = \
1316   - struct.unpack("<H", compressed_container[compressed_current:compressed_current + 2])[0]
1317   - #TODO: check this
1318   - length_mask, offset_mask, bit_count, _ = copytoken_help(
1319   - len(decompressed_container), decompressed_chunk_start)
1320   - length = (copy_token & length_mask) + 3
1321   - temp1 = copy_token & offset_mask
1322   - temp2 = 16 - bit_count
1323   - offset = (temp1 >> temp2) + 1
1324   - #log.debug('offset=%d length=%d' % (offset, length))
1325   - copy_source = len(decompressed_container) - offset
1326   - for index in xrange(copy_source, copy_source + length):
1327   - decompressed_container.extend([decompressed_container[index]])
1328   - compressed_current += 2
1329   - return bytes(decompressed_container)
1330   -
1331   -
def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
    """
    Extract VBA macros from an OleFileIO object.
    Internal function, do not call directly.

    The PROJECT stream is read first to map module names to file extensions,
    then the compressed dir stream is decompressed and parsed record by record
    (see MS-OVBA 2.3.4.2) to locate each module's code stream and text offset.

    :param ole: object providing openstream(path), returning a file-like object
    :param vba_root: path to the VBA root storage, containing the VBA storage
        and the PROJECT stream. It is concatenated directly with
        'VBA/<stream name>' to build the path of each code stream.
    :param project_path: path to the PROJECT stream
    :param dir_path: path to the (compressed) dir stream
    :param relaxed: If True, only create info/debug log entry if data is not as expected
                    (e.g. opening substream fails); if False, raise an error in this case
    :raises UnexpectedDataError: if a checked field has an unexpected value and
        relaxed is False, or if an unknown reference record type is found
    :raises SubstreamOpenError: if no code stream can be opened for a module
        and relaxed is False
    This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream
    """
    # Open the PROJECT stream:
    project = ole.openstream(project_path)
    log.debug('relaxed is %s' % relaxed)

    # sample content of the PROJECT stream:

    ##  ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}"
    ##  Document=ThisDocument/&H00000000
    ##  Module=NewMacros
    ##  Name="Project"
    ##  HelpContextID="0"
    ##  VersionCompatible32="393222000"
    ##  CMG="F1F301E705E705E705E705"
    ##  DPB="8F8D7FE3831F2020202020"
    ##  GC="2D2FDD81E51EE61EE6E1"
    ##
    ##  [Host Extender Info]
    ##  &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000
    ##  &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000
    ##
    ##  [Workspace]
    ##  ThisDocument=22, 29, 339, 477, Z
    ##  NewMacros=-4, 42, 832, 510, C

    # maps lowercase module name -> file extension to use for that module
    code_modules = {}

    for line in project:
        line = line.strip().decode('utf-8', 'ignore')
        if '=' in line:
            # split line at the 1st equal sign:
            name, value = line.split('=', 1)
            # looking for code modules
            # add the code module as a key in the dictionary
            # the value will be the extension needed later
            # The value is converted to lowercase, to allow case-insensitive matching (issue #3)
            value = value.lower()
            if name == 'Document':
                # split value at the 1st slash, keep 1st part:
                value = value.split('/', 1)[0]
                code_modules[value] = CLASS_EXTENSION
            elif name == 'Module':
                code_modules[value] = MODULE_EXTENSION
            elif name == 'Class':
                code_modules[value] = CLASS_EXTENSION
            elif name == 'BaseClass':
                code_modules[value] = FORM_EXTENSION

    # read data from dir stream (compressed)
    dir_compressed = ole.openstream(dir_path).read()

    def check_value(name, expected, value):
        """Log (relaxed mode) or raise UnexpectedDataError if value != expected."""
        if expected != value:
            if relaxed:
                log.error("invalid value for {0} expected {1:04X} got {2:04X}"
                          .format(name, expected, value))
            else:
                raise UnexpectedDataError(dir_path, name, expected, value)

    dir_stream = BytesIO(decompress_stream(bytearray(dir_compressed)))

    def read_u16():
        """Read an unsigned little-endian 16-bit integer from the dir stream."""
        return struct.unpack("<H", dir_stream.read(2))[0]

    def read_u32():
        """Read an unsigned little-endian 32-bit integer from the dir stream."""
        return struct.unpack("<L", dir_stream.read(4))[0]

    # NOTE: fields that are read but not used below are still consumed with
    # bare read calls, because the records must be parsed strictly in sequence.

    # PROJECTSYSKIND Record
    projectsyskind_id = read_u16()
    check_value('PROJECTSYSKIND_Id', 0x0001, projectsyskind_id)
    projectsyskind_size = read_u32()
    check_value('PROJECTSYSKIND_Size', 0x0004, projectsyskind_size)
    projectsyskind_syskind = read_u32()
    if projectsyskind_syskind == 0x00:
        log.debug("16-bit Windows")
    elif projectsyskind_syskind == 0x01:
        log.debug("32-bit Windows")
    elif projectsyskind_syskind == 0x02:
        log.debug("Macintosh")
    elif projectsyskind_syskind == 0x03:
        log.debug("64-bit Windows")
    else:
        log.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(projectsyskind_syskind))

    # PROJECTLCID Record
    projectlcid_id = read_u16()
    check_value('PROJECTLCID_Id', 0x0002, projectlcid_id)
    projectlcid_size = read_u32()
    check_value('PROJECTLCID_Size', 0x0004, projectlcid_size)
    projectlcid_lcid = read_u32()
    check_value('PROJECTLCID_Lcid', 0x409, projectlcid_lcid)

    # PROJECTLCIDINVOKE Record
    projectlcidinvoke_id = read_u16()
    check_value('PROJECTLCIDINVOKE_Id', 0x0014, projectlcidinvoke_id)
    projectlcidinvoke_size = read_u32()
    check_value('PROJECTLCIDINVOKE_Size', 0x0004, projectlcidinvoke_size)
    projectlcidinvoke_lcidinvoke = read_u32()
    check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, projectlcidinvoke_lcidinvoke)

    # PROJECTCODEPAGE Record
    projectcodepage_id = read_u16()
    check_value('PROJECTCODEPAGE_Id', 0x0003, projectcodepage_id)
    projectcodepage_size = read_u32()
    check_value('PROJECTCODEPAGE_Size', 0x0002, projectcodepage_size)
    # the code page is used later to decode the module stream names
    projectcodepage_codepage = read_u16()

    # PROJECTNAME Record
    projectname_id = read_u16()
    check_value('PROJECTNAME_Id', 0x0004, projectname_id)
    projectname_sizeof_projectname = read_u32()
    if projectname_sizeof_projectname < 1 or projectname_sizeof_projectname > 128:
        log.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(projectname_sizeof_projectname))
    dir_stream.read(projectname_sizeof_projectname)  # ProjectName (not used)

    # PROJECTDOCSTRING Record
    projectdocstring_id = read_u16()
    check_value('PROJECTDOCSTRING_Id', 0x0005, projectdocstring_id)
    projectdocstring_sizeof_docstring = read_u32()
    if projectdocstring_sizeof_docstring > 2000:
        log.error(
            "PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(projectdocstring_sizeof_docstring))
    dir_stream.read(projectdocstring_sizeof_docstring)  # DocString (not used)
    projectdocstring_reserved = read_u16()
    check_value('PROJECTDOCSTRING_Reserved', 0x0040, projectdocstring_reserved)
    projectdocstring_sizeof_docstring_unicode = read_u32()
    if projectdocstring_sizeof_docstring_unicode % 2 != 0:
        log.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even")
    dir_stream.read(projectdocstring_sizeof_docstring_unicode)  # DocStringUnicode (not used)

    # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7
    projecthelpfilepath_id = read_u16()
    check_value('PROJECTHELPFILEPATH_Id', 0x0006, projecthelpfilepath_id)
    projecthelpfilepath_sizeof_helpfile1 = read_u32()
    if projecthelpfilepath_sizeof_helpfile1 > 260:
        log.error(
            "PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(projecthelpfilepath_sizeof_helpfile1))
    projecthelpfilepath_helpfile1 = dir_stream.read(projecthelpfilepath_sizeof_helpfile1)
    projecthelpfilepath_reserved = read_u16()
    check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, projecthelpfilepath_reserved)
    projecthelpfilepath_sizeof_helpfile2 = read_u32()
    if projecthelpfilepath_sizeof_helpfile2 != projecthelpfilepath_sizeof_helpfile1:
        log.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2")
    projecthelpfilepath_helpfile2 = dir_stream.read(projecthelpfilepath_sizeof_helpfile2)
    if projecthelpfilepath_helpfile2 != projecthelpfilepath_helpfile1:
        log.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2")

    # PROJECTHELPCONTEXT Record
    projecthelpcontext_id = read_u16()
    check_value('PROJECTHELPCONTEXT_Id', 0x0007, projecthelpcontext_id)
    projecthelpcontext_size = read_u32()
    check_value('PROJECTHELPCONTEXT_Size', 0x0004, projecthelpcontext_size)
    read_u32()  # HelpContext (not used)

    # PROJECTLIBFLAGS Record
    projectlibflags_id = read_u16()
    check_value('PROJECTLIBFLAGS_Id', 0x0008, projectlibflags_id)
    projectlibflags_size = read_u32()
    check_value('PROJECTLIBFLAGS_Size', 0x0004, projectlibflags_size)
    projectlibflags_projectlibflags = read_u32()
    check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, projectlibflags_projectlibflags)

    # PROJECTVERSION Record
    projectversion_id = read_u16()
    check_value('PROJECTVERSION_Id', 0x0009, projectversion_id)
    projectversion_reserved = read_u32()
    check_value('PROJECTVERSION_Reserved', 0x0004, projectversion_reserved)
    read_u32()  # VersionMajor (not used)
    read_u16()  # VersionMinor (not used)

    # PROJECTCONSTANTS Record
    projectconstants_id = read_u16()
    check_value('PROJECTCONSTANTS_Id', 0x000C, projectconstants_id)
    projectconstants_sizeof_constants = read_u32()
    if projectconstants_sizeof_constants > 1015:
        log.error(
            "PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(projectconstants_sizeof_constants))
    dir_stream.read(projectconstants_sizeof_constants)  # Constants (not used)
    projectconstants_reserved = read_u16()
    check_value('PROJECTCONSTANTS_Reserved', 0x003C, projectconstants_reserved)
    projectconstants_sizeof_constants_unicode = read_u32()
    if projectconstants_sizeof_constants_unicode % 2 != 0:
        log.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even")
    dir_stream.read(projectconstants_sizeof_constants_unicode)  # ConstantsUnicode (not used)

    # array of REFERENCE records, terminated by the PROJECTMODULES id (0x000F)
    check = None
    while True:
        check = read_u16()
        log.debug("reference type = {0:04X}".format(check))
        if check == 0x000F:
            break

        if check == 0x0016:
            # REFERENCENAME
            reference_sizeof_name = read_u32()
            dir_stream.read(reference_sizeof_name)  # Name (not used)
            reference_reserved = read_u16()
            # According to [MS-OVBA] 2.3.4.2.2.2 REFERENCENAME Record:
            # "Reserved (2 bytes): MUST be 0x003E. MUST be ignored."
            # So let's ignore it, otherwise it crashes on some files (issue #132)
            # PR #135 by @c1fe:
            # contrary to the specification I think that the unicode name
            # is optional. if reference_reserved is not 0x003E I think it
            # is actually the start of another REFERENCE record
            # at least when projectsyskind_syskind == 0x02 (Macintosh)
            if reference_reserved == 0x003E:
                reference_sizeof_name_unicode = read_u32()
                dir_stream.read(reference_sizeof_name_unicode)  # NameUnicode (not used)
                continue
            else:
                # treat the 2 bytes just read as the id of the next record
                # and fall through to the record handlers below
                check = reference_reserved
                log.debug("reference type = {0:04X}".format(check))

        if check == 0x0033:
            # REFERENCEORIGINAL (followed by REFERENCECONTROL)
            referenceoriginal_sizeof_libidoriginal = read_u32()
            dir_stream.read(referenceoriginal_sizeof_libidoriginal)  # LibidOriginal (not used)
            continue

        if check == 0x002F:
            # REFERENCECONTROL
            read_u32()  # SizeTwiddled (ignored)
            referencecontrol_sizeof_libidtwiddled = read_u32()
            dir_stream.read(referencecontrol_sizeof_libidtwiddled)  # LibidTwiddled (not used)
            referencecontrol_reserved1 = read_u32()
            check_value('REFERENCECONTROL_Reserved1', 0x0000, referencecontrol_reserved1)
            referencecontrol_reserved2 = read_u16()
            check_value('REFERENCECONTROL_Reserved2', 0x0000, referencecontrol_reserved2)
            # optional field: NameRecordExtended (id 0x0016), otherwise the
            # 2 bytes just read are already Reserved3
            check2 = read_u16()
            if check2 == 0x0016:
                referencecontrol_namerecordextended_sizeof_name = read_u32()
                dir_stream.read(referencecontrol_namerecordextended_sizeof_name)  # Name (not used)
                referencecontrol_namerecordextended_reserved = read_u16()
                if referencecontrol_namerecordextended_reserved == 0x003E:
                    referencecontrol_namerecordextended_sizeof_name_unicode = read_u32()
                    # NameUnicode (not used)
                    dir_stream.read(referencecontrol_namerecordextended_sizeof_name_unicode)
                    referencecontrol_reserved3 = read_u16()
                else:
                    # no unicode name: the value read is already Reserved3
                    referencecontrol_reserved3 = referencecontrol_namerecordextended_reserved
            else:
                referencecontrol_reserved3 = check2

            check_value('REFERENCECONTROL_Reserved3', 0x0030, referencecontrol_reserved3)
            read_u32()  # SizeExtended (not used)
            referencecontrol_sizeof_libidextended = read_u32()
            dir_stream.read(referencecontrol_sizeof_libidextended)  # LibidExtended (not used)
            read_u32()  # Reserved4 (not used)
            read_u16()  # Reserved5 (not used)
            dir_stream.read(16)  # OriginalTypeLib (not used)
            read_u32()  # Cookie (not used)
            continue

        if check == 0x000D:
            # REFERENCEREGISTERED
            read_u32()  # Size (not used)
            referenceregistered_sizeof_libid = read_u32()
            dir_stream.read(referenceregistered_sizeof_libid)  # Libid (not used)
            referenceregistered_reserved1 = read_u32()
            check_value('REFERENCEREGISTERED_Reserved1', 0x0000, referenceregistered_reserved1)
            referenceregistered_reserved2 = read_u16()
            check_value('REFERENCEREGISTERED_Reserved2', 0x0000, referenceregistered_reserved2)
            continue

        if check == 0x000E:
            # REFERENCEPROJECT
            read_u32()  # Size (not used)
            referenceproject_sizeof_libidabsolute = read_u32()
            dir_stream.read(referenceproject_sizeof_libidabsolute)  # LibidAbsolute (not used)
            referenceproject_sizeof_libidrelative = read_u32()
            dir_stream.read(referenceproject_sizeof_libidrelative)  # LibidRelative (not used)
            read_u32()  # MajorVersion (not used)
            read_u16()  # MinorVersion (not used)
            continue

        log.error('invalid or unknown check Id {0:04X}'.format(check))
        # raise an exception instead of stopping abruptly (issue #180)
        raise UnexpectedDataError(dir_path, 'reference type', (0x0F, 0x16, 0x33, 0x2F, 0x0D, 0x0E), check)

    # PROJECTMODULES Record: its id was already read by the loop above
    projectmodules_id = check
    check_value('PROJECTMODULES_Id', 0x000F, projectmodules_id)
    projectmodules_size = read_u32()
    check_value('PROJECTMODULES_Size', 0x0002, projectmodules_size)
    projectmodules_count = read_u16()
    projectmodules_projectcookierecord_id = read_u16()
    check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, projectmodules_projectcookierecord_id)
    projectmodules_projectcookierecord_size = read_u32()
    check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, projectmodules_projectcookierecord_size)
    read_u16()  # ProjectCookieRecord Cookie (not used)

    # short function to simplify unicode text output
    # NOTE: this returns bytes on Python 3, so it must only be used as a
    # formatting argument in log messages, never concatenated with str
    uni_out = lambda unicode_text: unicode_text.encode('utf-8', 'replace')

    log.debug("parsing {0} modules".format(projectmodules_count))
    for projectmodule_index in xrange(0, projectmodules_count):
        try:
            modulename_id = read_u16()
            check_value('MODULENAME_Id', 0x0019, modulename_id)
            modulename_sizeof_modulename = read_u32()
            modulename_modulename = dir_stream.read(modulename_sizeof_modulename).decode('utf-8', 'backslashreplace')
            # Reset per-module values, so that a module with a missing record
            # never silently reuses the values parsed for the previous module:
            modulename_unicode_modulename_unicode = ''
            modulestreamname_streamname = None
            modulestreamname_streamname_unicode = None
            moduleoffset_textoffset = None
            # account for optional sections: each handler reads the id of the
            # next section when it is done, so the checks below form a chain
            section_id = read_u16()
            if section_id == 0x0047:
                # MODULENAMEUNICODE
                modulename_unicode_sizeof_modulename_unicode = read_u32()
                # just guessing that this is the same encoding as used in OleFileIO
                modulename_unicode_modulename_unicode = dir_stream.read(
                    modulename_unicode_sizeof_modulename_unicode).decode('UTF-16LE', 'replace')
                section_id = read_u16()
            if section_id == 0x001A:
                # MODULESTREAMNAME
                modulestreamname_sizeof_streamname = read_u32()
                modulestreamname_streamname = dir_stream.read(modulestreamname_sizeof_streamname)
                modulestreamname_reserved = read_u16()
                check_value('MODULESTREAMNAME_Reserved', 0x0032, modulestreamname_reserved)
                modulestreamname_sizeof_streamname_unicode = read_u32()
                # just guessing that this is the same encoding as used in OleFileIO
                modulestreamname_streamname_unicode = dir_stream.read(
                    modulestreamname_sizeof_streamname_unicode).decode('UTF-16LE', 'replace')
                section_id = read_u16()
            if section_id == 0x001C:
                # MODULEDOCSTRING
                moduledocstring_sizeof_docstring = read_u32()
                dir_stream.read(moduledocstring_sizeof_docstring)  # DocString (not used)
                moduledocstring_reserved = read_u16()
                check_value('MODULEDOCSTRING_Reserved', 0x0048, moduledocstring_reserved)
                moduledocstring_sizeof_docstring_unicode = read_u32()
                dir_stream.read(moduledocstring_sizeof_docstring_unicode)  # DocStringUnicode (not used)
                section_id = read_u16()
            if section_id == 0x0031:
                # MODULEOFFSET
                moduleoffset_size = read_u32()
                check_value('MODULEOFFSET_Size', 0x0004, moduleoffset_size)
                moduleoffset_textoffset = read_u32()
                section_id = read_u16()
            if section_id == 0x001E:
                # MODULEHELPCONTEXT
                modulehelpcontext_size = read_u32()
                check_value('MODULEHELPCONTEXT_Size', 0x0004, modulehelpcontext_size)
                read_u32()  # HelpContext (not used)
                section_id = read_u16()
            if section_id == 0x002C:
                # MODULECOOKIE
                modulecookie_size = read_u32()
                check_value('MODULECOOKIE_Size', 0x0002, modulecookie_size)
                read_u16()  # Cookie (not used)
                section_id = read_u16()
            if section_id == 0x0021 or section_id == 0x0022:
                # MODULETYPE
                read_u32()  # Reserved (not used)
                section_id = read_u16()
            if section_id == 0x0025:
                # MODULEREADONLY
                modulereadonly_reserved = read_u32()
                check_value('MODULEREADONLY_Reserved', 0x0000, modulereadonly_reserved)
                section_id = read_u16()
            if section_id == 0x0028:
                # MODULEPRIVATE
                moduleprivate_reserved = read_u32()
                check_value('MODULEPRIVATE_Reserved', 0x0000, moduleprivate_reserved)
                section_id = read_u16()
            if section_id == 0x002B:  # TERMINATOR
                module_reserved = read_u32()
                check_value('MODULE_Reserved', 0x0000, module_reserved)
                section_id = None
            if section_id is not None:
                log.warning('unknown or invalid module section id {0:04X}'.format(section_id))

            # Both records are needed below; fail explicitly instead of hitting
            # an unbound variable or reusing data from the previous module.
            if modulestreamname_streamname is None:
                raise ValueError('MODULESTREAMNAME record not found for module {0!r}'
                                 .format(modulename_modulename))
            if moduleoffset_textoffset is None:
                raise ValueError('MODULEOFFSET record not found for module {0!r}'
                                 .format(modulename_modulename))

            log.debug('Project CodePage = %d' % projectcodepage_codepage)
            if projectcodepage_codepage in MAC_CODEPAGES:
                vba_codec = MAC_CODEPAGES[projectcodepage_codepage]
            else:
                vba_codec = 'cp%d' % projectcodepage_codepage
            log.debug("ModuleName = {0}".format(modulename_modulename))
            log.debug("ModuleNameUnicode = {0}".format(uni_out(modulename_unicode_modulename_unicode)))
            log.debug("StreamName = {0}".format(modulestreamname_streamname))
            try:
                streamname_unicode = modulestreamname_streamname.decode(vba_codec)
            except UnicodeError:
                # log the raw bytes: the decoded name does not exist at this point
                log.debug('failed to decode stream name {0!r} with codec {1}'
                          .format(modulestreamname_streamname, vba_codec))
                streamname_unicode = modulestreamname_streamname.decode(vba_codec, errors='replace')
            log.debug("StreamName.decode('%s') = %s" % (vba_codec, uni_out(streamname_unicode)))
            log.debug("StreamNameUnicode = {0}".format(uni_out(modulestreamname_streamname_unicode)))
            log.debug("TextOffset = {0}".format(moduleoffset_textoffset))

            # Try the candidate stream names in turn, keep the first that opens
            code_data = None
            try_names = streamname_unicode, \
                        modulename_unicode_modulename_unicode, \
                        modulestreamname_streamname_unicode
            for stream_name in try_names:
                # TODO: if olefile._find were less private, could replace this
                #        try-except with calls to it
                try:
                    code_path = vba_root + u'VBA/' + stream_name
                    log.debug('opening VBA code stream %s' % uni_out(code_path))
                    code_data = ole.openstream(code_path).read()
                    break
                except IOError as ioe:
                    log.debug('failed to open stream VBA/%r (%r), try other name'
                              % (uni_out(stream_name), ioe))

            if code_data is None:
                # the names are text already: do not go through uni_out here,
                # since concatenating its bytes result with str raises TypeError
                log.info("Could not open stream %d of %d ('VBA/' + one of %r)!"
                         % (projectmodule_index, projectmodules_count,
                            '/'.join("'" + stream_name + "'"
                                     for stream_name in try_names)))
                if relaxed:
                    continue   # ... with next submodule
                else:
                    raise SubstreamOpenError('[BASE]', 'VBA/' +
                                             modulename_unicode_modulename_unicode)

            log.debug("length of code_data = {0}".format(len(code_data)))
            log.debug("offset of code_data = {0}".format(moduleoffset_textoffset))
            # the compressed source code starts at TextOffset in the stream
            code_data = code_data[moduleoffset_textoffset:]
            if len(code_data) > 0:
                code_data = decompress_stream(bytearray(code_data))
                # case-insensitive search in the code_modules dict to find the file extension:
                filext = code_modules.get(modulename_modulename.lower(), 'bin')
                filename = '{0}.{1}'.format(modulename_modulename, filext)
                #TODO: also yield the codepage so that callers can decode it properly
                yield (code_path, filename, code_data)
                log.debug('extracted file {0}'.format(filename))
            else:
                log.warning("module stream {0} has code data length 0".format(modulestreamname_streamname))
        except (UnexpectedDataError, SubstreamOpenError):
            raise
        except Exception:
            # any other parsing error: always logged, only fatal in strict mode
            log.info('Error parsing module {0} of {1} in _extract_vba:'
                     .format(projectmodule_index, projectmodules_count),
                     exc_info=True)
            if not relaxed:
                raise
    return
1841   -
1842   -
def vba_collapse_long_lines(vba_code):
    """
    Join VBA statements that were split over several lines.

    A line continuation is a space followed by an underscore at the very end
    of a line. Each continuation marker, together with the line ending that
    follows it, is replaced by a single space.

    :param vba_code: str, VBA module code
    :return: str, VBA module code with long lines collapsed
    """
    # TODO: use a regex instead, to allow whitespaces after the underscore?
    # The order matters: CRLF must be handled before the lone CR and LF forms.
    for continuation in (' _\r\n', ' _\r', ' _\n'):
        vba_code = vba_code.replace(continuation, ' ')
    return vba_code
1856   -
1857   -
def filter_vba(vba_code):
    """
    Strip the leading "Attribute VB_" header lines from VBA source code.

    These lines are added automatically by MS Office and are not shown in
    the VBA Editor. Only the uninterrupted run of such lines at the very top
    is dropped. Intended for display to a human analyst only.

    Note: a header line containing a colon is kept (and ends the run),
    because a colon could be used to hide malicious instructions on the
    same line.

    :param vba_code: str, VBA source code
    :return: str, filtered VBA source code, lines joined with '\\n'
    """
    lines = vba_code.splitlines()
    skip = 0
    while skip < len(lines):
        current = lines[skip]
        if not current.startswith("Attribute VB_") or ':' in current:
            break
        skip += 1
    #TODO: also remove empty lines?
    return '\n'.join(lines[skip:])
1880   -
1881   -
def detect_autoexec(vba_code, obfuscation=None):
    """
    Look for keywords of macros that run automatically when triggered by
    specific actions (e.g. when a document is opened or closed).

    Every keyword listed in AUTOEXEC_KEYWORDS is searched case-insensitively
    as a whole word; keywords are inserted into the regex unescaped. Only
    the first occurrence of each keyword is reported.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword as found in the code, description)
    """
    #TODO: merge code with detect_suspicious
    suffix = (' (obfuscation: %s)' % obfuscation) if obfuscation else ''
    hits = []
    for description, keywords in AUTOEXEC_KEYWORDS.items():
        for keyword in keywords:
            #TODO: if keyword is already a compiled regex, use it as-is
            # word boundaries avoid matching inside longer identifiers:
            found = re.search(r'(?i)\b' + keyword + r'\b', vba_code)
            if found is None:
                continue
            hits.append((found.group(), description + suffix))
    return hits
1909   -
1910   -
def detect_suspicious(vba_code, obfuscation=None):
    """
    Look for suspicious keywords corresponding to potential malware
    behaviour.

    Every keyword listed in SUSPICIOUS_KEYWORDS is searched
    case-insensitively as a whole word, after being regex-escaped. Only the
    first occurrence of each keyword is reported.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword as found in the code, description)
    """
    suffix = (' (obfuscation: %s)' % obfuscation) if obfuscation else ''
    hits = []
    for description, keywords in SUSPICIOUS_KEYWORDS.items():
        for keyword in keywords:
            # word boundaries avoid matching inside longer identifiers:
            pattern = r'(?i)\b' + re.escape(keyword) + r'\b'
            found = re.search(pattern, vba_code)
            if found is None:
                continue
            hits.append((found.group(), description + suffix))
    return hits
1935   -
1936   -
def detect_patterns(vba_code, obfuscation=None):
    """
    Look for specific patterns such as IP addresses, URLs, e-mail addresses,
    executable file names, etc.

    Each (type, compiled regex) pair of RE_PATTERNS is run over the code.
    A given matched value is reported once only, under the first pattern
    type that produced it.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to the pattern type
    :return: list of str tuples (pattern type, value)
    """
    suffix = (' (obfuscation: %s)' % obfuscation) if obfuscation else ''
    seen = set()
    hits = []
    for pattern_type, pattern_re in RE_PATTERNS:
        for found in pattern_re.finditer(vba_code):
            value = found.group()
            if value in seen:
                continue
            hits.append((pattern_type + suffix, value))
            seen.add(value)
    return hits
1957   -
1958   -
def detect_hex_strings(vba_code):
    """
    Look for strings encoded in hexadecimal.

    Each distinct match of re_hex_string is hex-decoded, then decoded as
    UTF-8 text; bytes that are not valid UTF-8 are kept as backslash escapes.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    seen = set()
    pairs = []
    for hit in re_hex_string.finditer(vba_code):
        encoded = hit.group()
        if encoded in seen:
            continue
        raw = binascii.unhexlify(encoded)
        pairs.append((encoded, raw.decode('utf-8', 'backslashreplace')))
        seen.add(encoded)
    return pairs
1975   -
1976   -
def detect_base64_strings(vba_code):
    """
    Look for strings encoded in base64.

    Candidates come from re_base64_string. A candidate is ignored when
    re_nothex_check finds nothing in it (it is then just a hex string), when
    it was already reported, or when its lowercase form is in
    BASE64_WHITELIST. Decoded bytes are turned into text as UTF-8, with
    undecodable bytes replaced.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    #TODO: avoid matching simple hex strings as base64?
    seen = set()
    pairs = []
    for hit in re_base64_string.finditer(vba_code):
        # the regex match includes the surrounding double quotes:
        candidate = hit.group().strip('"')
        if not re_nothex_check.search(candidate):
            # nothing but hex digits: not reported as base64
            continue
        if candidate in seen or candidate.lower() in BASE64_WHITELIST:
            continue
        try:
            raw = base64.b64decode(candidate)
            pairs.append((candidate, raw.decode('utf-8','replace')))
            seen.add(candidate)
        except (TypeError, ValueError) as exc:
            # if an exception occurs, it is likely not a base64-encoded string
            log.debug('Failed to base64-decode (%s)' % exc)
    return pairs
2003   -
2004   -
def detect_dridex_strings(vba_code):
    """
    Look for strings obfuscated with a specific algorithm found in Dridex
    samples.

    Candidates come from re_dridex_string, with their first and last
    characters removed. A candidate is ignored when re_nothex_check finds
    nothing in it, or when it was already reported. Candidates that the
    decoder rejects are silently skipped (logged at debug level).

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    # TODO: move this at the beginning of script
    from oletools.thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode

    seen = set()
    pairs = []
    for hit in re_dridex_string.finditer(vba_code):
        # drop the delimiter character on each side of the match:
        candidate = hit.group()[1:-1]
        if not re_nothex_check.search(candidate):
            # nothing but hex digits: not reported as Dridex
            continue
        if candidate in seen:
            continue
        try:
            plain = DridexUrlDecode(candidate)
            pairs.append((candidate, plain))
            seen.add(candidate)
        except Exception as exc:
            # if an exception occurs, it is likely not a dridex-encoded string
            log.debug('Failed to Dridex-decode (%s)' % exc)
    return pairs
2031   -
2032   -
def detect_vba_strings(vba_code):
    """
    Look for strings obfuscated with VBA expressions using keywords such as
    Chr, Asc, Val, StrReverse, etc.

    The code is scanned line by line with the vba_expr_str grammar. Only
    results of type VbaExpressionString are reported (plain string literals
    are ignored), once per distinct source expression, and only when the
    decoded value differs from the source text.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    # TODO: handle exceptions
    seen = set()
    pairs = []
    # IMPORTANT: tabs must be expanded so that the start/end offsets returned
    # by pyparsing index into the same text that pyparsing actually scanned.
    expanded = vba_code.expandtabs()
    # Scanning one line at a time avoids MemoryError on large scripts:
    for vba_line in expanded.splitlines():
        for tokens, start, end in vba_expr_str.scanString(vba_line):
            decoded = tokens[0]
            if not isinstance(decoded, VbaExpressionString):
                # a simple string literal, not an expression
                continue
            encoded = vba_line[start:end]
            # avoid duplicates and expressions that decode to themselves:
            if encoded in seen or decoded == encoded:
                continue
            pairs.append((encoded, decoded))
            seen.add(encoded)
    return pairs
2068   -
2069   -
def json2ascii(json_obj, encoding='utf8', errors='replace'):
    """
    Make sure a JSON-like structure contains text strings only, so that it
    can be dumped with json.dumps without trouble.

    Works recursively:
    - None, bool, int, float and str are returned unchanged
    - bytes are decoded to str using encoding/errors
    - dict values are converted in place (keys are left untouched)
    - list items are converted in place
    - tuples are immutable, so a new tuple with converted items is returned
    - any other type is returned as is (logged at debug level)

    :param json_obj: object to convert
    :param encoding: str, codec used to decode bytes (default 'utf8')
    :param errors: str, error handler used to decode bytes (default 'replace')
    :return: the converted object (same object for dict and list)
    """
    if json_obj is None or isinstance(json_obj, (bool, int, float, str)):
        # already safe for json
        return json_obj
    if isinstance(json_obj, bytes):
        decoded = json_obj.decode(encoding, errors)
        # cannot put original into logger
        log.debug('json2ascii: encode unicode: {0}'.format(decoded))
        return decoded
    if isinstance(json_obj, dict):
        # only values are replaced, so iterating over the keys is safe;
        # encoding/errors are forwarded so nested bytes use the same codec
        for key in json_obj:
            json_obj[key] = json2ascii(json_obj[key], encoding, errors)
        return json_obj
    if isinstance(json_obj, list):
        # slice assignment keeps the caller's list object but really stores
        # the converted items (rebinding a loop variable would not)
        json_obj[:] = [json2ascii(item, encoding, errors) for item in json_obj]
        return json_obj
    if isinstance(json_obj, tuple):
        return tuple(json2ascii(item, encoding, errors) for item in json_obj)
    log.debug('unexpected type in json2ascii: {0} -- leave as is'
              .format(type(json_obj)))
    return json_obj
2105   -
2106   -
def print_json(json_dict=None, _json_is_first=False, _json_is_last=False,
               **json_parts):
    """ print one entry of a top-level json list, line by line

    The entry is passed through json2ascii, dumped with json.dumps
    (indent=4) and every output line is printed with an extra prefix.

    can use in two ways:
    (1) print_json(some_dict)
    (2) print_json(key1=value1, key2=value2, ...)

    :param dict json_dict: the entry to print, for usage (1)
    :param bool _json_is_first: set to True only for very first entry, to
                                print the opening '[' of the top-level list
    :param bool _json_is_last: set to True only for very last entry, to
                               print the closing ']' instead of a trailing comma
    :param json_parts: the entry to print given as key=value, for usage (2)
    :raises ValueError: if both a dict and key=value parts are given, or if
                        json_dict is neither None nor a dict
    """
    if json_dict and json_parts:
        raise ValueError('Invalid json argument: want either single dict or '
                         'key=value parts but got both)')
    if json_dict is not None and not isinstance(json_dict, dict):
        raise ValueError('Invalid json argument: want either single dict or '
                         'key=value parts but got {0} instead of dict)'
                         .format(type(json_dict)))
    entry = json_parts if json_parts else json_dict

    if _json_is_first:
        print('[')

    dumped = json.dumps(json2ascii(entry), check_circular=False,
                        indent=4, ensure_ascii=False)
    rendered = dumped.splitlines()
    closing = rendered.pop()
    for text in rendered:
        print(' {0}'.format(text))
    if not _json_is_last:
        # more entries follow: the entry must end with a comma
        print(' {0},'.format(closing))
        return
    # very last entry: no comma, and close the top-level list
    print(' {0}'.format(closing))
    print(']')
2142   -
2143   -
class VBA_Scanner(object):
    """
    Class to scan the source code of a VBA module to find obfuscated strings,
    suspicious keywords, IOCs, auto-executable macros, etc.
    """

    def __init__(self, vba_code):
        """
        VBA_Scanner constructor

        :param vba_code: str or bytes, VBA source code to be analyzed.
            bytes are decoded as UTF-8, invalid bytes being kept as
            backslash escapes.
        """
        if isinstance(vba_code, bytes):
            vba_code = vba_code.decode('utf-8', 'backslashreplace')
        # join long lines ending with " _":
        self.code = vba_collapse_long_lines(vba_code)
        # decoded strings of each kind, appended one per line by scan():
        self.code_hex = ''
        self.code_hex_rev = ''
        self.code_rev_hex = ''
        self.code_base64 = ''
        self.code_dridex = ''
        self.code_vba = ''
        # None before scanning, then True if the code mentions StrReverse
        self.strReverse = None
        # results = None before scanning, then a list of tuples after scanning
        self.results = None
        self.autoexec_keywords = None
        self.suspicious_keywords = None
        self.iocs = None
        self.hex_strings = None
        self.base64_strings = None
        self.dridex_strings = None
        self.vba_strings = None

    def scan(self, include_decoded_strings=False, deobfuscate=False):
        """
        Analyze the provided VBA code to detect suspicious keywords,
        auto-executable macros, IOC patterns, obfuscation patterns
        such as hex-encoded strings.

        :param include_decoded_strings: bool, if True, all encoded strings will be included with their decoded content.
        :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
        :return: list of tuples (type, keyword, description)
            (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String',
            'Dridex string' or 'VBA string')
        """
        # Start from a clean state, so that calling scan() more than once
        # does not append the same decoded strings again:
        self.code_hex = ''
        self.code_hex_rev = ''
        self.code_rev_hex = ''
        self.code_base64 = ''
        self.code_dridex = ''
        self.code_vba = ''
        # First, detect and extract hex-encoded strings:
        self.hex_strings = detect_hex_strings(self.code)
        # detect if the code contains StrReverse:
        self.strReverse = 'strreverse' in self.code.lower()
        # Then append the decoded strings to the VBA code, to detect obfuscated IOCs and keywords:
        for encoded, decoded in self.hex_strings:
            self.code_hex += '\n' + decoded
            # if the code contains "StrReverse", also append the hex strings in reverse order:
            if self.strReverse:
                # StrReverse after hex decoding:
                self.code_hex_rev += '\n' + decoded[::-1]
                # StrReverse before hex decoding.
                # The bytes must be decoded to text: str() on a bytes object
                # would yield its repr ("b'...'") instead of the content.
                reversed_bytes = binascii.unhexlify(encoded[::-1])
                self.code_rev_hex += '\n' + reversed_bytes.decode(
                    'utf-8', 'backslashreplace')
                #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/
        #TODO: also append the full code reversed if StrReverse? (risk of false positives?)
        # Detect Base64-encoded strings
        self.base64_strings = detect_base64_strings(self.code)
        for encoded, decoded in self.base64_strings:
            self.code_base64 += '\n' + decoded
        # Detect Dridex-encoded strings
        self.dridex_strings = detect_dridex_strings(self.code)
        for encoded, decoded in self.dridex_strings:
            self.code_dridex += '\n' + decoded
        # Detect obfuscated strings in VBA expressions
        if deobfuscate:
            self.vba_strings = detect_vba_strings(self.code)
        else:
            self.vba_strings = []
        for encoded, decoded in self.vba_strings:
            self.code_vba += '\n' + decoded
        results = []
        self.autoexec_keywords = []
        self.suspicious_keywords = []
        self.iocs = []

        # run the keyword and IOC detection on the original code and on
        # each set of decoded strings, tagging hits with the obfuscation:
        for code, obfuscation in (
                (self.code, None),
                (self.code_hex, 'Hex'),
                (self.code_hex_rev, 'Hex+StrReverse'),
                (self.code_rev_hex, 'StrReverse+Hex'),
                (self.code_base64, 'Base64'),
                (self.code_dridex, 'Dridex'),
                (self.code_vba, 'VBA expression'),
        ):
            if isinstance(code, bytes):
                code = code.decode('utf-8', 'backslashreplace')
            self.autoexec_keywords += detect_autoexec(code, obfuscation)
            self.suspicious_keywords += detect_suspicious(code, obfuscation)
            self.iocs += detect_patterns(code, obfuscation)

        # If hex-encoded strings were discovered, add an item to suspicious keywords:
        if self.hex_strings:
            self.suspicious_keywords.append(('Hex Strings',
                'Hex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.base64_strings:
            self.suspicious_keywords.append(('Base64 Strings',
                'Base64-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.dridex_strings:
            self.suspicious_keywords.append(('Dridex Strings',
                'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.vba_strings:
            self.suspicious_keywords.append(('VBA obfuscated Strings',
                'VBA string expressions were detected, may be used to obfuscate strings (option --decode to see all)'))
        # use a set to avoid duplicate keywords
        keyword_set = set()
        for keyword, description in self.autoexec_keywords:
            if keyword not in keyword_set:
                results.append(('AutoExec', keyword, description))
                keyword_set.add(keyword)
        keyword_set = set()
        for keyword, description in self.suspicious_keywords:
            if keyword not in keyword_set:
                results.append(('Suspicious', keyword, description))
                keyword_set.add(keyword)
        keyword_set = set()
        for pattern_type, value in self.iocs:
            if value not in keyword_set:
                results.append(('IOC', value, pattern_type))
                keyword_set.add(value)

        # include decoded strings only if they are printable or if --decode option:
        for encoded, decoded in self.hex_strings:
            if include_decoded_strings or is_printable(decoded):
                results.append(('Hex String', decoded, encoded))
        for encoded, decoded in self.base64_strings:
            if include_decoded_strings or is_printable(decoded):
                results.append(('Base64 String', decoded, encoded))
        for encoded, decoded in self.dridex_strings:
            if include_decoded_strings or is_printable(decoded):
                results.append(('Dridex string', decoded, encoded))
        for encoded, decoded in self.vba_strings:
            if include_decoded_strings or is_printable(decoded):
                results.append(('VBA string', decoded, encoded))
        self.results = results
        return results

    def scan_summary(self):
        """
        Analyze the provided VBA code to detect suspicious keywords,
        auto-executable macros, IOC patterns, obfuscation patterns
        such as hex-encoded strings.

        The scan is run with default options if it was not run before.

        :return: tuple with the number of items found for each category:
            (autoexec, suspicious, IOCs, hex, base64, dridex, vba)
        """
        # avoid scanning the same code twice:
        if self.results is None:
            self.scan()
        return (len(self.autoexec_keywords), len(self.suspicious_keywords),
                len(self.iocs), len(self.hex_strings), len(self.base64_strings),
                len(self.dridex_strings), len(self.vba_strings))
2301   -
2302   -
def scan_vba(vba_code, include_decoded_strings, deobfuscate=False):
    """
    One-call helper: build a VBA_Scanner for the given code and run its
    scan() method, to detect suspicious keywords, auto-executable macros,
    IOC patterns and obfuscation patterns such as hex-encoded strings.

    :param vba_code: str, VBA source code to be analyzed
    :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content.
    :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
    :return: list of tuples (type, keyword, description), as returned by
        VBA_Scanner.scan
    """
    scanner = VBA_Scanner(vba_code)
    return scanner.scan(include_decoded_strings, deobfuscate)
2317   -
2318   -
2319   -#=== CLASSES =================================================================
2320   -
2321   -class VBA_Parser(object):
2322   - """
2323   - Class to parse MS Office files, to detect VBA macros and extract VBA source code
2324   - Supported file formats:
2325   - - Word 97-2003 (.doc, .dot)
2326   - - Word 2007+ (.docm, .dotm)
2327   - - Word 2003 XML (.xml)
2328   - - Word MHT - Single File Web Page / MHTML (.mht)
2329   - - Excel 97-2003 (.xls)
2330   - - Excel 2007+ (.xlsm, .xlsb)
2331   - - PowerPoint 97-2003 (.ppt)
2332   - - PowerPoint 2007+ (.pptm, .ppsm)
2333   - """
2334   -
def __init__(self, filename, data=None, container=None, relaxed=False):
    """
    Constructor for VBA_Parser: open the file and try each supported
    container format in turn until one is recognised.

    :param filename: filename or path of file to parse, or file-like object

    :param data: None or bytes str, if None the file will be read from disk (or from the file-like object).
        If data is provided as a bytes string, it will be parsed as the content of the file in memory,
        and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb').

    :param container: str, path and filename of container if the file is within
        a zip archive, None otherwise.

    :param relaxed: if True, treat mal-formed documents and missing streams more like MS office:
        do nothing; if False (default), raise errors in these cases

    raises a FileIsEncryptedError if the OLE encryption check is positive
    raises a FileOpenError if the data is RTF, or if all attempts to
    interpret the data header failed
    """
    #TODO: filename should only be a string, data should be used for the file-like object
    #TODO: filename should be mandatory, optional data is a string or file-like object
    #TODO: also support olefile and zipfile as input
    # either the path given by the caller, or the in-memory content wrapped
    # in a file-like object (needed by zipfile):
    _file = filename if data is None else BytesIO(data)
    self.ole_file = None
    self.ole_subfiles = []
    self.filename = filename
    self.container = container
    self.relaxed = relaxed
    self.type = None
    self.vba_projects = None
    self.vba_forms = None
    self.contains_macros = None # will be set to True or False by detect_macros
    self.vba_code_all_modules = None # to store the source code of all modules
    # list of tuples for each module: (subfilename, stream_path, vba_filename, vba_code)
    self.modules = None
    # Analysis results: list of tuples (type, keyword, description) - See VBA_Scanner
    self.analysis_results = None
    # statistics for the scan summary and flags
    self.nb_macros = 0
    self.nb_autoexec = 0
    self.nb_suspicious = 0
    self.nb_iocs = 0
    self.nb_hexstrings = 0
    self.nb_base64strings = 0
    self.nb_dridexstrings = 0
    self.nb_vbastrings = 0

    # --- 1) OLE container ---
    if olefile.isOleFile(_file):
        self.open_ole(_file)

        # check whether file is encrypted (need to do this before try ppt)
        log.debug('Check encryption of ole file')
        encryption_check = oleid.OleID(self.ole_file).check_encrypted()
        if encryption_check.value:
            raise FileIsEncryptedError(filename)

        # if this worked, try whether it is a ppt file (special ole file)
        self.open_ppt()

    # --- 2) Zip file, which may be an OpenXML document ---
    if self.type is None and is_zipfile(_file):
        self.open_openxml(_file)

    # --- 3) formats recognised from the raw content ---
    if self.type is None:
        # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML,
        # or a plain text file containing VBA code
        if data is None:
            with open(filename, 'rb') as file_handle:
                data = file_handle.read()
        # Word 2003 XML file (WordProcessingML): must contain the namespace
        if b'http://schemas.microsoft.com/office/word/2003/wordml' in data:
            self.open_word2003xml(data)
        # Word/PowerPoint 2007+ XML file (Flat OPC): must contain the namespace
        if b'http://schemas.microsoft.com/office/2006/xmlPackage' in data:
            self.open_flatopc(data)
        # MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"):
        # According to my tests, these files usually start with "MIME-Version: 1.0" on the 1st line
        # BUT Word accepts a blank line or other MIME headers inserted before,
        # and even whitespaces in between "MIME", "-", "Version" and ":". The version number is ignored.
        # And the line is case insensitive.
        # so we'll just check the presence of mime, version and multipart anywhere:
        lowered = data.lower()
        looks_like_mht = (b'mime' in lowered and b'version' in lowered
                          and b'multipart' in lowered)
        if self.type is None and looks_like_mht:
            self.open_mht(data)
        #TODO: handle exceptions
        #TODO: Excel 2003 XML
        # Check whether this is rtf
        if rtfobj.is_rtf(data, treat_str_as_data=True):
            # Ignore RTF since it contains no macros and methods in here will not find macros
            # in embedded objects. run rtfobj and repeat on its output.
            msg = '%s is RTF, need to run rtfobj.py and find VBA Macros in its output.' % self.filename
            log.info(msg)
            raise FileOpenError(msg)
        # Check if this is a plain text VBA or VBScript file:
        # To avoid scanning binary files, we simply check for some control chars:
        if self.type is None and b'\x00' not in data:
            self.open_text(data)

    # --- 4) nothing matched ---
    if self.type is None:
        # At this stage, could not match a known format:
        msg = '%s is not a supported file type, cannot extract VBA Macros.' % self.filename
        log.info(msg)
        raise FileOpenError(msg)
2451   -
def open_ole(self, _file):
    """
    Open an OLE file and store it in self.ole_file.

    self.type is set to TYPE_OLE only when olefile manages to parse the
    file; IOError, TypeError and ValueError are logged and swallowed.

    :param _file: filename or file contents in a file object
    :return: nothing
    """
    log.info('Opening OLE file %s' % self.filename)
    try:
        # Open and parse the OLE file, using unicode for path names:
        self.ole_file = olefile.OleFileIO(_file, path_encoding=None)
    except (IOError, TypeError, ValueError) as err:
        # TODO: handle OLE parsing exceptions
        log.info('Failed OLE parsing for file %r (%s)' % (self.filename, err))
        log.debug('Trace:', exc_info=True)
    else:
        # set type only if parsing succeeds
        self.type = TYPE_OLE
2468   -
2469   -
def open_openxml(self, _file):
    """
    Open an OpenXML file (zip archive) and collect the OLE files it contains.

    Every archive member starting with the OLE magic is parsed with a new
    VBA_Parser, which is appended to self.ole_subfiles. self.type is set to
    TYPE_OpenXML only when the whole archive was processed without error.

    :param _file: filename or file contents in a file object
    :return: nothing
    :raises SubstreamOpenError: if a member cannot be parsed and
        self.relaxed is False
    """
    # This looks like a zip file, need to look for vbaProject.bin inside
    # It can be any OLE file inside the archive
    #...because vbaProject.bin can be renamed:
    # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
    log.info('Opening ZIP/OpenXML file %s' % self.filename)
    try:
        # the context manager closes the archive on every exit path,
        # including when parsing a member raises
        with zipfile.ZipFile(_file) as z:
            #TODO: check if this is actually an OpenXML file
            #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically
            # check each file within the zip if it is an OLE file, by reading its magic:
            for subfile in z.namelist():
                with z.open(subfile) as file_handle:
                    magic = file_handle.read(len(olefile.MAGIC))
                if magic != olefile.MAGIC:
                    continue
                log.debug('Opening OLE file %s within zip' % subfile)
                with z.open(subfile) as file_handle:
                    ole_data = file_handle.read()
                try:
                    self.ole_subfiles.append(
                        VBA_Parser(filename=subfile, data=ole_data,
                                   relaxed=self.relaxed))
                except OlevbaBaseException as exc:
                    if not self.relaxed:
                        raise SubstreamOpenError(self.filename, subfile,
                                                 exc)
                    # relaxed mode: report and go on with the next member
                    log.info('%s is not a valid OLE file (%s)' % (subfile, exc))
                    log.debug('Trace:', exc_info=True)
        # set type only if parsing succeeds
        self.type = TYPE_OpenXML
    except OlevbaBaseException as exc:
        if self.relaxed:
            log.info('Error {0} caught in Zip/OpenXML parsing for file {1}'
                     .format(exc, self.filename))
            log.debug('Trace:', exc_info=True)
        else:
            raise
    except (RuntimeError, zipfile.BadZipfile, zipfile.LargeZipFile, IOError) as exc:
        # TODO: handle parsing exceptions
        log.info('Failed Zip/OpenXML parsing for file %r (%s)'
                 % (self.filename, exc))
        log.debug('Trace:', exc_info=True)
2520   -
def open_word2003xml(self, data):
    """
    Open a Word 2003 XML file and collect the OLE containers it embeds.

    Each binData element is base64-decoded; when the result is an MSO file,
    the OLE data extracted from it is parsed with a new VBA_Parser, which is
    appended to self.ole_subfiles. self.type is set to TYPE_Word2003_XML
    only when the whole document was processed without error.

    :param data: file contents in a string or bytes
    :return: nothing
    :raises SubstreamOpenError: if an embedded file cannot be parsed and
        self.relaxed is False
    """
    log.info('Opening Word 2003 XML file %s' % self.filename)
    try:
        # parse the XML content
        # TODO: handle XML parsing exceptions
        et = ET.fromstring(data)
        # find all the binData elements.
        # iter() is used rather than the deprecated getiterator(), which
        # is no longer available in xml.etree since Python 3.9 (the
        # resulting AttributeError would be swallowed by the generic
        # handler below and the file would never be parsed):
        for bindata in et.iter(TAG_BINDATA):
            # the binData content is an OLE container for the VBA project, compressed
            # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
            # get the filename:
            fname = bindata.get(ATTR_NAME, 'noname.mso')
            # decode the base64 activemime
            mso_data = binascii.a2b_base64(bindata.text)
            if not is_mso_file(mso_data):
                log.info('%s is not a valid MSO file' % fname)
                continue
            # decompress the zlib data stored in the MSO file, which is the OLE container:
            # TODO: handle different offsets => separate function
            try:
                ole_data = mso_file_extract(mso_data)
                self.ole_subfiles.append(
                    VBA_Parser(filename=fname, data=ole_data,
                               relaxed=self.relaxed))
            except OlevbaBaseException as exc:
                if self.relaxed:
                    log.info('Error parsing subfile {0}: {1}'
                             .format(fname, exc))
                    log.debug('Trace:', exc_info=True)
                else:
                    raise SubstreamOpenError(self.filename, fname, exc)
        # set type only if parsing succeeds
        self.type = TYPE_Word2003_XML
    except OlevbaBaseException as exc:
        if self.relaxed:
            log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
            log.debug('Trace:', exc_info=True)
        else:
            raise
    except Exception as exc:
        # TODO: differentiate exceptions for each parsing stage
        # (but ET is different libs, no good exception description in API)
        # found: XMLSyntaxError
        log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
        log.debug('Trace:', exc_info=True)
2571   -
def open_flatopc(self, data):
    """
    Open a Word or PowerPoint 2007+ XML file, aka "Flat OPC", and collect
    the VBA projects it embeds.

    For each package part whose content type is CTYPE_VBAPROJECT, every
    binary data element is base64-decoded and parsed with a new VBA_Parser,
    which is appended to self.ole_subfiles. self.type is set to
    TYPE_FlatOPC_XML only when the whole document was processed without
    error.

    :param data: file contents in a string or bytes
    :return: nothing
    :raises SubstreamOpenError: if an embedded file cannot be parsed and
        self.relaxed is False
    """
    log.info('Opening Flat OPC Word/PowerPoint XML file %s' % self.filename)
    try:
        # parse the XML content
        # TODO: handle XML parsing exceptions
        root = ET.fromstring(data)
        # TODO: check root node namespace and tag
        # walk through all the pkg:part elements:
        for part in root.iter(TAG_PKGPART):
            part_name = part.get(ATTR_PKG_NAME, 'unknown')
            part_ctype = part.get(ATTR_PKG_CONTENTTYPE, 'unknown')
            if part_ctype != CTYPE_VBAPROJECT:
                # not a VBA project: nothing to extract from this part
                continue
            for bindata in part.iterfind(TAG_PKGBINDATA):
                try:
                    ole_data = binascii.a2b_base64(bindata.text)
                    self.ole_subfiles.append(
                        VBA_Parser(filename=part_name, data=ole_data,
                                   relaxed=self.relaxed))
                except OlevbaBaseException as exc:
                    if not self.relaxed:
                        raise SubstreamOpenError(self.filename, part_name, exc)
                    log.info('Error parsing subfile {0}: {1}'
                             .format(part_name, exc))
                    log.debug('Trace:', exc_info=True)
        # set type only if parsing succeeds
        self.type = TYPE_FlatOPC_XML
    except OlevbaBaseException as exc:
        if not self.relaxed:
            raise
        log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
        log.debug('Trace:', exc_info=True)
    except Exception as exc:
        # TODO: differentiate exceptions for each parsing stage
        # (but ET is different libs, no good exception description in API)
        # found: XMLSyntaxError
        log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc))
        log.debug('Trace:', exc_info=True)
2616   -
def open_mht(self, data):
    """
    Open a MHTML file.

    The MIME content is parsed with the email package; every part whose
    decoded payload is recognised by is_mso_file() is decompressed with
    mso_file_extract() and wrapped in a new VBA_Parser appended to
    self.ole_subfiles. self.type is set to TYPE_MHTML only if the whole
    message could be walked without an error.

    :param data: file contents in a string or bytes
    :return: nothing
    :raises SubstreamOpenError: if an MSO part fails to parse and relaxed
        mode is off
    """
    log.info('Opening MHTML file %s' % self.filename)
    try:
        if isinstance(data, bytes):
            data = data.decode('utf8', 'backslashreplace')
        # parse the MIME content
        # remove any leading whitespace or newline (workaround for issue in email package)
        text = data.lstrip('\r\n\t ')
        # strip any junk from the beginning of the file
        # (issue #31 fix by Greg C - gdigreg)
        # TODO: improve keywords to avoid false positives
        pos_mime = text.find('MIME')
        pos_content = text.find('Content')
        if -1 < pos_mime <= pos_content:
            # "MIME" is found, and located before "Content"
            text = text[pos_mime:]
        elif pos_content > -1:
            # "Content" is found, and before "MIME"
            # TODO: can it work without "MIME" at all?
            text = text[pos_content:]
        # TODO: quick and dirty fix: insert a standard line with MIME-Version header?
        message = email.message_from_string(text)
        # go through all the attached files:
        for part in message.walk():
            content_type = part.get_content_type()  # always returns a value
            fname = part.get_filename(None)  # returns None if it fails
            # TODO: get content-location if no filename
            log.debug('MHTML part: filename=%r, content-type=%r' % (fname, content_type))
            payload = part.get_payload(decode=True)
            # VBA macros are stored in a binary file named "editdata.mso".
            # the data content is an OLE container for the VBA project, compressed
            # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
            # check ActiveMime header:
            if isinstance(payload, (str, bytes)) and is_mso_file(payload):
                log.debug('Found ActiveMime header, decompressing MSO container')
                try:
                    container = mso_file_extract(payload)
                    # TODO: check if it is actually an OLE file
                    # TODO: get the MSO filename from content_location?
                    subparser = VBA_Parser(filename=fname, data=container,
                                           relaxed=self.relaxed)
                    self.ole_subfiles.append(subparser)
                except OlevbaBaseException as exc:
                    if not self.relaxed:
                        raise SubstreamOpenError(self.filename, fname, exc)
                    log.info('%s does not contain a valid OLE file (%s)'
                             % (fname, exc))
                    log.debug('Trace:', exc_info=True)
                    # TODO: bug here - need to split in smaller functions/classes?
            else:
                log.debug('type(part_data) = %s' % type(payload))
                try:
                    log.debug('part_data[0:20] = %r' % payload[0:20])
                except TypeError:
                    # payload does not support slicing (e.g. None)
                    log.debug('part_data has no __getitem__')
        # set type only if parsing succeeds
        self.type = TYPE_MHTML
    except OlevbaBaseException:
        raise
    except Exception:
        log.info('Failed MIME parsing for file %r - %s'
                 % (self.filename, MSG_OLEVBA_ISSUES))
        log.debug('Trace:', exc_info=True)
2689   -
def open_ppt(self):
    """
    Try to interpret self.ole_file as PowerPoint 97-2003 using PptParser.

    Although self.ole_file is a valid olefile.OleFileIO, it is closed and
    reset to None here; the VBA OLE streams found within the main OLE file
    are stored in self.ole_subfiles instead. That makes most of the other
    methods treat this like an OpenXML file and only look at the
    ole_subfiles (except find_vba_* which needs to explicitly check for
    self.type).

    Any exception is swallowed and only logged at debug level: it is taken
    to mean that the file is not a PPT file.
    """
    log.info('Check whether OLE file is PPT')
    try:
        parser = ppt_parser.PptParser(self.ole_file, fast_fail=True)
        for vba_data in parser.iter_vba_data():
            subparser = VBA_Parser(None, vba_data, container='PptParser')
            self.ole_subfiles.append(subparser)
        log.info('File is PPT')
        self.ole_file.close()  # just in case
        self.ole_file = None   # required to make other methods look at ole_subfiles
        self.type = TYPE_PPT
    except Exception as exc:
        if self.container != 'PptParser':
            log.debug("File appears not to be a ppt file (%s)" % exc)
        else:
            # this is a subfile of a ppt --> to be expected that is no ppt
            log.debug('PPT subfile is not a PPT file')
2717   -
2718   -
def open_text(self, data):
    """
    Open a text file containing VBA or VBScript source code.

    The text is stored as-is in self.vba_code_all_modules (bytes are first
    decoded as UTF-8 with the 'backslashreplace' error handler), the file
    is flagged as containing macros and self.type is set to TYPE_TEXT.

    :param data: file contents in a string or bytes
    :return: nothing
    """
    log.info('Opening text file %s' % self.filename)
    # directly store the source code, as text:
    source = data.decode('utf8', 'backslashreplace') if isinstance(data, bytes) else data
    self.vba_code_all_modules = source
    self.contains_macros = True
    # set type only if parsing succeeds
    self.type = TYPE_TEXT
2733   -
2734   -
def find_vba_projects(self):
    """
    Find all the VBA projects stored in an OLE file.

    Return None if the file is not OLE but OpenXML.
    Return a list of tuples (vba_root, project_path, dir_path), one per VBA
    project:
    - vba_root is the path of the root OLE storage containing the VBA
      project, including a trailing slash unless it is the root of the
      OLE file.
    - project_path is the path of the OLE stream named "PROJECT" within
      the VBA project.
    - dir_path is the path of the OLE stream named "VBA/dir" within the
      VBA project.

    The result is cached in self.vba_projects and returned directly on
    subsequent calls.

    If this function returns an empty list for one of the supported formats
    (i.e. Word, Excel, Powerpoint), then the file does not contain VBA macros.

    :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path)
             for each VBA project found if OLE file
    """
    log.debug('VBA_Parser.find_vba_projects')

    # not OLE but OpenXML: nothing to look at here
    if self.ole_file is None and self.type != TYPE_PPT:
        return None

    # reuse the result of a previous call, if any
    if self.vba_projects is not None:
        return self.vba_projects

    # PowerPoint 97-2003: self.ole_file is None but the ole_subfiles do
    # contain vba_projects (like for OpenXML files).
    if self.type == TYPE_PPT:
        # TODO: so far, this function is never called for PPT files, but
        #       if that happens, the information is lost which ole file contains
        #       which storage!
        log.warning('Returned info is not complete for PPT types!')
        self.vba_projects = []
        for subfile in self.ole_subfiles:
            self.vba_projects.extend(subfile.find_vba_projects())
        return self.vba_projects

    # Location of the VBA project root (different in MS Word, Excel, etc):
    # - Word 97-2003: Macros
    # - Excel 97-2003: _VBA_PROJECT_CUR
    # - PowerPoint 97-2003: PptParser has identified ole_subfiles
    # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin.
    # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word
    # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word
    # - Visio 2007: not supported yet (different file structure)

    # According to MS-OVBA section 2.2.1:
    # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream
    # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream
    # - all names are case-insensitive

    def check_vba_stream(ole, vba_root, stream_path):
        # Return the full path of the stream if it exists, False otherwise.
        full_path = vba_root + stream_path
        is_stream = ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM
        if not is_stream:
            log.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path)
            return False
        log.debug('Found %s stream: %s' % (stream_path, full_path))
        return full_path

    # start with an empty list:
    self.vba_projects = []
    ole = self.ole_file
    # Look for any storage containing the expected storage/streams:
    for storage in ole.listdir(streams=False, storages=True):
        log.debug('Checking storage %r' % storage)
        # Only storages whose last path element is "VBA" are candidates:
        if storage[-1].upper() != 'VBA':
            continue
        log.debug('Found VBA storage: %s' % ('/'.join(storage)))
        vba_root = '/'.join(storage[:-1])
        # Add a trailing slash to vba_root, unless it is the root of the OLE file:
        # (used later to append all the child streams/storages)
        if vba_root != '':
            vba_root += '/'
        log.debug('Checking vba_root="%s"' % vba_root)

        # The VBA root storage must also contain a PROJECT stream...
        project_path = check_vba_stream(ole, vba_root, 'PROJECT')
        if not project_path:
            continue
        # ...a VBA/_VBA_PROJECT stream...
        if not check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT'):
            continue
        # ...and a VBA/dir stream:
        dir_path = check_vba_stream(ole, vba_root, 'VBA/dir')
        if not dir_path:
            continue
        # Now we are pretty sure it is a VBA project structure
        log.debug('VBA root storage: "%s"' % vba_root)
        # keep the results as a tuple for later use:
        self.vba_projects.append((vba_root, project_path, dir_path))
    return self.vba_projects
2828   -
def detect_vba_macros(self):
    """
    Detect the potential presence of VBA macros in the file, by checking
    if it contains VBA projects. Both OLE and OpenXML files are supported.

    Important: for now, results are accurate only for Word, Excel and PowerPoint

    Note: this method does NOT attempt to check the actual presence or validity
    of VBA macro source code, so there might be false positives.
    It may also detect VBA macros in files embedded within the main file,
    for example an Excel workbook with macros embedded into a Word
    document without macros may be detected, without distinction.

    The result is cached in self.contains_macros and returned directly on
    subsequent calls.

    :return: bool, True if at least one VBA project has been found, False otherwise
    :raises SubstreamOpenError: if an OLE stream cannot be read and relaxed
        mode is off
    """
    #TODO: return None or raise exception if format not supported
    #TODO: return the number of VBA projects found instead of True/False?
    # if this method was already called, return the previous result:
    if self.contains_macros is not None:
        return self.contains_macros
    # if OpenXML/PPT, check all the OLE subfiles:
    if self.ole_file is None:
        for ole_subfile in self.ole_subfiles:
            if ole_subfile.detect_vba_macros():
                self.contains_macros = True
                return True
        # otherwise, no macro found:
        self.contains_macros = False
        return False
    # otherwise it's an OLE file, find VBA projects:
    vba_projects = self.find_vba_projects()
    self.contains_macros = len(vba_projects) != 0
    # Also look for VBA code in any stream including orphans
    # (happens in some malformed files)
    ole = self.ole_file
    for sid, d in enumerate(ole.direntries):
        log.debug('Checking DirEntry #%d' % sid)
        if d is None:
            # this direntry is not part of the tree: either unused or an orphan
            d = ole._load_direntry(sid)
            log.debug('This DirEntry is an orphan or unused')
        if d.entry_type != olefile.STGTY_STREAM:
            continue
        # read data
        log.debug('Reading data from stream %r - size: %d bytes' % (d.name, d.size))
        try:
            data = ole._open(d.isectStart, d.size).read()
            log.debug('Read %d bytes' % len(data))
            if len(data) > 200:
                log.debug('%r...[much more data]...%r' % (data[:100], data[-50:]))
            else:
                log.debug(repr(data))
            if 'Attribut\x00' in data.decode('utf-8', 'ignore'):
                log.debug('Found VBA compressed code')
                self.contains_macros = True
        except IOError as exc:
            if self.relaxed:
                log.info('Error when reading OLE Stream %r' % d.name)
                # exc_info (not "exc_trace") is the keyword understood by
                # the logging module; any other keyword raises TypeError
                log.debug('Trace:', exc_info=True)
            else:
                raise SubstreamOpenError(self.filename, d.name, exc)
    return self.contains_macros
2895   -
def extract_macros(self):
    """
    Extract and decompress source code for each VBA macro found in the file

    Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found
    If the file is OLE, filename is the path of the file.
    If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros
    within the zip archive, e.g. word/vbaProject.bin.
    If the file is PPT, result is as for OpenXML but filename is useless

    For OLE files, after the regular VBA projects, every remaining stream
    (orphans included) is scanned for data that looks like compressed VBA
    code; candidates that fail to decompress are only logged at debug level.
    """
    log.debug('extract_macros:')
    if self.ole_file is None:
        # This may be either an OpenXML/PPT or a text file:
        if self.type == TYPE_TEXT:
            # This is a text file, yield the full code:
            yield (self.filename, '', self.filename, self.vba_code_all_modules)
        else:
            # OpenXML/PPT: recursively yield results from each OLE subfile:
            for ole_subfile in self.ole_subfiles:
                for results in ole_subfile.extract_macros():
                    yield results
        return

    # This is an OLE file:
    self.find_vba_projects()
    # set of direntry ids of the streams already extracted
    vba_stream_ids = set()
    for vba_root, project_path, dir_path in self.vba_projects:
        # extract all VBA macros from that VBA root storage:
        # The function _extract_vba may fail on some files (issue #132)
        try:
            for stream_path, vba_filename, vba_code in \
                    _extract_vba(self.ole_file, vba_root, project_path,
                                 dir_path, self.relaxed):
                # store direntry ids in a set:
                vba_stream_ids.add(self.ole_file._find(stream_path))
                yield (self.filename, stream_path, vba_filename, vba_code)
        except Exception:
            log.exception('Error in _extract_vba')
    # Also look for VBA code in any stream including orphans
    # (happens in some malformed files)
    ole = self.ole_file
    for sid, d in enumerate(ole.direntries):
        # check if id is already done above:
        log.debug('Checking DirEntry #%d' % sid)
        if sid in vba_stream_ids:
            log.debug('Already extracted')
            continue
        if d is None:
            # this direntry is not part of the tree: either unused or an orphan
            d = ole._load_direntry(sid)
            log.debug('This DirEntry is an orphan or unused')
        if d.entry_type != olefile.STGTY_STREAM:
            continue
        # read data
        log.debug('Reading data from stream %r' % d.name)
        data = ole._open(d.isectStart, d.size).read()
        for match in re.finditer(b'\\x00Attribut[^e]', data, flags=re.IGNORECASE):
            # the candidate compressed data is taken to start 3 bytes
            # before the match
            start = match.start() - 3
            if start < 0:
                # fewer than 3 bytes precede the match: a negative index
                # would slice from the END of the stream and hand unrelated
                # bytes to the decompressor, so skip this match
                continue
            log.debug('Found VBA compressed code at index %X' % start)
            compressed_code = data[start:]
            try:
                vba_code = decompress_stream(bytearray(compressed_code))
                yield (self.filename, d.name, d.name, vba_code)
            except Exception as exc:
                # display the exception with full stack trace for debugging
                log.debug('Error processing stream %r in file %r (%s)' % (d.name, self.filename, exc))
                log.debug('Traceback:', exc_info=True)
                # do not raise the error, as it is unlikely to be a compressed macro stream
2964   -
def extract_all_macros(self):
    """
    Collect the source code of every VBA macro found in the file.

    On the first call, the results of extract_macros() are stored in
    self.modules as a list of tuples
    (filename, stream_path, vba_filename, vba_code); later calls reuse that
    list. self.nb_macros is set to the number of collected modules.
    See extract_macros for details.

    :return: list of tuples (filename, stream_path, vba_filename, vba_code)
    """
    if self.modules is None:
        self.modules = []
        for name, path, vba_name, code in self.extract_macros():
            entry = (name, path, vba_name, code)
            self.modules.append(entry)
    self.nb_macros = len(self.modules)
    return self.modules
2978   -
2979   -
2980   -
def analyze_macros(self, show_decoded_strings=False, deobfuscate=False):
    """
    Run extract_macros and analyze the source code of all VBA macros
    found in the file.

    The source code of all modules, followed by the strings extracted from
    VBA forms, is merged into self.vba_code_all_modules and scanned at once
    with VBA_Scanner. All results are stored in self.analysis_results, and
    the self.nb_* counters are incremented from the scan summary.
    If called more than once, simply returns the previous results.

    :param show_decoded_strings: bool, passed on to VBA_Scanner.scan
    :param deobfuscate: bool, passed on to VBA_Scanner.scan
    :return: self.analysis_results (left untouched if no macro is detected)
    """
    if not self.detect_vba_macros():
        return self.analysis_results
    # if the analysis was already done, avoid doing it twice:
    if self.analysis_results is not None:
        return self.analysis_results
    # merge the source code from all modules, unless already available:
    if self.vba_code_all_modules is None:
        self.vba_code_all_modules = ''
        for (_, _, _, module_code) in self.extract_all_macros():
            #TODO: filter code? (each module)
            if isinstance(module_code, bytes):
                module_code = module_code.decode('utf-8', 'ignore')
            self.vba_code_all_modules += module_code + '\n'
        for (_, _, form_string) in self.extract_form_strings():
            self.vba_code_all_modules += form_string.decode('utf-8', 'ignore') + '\n'
    # Analyze the whole code at once:
    scanner = VBA_Scanner(self.vba_code_all_modules)
    self.analysis_results = scanner.scan(show_decoded_strings, deobfuscate)
    (n_autoexec, n_suspicious, n_iocs, n_hex,
     n_base64, n_dridex, n_vba) = scanner.scan_summary()
    self.nb_autoexec += n_autoexec
    self.nb_suspicious += n_suspicious
    self.nb_iocs += n_iocs
    self.nb_hexstrings += n_hex
    self.nb_base64strings += n_base64
    self.nb_dridexstrings += n_dridex
    self.nb_vbastrings += n_vba
    return self.analysis_results
3015   -
3016   -
def reveal(self):
    """
    Return the merged VBA source code in which every obfuscated expression
    reported as a 'VBA string' by analyze_macros() is replaced by its
    decoded value, written as a double-quoted VBA string literal.

    :return: str, the source code after collapsing long lines, filtering
        with filter_vba and substituting the decoded strings
    """
    # we only want printable strings:
    findings = self.analyze_macros(show_decoded_strings=False)
    # to avoid replacing short strings contained into longer strings, the
    # findings are processed from the longest encoded string to the shortest:
    findings = sorted(findings, key=lambda item: len(item[2]), reverse=True)
    # normally now self.vba_code_all_modules contains source code from all modules
    # Need to collapse long lines:
    code = filter_vba(vba_collapse_long_lines(self.vba_code_all_modules))
    for kw_type, decoded, encoded in findings:
        if kw_type != 'VBA string':
            continue
        #print '%3d occurences: %r => %r' % (deobf_code.count(encoded), encoded, decoded)
        # need to add double quotes around the decoded strings
        # after escaping double-quotes as double-double-quotes for VBA:
        replacement = '"%s"' % decoded.replace('"', '""')
        # if the encoded string is enclosed in parentheses,
        # keep them in the decoded version:
        if encoded.startswith('(') and encoded.endswith(')'):
            replacement = '(%s)' % replacement
        code = code.replace(encoded, replacement)
    # # TODO: there is a bug somewhere which creates double returns '\r\r'
    # deobf_code = deobf_code.replace('\r\r', '\r')
    #TODO: run the analysis again several times if hex or base64 strings are revealed
    return code
3043   -
3044   -
def find_vba_forms(self):
    """
    Find all the VBA forms stored in an OLE file.

    Return None if the file is not OLE but OpenXML.
    Otherwise return the list of OLE storages that directly contain both a
    stream named "o" and a stream named "f". Each storage is given as
    returned by listdir(), i.e. as a list of path elements.
    The list is also stored in self.vba_forms.

    If this function returns an empty list for one of the supported formats
    (i.e. Word, Excel, Powerpoint), then the file does not contain VBA forms.

    :return: None if OpenXML file, list of storage paths (one per VBA form
             found) if OLE file
    """
    log.debug('VBA_Parser.find_vba_forms')

    # not OLE but OpenXML: nothing to look at here
    if self.ole_file is None and self.type != TYPE_PPT:
        return None

    # if this method has already been called, return previous result:
    # if self.vba_projects is not None:
    #     return self.vba_projects

    # According to MS-OFORMS section 2.1.2 Control Streams:
    # - A parent control, that is, a control that can contain embedded controls,
    #   MUST be persisted as a storage that contains multiple streams.
    # - All parent controls MUST contain a FormControl. The FormControl
    #   properties are persisted to a stream (1) as specified in section 2.1.1.2.
    #   The name of this stream (1) MUST be "f".
    # - Embedded controls that cannot themselves contain other embedded
    #   controls are persisted sequentially as FormEmbeddedActiveXControls
    #   to a stream (1) contained in the same storage as the parent control.
    #   The name of this stream (1) MUST be "o".
    # - all names are case-insensitive

    if self.type != TYPE_PPT:
        ole_files = [self.ole_file, ]
    else:
        # TODO: so far, this function is never called for PPT files, but
        #       if that happens, the information is lost which ole file contains
        #       which storage!
        ole_files = self.ole_subfiles
        log.warning('Returned info is not complete for PPT types!')

    # start with an empty list:
    self.vba_forms = []

    for ole in ole_files:
        def is_stream(path):
            # True if path exists in the current OLE file and is a stream
            return ole.exists(path) and ole.get_type(path) == olefile.STGTY_STREAM

        # Look for any storage containing the two expected streams:
        for storage in ole.listdir(streams=False, storages=True):
            log.debug('Checking storage %r' % storage)
            # Look for two streams named 'o' and 'f':
            o_stream = storage + ['o']
            f_stream = storage + ['f']
            log.debug('Checking if streams %r and %r exist' % (f_stream, o_stream))
            if not (is_stream(o_stream) and is_stream(f_stream)):
                continue
            form_path = '/'.join(storage)
            log.debug('Found VBA Form: %r' % form_path)
            self.vba_forms.append(storage)
    return self.vba_forms
3111   -
def extract_form_strings(self):
    """
    Extract printable strings from each VBA Form found in the file

    Iterator: yields (filename, stream_path, form_string) for each printable
    string found in the "o" stream of a VBA form, where stream_path is the
    path of that "o" stream and form_string is the text matched by
    re_printable_string.
    If the file is OLE, filename is the path of the file.
    If the file is OpenXML or PPT, the results of each OLE subfile are
    yielded in turn.
    If the file is a text file, nothing is yielded.
    """
    if self.ole_file is None:
        # This may be either an OpenXML/PPT or a text file:
        if self.type == TYPE_TEXT:
            # This is a text file, return no results:
            return
        # OpenXML/PPT: recursively yield results from each OLE subfile:
        for ole_subfile in self.ole_subfiles:
            for item in ole_subfile.extract_form_strings():
                yield item
        return

    # This is an OLE file:
    self.find_vba_forms()
    ole = self.ole_file
    for form_storage in self.vba_forms:
        o_stream = form_storage + ['o']
        o_path = '/'.join(o_stream)
        log.debug('Opening form object stream %r' % o_path)
        form_data = ole.openstream(o_stream).read()
        # Extract printable strings from the form object stream "o":
        for found in re_printable_string.finditer(form_data):
            text = found.group()
            log.debug('Printable string found in form: %r' % text)
            yield (self.filename, o_path, text)
3144   -
3145   -
def close(self):
    """
    Close all the open files. This method must be called after usage, if
    the application is opening many files.

    If the parser holds an OLE file, only that file is closed; otherwise
    close() is called on each parser in self.ole_subfiles (if any).
    """
    if self.ole_file is not None:
        self.ole_file.close()
        return
    if self.ole_subfiles is None:
        return
    for subparser in self.ole_subfiles:
        subparser.close()
3157   -
3158   -
3159   -
3160   -class VBA_Parser_CLI(VBA_Parser):
3161   - """
3162   - VBA parser and analyzer, adding methods for the command line interface
3163   - of olevba. (see VBA_Parser)
3164   - """
3165   -
def __init__(self, *args, **kwargs):
    """
    Constructor for VBA_Parser_CLI.

    All positional and keyword arguments are forwarded unchanged to
    VBA_Parser.__init__ --> see doc there
    """
    parent = super(VBA_Parser_CLI, self)
    parent.__init__(*args, **kwargs)
3172   -
3173   -
def print_analysis(self, show_decoded_strings=False, deobfuscate=False):
    """
    Analyze the VBA code of the file, and print the results in a table

    :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
    :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
    :return: None
    """
    # print a waiting message only if the output is not redirected to a file:
    if sys.stdout.isatty():
        print('Analysis...\r', end='')
        sys.stdout.flush()
    findings = self.analyze_macros(show_decoded_strings, deobfuscate)
    if not findings:
        print('No suspicious keyword or IOC found.')
        return
    table = prettytable.PrettyTable(('Type', 'Keyword', 'Description'))
    table.align = 'l'
    for column, width in (('Type', 10), ('Keyword', 20), ('Description', 39)):
        table.max_width[column] = width
    for kw_type, keyword, description in findings:
        # non printable strings are shown as their repr():
        shown_keyword = keyword if is_printable(keyword) else repr(keyword)
        shown_description = description if is_printable(description) else repr(description)
        table.add_row((kw_type, shown_keyword, shown_description))
    print(table)
3204   -
def print_analysis_json(self, show_decoded_strings=False, deobfuscate=False):
    """
    Analyze the VBA code of the file, and return the results in a
    json-serializable form

    :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
    :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)

    :return: list of dicts with keys 'type', 'keyword' and 'description',
        one per analysis result; empty list if there is no result
    """
    # print a waiting message only if the output is not redirected to a file:
    if sys.stdout.isatty():
        print('Analysis...\r', end='')
        sys.stdout.flush()
    # analyze_macros returns None when no macro is detected: treat that as
    # "no results" instead of failing while iterating over None
    results = self.analyze_macros(show_decoded_strings, deobfuscate) or []
    return [dict(type=kw_type, keyword=keyword, description=description)
            for kw_type, keyword, description in results]
3221   -
def process_file(self, show_decoded_strings=False,
                 display_code=True, hide_attributes=True,
                 vba_code_only=False, show_deobfuscated_code=False,
                 deobfuscate=False):
    """
    Process a single file: print its type, the source code of each VBA
    macro, the strings found in VBA forms and the analysis results.

    :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
    :param display_code: bool, if False VBA source code is not displayed (default True)
    :param hide_attributes: bool, if True the macro source is passed through filter_vba before display (default)
    :param vba_code_only: bool, if True the analysis table is not printed; forces display_code to True
    :param show_deobfuscated_code: bool, if True also print the result of reveal()
    :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
    :raises ProcessingError: wrapping any exception that is not an
        OlevbaBaseException (those are re-raised unchanged)
    """
    #TODO: replace print by writing to a provided output file (sys.stdout by default)
    # fix conflicting parameters:
    if vba_code_only and not display_code:
        display_code = True
    if self.container:
        display_filename = '%s in %s' % (self.filename, self.container)
    else:
        display_filename = self.filename
    print('=' * 79)
    print('FILE: %s' % display_filename)
    try:
        #TODO: handle olefile errors, when an OLE file is malformed
        print('Type: %s'% self.type)
        if self.detect_vba_macros():
            #print 'Contains VBA Macros:'
            for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros():
                # always work on text: without this, bytes source would be
                # printed as b'...' and never match the empty-macro check
                # when hide_attributes is False
                if isinstance(vba_code, bytes):
                    vba_code = vba_code.decode('utf-8', 'backslashreplace')
                if hide_attributes:
                    # hide attribute lines:
                    vba_code_filtered = filter_vba(vba_code)
                else:
                    vba_code_filtered = vba_code
                print('-' * 79)
                print('VBA MACRO %s ' % vba_filename)
                print('in file: %s - OLE stream: %s' % (subfilename, repr(stream_path)))
                if display_code:
                    print('- ' * 39)
                    # detect empty macros:
                    if vba_code_filtered.strip() == '':
                        print('(empty macro)')
                    else:
                        print(vba_code_filtered)
            for (subfilename, stream_path, form_string) in self.extract_form_strings():
                print('-' * 79)
                print('VBA FORM STRING IN %r - OLE stream: %r' % (subfilename, stream_path))
                print('- ' * 39)
                print(form_string.decode('utf-8', 'ignore'))
            if not vba_code_only:
                # analyse the code from all modules at once:
                self.print_analysis(show_decoded_strings, deobfuscate)
            if show_deobfuscated_code:
                print('MACRO SOURCE CODE WITH DEOBFUSCATED VBA STRINGS (EXPERIMENTAL):\n\n')
                print(self.reveal())
        else:
            print('No VBA macros found.')
    except OlevbaBaseException:
        raise
    except Exception as exc:
        # display the exception with full stack trace for debugging
        log.info('Error processing file %s (%s)' % (self.filename, exc))
        log.debug('Traceback:', exc_info=True)
        raise ProcessingError(self.filename, exc)
    print('')
3292   -
3293   -
def process_file_json(self, show_decoded_strings=False,
                      display_code=True, hide_attributes=True,
                      vba_code_only=False, show_deobfuscated_code=False,
                      deobfuscate=False):
    """
    Process a single file and return the results as a dict, ready for JSON output

    Same processing as the detailed text output, except that everything
    is collected into the returned dict instead of being printed.

    :param show_decoded_strings: bool, forwarded to print_analysis_json
    :param display_code: bool, if False the 'code' entry of each macro is None (default True)
    :param hide_attributes: bool, if True the VBA source code goes through filter_vba (default)
    :param vba_code_only: bool, if True the analysis is skipped; this forces display_code to True
    :param show_deobfuscated_code: bool, if True the result of self.reveal() is
        stored under 'code_deobfuscated' (ignored when vba_code_only is True)
    :param deobfuscate: bool, forwarded to print_analysis_json and echoed under 'do_deobfuscate'
    :returns: dict with the keys container, file, json_conversion_successful,
        analysis, code_deobfuscated, do_deobfuscate, type and macros
    :raises ProcessingError: wraps any exception raised during processing
    """
    #TODO: fix conflicting parameters (?)

    # asking for the code only implies that the code has to be included
    if vba_code_only:
        display_code = True

    # defaults first; the try block below fills in the actual results
    report = {
        'container': self.container or None,
        'file': self.filename,
        'json_conversion_successful': False,
        'analysis': None,
        'code_deobfuscated': None,
        'do_deobfuscate': deobfuscate,
    }

    try:
        #TODO: handle olefile errors, when an OLE file is malformed
        report['type'] = self.type
        macro_entries = []
        if self.detect_vba_macros():
            for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros():
                source = vba_code
                if isinstance(source, bytes):
                    source = source.decode('utf-8', 'backslashreplace')

                # hide attribute lines if requested:
                shown = filter_vba(source) if hide_attributes else source

                macro_entries.append({
                    'vba_filename': vba_filename,
                    'subfilename': subfilename,
                    'ole_stream': stream_path,
                    'code': shown.strip() if display_code else None,
                })
            if not vba_code_only:
                # analyse the code from all modules at once:
                report['analysis'] = self.print_analysis_json(show_decoded_strings,
                                                              deobfuscate)
            if show_deobfuscated_code:
                report['code_deobfuscated'] = self.reveal()
        report['macros'] = macro_entries
        report['json_conversion_successful'] = True
    except Exception as exc:
        # display the exception with full stack trace for debugging
        log.info('Error processing file %s (%s)' % (self.filename, exc))
        log.debug('Traceback:', exc_info=True)
        raise ProcessingError(self.filename, exc)

    return report
3368   -
3369   -
def process_file_triage(self, show_decoded_strings=False, deobfuscate=False):
    """
    Process a file in triage mode: print a single summary line made of the
    flags column followed by the filename.

    :param show_decoded_strings: bool, forwarded to analyze_macros
    :param deobfuscate: bool, forwarded to analyze_macros
    :raises ProcessingError: wraps any exception raised during processing
    """
    #TODO: replace print by writing to a provided output file (sys.stdout by default)
    try:
        #TODO: handle olefile errors, when an OLE file is malformed
        if self.detect_vba_macros():
            # print a waiting message only if the output is not redirected to a file:
            if sys.stdout.isatty():
                print('Analysis...\r', end='')
                sys.stdout.flush()
            self.analyze_macros(show_decoded_strings=show_decoded_strings,
                                deobfuscate=deobfuscate)
        # the tag of the file type comes first...
        type_tag = TYPE2TAG[self.type]
        # ...followed by one letter per indicator, or '-' when it is not set
        indicators = (
            (self.contains_macros, 'M'),
            (self.nb_autoexec, 'A'),
            (self.nb_suspicious, 'S'),
            (self.nb_iocs, 'I'),
            (self.nb_hexstrings, 'H'),
            (self.nb_base64strings, 'B'),
            (self.nb_dridexstrings, 'D'),
            (self.nb_vbastrings, 'V'),
        )
        letters = [letter if found else '-' for (found, letter) in indicators]
        flags = type_tag + ''.join(letters)

        print('%-12s %s' % (flags, self.filename))
    except Exception as exc:
        # display the exception with full stack trace for debugging only
        log.debug('Error processing file %s (%s)' % (self.filename, exc),
                  exc_info=True)
        raise ProcessingError(self.filename, exc)
3404   -
3405   -
3406   -#=== MAIN =====================================================================
3407   -
def parse_args(cmd_line_args=None):
    """
    Parse command line arguments (the given ones, or sys.argv by default)

    :param cmd_line_args: list of str, arguments to parse instead of sys.argv[1:]
        (None by default, which makes optparse use sys.argv)
    :returns: tuple (options, args); options.loglevel is converted from its
        name to the corresponding numeric level of the logging module

    When no filename is given, the banner, module docstring and help are
    printed and the process exits with RETURN_WRONG_ARGS. An unknown log
    level name is reported through parser.error (usage message on stderr,
    exit status 2) instead of surfacing as a KeyError traceback.
    """

    DEFAULT_LOG_LEVEL = "warning" # Default log level
    LOG_LEVELS = {
        'debug':    logging.DEBUG,
        'info':     logging.INFO,
        'warning':  logging.WARNING,
        'error':    logging.ERROR,
        'critical': logging.CRITICAL
        }

    usage = 'usage: olevba [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)
    # parser.add_option('-o', '--outfile', dest='outfile',
    #     help='output file')
    # parser.add_option('-c', '--csv', dest='csv',
    #     help='export results to a CSV file')
    parser.add_option("-r", action="store_true", dest="recursive",
                      help='find files recursively in subdirectories.')
    parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
                      help='if the file is a zip archive, open all files from it, using the provided password (requires Python 2.6+)')
    parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
                      help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
    # output mode; could make this even simpler with add_option(type='choice') but that would make
    # cmd line interface incompatible...
    modes = optparse.OptionGroup(parser, title='Output mode (mutually exclusive)')
    modes.add_option("-t", '--triage', action="store_const", dest="output_mode",
                     const='triage', default='unspecified',
                     help='triage mode, display results as a summary table (default for multiple files)')
    modes.add_option("-d", '--detailed', action="store_const", dest="output_mode",
                     const='detailed', default='unspecified',
                     help='detailed mode, display full results (default for single file)')
    modes.add_option("-j", '--json', action="store_const", dest="output_mode",
                     const='json', default='unspecified',
                     help='json mode, detailed in json format (never default)')
    parser.add_option_group(modes)
    parser.add_option("-a", '--analysis', action="store_false", dest="display_code", default=True,
                      help='display only analysis results, not the macro source code')
    parser.add_option("-c", '--code', action="store_true", dest="vba_code_only", default=False,
                      help='display only VBA source code, do not analyze it')
    parser.add_option("--decode", action="store_true", dest="show_decoded_strings",
                      help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex, VBA).')
    parser.add_option("--attr", action="store_false", dest="hide_attributes", default=True,
                      help='display the attribute lines at the beginning of VBA source code')
    parser.add_option("--reveal", action="store_true", dest="show_deobfuscated_code",
                      help='display the macro source code after replacing all the obfuscated strings by their decoded content.')
    parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
                      help="logging level debug/info/warning/error/critical (default=%default)")
    parser.add_option('--deobf', dest="deobfuscate", action="store_true", default=False,
                      help="Attempt to deobfuscate VBA expressions (slow)")
    parser.add_option('--relaxed', dest="relaxed", action="store_true", default=False,
                      help="Do not raise errors if opening of substream fails")

    (options, args) = parser.parse_args(cmd_line_args)

    # Print help if no arguments are passed
    if not args:
        # print banner with version
        python_version = '%d.%d.%d' % sys.version_info[0:3]
        print('olevba3 %s on Python %s - http://decalage.info/python/oletools' %
              (__version__, python_version))
        print(__doc__)
        parser.print_help()
        sys.exit(RETURN_WRONG_ARGS)

    # translate the level name into the numeric logging level; the name is
    # matched case-insensitively and a wrong one ends with a usage error
    level_name = options.loglevel.lower()
    if level_name not in LOG_LEVELS:
        parser.error('invalid log level %r (expected debug, info, warning, '
                     'error or critical)' % options.loglevel)
    options.loglevel = LOG_LEVELS[level_name]

    return options, args
3477   -
3478   -
def main(cmd_line_args=None):
    """
    Main function, called when olevba is run from the command line

    :param cmd_line_args: optional list of command line arguments, forwarded
        to parse_args. By default (None), sys.argv is used. Mainly useful
        for unit-testing.

    Each file yielded by xglob.iter_files is opened with VBA_Parser_CLI and
    processed according to the output mode (detailed, triage or json). When
    no mode was given, files are listed in triage format and, if exactly one
    file was processed, its detailed report is printed afterwards.

    This function always ends by calling sys.exit with the return code: the
    code of the first error met, RETURN_SEVERAL_ERRS once a second one
    occurs, or RETURN_UNEXPECTED after an unhandled exception.
    """

    options, args = parse_args(cmd_line_args)

    # provide info about tool and its version
    if options.output_mode == 'json':
        # print first json entry with meta info and opening '['
        print_json(script_name='olevba', version=__version__,
                   url='http://decalage.info/python/oletools',
                   type='MetaInformation', _json_is_first=True)
    else:
        # print banner with version
        python_version = '%d.%d.%d' % sys.version_info[0:3]
        print('olevba3 %s on Python %s - http://decalage.info/python/oletools' %
              (__version__, python_version))

    logging.basicConfig(level=options.loglevel, format='%(levelname)-8s %(message)s')
    # enable logging in the modules:
    enable_logging()

    # with the option --reveal, make sure --deobf is also enabled:
    if options.show_deobfuscated_code and not options.deobfuscate:
        log.info('set --deobf because --reveal was set')
        options.deobfuscate = True
    if options.output_mode == 'triage' and options.show_deobfuscated_code:
        log.info('ignoring option --reveal in triage output mode')

    # Column headers (do not know how many files there will be yet, so if no output_mode
    # was specified, we will print triage for first file --> need these headers)
    if options.output_mode in ('triage', 'unspecified'):
        print('%-12s %-65s' % ('Flags', 'Filename'))
        print('%-12s %-65s' % ('-' * 11, '-' * 65))

    previous_container = None
    count = 0
    container = filename = data = None
    vba_parser = None
    return_code = RETURN_OK
    try:
        for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
                                                          zip_password=options.zip_password, zip_fname=options.zip_fname):
            # ignore directory names stored in zip files:
            if container and filename.endswith('/'):
                continue

            # handle errors from xglob
            if isinstance(data, Exception):
                if isinstance(data, PathNotFoundException):
                    if options.output_mode in ('triage', 'unspecified'):
                        print('%-12s %s - File not found' % ('?', filename))
                    elif options.output_mode != 'json':
                        log.error('Given path %r does not exist!' % filename)
                    return_code = RETURN_FILE_NOT_FOUND if return_code == 0 \
                                                    else RETURN_SEVERAL_ERRS
                else:
                    if options.output_mode in ('triage', 'unspecified'):
                        print('%-12s %s - Failed to read from zip file %s' % ('?', filename, container))
                    elif options.output_mode != 'json':
                        log.error('Exception opening/reading %r from zip file %r: %s'
                                  % (filename, container, data))
                    return_code = RETURN_XGLOB_ERR if return_code == 0 \
                                                    else RETURN_SEVERAL_ERRS
                # in json mode both kinds of xglob errors are reported the same way
                if options.output_mode == 'json':
                    print_json(file=filename, type='error',
                               error=type(data).__name__, message=str(data))
                continue

            try:
                # close the previous file if analyzing several:
                # (this must be done here to avoid closing the file if there is only 1,
                # to fix issue #219)
                if vba_parser is not None:
                    vba_parser.close()
                # Open the file
                vba_parser = VBA_Parser_CLI(filename, data=data, container=container,
                                            relaxed=options.relaxed)

                if options.output_mode == 'detailed':
                    # fully detailed output
                    vba_parser.process_file(show_decoded_strings=options.show_decoded_strings,
                                            display_code=options.display_code,
                                            hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only,
                                            show_deobfuscated_code=options.show_deobfuscated_code,
                                            deobfuscate=options.deobfuscate)
                elif options.output_mode in ('triage', 'unspecified'):
                    # print container name when it changes:
                    if container != previous_container:
                        if container is not None:
                            print('\nFiles in %s:' % container)
                        previous_container = container
                    # summarized output for triage:
                    vba_parser.process_file_triage(show_decoded_strings=options.show_decoded_strings,
                                                   deobfuscate=options.deobfuscate)
                elif options.output_mode == 'json':
                    print_json(
                        vba_parser.process_file_json(show_decoded_strings=options.show_decoded_strings,
                                                     display_code=options.display_code,
                                                     hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only,
                                                     show_deobfuscated_code=options.show_deobfuscated_code,
                                                     deobfuscate=options.deobfuscate))
                else:  # (should be impossible)
                    raise ValueError('unexpected output mode: "{0}"!'.format(options.output_mode))
                # only files processed without error are counted
                count += 1

            except (SubstreamOpenError, UnexpectedDataError) as exc:
                if options.output_mode in ('triage', 'unspecified'):
                    # (message typo "uenxpected" fixed here)
                    print('%-12s %s - Error opening substream or unexpected ' \
                          'content' % ('?', filename))
                elif options.output_mode == 'json':
                    print_json(file=filename, type='error',
                               error=type(exc).__name__, message=str(exc))
                else:
                    log.exception('Error opening substream or unexpected '
                                  'content in %s' % filename)
                return_code = RETURN_OPEN_ERROR if return_code == 0 \
                                                else RETURN_SEVERAL_ERRS
            except FileOpenError as exc:
                if options.output_mode in ('triage', 'unspecified'):
                    print('%-12s %s - File format not supported' % ('?', filename))
                elif options.output_mode == 'json':
                    print_json(file=filename, type='error',
                               error=type(exc).__name__, message=str(exc))
                else:
                    log.exception('Failed to open %s -- probably not supported!' % filename)
                return_code = RETURN_OPEN_ERROR if return_code == 0 \
                                                else RETURN_SEVERAL_ERRS
            except ProcessingError as exc:
                if options.output_mode in ('triage', 'unspecified'):
                    print('%-12s %s - %s' % ('!ERROR', filename, exc.orig_exc))
                elif options.output_mode == 'json':
                    print_json(file=filename, type='error',
                               error=type(exc).__name__,
                               message=str(exc.orig_exc))
                else:
                    log.exception('Error processing file %s (%s)!'
                                  % (filename, exc.orig_exc))
                return_code = RETURN_PARSE_ERROR if return_code == 0 \
                                                else RETURN_SEVERAL_ERRS
            except FileIsEncryptedError as exc:
                if options.output_mode in ('triage', 'unspecified'):
                    print('%-12s %s - File is encrypted' % ('!ERROR', filename))
                elif options.output_mode == 'json':
                    print_json(file=filename, type='error',
                               error=type(exc).__name__, message=str(exc))
                else:
                    log.exception('File %s is encrypted!' % (filename))
                return_code = RETURN_ENCRYPTED if return_code == 0 \
                                                else RETURN_SEVERAL_ERRS
            # Here we do not close the vba_parser, because process_file may need it below.

        if options.output_mode == 'triage':
            print('\n(Flags: OpX=OpenXML, XML=Word2003XML, FlX=FlatOPC XML, MHT=MHTML, TXT=Text, M=Macros, ' \
                  'A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, ' \
                  'B=Base64 strings, D=Dridex strings, V=VBA strings, ?=Unknown)\n')

        if count == 1 and options.output_mode == 'unspecified':
            # if options -t, -d and -j were not specified and it's a single file, print details:
            vba_parser.process_file(show_decoded_strings=options.show_decoded_strings,
                                    display_code=options.display_code,
                                    hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only,
                                    show_deobfuscated_code=options.show_deobfuscated_code,
                                    deobfuscate=options.deobfuscate)

        if options.output_mode == 'json':
            # print last json entry (a last one without a comma) and closing ]
            print_json(type='MetaInformation', return_code=return_code,
                       n_processed=count, _json_is_last=True)

    except Exception as exc:
        # some unexpected error, maybe some of the types caught in except clauses
        # above were not sufficient. This is very bad, so log complete trace at exception level
        # and do not care about output mode
        log.exception('Unhandled exception in main: %s' % exc, exc_info=True)
        return_code = RETURN_UNEXPECTED  # even if there were others before -- this is more important
        # TODO: print msg with URL to report issues (except in JSON mode)

    # done. exit
    log.debug('will exit now with code %s' % return_code)
    sys.exit(return_code)
  19 +from oletools.olevba import *
  20 +from oletools.olevba import __doc__, __version__
3665 21  
3666 22 if __name__ == '__main__':
3667 23 main()
3668 24  
3669   -# This was coded while listening to "Dust" from I Love You But I've Chosen Darkness
... ...