Commit a4ffb743f926d59e022f10313ca70d6af9f8c8b7

Authored by Philippe Lagadec
1 parent 41896bcf

olevba: changed line endings from CRLF to LF

Showing 1 changed file with 1670 additions and 1670 deletions
oletools/olevba.py 100644 → 100755
1   -#!/usr/bin/env python
2   -"""
3   -olevba.py
4   -
5   -olevba is a script to parse OLE and OpenXML files such as MS Office documents
6   -(e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate
7   -and analyze malicious macros.
8   -
9   -Supported formats:
10   -- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
11   -- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
12   -- PowerPoint 2007+ (.pptm, .ppsm)
13   -- Word 2003 XML (.xml)
14   -
15   -Author: Philippe Lagadec - http://www.decalage.info
16   -License: BSD, see source code or documentation
17   -
18   -olevba is part of the python-oletools package:
19   -http://www.decalage.info/python/oletools
20   -
21   -olevba is based on source code from officeparser by John William Davison
22   -https://github.com/unixfreak0037/officeparser
23   -"""
24   -
25   -#=== LICENSE ==================================================================
26   -
27   -# olevba is copyright (c) 2014-2015 Philippe Lagadec (http://www.decalage.info)
28   -# All rights reserved.
29   -#
30   -# Redistribution and use in source and binary forms, with or without modification,
31   -# are permitted provided that the following conditions are met:
32   -#
33   -# * Redistributions of source code must retain the above copyright notice, this
34   -# list of conditions and the following disclaimer.
35   -# * Redistributions in binary form must reproduce the above copyright notice,
36   -# this list of conditions and the following disclaimer in the documentation
37   -# and/or other materials provided with the distribution.
38   -#
39   -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
40   -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
41   -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
42   -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
43   -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
44   -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
45   -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
46   -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
47   -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
48   -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
49   -
50   -
51   -# olevba contains modified source code from the officeparser project, published
52   -# under the following MIT License (MIT):
53   -#
54   -# officeparser is copyright (c) 2014 John William Davison
55   -#
56   -# Permission is hereby granted, free of charge, to any person obtaining a copy
57   -# of this software and associated documentation files (the "Software"), to deal
58   -# in the Software without restriction, including without limitation the rights
59   -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
60   -# copies of the Software, and to permit persons to whom the Software is
61   -# furnished to do so, subject to the following conditions:
62   -#
63   -# The above copyright notice and this permission notice shall be included in all
64   -# copies or substantial portions of the Software.
65   -#
66   -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
67   -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
68   -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
69   -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
70   -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
71   -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
72   -# SOFTWARE.
73   -
74   -#------------------------------------------------------------------------------
75   -# CHANGELOG:
76   -# 2014-08-05 v0.01 PL: - first version based on officeparser code
77   -# 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser
78   -# 2014-08-15 PL: - fixed incorrect value check in PROJECTHELPFILEPATH Record
79   -# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats
80   -# and to find the VBA project root anywhere in the file
81   -# 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL
82   -# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API
83   -# - added detect_vba_macros
84   -# 2014-12-10 v0.06 PL: - hide first lines with VB attributes
85   -# - detect auto-executable macros
86   -# - ignore empty macros
87   -# 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive
88   -# 2014-12-15 v0.08 PL: - improved display for empty macros
89   -# - added pattern extraction
90   -# 2014-12-25 v0.09 PL: - added suspicious keywords detection
91   -# 2014-12-27 v0.10 PL: - added OptionParser, main and process_file
92   -# - uses xglob to scan several files with wildcards
93   -# - option -r to recurse subdirectories
94   -# - option -z to scan files in password-protected zips
95   -# 2015-01-02 v0.11 PL: - improved filter_vba to detect colons
96   -# 2015-01-03 v0.12 PL: - fixed detect_patterns to detect all patterns
97   -# - process_file: improved display, shows container file
98   -# - improved list of executable file extensions
99   -# 2015-01-04 v0.13 PL: - added several suspicious keywords, improved display
100   -# 2015-01-08 v0.14 PL: - added hex strings detection and decoding
101   -# - fixed issue #2, decoding VBA stream names using
102   -# specified codepage and unicode stream names
103   -# 2015-01-11 v0.15 PL: - added new triage mode, options -t and -d
104   -# 2015-01-16 v0.16 PL: - fix for issue #3 (exception when module name="text")
105   -# - added several suspicious keywords
106   -# - added option -i to analyze VBA source code directly
107   -# 2015-01-17 v0.17 PL: - removed .com from the list of executable extensions
108   -# - added scan_vba to run all detection algorithms
109   -# - decoded hex strings are now also scanned + reversed
110   -# 2015-01-23 v0.18 PL: - fixed issue #3, case-insensitive search in code_modules
111   -# 2015-01-24 v0.19 PL: - improved the detection of IOCs obfuscated with hex
112   -# strings and StrReverse
113   -# 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded
114   -# 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding
115   -# - improved display, shows obfuscation name
116   -# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename
117   -# - added Base64 obfuscation decoding (contribution from
118   -# @JamesHabben)
119   -# 2015-02-03 v0.23 PL: - triage now uses VBA_Scanner results, shows Base64 and
120   -# Dridex strings
121   -# - exception handling in detect_base64_strings
122   -# 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display
123   -# - display exceptions with stack trace
124   -# - added several suspicious keywords
125   -# - improved Base64 detection and decoding
126   -# - fixed triage mode not to scan attrib lines
127   -# 2015-03-04 v0.25 PL: - added support for Word 2003 XML
128   -
129   -__version__ = '0.25'
130   -
131   -#------------------------------------------------------------------------------
132   -# TODO:
133   -# + do not use logging, but a provided logger (null logger by default)
134   -# + setup logging (common with other oletools)
135   -# + add xor bruteforcing like bbharvest
136   -# + add chr() decoding
137   -
138   -# TODO later:
139   -# + performance improvement: instead of searching each keyword separately,
140   -# first split vba code into a list of words (per line), then check each
141   -# word against a dict. (or put vba words into a set/dict?)
142   -# + for regex, maybe combine them into a single re with named groups?
143   -# + add Yara support, include sample rules? plugins like balbuzard?
144   -# + add balbuzard support
145   -# + output to file (replace print by file.write, sys.stdout by default)
146   -# + look for VBA in embedded documents (e.g. Excel in Word)
147   -# + support SRP streams (see Lenny's article + links and sample)
148   -# - python 3.x support
149   -# - add support for PowerPoint macros (see libclamav, libgsf), use oledump heuristic?
150   -# - check VBA macros in Visio, Access, Project, etc
151   -# - extract_macros: convert to a class, split long function into smaller methods
152   -# - extract_macros: read bytes from stream file objects instead of strings
153   -# - extract_macros: use combined struct.unpack instead of many calls
154   -
155   -#------------------------------------------------------------------------------
156   -# REFERENCES:
157   -# - [MS-OVBA]: Microsoft Office VBA File Format Structure
158   -# http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx
159   -# - officeparser: https://github.com/unixfreak0037/officeparser
160   -
161   -
162   -#--- IMPORTS ------------------------------------------------------------------
163   -
164   -import sys, logging
165   -import struct
166   -import cStringIO
167   -import math
168   -import zipfile
169   -import re
170   -import optparse
171   -import os.path
172   -import binascii
173   -import base64
174   -import traceback
175   -import zlib
176   -
177   -# import lxml or ElementTree for XML parsing:
178   -try:
179   - # lxml: best performance for XML processing
180   - import lxml.etree as ET
181   -except ImportError:
182   - try:
183   - # Python 2.5+: batteries included
184   - import xml.etree.cElementTree as ET
185   - except ImportError:
186   - try:
187   - # Python <2.5: standalone ElementTree install
188   - import elementtree.cElementTree as ET
189   - except ImportError:
190   - raise ImportError, "lxml or ElementTree are not installed, "\
191   - +"see http://codespeak.net/lxml "\
192   - +"or http://effbot.org/zone/element-index.htm"
193   -
194   -import thirdparty.olefile as olefile
195   -from thirdparty.prettytable import prettytable
196   -from thirdparty.xglob import xglob
197   -
198   -#--- CONSTANTS ----------------------------------------------------------------
199   -
# Identifiers for the kinds of container file handled by this module:
TYPE_OLE, TYPE_OpenXML, TYPE_Word2003_XML = 'OLE', 'OpenXML', 'Word2003_XML'

# File extensions associated with each kind of VBA code module
# (standard module, class module, form):
MODULE_EXTENSION, CLASS_EXTENSION, FORM_EXTENSION = "bas", "cls", "frm"

# Word 2003 XML parsing: namespace prefix in the "{uri}" notation, followed
# by the qualified tag and attribute names built from it.
NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}'
# The element <w:binData w:name="editdata.mso"> contains the VBA macro code:
TAG_BINDATA = ''.join((NS_W, 'binData'))
ATTR_NAME = ''.join((NS_W, 'name'))
213   -
# Names of macros that are executed automatically by the host application.
# Each entry maps a description of the trigger to the tuple of macro names
# associated with it.
AUTOEXEC_KEYWORDS = dict([
    # MS Word:
    ('Runs when the Word document is opened',
     ('AutoExec', 'AutoOpen', 'Document_Open', 'DocumentOpen')),
    ('Runs when the Word document is closed',
     ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose')),
    ('Runs when the Word document is modified',
     ('DocumentChange',)),
    ('Runs when a new Word document is created',
     ('AutoNew', 'Document_New', 'NewDocument')),

    # MS Excel:
    ('Runs when the Excel Workbook is opened',
     ('Auto_Open', 'Workbook_Open')),
    ('Runs when the Excel Workbook is closed',
     ('Auto_Close', 'Workbook_Close')),

    #TODO: full list in MS specs??
])
234   -
# VBA keywords that may be used by malware, grouped by what they may do.
# Each entry maps a description of the potential behaviour to the tuple of
# keywords associated with it.
# See VBA language reference: http://msdn.microsoft.com/en-us/library/office/jj692818%28v=office.15%29.aspx
#TODO: use regex to support variable whitespaces
SUSPICIOUS_KEYWORDS = dict([
    ('May read system environment variables',
     ('Environ',)),
    ('May open a file',
     ('Open',)),
    #TODO: regex to find Open+Write on same line
    ('May write to a file (if combined with Open)',
     ('Write', 'Put', 'Output', 'Print #')),
    #TODO: regex to find Open+Binary on same line
    ('May read or write a binary file (if combined with Open)',
     ('Binary',)),
    #FileCopy: http://msdn.microsoft.com/en-us/library/office/gg264390%28v=office.15%29.aspx
    #CopyFile: http://msdn.microsoft.com/en-us/library/office/gg264089%28v=office.15%29.aspx
    ('May copy a file',
     ('FileCopy', 'CopyFile')),
    ('May delete a file',
     ('Kill',)),
    #CreateTextFile: http://msdn.microsoft.com/en-us/library/office/gg264617%28v=office.15%29.aspx
    #ADODB.Stream sample: http://pastebin.com/Z4TMyuq6
    ('May create a text file',
     ('CreateTextFile', 'ADODB.Stream', 'WriteText', 'SaveToFile')),
    #Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx
    #WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6
    ('May run an executable file or a system command',
     ('Shell', 'vbNormal', 'vbNormalFocus', 'vbHide', 'vbMinimizedFocus',
      'vbMaximizedFocus', 'vbNormalNoFocus', 'vbMinimizedNoFocus',
      'WScript.Shell', 'Run')),
    ('May hide the application',
     ('Application.Visible', 'ShowWindow', 'SW_HIDE')),
    ('May create a directory',
     ('MkDir',)),
    ('May save the current workbook',
     ('ActiveWorkbook.SaveAs',)),
    #TODO: confirm the actual effect
    ('May change which directory contains files to open at startup',
     ('Application.AltStartupPath',)),
    ('May create an OLE object',
     ('CreateObject',)),
    ('May run an application (if combined with CreateObject)',
     ('Shell.Application',)),
    ('May enumerate application windows (if combined with Shell.Application object)',
     ('Windows', 'FindWindow')),
    #TODO: regex to find declare+lib on same line
    ('May run code from a DLL',
     ('Lib',)),
    #TODO: regex to find urlmon+URLDownloadToFileA on same line
    ('May download files from the Internet',
     ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP')),
    #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx
    ('May control another application by simulating user keystrokes',
     ('SendKeys', 'AppActivate')),
    #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx
    ('May attempt to obfuscate malicious function calls',
     ('CallByName',)),
    #TODO: regex to find several Chr*, not just one
    #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx
    ('May attempt to obfuscate specific strings',
     ('Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor')),
])
296   -
# Regular Expression for a URL, assembled from smaller building blocks:
# http://en.wikipedia.org/wiki/Uniform_resource_locator
# http://www.w3.org/Addressing/URL/uri-spec.html
#TODO: also support username:password@server
#TODO: other protocols (file, gopher, wais, ...?)
SCHEME = r'\b(?:http|ftp)s?'
# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})'
DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + r')'
#TODO: IPv6 - see https://www.debuggex.com/
# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. [db8:0cec::99:123a]
NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'
IPv4 = r'(?:' + NUMBER_0_255 + r'\.){3}' + NUMBER_0_255
# IPv4 must come before the DNS name because it is more specific
SERVER = r'(?:' + IPv4 + r'|' + DNS_NAME + r')'
PORT = r'(?:\:[0-9]{1,5})?'
SERVER_PORT = SERVER + PORT
URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?'  # [^\.\,\)\(\s"]
URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH
re_url = re.compile(URL_RE)


# Patterns to be extracted (IP addresses, URLs, etc), as a tuple of
# (pattern name, compiled regular expression) pairs.
# From patterns.py in balbuzard
RE_PATTERNS = (
    ('URL', re_url),
    ('IPv4 address', re.compile(IPv4)),
    # NOTE: the trailing word boundary must be written as a raw string r'\b'.
    # In a normal string literal '\b' is the backspace character (0x08),
    # which prevents the pattern from matching ordinary e-mail addresses.
    ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@' + SERVER + r'\b')),
    # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')),
    # Executable file name with known extensions (except .com which is present in many URLs, and .application):
    ("Executable file name", re.compile(r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b")),
    # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
    #TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types
    #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')),
)
332   -
# Strings encoded in hexadecimal: at least four pairs of hex digits.
re_hex_string = re.compile(
    r'(?:[0-9A-Fa-f]{2}){4,}'
)

# Strings encoded in base64 (double-quoted).
# Previous, looser version kept for reference:
#re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?"')
# Better version from balbuzard, less false positives:
re_base64_string = re.compile(
    r'"(?:[A-Za-z0-9+/]{4}){1,}'
    r'(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)?"'
)
# White list of common strings matching the base64 regex, but which are not
# base64 strings (all lowercase):
BASE64_WHITELIST = set((
    'thisdocument',
    'thisworkbook',
    'test',
    'temp',
    'http',
    'open',
    'exit',
))

# Strings encoded with a specific Dridex algorithm
# (see https://github.com/JamesHabben/MalwareStuff)
re_dridex_string = re.compile(
    r'"[0-9A-Za-z]{20,}"'
)
# Check that a string is not just a hex string, i.e. that it contains at
# least one letter outside A-F / a-f:
re_nothex_check = re.compile(
    r'[G-Zg-z]'
)
348   -
349   -#--- FUNCTIONS ----------------------------------------------------------------
350   -
def copytoken_help(decompressed_current, decompressed_chunk_start):
    """
    compute bit masks to decode a CopyToken according to MS-OVBA 2.4.1.3.19.1 CopyToken Help

    decompressed_current: number of decompressed bytes so far, i.e. len(decompressed_container)
    decompressed_chunk_start: offset of the current chunk in the decompressed container
    return length_mask, offset_mask, bit_count, maximum_length

    raise ValueError if no byte has been decompressed yet in the current chunk
    (decompressed_current <= decompressed_chunk_start): in that case there is
    nothing a CopyToken could refer to.
    """
    difference = decompressed_current - decompressed_chunk_start
    if difference < 1:
        raise ValueError(
            'CopyToken without any decompressed data in the current chunk '
            '(difference = {0})'.format(difference))
    # bit_count = ceil(log2(difference)), computed with integer arithmetic only:
    # a floating point logarithm is not guaranteed to be exact for powers of 2.
    bit_count = 0
    while (1 << bit_count) < difference:
        bit_count += 1
    # the offset is always stored on at least 4 bits
    bit_count = max(bit_count, 4)
    # the low bits of the 16-bit token hold the length, the high bits the offset
    length_mask = 0xFFFF >> bit_count
    # NOTE: this is a negative integer; it is meant to be AND-ed with a 16-bit
    # token value, which yields the expected non-negative result.
    offset_mask = ~length_mask
    maximum_length = length_mask + 3
    return length_mask, offset_mask, bit_count, maximum_length
366   -
367   -
def decompress_stream(compressed_container):
    """
    Decompress a stream according to MS-OVBA section 2.4.1

    compressed_container: byte string (str on Python 2, bytes on Python 3)
        compressed according to the MS-OVBA 2.4.1.3.6 Compression algorithm
    return the decompressed container as a byte string

    raise ValueError if the signature byte or a chunk signature is invalid,
        or if the chunk size does not match the chunk flag.
    raise IndexError if compressed_container is empty, struct.error if a
        chunk header or a CopyToken is truncated.
    """
    # State variables, named after MS-OVBA 2.4.1.2:
    # - compressed_current: location of the next byte to be read in the
    #   CompressedContainer
    # - compressed_chunk_start: location of the first byte of the current
    #   CompressedChunk within the CompressedContainer
    # - len(decompressed_container): DecompressedCurrent, location of the next
    #   byte to be written in the DecompressedBuffer
    # - decompressed_chunk_start: location of the first byte of the current
    #   DecompressedChunk within the DecompressedBuffer

    # bytearray view of the input: indexing yields integers on both Python 2
    # and Python 3, so no call to ord() is needed.
    compressed_data = bytearray(compressed_container)
    #NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that
    # CompressedRecordEnd = len(compressed_container)
    compressed_record_end = len(compressed_data)
    # The result is accumulated in a bytearray: bytes are appended in place,
    # instead of creating a new string for each decompressed byte.
    decompressed_container = bytearray()
    compressed_current = 0

    sig_byte = compressed_data[compressed_current]
    if sig_byte != 0x01:
        raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))

    compressed_current += 1

    while compressed_current < compressed_record_end:
        # 2.4.1.1.5
        compressed_chunk_start = compressed_current
        # chunk header = first 16 bits
        compressed_chunk_header = struct.unpack(
            "<H", bytes(compressed_data[compressed_chunk_start:compressed_chunk_start + 2]))[0]
        # chunk size = 12 first bits of header + 3
        chunk_size = (compressed_chunk_header & 0x0FFF) + 3
        # chunk signature = 3 next bits - should always be 0b011
        chunk_signature = (compressed_chunk_header >> 12) & 0x07
        if chunk_signature != 0b011:
            raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream')
        # chunk flag = next bit - 1 == compressed, 0 == uncompressed
        chunk_flag = (compressed_chunk_header >> 15) & 0x01
        logging.debug("chunk size = {0}, compressed flag = {1}".format(chunk_size, chunk_flag))

        #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096)
        # The minimum size is 3 bytes
        # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value
        # in chunk header before adding 3).
        # Also the first test is not useful since a 12 bits value cannot be larger than 4095.
        if chunk_flag == 1 and chunk_size > 4098:
            raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1')
        if chunk_flag == 0 and chunk_size != 4098:
            raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0')

        # check if chunk_size goes beyond the compressed data, instead of silently cutting it:
        #TODO: raise an exception?
        if compressed_chunk_start + chunk_size > compressed_record_end:
            logging.warning('Chunk size is larger than remaining compressed data')
        compressed_end = min(compressed_record_end, compressed_chunk_start + chunk_size)
        # read after chunk header:
        compressed_current = compressed_chunk_start + 2

        if chunk_flag == 0:
            # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
            # uncompressed chunk: read the next 4096 bytes as-is
            #TODO: check if there are at least 4096 bytes left
            decompressed_container += compressed_data[compressed_current:compressed_current + 4096]
            compressed_current += 4096
            continue

        # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
        decompressed_chunk_start = len(decompressed_container)
        while compressed_current < compressed_end:
            # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence
            # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
            # copy tokens (reference to a previous literal token)
            flag_byte = compressed_data[compressed_current]
            compressed_current += 1
            for bit_index in range(8):
                if compressed_current >= compressed_end:
                    break
                # MS-OVBA 2.4.1.3.5 Decompressing a Token
                # MS-OVBA 2.4.1.3.17 Extract FlagBit
                flag_bit = (flag_byte >> bit_index) & 1
                if flag_bit == 0:  # LiteralToken
                    # copy one byte directly to output
                    decompressed_container.append(compressed_data[compressed_current])
                    compressed_current += 1
                else:  # CopyToken
                    # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
                    copy_token = struct.unpack(
                        "<H", bytes(compressed_data[compressed_current:compressed_current + 2]))[0]
                    #TODO: check this
                    length_mask, offset_mask, bit_count, _ = copytoken_help(
                        len(decompressed_container), decompressed_chunk_start)
                    length = (copy_token & length_mask) + 3
                    offset = ((copy_token & offset_mask) >> (16 - bit_count)) + 1
                    copy_source = len(decompressed_container) - offset
                    # The copy must be done byte by byte: the source range may
                    # overlap the bytes being appended (run-length style copy).
                    # NOTE: an offset larger than the data decompressed so far
                    # gives a negative copy_source, which Python indexing
                    # interprets relative to the end of the buffer.
                    for index in range(copy_source, copy_source + length):
                        decompressed_container.append(decompressed_container[index])
                    compressed_current += 2
    return bytes(decompressed_container)
484   -
485   -
486   -def _extract_vba (ole, vba_root, project_path, dir_path):
487   - """
488   - Extract VBA macros from an OleFileIO object.
489   - Internal function, do not call directly.
490   -
491   - vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream
492   - vba_project: path to the PROJECT stream
493   - This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream
494   - """
495   - # Open the PROJECT stream:
496   - project = ole.openstream(project_path)
497   -
498   - # sample content of the PROJECT stream:
499   -
500   - ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}"
501   - ## Document=ThisDocument/&H00000000
502   - ## Module=NewMacros
503   - ## Name="Project"
504   - ## HelpContextID="0"
505   - ## VersionCompatible32="393222000"
506   - ## CMG="F1F301E705E705E705E705"
507   - ## DPB="8F8D7FE3831F2020202020"
508   - ## GC="2D2FDD81E51EE61EE6E1"
509   - ##
510   - ## [Host Extender Info]
511   - ## &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000
512   - ## &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000
513   - ##
514   - ## [Workspace]
515   - ## ThisDocument=22, 29, 339, 477, Z
516   - ## NewMacros=-4, 42, 832, 510, C
517   -
518   - code_modules = {}
519   -
520   - for line in project:
521   - line = line.strip()
522   - if '=' in line:
523   - # split line at the 1st equal sign:
524   - name, value = line.split('=', 1)
525   - # looking for code modules
526   - # add the code module as a key in the dictionary
527   - # the value will be the extension needed later
528   - # The value is converted to lowercase, to allow case-insensitive matching (issue #3)
529   - value = value.lower()
530   - if name == 'Document':
531   - # split value at the 1st slash, keep 1st part:
532   - value = value.split('/', 1)[0]
533   - code_modules[value] = CLASS_EXTENSION
534   - elif name == 'Module':
535   - code_modules[value] = MODULE_EXTENSION
536   - elif name == 'Class':
537   - code_modules[value] = CLASS_EXTENSION
538   - elif name == 'BaseClass':
539   - code_modules[value] = FORM_EXTENSION
540   -
541   - # read data from dir stream (compressed)
542   - dir_compressed = ole.openstream(dir_path).read()
543   -
544   - def check_value(name, expected, value):
545   - if expected != value:
546   - logging.error("invalid value for {0} expected {1:04X} got {2:04X}".format(name, expected, value))
547   -
548   - dir_stream = cStringIO.StringIO(decompress_stream(dir_compressed))
549   -
550   - # PROJECTSYSKIND Record
551   - PROJECTSYSKIND_Id = struct.unpack("<H", dir_stream.read(2))[0]
552   - check_value('PROJECTSYSKIND_Id', 0x0001, PROJECTSYSKIND_Id)
553   - PROJECTSYSKIND_Size = struct.unpack("<L", dir_stream.read(4))[0]
554   - check_value('PROJECTSYSKIND_Size', 0x0004, PROJECTSYSKIND_Size)
555   - PROJECTSYSKIND_SysKind = struct.unpack("<L", dir_stream.read(4))[0]
556   - if PROJECTSYSKIND_SysKind == 0x00:
557   - logging.debug("16-bit Windows")
558   - elif PROJECTSYSKIND_SysKind == 0x01:
559   - logging.debug("32-bit Windows")
560   - elif PROJECTSYSKIND_SysKind == 0x02:
561   - logging.debug("Macintosh")
562   - elif PROJECTSYSKIND_SysKind == 0x03:
563   - logging.debug("64-bit Windows")
564   - else:
565   - logging.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(PROJECTSYSKIND_SysKind))
566   -
567   - # PROJECTLCID Record
568   - PROJECTLCID_Id = struct.unpack("<H", dir_stream.read(2))[0]
569   - check_value('PROJECTLCID_Id', 0x0002, PROJECTLCID_Id)
570   - PROJECTLCID_Size = struct.unpack("<L", dir_stream.read(4))[0]
571   - check_value('PROJECTLCID_Size', 0x0004, PROJECTLCID_Size)
572   - PROJECTLCID_Lcid = struct.unpack("<L", dir_stream.read(4))[0]
573   - check_value('PROJECTLCID_Lcid', 0x409, PROJECTLCID_Lcid)
574   -
575   - # PROJECTLCIDINVOKE Record
576   - PROJECTLCIDINVOKE_Id = struct.unpack("<H", dir_stream.read(2))[0]
577   - check_value('PROJECTLCIDINVOKE_Id', 0x0014, PROJECTLCIDINVOKE_Id)
578   - PROJECTLCIDINVOKE_Size = struct.unpack("<L", dir_stream.read(4))[0]
579   - check_value('PROJECTLCIDINVOKE_Size', 0x0004, PROJECTLCIDINVOKE_Size)
580   - PROJECTLCIDINVOKE_LcidInvoke = struct.unpack("<L", dir_stream.read(4))[0]
581   - check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, PROJECTLCIDINVOKE_LcidInvoke)
582   -
583   - # PROJECTCODEPAGE Record
584   - PROJECTCODEPAGE_Id = struct.unpack("<H", dir_stream.read(2))[0]
585   - check_value('PROJECTCODEPAGE_Id', 0x0003, PROJECTCODEPAGE_Id)
586   - PROJECTCODEPAGE_Size = struct.unpack("<L", dir_stream.read(4))[0]
587   - check_value('PROJECTCODEPAGE_Size', 0x0002, PROJECTCODEPAGE_Size)
588   - PROJECTCODEPAGE_CodePage = struct.unpack("<H", dir_stream.read(2))[0]
589   -
590   - # PROJECTNAME Record
591   - PROJECTNAME_Id = struct.unpack("<H", dir_stream.read(2))[0]
592   - check_value('PROJECTNAME_Id', 0x0004, PROJECTNAME_Id)
593   - PROJECTNAME_SizeOfProjectName = struct.unpack("<L", dir_stream.read(4))[0]
594   - if PROJECTNAME_SizeOfProjectName < 1 or PROJECTNAME_SizeOfProjectName > 128:
595   - logging.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(PROJECTNAME_SizeOfProjectName))
596   - PROJECTNAME_ProjectName = dir_stream.read(PROJECTNAME_SizeOfProjectName)
597   -
598   - # PROJECTDOCSTRING Record
599   - PROJECTDOCSTRING_Id = struct.unpack("<H", dir_stream.read(2))[0]
600   - check_value('PROJECTDOCSTRING_Id', 0x0005, PROJECTDOCSTRING_Id)
601   - PROJECTDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0]
602   - if PROJECTNAME_SizeOfProjectName > 2000:
603   - logging.error("PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(PROJECTDOCSTRING_SizeOfDocString))
604   - PROJECTDOCSTRING_DocString = dir_stream.read(PROJECTDOCSTRING_SizeOfDocString)
605   - PROJECTDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
606   - check_value('PROJECTDOCSTRING_Reserved', 0x0040, PROJECTDOCSTRING_Reserved)
607   - PROJECTDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0]
608   - if PROJECTDOCSTRING_SizeOfDocStringUnicode % 2 != 0:
609   - logging.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even")
610   - PROJECTDOCSTRING_DocStringUnicode = dir_stream.read(PROJECTDOCSTRING_SizeOfDocStringUnicode)
611   -
612   - # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7
613   - PROJECTHELPFILEPATH_Id = struct.unpack("<H", dir_stream.read(2))[0]
614   - check_value('PROJECTHELPFILEPATH_Id', 0x0006, PROJECTHELPFILEPATH_Id)
615   - PROJECTHELPFILEPATH_SizeOfHelpFile1 = struct.unpack("<L", dir_stream.read(4))[0]
616   - if PROJECTHELPFILEPATH_SizeOfHelpFile1 > 260:
617   - logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(PROJECTHELPFILEPATH_SizeOfHelpFile1))
618   - PROJECTHELPFILEPATH_HelpFile1 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile1)
619   - PROJECTHELPFILEPATH_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
620   - check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, PROJECTHELPFILEPATH_Reserved)
621   - PROJECTHELPFILEPATH_SizeOfHelpFile2 = struct.unpack("<L", dir_stream.read(4))[0]
622   - if PROJECTHELPFILEPATH_SizeOfHelpFile2 != PROJECTHELPFILEPATH_SizeOfHelpFile1:
623   - logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2")
624   - PROJECTHELPFILEPATH_HelpFile2 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile2)
625   - if PROJECTHELPFILEPATH_HelpFile2 != PROJECTHELPFILEPATH_HelpFile1:
626   - logging.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2")
627   -
628   - # PROJECTHELPCONTEXT Record
629   - PROJECTHELPCONTEXT_Id = struct.unpack("<H", dir_stream.read(2))[0]
630   - check_value('PROJECTHELPCONTEXT_Id', 0x0007, PROJECTHELPCONTEXT_Id)
631   - PROJECTHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0]
632   - check_value('PROJECTHELPCONTEXT_Size', 0x0004, PROJECTHELPCONTEXT_Size)
633   - PROJECTHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0]
634   -
635   - # PROJECTLIBFLAGS Record
636   - PROJECTLIBFLAGS_Id = struct.unpack("<H", dir_stream.read(2))[0]
637   - check_value('PROJECTLIBFLAGS_Id', 0x0008, PROJECTLIBFLAGS_Id)
638   - PROJECTLIBFLAGS_Size = struct.unpack("<L", dir_stream.read(4))[0]
639   - check_value('PROJECTLIBFLAGS_Size', 0x0004, PROJECTLIBFLAGS_Size)
640   - PROJECTLIBFLAGS_ProjectLibFlags = struct.unpack("<L", dir_stream.read(4))[0]
641   - check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, PROJECTLIBFLAGS_ProjectLibFlags)
642   -
643   - # PROJECTVERSION Record
644   - PROJECTVERSION_Id = struct.unpack("<H", dir_stream.read(2))[0]
645   - check_value('PROJECTVERSION_Id', 0x0009, PROJECTVERSION_Id)
646   - PROJECTVERSION_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
647   - check_value('PROJECTVERSION_Reserved', 0x0004, PROJECTVERSION_Reserved)
648   - PROJECTVERSION_VersionMajor = struct.unpack("<L", dir_stream.read(4))[0]
649   - PROJECTVERSION_VersionMinor = struct.unpack("<H", dir_stream.read(2))[0]
650   -
651   - # PROJECTCONSTANTS Record
652   - PROJECTCONSTANTS_Id = struct.unpack("<H", dir_stream.read(2))[0]
653   - check_value('PROJECTCONSTANTS_Id', 0x000C, PROJECTCONSTANTS_Id)
654   - PROJECTCONSTANTS_SizeOfConstants = struct.unpack("<L", dir_stream.read(4))[0]
655   - if PROJECTCONSTANTS_SizeOfConstants > 1015:
656   - logging.error("PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(PROJECTCONSTANTS_SizeOfConstants))
657   - PROJECTCONSTANTS_Constants = dir_stream.read(PROJECTCONSTANTS_SizeOfConstants)
658   - PROJECTCONSTANTS_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
659   - check_value('PROJECTCONSTANTS_Reserved', 0x003C, PROJECTCONSTANTS_Reserved)
660   - PROJECTCONSTANTS_SizeOfConstantsUnicode = struct.unpack("<L", dir_stream.read(4))[0]
661   - if PROJECTCONSTANTS_SizeOfConstantsUnicode % 2 != 0:
662   - logging.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even")
663   - PROJECTCONSTANTS_ConstantsUnicode = dir_stream.read(PROJECTCONSTANTS_SizeOfConstantsUnicode)
664   -
665   - # array of REFERENCE records
666   - check = None
667   - while True:
668   - check = struct.unpack("<H", dir_stream.read(2))[0]
669   - logging.debug("reference type = {0:04X}".format(check))
670   - if check == 0x000F:
671   - break
672   -
673   - if check == 0x0016:
674   - # REFERENCENAME
675   - REFERENCE_Id = check
676   - REFERENCE_SizeOfName = struct.unpack("<L", dir_stream.read(4))[0]
677   - REFERENCE_Name = dir_stream.read(REFERENCE_SizeOfName)
678   - REFERENCE_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
679   - check_value('REFERENCE_Reserved', 0x003E, REFERENCE_Reserved)
680   - REFERENCE_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
681   - REFERENCE_NameUnicode = dir_stream.read(REFERENCE_SizeOfNameUnicode)
682   - continue
683   -
684   - if check == 0x0033:
685   - # REFERENCEORIGINAL (followed by REFERENCECONTROL)
686   - REFERENCEORIGINAL_Id = check
687   - REFERENCEORIGINAL_SizeOfLibidOriginal = struct.unpack("<L", dir_stream.read(4))[0]
688   - REFERENCEORIGINAL_LibidOriginal = dir_stream.read(REFERENCEORIGINAL_SizeOfLibidOriginal)
689   - continue
690   -
691   - if check == 0x002F:
692   - # REFERENCECONTROL
693   - REFERENCECONTROL_Id = check
694   - REFERENCECONTROL_SizeTwiddled = struct.unpack("<L", dir_stream.read(4))[0] # ignore
695   - REFERENCECONTROL_SizeOfLibidTwiddled = struct.unpack("<L", dir_stream.read(4))[0]
696   - REFERENCECONTROL_LibidTwiddled = dir_stream.read(REFERENCECONTROL_SizeOfLibidTwiddled)
697   - REFERENCECONTROL_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0] # ignore
698   - check_value('REFERENCECONTROL_Reserved1', 0x0000, REFERENCECONTROL_Reserved1)
699   - REFERENCECONTROL_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0] # ignore
700   - check_value('REFERENCECONTROL_Reserved2', 0x0000, REFERENCECONTROL_Reserved2)
701   - # optional field
702   - check2 = struct.unpack("<H", dir_stream.read(2))[0]
703   - if check2 == 0x0016:
704   - REFERENCECONTROL_NameRecordExtended_Id = check
705   - REFERENCECONTROL_NameRecordExtended_SizeofName = struct.unpack("<L", dir_stream.read(4))[0]
706   - REFERENCECONTROL_NameRecordExtended_Name = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeofName)
707   - REFERENCECONTROL_NameRecordExtended_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
708   - check_value('REFERENCECONTROL_NameRecordExtended_Reserved', 0x003E, REFERENCECONTROL_NameRecordExtended_Reserved)
709   - REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
710   - REFERENCECONTROL_NameRecordExtended_NameUnicode = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode)
711   - REFERENCECONTROL_Reserved3 = struct.unpack("<H", dir_stream.read(2))[0]
712   - else:
713   - REFERENCECONTROL_Reserved3 = check2
714   -
715   - check_value('REFERENCECONTROL_Reserved3', 0x0030, REFERENCECONTROL_Reserved3)
716   - REFERENCECONTROL_SizeExtended = struct.unpack("<L", dir_stream.read(4))[0]
717   - REFERENCECONTROL_SizeOfLibidExtended = struct.unpack("<L", dir_stream.read(4))[0]
718   - REFERENCECONTROL_LibidExtended = dir_stream.read(REFERENCECONTROL_SizeOfLibidExtended)
719   - REFERENCECONTROL_Reserved4 = struct.unpack("<L", dir_stream.read(4))[0]
720   - REFERENCECONTROL_Reserved5 = struct.unpack("<H", dir_stream.read(2))[0]
721   - REFERENCECONTROL_OriginalTypeLib = dir_stream.read(16)
722   - REFERENCECONTROL_Cookie = struct.unpack("<L", dir_stream.read(4))[0]
723   - continue
724   -
725   - if check == 0x000D:
726   - # REFERENCEREGISTERED
727   - REFERENCEREGISTERED_Id = check
728   - REFERENCEREGISTERED_Size = struct.unpack("<L", dir_stream.read(4))[0]
729   - REFERENCEREGISTERED_SizeOfLibid = struct.unpack("<L", dir_stream.read(4))[0]
730   - REFERENCEREGISTERED_Libid = dir_stream.read(REFERENCEREGISTERED_SizeOfLibid)
731   - REFERENCEREGISTERED_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0]
732   - check_value('REFERENCEREGISTERED_Reserved1', 0x0000, REFERENCEREGISTERED_Reserved1)
733   - REFERENCEREGISTERED_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0]
734   - check_value('REFERENCEREGISTERED_Reserved2', 0x0000, REFERENCEREGISTERED_Reserved2)
735   - continue
736   -
737   - if check == 0x000E:
738   - # REFERENCEPROJECT
739   - REFERENCEPROJECT_Id = check
740   - REFERENCEPROJECT_Size = struct.unpack("<L", dir_stream.read(4))[0]
741   - REFERENCEPROJECT_SizeOfLibidAbsolute = struct.unpack("<L", dir_stream.read(4))[0]
742   - REFERENCEPROJECT_LibidAbsolute = dir_stream.read(REFERENCEPROJECT_SizeOfLibidAbsolute)
743   - REFERENCEPROJECT_SizeOfLibidRelative = struct.unpack("<L", dir_stream.read(4))[0]
744   - REFERENCEPROJECT_LibidRelative = dir_stream.read(REFERENCEPROJECT_SizeOfLibidRelative)
745   - REFERENCEPROJECT_MajorVersion = struct.unpack("<L", dir_stream.read(4))[0]
746   - REFERENCEPROJECT_MinorVersion = struct.unpack("<H", dir_stream.read(2))[0]
747   - continue
748   -
749   - logging.error('invalid or unknown check Id {0:04X}'.format(check))
750   - sys.exit(0)
751   -
752   - PROJECTMODULES_Id = check #struct.unpack("<H", dir_stream.read(2))[0]
753   - check_value('PROJECTMODULES_Id', 0x000F, PROJECTMODULES_Id)
754   - PROJECTMODULES_Size = struct.unpack("<L", dir_stream.read(4))[0]
755   - check_value('PROJECTMODULES_Size', 0x0002, PROJECTMODULES_Size)
756   - PROJECTMODULES_Count = struct.unpack("<H", dir_stream.read(2))[0]
757   - PROJECTMODULES_ProjectCookieRecord_Id = struct.unpack("<H", dir_stream.read(2))[0]
758   - check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, PROJECTMODULES_ProjectCookieRecord_Id)
759   - PROJECTMODULES_ProjectCookieRecord_Size = struct.unpack("<L", dir_stream.read(4))[0]
760   - check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, PROJECTMODULES_ProjectCookieRecord_Size)
761   - PROJECTMODULES_ProjectCookieRecord_Cookie = struct.unpack("<H", dir_stream.read(2))[0]
762   -
763   - logging.debug("parsing {0} modules".format(PROJECTMODULES_Count))
764   - for x in xrange(0, PROJECTMODULES_Count):
765   - MODULENAME_Id = struct.unpack("<H", dir_stream.read(2))[0]
766   - check_value('MODULENAME_Id', 0x0019, MODULENAME_Id)
767   - MODULENAME_SizeOfModuleName = struct.unpack("<L", dir_stream.read(4))[0]
768   - MODULENAME_ModuleName = dir_stream.read(MODULENAME_SizeOfModuleName)
769   - # account for optional sections
770   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
771   - if section_id == 0x0047:
772   - MODULENAMEUNICODE_Id = section_id
773   - MODULENAMEUNICODE_SizeOfModuleNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
774   - MODULENAMEUNICODE_ModuleNameUnicode = dir_stream.read(MODULENAMEUNICODE_SizeOfModuleNameUnicode)
775   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
776   - if section_id == 0x001A:
777   - MODULESTREAMNAME_id = section_id
778   - MODULESTREAMNAME_SizeOfStreamName = struct.unpack("<L", dir_stream.read(4))[0]
779   - MODULESTREAMNAME_StreamName = dir_stream.read(MODULESTREAMNAME_SizeOfStreamName)
780   - MODULESTREAMNAME_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
781   - check_value('MODULESTREAMNAME_Reserved', 0x0032, MODULESTREAMNAME_Reserved)
782   - MODULESTREAMNAME_SizeOfStreamNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
783   - MODULESTREAMNAME_StreamNameUnicode = dir_stream.read(MODULESTREAMNAME_SizeOfStreamNameUnicode)
784   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
785   - if section_id == 0x001C:
786   - MODULEDOCSTRING_Id = section_id
787   - check_value('MODULEDOCSTRING_Id', 0x001C, MODULEDOCSTRING_Id)
788   - MODULEDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0]
789   - MODULEDOCSTRING_DocString = dir_stream.read(MODULEDOCSTRING_SizeOfDocString)
790   - MODULEDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
791   - check_value('MODULEDOCSTRING_Reserved', 0x0048, MODULEDOCSTRING_Reserved)
792   - MODULEDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0]
793   - MODULEDOCSTRING_DocStringUnicode = dir_stream.read(MODULEDOCSTRING_SizeOfDocStringUnicode)
794   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
795   - if section_id == 0x0031:
796   - MODULEOFFSET_Id = section_id
797   - check_value('MODULEOFFSET_Id', 0x0031, MODULEOFFSET_Id)
798   - MODULEOFFSET_Size = struct.unpack("<L", dir_stream.read(4))[0]
799   - check_value('MODULEOFFSET_Size', 0x0004, MODULEOFFSET_Size)
800   - MODULEOFFSET_TextOffset = struct.unpack("<L", dir_stream.read(4))[0]
801   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
802   - if section_id == 0x001E:
803   - MODULEHELPCONTEXT_Id = section_id
804   - check_value('MODULEHELPCONTEXT_Id', 0x001E, MODULEHELPCONTEXT_Id)
805   - MODULEHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0]
806   - check_value('MODULEHELPCONTEXT_Size', 0x0004, MODULEHELPCONTEXT_Size)
807   - MODULEHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0]
808   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
809   - if section_id == 0x002C:
810   - MODULECOOKIE_Id = section_id
811   - check_value('MODULECOOKIE_Id', 0x002C, MODULECOOKIE_Id)
812   - MODULECOOKIE_Size = struct.unpack("<L", dir_stream.read(4))[0]
813   - check_value('MODULECOOKIE_Size', 0x0002, MODULECOOKIE_Size)
814   - MODULECOOKIE_Cookie = struct.unpack("<H", dir_stream.read(2))[0]
815   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
816   - if section_id == 0x0021 or section_id == 0x0022:
817   - MODULETYPE_Id = section_id
818   - MODULETYPE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
819   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
820   - if section_id == 0x0025:
821   - MODULEREADONLY_Id = section_id
822   - check_value('MODULEREADONLY_Id', 0x0025, MODULEREADONLY_Id)
823   - MODULEREADONLY_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
824   - check_value('MODULEREADONLY_Reserved', 0x0000, MODULEREADONLY_Reserved)
825   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
826   - if section_id == 0x0028:
827   - MODULEPRIVATE_Id = section_id
828   - check_value('MODULEPRIVATE_Id', 0x0028, MODULEPRIVATE_Id)
829   - MODULEPRIVATE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
830   - check_value('MODULEPRIVATE_Reserved', 0x0000, MODULEPRIVATE_Reserved)
831   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
832   - if section_id == 0x002B: # TERMINATOR
833   - MODULE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
834   - check_value('MODULE_Reserved', 0x0000, MODULE_Reserved)
835   - section_id = None
836   - if section_id != None:
837   - logging.warning('unknown or invalid module section id {0:04X}'.format(section_id))
838   -
839   - logging.debug('Project CodePage = %d' % PROJECTCODEPAGE_CodePage)
840   - vba_codec = 'cp%d' % PROJECTCODEPAGE_CodePage
841   - logging.debug("ModuleName = {0}".format(MODULENAME_ModuleName))
842   - logging.debug("StreamName = {0}".format(repr(MODULESTREAMNAME_StreamName)))
843   - streamname_unicode = MODULESTREAMNAME_StreamName.decode(vba_codec)
844   - logging.debug("StreamName.decode('%s') = %s" % (vba_codec, repr(streamname_unicode)))
845   - logging.debug("StreamNameUnicode = {0}".format(repr(MODULESTREAMNAME_StreamNameUnicode)))
846   - logging.debug("TextOffset = {0}".format(MODULEOFFSET_TextOffset))
847   -
848   - code_path = vba_root + u'VBA/' + streamname_unicode
849   - #TODO: test if stream exists
850   - logging.debug('opening VBA code stream %s' % repr(code_path))
851   - code_data = ole.openstream(code_path).read()
852   - logging.debug("length of code_data = {0}".format(len(code_data)))
853   - logging.debug("offset of code_data = {0}".format(MODULEOFFSET_TextOffset))
854   - code_data = code_data[MODULEOFFSET_TextOffset:]
855   - if len(code_data) > 0:
856   - code_data = decompress_stream(code_data)
857   - # case-insensitive search in the code_modules dict to find the file extension:
858   - filext = code_modules.get(MODULENAME_ModuleName.lower(), 'bin')
859   - filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext)
860   - #TODO: also yield the codepage so that callers can decode it properly
861   - yield (code_path, filename, code_data)
862   - # print '-'*79
863   - # print filename
864   - # print ''
865   - # print code_data
866   - # print ''
867   - logging.debug('extracted file {0}'.format(filename))
868   - else:
869   - logging.warning("module stream {0} has code data length 0".format(MODULESTREAMNAME_StreamName))
870   - return
871   -
872   -
def filter_vba(vba_code):
    """
    Filter VBA source code to remove the first lines starting with "Attribute VB_",
    which are automatically added by MS Office and not displayed in the VBA Editor.
    This should only be used when displaying source code for human analysis.

    Note: lines are not filtered if they contain a colon, because it could be
    used to hide malicious instructions.

    The text is split with str.splitlines() and re-joined with a single
    newline character, so the original line endings and any trailing newline
    are not preserved.

    :param vba_code: str, VBA source code
    :return: str, filtered VBA source code
    """
    vba_lines = vba_code.splitlines()
    # count the leading "Attribute VB_" lines; stop at the first line that
    # is not one of them, or that contains a colon (statement separator):
    start = 0
    for line in vba_lines:
        if not line.startswith("Attribute VB_") or ':' in line:
            break
        start += 1
    #TODO: also remove empty lines?
    return '\n'.join(vba_lines[start:])
895   -
896   -
def detect_autoexec(vba_code, obfuscation=None):
    """
    Look for keywords of macros that run automatically when triggered by
    specific actions (e.g. when a document is opened or closed).

    Every keyword listed in AUTOEXEC_KEYWORDS is searched in the code between
    regex word boundaries, ignoring case.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword, description)
    """
    #TODO: merge code with detect_suspicious
    #TODO: if keyword is already a compiled regex, use it as-is
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    return [
        (keyword, description + suffix)
        for description, keywords in AUTOEXEC_KEYWORDS.items()
        for keyword in keywords
        # the keyword is inserted as-is into the pattern (not escaped):
        if re.search(r'(?i)\b' + keyword + r'\b', vba_code)
    ]
922   -
923   -
def detect_suspicious(vba_code, obfuscation=None):
    """
    Look for suspicious keywords corresponding to potential malware behaviour.

    Every keyword listed in SUSPICIOUS_KEYWORDS is searched in the code
    between regex word boundaries, ignoring case.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword, description)
    """
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    return [
        (keyword, description + suffix)
        for description, keywords in SUSPICIOUS_KEYWORDS.items()
        for keyword in keywords
        # the keyword is inserted as-is into the pattern (not escaped):
        if re.search(r'(?i)\b' + keyword + r'\b', vba_code)
    ]
946   -
947   -
def detect_patterns(vba_code, obfuscation=None):
    """
    Search the VBA code for the patterns listed in RE_PATTERNS (according to
    the original description: IP addresses, URLs, e-mail addresses,
    executable file names, etc).

    Each matched value is reported only once, for the first pattern that
    matched it.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to the pattern type
    :return: list of str tuples (pattern type, value)
    """
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    seen = set()
    hits = []
    for pattern_type, pattern_re in RE_PATTERNS:
        label = pattern_type + suffix
        for match in pattern_re.finditer(vba_code):
            value = match.group()
            if value in seen:
                # already reported (possibly under another pattern type)
                continue
            seen.add(value)
            hits.append((label, value))
    return hits
968   -
969   -
def detect_hex_strings(vba_code):
    """
    Find the strings encoded in hexadecimal within the VBA code, using the
    re_hex_string pattern, and decode each distinct one once.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    seen = set()
    pairs = []
    for hit in re_hex_string.finditer(vba_code):
        encoded = hit.group()
        if encoded in seen:
            continue
        seen.add(encoded)
        pairs.append((encoded, binascii.unhexlify(encoded)))
    return pairs
986   -
987   -
def detect_base64_strings(vba_code):
    """
    Detect if the VBA code contains strings encoded in base64.

    Candidates are found with re_base64_string, stripped of their double
    quotes, and kept only if re_nothex_check matches them and they are not
    listed (lowercased) in BASE64_WHITELIST. Candidates that cannot be
    decoded are silently skipped.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    #TODO: avoid matching simple hex strings as base64?
    results = []
    found = set()
    for match in re_base64_string.finditer(vba_code):
        # extract the base64 string without quotes:
        value = match.group().strip('"')
        # check it is not just a hex string:
        if not re_nothex_check.search(value):
            continue
        # only keep new values and not in the whitelist:
        if value in found or value.lower() in BASE64_WHITELIST:
            continue
        try:
            decoded = base64.b64decode(value)
        except (TypeError, ValueError):
            # decoding failed, so it is likely not a base64-encoded string.
            # b64decode raises TypeError on Python 2 and binascii.Error
            # (a ValueError subclass) on Python 3; anything else, such as
            # KeyboardInterrupt, is deliberately not swallowed here.
            continue
        results.append((value, decoded))
        found.add(value)
    return results
1014   -
1015   -
def detect_dridex_strings(vba_code):
    """
    Detect if the VBA code contains strings obfuscated with a specific algorithm found in Dridex samples.

    Candidates are found with re_dridex_string; the first and last characters
    of each match are dropped before decoding. Candidates that the decoder
    rejects are silently skipped.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    # imported here so that the third-party decoder is only needed when
    # this function is actually called:
    from thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode
    results = []
    found = set()
    for match in re_dridex_string.finditer(vba_code):
        value = match.group()[1:-1]
        # check it is not just a hex string:
        if not re_nothex_check.search(value):
            continue
        if value in found:
            continue
        try:
            decoded = DridexUrlDecode(value)
        except Exception:
            # if an exception occurs, it is likely not a dridex-encoded string.
            # Only regular errors are ignored: KeyboardInterrupt and
            # SystemExit are not subclasses of Exception and still propagate.
            continue
        results.append((value, decoded))
        found.add(value)
    return results
1040   -
1041   -
class VBA_Scanner(object):
    """
    Class to scan the source code of a VBA module to find obfuscated strings,
    suspicious keywords, IOCs, auto-executable macros, etc.
    """

    def __init__(self, vba_code):
        """
        VBA_Scanner constructor

        :param vba_code: str, VBA source code to be analyzed
        """
        self.code = vba_code
        # Decoded strings, each one preceded by a newline, rebuilt by scan():
        self.code_hex = ''
        self.code_hex_rev = ''
        self.code_rev_hex = ''
        self.code_base64 = ''
        self.code_dridex = ''

    def scan(self, include_decoded_strings=False):
        """
        Analyze the provided VBA code to detect suspicious keywords,
        auto-executable macros, IOC patterns, obfuscation patterns
        such as hex-encoded strings.

        The intermediate results are stored on the instance (hex_strings,
        base64_strings, dridex_strings, autoexec_keywords,
        suspicious_keywords, iocs, strReverse and the code_* buffers). They
        are all recomputed from scratch on every call, so scan() can be
        called several times without accumulating duplicated data.

        :param include_decoded_strings: bool, if True, all encoded strings will be included with their decoded content.
        :return: list of tuples (type, keyword, description)
        (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex string')
        """
        # First, detect and extract hex-encoded strings:
        self.hex_strings = detect_hex_strings(self.code)
        # detect if the code contains StrReverse:
        self.strReverse = 'strreverse' in self.code.lower()
        # Then gather the decoded strings, to detect obfuscated IOCs and keywords.
        # The buffers are assigned (not appended to), so that a previous scan
        # does not leave its content behind:
        self.code_hex = ''.join(
            ['\n' + decoded for _, decoded in self.hex_strings])
        self.code_hex_rev = ''
        self.code_rev_hex = ''
        # if the code contains "StrReverse", also gather the hex strings in reverse order:
        if self.strReverse:
            # StrReverse after hex decoding:
            self.code_hex_rev = ''.join(
                ['\n' + decoded[::-1] for _, decoded in self.hex_strings])
            # StrReverse before hex decoding:
            self.code_rev_hex = ''.join(
                ['\n' + binascii.unhexlify(encoded[::-1])
                 for encoded, _ in self.hex_strings])
            #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/
        #TODO: also append the full code reversed if StrReverse? (risk of false positives?)
        # Detect Base64-encoded strings
        self.base64_strings = detect_base64_strings(self.code)
        self.code_base64 = ''.join(
            ['\n' + decoded for _, decoded in self.base64_strings])
        # Detect Dridex-encoded strings
        self.dridex_strings = detect_dridex_strings(self.code)
        self.code_dridex = ''.join(
            ['\n' + decoded for _, decoded in self.dridex_strings])

        self.autoexec_keywords = []
        self.suspicious_keywords = []
        self.iocs = []

        # run each detector on the plain code, then on every decoded variant:
        for code, obfuscation in (
                (self.code, None),
                (self.code_hex, 'Hex'),
                (self.code_hex_rev, 'Hex+StrReverse'),
                (self.code_rev_hex, 'StrReverse+Hex'),
                (self.code_base64, 'Base64'),
                (self.code_dridex, 'Dridex'),
        ):
            self.autoexec_keywords += detect_autoexec(code, obfuscation)
            self.suspicious_keywords += detect_suspicious(code, obfuscation)
            self.iocs += detect_patterns(code, obfuscation)

        # If encoded strings were discovered, add an item to suspicious keywords:
        if self.hex_strings:
            self.suspicious_keywords.append(('Hex Strings',
                'Hex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.base64_strings:
            self.suspicious_keywords.append(('Base64 Strings',
                'Base64-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.dridex_strings:
            self.suspicious_keywords.append(('Dridex Strings',
                'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))

        results = []
        for keyword, description in self.autoexec_keywords:
            results.append(('AutoExec', keyword, description))
        for keyword, description in self.suspicious_keywords:
            results.append(('Suspicious', keyword, description))
        for pattern_type, value in self.iocs:
            results.append(('IOC', value, pattern_type))
        if include_decoded_strings:
            for encoded, decoded in self.hex_strings:
                results.append(('Hex String', repr(decoded), encoded))
            for encoded, decoded in self.base64_strings:
                results.append(('Base64 String', repr(decoded), encoded))
            for encoded, decoded in self.dridex_strings:
                results.append(('Dridex string', repr(decoded), encoded))
        return results

    def scan_summary(self):
        """
        Analyze the provided VBA code to detect suspicious keywords,
        auto-executable macros, IOC patterns, obfuscation patterns
        such as hex-encoded strings.

        This runs a full scan() and only reports the counts.

        :return: tuple with the number of items found for each category:
            (autoexec, suspicious, IOCs, hex, base64, dridex)
        """
        self.scan()
        return (len(self.autoexec_keywords), len(self.suspicious_keywords),
                len(self.iocs), len(self.hex_strings), len(self.base64_strings),
                len(self.dridex_strings))
1151   -
1152   -
1153   -
def scan_vba(vba_code, include_decoded_strings):
    """
    Convenience wrapper: build a VBA_Scanner for the provided VBA code and
    run its scan() method once, to detect suspicious keywords,
    auto-executable macros, IOC patterns and obfuscation patterns such as
    hex-encoded strings.

    :param vba_code: str, VBA source code to be analyzed
    :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content.
    :return: list of tuples (type, keyword, description), as returned by
        VBA_Scanner.scan()
    """
    scanner = VBA_Scanner(vba_code)
    return scanner.scan(include_decoded_strings)
1167   -
1168   -
1169   -#=== CLASSES =================================================================
1170   -
1171   -class VBA_Parser(object):
1172   - """
1173   - Class to parse MS Office files, to detect VBA macros and extract VBA source code
1174   - Supported file formats:
1175   - - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
1176   - - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
1177   - - PowerPoint 2007+ (.pptm, .ppsm)
1178   - """
1179   -
1180   - def __init__(self, filename, data=None):
1181   - """
1182   - Constructor for VBA_Parser
1183   -
1184   - :param filename: filename or path of file to parse, or file-like object
1185   -
1186   - :param data: None or bytes str, if None the file will be read from disk (or from the file-like object).
1187   - If data is provided as a bytes string, it will be parsed as the content of the file in memory,
1188   - and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb').
1189   - """
1190   - #TODO: filename should only be a string, data should be used for the file-like object
1191   - #TODO: filename should be mandatory, optional data is a string or file-like object
1192   - #TODO: also support olefile and zipfile as input
1193   - if data is None:
1194   - # open file from disk:
1195   - _file = filename
1196   - else:
1197   - # file already read in memory, make it a file-like object for zipfile:
1198   - _file = cStringIO.StringIO(data)
1199   - #self.file = _file
1200   - self.ole_file = None
1201   - self.ole_subfiles = []
1202   - self.filename = filename
1203   - self.type = None
1204   - self.vba_projects = None
1205   - # if filename is None:
1206   - # if isinstance(_file, basestring):
1207   - # if len(_file) < olefile.MINIMAL_OLEFILE_SIZE:
1208   - # self.filename = _file
1209   - # else:
1210   - # self.filename = '<file in bytes string>'
1211   - # else:
1212   - # self.filename = '<file-like object>'
1213   - if olefile.isOleFile(_file):
1214   - # This looks like an OLE file
1215   - logging.info('Parsing OLE file %s' % self.filename)
1216   - # Open and parse the OLE file, using unicode for path names:
1217   - self.ole_file = olefile.OleFileIO(_file, path_encoding=None)
1218   - self.type = TYPE_OLE
1219   - #TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet
1220   - elif zipfile.is_zipfile(_file):
1221   - # This looks like a zip file, need to look for vbaProject.bin inside
1222   - # It can be any OLE file inside the archive
1223   - #...because vbaProject.bin can be renamed:
1224   - # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
1225   - logging.info('Opening ZIP/OpenXML file %s' % self.filename)
1226   - self.type = TYPE_OpenXML
1227   - z = zipfile.ZipFile(_file)
1228   - #TODO: check if this is actually an OpenXML file
1229   - #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically?
1230   - # check each file within the zip if it is an OLE file, by reading its magic:
1231   - for subfile in z.namelist():
1232   - magic = z.open(subfile).read(len(olefile.MAGIC))
1233   - if magic == olefile.MAGIC:
1234   - logging.debug('Opening OLE file %s within zip' % subfile)
1235   - ole_data = z.open(subfile).read()
1236   - try:
1237   - self.ole_subfiles.append(VBA_Parser(filename=subfile, data=ole_data))
1238   - except:
1239   - logging.debug('%s is not a valid OLE file' % subfile)
1240   - continue
1241   - z.close()
1242   - else:
1243   - # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML,
1244   - # or a plain text file containing VBA code
1245   - if data is None:
1246   - data = open(filename, 'rb').read()
1247   - # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
1248   - if 'http://schemas.microsoft.com/office/word/2003/wordml' in data:
1249   - logging.info('Opening Word 2003 XML file %s' % self.filename)
1250   - self.type = TYPE_Word2003_XML
1251   - # parse the XML content
1252   - et = ET.fromstring(data)
1253   - # find all the binData elements:
1254   - for bindata in et.getiterator(TAG_BINDATA):
1255   - # the binData content is an OLE container for the VBA project, compressed
1256   - # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
1257   - # get the filename:
1258   - fname = bindata.get(ATTR_NAME, 'noname.mso')
1259   - # decode the base64 activemime
1260   - activemime = binascii.a2b_base64(bindata.text)
1261   - # decompress the zlib data starting at offset 0x32, which is the OLE container:
1262   - ole_data = zlib.decompress(activemime[0x32:])
1263   - try:
1264   - self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))
1265   - except:
1266   - logging.debug('%s is not a valid OLE file' % fname)
1267   - continue
1268   - #TODO: handle exceptions
1269   - #TODO: Excel 2003 XML
1270   - #TODO: plain text VBA file
1271   - else:
1272   - msg = '%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % self.filename
1273   - logging.error(msg)
1274   - raise TypeError(msg)
1275   -
1276   - def find_vba_projects (self):
1277   - """
1278   - Finds all the VBA projects stored in an OLE file.
1279   -
1280   - Return None if the file is not OLE but OpenXML.
1281   - Return a list of tuples (vba_root, project_path, dir_path) for each VBA project.
1282   - vba_root is the path of the root OLE storage containing the VBA project,
1283   - including a trailing slash unless it is the root of the OLE file.
1284   - project_path is the path of the OLE stream named "PROJECT" within the VBA project.
1285   - dir_path is the path of the OLE stream named "VBA/dir" within the VBA project.
1286   -
1287   - If this function returns an empty list for one of the supported formats
1288   - (i.e. Word, Excel, Powerpoint except Powerpoint 97-2003), then the
1289   - file does not contain VBA macros.
1290   -
1291   - :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path)
1292   - for each VBA project found if OLE file
1293   - """
1294   - # if the file is not OLE but OpenXML, return None:
1295   - if self.ole_file is None:
1296   - return None
1297   -
1298   - # if this method has already been called, return previous result:
1299   - if self.vba_projects is not None:
1300   - return self.vba_projects
1301   -
1302   - # Find the VBA project root (different in MS Word, Excel, etc):
1303   - # - Word 97-2003: Macros
1304   - # - Excel 97-2003: _VBA_PROJECT_CUR
1305   - # - PowerPoint 97-2003: not supported yet (different file structure)
1306   - # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin.
1307   - # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word
1308   - # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word
1309   - # - Visio 2007: not supported yet (different file structure)
1310   -
1311   - # According to MS-OVBA section 2.2.1:
1312   - # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream
1313   - # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream
1314   - # - all names are case-insensitive
1315   -
1316   - # start with an empty list:
1317   - self.vba_projects = []
1318   - # Look for any storage containing those storage/streams:
1319   - ole = self.ole_file
1320   - for storage in ole.listdir(streams=False, storages=True):
1321   - # Look for a storage ending with "VBA":
1322   - if storage[-1].upper() == 'VBA':
1323   - logging.debug('Found VBA storage: %s' % ('/'.join(storage)))
1324   - vba_root = '/'.join(storage[:-1])
1325   - # Add a trailing slash to vba_root, unless it is the root of the OLE file:
1326   - # (used later to append all the child streams/storages)
1327   - if vba_root != '':
1328   - vba_root += '/'
1329   - logging.debug('Checking vba_root="%s"' % vba_root)
1330   -
1331   - def check_vba_stream(ole, vba_root, stream_path):
1332   - full_path = vba_root + stream_path
1333   - if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM:
1334   - logging.debug('Found %s stream: %s' % (stream_path, full_path))
1335   - return full_path
1336   - else:
1337   - logging.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path)
1338   - return False
1339   -
1340   - # Check if the VBA root storage also contains a PROJECT stream:
1341   - project_path = check_vba_stream(ole, vba_root, 'PROJECT')
1342   - if not project_path: continue
1343   - # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream:
1344   - vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT')
1345   - if not vba_project_path: continue
1346   - # Check if the VBA root storage also contains a VBA/dir stream:
1347   - dir_path = check_vba_stream(ole, vba_root, 'VBA/dir')
1348   - if not dir_path: continue
1349   - # Now we are pretty sure it is a VBA project structure
1350   - logging.debug('VBA root storage: "%s"' % vba_root)
1351   - # append the results to the list as a tuple for later use:
1352   - self.vba_projects.append((vba_root, project_path, dir_path))
1353   - return self.vba_projects
1354   -
1355   - def detect_vba_macros(self):
1356   - """
1357   - Detect the potential presence of VBA macros in the file, by checking
1358   - if it contains VBA projects. Both OLE and OpenXML files are supported.
1359   -
1360   - Important: for now, results are accurate only for Word, Excel and PowerPoint
1361   - EXCEPT Powerpoint 97-2003, which has a different structure for VBA.
1362   -
1363   - Note: this method does NOT attempt to check the actual presence or validity
1364   - of VBA macro source code, so there might be false positives.
1365   - It may also detect VBA macros in files embedded within the main file,
1366   - for example an Excel workbook with macros embedded into a Word
1367   - document without macros may be detected, without distinction.
1368   -
1369   - :return: bool, True if at least one VBA project has been found, False otherwise
1370   - """
1371   - #TODO: return None or raise exception if format not supported like PPT 97-2003
1372   - #TODO: return the number of VBA projects found instead of True/False?
1373   - # if OpenXML, check all the OLE subfiles:
1374   - if self.ole_file is None:
1375   - for ole_subfile in self.ole_subfiles:
1376   - if ole_subfile.detect_vba_macros():
1377   - return True
1378   - return False
1379   - # otherwise it's an OLE file, find VBA projects:
1380   - vba_projects = self.find_vba_projects()
1381   - if len(vba_projects) == 0:
1382   - return False
1383   - else:
1384   - return True
1385   -
1386   -
1387   - def extract_macros (self):
1388   - """
1389   - Extract and decompress source code for each VBA macro found in the file
1390   -
1391   - Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found
1392   - If the file is OLE, filename is the path of the file.
1393   - If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros
1394   - within the zip archive, e.g. word/vbaProject.bin.
1395   - """
1396   - if self.ole_file is None:
1397   - for ole_subfile in self.ole_subfiles:
1398   - for results in ole_subfile.extract_macros():
1399   - yield results
1400   - else:
1401   - self.find_vba_projects()
1402   - for vba_root, project_path, dir_path in self.vba_projects:
1403   - # extract all VBA macros from that VBA root storage:
1404   - for stream_path, vba_filename, vba_code in _extract_vba(self.ole_file, vba_root, project_path, dir_path):
1405   - yield (self.filename, stream_path, vba_filename, vba_code)
1406   -
1407   -
1408   - def close(self):
1409   - """
1410   - Close all the open files. This method must be called after usage, if
1411   - the application is opening many files.
1412   - """
1413   - if self.ole_file is None:
1414   - for ole_subfile in self.ole_subfiles:
1415   - ole_subfile.close()
1416   - else:
1417   - self.ole_file.close()
1418   -
1419   -
1420   -def print_analysis(vba_code, show_decoded_strings=False):
1421   - """
1422   - Analyze the provided VBA code, and print the results in a table
1423   -
1424   - :param vba_code: str, VBA source code to be analyzed
1425   - :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
1426   - :return: None
1427   - """
1428   - results = scan_vba(vba_code, show_decoded_strings)
1429   - if results:
1430   - t = prettytable.PrettyTable(('Type', 'Keyword', 'Description'))
1431   - t.align = 'l'
1432   - t.max_width['Type'] = 10
1433   - t.max_width['Keyword'] = 20
1434   - t.max_width['Description'] = 39
1435   - for kw_type, keyword, description in results:
1436   - t.add_row((kw_type, keyword, description))
1437   - print t
1438   - else:
1439   - print 'No suspicious keyword or IOC found.'
1440   -
1441   -
1442   -
1443   -def process_file (container, filename, data, show_decoded_strings=False):
1444   - """
1445   - Process a single file
1446   -
1447   - :param container: str, path and filename of container if the file is within
1448   - a zip archive, None otherwise.
1449   - :param filename: str, path and filename of file on disk, or within the container.
1450   - :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
1451   - :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
1452   - """
1453   - #TODO: replace print by writing to a provided output file (sys.stdout by default)
1454   - if container:
1455   - display_filename = '%s in %s' % (filename, container)
1456   - else:
1457   - display_filename = filename
1458   - print '='*79
1459   - print 'FILE:', display_filename
1460   - try:
1461   - #TODO: handle olefile errors, when an OLE file is malformed
1462   - vba = VBA_Parser(filename, data)
1463   - print 'Type:', vba.type
1464   - if vba.detect_vba_macros():
1465   - #print 'Contains VBA Macros:'
1466   - for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
1467   - # hide attribute lines:
1468   - #TODO: option to disable attribute filtering
1469   - vba_code_filtered = filter_vba(vba_code)
1470   - print '-'*79
1471   - print 'VBA MACRO %s ' % vba_filename
1472   - print 'in file: %s - OLE stream: %s' % (subfilename, repr(stream_path))
1473   - print '- '*39
1474   - # detect empty macros:
1475   - if vba_code_filtered.strip() == '':
1476   - print '(empty macro)'
1477   - else:
1478   - print vba_code_filtered
1479   - print '- '*39
1480   - print 'ANALYSIS:'
1481   - # analyse the whole code, filtered to avoid false positives:
1482   - print_analysis(vba_code_filtered, show_decoded_strings)
1483   - else:
1484   - print 'No VBA macros found.'
1485   - except: #TypeError:
1486   - #raise
1487   - #TODO: print more info if debug mode
1488   - #print sys.exc_value
1489   - # display the exception with full stack trace for debugging, but do not stop:
1490   - traceback.print_exc()
1491   - print ''
1492   -
1493   -
1494   -def process_file_triage (container, filename, data):
1495   - """
1496   - Process a single file
1497   -
1498   - :param container: str, path and filename of container if the file is within
1499   - a zip archive, None otherwise.
1500   - :param filename: str, path and filename of file on disk, or within the container.
1501   - :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
1502   - """
1503   - #TODO: replace print by writing to a provided output file (sys.stdout by default)
1504   - nb_macros = 0
1505   - nb_autoexec = 0
1506   - nb_suspicious = 0
1507   - nb_iocs = 0
1508   - nb_hexstrings = 0
1509   - nb_base64strings = 0
1510   - nb_dridexstrings = 0
1511   - # ftype = 'Other'
1512   - message = ''
1513   - try:
1514   - #TODO: handle olefile errors, when an OLE file is malformed
1515   - vba = VBA_Parser(filename, data)
1516   - if vba.detect_vba_macros():
1517   - for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
1518   - nb_macros += 1
1519   - if vba_code.strip() != '':
1520   - # analyse the whole code, filtered to avoid false positives:
1521   - scanner = VBA_Scanner(filter_vba(vba_code))
1522   - autoexec, suspicious, iocs, hexstrings, base64strings, dridex = scanner.scan_summary()
1523   - nb_autoexec += autoexec
1524   - nb_suspicious += suspicious
1525   - nb_iocs += iocs
1526   - nb_hexstrings += hexstrings
1527   - nb_base64strings += base64strings
1528   - nb_dridexstrings += dridex
1529   - if vba.type == TYPE_OLE:
1530   - flags = 'OLE:'
1531   - elif vba.type == TYPE_OpenXML:
1532   - flags = 'OpX:'
1533   - elif vba.type == TYPE_Word2003_XML:
1534   - flags = 'XML:'
1535   - macros = autoexec = suspicious = iocs = hexstrings = base64obf = dridex = '-'
1536   - if nb_macros: macros = 'M'
1537   - if nb_autoexec: autoexec = 'A'
1538   - if nb_suspicious: suspicious = 'S'
1539   - if nb_iocs: iocs = 'I'
1540   - if nb_hexstrings: hexstrings = 'H'
1541   - if nb_base64strings: base64obf = 'B'
1542   - if nb_dridexstrings: dridex = 'D'
1543   - flags += '%s%s%s%s%s%s%s' % (macros, autoexec, suspicious, iocs, hexstrings,
1544   - base64obf, dridex)
1545   -
1546   - # macros = autoexec = suspicious = iocs = hexstrings = 'no'
1547   - # if nb_macros: macros = 'YES:%d' % nb_macros
1548   - # if nb_autoexec: autoexec = 'YES:%d' % nb_autoexec
1549   - # if nb_suspicious: suspicious = 'YES:%d' % nb_suspicious
1550   - # if nb_iocs: iocs = 'YES:%d' % nb_iocs
1551   - # if nb_hexstrings: hexstrings = 'YES:%d' % nb_hexstrings
1552   - # # 2nd line = info
1553   - # print '%-8s %-7s %-7s %-7s %-7s %-7s' % (vba.type, macros, autoexec, suspicious, iocs, hexstrings)
1554   - except TypeError:
1555   - # file type not OLE nor OpenXML
1556   - flags = '?'
1557   - message = 'File format not supported'
1558   - except:
1559   - # another error occurred
1560   - #raise
1561   - #TODO: print more info if debug mode
1562   - #TODO: distinguish real errors from incorrect file types
1563   - flags = '!ERROR'
1564   - message = sys.exc_value
1565   - line = '%-11s %s' % (flags, filename)
1566   - if message:
1567   - line += ' - %s' % message
1568   - print line
1569   -
1570   - # t = prettytable.PrettyTable(('filename', 'type', 'macros', 'autoexec', 'suspicious', 'ioc', 'hexstrings'),
1571   - # header=False, border=False)
1572   - # t.align = 'l'
1573   - # t.max_width['filename'] = 30
1574   - # t.max_width['type'] = 10
1575   - # t.max_width['macros'] = 6
1576   - # t.max_width['autoexec'] = 6
1577   - # t.max_width['suspicious'] = 6
1578   - # t.max_width['ioc'] = 6
1579   - # t.max_width['hexstrings'] = 6
1580   - # t.add_row((filename, ftype, macros, autoexec, suspicious, iocs, hexstrings))
1581   - # print t
1582   -
1583   -def main_triage_quick():
1584   - pass
1585   -
1586   -#=== MAIN =====================================================================
1587   -
1588   -def main():
1589   - """
1590   - Main function, called when olevba is run from the command line
1591   - """
1592   - usage = 'usage: %prog [options] <filename> [filename2 ...]'
1593   - parser = optparse.OptionParser(usage=usage)
1594   - # parser.add_option('-o', '--outfile', dest='outfile',
1595   - # help='output file')
1596   - # parser.add_option('-c', '--csv', dest='csv',
1597   - # help='export results to a CSV file')
1598   - parser.add_option("-r", action="store_true", dest="recursive",
1599   - help='find files recursively in subdirectories.')
1600   - parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
1601   - help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)')
1602   - parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
1603   - help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
1604   - parser.add_option("-t", action="store_true", dest="triage_mode",
1605   - help='triage mode, display results as a summary table (default for multiple files)')
1606   - parser.add_option("-d", action="store_true", dest="detailed_mode",
1607   - help='detailed mode, display full results (default for single file)')
1608   - parser.add_option("-i", "--input", dest='input', type='str', default=None,
1609   - help='input file containing VBA source code to be analyzed (no parsing)')
1610   - parser.add_option("--decode", action="store_true", dest="show_decoded_strings",
1611   - help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex).')
1612   -
1613   - (options, args) = parser.parse_args()
1614   -
1615   - # Print help if no arguments are passed
1616   - if len(args) == 0 and not options.input:
1617   - print __doc__
1618   - parser.print_help()
1619   - sys.exit()
1620   -
1621   - # print banner with version
1622   - print 'olevba %s - http://decalage.info/python/oletools' % __version__
1623   -
1624   - logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) #INFO)
1625   - # For now, all logging is disabled:
1626   - logging.disable(logging.CRITICAL)
1627   -
1628   - if options.input:
1629   - # input file provided with VBA source code to be analyzed directly:
1630   - print 'Analysis of VBA source code from %s:' % options.input
1631   - vba_code = open(options.input).read()
1632   - print_analysis(vba_code, show_decoded_strings=options.show_decoded_strings)
1633   - sys.exit()
1634   -
1635   - # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('Type', 'Macros', 'AutoEx', 'Susp.', 'IOCs', 'HexStr')
1636   - # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('-'*8, '-'*7, '-'*7, '-'*7, '-'*7, '-'*7)
1637   - if not options.detailed_mode or options.triage_mode:
1638   - print '%-11s %-65s' % ('Flags', 'Filename')
1639   - print '%-11s %-65s' % ('-'*11, '-'*65)
1640   - previous_container = None
1641   - count = 0
1642   - container = filename = data = None
1643   - for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
1644   - zip_password=options.zip_password, zip_fname=options.zip_fname):
1645   - # ignore directory names stored in zip files:
1646   - if container and filename.endswith('/'):
1647   - continue
1648   - if options.detailed_mode and not options.triage_mode:
1649   - # fully detailed output
1650   - process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings)
1651   - else:
1652   - # print container name when it changes:
1653   - if container != previous_container:
1654   - if container is not None:
1655   - print '\nFiles in %s:' % container
1656   - previous_container = container
1657   - # summarized output for triage:
1658   - process_file_triage(container, filename, data)
1659   - count += 1
1660   - if not options.detailed_mode or options.triage_mode:
1661   - print '\n(Flags: OpX=OpenXML, XML=Word2003XML, M=Macros, A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, B=Base64 strings, D=Dridex strings, ?=Unknown)\n'
1662   -
1663   - if count == 1 and not options.triage_mode and not options.detailed_mode:
1664   - # if options -t and -d were not specified and it's a single file, print details:
1665   - #TODO: avoid doing the analysis twice by storing results
1666   - process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings)
1667   -
1668   -if __name__ == '__main__':
1669   - main()
1670   -
  1 +#!/usr/bin/env python
  2 +"""
  3 +olevba.py
  4 +
  5 +olevba is a script to parse OLE and OpenXML files such as MS Office documents
  6 +(e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate
  7 +and analyze malicious macros.
  8 +
  9 +Supported formats:
  10 +- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
  11 +- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
  12 +- PowerPoint 2007+ (.pptm, .ppsm)
  13 +- Word 2003 XML (.xml)
  14 +
  15 +Author: Philippe Lagadec - http://www.decalage.info
  16 +License: BSD, see source code or documentation
  17 +
  18 +olevba is part of the python-oletools package:
  19 +http://www.decalage.info/python/oletools
  20 +
  21 +olevba is based on source code from officeparser by John William Davison
  22 +https://github.com/unixfreak0037/officeparser
  23 +"""
  24 +
  25 +#=== LICENSE ==================================================================
  26 +
  27 +# olevba is copyright (c) 2014-2015 Philippe Lagadec (http://www.decalage.info)
  28 +# All rights reserved.
  29 +#
  30 +# Redistribution and use in source and binary forms, with or without modification,
  31 +# are permitted provided that the following conditions are met:
  32 +#
  33 +# * Redistributions of source code must retain the above copyright notice, this
  34 +# list of conditions and the following disclaimer.
  35 +# * Redistributions in binary form must reproduce the above copyright notice,
  36 +# this list of conditions and the following disclaimer in the documentation
  37 +# and/or other materials provided with the distribution.
  38 +#
  39 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  40 +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  41 +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  42 +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  43 +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  44 +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  45 +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  46 +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  47 +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  48 +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  49 +
  50 +
  51 +# olevba contains modified source code from the officeparser project, published
  52 +# under the following MIT License (MIT):
  53 +#
  54 +# officeparser is copyright (c) 2014 John William Davison
  55 +#
  56 +# Permission is hereby granted, free of charge, to any person obtaining a copy
  57 +# of this software and associated documentation files (the "Software"), to deal
  58 +# in the Software without restriction, including without limitation the rights
  59 +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  60 +# copies of the Software, and to permit persons to whom the Software is
  61 +# furnished to do so, subject to the following conditions:
  62 +#
  63 +# The above copyright notice and this permission notice shall be included in all
  64 +# copies or substantial portions of the Software.
  65 +#
  66 +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  67 +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  68 +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  69 +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  70 +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  71 +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  72 +# SOFTWARE.
  73 +
  74 +#------------------------------------------------------------------------------
  75 +# CHANGELOG:
  76 +# 2014-08-05 v0.01 PL: - first version based on officeparser code
  77 +# 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser
  78 +# 2014-08-15 PL: - fixed incorrect value check in PROJECTHELPFILEPATH Record
  79 +# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats
  80 +# and to find the VBA project root anywhere in the file
  81 +# 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL
  82 +# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API
  83 +# - added detect_vba_macros
  84 +# 2014-12-10 v0.06 PL: - hide first lines with VB attributes
  85 +# - detect auto-executable macros
  86 +# - ignore empty macros
  87 +# 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive
  88 +# 2014-12-15 v0.08 PL: - improved display for empty macros
  89 +# - added pattern extraction
  90 +# 2014-12-25 v0.09 PL: - added suspicious keywords detection
  91 +# 2014-12-27 v0.10 PL: - added OptionParser, main and process_file
  92 +# - uses xglob to scan several files with wildcards
  93 +# - option -r to recurse subdirectories
  94 +# - option -z to scan files in password-protected zips
  95 +# 2015-01-02 v0.11 PL: - improved filter_vba to detect colons
  96 +# 2015-01-03 v0.12 PL: - fixed detect_patterns to detect all patterns
  97 +# - process_file: improved display, shows container file
  98 +# - improved list of executable file extensions
  99 +# 2015-01-04 v0.13 PL: - added several suspicious keywords, improved display
  100 +# 2015-01-08 v0.14 PL: - added hex strings detection and decoding
  101 +# - fixed issue #2, decoding VBA stream names using
  102 +# specified codepage and unicode stream names
  103 +# 2015-01-11 v0.15 PL: - added new triage mode, options -t and -d
  104 +# 2015-01-16 v0.16 PL: - fix for issue #3 (exception when module name="text")
  105 +# - added several suspicious keywords
  106 +# - added option -i to analyze VBA source code directly
  107 +# 2015-01-17 v0.17 PL: - removed .com from the list of executable extensions
  108 +# - added scan_vba to run all detection algorithms
  109 +# - decoded hex strings are now also scanned + reversed
  110 +# 2015-01-23 v0.18 PL: - fixed issue #3, case-insensitive search in code_modules
  111 +# 2015-01-24 v0.19 PL: - improved the detection of IOCs obfuscated with hex
  112 +# strings and StrReverse
  113 +# 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded
  114 +# 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding
  115 +# - improved display, shows obfuscation name
  116 +# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename
  117 +# - added Base64 obfuscation decoding (contribution from
  118 +# @JamesHabben)
  119 +# 2015-02-03 v0.23 PL: - triage now uses VBA_Scanner results, shows Base64 and
  120 +# Dridex strings
  121 +# - exception handling in detect_base64_strings
  122 +# 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display
  123 +# - display exceptions with stack trace
  124 +# - added several suspicious keywords
  125 +# - improved Base64 detection and decoding
  126 +# - fixed triage mode not to scan attrib lines
  127 +# 2015-03-04 v0.25 PL: - added support for Word 2003 XML
  128 +
  129 +__version__ = '0.25'
  130 +
  131 +#------------------------------------------------------------------------------
  132 +# TODO:
  133 +# + do not use logging, but a provided logger (null logger by default)
  134 +# + setup logging (common with other oletools)
  135 +# + add xor bruteforcing like bbharvest
  136 +# + add chr() decoding
  137 +
  138 +# TODO later:
  139 +# + performance improvement: instead of searching each keyword separately,
  140 +# first split vba code into a list of words (per line), then check each
  141 +# word against a dict. (or put vba words into a set/dict?)
  142 +# + for regex, maybe combine them into a single re with named groups?
  143 +# + add Yara support, include sample rules? plugins like balbuzard?
  144 +# + add balbuzard support
  145 +# + output to file (replace print by file.write, sys.stdout by default)
  146 +# + look for VBA in embedded documents (e.g. Excel in Word)
  147 +# + support SRP streams (see Lenny's article + links and sample)
  148 +# - python 3.x support
  149 +# - add support for PowerPoint macros (see libclamav, libgsf), use oledump heuristic?
  150 +# - check VBA macros in Visio, Access, Project, etc
  151 +# - extract_macros: convert to a class, split long function into smaller methods
  152 +# - extract_macros: read bytes from stream file objects instead of strings
  153 +# - extract_macros: use combined struct.unpack instead of many calls
  154 +
  155 +#------------------------------------------------------------------------------
  156 +# REFERENCES:
  157 +# - [MS-OVBA]: Microsoft Office VBA File Format Structure
  158 +# http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx
  159 +# - officeparser: https://github.com/unixfreak0037/officeparser
  160 +
  161 +
  162 +#--- IMPORTS ------------------------------------------------------------------
  163 +
  164 +import sys, logging
  165 +import struct
  166 +import cStringIO
  167 +import math
  168 +import zipfile
  169 +import re
  170 +import optparse
  171 +import os.path
  172 +import binascii
  173 +import base64
  174 +import traceback
  175 +import zlib
  176 +
  177 +# import lxml or ElementTree for XML parsing:
  178 +try:
  179 + # lxml: best performance for XML processing
  180 + import lxml.etree as ET
  181 +except ImportError:
  182 + try:
  183 + # Python 2.5+: batteries included
  184 + import xml.etree.cElementTree as ET
  185 + except ImportError:
  186 + try:
  187 + # Python <2.5: standalone ElementTree install
  188 + import elementtree.cElementTree as ET
  189 + except ImportError:
  190 + raise ImportError, "lxml or ElementTree are not installed, "\
  191 + +"see http://codespeak.net/lxml "\
  192 + +"or http://effbot.org/zone/element-index.htm"
  193 +
  194 +import thirdparty.olefile as olefile
  195 +from thirdparty.prettytable import prettytable
  196 +from thirdparty.xglob import xglob
  197 +
  198 +#--- CONSTANTS ----------------------------------------------------------------
  199 +
  200 +TYPE_OLE = 'OLE'
  201 +TYPE_OpenXML = 'OpenXML'
  202 +TYPE_Word2003_XML = 'Word2003_XML'
  203 +
  204 +MODULE_EXTENSION = "bas"
  205 +CLASS_EXTENSION = "cls"
  206 +FORM_EXTENSION = "frm"
  207 +
  208 +# Namespaces and tags for Word2003 XML parsing:
  209 +NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}'
  210 +# the tag <w:binData w:name="editdata.mso"> contains the VBA macro code:
  211 +TAG_BINDATA = NS_W + 'binData'
  212 +ATTR_NAME = NS_W + 'name'
  213 +
  214 +# Keywords to detect auto-executable macros
  215 +AUTOEXEC_KEYWORDS = {
  216 + # MS Word:
  217 + 'Runs when the Word document is opened':
  218 + ('AutoExec', 'AutoOpen', 'Document_Open', 'DocumentOpen'),
  219 + 'Runs when the Word document is closed':
  220 + ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose'),
  221 + 'Runs when the Word document is modified':
  222 + ('DocumentChange',),
  223 + 'Runs when a new Word document is created':
  224 + ('AutoNew', 'Document_New', 'NewDocument'),
  225 +
  226 + # MS Excel:
  227 + 'Runs when the Excel Workbook is opened':
  228 + ('Auto_Open', 'Workbook_Open'),
  229 + 'Runs when the Excel Workbook is closed':
  230 + ('Auto_Close', 'Workbook_Close'),
  231 +
  232 + #TODO: full list in MS specs??
  233 +}
  234 +
# Keywords whose presence in VBA code may indicate malicious behaviour.
# Maps a human-readable description of the risk to the tuple of keywords
# associated with it.
# See VBA language reference: http://msdn.microsoft.com/en-us/library/office/jj692818%28v=office.15%29.aspx
#TODO: use regex to support variable whitespaces
SUSPICIOUS_KEYWORDS = {
    'May read system environment variables': ('Environ',),

    'May open a file': ('Open',),

    #TODO: regex to find Open+Write on same line
    'May write to a file (if combined with Open)': (
        'Write', 'Put', 'Output', 'Print #',
    ),

    #TODO: regex to find Open+Binary on same line
    'May read or write a binary file (if combined with Open)': ('Binary',),

    #FileCopy: http://msdn.microsoft.com/en-us/library/office/gg264390%28v=office.15%29.aspx
    #CopyFile: http://msdn.microsoft.com/en-us/library/office/gg264089%28v=office.15%29.aspx
    'May copy a file': ('FileCopy', 'CopyFile'),

    'May delete a file': ('Kill',),

    #CreateTextFile: http://msdn.microsoft.com/en-us/library/office/gg264617%28v=office.15%29.aspx
    #ADODB.Stream sample: http://pastebin.com/Z4TMyuq6
    'May create a text file': (
        'CreateTextFile', 'ADODB.Stream', 'WriteText', 'SaveToFile',
    ),

    #Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx
    #WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6
    'May run an executable file or a system command': (
        'Shell',
        'vbNormal',
        'vbNormalFocus',
        'vbHide',
        'vbMinimizedFocus',
        'vbMaximizedFocus',
        'vbNormalNoFocus',
        'vbMinimizedNoFocus',
        'WScript.Shell',
        'Run',
    ),

    'May hide the application': (
        'Application.Visible', 'ShowWindow', 'SW_HIDE',
    ),

    'May create a directory': ('MkDir',),

    'May save the current workbook': ('ActiveWorkbook.SaveAs',),

    #TODO: confirm the actual effect
    'May change which directory contains files to open at startup': (
        'Application.AltStartupPath',
    ),

    'May create an OLE object': ('CreateObject',),

    'May run an application (if combined with CreateObject)': (
        'Shell.Application',
    ),

    'May enumerate application windows (if combined with Shell.Application object)': (
        'Windows', 'FindWindow',
    ),

    #TODO: regex to find declare+lib on same line
    'May run code from a DLL': ('Lib',),

    #TODO: regex to find urlmon+URLDownloadToFileA on same line
    'May download files from the Internet': (
        'URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP',
    ),

    #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx
    'May control another application by simulating user keystrokes': (
        'SendKeys', 'AppActivate',
    ),

    #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx
    'May attempt to obfuscate malicious function calls': ('CallByName',),

    #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx
    #TODO: regex to find several Chr*, not just one
    'May attempt to obfuscate specific strings': (
        'Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor',
    ),
}
  296 +
# Regular Expression for a URL, assembled from smaller fragments:
# http://en.wikipedia.org/wiki/Uniform_resource_locator
# http://www.w3.org/Addressing/URL/uri-spec.html
#TODO: also support username:password@server
#TODO: other protocols (file, gopher, wais, ...?)
SCHEME = r'\b(?:http|ftp)s?'
# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})'
DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + ')'
#TODO: IPv6 - see https://www.debuggex.com/
# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. [db8:0cec::99:123a]
NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'
IPv4 = r'(?:' + NUMBER_0_255 + r'\.){3}' + NUMBER_0_255
# IPv4 must come before the DNS name because it is more specific
SERVER = r'(?:' + IPv4 + '|' + DNS_NAME + ')'
PORT = r'(?:\:[0-9]{1,5})?'
SERVER_PORT = SERVER + PORT
URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?'  # [^\.\,\)\(\s"]
URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH
re_url = re.compile(URL_RE)


# Patterns to be extracted (IP addresses, URLs, etc), as a tuple of
# (pattern name, compiled regex) pairs.
# From patterns.py in balbuzard
RE_PATTERNS = (
    ('URL', re.compile(URL_RE)),
    ('IPv4 address', re.compile(IPv4)),
    # NOTE: the trailing word boundary must be a raw string: a plain '\b' is
    # a backspace character (0x08), which made this pattern never match.
    ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@' + SERVER + r'\b')),
    # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')),
    # Executable file name with known extensions (except .com which is present in many URLs, and .application):
    ("Executable file name", re.compile(r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b")),
    # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
    #TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types
    #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')),
)
  332 +
# Strings encoded in hexadecimal: at least 4 consecutive pairs of hex digits
re_hex_string = re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')

# Strings encoded in base64 (double-quoted string literals only).
# Earlier, more permissive version:
#re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?"')
# Stricter version from balbuzard, with fewer false positives:
re_base64_string = re.compile(
    r'"(?:[A-Za-z0-9+/]{4}){1,}'               # one or more groups of 4 characters
    r'(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]='  # optional last group, one '=' of padding
    r'|[A-Za-z0-9+/][AQgw]==)?"'               # ... or two '=' of padding
)
# Common strings that match the base64 regex without being base64
# (all lowercase):
BASE64_WHITELIST = set([
    'thisdocument',
    'thisworkbook',
    'test',
    'temp',
    'http',
    'open',
    'exit',
])

# Strings encoded with a specific Dridex algorithm
# (see https://github.com/JamesHabben/MalwareStuff)
re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"')
# Finds any letter that is not a hex digit, to check that a candidate is not
# just a hex string:
re_nothex_check = re.compile(r'[G-Zg-z]')
  348 +
  349 +#--- FUNCTIONS ----------------------------------------------------------------
  350 +
def copytoken_help(decompressed_current, decompressed_chunk_start):
    """
    Compute the bit masks needed to decode a CopyToken, according to
    MS-OVBA 2.4.1.3.19.1 CopyToken Help.

    decompressed_current: number of decompressed bytes so far, i.e. len(decompressed_container)
    decompressed_chunk_start: offset of the current chunk in the decompressed container
    return length_mask, offset_mask, bit_count, maximum_length
    raise ValueError if no byte of the current chunk has been decompressed yet
    (a CopyToken would then have nothing to refer to)
    """
    difference = decompressed_current - decompressed_chunk_start
    if difference < 1:
        raise ValueError(
            'invalid CopyToken: no decompressed data in the current chunk '
            '(difference = {0})'.format(difference))
    # bit_count = smallest integer >= log2(difference), computed with integer
    # arithmetic only so that the result is exact
    bit_count = 0
    while (1 << bit_count) < difference:
        bit_count += 1
    # the offset is always stored on at least 4 bits
    bit_count = max(bit_count, 4)
    length_mask = 0xFFFF >> bit_count
    # NOTE: this is a negative Python integer; "copy_token & offset_mask" still
    # selects the upper bits of a 16-bit token as intended.
    offset_mask = ~length_mask
    maximum_length = length_mask + 3
    return length_mask, offset_mask, bit_count, maximum_length


def decompress_stream(compressed_container):
    """
    Decompress a stream according to MS-OVBA section 2.4.1

    compressed_container: string (bytes) compressed according to the MS-OVBA 2.4.1.3.6 Compression algorithm
    return the decompressed container as a string (bytes)
    raise ValueError if the container is empty, if the signature byte or a
    chunk header is invalid, or if a CopyToken refers to data located before
    the start of the decompressed container.
    struct.error is raised if a chunk header or a CopyToken is truncated.
    """
    # State variables (MS-OVBA 2.4.1.2), as used below:
    # - compressed_current: index of the next byte to read in compressed_container
    # - compressed_chunk_start: index of the first byte of the current chunk
    #   in compressed_container
    # - compressed_end: index just after the last byte of the current chunk
    # - decompressed_chunk_start: index of the first byte of the current chunk
    #   in decompressed_container
    # The "current" position in the decompressed data is always
    # len(decompressed_container).

    if not compressed_container:
        raise ValueError('empty compressed container')

    #NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that
    # CompressedRecordEnd = len(compressed_container)
    compressed_record_end = len(compressed_container)

    # A bytearray is used to accumulate the result: appending to it does not
    # copy the whole buffer, unlike repeated string concatenation.
    decompressed_container = bytearray()
    compressed_current = 0

    # One-byte slices are used instead of indexing, so that ord() works the
    # same way whether indexing yields a character or an integer.
    sig_byte = ord(compressed_container[compressed_current:compressed_current + 1])
    if sig_byte != 0x01:
        raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))

    compressed_current += 1

    while compressed_current < compressed_record_end:
        # 2.4.1.1.5
        compressed_chunk_start = compressed_current
        # chunk header = first 16 bits
        compressed_chunk_header = struct.unpack(
            "<H", compressed_container[compressed_chunk_start:compressed_chunk_start + 2])[0]
        # chunk size = 12 first bits of header + 3
        chunk_size = (compressed_chunk_header & 0x0FFF) + 3
        # chunk signature = 3 next bits - should always be 0b011
        chunk_signature = (compressed_chunk_header >> 12) & 0x07
        if chunk_signature != 0b011:
            raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream')
        # chunk flag = next bit - 1 == compressed, 0 == uncompressed
        chunk_flag = (compressed_chunk_header >> 15) & 0x01
        logging.debug("chunk size = {0}, compressed flag = {1}".format(chunk_size, chunk_flag))

        #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096)
        # The minimum size is 3 bytes
        # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value
        # in chunk header before adding 3).
        # Also the first test is not useful since a 12 bits value cannot be larger than 4095.
        if chunk_flag == 1 and chunk_size > 4098:
            raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1')
        if chunk_flag == 0 and chunk_size != 4098:
            raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0')

        # check if chunk_size goes beyond the compressed data, instead of silently cutting it:
        #TODO: raise an exception?
        if compressed_chunk_start + chunk_size > compressed_record_end:
            logging.warning('Chunk size is larger than remaining compressed data')
        compressed_end = min(compressed_record_end, compressed_chunk_start + chunk_size)
        # read after chunk header:
        compressed_current = compressed_chunk_start + 2

        if chunk_flag == 0:
            # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
            # uncompressed chunk: read the next 4096 bytes as-is
            # (fewer bytes are copied if the container is truncated, see the
            # warning above)
            decompressed_container += compressed_container[compressed_current:compressed_current + 4096]
            compressed_current += 4096
            continue

        # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
        decompressed_chunk_start = len(decompressed_container)
        while compressed_current < compressed_end:
            # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence
            # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
            # copy tokens (reference to a previous literal token)
            flag_byte = ord(compressed_container[compressed_current:compressed_current + 1])
            compressed_current += 1
            for bit_index in range(8):
                if compressed_current >= compressed_end:
                    break
                # MS-OVBA 2.4.1.3.5 Decompressing a Token
                # MS-OVBA 2.4.1.3.17 Extract FlagBit
                flag_bit = (flag_byte >> bit_index) & 1
                if flag_bit == 0:
                    # LiteralToken: copy one byte directly to output
                    decompressed_container += compressed_container[compressed_current:compressed_current + 1]
                    compressed_current += 1
                    continue

                # CopyToken
                # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
                copy_token = struct.unpack(
                    "<H", compressed_container[compressed_current:compressed_current + 2])[0]
                length_mask, offset_mask, bit_count, _maximum_length = copytoken_help(
                    len(decompressed_container), decompressed_chunk_start)
                length = (copy_token & length_mask) + 3
                offset = ((copy_token & offset_mask) >> (16 - bit_count)) + 1
                copy_source = len(decompressed_container) - offset
                if copy_source < 0:
                    # a negative index would silently wrap around and copy
                    # unrelated bytes from the end of the buffer
                    raise ValueError(
                        'invalid CopyToken: offset {0} points before the start '
                        'of the decompressed data'.format(offset))
                # The copy must be done byte by byte: source and destination
                # may overlap (offset < length), in which case bytes written
                # by this very copy are read back.
                for index in range(copy_source, copy_source + length):
                    decompressed_container.append(decompressed_container[index])
                compressed_current += 2

    return bytes(decompressed_container)
  484 +
  485 +
  486 +def _extract_vba (ole, vba_root, project_path, dir_path):
  487 + """
  488 + Extract VBA macros from an OleFileIO object.
  489 + Internal function, do not call directly.
  490 +
  491 + vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream
  492 + vba_project: path to the PROJECT stream
  493 + This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream
  494 + """
  495 + # Open the PROJECT stream:
  496 + project = ole.openstream(project_path)
  497 +
  498 + # sample content of the PROJECT stream:
  499 +
  500 + ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}"
  501 + ## Document=ThisDocument/&H00000000
  502 + ## Module=NewMacros
  503 + ## Name="Project"
  504 + ## HelpContextID="0"
  505 + ## VersionCompatible32="393222000"
  506 + ## CMG="F1F301E705E705E705E705"
  507 + ## DPB="8F8D7FE3831F2020202020"
  508 + ## GC="2D2FDD81E51EE61EE6E1"
  509 + ##
  510 + ## [Host Extender Info]
  511 + ## &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000
  512 + ## &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000
  513 + ##
  514 + ## [Workspace]
  515 + ## ThisDocument=22, 29, 339, 477, Z
  516 + ## NewMacros=-4, 42, 832, 510, C
  517 +
  518 + code_modules = {}
  519 +
  520 + for line in project:
  521 + line = line.strip()
  522 + if '=' in line:
  523 + # split line at the 1st equal sign:
  524 + name, value = line.split('=', 1)
  525 + # looking for code modules
  526 + # add the code module as a key in the dictionary
  527 + # the value will be the extension needed later
  528 + # The value is converted to lowercase, to allow case-insensitive matching (issue #3)
  529 + value = value.lower()
  530 + if name == 'Document':
  531 + # split value at the 1st slash, keep 1st part:
  532 + value = value.split('/', 1)[0]
  533 + code_modules[value] = CLASS_EXTENSION
  534 + elif name == 'Module':
  535 + code_modules[value] = MODULE_EXTENSION
  536 + elif name == 'Class':
  537 + code_modules[value] = CLASS_EXTENSION
  538 + elif name == 'BaseClass':
  539 + code_modules[value] = FORM_EXTENSION
  540 +
  541 + # read data from dir stream (compressed)
  542 + dir_compressed = ole.openstream(dir_path).read()
  543 +
  544 + def check_value(name, expected, value):
  545 + if expected != value:
  546 + logging.error("invalid value for {0} expected {1:04X} got {2:04X}".format(name, expected, value))
  547 +
  548 + dir_stream = cStringIO.StringIO(decompress_stream(dir_compressed))
  549 +
  550 + # PROJECTSYSKIND Record
  551 + PROJECTSYSKIND_Id = struct.unpack("<H", dir_stream.read(2))[0]
  552 + check_value('PROJECTSYSKIND_Id', 0x0001, PROJECTSYSKIND_Id)
  553 + PROJECTSYSKIND_Size = struct.unpack("<L", dir_stream.read(4))[0]
  554 + check_value('PROJECTSYSKIND_Size', 0x0004, PROJECTSYSKIND_Size)
  555 + PROJECTSYSKIND_SysKind = struct.unpack("<L", dir_stream.read(4))[0]
  556 + if PROJECTSYSKIND_SysKind == 0x00:
  557 + logging.debug("16-bit Windows")
  558 + elif PROJECTSYSKIND_SysKind == 0x01:
  559 + logging.debug("32-bit Windows")
  560 + elif PROJECTSYSKIND_SysKind == 0x02:
  561 + logging.debug("Macintosh")
  562 + elif PROJECTSYSKIND_SysKind == 0x03:
  563 + logging.debug("64-bit Windows")
  564 + else:
  565 + logging.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(PROJECTSYSKIND_SysKind))
  566 +
  567 + # PROJECTLCID Record
  568 + PROJECTLCID_Id = struct.unpack("<H", dir_stream.read(2))[0]
  569 + check_value('PROJECTLCID_Id', 0x0002, PROJECTLCID_Id)
  570 + PROJECTLCID_Size = struct.unpack("<L", dir_stream.read(4))[0]
  571 + check_value('PROJECTLCID_Size', 0x0004, PROJECTLCID_Size)
  572 + PROJECTLCID_Lcid = struct.unpack("<L", dir_stream.read(4))[0]
  573 + check_value('PROJECTLCID_Lcid', 0x409, PROJECTLCID_Lcid)
  574 +
  575 + # PROJECTLCIDINVOKE Record
  576 + PROJECTLCIDINVOKE_Id = struct.unpack("<H", dir_stream.read(2))[0]
  577 + check_value('PROJECTLCIDINVOKE_Id', 0x0014, PROJECTLCIDINVOKE_Id)
  578 + PROJECTLCIDINVOKE_Size = struct.unpack("<L", dir_stream.read(4))[0]
  579 + check_value('PROJECTLCIDINVOKE_Size', 0x0004, PROJECTLCIDINVOKE_Size)
  580 + PROJECTLCIDINVOKE_LcidInvoke = struct.unpack("<L", dir_stream.read(4))[0]
  581 + check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, PROJECTLCIDINVOKE_LcidInvoke)
  582 +
  583 + # PROJECTCODEPAGE Record
  584 + PROJECTCODEPAGE_Id = struct.unpack("<H", dir_stream.read(2))[0]
  585 + check_value('PROJECTCODEPAGE_Id', 0x0003, PROJECTCODEPAGE_Id)
  586 + PROJECTCODEPAGE_Size = struct.unpack("<L", dir_stream.read(4))[0]
  587 + check_value('PROJECTCODEPAGE_Size', 0x0002, PROJECTCODEPAGE_Size)
  588 + PROJECTCODEPAGE_CodePage = struct.unpack("<H", dir_stream.read(2))[0]
  589 +
  590 + # PROJECTNAME Record
  591 + PROJECTNAME_Id = struct.unpack("<H", dir_stream.read(2))[0]
  592 + check_value('PROJECTNAME_Id', 0x0004, PROJECTNAME_Id)
  593 + PROJECTNAME_SizeOfProjectName = struct.unpack("<L", dir_stream.read(4))[0]
  594 + if PROJECTNAME_SizeOfProjectName < 1 or PROJECTNAME_SizeOfProjectName > 128:
  595 + logging.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(PROJECTNAME_SizeOfProjectName))
  596 + PROJECTNAME_ProjectName = dir_stream.read(PROJECTNAME_SizeOfProjectName)
  597 +
  598 + # PROJECTDOCSTRING Record
  599 + PROJECTDOCSTRING_Id = struct.unpack("<H", dir_stream.read(2))[0]
  600 + check_value('PROJECTDOCSTRING_Id', 0x0005, PROJECTDOCSTRING_Id)
  601 + PROJECTDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0]
  602 + if PROJECTNAME_SizeOfProjectName > 2000:
  603 + logging.error("PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(PROJECTDOCSTRING_SizeOfDocString))
  604 + PROJECTDOCSTRING_DocString = dir_stream.read(PROJECTDOCSTRING_SizeOfDocString)
  605 + PROJECTDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  606 + check_value('PROJECTDOCSTRING_Reserved', 0x0040, PROJECTDOCSTRING_Reserved)
  607 + PROJECTDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  608 + if PROJECTDOCSTRING_SizeOfDocStringUnicode % 2 != 0:
  609 + logging.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even")
  610 + PROJECTDOCSTRING_DocStringUnicode = dir_stream.read(PROJECTDOCSTRING_SizeOfDocStringUnicode)
  611 +
  612 + # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7
  613 + PROJECTHELPFILEPATH_Id = struct.unpack("<H", dir_stream.read(2))[0]
  614 + check_value('PROJECTHELPFILEPATH_Id', 0x0006, PROJECTHELPFILEPATH_Id)
  615 + PROJECTHELPFILEPATH_SizeOfHelpFile1 = struct.unpack("<L", dir_stream.read(4))[0]
  616 + if PROJECTHELPFILEPATH_SizeOfHelpFile1 > 260:
  617 + logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(PROJECTHELPFILEPATH_SizeOfHelpFile1))
  618 + PROJECTHELPFILEPATH_HelpFile1 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile1)
  619 + PROJECTHELPFILEPATH_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  620 + check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, PROJECTHELPFILEPATH_Reserved)
  621 + PROJECTHELPFILEPATH_SizeOfHelpFile2 = struct.unpack("<L", dir_stream.read(4))[0]
  622 + if PROJECTHELPFILEPATH_SizeOfHelpFile2 != PROJECTHELPFILEPATH_SizeOfHelpFile1:
  623 + logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2")
  624 + PROJECTHELPFILEPATH_HelpFile2 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile2)
  625 + if PROJECTHELPFILEPATH_HelpFile2 != PROJECTHELPFILEPATH_HelpFile1:
  626 + logging.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2")
  627 +
  628 + # PROJECTHELPCONTEXT Record
  629 + PROJECTHELPCONTEXT_Id = struct.unpack("<H", dir_stream.read(2))[0]
  630 + check_value('PROJECTHELPCONTEXT_Id', 0x0007, PROJECTHELPCONTEXT_Id)
  631 + PROJECTHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0]
  632 + check_value('PROJECTHELPCONTEXT_Size', 0x0004, PROJECTHELPCONTEXT_Size)
  633 + PROJECTHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0]
  634 +
  635 + # PROJECTLIBFLAGS Record
  636 + PROJECTLIBFLAGS_Id = struct.unpack("<H", dir_stream.read(2))[0]
  637 + check_value('PROJECTLIBFLAGS_Id', 0x0008, PROJECTLIBFLAGS_Id)
  638 + PROJECTLIBFLAGS_Size = struct.unpack("<L", dir_stream.read(4))[0]
  639 + check_value('PROJECTLIBFLAGS_Size', 0x0004, PROJECTLIBFLAGS_Size)
  640 + PROJECTLIBFLAGS_ProjectLibFlags = struct.unpack("<L", dir_stream.read(4))[0]
  641 + check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, PROJECTLIBFLAGS_ProjectLibFlags)
  642 +
  643 + # PROJECTVERSION Record
  644 + PROJECTVERSION_Id = struct.unpack("<H", dir_stream.read(2))[0]
  645 + check_value('PROJECTVERSION_Id', 0x0009, PROJECTVERSION_Id)
  646 + PROJECTVERSION_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
  647 + check_value('PROJECTVERSION_Reserved', 0x0004, PROJECTVERSION_Reserved)
  648 + PROJECTVERSION_VersionMajor = struct.unpack("<L", dir_stream.read(4))[0]
  649 + PROJECTVERSION_VersionMinor = struct.unpack("<H", dir_stream.read(2))[0]
  650 +
  651 + # PROJECTCONSTANTS Record
  652 + PROJECTCONSTANTS_Id = struct.unpack("<H", dir_stream.read(2))[0]
  653 + check_value('PROJECTCONSTANTS_Id', 0x000C, PROJECTCONSTANTS_Id)
  654 + PROJECTCONSTANTS_SizeOfConstants = struct.unpack("<L", dir_stream.read(4))[0]
  655 + if PROJECTCONSTANTS_SizeOfConstants > 1015:
  656 + logging.error("PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(PROJECTCONSTANTS_SizeOfConstants))
  657 + PROJECTCONSTANTS_Constants = dir_stream.read(PROJECTCONSTANTS_SizeOfConstants)
  658 + PROJECTCONSTANTS_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  659 + check_value('PROJECTCONSTANTS_Reserved', 0x003C, PROJECTCONSTANTS_Reserved)
  660 + PROJECTCONSTANTS_SizeOfConstantsUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  661 + if PROJECTCONSTANTS_SizeOfConstantsUnicode % 2 != 0:
  662 + logging.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even")
  663 + PROJECTCONSTANTS_ConstantsUnicode = dir_stream.read(PROJECTCONSTANTS_SizeOfConstantsUnicode)
  664 +
  665 + # array of REFERENCE records
  666 + check = None
  667 + while True:
  668 + check = struct.unpack("<H", dir_stream.read(2))[0]
  669 + logging.debug("reference type = {0:04X}".format(check))
  670 + if check == 0x000F:
  671 + break
  672 +
  673 + if check == 0x0016:
  674 + # REFERENCENAME
  675 + REFERENCE_Id = check
  676 + REFERENCE_SizeOfName = struct.unpack("<L", dir_stream.read(4))[0]
  677 + REFERENCE_Name = dir_stream.read(REFERENCE_SizeOfName)
  678 + REFERENCE_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  679 + check_value('REFERENCE_Reserved', 0x003E, REFERENCE_Reserved)
  680 + REFERENCE_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  681 + REFERENCE_NameUnicode = dir_stream.read(REFERENCE_SizeOfNameUnicode)
  682 + continue
  683 +
  684 + if check == 0x0033:
  685 + # REFERENCEORIGINAL (followed by REFERENCECONTROL)
  686 + REFERENCEORIGINAL_Id = check
  687 + REFERENCEORIGINAL_SizeOfLibidOriginal = struct.unpack("<L", dir_stream.read(4))[0]
  688 + REFERENCEORIGINAL_LibidOriginal = dir_stream.read(REFERENCEORIGINAL_SizeOfLibidOriginal)
  689 + continue
  690 +
  691 + if check == 0x002F:
  692 + # REFERENCECONTROL
  693 + REFERENCECONTROL_Id = check
  694 + REFERENCECONTROL_SizeTwiddled = struct.unpack("<L", dir_stream.read(4))[0] # ignore
  695 + REFERENCECONTROL_SizeOfLibidTwiddled = struct.unpack("<L", dir_stream.read(4))[0]
  696 + REFERENCECONTROL_LibidTwiddled = dir_stream.read(REFERENCECONTROL_SizeOfLibidTwiddled)
  697 + REFERENCECONTROL_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0] # ignore
  698 + check_value('REFERENCECONTROL_Reserved1', 0x0000, REFERENCECONTROL_Reserved1)
  699 + REFERENCECONTROL_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0] # ignore
  700 + check_value('REFERENCECONTROL_Reserved2', 0x0000, REFERENCECONTROL_Reserved2)
  701 + # optional field
  702 + check2 = struct.unpack("<H", dir_stream.read(2))[0]
  703 + if check2 == 0x0016:
  704 + REFERENCECONTROL_NameRecordExtended_Id = check
  705 + REFERENCECONTROL_NameRecordExtended_SizeofName = struct.unpack("<L", dir_stream.read(4))[0]
  706 + REFERENCECONTROL_NameRecordExtended_Name = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeofName)
  707 + REFERENCECONTROL_NameRecordExtended_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  708 + check_value('REFERENCECONTROL_NameRecordExtended_Reserved', 0x003E, REFERENCECONTROL_NameRecordExtended_Reserved)
  709 + REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  710 + REFERENCECONTROL_NameRecordExtended_NameUnicode = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode)
  711 + REFERENCECONTROL_Reserved3 = struct.unpack("<H", dir_stream.read(2))[0]
  712 + else:
  713 + REFERENCECONTROL_Reserved3 = check2
  714 +
  715 + check_value('REFERENCECONTROL_Reserved3', 0x0030, REFERENCECONTROL_Reserved3)
  716 + REFERENCECONTROL_SizeExtended = struct.unpack("<L", dir_stream.read(4))[0]
  717 + REFERENCECONTROL_SizeOfLibidExtended = struct.unpack("<L", dir_stream.read(4))[0]
  718 + REFERENCECONTROL_LibidExtended = dir_stream.read(REFERENCECONTROL_SizeOfLibidExtended)
  719 + REFERENCECONTROL_Reserved4 = struct.unpack("<L", dir_stream.read(4))[0]
  720 + REFERENCECONTROL_Reserved5 = struct.unpack("<H", dir_stream.read(2))[0]
  721 + REFERENCECONTROL_OriginalTypeLib = dir_stream.read(16)
  722 + REFERENCECONTROL_Cookie = struct.unpack("<L", dir_stream.read(4))[0]
  723 + continue
  724 +
  725 + if check == 0x000D:
  726 + # REFERENCEREGISTERED
  727 + REFERENCEREGISTERED_Id = check
  728 + REFERENCEREGISTERED_Size = struct.unpack("<L", dir_stream.read(4))[0]
  729 + REFERENCEREGISTERED_SizeOfLibid = struct.unpack("<L", dir_stream.read(4))[0]
  730 + REFERENCEREGISTERED_Libid = dir_stream.read(REFERENCEREGISTERED_SizeOfLibid)
  731 + REFERENCEREGISTERED_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0]
  732 + check_value('REFERENCEREGISTERED_Reserved1', 0x0000, REFERENCEREGISTERED_Reserved1)
  733 + REFERENCEREGISTERED_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0]
  734 + check_value('REFERENCEREGISTERED_Reserved2', 0x0000, REFERENCEREGISTERED_Reserved2)
  735 + continue
  736 +
  737 + if check == 0x000E:
  738 + # REFERENCEPROJECT
  739 + REFERENCEPROJECT_Id = check
  740 + REFERENCEPROJECT_Size = struct.unpack("<L", dir_stream.read(4))[0]
  741 + REFERENCEPROJECT_SizeOfLibidAbsolute = struct.unpack("<L", dir_stream.read(4))[0]
  742 + REFERENCEPROJECT_LibidAbsolute = dir_stream.read(REFERENCEPROJECT_SizeOfLibidAbsolute)
  743 + REFERENCEPROJECT_SizeOfLibidRelative = struct.unpack("<L", dir_stream.read(4))[0]
  744 + REFERENCEPROJECT_LibidRelative = dir_stream.read(REFERENCEPROJECT_SizeOfLibidRelative)
  745 + REFERENCEPROJECT_MajorVersion = struct.unpack("<L", dir_stream.read(4))[0]
  746 + REFERENCEPROJECT_MinorVersion = struct.unpack("<H", dir_stream.read(2))[0]
  747 + continue
  748 +
  749 + logging.error('invalid or unknown check Id {0:04X}'.format(check))
  750 + sys.exit(0)
  751 +
  752 + PROJECTMODULES_Id = check #struct.unpack("<H", dir_stream.read(2))[0]
  753 + check_value('PROJECTMODULES_Id', 0x000F, PROJECTMODULES_Id)
  754 + PROJECTMODULES_Size = struct.unpack("<L", dir_stream.read(4))[0]
  755 + check_value('PROJECTMODULES_Size', 0x0002, PROJECTMODULES_Size)
  756 + PROJECTMODULES_Count = struct.unpack("<H", dir_stream.read(2))[0]
  757 + PROJECTMODULES_ProjectCookieRecord_Id = struct.unpack("<H", dir_stream.read(2))[0]
  758 + check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, PROJECTMODULES_ProjectCookieRecord_Id)
  759 + PROJECTMODULES_ProjectCookieRecord_Size = struct.unpack("<L", dir_stream.read(4))[0]
  760 + check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, PROJECTMODULES_ProjectCookieRecord_Size)
  761 + PROJECTMODULES_ProjectCookieRecord_Cookie = struct.unpack("<H", dir_stream.read(2))[0]
  762 +
  763 + logging.debug("parsing {0} modules".format(PROJECTMODULES_Count))
  764 + for x in xrange(0, PROJECTMODULES_Count):
  765 + MODULENAME_Id = struct.unpack("<H", dir_stream.read(2))[0]
  766 + check_value('MODULENAME_Id', 0x0019, MODULENAME_Id)
  767 + MODULENAME_SizeOfModuleName = struct.unpack("<L", dir_stream.read(4))[0]
  768 + MODULENAME_ModuleName = dir_stream.read(MODULENAME_SizeOfModuleName)
  769 + # account for optional sections
  770 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  771 + if section_id == 0x0047:
  772 + MODULENAMEUNICODE_Id = section_id
  773 + MODULENAMEUNICODE_SizeOfModuleNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  774 + MODULENAMEUNICODE_ModuleNameUnicode = dir_stream.read(MODULENAMEUNICODE_SizeOfModuleNameUnicode)
  775 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  776 + if section_id == 0x001A:
  777 + MODULESTREAMNAME_id = section_id
  778 + MODULESTREAMNAME_SizeOfStreamName = struct.unpack("<L", dir_stream.read(4))[0]
  779 + MODULESTREAMNAME_StreamName = dir_stream.read(MODULESTREAMNAME_SizeOfStreamName)
  780 + MODULESTREAMNAME_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  781 + check_value('MODULESTREAMNAME_Reserved', 0x0032, MODULESTREAMNAME_Reserved)
  782 + MODULESTREAMNAME_SizeOfStreamNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  783 + MODULESTREAMNAME_StreamNameUnicode = dir_stream.read(MODULESTREAMNAME_SizeOfStreamNameUnicode)
  784 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  785 + if section_id == 0x001C:
  786 + MODULEDOCSTRING_Id = section_id
  787 + check_value('MODULEDOCSTRING_Id', 0x001C, MODULEDOCSTRING_Id)
  788 + MODULEDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0]
  789 + MODULEDOCSTRING_DocString = dir_stream.read(MODULEDOCSTRING_SizeOfDocString)
  790 + MODULEDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  791 + check_value('MODULEDOCSTRING_Reserved', 0x0048, MODULEDOCSTRING_Reserved)
  792 + MODULEDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  793 + MODULEDOCSTRING_DocStringUnicode = dir_stream.read(MODULEDOCSTRING_SizeOfDocStringUnicode)
  794 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  795 + if section_id == 0x0031:
  796 + MODULEOFFSET_Id = section_id
  797 + check_value('MODULEOFFSET_Id', 0x0031, MODULEOFFSET_Id)
  798 + MODULEOFFSET_Size = struct.unpack("<L", dir_stream.read(4))[0]
  799 + check_value('MODULEOFFSET_Size', 0x0004, MODULEOFFSET_Size)
  800 + MODULEOFFSET_TextOffset = struct.unpack("<L", dir_stream.read(4))[0]
  801 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  802 + if section_id == 0x001E:
  803 + MODULEHELPCONTEXT_Id = section_id
  804 + check_value('MODULEHELPCONTEXT_Id', 0x001E, MODULEHELPCONTEXT_Id)
  805 + MODULEHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0]
  806 + check_value('MODULEHELPCONTEXT_Size', 0x0004, MODULEHELPCONTEXT_Size)
  807 + MODULEHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0]
  808 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  809 + if section_id == 0x002C:
  810 + MODULECOOKIE_Id = section_id
  811 + check_value('MODULECOOKIE_Id', 0x002C, MODULECOOKIE_Id)
  812 + MODULECOOKIE_Size = struct.unpack("<L", dir_stream.read(4))[0]
  813 + check_value('MODULECOOKIE_Size', 0x0002, MODULECOOKIE_Size)
  814 + MODULECOOKIE_Cookie = struct.unpack("<H", dir_stream.read(2))[0]
  815 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  816 + if section_id == 0x0021 or section_id == 0x0022:
  817 + MODULETYPE_Id = section_id
  818 + MODULETYPE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
  819 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  820 + if section_id == 0x0025:
  821 + MODULEREADONLY_Id = section_id
  822 + check_value('MODULEREADONLY_Id', 0x0025, MODULEREADONLY_Id)
  823 + MODULEREADONLY_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
  824 + check_value('MODULEREADONLY_Reserved', 0x0000, MODULEREADONLY_Reserved)
  825 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  826 + if section_id == 0x0028:
  827 + MODULEPRIVATE_Id = section_id
  828 + check_value('MODULEPRIVATE_Id', 0x0028, MODULEPRIVATE_Id)
  829 + MODULEPRIVATE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
  830 + check_value('MODULEPRIVATE_Reserved', 0x0000, MODULEPRIVATE_Reserved)
  831 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  832 + if section_id == 0x002B: # TERMINATOR
  833 + MODULE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
  834 + check_value('MODULE_Reserved', 0x0000, MODULE_Reserved)
  835 + section_id = None
  836 + if section_id != None:
  837 + logging.warning('unknown or invalid module section id {0:04X}'.format(section_id))
  838 +
  839 + logging.debug('Project CodePage = %d' % PROJECTCODEPAGE_CodePage)
  840 + vba_codec = 'cp%d' % PROJECTCODEPAGE_CodePage
  841 + logging.debug("ModuleName = {0}".format(MODULENAME_ModuleName))
  842 + logging.debug("StreamName = {0}".format(repr(MODULESTREAMNAME_StreamName)))
  843 + streamname_unicode = MODULESTREAMNAME_StreamName.decode(vba_codec)
  844 + logging.debug("StreamName.decode('%s') = %s" % (vba_codec, repr(streamname_unicode)))
  845 + logging.debug("StreamNameUnicode = {0}".format(repr(MODULESTREAMNAME_StreamNameUnicode)))
  846 + logging.debug("TextOffset = {0}".format(MODULEOFFSET_TextOffset))
  847 +
  848 + code_path = vba_root + u'VBA/' + streamname_unicode
  849 + #TODO: test if stream exists
  850 + logging.debug('opening VBA code stream %s' % repr(code_path))
  851 + code_data = ole.openstream(code_path).read()
  852 + logging.debug("length of code_data = {0}".format(len(code_data)))
  853 + logging.debug("offset of code_data = {0}".format(MODULEOFFSET_TextOffset))
  854 + code_data = code_data[MODULEOFFSET_TextOffset:]
  855 + if len(code_data) > 0:
  856 + code_data = decompress_stream(code_data)
  857 + # case-insensitive search in the code_modules dict to find the file extension:
  858 + filext = code_modules.get(MODULENAME_ModuleName.lower(), 'bin')
  859 + filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext)
  860 + #TODO: also yield the codepage so that callers can decode it properly
  861 + yield (code_path, filename, code_data)
  862 + # print '-'*79
  863 + # print filename
  864 + # print ''
  865 + # print code_data
  866 + # print ''
  867 + logging.debug('extracted file {0}'.format(filename))
  868 + else:
  869 + logging.warning("module stream {0} has code data length 0".format(MODULESTREAMNAME_StreamName))
  870 + return
  871 +
  872 +
def filter_vba(vba_code):
    """
    Strip the leading "Attribute VB_" header lines from VBA source code.

    Only the uninterrupted run of attribute lines at the very top of the code
    is dropped; the scan stops at the first line that is not such a header.
    A line containing a colon is never treated as a header, because a colon
    allows extra statements to be appended on the same line.

    The remaining lines are re-joined with '\\n', whatever line endings the
    input used.

    :param vba_code: str, VBA source code
    :return: str, VBA source code without its leading attribute lines
    """
    lines = vba_code.splitlines()
    # index of the first line to keep; defaults to "nothing kept" when every
    # line is an attribute header (or the input is empty)
    first_kept = len(lines)
    for index, text in enumerate(lines):
        is_header = text.startswith("Attribute VB_") and ':' not in text
        if not is_header:
            first_kept = index
            break
    return '\n'.join(lines[first_kept:])
  895 +
  896 +
def detect_autoexec(vba_code, obfuscation=None):
    """
    Look for keywords of macros that run automatically (entries of
    AUTOEXEC_KEYWORDS) in the provided VBA code.

    Each keyword is searched case-insensitively as a whole word. The keyword
    is inserted as-is into the regular expression, so it may itself contain
    regex syntax.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of the obfuscation, appended to
        each description when provided
    :return: list of (keyword, description) str tuples, one per keyword found
    """
    suffix = ''
    if obfuscation:
        suffix = ' (obfuscation: %s)' % obfuscation
    matches = []
    for description, keywords in AUTOEXEC_KEYWORDS.items():
        for keyword in keywords:
            # \b on both sides: only match the keyword as a whole word
            pattern = r'(?i)\b' + keyword + r'\b'
            if re.search(pattern, vba_code) is None:
                continue
            matches.append((keyword, description + suffix))
    return matches
  922 +
  923 +
def detect_suspicious(vba_code, obfuscation=None):
    """
    Look for suspicious keywords (entries of SUSPICIOUS_KEYWORDS) in the
    provided VBA code.

    Each keyword is searched case-insensitively as a whole word. The keyword
    is inserted as-is into the regular expression.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of the obfuscation, appended to
        each description when provided
    :return: list of (keyword, description) str tuples, one per keyword found
    """
    if obfuscation:
        note = ' (obfuscation: %s)' % obfuscation
    else:
        note = ''
    return [
        (keyword, description + note)
        for description, keywords in SUSPICIOUS_KEYWORDS.items()
        for keyword in keywords
        # \b on both sides: only match the keyword as a whole word
        if re.search(r'(?i)\b' + keyword + r'\b', vba_code)
    ]
  946 +
  947 +
def detect_patterns(vba_code, obfuscation=None):
    """
    Search the VBA code with every compiled regex of RE_PATTERNS and report
    each distinct matched value once.

    A value is reported only the first time it is seen, under the pattern
    type that matched it first, even if a later pattern matches it as well.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of the obfuscation, appended to
        the pattern type when provided
    :return: list of (pattern type, value) str tuples
    """
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    seen = set()
    hits = []
    for pattern_type, pattern_re in RE_PATTERNS:
        for match in pattern_re.finditer(vba_code):
            value = match.group()
            if value in seen:
                # already reported, possibly under another pattern type
                continue
            hits.append((pattern_type + suffix, value))
            seen.add(value)
    return hits
  968 +
  969 +
def detect_hex_strings(vba_code):
    """
    Find the strings matched by re_hex_string in the VBA code and decode
    them from hexadecimal.

    Each distinct encoded string is reported once, in order of first
    appearance.

    :param vba_code: str, VBA source code
    :return: list of (encoded string, decoded string) tuples
    """
    seen = set()
    pairs = []
    for match in re_hex_string.finditer(vba_code):
        hex_text = match.group()
        if hex_text in seen:
            continue
        pairs.append((hex_text, binascii.unhexlify(hex_text)))
        seen.add(hex_text)
    return pairs
  986 +
  987 +
def detect_base64_strings(vba_code):
    """
    Detect if the VBA code contains strings encoded in base64.

    Candidates are the matches of re_base64_string, with surrounding double
    quotes removed. A candidate is ignored when re_nothex_check does not match
    it (it is then just a hex string), when its lowercase form is listed in
    BASE64_WHITELIST, or when it cannot be decoded as base64.
    Each distinct encoded string is reported once.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    #TODO: avoid matching simple hex strings as base64?
    results = []
    found = set()
    for match in re_base64_string.finditer(vba_code):
        # extract the base64 string without quotes:
        value = match.group().strip('"')
        # check it is not just a hex string:
        if not re_nothex_check.search(value):
            continue
        # only keep new values and not in the whitelist:
        if value in found or value.lower() in BASE64_WHITELIST:
            continue
        try:
            decoded = base64.b64decode(value)
        except (TypeError, ValueError):
            # Decoding failed, so it is likely not a base64-encoded string.
            # Only decoding errors are caught (TypeError on Python 2,
            # binascii.Error - a ValueError - on Python 3), so that
            # KeyboardInterrupt and SystemExit are not swallowed.
            continue
        results.append((value, decoded))
        found.add(value)
    return results
  1014 +
  1015 +
def detect_dridex_strings(vba_code):
    """
    Detect if the VBA code contains strings obfuscated with a specific
    algorithm found in Dridex samples.

    Candidates are the matches of re_dridex_string, with their first and last
    characters removed. A candidate is ignored when re_nothex_check does not
    match it (it is then just a hex string), or when DridexUrlDecode fails on
    it. Each distinct encoded string is reported once.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    # imported here so that the third-party decoder is only loaded when needed
    from thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode
    results = []
    found = set()
    for match in re_dridex_string.finditer(vba_code):
        # strip the first and last characters of the match:
        value = match.group()[1:-1]
        # check it is not just a hex string:
        if not re_nothex_check.search(value):
            continue
        if value in found:
            continue
        try:
            decoded = DridexUrlDecode(value)
        except Exception:
            # The decoder failed, so it is likely not a dridex-encoded string.
            # "except Exception" rather than a bare except, so that
            # KeyboardInterrupt and SystemExit still propagate.
            continue
        results.append((value, decoded))
        found.add(value)
    return results
  1040 +
  1041 +
class VBA_Scanner(object):
    """
    Class to scan the source code of a VBA module to find obfuscated strings,
    suspicious keywords, IOCs, auto-executable macros, etc.
    """

    def __init__(self, vba_code):
        """
        VBA_Scanner constructor

        :param vba_code: str, VBA source code to be analyzed
        """
        self.code = vba_code
        # Decoded strings found in the code, one per line. These buffers are
        # rebuilt by each call to scan():
        self.code_hex = ''
        self.code_hex_rev = ''
        self.code_rev_hex = ''
        self.code_base64 = ''
        self.code_dridex = ''

    def scan(self, include_decoded_strings=False):
        """
        Analyze the provided VBA code to detect suspicious keywords,
        auto-executable macros, IOC patterns, obfuscation patterns
        such as hex-encoded strings.

        The detailed findings are also stored on the instance: hex_strings,
        base64_strings, dridex_strings, strReverse, autoexec_keywords,
        suspicious_keywords and iocs. All of them, including the decoded-text
        buffers, are recomputed from scratch on every call, so calling scan()
        several times does not accumulate data.

        :param include_decoded_strings: bool, if True, all encoded strings will be included with their decoded content.
        :return: list of tuples (type, keyword, description)
        (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex string')
        """
        # First, detect and extract hex-encoded strings:
        self.hex_strings = detect_hex_strings(self.code)
        # detect if the code contains StrReverse:
        self.strReverse = 'strreverse' in self.code.lower()
        # Then gather the decoded strings (one per line, each preceded by a
        # newline), to detect obfuscated IOCs and keywords in them:
        self.code_hex = ''.join(
            '\n' + decoded for encoded, decoded in self.hex_strings)
        self.code_hex_rev = ''
        self.code_rev_hex = ''
        # if the code contains "StrReverse", also gather the hex strings in reverse order:
        if self.strReverse:
            # StrReverse after hex decoding:
            self.code_hex_rev = ''.join(
                '\n' + decoded[::-1] for encoded, decoded in self.hex_strings)
            # StrReverse before hex decoding:
            self.code_rev_hex = ''.join(
                '\n' + binascii.unhexlify(encoded[::-1])
                for encoded, decoded in self.hex_strings)
            #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/
        #TODO: also append the full code reversed if StrReverse? (risk of false positives?)
        # Detect Base64-encoded strings
        self.base64_strings = detect_base64_strings(self.code)
        self.code_base64 = ''.join(
            '\n' + decoded for encoded, decoded in self.base64_strings)
        # Detect Dridex-encoded strings
        self.dridex_strings = detect_dridex_strings(self.code)
        self.code_dridex = ''.join(
            '\n' + decoded for encoded, decoded in self.dridex_strings)

        results = []
        self.autoexec_keywords = []
        self.suspicious_keywords = []
        self.iocs = []

        # Run every detector on the original code and on each decoded buffer:
        for code, obfuscation in (
                (self.code, None),
                (self.code_hex, 'Hex'),
                (self.code_hex_rev, 'Hex+StrReverse'),
                (self.code_rev_hex, 'StrReverse+Hex'),
                (self.code_base64, 'Base64'),
                (self.code_dridex, 'Dridex'),
        ):
            self.autoexec_keywords += detect_autoexec(code, obfuscation)
            self.suspicious_keywords += detect_suspicious(code, obfuscation)
            self.iocs += detect_patterns(code, obfuscation)

        # If encoded strings were discovered, add an item to suspicious keywords:
        if self.hex_strings:
            self.suspicious_keywords.append(('Hex Strings',
                'Hex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.base64_strings:
            self.suspicious_keywords.append(('Base64 Strings',
                'Base64-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.dridex_strings:
            self.suspicious_keywords.append(('Dridex Strings',
                'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        for keyword, description in self.autoexec_keywords:
            results.append(('AutoExec', keyword, description))
        for keyword, description in self.suspicious_keywords:
            results.append(('Suspicious', keyword, description))
        for pattern_type, value in self.iocs:
            results.append(('IOC', value, pattern_type))
        if include_decoded_strings:
            for encoded, decoded in self.hex_strings:
                results.append(('Hex String', repr(decoded), encoded))
            for encoded, decoded in self.base64_strings:
                results.append(('Base64 String', repr(decoded), encoded))
            for encoded, decoded in self.dridex_strings:
                results.append(('Dridex string', repr(decoded), encoded))
        return results

    def scan_summary(self):
        """
        Analyze the provided VBA code to detect suspicious keywords,
        auto-executable macros, IOC patterns, obfuscation patterns
        such as hex-encoded strings.

        This runs a full scan() and only reports the counts.

        :return: tuple with the number of items found for each category:
            (autoexec, suspicious, IOCs, hex, base64, dridex)
        """
        self.scan()
        return (len(self.autoexec_keywords), len(self.suspicious_keywords),
                len(self.iocs), len(self.hex_strings), len(self.base64_strings),
                len(self.dridex_strings))
  1151 +
  1152 +
  1153 +
def scan_vba(vba_code, include_decoded_strings):
    """
    Convenience wrapper: build a VBA_Scanner for the provided VBA code, run
    its scan() method once and return the results.

    :param vba_code: str, VBA source code to be analyzed
    :param include_decoded_strings: bool, passed to VBA_Scanner.scan(); if True
        all encoded strings will be included with their decoded content.
    :return: list of tuples (type, keyword, description), as returned by
        VBA_Scanner.scan()
    """
    scanner = VBA_Scanner(vba_code)
    return scanner.scan(include_decoded_strings)
  1167 +
  1168 +
  1169 +#=== CLASSES =================================================================
  1170 +
class VBA_Parser(object):
    """
    Class to parse MS Office files, to detect VBA macros and extract VBA source code
    Supported file formats:
    - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
    - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
    - PowerPoint 2007+ (.pptm, .ppsm)
    - Word 2003 XML (.xml)
    """

    def __init__(self, filename, data=None):
        """
        Constructor for VBA_Parser

        The container type is detected from the content: OLE file, zip archive
        (OpenXML) or Word 2003 XML. For the last two, each embedded OLE
        container is parsed by a child VBA_Parser stored in self.ole_subfiles.

        :param filename: filename or path of file to parse, or file-like object

        :param data: None or bytes str, if None the file will be read from disk (or from the file-like object).
            If data is provided as a bytes string, it will be parsed as the content of the file in memory,
            and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb').

        :raise TypeError: if the file is not an OLE file, a zip archive nor a
            Word 2003 XML file.
        """
        #TODO: filename should only be a string, data should be used for the file-like object
        #TODO: filename should be mandatory, optional data is a string or file-like object
        #TODO: also support olefile and zipfile as input
        if data is None:
            # open file from disk:
            _file = filename
        else:
            # file already read in memory, make it a file-like object for zipfile:
            _file = cStringIO.StringIO(data)
        self.ole_file = None
        self.ole_subfiles = []
        self.filename = filename
        self.type = None
        self.vba_projects = None
        if olefile.isOleFile(_file):
            # This looks like an OLE file
            logging.info('Parsing OLE file %s' % self.filename)
            # Open and parse the OLE file, using unicode for path names:
            self.ole_file = olefile.OleFileIO(_file, path_encoding=None)
            self.type = TYPE_OLE
            #TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet
        elif zipfile.is_zipfile(_file):
            logging.info('Opening ZIP/OpenXML file %s' % self.filename)
            self.type = TYPE_OpenXML
            self._parse_zip(_file)
        else:
            # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML,
            # or a plain text file containing VBA code
            if data is None:
                # "with" guarantees the file handle is released
                with open(filename, 'rb') as f:
                    data = f.read()
            # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
            if 'http://schemas.microsoft.com/office/word/2003/wordml' in data:
                logging.info('Opening Word 2003 XML file %s' % self.filename)
                self.type = TYPE_Word2003_XML
                self._parse_word2003_xml(data)
            #TODO: Excel 2003 XML
            #TODO: plain text VBA file
            else:
                msg = '%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % self.filename
                logging.error(msg)
                raise TypeError(msg)

    def _add_ole_subfile(self, name, ole_data):
        """
        Parse ole_data with a child VBA_Parser and store it in ole_subfiles.
        Data that cannot be parsed is logged and ignored.

        :param name: name of the embedded file, used as filename of the child
        :param ole_data: bytes str, content of the embedded file
        """
        try:
            self.ole_subfiles.append(VBA_Parser(filename=name, data=ole_data))
        except Exception:
            # not a bare except: KeyboardInterrupt/SystemExit must propagate
            logging.debug('%s is not a valid OLE file' % name)

    def _parse_zip(self, _file):
        """
        Look for OLE files within a zip archive and parse each of them.

        :param _file: path or file-like object of the zip archive
        """
        # This looks like a zip file, need to look for vbaProject.bin inside
        # It can be any OLE file inside the archive
        #...because vbaProject.bin can be renamed:
        # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
        z = zipfile.ZipFile(_file)
        #TODO: check if this is actually an OpenXML file
        #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically?
        try:
            # check each file within the zip if it is an OLE file, by reading its magic:
            for subfile in z.namelist():
                magic = z.open(subfile).read(len(olefile.MAGIC))
                if magic == olefile.MAGIC:
                    logging.debug('Opening OLE file %s within zip' % subfile)
                    ole_data = z.open(subfile).read()
                    self._add_ole_subfile(subfile, ole_data)
        finally:
            # close the archive even if reading one of its members failed
            z.close()

    def _parse_word2003_xml(self, data):
        """
        Extract and parse the OLE containers stored in the binData elements of
        a Word 2003 XML document. A binData element that cannot be decoded is
        logged and skipped, so that the other elements are still analyzed.

        :param data: bytes str, content of the XML file
        """
        # parse the XML content
        et = ET.fromstring(data)
        # find all the binData elements:
        for bindata in et.getiterator(TAG_BINDATA):
            # the binData content is an OLE container for the VBA project, compressed
            # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
            # get the filename:
            fname = bindata.get(ATTR_NAME, 'noname.mso')
            try:
                # decode the base64 activemime
                activemime = binascii.a2b_base64(bindata.text)
                # decompress the zlib data starting at offset 0x32, which is the OLE container:
                ole_data = zlib.decompress(activemime[0x32:])
            except (TypeError, binascii.Error, zlib.error):
                # empty element, invalid base64 or invalid zlib data
                logging.warning('cannot decode binData element %s, skipping it' % fname)
                continue
            self._add_ole_subfile(fname, ole_data)

    def _check_vba_stream(self, vba_root, stream_path):
        """
        Check that a stream exists below vba_root in the OLE file.

        :param vba_root: path of the candidate VBA root storage ('' or with a trailing slash)
        :param stream_path: path of the stream, relative to vba_root
        :return: full path of the stream if it exists and is a stream, False otherwise
        """
        ole = self.ole_file
        full_path = vba_root + stream_path
        if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM:
            logging.debug('Found %s stream: %s' % (stream_path, full_path))
            return full_path
        logging.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path)
        return False

    def find_vba_projects(self):
        """
        Finds all the VBA projects stored in an OLE file.

        Return None if the file is not OLE but OpenXML.
        Return a list of tuples (vba_root, project_path, dir_path) for each VBA project.
        vba_root is the path of the root OLE storage containing the VBA project,
        including a trailing slash unless it is the root of the OLE file.
        project_path is the path of the OLE stream named "PROJECT" within the VBA project.
        dir_path is the path of the OLE stream named "VBA/dir" within the VBA project.

        If this function returns an empty list for one of the supported formats
        (i.e. Word, Excel, Powerpoint except Powerpoint 97-2003), then the
        file does not contain VBA macros.

        The result is cached in self.vba_projects after the first call.

        :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path)
        for each VBA project found if OLE file
        """
        # if the file is not OLE but OpenXML, return None:
        if self.ole_file is None:
            return None

        # if this method has already been called, return previous result:
        if self.vba_projects is not None:
            return self.vba_projects

        # Find the VBA project root (different in MS Word, Excel, etc):
        # - Word 97-2003: Macros
        # - Excel 97-2003: _VBA_PROJECT_CUR
        # - PowerPoint 97-2003: not supported yet (different file structure)
        # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin.
        # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word
        # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word
        # - Visio 2007: not supported yet (different file structure)

        # According to MS-OVBA section 2.2.1:
        # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream
        # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream
        # - all names are case-insensitive

        # start with an empty list:
        self.vba_projects = []
        # Look for any storage containing those storage/streams:
        for storage in self.ole_file.listdir(streams=False, storages=True):
            # Look for a storage ending with "VBA":
            if storage[-1].upper() != 'VBA':
                continue
            logging.debug('Found VBA storage: %s' % ('/'.join(storage)))
            vba_root = '/'.join(storage[:-1])
            # Add a trailing slash to vba_root, unless it is the root of the OLE file:
            # (used later to append all the child streams/storages)
            if vba_root != '':
                vba_root += '/'
            logging.debug('Checking vba_root="%s"' % vba_root)

            # Check if the VBA root storage also contains a PROJECT stream:
            project_path = self._check_vba_stream(vba_root, 'PROJECT')
            if not project_path:
                continue
            # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream:
            vba_project_path = self._check_vba_stream(vba_root, 'VBA/_VBA_PROJECT')
            if not vba_project_path:
                continue
            # Check if the VBA root storage also contains a VBA/dir stream:
            dir_path = self._check_vba_stream(vba_root, 'VBA/dir')
            if not dir_path:
                continue
            # Now we are pretty sure it is a VBA project structure
            logging.debug('VBA root storage: "%s"' % vba_root)
            # append the results to the list as a tuple for later use:
            self.vba_projects.append((vba_root, project_path, dir_path))
        return self.vba_projects

    def detect_vba_macros(self):
        """
        Detect the potential presence of VBA macros in the file, by checking
        if it contains VBA projects. Both OLE and OpenXML files are supported.

        Important: for now, results are accurate only for Word, Excel and PowerPoint
        EXCEPT Powerpoint 97-2003, which has a different structure for VBA.

        Note: this method does NOT attempt to check the actual presence or validity
        of VBA macro source code, so there might be false positives.
        It may also detect VBA macros in files embedded within the main file,
        for example an Excel workbook with macros embedded into a Word
        document without macros may be detected, without distinction.

        :return: bool, True if at least one VBA project has been found, False otherwise
        """
        #TODO: return None or raise exception if format not supported like PPT 97-2003
        #TODO: return the number of VBA projects found instead of True/False?
        # if not an OLE file, check all the OLE subfiles:
        if self.ole_file is None:
            for ole_subfile in self.ole_subfiles:
                if ole_subfile.detect_vba_macros():
                    return True
            return False
        # otherwise it's an OLE file, find VBA projects:
        return len(self.find_vba_projects()) > 0

    def extract_macros(self):
        """
        Extract and decompress source code for each VBA macro found in the file

        Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found
        If the file is OLE, filename is the path of the file.
        If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros
        within the zip archive, e.g. word/vbaProject.bin.
        """
        if self.ole_file is None:
            for ole_subfile in self.ole_subfiles:
                for results in ole_subfile.extract_macros():
                    yield results
        else:
            self.find_vba_projects()
            for vba_root, project_path, dir_path in self.vba_projects:
                # extract all VBA macros from that VBA root storage:
                for stream_path, vba_filename, vba_code in _extract_vba(self.ole_file, vba_root, project_path, dir_path):
                    yield (self.filename, stream_path, vba_filename, vba_code)

    def close(self):
        """
        Close all the open files. This method must be called after usage, if
        the application is opening many files.
        """
        if self.ole_file is None:
            for ole_subfile in self.ole_subfiles:
                ole_subfile.close()
        else:
            self.ole_file.close()
  1418 +
  1419 +
def print_analysis(vba_code, show_decoded_strings=False):
    """
    Scan the provided VBA code with scan_vba and print the findings as a
    three-column table (Type, Keyword, Description), or a short message when
    nothing was found.

    :param vba_code: str, VBA source code to be analyzed
    :param show_decoded_strings: bool, passed to scan_vba; if True encoded
        strings will be displayed with their decoded content.
    :return: None
    """
    findings = scan_vba(vba_code, show_decoded_strings)
    if not findings:
        print('No suspicious keyword or IOC found.')
        return
    table = prettytable.PrettyTable(('Type', 'Keyword', 'Description'))
    table.align = 'l'
    # maximum width of each column:
    table.max_width['Type'] = 10
    table.max_width['Keyword'] = 20
    table.max_width['Description'] = 39
    for finding_type, keyword, description in findings:
        table.add_row((finding_type, keyword, description))
    print(table)
  1440 +
  1441 +
  1442 +
def process_file(container, filename, data, show_decoded_strings=False):
    """
    Process a single file: print its type, the source code of each VBA macro
    it contains, and the analysis of each macro.

    Errors raised while parsing or analyzing the file are printed with their
    stack trace, and do not stop the caller. KeyboardInterrupt and SystemExit
    are not intercepted, so that a batch run can still be interrupted.

    :param container: str, path and filename of container if the file is within
        a zip archive, None otherwise.
    :param filename: str, path and filename of file on disk, or within the container.
    :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
    :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
    """
    #TODO: replace print by writing to a provided output file (sys.stdout by default)
    if container:
        display_filename = '%s in %s' % (filename, container)
    else:
        display_filename = filename
    print('=' * 79)
    print('FILE: %s' % (display_filename,))
    vba = None
    try:
        #TODO: handle olefile errors, when an OLE file is malformed
        vba = VBA_Parser(filename, data)
        print('Type: %s' % (vba.type,))
        if vba.detect_vba_macros():
            for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
                # hide attribute lines:
                #TODO: option to disable attribute filtering
                vba_code_filtered = filter_vba(vba_code)
                print('-' * 79)
                print('VBA MACRO %s ' % vba_filename)
                print('in file: %s - OLE stream: %s' % (subfilename, repr(stream_path)))
                print('- ' * 39)
                # detect empty macros:
                if vba_code_filtered.strip() == '':
                    print('(empty macro)')
                else:
                    print(vba_code_filtered)
                    print('- ' * 39)
                    print('ANALYSIS:')
                    # analyse the whole code, filtered to avoid false positives:
                    print_analysis(vba_code_filtered, show_decoded_strings)
        else:
            print('No VBA macros found.')
    except Exception:
        #TODO: print more info if debug mode
        # display the exception with full stack trace for debugging, but do not stop:
        traceback.print_exc()
    finally:
        # release the files held by the parser: this function may be called
        # on many files in a row
        if vba is not None:
            vba.close()
    print('')
  1492 +
  1493 +
def process_file_triage(container, filename, data):
    """
    Process a single file and print a one-line triage summary for it.

    The printed line has the form '<flags> <filename>[ - <message>]', where
    <flags> is a file-type prefix ('OLE:', 'OpX:' or 'XML:') followed by seven
    characters, one per indicator, each being a letter when at least one hit
    was counted and '-' otherwise:
    M=macros, A=auto-executable, S=suspicious keywords, I=IOCs,
    H=hex strings, B=base64 strings, D=Dridex strings.

    If a TypeError is raised while parsing, the flags are '?' and the message
    is 'File format not supported'. Any other Exception is reported with the
    flags '!ERROR' and the exception as message. KeyboardInterrupt and
    SystemExit are not intercepted, so the user can still abort a batch.

    :param container: str, path and filename of container if the file is within
    a zip archive, None otherwise (not used by this function).
    :param filename: str, path and filename of file on disk, or within the container.
    :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
    """
    #TODO: replace print by writing to a provided output file (sys.stdout by default)
    nb_macros = 0
    nb_autoexec = 0
    nb_suspicious = 0
    nb_iocs = 0
    nb_hexstrings = 0
    nb_base64strings = 0
    nb_dridexstrings = 0
    message = ''
    try:
        #TODO: handle olefile errors, when an OLE file is malformed
        vba = VBA_Parser(filename, data)
        if vba.detect_vba_macros():
            for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
                nb_macros += 1
                if vba_code.strip() != '':
                    # analyse the whole code, filtered to avoid false positives:
                    scanner = VBA_Scanner(filter_vba(vba_code))
                    (found_autoexec, found_suspicious, found_iocs, found_hex,
                     found_base64, found_dridex) = scanner.scan_summary()
                    nb_autoexec += found_autoexec
                    nb_suspicious += found_suspicious
                    nb_iocs += found_iocs
                    nb_hexstrings += found_hex
                    nb_base64strings += found_base64
                    nb_dridexstrings += found_dridex
        # NOTE: if vba.type matches none of these constants, 'flags' stays
        # unbound and the '+=' below raises UnboundLocalError, which the
        # generic handler reports as '!ERROR'.
        if vba.type == TYPE_OLE:
            flags = 'OLE:'
        elif vba.type == TYPE_OpenXML:
            flags = 'OpX:'
        elif vba.type == TYPE_Word2003_XML:
            flags = 'XML:'
        # one character per indicator: its letter if the counter is non-zero, else '-'
        indicators = (
            ('M', nb_macros),
            ('A', nb_autoexec),
            ('S', nb_suspicious),
            ('I', nb_iocs),
            ('H', nb_hexstrings),
            ('B', nb_base64strings),
            ('D', nb_dridexstrings),
        )
        flags += ''.join([letter if nb else '-' for (letter, nb) in indicators])
    except TypeError:
        # file type not OLE nor OpenXML
        flags = '?'
        message = 'File format not supported'
    except Exception as exc:
        # another error occurred: report it on the summary line and carry on.
        # Only Exception is caught (not a bare except) so that Ctrl-C and
        # sys.exit() are not turned into an '!ERROR' row.
        #TODO: print more info if debug mode
        #TODO: distinguish real errors from incorrect file types
        flags = '!ERROR'
        message = exc
    line = '%-11s %s' % (flags, filename)
    if message:
        line += ' - %s' % message
    # single-argument print(...) prints the same on Python 2 as the print statement
    print(line)
  1569 +
  1570 + # t = prettytable.PrettyTable(('filename', 'type', 'macros', 'autoexec', 'suspicious', 'ioc', 'hexstrings'),
  1571 + # header=False, border=False)
  1572 + # t.align = 'l'
  1573 + # t.max_width['filename'] = 30
  1574 + # t.max_width['type'] = 10
  1575 + # t.max_width['macros'] = 6
  1576 + # t.max_width['autoexec'] = 6
  1577 + # t.max_width['suspicious'] = 6
  1578 + # t.max_width['ioc'] = 6
  1579 + # t.max_width['hexstrings'] = 6
  1580 + # t.add_row((filename, ftype, macros, autoexec, suspicious, iocs, hexstrings))
  1581 + # print t
  1582 +
def main_triage_quick():
    """
    Placeholder with no implementation: does nothing and returns None.
    """
    return None
  1585 +
  1586 +#=== MAIN =====================================================================
  1587 +
def main():
    """
    Main function, called when olevba is run from the command line

    Parses the command-line options, then either:
    - analyses a file of VBA source code directly (option -i) and exits, or
    - iterates over the files given as arguments (optionally recursively and
      inside zip archives) and processes each of them.

    Output mode: detailed output is used when -d is given without -t;
    otherwise one summary line per file is printed (triage mode). When neither
    -t nor -d is given and exactly one file was processed, the detailed
    output for that file is printed after the summary.
    """
    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)
    # parser.add_option('-o', '--outfile', dest='outfile',
    #     help='output file')
    # parser.add_option('-c', '--csv', dest='csv',
    #     help='export results to a CSV file')
    parser.add_option("-r", action="store_true", dest="recursive",
        help='find files recursively in subdirectories.')
    parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
        help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)')
    parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
        help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
    parser.add_option("-t", action="store_true", dest="triage_mode",
        help='triage mode, display results as a summary table (default for multiple files)')
    parser.add_option("-d", action="store_true", dest="detailed_mode",
        help='detailed mode, display full results (default for single file)')
    parser.add_option("-i", "--input", dest='input', type='str', default=None,
        help='input file containing VBA source code to be analyzed (no parsing)')
    parser.add_option("--decode", action="store_true", dest="show_decoded_strings",
        help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex).')

    (options, args) = parser.parse_args()

    # Print help if no arguments are passed
    # (single-argument print(...) prints the same on Python 2 as the print statement)
    if len(args) == 0 and not options.input:
        print(__doc__)
        parser.print_help()
        sys.exit()

    # print banner with version
    print('olevba %s - http://decalage.info/python/oletools' % __version__)

    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) #INFO)
    # For now, all logging is disabled:
    logging.disable(logging.CRITICAL)

    if options.input:
        # input file provided with VBA source code to be analyzed directly:
        print('Analysis of VBA source code from %s:' % options.input)
        # 'with' guarantees the input file is closed, even if reading fails
        with open(options.input) as input_file:
            vba_code = input_file.read()
        print_analysis(vba_code, show_decoded_strings=options.show_decoded_strings)
        sys.exit()

    # summary (triage) output is used unless -d was given without -t:
    summary_mode = bool(not options.detailed_mode or options.triage_mode)
    if summary_mode:
        print('%-11s %-65s' % ('Flags', 'Filename'))
        print('%-11s %-65s' % ('-'*11, '-'*65))
    previous_container = None
    count = 0
    # last file actually processed (directory entries skipped below are not
    # recorded), needed for the single-file detailed output at the end:
    last_file = (None, None, None)
    for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
            zip_password=options.zip_password, zip_fname=options.zip_fname):
        # ignore directory names stored in zip files:
        if container and filename.endswith('/'):
            continue
        if not summary_mode:
            # fully detailed output
            process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings)
        else:
            # print container name when it changes:
            if container != previous_container:
                if container is not None:
                    print('\nFiles in %s:' % container)
                previous_container = container
            # summarized output for triage:
            process_file_triage(container, filename, data)
        last_file = (container, filename, data)
        count += 1
    if summary_mode:
        print('\n(Flags: OpX=OpenXML, XML=Word2003XML, M=Macros, A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, B=Base64 strings, D=Dridex strings, ?=Unknown)\n')

    if count == 1 and not options.triage_mode and not options.detailed_mode:
        # if options -t and -d were not specified and it's a single file, print details.
        # Use the file that was really processed, not the loop variables: the
        # last item yielded may have been a skipped zip directory entry.
        #TODO: avoid doing the analysis twice by storing results
        (container, filename, data) = last_file
        process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings)
  1667 +
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
  1670 +
1671 1671 # This was coded while listening to "Dust" from I Love You But I've Chosen Darkness
1672 1672 \ No newline at end of file
... ...