Peter M. Groen / oletools

Browse Code »

Commit a4ffb743f926d59e022f10313ca70d6af9f8c8b7

Authored by Philippe Lagadec 2015-03-19 08:20:13 +0100

1 parent 41896bcf

olevba: changed line endings from CRLF to LF

Inline Side-by-side

Showing 1 changed file with 1670 additions and 1670 deletions

oletools/olevba.py 100644 → 100755

View file @a4ffb74

1		-#!/usr/bin/env python
2		-"""
3		-olevba.py
4		-
5		-olevba is a script to parse OLE and OpenXML files such as MS Office documents
6		-(e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate
7		-and analyze malicious macros.
8		-
9		-Supported formats:
10		-- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
11		-- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
12		-- PowerPoint 2007+ (.pptm, .ppsm)
13		-- Word 2003 XML (.xml)
14		-
15		-Author: Philippe Lagadec - http://www.decalage.info
16		-License: BSD, see source code or documentation
17		-
18		-olevba is part of the python-oletools package:
19		-http://www.decalage.info/python/oletools
20		-
21		-olevba is based on source code from officeparser by John William Davison
22		-https://github.com/unixfreak0037/officeparser
23		-"""
24		-
25		-#=== LICENSE ==================================================================
26		-
27		-# olevba is copyright (c) 2014-2015 Philippe Lagadec (http://www.decalage.info)
28		-# All rights reserved.
29		-#
30		-# Redistribution and use in source and binary forms, with or without modification,
31		-# are permitted provided that the following conditions are met:
32		-#
33		-# * Redistributions of source code must retain the above copyright notice, this
34		-# list of conditions and the following disclaimer.
35		-# * Redistributions in binary form must reproduce the above copyright notice,
36		-# this list of conditions and the following disclaimer in the documentation
37		-# and/or other materials provided with the distribution.
38		-#
39		-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
40		-# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
41		-# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
42		-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
43		-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
44		-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
45		-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
46		-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
47		-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
48		-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
49		-
50		-
51		-# olevba contains modified source code from the officeparser project, published
52		-# under the following MIT License (MIT):
53		-#
54		-# officeparser is copyright (c) 2014 John William Davison
55		-#
56		-# Permission is hereby granted, free of charge, to any person obtaining a copy
57		-# of this software and associated documentation files (the "Software"), to deal
58		-# in the Software without restriction, including without limitation the rights
59		-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
60		-# copies of the Software, and to permit persons to whom the Software is
61		-# furnished to do so, subject to the following conditions:
62		-#
63		-# The above copyright notice and this permission notice shall be included in all
64		-# copies or substantial portions of the Software.
65		-#
66		-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
67		-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
68		-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
69		-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
70		-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
71		-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
72		-# SOFTWARE.
73		-
74		-#------------------------------------------------------------------------------
75		-# CHANGELOG:
76		-# 2014-08-05 v0.01 PL: - first version based on officeparser code
77		-# 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser
78		-# 2014-08-15 PL: - fixed incorrect value check in PROJECTHELPFILEPATH Record
79		-# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats
80		-# and to find the VBA project root anywhere in the file
81		-# 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL
82		-# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API
83		-# - added detect_vba_macros
84		-# 2014-12-10 v0.06 PL: - hide first lines with VB attributes
85		-# - detect auto-executable macros
86		-# - ignore empty macros
87		-# 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive
88		-# 2014-12-15 v0.08 PL: - improved display for empty macros
89		-# - added pattern extraction
90		-# 2014-12-25 v0.09 PL: - added suspicious keywords detection
91		-# 2014-12-27 v0.10 PL: - added OptionParser, main and process_file
92		-# - uses xglob to scan several files with wildcards
93		-# - option -r to recurse subdirectories
94		-# - option -z to scan files in password-protected zips
95		-# 2015-01-02 v0.11 PL: - improved filter_vba to detect colons
96		-# 2015-01-03 v0.12 PL: - fixed detect_patterns to detect all patterns
97		-# - process_file: improved display, shows container file
98		-# - improved list of executable file extensions
99		-# 2015-01-04 v0.13 PL: - added several suspicious keywords, improved display
100		-# 2015-01-08 v0.14 PL: - added hex strings detection and decoding
101		-# - fixed issue #2, decoding VBA stream names using
102		-# specified codepage and unicode stream names
103		-# 2015-01-11 v0.15 PL: - added new triage mode, options -t and -d
104		-# 2015-01-16 v0.16 PL: - fix for issue #3 (exception when module name="text")
105		-# - added several suspicious keywords
106		-# - added option -i to analyze VBA source code directly
107		-# 2015-01-17 v0.17 PL: - removed .com from the list of executable extensions
108		-# - added scan_vba to run all detection algorithms
109		-# - decoded hex strings are now also scanned + reversed
110		-# 2015-01-23 v0.18 PL: - fixed issue #3, case-insensitive search in code_modules
111		-# 2015-01-24 v0.19 PL: - improved the detection of IOCs obfuscated with hex
112		-# strings and StrReverse
113		-# 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded
114		-# 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding
115		-# - improved display, shows obfuscation name
116		-# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename
117		-# - added Base64 obfuscation decoding (contribution from
118		-# @JamesHabben)
119		-# 2015-02-03 v0.23 PL: - triage now uses VBA_Scanner results, shows Base64 and
120		-# Dridex strings
121		-# - exception handling in detect_base64_strings
122		-# 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display
123		-# - display exceptions with stack trace
124		-# - added several suspicious keywords
125		-# - improved Base64 detection and decoding
126		-# - fixed triage mode not to scan attrib lines
127		-# 2015-03-04 v0.25 PL: - added support for Word 2003 XML
128		-
129		-__version__ = '0.25'
130		-
131		-#------------------------------------------------------------------------------
132		-# TODO:
133		-# + do not use logging, but a provided logger (null logger by default)
134		-# + setup logging (common with other oletools)
135		-# + add xor bruteforcing like bbharvest
136		-# + add chr() decoding
137		-
138		-# TODO later:
139		-# + performance improvement: instead of searching each keyword separately,
140		-# first split vba code into a list of words (per line), then check each
141		-# word against a dict. (or put vba words into a set/dict?)
142		-# + for regex, maybe combine them into a single re with named groups?
143		-# + add Yara support, include sample rules? plugins like balbuzard?
144		-# + add balbuzard support
145		-# + output to file (replace print by file.write, sys.stdout by default)
146		-# + look for VBA in embedded documents (e.g. Excel in Word)
147		-# + support SRP streams (see Lenny's article + links and sample)
148		-# - python 3.x support
149		-# - add support for PowerPoint macros (see libclamav, libgsf), use oledump heuristic?
150		-# - check VBA macros in Visio, Access, Project, etc
151		-# - extract_macros: convert to a class, split long function into smaller methods
152		-# - extract_macros: read bytes from stream file objects instead of strings
153		-# - extract_macros: use combined struct.unpack instead of many calls
154		-
155		-#------------------------------------------------------------------------------
156		-# REFERENCES:
157		-# - [MS-OVBA]: Microsoft Office VBA File Format Structure
158		-# http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx
159		-# - officeparser: https://github.com/unixfreak0037/officeparser
160		-
161		-
162		-#--- IMPORTS ------------------------------------------------------------------
163		-
164		-import sys, logging
165		-import struct
166		-import cStringIO
167		-import math
168		-import zipfile
169		-import re
170		-import optparse
171		-import os.path
172		-import binascii
173		-import base64
174		-import traceback
175		-import zlib
176		-
177		-# import lxml or ElementTree for XML parsing:
178		-try:
179		- # lxml: best performance for XML processing
180		- import lxml.etree as ET
181		-except ImportError:
182		- try:
183		- # Python 2.5+: batteries included
184		- import xml.etree.cElementTree as ET
185		- except ImportError:
186		- try:
187		- # Python <2.5: standalone ElementTree install
188		- import elementtree.cElementTree as ET
189		- except ImportError:
190		- raise ImportError, "lxml or ElementTree are not installed, "\
191		- +"see http://codespeak.net/lxml "\
192		- +"or http://effbot.org/zone/element-index.htm"
193		-
194		-import thirdparty.olefile as olefile
195		-from thirdparty.prettytable import prettytable
196		-from thirdparty.xglob import xglob
197		-
198		-#--- CONSTANTS ----------------------------------------------------------------
199		-
# Identifiers for the kinds of container files handled by this module:
TYPE_OLE, TYPE_OpenXML, TYPE_Word2003_XML = 'OLE', 'OpenXML', 'Word2003_XML'

# File extensions associated with each kind of VBA code module
# (standard module, class/document module, form):
MODULE_EXTENSION, CLASS_EXTENSION, FORM_EXTENSION = "bas", "cls", "frm"

# Namespaces and tags for Word2003 XML parsing
# (the namespace is written in "{uri}" form and prefixed to each name):
NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}'
# the tag <w:binData w:name="editdata.mso"> contains the VBA macro code:
TAG_BINDATA = '%sbinData' % NS_W
ATTR_NAME = '%sname' % NS_W
213		-
# Keywords to detect auto-executable macros.
# Each key is a human-readable description of the triggering event, each
# value is the tuple of macro names associated with that event.
AUTOEXEC_KEYWORDS = {
    # --- MS Word ---
    'Runs when the Word document is opened': (
        'AutoExec',
        'AutoOpen',
        'Document_Open',
        'DocumentOpen',
    ),
    'Runs when the Word document is closed': (
        'AutoExit',
        'AutoClose',
        'Document_Close',
        'DocumentBeforeClose',
    ),
    'Runs when the Word document is modified': (
        'DocumentChange',
    ),
    'Runs when a new Word document is created': (
        'AutoNew',
        'Document_New',
        'NewDocument',
    ),

    # --- MS Excel ---
    'Runs when the Excel Workbook is opened': (
        'Auto_Open',
        'Workbook_Open',
    ),
    'Runs when the Excel Workbook is closed': (
        'Auto_Close',
        'Workbook_Close',
    ),

    #TODO: full list in MS specs??
}
234		-
# Suspicious Keywords that may be used by malware.
# Each key describes a potentially dangerous capability, each value is the
# tuple of VBA keywords hinting at that capability.
# See VBA language reference: http://msdn.microsoft.com/en-us/library/office/jj692818%28v=office.15%29.aspx
SUSPICIOUS_KEYWORDS = {
    #TODO: use regex to support variable whitespaces
    'May read system environment variables': (
        'Environ',
    ),
    'May open a file': (
        'Open',
    ),
    #TODO: regex to find Open+Write on same line
    'May write to a file (if combined with Open)': (
        'Write', 'Put', 'Output', 'Print #',
    ),
    #TODO: regex to find Open+Binary on same line
    'May read or write a binary file (if combined with Open)': (
        'Binary',
    ),
    #FileCopy: http://msdn.microsoft.com/en-us/library/office/gg264390%28v=office.15%29.aspx
    #CopyFile: http://msdn.microsoft.com/en-us/library/office/gg264089%28v=office.15%29.aspx
    'May copy a file': (
        'FileCopy', 'CopyFile',
    ),
    'May delete a file': (
        'Kill',
    ),
    #CreateTextFile: http://msdn.microsoft.com/en-us/library/office/gg264617%28v=office.15%29.aspx
    #ADODB.Stream sample: http://pastebin.com/Z4TMyuq6
    'May create a text file': (
        'CreateTextFile', 'ADODB.Stream', 'WriteText', 'SaveToFile',
    ),
    #Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx
    #WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6
    'May run an executable file or a system command': (
        'Shell',
        'vbNormal', 'vbNormalFocus', 'vbHide',
        'vbMinimizedFocus', 'vbMaximizedFocus',
        'vbNormalNoFocus', 'vbMinimizedNoFocus',
        'WScript.Shell', 'Run',
    ),
    'May hide the application': (
        'Application.Visible', 'ShowWindow', 'SW_HIDE',
    ),
    'May create a directory': (
        'MkDir',
    ),
    'May save the current workbook': (
        'ActiveWorkbook.SaveAs',
    ),
    #TODO: confirm the actual effect
    'May change which directory contains files to open at startup': (
        'Application.AltStartupPath',
    ),
    'May create an OLE object': (
        'CreateObject',
    ),
    'May run an application (if combined with CreateObject)': (
        'Shell.Application',
    ),
    'May enumerate application windows (if combined with Shell.Application object)': (
        'Windows', 'FindWindow',
    ),
    #TODO: regex to find declare+lib on same line
    'May run code from a DLL': (
        'Lib',
    ),
    #TODO: regex to find urlmon+URLDownloadToFileA on same line
    'May download files from the Internet': (
        'URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP',
    ),
    #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx
    'May control another application by simulating user keystrokes': (
        'SendKeys', 'AppActivate',
    ),
    #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx
    'May attempt to obfuscate malicious function calls': (
        'CallByName',
    ),
    #TODO: regex to find several Chr*, not just one
    #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx
    'May attempt to obfuscate specific strings': (
        'Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor',
    ),
}
296		-
# Regular Expression for a URL:
# http://en.wikipedia.org/wiki/Uniform_resource_locator
# http://www.w3.org/Addressing/URL/uri-spec.html
#TODO: also support username:password@server
#TODO: other protocols (file, gopher, wais, ...?)
# NOTE: every regex fragment is a raw string and alternation is a plain "|".
# An escaped "\|" only matches a literal pipe character, and a non-raw '\b'
# is a backspace character instead of a word boundary.
SCHEME = r'\b(?:http|ftp)s?'
# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})'
DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + r')'
#TODO: IPv6 - see https://www.debuggex.com/
# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. [db8:0cec::99:123a]
NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'
IPv4 = r'(?:' + NUMBER_0_255 + r'\.){3}' + NUMBER_0_255
# IPv4 must come before the DNS name because it is more specific
SERVER = r'(?:' + IPv4 + r'|' + DNS_NAME + r')'
PORT = r'(?:\:[0-9]{1,5})?'
SERVER_PORT = SERVER + PORT
URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?'  # [^\.\,\)\(\s"]
URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH
re_url = re.compile(URL_RE)


# Patterns to be extracted (IP addresses, URLs, etc)
# From patterns.py in balbuzard
# Each entry is a (pattern name, compiled regex) pair.
RE_PATTERNS = (
    ('URL', re.compile(URL_RE)),
    ('IPv4 address', re.compile(IPv4)),
    # the trailing word boundary must be a raw string (r'\b'):
    ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@' + SERVER + r'\b')),
    # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')),
    # Executable file name with known extensions (except .com which is present in many URLs, and .application):
    ("Executable file name", re.compile(r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b")),
    # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
    #TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types
    #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')),
    )

# regex to detect strings encoded in hexadecimal
# (at least 4 pairs of hex digits)
re_hex_string = re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')

# regex to detect strings encoded in base64
#re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?"')
# better version from balbuzard, less false positives:
re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4}){1,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)?"')
# white list of common strings matching the base64 regex, but which are not base64 strings (all lowercase):
BASE64_WHITELIST = set(['thisdocument', 'thisworkbook', 'test', 'temp', 'http', 'open', 'exit'])

# regex to detect strings encoded with a specific Dridex algorithm
# (see https://github.com/JamesHabben/MalwareStuff)
re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"')
# regex to check that it is not just a hex string:
re_nothex_check = re.compile(r'[G-Zg-z]')
348		-
349		-#--- FUNCTIONS ----------------------------------------------------------------
350		-
def copytoken_help(decompressed_current, decompressed_chunk_start):
    """
    compute bit masks to decode a CopyToken according to MS-OVBA 2.4.1.3.19.1 CopyToken Help

    decompressed_current: number of decompressed bytes so far, i.e. len(decompressed_container)
    decompressed_chunk_start: offset of the current chunk in the decompressed container
    return length_mask, offset_mask, bit_count, maximum_length
    raise ValueError if decompressed_current is not greater than decompressed_chunk_start
    (a CopyToken cannot reference data when nothing has been decompressed in the chunk yet)
    """
    difference = decompressed_current - decompressed_chunk_start
    if difference < 1:
        # the logarithm below is undefined for values <= 0
        raise ValueError('CopyToken without any decompressed data in the current chunk '
                         '(difference = {0})'.format(difference))
    # bit_count = ceil(log2(difference)), computed with integers only so that
    # the result does not depend on floating point rounding
    bit_count = 0
    while (1 << bit_count) < difference:
        bit_count += 1
    # the offset is always encoded on at least 4 bits
    bit_count = max(bit_count, 4)
    # the low bits of the token hold the length, the high bit_count bits the offset
    length_mask = 0xFFFF >> bit_count
    offset_mask = ~length_mask
    maximum_length = length_mask + 3
    return length_mask, offset_mask, bit_count, maximum_length
366		-
367		-
def decompress_stream (compressed_container):
    """
    Decompress a stream according to MS-OVBA section 2.4.1

    compressed_container: string (bytes) or bytearray compressed according to the
        MS-OVBA 2.4.1.3.6 Compression algorithm
    return the decompressed container as a string (bytes)
    raise ValueError if the signature byte, a chunk signature or a chunk size is invalid
    """
    # 2.4.1.2 State Variables

    # The following state is maintained for the CompressedContainer (section 2.4.1.1.1):
    # CompressedRecordEnd: The location of the byte after the last byte in the CompressedContainer (section 2.4.1.1.1).
    # CompressedCurrent: The location of the next byte in the CompressedContainer (section 2.4.1.1.1) to be read by
    #                    decompression or to be written by compression.

    # The following state is maintained for the current CompressedChunk (section 2.4.1.1.4):
    # CompressedChunkStart: The location of the first byte of the CompressedChunk (section 2.4.1.1.4) within the
    #                       CompressedContainer (section 2.4.1.1.1).

    # The following state is maintained for a DecompressedBuffer (section 2.4.1.1.2):
    # DecompressedCurrent: The location of the next byte in the DecompressedBuffer (section 2.4.1.1.2) to be written by
    #                      decompression or to be read by compression.
    # DecompressedBufferEnd: The location of the byte after the last byte in the DecompressedBuffer (section 2.4.1.1.2).

    # The following state is maintained for the current DecompressedChunk (section 2.4.1.1.3):
    # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the
    #                         DecompressedBuffer (section 2.4.1.1.2).

    # Both buffers are handled as bytearrays: indexing yields integers on
    # Python 2 and 3 alike (no ord() needed), and appending to the mutable
    # output is cheap compared to growing an immutable string byte by byte.
    if not isinstance(compressed_container, bytearray):
        compressed_container = bytearray(compressed_container)
    compressed_length = len(compressed_container)
    decompressed_container = bytearray()  # result
    compressed_current = 0

    sig_byte = compressed_container[compressed_current]
    if sig_byte != 0x01:
        raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))

    compressed_current += 1

    #NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that
    # CompressedRecordEnd = len(compressed_container)
    while compressed_current < compressed_length:
        # 2.4.1.1.5
        compressed_chunk_start = compressed_current
        # chunk header = first 16 bits
        # (the slice is converted to bytes so that struct accepts it on every Python version)
        compressed_chunk_header = struct.unpack(
            "<H", bytes(compressed_container[compressed_chunk_start:compressed_chunk_start + 2]))[0]
        # chunk size = 12 first bits of header + 3
        chunk_size = (compressed_chunk_header & 0x0FFF) + 3
        # chunk signature = 3 next bits - should always be 0b011
        chunk_signature = (compressed_chunk_header >> 12) & 0x07
        if chunk_signature != 0b011:
            raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream')
        # chunk flag = next bit - 1 == compressed, 0 == uncompressed
        chunk_flag = (compressed_chunk_header >> 15) & 0x01
        logging.debug("chunk size = {0}, compressed flag = {1}".format(chunk_size, chunk_flag))

        #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096)
        # The minimum size is 3 bytes
        # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value
        # in chunk header before adding 3).
        # Also the first test is not useful since a 12 bits value cannot be larger than 4095.
        if chunk_flag == 1 and chunk_size > 4098:
            raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1')
        if chunk_flag == 0 and chunk_size != 4098:
            raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0')

        # check if chunk_size goes beyond the compressed data, instead of silently cutting it:
        #TODO: raise an exception?
        if compressed_chunk_start + chunk_size > compressed_length:
            logging.warning('Chunk size is larger than remaining compressed data')
        compressed_end = min([compressed_length, compressed_chunk_start + chunk_size])
        # read after chunk header:
        compressed_current = compressed_chunk_start + 2

        if chunk_flag == 0:
            # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
            # uncompressed chunk: read the next 4096 bytes as-is
            #TODO: check if there are at least 4096 bytes left
            decompressed_container += compressed_container[compressed_current:compressed_current + 4096]
            compressed_current += 4096
        else:
            # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
            # compressed chunk
            decompressed_chunk_start = len(decompressed_container)
            while compressed_current < compressed_end:
                # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence
                # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
                # copy tokens (reference to a previous literal token)
                flag_byte = compressed_container[compressed_current]
                compressed_current += 1
                for bit_index in range(8):
                    # the last token sequence of a chunk may hold fewer than 8 tokens
                    if compressed_current >= compressed_end:
                        break
                    # MS-OVBA 2.4.1.3.5 Decompressing a Token
                    # MS-OVBA 2.4.1.3.17 Extract FlagBit
                    flag_bit = (flag_byte >> bit_index) & 1
                    if flag_bit == 0:  # LiteralToken
                        # copy one byte directly to output
                        decompressed_container.append(compressed_container[compressed_current])
                        compressed_current += 1
                    else:  # CopyToken
                        # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
                        copy_token = struct.unpack(
                            "<H", bytes(compressed_container[compressed_current:compressed_current + 2]))[0]
                        #TODO: check this
                        length_mask, offset_mask, bit_count, maximum_length = copytoken_help(
                            len(decompressed_container), decompressed_chunk_start)
                        length = (copy_token & length_mask) + 3
                        temp1 = copy_token & offset_mask
                        temp2 = 16 - bit_count
                        offset = (temp1 >> temp2) + 1
                        copy_source = len(decompressed_container) - offset
                        # NOTE: the offset is not validated here; nothing checks
                        # that copy_source is >= 0.
                        # The copy must be done byte by byte: the source range may overlap
                        # the bytes being appended (this is how runs are encoded).
                        for index in range(copy_source, copy_source + length):
                            decompressed_container.append(decompressed_container[index])
                        compressed_current += 2
    # bytes is str on Python 2, so the return type is unchanged there
    return bytes(decompressed_container)
484		-
485		-
486		-def _extract_vba (ole, vba_root, project_path, dir_path):
487		- """
488		- Extract VBA macros from an OleFileIO object.
489		- Internal function, do not call directly.
490		-
491		- vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream
492		- vba_project: path to the PROJECT stream
493		- This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream
494		- """
495		- # Open the PROJECT stream:
496		- project = ole.openstream(project_path)
497		-
498		- # sample content of the PROJECT stream:
499		-
500		- ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}"
501		- ## Document=ThisDocument/&H00000000
502		- ## Module=NewMacros
503		- ## Name="Project"
504		- ## HelpContextID="0"
505		- ## VersionCompatible32="393222000"
506		- ## CMG="F1F301E705E705E705E705"
507		- ## DPB="8F8D7FE3831F2020202020"
508		- ## GC="2D2FDD81E51EE61EE6E1"
509		- ##
510		- ## [Host Extender Info]
511		- ## &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000
512		- ## &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000
513		- ##
514		- ## [Workspace]
515		- ## ThisDocument=22, 29, 339, 477, Z
516		- ## NewMacros=-4, 42, 832, 510, C
517		-
518		- code_modules = {}
519		-
520		- for line in project:
521		- line = line.strip()
522		- if '=' in line:
523		- # split line at the 1st equal sign:
524		- name, value = line.split('=', 1)
525		- # looking for code modules
526		- # add the code module as a key in the dictionary
527		- # the value will be the extension needed later
528		- # The value is converted to lowercase, to allow case-insensitive matching (issue #3)
529		- value = value.lower()
530		- if name == 'Document':
531		- # split value at the 1st slash, keep 1st part:
532		- value = value.split('/', 1)[0]
533		- code_modules[value] = CLASS_EXTENSION
534		- elif name == 'Module':
535		- code_modules[value] = MODULE_EXTENSION
536		- elif name == 'Class':
537		- code_modules[value] = CLASS_EXTENSION
538		- elif name == 'BaseClass':
539		- code_modules[value] = FORM_EXTENSION
540		-
541		- # read data from dir stream (compressed)
542		- dir_compressed = ole.openstream(dir_path).read()
543		-
544		- def check_value(name, expected, value):
545		- if expected != value:
546		- logging.error("invalid value for {0} expected {1:04X} got {2:04X}".format(name, expected, value))
547		-
548		- dir_stream = cStringIO.StringIO(decompress_stream(dir_compressed))
549		-
550		- # PROJECTSYSKIND Record
551		- PROJECTSYSKIND_Id = struct.unpack("<H", dir_stream.read(2))[0]
552		- check_value('PROJECTSYSKIND_Id', 0x0001, PROJECTSYSKIND_Id)
553		- PROJECTSYSKIND_Size = struct.unpack("<L", dir_stream.read(4))[0]
554		- check_value('PROJECTSYSKIND_Size', 0x0004, PROJECTSYSKIND_Size)
555		- PROJECTSYSKIND_SysKind = struct.unpack("<L", dir_stream.read(4))[0]
556		- if PROJECTSYSKIND_SysKind == 0x00:
557		- logging.debug("16-bit Windows")
558		- elif PROJECTSYSKIND_SysKind == 0x01:
559		- logging.debug("32-bit Windows")
560		- elif PROJECTSYSKIND_SysKind == 0x02:
561		- logging.debug("Macintosh")
562		- elif PROJECTSYSKIND_SysKind == 0x03:
563		- logging.debug("64-bit Windows")
564		- else:
565		- logging.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(PROJECTSYSKIND_SysKind))
566		-
567		- # PROJECTLCID Record
568		- PROJECTLCID_Id = struct.unpack("<H", dir_stream.read(2))[0]
569		- check_value('PROJECTLCID_Id', 0x0002, PROJECTLCID_Id)
570		- PROJECTLCID_Size = struct.unpack("<L", dir_stream.read(4))[0]
571		- check_value('PROJECTLCID_Size', 0x0004, PROJECTLCID_Size)
572		- PROJECTLCID_Lcid = struct.unpack("<L", dir_stream.read(4))[0]
573		- check_value('PROJECTLCID_Lcid', 0x409, PROJECTLCID_Lcid)
574		-
575		- # PROJECTLCIDINVOKE Record
576		- PROJECTLCIDINVOKE_Id = struct.unpack("<H", dir_stream.read(2))[0]
577		- check_value('PROJECTLCIDINVOKE_Id', 0x0014, PROJECTLCIDINVOKE_Id)
578		- PROJECTLCIDINVOKE_Size = struct.unpack("<L", dir_stream.read(4))[0]
579		- check_value('PROJECTLCIDINVOKE_Size', 0x0004, PROJECTLCIDINVOKE_Size)
580		- PROJECTLCIDINVOKE_LcidInvoke = struct.unpack("<L", dir_stream.read(4))[0]
581		- check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, PROJECTLCIDINVOKE_LcidInvoke)
582		-
583		- # PROJECTCODEPAGE Record
584		- PROJECTCODEPAGE_Id = struct.unpack("<H", dir_stream.read(2))[0]
585		- check_value('PROJECTCODEPAGE_Id', 0x0003, PROJECTCODEPAGE_Id)
586		- PROJECTCODEPAGE_Size = struct.unpack("<L", dir_stream.read(4))[0]
587		- check_value('PROJECTCODEPAGE_Size', 0x0002, PROJECTCODEPAGE_Size)
588		- PROJECTCODEPAGE_CodePage = struct.unpack("<H", dir_stream.read(2))[0]
589		-
590		- # PROJECTNAME Record
591		- PROJECTNAME_Id = struct.unpack("<H", dir_stream.read(2))[0]
592		- check_value('PROJECTNAME_Id', 0x0004, PROJECTNAME_Id)
593		- PROJECTNAME_SizeOfProjectName = struct.unpack("<L", dir_stream.read(4))[0]
594		- if PROJECTNAME_SizeOfProjectName < 1 or PROJECTNAME_SizeOfProjectName > 128:
595		- logging.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(PROJECTNAME_SizeOfProjectName))
596		- PROJECTNAME_ProjectName = dir_stream.read(PROJECTNAME_SizeOfProjectName)
597		-
598		- # PROJECTDOCSTRING Record
599		- PROJECTDOCSTRING_Id = struct.unpack("<H", dir_stream.read(2))[0]
600		- check_value('PROJECTDOCSTRING_Id', 0x0005, PROJECTDOCSTRING_Id)
601		- PROJECTDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0]
602		- if PROJECTNAME_SizeOfProjectName > 2000:
603		- logging.error("PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(PROJECTDOCSTRING_SizeOfDocString))
604		- PROJECTDOCSTRING_DocString = dir_stream.read(PROJECTDOCSTRING_SizeOfDocString)
605		- PROJECTDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
606		- check_value('PROJECTDOCSTRING_Reserved', 0x0040, PROJECTDOCSTRING_Reserved)
607		- PROJECTDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0]
608		- if PROJECTDOCSTRING_SizeOfDocStringUnicode % 2 != 0:
609		- logging.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even")
610		- PROJECTDOCSTRING_DocStringUnicode = dir_stream.read(PROJECTDOCSTRING_SizeOfDocStringUnicode)
611		-
612		- # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7
613		- PROJECTHELPFILEPATH_Id = struct.unpack("<H", dir_stream.read(2))[0]
614		- check_value('PROJECTHELPFILEPATH_Id', 0x0006, PROJECTHELPFILEPATH_Id)
615		- PROJECTHELPFILEPATH_SizeOfHelpFile1 = struct.unpack("<L", dir_stream.read(4))[0]
616		- if PROJECTHELPFILEPATH_SizeOfHelpFile1 > 260:
617		- logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(PROJECTHELPFILEPATH_SizeOfHelpFile1))
618		- PROJECTHELPFILEPATH_HelpFile1 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile1)
619		- PROJECTHELPFILEPATH_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
620		- check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, PROJECTHELPFILEPATH_Reserved)
621		- PROJECTHELPFILEPATH_SizeOfHelpFile2 = struct.unpack("<L", dir_stream.read(4))[0]
622		- if PROJECTHELPFILEPATH_SizeOfHelpFile2 != PROJECTHELPFILEPATH_SizeOfHelpFile1:
623		- logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2")
624		- PROJECTHELPFILEPATH_HelpFile2 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile2)
625		- if PROJECTHELPFILEPATH_HelpFile2 != PROJECTHELPFILEPATH_HelpFile1:
626		- logging.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2")
627		-
628		- # PROJECTHELPCONTEXT Record
629		- PROJECTHELPCONTEXT_Id = struct.unpack("<H", dir_stream.read(2))[0]
630		- check_value('PROJECTHELPCONTEXT_Id', 0x0007, PROJECTHELPCONTEXT_Id)
631		- PROJECTHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0]
632		- check_value('PROJECTHELPCONTEXT_Size', 0x0004, PROJECTHELPCONTEXT_Size)
633		- PROJECTHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0]
634		-
635		- # PROJECTLIBFLAGS Record
636		- PROJECTLIBFLAGS_Id = struct.unpack("<H", dir_stream.read(2))[0]
637		- check_value('PROJECTLIBFLAGS_Id', 0x0008, PROJECTLIBFLAGS_Id)
638		- PROJECTLIBFLAGS_Size = struct.unpack("<L", dir_stream.read(4))[0]
639		- check_value('PROJECTLIBFLAGS_Size', 0x0004, PROJECTLIBFLAGS_Size)
640		- PROJECTLIBFLAGS_ProjectLibFlags = struct.unpack("<L", dir_stream.read(4))[0]
641		- check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, PROJECTLIBFLAGS_ProjectLibFlags)
642		-
643		- # PROJECTVERSION Record
644		- PROJECTVERSION_Id = struct.unpack("<H", dir_stream.read(2))[0]
645		- check_value('PROJECTVERSION_Id', 0x0009, PROJECTVERSION_Id)
646		- PROJECTVERSION_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
647		- check_value('PROJECTVERSION_Reserved', 0x0004, PROJECTVERSION_Reserved)
648		- PROJECTVERSION_VersionMajor = struct.unpack("<L", dir_stream.read(4))[0]
649		- PROJECTVERSION_VersionMinor = struct.unpack("<H", dir_stream.read(2))[0]
650		-
651		- # PROJECTCONSTANTS Record
652		- PROJECTCONSTANTS_Id = struct.unpack("<H", dir_stream.read(2))[0]
653		- check_value('PROJECTCONSTANTS_Id', 0x000C, PROJECTCONSTANTS_Id)
654		- PROJECTCONSTANTS_SizeOfConstants = struct.unpack("<L", dir_stream.read(4))[0]
655		- if PROJECTCONSTANTS_SizeOfConstants > 1015:
656		- logging.error("PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(PROJECTCONSTANTS_SizeOfConstants))
657		- PROJECTCONSTANTS_Constants = dir_stream.read(PROJECTCONSTANTS_SizeOfConstants)
658		- PROJECTCONSTANTS_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
659		- check_value('PROJECTCONSTANTS_Reserved', 0x003C, PROJECTCONSTANTS_Reserved)
660		- PROJECTCONSTANTS_SizeOfConstantsUnicode = struct.unpack("<L", dir_stream.read(4))[0]
661		- if PROJECTCONSTANTS_SizeOfConstantsUnicode % 2 != 0:
662		- logging.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even")
663		- PROJECTCONSTANTS_ConstantsUnicode = dir_stream.read(PROJECTCONSTANTS_SizeOfConstantsUnicode)
664		-
665		- # array of REFERENCE records
666		- check = None
667		- while True:
668		- check = struct.unpack("<H", dir_stream.read(2))[0]
669		- logging.debug("reference type = {0:04X}".format(check))
670		- if check == 0x000F:
671		- break
672		-
673		- if check == 0x0016:
674		- # REFERENCENAME
675		- REFERENCE_Id = check
676		- REFERENCE_SizeOfName = struct.unpack("<L", dir_stream.read(4))[0]
677		- REFERENCE_Name = dir_stream.read(REFERENCE_SizeOfName)
678		- REFERENCE_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
679		- check_value('REFERENCE_Reserved', 0x003E, REFERENCE_Reserved)
680		- REFERENCE_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
681		- REFERENCE_NameUnicode = dir_stream.read(REFERENCE_SizeOfNameUnicode)
682		- continue
683		-
684		- if check == 0x0033:
685		- # REFERENCEORIGINAL (followed by REFERENCECONTROL)
686		- REFERENCEORIGINAL_Id = check
687		- REFERENCEORIGINAL_SizeOfLibidOriginal = struct.unpack("<L", dir_stream.read(4))[0]
688		- REFERENCEORIGINAL_LibidOriginal = dir_stream.read(REFERENCEORIGINAL_SizeOfLibidOriginal)
689		- continue
690		-
691		- if check == 0x002F:
692		- # REFERENCECONTROL
693		- REFERENCECONTROL_Id = check
694		- REFERENCECONTROL_SizeTwiddled = struct.unpack("<L", dir_stream.read(4))[0] # ignore
695		- REFERENCECONTROL_SizeOfLibidTwiddled = struct.unpack("<L", dir_stream.read(4))[0]
696		- REFERENCECONTROL_LibidTwiddled = dir_stream.read(REFERENCECONTROL_SizeOfLibidTwiddled)
697		- REFERENCECONTROL_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0] # ignore
698		- check_value('REFERENCECONTROL_Reserved1', 0x0000, REFERENCECONTROL_Reserved1)
699		- REFERENCECONTROL_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0] # ignore
700		- check_value('REFERENCECONTROL_Reserved2', 0x0000, REFERENCECONTROL_Reserved2)
701		- # optional field
702		- check2 = struct.unpack("<H", dir_stream.read(2))[0]
703		- if check2 == 0x0016:
704		- REFERENCECONTROL_NameRecordExtended_Id = check
705		- REFERENCECONTROL_NameRecordExtended_SizeofName = struct.unpack("<L", dir_stream.read(4))[0]
706		- REFERENCECONTROL_NameRecordExtended_Name = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeofName)
707		- REFERENCECONTROL_NameRecordExtended_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
708		- check_value('REFERENCECONTROL_NameRecordExtended_Reserved', 0x003E, REFERENCECONTROL_NameRecordExtended_Reserved)
709		- REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
710		- REFERENCECONTROL_NameRecordExtended_NameUnicode = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode)
711		- REFERENCECONTROL_Reserved3 = struct.unpack("<H", dir_stream.read(2))[0]
712		- else:
713		- REFERENCECONTROL_Reserved3 = check2
714		-
715		- check_value('REFERENCECONTROL_Reserved3', 0x0030, REFERENCECONTROL_Reserved3)
716		- REFERENCECONTROL_SizeExtended = struct.unpack("<L", dir_stream.read(4))[0]
717		- REFERENCECONTROL_SizeOfLibidExtended = struct.unpack("<L", dir_stream.read(4))[0]
718		- REFERENCECONTROL_LibidExtended = dir_stream.read(REFERENCECONTROL_SizeOfLibidExtended)
719		- REFERENCECONTROL_Reserved4 = struct.unpack("<L", dir_stream.read(4))[0]
720		- REFERENCECONTROL_Reserved5 = struct.unpack("<H", dir_stream.read(2))[0]
721		- REFERENCECONTROL_OriginalTypeLib = dir_stream.read(16)
722		- REFERENCECONTROL_Cookie = struct.unpack("<L", dir_stream.read(4))[0]
723		- continue
724		-
725		- if check == 0x000D:
726		- # REFERENCEREGISTERED
727		- REFERENCEREGISTERED_Id = check
728		- REFERENCEREGISTERED_Size = struct.unpack("<L", dir_stream.read(4))[0]
729		- REFERENCEREGISTERED_SizeOfLibid = struct.unpack("<L", dir_stream.read(4))[0]
730		- REFERENCEREGISTERED_Libid = dir_stream.read(REFERENCEREGISTERED_SizeOfLibid)
731		- REFERENCEREGISTERED_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0]
732		- check_value('REFERENCEREGISTERED_Reserved1', 0x0000, REFERENCEREGISTERED_Reserved1)
733		- REFERENCEREGISTERED_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0]
734		- check_value('REFERENCEREGISTERED_Reserved2', 0x0000, REFERENCEREGISTERED_Reserved2)
735		- continue
736		-
737		- if check == 0x000E:
738		- # REFERENCEPROJECT
739		- REFERENCEPROJECT_Id = check
740		- REFERENCEPROJECT_Size = struct.unpack("<L", dir_stream.read(4))[0]
741		- REFERENCEPROJECT_SizeOfLibidAbsolute = struct.unpack("<L", dir_stream.read(4))[0]
742		- REFERENCEPROJECT_LibidAbsolute = dir_stream.read(REFERENCEPROJECT_SizeOfLibidAbsolute)
743		- REFERENCEPROJECT_SizeOfLibidRelative = struct.unpack("<L", dir_stream.read(4))[0]
744		- REFERENCEPROJECT_LibidRelative = dir_stream.read(REFERENCEPROJECT_SizeOfLibidRelative)
745		- REFERENCEPROJECT_MajorVersion = struct.unpack("<L", dir_stream.read(4))[0]
746		- REFERENCEPROJECT_MinorVersion = struct.unpack("<H", dir_stream.read(2))[0]
747		- continue
748		-
749		- logging.error('invalid or unknown check Id {0:04X}'.format(check))
750		- sys.exit(0)
751		-
752		- PROJECTMODULES_Id = check #struct.unpack("<H", dir_stream.read(2))[0]
753		- check_value('PROJECTMODULES_Id', 0x000F, PROJECTMODULES_Id)
754		- PROJECTMODULES_Size = struct.unpack("<L", dir_stream.read(4))[0]
755		- check_value('PROJECTMODULES_Size', 0x0002, PROJECTMODULES_Size)
756		- PROJECTMODULES_Count = struct.unpack("<H", dir_stream.read(2))[0]
757		- PROJECTMODULES_ProjectCookieRecord_Id = struct.unpack("<H", dir_stream.read(2))[0]
758		- check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, PROJECTMODULES_ProjectCookieRecord_Id)
759		- PROJECTMODULES_ProjectCookieRecord_Size = struct.unpack("<L", dir_stream.read(4))[0]
760		- check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, PROJECTMODULES_ProjectCookieRecord_Size)
761		- PROJECTMODULES_ProjectCookieRecord_Cookie = struct.unpack("<H", dir_stream.read(2))[0]
762		-
763		- logging.debug("parsing {0} modules".format(PROJECTMODULES_Count))
764		- for x in xrange(0, PROJECTMODULES_Count):
765		- MODULENAME_Id = struct.unpack("<H", dir_stream.read(2))[0]
766		- check_value('MODULENAME_Id', 0x0019, MODULENAME_Id)
767		- MODULENAME_SizeOfModuleName = struct.unpack("<L", dir_stream.read(4))[0]
768		- MODULENAME_ModuleName = dir_stream.read(MODULENAME_SizeOfModuleName)
769		- # account for optional sections
770		- section_id = struct.unpack("<H", dir_stream.read(2))[0]
771		- if section_id == 0x0047:
772		- MODULENAMEUNICODE_Id = section_id
773		- MODULENAMEUNICODE_SizeOfModuleNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
774		- MODULENAMEUNICODE_ModuleNameUnicode = dir_stream.read(MODULENAMEUNICODE_SizeOfModuleNameUnicode)
775		- section_id = struct.unpack("<H", dir_stream.read(2))[0]
776		- if section_id == 0x001A:
777		- MODULESTREAMNAME_id = section_id
778		- MODULESTREAMNAME_SizeOfStreamName = struct.unpack("<L", dir_stream.read(4))[0]
779		- MODULESTREAMNAME_StreamName = dir_stream.read(MODULESTREAMNAME_SizeOfStreamName)
780		- MODULESTREAMNAME_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
781		- check_value('MODULESTREAMNAME_Reserved', 0x0032, MODULESTREAMNAME_Reserved)
782		- MODULESTREAMNAME_SizeOfStreamNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
783		- MODULESTREAMNAME_StreamNameUnicode = dir_stream.read(MODULESTREAMNAME_SizeOfStreamNameUnicode)
784		- section_id = struct.unpack("<H", dir_stream.read(2))[0]
785		- if section_id == 0x001C:
786		- MODULEDOCSTRING_Id = section_id
787		- check_value('MODULEDOCSTRING_Id', 0x001C, MODULEDOCSTRING_Id)
788		- MODULEDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0]
789		- MODULEDOCSTRING_DocString = dir_stream.read(MODULEDOCSTRING_SizeOfDocString)
790		- MODULEDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
791		- check_value('MODULEDOCSTRING_Reserved', 0x0048, MODULEDOCSTRING_Reserved)
792		- MODULEDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0]
793		- MODULEDOCSTRING_DocStringUnicode = dir_stream.read(MODULEDOCSTRING_SizeOfDocStringUnicode)
794		- section_id = struct.unpack("<H", dir_stream.read(2))[0]
795		- if section_id == 0x0031:
796		- MODULEOFFSET_Id = section_id
797		- check_value('MODULEOFFSET_Id', 0x0031, MODULEOFFSET_Id)
798		- MODULEOFFSET_Size = struct.unpack("<L", dir_stream.read(4))[0]
799		- check_value('MODULEOFFSET_Size', 0x0004, MODULEOFFSET_Size)
800		- MODULEOFFSET_TextOffset = struct.unpack("<L", dir_stream.read(4))[0]
801		- section_id = struct.unpack("<H", dir_stream.read(2))[0]
802		- if section_id == 0x001E:
803		- MODULEHELPCONTEXT_Id = section_id
804		- check_value('MODULEHELPCONTEXT_Id', 0x001E, MODULEHELPCONTEXT_Id)
805		- MODULEHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0]
806		- check_value('MODULEHELPCONTEXT_Size', 0x0004, MODULEHELPCONTEXT_Size)
807		- MODULEHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0]
808		- section_id = struct.unpack("<H", dir_stream.read(2))[0]
809		- if section_id == 0x002C:
810		- MODULECOOKIE_Id = section_id
811		- check_value('MODULECOOKIE_Id', 0x002C, MODULECOOKIE_Id)
812		- MODULECOOKIE_Size = struct.unpack("<L", dir_stream.read(4))[0]
813		- check_value('MODULECOOKIE_Size', 0x0002, MODULECOOKIE_Size)
814		- MODULECOOKIE_Cookie = struct.unpack("<H", dir_stream.read(2))[0]
815		- section_id = struct.unpack("<H", dir_stream.read(2))[0]
816		- if section_id == 0x0021 or section_id == 0x0022:
817		- MODULETYPE_Id = section_id
818		- MODULETYPE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
819		- section_id = struct.unpack("<H", dir_stream.read(2))[0]
820		- if section_id == 0x0025:
821		- MODULEREADONLY_Id = section_id
822		- check_value('MODULEREADONLY_Id', 0x0025, MODULEREADONLY_Id)
823		- MODULEREADONLY_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
824		- check_value('MODULEREADONLY_Reserved', 0x0000, MODULEREADONLY_Reserved)
825		- section_id = struct.unpack("<H", dir_stream.read(2))[0]
826		- if section_id == 0x0028:
827		- MODULEPRIVATE_Id = section_id
828		- check_value('MODULEPRIVATE_Id', 0x0028, MODULEPRIVATE_Id)
829		- MODULEPRIVATE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
830		- check_value('MODULEPRIVATE_Reserved', 0x0000, MODULEPRIVATE_Reserved)
831		- section_id = struct.unpack("<H", dir_stream.read(2))[0]
832		- if section_id == 0x002B: # TERMINATOR
833		- MODULE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
834		- check_value('MODULE_Reserved', 0x0000, MODULE_Reserved)
835		- section_id = None
836		- if section_id != None:
837		- logging.warning('unknown or invalid module section id {0:04X}'.format(section_id))
838		-
839		- logging.debug('Project CodePage = %d' % PROJECTCODEPAGE_CodePage)
840		- vba_codec = 'cp%d' % PROJECTCODEPAGE_CodePage
841		- logging.debug("ModuleName = {0}".format(MODULENAME_ModuleName))
842		- logging.debug("StreamName = {0}".format(repr(MODULESTREAMNAME_StreamName)))
843		- streamname_unicode = MODULESTREAMNAME_StreamName.decode(vba_codec)
844		- logging.debug("StreamName.decode('%s') = %s" % (vba_codec, repr(streamname_unicode)))
845		- logging.debug("StreamNameUnicode = {0}".format(repr(MODULESTREAMNAME_StreamNameUnicode)))
846		- logging.debug("TextOffset = {0}".format(MODULEOFFSET_TextOffset))
847		-
848		- code_path = vba_root + u'VBA/' + streamname_unicode
849		- #TODO: test if stream exists
850		- logging.debug('opening VBA code stream %s' % repr(code_path))
851		- code_data = ole.openstream(code_path).read()
852		- logging.debug("length of code_data = {0}".format(len(code_data)))
853		- logging.debug("offset of code_data = {0}".format(MODULEOFFSET_TextOffset))
854		- code_data = code_data[MODULEOFFSET_TextOffset:]
855		- if len(code_data) > 0:
856		- code_data = decompress_stream(code_data)
857		- # case-insensitive search in the code_modules dict to find the file extension:
858		- filext = code_modules.get(MODULENAME_ModuleName.lower(), 'bin')
859		- filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext)
860		- #TODO: also yield the codepage so that callers can decode it properly
861		- yield (code_path, filename, code_data)
862		- # print '-'*79
863		- # print filename
864		- # print ''
865		- # print code_data
866		- # print ''
867		- logging.debug('extracted file {0}'.format(filename))
868		- else:
869		- logging.warning("module stream {0} has code data length 0".format(MODULESTREAMNAME_StreamName))
870		- return
871		-
872		-
def filter_vba(vba_code):
    """
    Strip the leading "Attribute VB_" header lines from VBA source code.

    These lines are automatically added by MS Office and are not displayed in
    the VBA Editor. This should only be used when displaying source code for
    human analysis.

    Note: a header line containing a colon is never removed (and it ends the
    filtering), because a colon could be used to hide malicious instructions.
    The remaining lines are joined back with LF line endings.

    :param vba_code: str, VBA source code
    :return: str, filtered VBA source code
    """
    lines = vba_code.splitlines()
    # index of the first line to keep; stays at len(lines) when every line is a header:
    first_kept = len(lines)
    for index, text in enumerate(lines):
        is_hidden_attribute = text.startswith("Attribute VB_") and ':' not in text
        if not is_hidden_attribute:
            first_kept = index
            break
    #TODO: also remove empty lines?
    return '\n'.join(lines[first_kept:])
895		-
896		-
def detect_autoexec(vba_code, obfuscation=None):
    """
    Detect if the VBA code contains keywords corresponding to macros running
    automatically when triggered by specific actions (e.g. when a document is
    opened or closed).

    Each keyword of AUTOEXEC_KEYWORDS is searched case-insensitively and as a
    whole word (regex word boundaries on both sides).

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword, description)
    """
    #TODO: merge code with detect_suspicious
    obf_text = ''
    if obfuscation:
        obf_text = ' (obfuscation: %s)' % obfuscation
    results = []
    for description, keywords in AUTOEXEC_KEYWORDS.items():
        for keyword in keywords:
            #TODO: if keyword is already a compiled regex, use it as-is
            # The regex is only there to enforce word boundaries; the keyword
            # itself is literal text, so it must be escaped, otherwise
            # characters such as '.' or '\' would be read as regex syntax:
            if re.search(r'(?i)\b' + re.escape(keyword) + r'\b', vba_code):
                results.append((keyword, description + obf_text))
    return results
922		-
923		-
def detect_suspicious(vba_code, obfuscation=None):
    """
    Detect if the VBA code contains suspicious keywords corresponding to
    potential malware behaviour.

    Each keyword of SUSPICIOUS_KEYWORDS is searched case-insensitively and as
    a whole word (regex word boundaries on both sides).

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword, description)
    """
    obf_text = ''
    if obfuscation:
        obf_text = ' (obfuscation: %s)' % obfuscation
    results = []
    for description, keywords in SUSPICIOUS_KEYWORDS.items():
        for keyword in keywords:
            # The regex is only there to enforce word boundaries; the keyword
            # itself is literal text, so it must be escaped, otherwise
            # characters such as '.' or '\' would be read as regex syntax:
            if re.search(r'(?i)\b' + re.escape(keyword) + r'\b', vba_code):
                results.append((keyword, description + obf_text))
    return results
946		-
947		-
def detect_patterns(vba_code, obfuscation=None):
    """
    Detect if the VBA code contains specific patterns such as IP addresses,
    URLs, e-mail addresses, executable file names, etc.

    Every (pattern type, compiled regex) pair of RE_PATTERNS is applied in
    order. A matched value is reported only once, under the first pattern
    type that matched it.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to the pattern type
    :return: list of str tuples (pattern type, value)
    """
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    seen_values = set()
    hits = []
    for pattern_type, pattern_re in RE_PATTERNS:
        for match in pattern_re.finditer(vba_code):
            matched_text = match.group()
            if matched_text in seen_values:
                # already reported, possibly under another pattern type
                continue
            hits.append((pattern_type + suffix, matched_text))
            seen_values.add(matched_text)
    return hits
968		-
969		-
def detect_hex_strings(vba_code):
    """
    Detect if the VBA code contains strings encoded in hexadecimal.

    Candidates are the matches of re_hex_string; each distinct candidate is
    decoded once with binascii.unhexlify, in order of first appearance.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    decoded_pairs = []
    seen = set()
    for hex_match in re_hex_string.finditer(vba_code):
        hex_text = hex_match.group()
        if hex_text in seen:
            continue
        decoded_pairs.append((hex_text, binascii.unhexlify(hex_text)))
        seen.add(hex_text)
    return decoded_pairs
986		-
987		-
def detect_base64_strings(vba_code):
    """
    Detect if the VBA code contains strings encoded in base64.

    Candidates are the matches of re_base64_string, with surrounding double
    quotes removed. A candidate is ignored when re_nothex_check does not match
    it (it is then just a hex string), when its lowercase form is in
    BASE64_WHITELIST, or when it cannot be decoded as base64.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    #TODO: avoid matching simple hex strings as base64?
    results = []
    found = set()
    for match in re_base64_string.finditer(vba_code):
        # extract the base64 string without quotes:
        value = match.group().strip('"')
        # check it is not just a hex string:
        if not re_nothex_check.search(value):
            continue
        # only keep new values and not in the whitelist:
        if value in found or value.lower() in BASE64_WHITELIST:
            continue
        try:
            decoded = base64.b64decode(value)
        except (TypeError, ValueError, binascii.Error):
            # A decoding error means it is likely not a base64-encoded string.
            # Only decoding errors are caught (TypeError on Python 2,
            # binascii.Error/ValueError on Python 3), so that KeyboardInterrupt
            # and SystemExit are no longer swallowed.
            continue
        results.append((value, decoded))
        found.add(value)
    return results
1014		-
1015		-
def detect_dridex_strings(vba_code):
    """
    Detect if the VBA code contains strings obfuscated with a specific algorithm found in Dridex samples.

    Candidates are the matches of re_dridex_string, with their first and last
    characters removed (presumably the surrounding quotes - TODO confirm
    against the regex). A candidate is ignored when re_nothex_check does not
    match it (it is then just a hex string) or when the decoder fails on it.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    # imported here so that the third-party decoder is only loaded when needed:
    from thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode
    results = []
    found = set()
    for match in re_dridex_string.finditer(vba_code):
        value = match.group()[1:-1]
        # check it is not just a hex string:
        if not re_nothex_check.search(value):
            continue
        if value in found:
            continue
        try:
            decoded = DridexUrlDecode(value)
        except Exception:
            # If an exception occurs, it is likely not a dridex-encoded string.
            # The decoder is third-party code, so any ordinary error is treated
            # as "not decodable"; KeyboardInterrupt and SystemExit are no
            # longer swallowed as they were with a bare except.
            continue
        results.append((value, decoded))
        found.add(value)
    return results
1040		-
1041		-
class VBA_Scanner(object):
    """
    Class to scan the source code of a VBA module to find obfuscated strings,
    suspicious keywords, IOCs, auto-executable macros, etc.
    """

    def __init__(self, vba_code):
        """
        VBA_Scanner constructor

        :param vba_code: str, VBA source code to be analyzed
        """
        self.code = vba_code
        # Text made of all the decoded strings, one per line, for each kind of
        # obfuscation. These buffers are rebuilt by each call to scan():
        self.code_hex = ''
        self.code_hex_rev = ''
        self.code_rev_hex = ''
        self.code_base64 = ''
        self.code_dridex = ''
        # Results of the last scan, empty until scan() has been called:
        self.strReverse = False
        self.hex_strings = []
        self.base64_strings = []
        self.dridex_strings = []
        self.autoexec_keywords = []
        self.suspicious_keywords = []
        self.iocs = []

    def scan(self, include_decoded_strings=False):
        """
        Analyze the provided VBA code to detect suspicious keywords,
        auto-executable macros, IOC patterns, obfuscation patterns
        such as hex-encoded strings.

        The detailed results are also stored in the attributes hex_strings,
        base64_strings, dridex_strings, autoexec_keywords, suspicious_keywords
        and iocs. This method may be called several times: all the results
        and decoded text buffers are rebuilt from scratch on each call.

        :param include_decoded_strings: bool, if True, all encoded strings will be included with their decoded content.
        :return: list of tuples (type, keyword, description)
        (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex string')
        """
        # First, detect and extract hex-encoded strings:
        self.hex_strings = detect_hex_strings(self.code)
        # detect if the code contains StrReverse:
        self.strReverse = 'strreverse' in self.code.lower()
        # Then gather the decoded strings, to detect obfuscated IOCs and keywords.
        # The buffers are assigned rather than appended to, so that calling
        # scan() again does not duplicate their content:
        self.code_hex = ''.join('\n' + decoded for _, decoded in self.hex_strings)
        if self.strReverse:
            # if the code contains "StrReverse", also use the hex strings in reverse order.
            # StrReverse after hex decoding:
            self.code_hex_rev = ''.join(
                '\n' + decoded[::-1] for _, decoded in self.hex_strings)
            # StrReverse before hex decoding:
            self.code_rev_hex = ''.join(
                '\n' + binascii.unhexlify(encoded[::-1]) for encoded, _ in self.hex_strings)
            #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/
            #TODO: also append the full code reversed if StrReverse? (risk of false positives?)
        else:
            self.code_hex_rev = ''
            self.code_rev_hex = ''
        # Detect Base64-encoded strings
        self.base64_strings = detect_base64_strings(self.code)
        self.code_base64 = ''.join('\n' + decoded for _, decoded in self.base64_strings)
        # Detect Dridex-encoded strings
        self.dridex_strings = detect_dridex_strings(self.code)
        self.code_dridex = ''.join('\n' + decoded for _, decoded in self.dridex_strings)

        self.autoexec_keywords = []
        self.suspicious_keywords = []
        self.iocs = []
        # Search keywords and patterns in the original code and in each decoded text:
        for code, obfuscation in (
                (self.code, None),
                (self.code_hex, 'Hex'),
                (self.code_hex_rev, 'Hex+StrReverse'),
                (self.code_rev_hex, 'StrReverse+Hex'),
                (self.code_base64, 'Base64'),
                (self.code_dridex, 'Dridex'),
        ):
            self.autoexec_keywords += detect_autoexec(code, obfuscation)
            self.suspicious_keywords += detect_suspicious(code, obfuscation)
            self.iocs += detect_patterns(code, obfuscation)

        # If encoded strings were discovered, add an item to suspicious keywords:
        if self.hex_strings:
            self.suspicious_keywords.append(('Hex Strings',
                'Hex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.base64_strings:
            self.suspicious_keywords.append(('Base64 Strings',
                'Base64-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.dridex_strings:
            self.suspicious_keywords.append(('Dridex Strings',
                'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))

        # Build the flat list of results:
        results = [('AutoExec', keyword, description)
                   for keyword, description in self.autoexec_keywords]
        results.extend(('Suspicious', keyword, description)
                       for keyword, description in self.suspicious_keywords)
        results.extend(('IOC', value, pattern_type)
                       for pattern_type, value in self.iocs)
        if include_decoded_strings:
            results.extend(('Hex String', repr(decoded), encoded)
                           for encoded, decoded in self.hex_strings)
            results.extend(('Base64 String', repr(decoded), encoded)
                           for encoded, decoded in self.base64_strings)
            # note: this label is spelled with a lowercase "s", kept as-is
            # because callers may compare against it
            results.extend(('Dridex string', repr(decoded), encoded)
                           for encoded, decoded in self.dridex_strings)
        return results

    def scan_summary(self):
        """
        Analyze the provided VBA code to detect suspicious keywords,
        auto-executable macros, IOC patterns, obfuscation patterns
        such as hex-encoded strings.

        This runs a full scan() each time it is called.

        :return: tuple with the number of items found for each category:
            (autoexec, suspicious, IOCs, hex, base64, dridex)
        """
        self.scan()
        return (len(self.autoexec_keywords), len(self.suspicious_keywords),
                len(self.iocs), len(self.hex_strings), len(self.base64_strings),
                len(self.dridex_strings))
1151		-
1152		-
1153		-
def scan_vba(vba_code, include_decoded_strings=False):
    """
    Analyze the provided VBA code to detect suspicious keywords,
    auto-executable macros, IOC patterns, obfuscation patterns
    such as hex-encoded strings.
    (shortcut for VBA_Scanner(vba_code).scan())

    :param vba_code: str, VBA source code to be analyzed
    :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content.
        Defaults to False, like VBA_Scanner.scan.
    :return: list of tuples (type, keyword, description)
    (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex string')
    """
    return VBA_Scanner(vba_code).scan(include_decoded_strings)
1167		-
1168		-
1169		-#=== CLASSES =================================================================
1170		-
1171		-class VBA_Parser(object):
1172		- """
1173		- Class to parse MS Office files, to detect VBA macros and extract VBA source code
1174		- Supported file formats:
1175		- - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
1176		- - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
1177		- - PowerPoint 2007+ (.pptm, .ppsm)
1178		- """
1179		-
1180		- def __init__(self, filename, data=None):
1181		- """
1182		- Constructor for VBA_Parser
1183		-
1184		- :param filename: filename or path of file to parse, or file-like object
1185		-
1186		- :param data: None or bytes str, if None the file will be read from disk (or from the file-like object).
1187		- If data is provided as a bytes string, it will be parsed as the content of the file in memory,
1188		- and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb').
1189		- """
1190		- #TODO: filename should only be a string, data should be used for the file-like object
1191		- #TODO: filename should be mandatory, optional data is a string or file-like object
1192		- #TODO: also support olefile and zipfile as input
1193		- if data is None:
1194		- # open file from disk:
1195		- _file = filename
1196		- else:
1197		- # file already read in memory, make it a file-like object for zipfile:
1198		- _file = cStringIO.StringIO(data)
1199		- #self.file = _file
1200		- self.ole_file = None
1201		- self.ole_subfiles = []
1202		- self.filename = filename
1203		- self.type = None
1204		- self.vba_projects = None
1205		- # if filename is None:
1206		- # if isinstance(_file, basestring):
1207		- # if len(_file) < olefile.MINIMAL_OLEFILE_SIZE:
1208		- # self.filename = _file
1209		- # else:
1210		- # self.filename = '<file in bytes string>'
1211		- # else:
1212		- # self.filename = '<file-like object>'
1213		- if olefile.isOleFile(_file):
1214		- # This looks like an OLE file
1215		- logging.info('Parsing OLE file %s' % self.filename)
1216		- # Open and parse the OLE file, using unicode for path names:
1217		- self.ole_file = olefile.OleFileIO(_file, path_encoding=None)
1218		- self.type = TYPE_OLE
1219		- #TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet
1220		- elif zipfile.is_zipfile(_file):
1221		- # This looks like a zip file, need to look for vbaProject.bin inside
1222		- # It can be any OLE file inside the archive
1223		- #...because vbaProject.bin can be renamed:
1224		- # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
1225		- logging.info('Opening ZIP/OpenXML file %s' % self.filename)
1226		- self.type = TYPE_OpenXML
1227		- z = zipfile.ZipFile(_file)
1228		- #TODO: check if this is actually an OpenXML file
1229		- #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically?
1230		- # check each file within the zip if it is an OLE file, by reading its magic:
1231		- for subfile in z.namelist():
1232		- magic = z.open(subfile).read(len(olefile.MAGIC))
1233		- if magic == olefile.MAGIC:
1234		- logging.debug('Opening OLE file %s within zip' % subfile)
1235		- ole_data = z.open(subfile).read()
1236		- try:
1237		- self.ole_subfiles.append(VBA_Parser(filename=subfile, data=ole_data))
1238		- except:
1239		- logging.debug('%s is not a valid OLE file' % subfile)
1240		- continue
1241		- z.close()
1242		- else:
1243		- # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML,
1244		- # or a plain text file containing VBA code
1245		- if data is None:
1246		- data = open(filename, 'rb').read()
1247		- # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
1248		- if 'http://schemas.microsoft.com/office/word/2003/wordml' in data:
1249		- logging.info('Opening Word 2003 XML file %s' % self.filename)
1250		- self.type = TYPE_Word2003_XML
1251		- # parse the XML content
1252		- et = ET.fromstring(data)
1253		- # find all the binData elements:
1254		- for bindata in et.getiterator(TAG_BINDATA):
1255		- # the binData content is an OLE container for the VBA project, compressed
1256		- # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
1257		- # get the filename:
1258		- fname = bindata.get(ATTR_NAME, 'noname.mso')
1259		- # decode the base64 activemime
1260		- activemime = binascii.a2b_base64(bindata.text)
1261		- # decompress the zlib data starting at offset 0x32, which is the OLE container:
1262		- ole_data = zlib.decompress(activemime[0x32:])
1263		- try:
1264		- self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))
1265		- except:
1266		- logging.debug('%s is not a valid OLE file' % fname)
1267		- continue
1268		- #TODO: handle exceptions
1269		- #TODO: Excel 2003 XML
1270		- #TODO: plain text VBA file
1271		- else:
1272		- msg = '%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % self.filename
1273		- logging.error(msg)
1274		- raise TypeError(msg)
1275		-
1276		- def find_vba_projects (self):
1277		- """
1278		- Finds all the VBA projects stored in an OLE file.
1279		-
1280		- Return None if the file is not OLE but OpenXML.
1281		- Return a list of tuples (vba_root, project_path, dir_path) for each VBA project.
1282		- vba_root is the path of the root OLE storage containing the VBA project,
1283		- including a trailing slash unless it is the root of the OLE file.
1284		- project_path is the path of the OLE stream named "PROJECT" within the VBA project.
1285		- dir_path is the path of the OLE stream named "VBA/dir" within the VBA project.
1286		-
1287		- If this function returns an empty list for one of the supported formats
1288		- (i.e. Word, Excel, Powerpoint except Powerpoint 97-2003), then the
1289		- file does not contain VBA macros.
1290		-
1291		- :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path)
1292		- for each VBA project found if OLE file
1293		- """
1294		- # if the file is not OLE but OpenXML, return None:
1295		- if self.ole_file is None:
1296		- return None
1297		-
1298		- # if this method has already been called, return previous result:
1299		- if self.vba_projects is not None:
1300		- return self.vba_projects
1301		-
1302		- # Find the VBA project root (different in MS Word, Excel, etc):
1303		- # - Word 97-2003: Macros
1304		- # - Excel 97-2003: _VBA_PROJECT_CUR
1305		- # - PowerPoint 97-2003: not supported yet (different file structure)
1306		- # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin.
1307		- # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word
1308		- # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word
1309		- # - Visio 2007: not supported yet (different file structure)
1310		-
1311		- # According to MS-OVBA section 2.2.1:
1312		- # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream
1313		- # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream
1314		- # - all names are case-insensitive
1315		-
1316		- # start with an empty list:
1317		- self.vba_projects = []
1318		- # Look for any storage containing those storage/streams:
1319		- ole = self.ole_file
1320		- for storage in ole.listdir(streams=False, storages=True):
1321		- # Look for a storage ending with "VBA":
1322		- if storage[-1].upper() == 'VBA':
1323		- logging.debug('Found VBA storage: %s' % ('/'.join(storage)))
1324		- vba_root = '/'.join(storage[:-1])
1325		- # Add a trailing slash to vba_root, unless it is the root of the OLE file:
1326		- # (used later to append all the child streams/storages)
1327		- if vba_root != '':
1328		- vba_root += '/'
1329		- logging.debug('Checking vba_root="%s"' % vba_root)
1330		-
1331		- def check_vba_stream(ole, vba_root, stream_path):
1332		- full_path = vba_root + stream_path
1333		- if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM:
1334		- logging.debug('Found %s stream: %s' % (stream_path, full_path))
1335		- return full_path
1336		- else:
1337		- logging.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path)
1338		- return False
1339		-
1340		- # Check if the VBA root storage also contains a PROJECT stream:
1341		- project_path = check_vba_stream(ole, vba_root, 'PROJECT')
1342		- if not project_path: continue
1343		- # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream:
1344		- vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT')
1345		- if not vba_project_path: continue
1346		- # Check if the VBA root storage also contains a VBA/dir stream:
1347		- dir_path = check_vba_stream(ole, vba_root, 'VBA/dir')
1348		- if not dir_path: continue
1349		- # Now we are pretty sure it is a VBA project structure
1350		- logging.debug('VBA root storage: "%s"' % vba_root)
1351		- # append the results to the list as a tuple for later use:
1352		- self.vba_projects.append((vba_root, project_path, dir_path))
1353		- return self.vba_projects
1354		-
1355		- def detect_vba_macros(self):
1356		- """
1357		- Detect the potential presence of VBA macros in the file, by checking
1358		- if it contains VBA projects. Both OLE and OpenXML files are supported.
1359		-
1360		- Important: for now, results are accurate only for Word, Excel and PowerPoint
1361		- EXCEPT Powerpoint 97-2003, which has a different structure for VBA.
1362		-
1363		- Note: this method does NOT attempt to check the actual presence or validity
1364		- of VBA macro source code, so there might be false positives.
1365		- It may also detect VBA macros in files embedded within the main file,
1366		- for example an Excel workbook with macros embedded into a Word
1367		- document without macros may be detected, without distinction.
1368		-
1369		- :return: bool, True if at least one VBA project has been found, False otherwise
1370		- """
1371		- #TODO: return None or raise exception if format not supported like PPT 97-2003
1372		- #TODO: return the number of VBA projects found instead of True/False?
1373		- # if OpenXML, check all the OLE subfiles:
1374		- if self.ole_file is None:
1375		- for ole_subfile in self.ole_subfiles:
1376		- if ole_subfile.detect_vba_macros():
1377		- return True
1378		- return False
1379		- # otherwise it's an OLE file, find VBA projects:
1380		- vba_projects = self.find_vba_projects()
1381		- if len(vba_projects) == 0:
1382		- return False
1383		- else:
1384		- return True
1385		-
1386		-
1387		- def extract_macros (self):
1388		- """
1389		- Extract and decompress source code for each VBA macro found in the file
1390		-
1391		- Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found
1392		- If the file is OLE, filename is the path of the file.
1393		- If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros
1394		- within the zip archive, e.g. word/vbaProject.bin.
1395		- """
1396		- if self.ole_file is None:
1397		- for ole_subfile in self.ole_subfiles:
1398		- for results in ole_subfile.extract_macros():
1399		- yield results
1400		- else:
1401		- self.find_vba_projects()
1402		- for vba_root, project_path, dir_path in self.vba_projects:
1403		- # extract all VBA macros from that VBA root storage:
1404		- for stream_path, vba_filename, vba_code in _extract_vba(self.ole_file, vba_root, project_path, dir_path):
1405		- yield (self.filename, stream_path, vba_filename, vba_code)
1406		-
1407		-
1408		- def close(self):
1409		- """
1410		- Close all the open files. This method must be called after usage, if
1411		- the application is opening many files.
1412		- """
1413		- if self.ole_file is None:
1414		- for ole_subfile in self.ole_subfiles:
1415		- ole_subfile.close()
1416		- else:
1417		- self.ole_file.close()
1418		-
1419		-
def print_analysis(vba_code, show_decoded_strings=False):
    """
    Run scan_vba on the provided VBA code and print the findings as a table
    with the columns Type, Keyword and Description, or a one-line notice when
    the scan returns nothing.

    :param vba_code: str, VBA source code to be analyzed
    :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
    :return: None
    """
    findings = scan_vba(vba_code, show_decoded_strings)
    if not findings:
        print('No suspicious keyword or IOC found.')
        return
    table = prettytable.PrettyTable(('Type', 'Keyword', 'Description'))
    table.align = 'l'
    # column width limits:
    for column, width in (('Type', 10), ('Keyword', 20), ('Description', 39)):
        table.max_width[column] = width
    for kw_type, keyword, description in findings:
        table.add_row((kw_type, keyword, description))
    print(table)
1440		-
1441		-
1442		-
def process_file(container, filename, data, show_decoded_strings=False):
    """
    Process a single file in detailed mode: print its type, then the filtered
    source code and the analysis table of every VBA macro it contains.

    Errors raised while parsing or scanning are printed with their full stack
    trace and do not stop the caller. KeyboardInterrupt and SystemExit are
    not intercepted, so the user can still abort a long scan.
    The parser is always closed before returning.

    :param container: str, path and filename of container if the file is within
        a zip archive, None otherwise.
    :param filename: str, path and filename of file on disk, or within the container.
    :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
    :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
    """
    #TODO: replace print by writing to a provided output file (sys.stdout by default)
    if container:
        display_filename = '%s in %s' % (filename, container)
    else:
        display_filename = filename
    print('=' * 79)
    print('FILE: %s' % display_filename)
    vba = None
    try:
        #TODO: handle olefile errors, when an OLE file is malformed
        vba = VBA_Parser(filename, data)
        print('Type: %s' % vba.type)
        if vba.detect_vba_macros():
            for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
                # hide attribute lines:
                #TODO: option to disable attribute filtering
                vba_code_filtered = filter_vba(vba_code)
                print('-' * 79)
                print('VBA MACRO %s ' % vba_filename)
                print('in file: %s - OLE stream: %s' % (subfilename, repr(stream_path)))
                print('- ' * 39)
                # detect empty macros:
                if vba_code_filtered.strip() == '':
                    print('(empty macro)')
                else:
                    print(vba_code_filtered)
                    print('- ' * 39)
                    print('ANALYSIS:')
                    # analyse the whole code, filtered to avoid false positives:
                    print_analysis(vba_code_filtered, show_decoded_strings)
        else:
            print('No VBA macros found.')
    except Exception:
        #TODO: print more info if debug mode
        # display the exception with full stack trace for debugging, but do not stop.
        # Only Exception is caught, so that Ctrl+C and sys.exit() still work:
        traceback.print_exc()
    finally:
        # release the file handles held by the parser, even after an error:
        if vba is not None:
            try:
                vba.close()
            except Exception:
                traceback.print_exc()
    print('')
1492		-
1493		-
def process_file_triage(container, filename, data):
    """
    Process a single file in triage mode and print a one-line summary:
    a flags field, the filename and an optional message.

    The flags field starts with the file type ('OLE:', 'OpX:' or 'XML:'),
    followed by one character per indicator, either its letter or '-':
    M (macros), A (auto-executable), S (suspicious keywords), I (IOCs),
    H (hex strings), B (Base64 strings), D (Dridex strings).
    If a TypeError is raised the flags are '?' with the message
    'File format not supported'; for any other exception they are '!ERROR'
    with the exception as message. KeyboardInterrupt and SystemExit are not
    intercepted. The parser is always closed before returning.

    :param container: str, path and filename of container if the file is within
        a zip archive, None otherwise (not used by this function).
    :param filename: str, path and filename of file on disk, or within the container.
    :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
    """
    #TODO: replace print by writing to a provided output file (sys.stdout by default)
    nb_macros = 0
    nb_autoexec = 0
    nb_suspicious = 0
    nb_iocs = 0
    nb_hexstrings = 0
    nb_base64strings = 0
    nb_dridexstrings = 0
    message = ''
    vba = None
    try:
        #TODO: handle olefile errors, when an OLE file is malformed
        vba = VBA_Parser(filename, data)
        if vba.detect_vba_macros():
            for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
                nb_macros += 1
                if vba_code.strip() != '':
                    # analyse the whole code, filtered to avoid false positives:
                    scanner = VBA_Scanner(filter_vba(vba_code))
                    autoexec, suspicious, iocs, hexstrings, base64strings, dridex = scanner.scan_summary()
                    nb_autoexec += autoexec
                    nb_suspicious += suspicious
                    nb_iocs += iocs
                    nb_hexstrings += hexstrings
                    nb_base64strings += base64strings
                    nb_dridexstrings += dridex
        if vba.type == TYPE_OLE:
            flags = 'OLE:'
        elif vba.type == TYPE_OpenXML:
            flags = 'OpX:'
        elif vba.type == TYPE_Word2003_XML:
            flags = 'XML:'
        else:
            # unexpected type: keep a defined prefix instead of failing on an
            # unbound variable
            flags = '???:'
        # one letter per indicator with a non-zero count, '-' otherwise:
        indicators = (
            (nb_macros, 'M'),
            (nb_autoexec, 'A'),
            (nb_suspicious, 'S'),
            (nb_iocs, 'I'),
            (nb_hexstrings, 'H'),
            (nb_base64strings, 'B'),
            (nb_dridexstrings, 'D'),
        )
        flags += ''.join([letter if total else '-' for total, letter in indicators])
    except TypeError:
        # file type not OLE nor OpenXML
        flags = '?'
        message = 'File format not supported'
    except Exception:
        # another error occurred
        #TODO: print more info if debug mode
        #TODO: distinguish real errors from incorrect file types
        flags = '!ERROR'
        # sys.exc_info() replaces the deprecated, non thread-safe sys.exc_value
        message = sys.exc_info()[1]
    finally:
        # release the file handles held by the parser; a failure to close is
        # deliberately ignored so that the summary line is still printed
        if vba is not None:
            try:
                vba.close()
            except Exception:
                pass
    line = '%-11s %s' % (flags, filename)
    if message:
        line += ' - %s' % message
    print(line)
1569		-
1570		- # t = prettytable.PrettyTable(('filename', 'type', 'macros', 'autoexec', 'suspicious', 'ioc', 'hexstrings'),
1571		- # header=False, border=False)
1572		- # t.align = 'l'
1573		- # t.max_width['filename'] = 30
1574		- # t.max_width['type'] = 10
1575		- # t.max_width['macros'] = 6
1576		- # t.max_width['autoexec'] = 6
1577		- # t.max_width['suspicious'] = 6
1578		- # t.max_width['ioc'] = 6
1579		- # t.max_width['hexstrings'] = 6
1580		- # t.add_row((filename, ftype, macros, autoexec, suspicious, iocs, hexstrings))
1581		- # print t
1582		-
def main_triage_quick():
    """
    Placeholder, not implemented yet: does nothing and returns None.
    """
    return None
1585		-
1586		-#=== MAIN =====================================================================
1587		-
def main():
    """
    Main function, called when olevba is run from the command line.

    Parses the command line options, then:
    - without any filename nor option -i: prints the module documentation and
      the help, and exits;
    - with option -i: analyzes the given VBA source file directly and exits;
    - otherwise processes each file found by xglob.iter_files, in detailed
      mode (option -d without -t) or in triage mode (all other cases).
      When neither -t nor -d is given and exactly one file was processed,
      that file is also displayed in detailed mode after the triage table.
    """
    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)
    # parser.add_option('-o', '--outfile', dest='outfile',
    #     help='output file')
    # parser.add_option('-c', '--csv', dest='csv',
    #     help='export results to a CSV file')
    parser.add_option("-r", action="store_true", dest="recursive",
        help='find files recursively in subdirectories.')
    parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
        help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)')
    parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
        help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
    parser.add_option("-t", action="store_true", dest="triage_mode",
        help='triage mode, display results as a summary table (default for multiple files)')
    parser.add_option("-d", action="store_true", dest="detailed_mode",
        help='detailed mode, display full results (default for single file)')
    parser.add_option("-i", "--input", dest='input', type='str', default=None,
        help='input file containing VBA source code to be analyzed (no parsing)')
    parser.add_option("--decode", action="store_true", dest="show_decoded_strings",
        help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex).')

    (options, args) = parser.parse_args()

    # Print help if no arguments are passed
    if len(args) == 0 and not options.input:
        print(__doc__)
        parser.print_help()
        sys.exit()

    # print banner with version
    print('olevba %s - http://decalage.info/python/oletools' % __version__)

    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) #INFO)
    # For now, all logging is disabled:
    logging.disable(logging.CRITICAL)

    if options.input:
        # input file provided with VBA source code to be analyzed directly:
        print('Analysis of VBA source code from %s:' % options.input)
        source_file = open(options.input)
        try:
            vba_code = source_file.read()
        finally:
            # do not leave the input file open
            source_file.close()
        print_analysis(vba_code, show_decoded_strings=options.show_decoded_strings)
        sys.exit()

    # triage output is used unless -d is given without -t:
    triage_output = not options.detailed_mode or options.triage_mode
    if triage_output:
        print('%-11s %-65s' % ('Flags', 'Filename'))
        print('%-11s %-65s' % ('-'*11, '-'*65))
    previous_container = None
    count = 0
    # last file actually processed (directory entries skipped below do not count):
    last_processed = None
    for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
            zip_password=options.zip_password, zip_fname=options.zip_fname):
        # ignore directory names stored in zip files:
        if container and filename.endswith('/'):
            continue
        if triage_output:
            # print container name when it changes:
            if container != previous_container:
                if container is not None:
                    print('\nFiles in %s:' % container)
                previous_container = container
            # summarized output for triage:
            process_file_triage(container, filename, data)
        else:
            # fully detailed output
            process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings)
        count += 1
        last_processed = (container, filename, data)
    if triage_output:
        print('\n(Flags: OpX=OpenXML, XML=Word2003XML, M=Macros, A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, B=Base64 strings, D=Dridex strings, ?=Unknown)\n')

    if count == 1 and not options.triage_mode and not options.detailed_mode:
        # if options -t and -d were not specified and it's a single file, print details:
        #TODO: avoid doing the analysis twice by storing results
        container, filename, data = last_processed
        process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings)
1667		-
# Run main() only when this file is executed as a script, not when it is imported:
if __name__ == '__main__':
    main()
1670		-
	1	+#!/usr/bin/env python
	2	+"""
	3	+olevba.py
	4	+
	5	+olevba is a script to parse OLE and OpenXML files such as MS Office documents
	6	+(e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate
	7	+and analyze malicious macros.
	8	+
	9	+Supported formats:
	10	+- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
	11	+- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
	12	+- PowerPoint 2007+ (.pptm, .ppsm)
	13	+- Word 2003 XML (.xml)
	14	+
	15	+Author: Philippe Lagadec - http://www.decalage.info
	16	+License: BSD, see source code or documentation
	17	+
	18	+olevba is part of the python-oletools package:
	19	+http://www.decalage.info/python/oletools
	20	+
	21	+olevba is based on source code from officeparser by John William Davison
	22	+https://github.com/unixfreak0037/officeparser
	23	+"""
	24	+
	25	+#=== LICENSE ==================================================================
	26	+
	27	+# olevba is copyright (c) 2014-2015 Philippe Lagadec (http://www.decalage.info)
	28	+# All rights reserved.
	29	+#
	30	+# Redistribution and use in source and binary forms, with or without modification,
	31	+# are permitted provided that the following conditions are met:
	32	+#
	33	+# * Redistributions of source code must retain the above copyright notice, this
	34	+# list of conditions and the following disclaimer.
	35	+# * Redistributions in binary form must reproduce the above copyright notice,
	36	+# this list of conditions and the following disclaimer in the documentation
	37	+# and/or other materials provided with the distribution.
	38	+#
	39	+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	40	+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	41	+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	42	+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
	43	+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	44	+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
	45	+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	46	+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	47	+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	48	+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	49	+
	50	+
	51	+# olevba contains modified source code from the officeparser project, published
	52	+# under the following MIT License (MIT):
	53	+#
	54	+# officeparser is copyright (c) 2014 John William Davison
	55	+#
	56	+# Permission is hereby granted, free of charge, to any person obtaining a copy
	57	+# of this software and associated documentation files (the "Software"), to deal
	58	+# in the Software without restriction, including without limitation the rights
	59	+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	60	+# copies of the Software, and to permit persons to whom the Software is
	61	+# furnished to do so, subject to the following conditions:
	62	+#
	63	+# The above copyright notice and this permission notice shall be included in all
	64	+# copies or substantial portions of the Software.
	65	+#
	66	+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	67	+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	68	+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	69	+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	70	+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	71	+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	72	+# SOFTWARE.
	73	+
	74	+#------------------------------------------------------------------------------
	75	+# CHANGELOG:
	76	+# 2014-08-05 v0.01 PL: - first version based on officeparser code
	77	+# 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser
	78	+# 2014-08-15 PL: - fixed incorrect value check in PROJECTHELPFILEPATH Record
	79	+# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats
	80	+# and to find the VBA project root anywhere in the file
	81	+# 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL
	82	+# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API
	83	+# - added detect_vba_macros
	84	+# 2014-12-10 v0.06 PL: - hide first lines with VB attributes
	85	+# - detect auto-executable macros
	86	+# - ignore empty macros
	87	+# 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive
	88	+# 2014-12-15 v0.08 PL: - improved display for empty macros
	89	+# - added pattern extraction
	90	+# 2014-12-25 v0.09 PL: - added suspicious keywords detection
	91	+# 2014-12-27 v0.10 PL: - added OptionParser, main and process_file
	92	+# - uses xglob to scan several files with wildcards
	93	+# - option -r to recurse subdirectories
	94	+# - option -z to scan files in password-protected zips
	95	+# 2015-01-02 v0.11 PL: - improved filter_vba to detect colons
	96	+# 2015-01-03 v0.12 PL: - fixed detect_patterns to detect all patterns
	97	+# - process_file: improved display, shows container file
	98	+# - improved list of executable file extensions
	99	+# 2015-01-04 v0.13 PL: - added several suspicious keywords, improved display
	100	+# 2015-01-08 v0.14 PL: - added hex strings detection and decoding
	101	+# - fixed issue #2, decoding VBA stream names using
	102	+# specified codepage and unicode stream names
	103	+# 2015-01-11 v0.15 PL: - added new triage mode, options -t and -d
	104	+# 2015-01-16 v0.16 PL: - fix for issue #3 (exception when module name="text")
	105	+# - added several suspicious keywords
	106	+# - added option -i to analyze VBA source code directly
	107	+# 2015-01-17 v0.17 PL: - removed .com from the list of executable extensions
	108	+# - added scan_vba to run all detection algorithms
	109	+# - decoded hex strings are now also scanned + reversed
	110	+# 2015-01-23 v0.18 PL: - fixed issue #3, case-insensitive search in code_modules
	111	+# 2015-01-24 v0.19 PL: - improved the detection of IOCs obfuscated with hex
	112	+# strings and StrReverse
	113	+# 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded
	114	+# 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding
	115	+# - improved display, shows obfuscation name
	116	+# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename
	117	+# - added Base64 obfuscation decoding (contribution from
	118	+# @JamesHabben)
	119	+# 2015-02-03 v0.23 PL: - triage now uses VBA_Scanner results, shows Base64 and
	120	+# Dridex strings
	121	+# - exception handling in detect_base64_strings
	122	+# 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display
	123	+# - display exceptions with stack trace
	124	+# - added several suspicious keywords
	125	+# - improved Base64 detection and decoding
	126	+# - fixed triage mode not to scan attrib lines
	127	+# 2015-03-04 v0.25 PL: - added support for Word 2003 XML
	128	+
	129	+__version__ = '0.25'
	130	+
	131	+#------------------------------------------------------------------------------
	132	+# TODO:
	133	+# + do not use logging, but a provided logger (null logger by default)
	134	+# + setup logging (common with other oletools)
	135	+# + add xor bruteforcing like bbharvest
	136	+# + add chr() decoding
	137	+
	138	+# TODO later:
	139	+# + performance improvement: instead of searching each keyword separately,
	140	+# first split vba code into a list of words (per line), then check each
	141	+# word against a dict. (or put vba words into a set/dict?)
	142	+# + for regex, maybe combine them into a single re with named groups?
	143	+# + add Yara support, include sample rules? plugins like balbuzard?
	144	+# + add balbuzard support
	145	+# + output to file (replace print by file.write, sys.stdout by default)
	146	+# + look for VBA in embedded documents (e.g. Excel in Word)
	147	+# + support SRP streams (see Lenny's article + links and sample)
	148	+# - python 3.x support
	149	+# - add support for PowerPoint macros (see libclamav, libgsf), use oledump heuristic?
	150	+# - check VBA macros in Visio, Access, Project, etc
	151	+# - extract_macros: convert to a class, split long function into smaller methods
	152	+# - extract_macros: read bytes from stream file objects instead of strings
	153	+# - extract_macros: use combined struct.unpack instead of many calls
	154	+
	155	+#------------------------------------------------------------------------------
	156	+# REFERENCES:
	157	+# - [MS-OVBA]: Microsoft Office VBA File Format Structure
	158	+# http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx
	159	+# - officeparser: https://github.com/unixfreak0037/officeparser
	160	+
	161	+
	162	+#--- IMPORTS ------------------------------------------------------------------
	163	+
	164	+import sys, logging
	165	+import struct
	166	+import cStringIO
	167	+import math
	168	+import zipfile
	169	+import re
	170	+import optparse
	171	+import os.path
	172	+import binascii
	173	+import base64
	174	+import traceback
	175	+import zlib
	176	+
	177	+# import lxml or ElementTree for XML parsing:
	178	+try:
	179	+ # lxml: best performance for XML processing
	180	+ import lxml.etree as ET
	181	+except ImportError:
	182	+ try:
	183	+ # Python 2.5+: batteries included
	184	+ import xml.etree.cElementTree as ET
	185	+ except ImportError:
	186	+ try:
	187	+ # Python <2.5: standalone ElementTree install
	188	+ import elementtree.cElementTree as ET
	189	+ except ImportError:
	190	+ raise ImportError, "lxml or ElementTree are not installed, "\
	191	+ +"see http://codespeak.net/lxml "\
	192	+ +"or http://effbot.org/zone/element-index.htm"
	193	+
	194	+import thirdparty.olefile as olefile
	195	+from thirdparty.prettytable import prettytable
	196	+from thirdparty.xglob import xglob
	197	+
	198	+#--- CONSTANTS ----------------------------------------------------------------
	199	+
# Container types: VBA_Parser stores one of these in its "type" attribute for
# ZIP/OpenXML and Word 2003 XML files, and process_file_triage compares
# against all three to build the flags prefix:
TYPE_OLE = 'OLE'
TYPE_OpenXML = 'OpenXML'
TYPE_Word2003_XML = 'Word2003_XML'

# File extensions, presumably given to extracted VBA modules, classes and
# forms (NOTE(review): usage is not visible in this part of the file):
MODULE_EXTENSION = "bas"
CLASS_EXTENSION = "cls"
FORM_EXTENSION = "frm"

# Namespaces and tags for Word2003 XML parsing:
# (names are in ElementTree "{namespace}tag" notation)
NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}'
# the tag <w:binData w:name="editdata.mso"> contains the VBA macro code:
TAG_BINDATA = NS_W + 'binData'
# attribute of binData holding the name of the embedded file:
ATTR_NAME = NS_W + 'name'
	213	+
	214	+# Keywords to detect auto-executable macros
	215	+AUTOEXEC_KEYWORDS = {
	216	+ # MS Word:
	217	+ 'Runs when the Word document is opened':
	218	+ ('AutoExec', 'AutoOpen', 'Document_Open', 'DocumentOpen'),
	219	+ 'Runs when the Word document is closed':
	220	+ ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose'),
	221	+ 'Runs when the Word document is modified':
	222	+ ('DocumentChange',),
	223	+ 'Runs when a new Word document is created':
	224	+ ('AutoNew', 'Document_New', 'NewDocument'),
	225	+
	226	+ # MS Excel:
	227	+ 'Runs when the Excel Workbook is opened':
	228	+ ('Auto_Open', 'Workbook_Open'),
	229	+ 'Runs when the Excel Workbook is closed':
	230	+ ('Auto_Close', 'Workbook_Close'),
	231	+
	232	+ #TODO: full list in MS specs??
	233	+}
	234	+
# Suspicious Keywords that may be used by malware
# See VBA language reference: http://msdn.microsoft.com/en-us/library/office/jj692818%28v=office.15%29.aspx
# Dictionary: description of the potential behaviour -> tuple of keywords.
# NOTE(review): how keywords are matched (case, word boundaries) is decided
# by the scanning code, which is not visible in this part of the file.
SUSPICIOUS_KEYWORDS = {
    #TODO: use regex to support variable whitespaces
    'May read system environment variables':
        ('Environ',),
    'May open a file':
        ('Open',),
    'May write to a file (if combined with Open)':
        #TODO: regex to find Open+Write on same line
        ('Write', 'Put', 'Output', 'Print #'),
    'May read or write a binary file (if combined with Open)':
        #TODO: regex to find Open+Binary on same line
        ('Binary',),
    'May copy a file':
        ('FileCopy', 'CopyFile'),
        #FileCopy: http://msdn.microsoft.com/en-us/library/office/gg264390%28v=office.15%29.aspx
        #CopyFile: http://msdn.microsoft.com/en-us/library/office/gg264089%28v=office.15%29.aspx
    'May delete a file':
        ('Kill',),
    'May create a text file':
        ('CreateTextFile','ADODB.Stream', 'WriteText', 'SaveToFile'),
        #CreateTextFile: http://msdn.microsoft.com/en-us/library/office/gg264617%28v=office.15%29.aspx
        #ADODB.Stream sample: http://pastebin.com/Z4TMyuq6
    'May run an executable file or a system command':
        ('Shell', 'vbNormal', 'vbNormalFocus', 'vbHide', 'vbMinimizedFocus', 'vbMaximizedFocus', 'vbNormalNoFocus',
         'vbMinimizedNoFocus', 'WScript.Shell', 'Run'),
        #Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx
        #WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6
    'May hide the application':
        ('Application.Visible', 'ShowWindow', 'SW_HIDE'),
    'May create a directory':
        ('MkDir',),
    'May save the current workbook':
        ('ActiveWorkbook.SaveAs',),
    'May change which directory contains files to open at startup':
        #TODO: confirm the actual effect
        ('Application.AltStartupPath',),
    'May create an OLE object':
        ('CreateObject',),
    'May run an application (if combined with CreateObject)':
        ('Shell.Application',),
    'May enumerate application windows (if combined with Shell.Application object)':
        ('Windows', 'FindWindow'),
    'May run code from a DLL':
        #TODO: regex to find declare+lib on same line
        ('Lib',),
    'May download files from the Internet':
        #TODO: regex to find urlmon+URLDownloadToFileA on same line
        ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP'),
    'May control another application by simulating user keystrokes':
        ('SendKeys', 'AppActivate'),
        #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx
    'May attempt to obfuscate malicious function calls':
        ('CallByName',),
        #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx
    'May attempt to obfuscate specific strings':
        #TODO: regex to find several Chr*, not just one
        ('Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor'),
        #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx
}
	296	+
	297	+# Regular Expression for a URL:
	298	+# http://en.wikipedia.org/wiki/Uniform_resource_locator
	299	+# http://www.w3.org/Addressing/URL/uri-spec.html
	300	+#TODO: also support username:password@server
	301	+#TODO: other protocols (file, gopher, wais, ...?)
	302	+SCHEME = r'\b(?:http\|ftp)s?'
	303	+# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
	304	+TLD = r'(?:xn--[a-zA-Z0-9]{4,20}\|[a-zA-Z]{2,20})'
	305	+DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + ')'
	306	+#TODO: IPv6 - see https://www.debuggex.com/
	307	+# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. [db8:0cec::99:123a]
	308	+NUMBER_0_255 = r'(?:25[0-5]\|2[0-4][0-9]\|1[0-9]{2}\|[1-9][0-9]\|[0-9])'
	309	+IPv4 = r'(?:'+NUMBER_0_255+r'\.){3}'+NUMBER_0_255
	310	+# IPv4 must come before the DNS name because it is more specific
	311	+SERVER = r'(?:' + IPv4 + '\|' + DNS_NAME + ')'
	312	+PORT = r'(?:\:[0-9]{1,5})?'
	313	+SERVER_PORT = SERVER + PORT
	314	+URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?' # [^\.\,\)\(\s"]
	315	+URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH
	316	+re_url = re.compile(URL_RE)
	317	+
	318	+
	319	+# Patterns to be extracted (IP addresses, URLs, etc)
	320	+# From patterns.py in balbuzard
	321	+RE_PATTERNS = (
	322	+ ('URL', re.compile(URL_RE)),
	323	+ ('IPv4 address', re.compile(IPv4)),
	324	+ ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@'+SERVER+'\b')),
	325	+ # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.\|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')),
	326	+ # Executable file name with known extensions (except .com which is present in many URLs, and .application):
	327	+ ("Executable file name", re.compile(r"(?i)\b\w+\.(EXE\|PIF\|GADGET\|MSI\|MSP\|MSC\|VBS\|VBE\|VB\|JSE\|JS\|WSF\|WSC\|WSH\|WS\|BAT\|CMD\|DLL\|SCR\|HTA\|CPL\|CLASS\|JAR\|PS1XML\|PS1\|PS2XML\|PS2\|PSC1\|PSC2\|SCF\|LNK\|INF\|REG)\b")),
	328	+ # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
	329	+ #TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types
	330	+ #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')),
	331	+ )
	332	+
	333	+# regex to detect strings encoded in hexadecimal
	334	+re_hex_string = re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')
	335	+
	336	+# regex to detect strings encoded in base64
	337	+#re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==\|[A-Za-z0-9+/]{3}=)?"')
	338	+# better version from balbuzard, less false positives:
	339	+re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4}){1,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=\|[A-Za-z0-9+/][AQgw]==)?"')
	340	+# white list of common strings matching the base64 regex, but which are not base64 strings (all lowercase):
	341	+BASE64_WHITELIST = set(['thisdocument', 'thisworkbook', 'test', 'temp', 'http', 'open', 'exit'])
	342	+
	343	+# regex to detect strings encoded with a specific Dridex algorithm
	344	+# (see https://github.com/JamesHabben/MalwareStuff)
	345	+re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"')
	346	+# regex to check that it is not just a hex string:
	347	+re_nothex_check = re.compile(r'[G-Zg-z]')
	348	+
	349	+#--- FUNCTIONS ----------------------------------------------------------------
	350	+
	351	+def copytoken_help(decompressed_current, decompressed_chunk_start):
	352	+ """
	353	+ compute bit masks to decode a CopyToken according to MS-OVBA 2.4.1.3.19.1 CopyToken Help
	354	+
	355	+ decompressed_current: number of decompressed bytes so far, i.e. len(decompressed_container)
	356	+ decompressed_chunk_start: offset of the current chunk in the decompressed container
	357	+ return length_mask, offset_mask, bit_count, maximum_length
	358	+ """
	359	+ difference = decompressed_current - decompressed_chunk_start
	360	+ bit_count = int(math.ceil(math.log(difference, 2)))
	361	+ bit_count = max([bit_count, 4])
	362	+ length_mask = 0xFFFF >> bit_count
	363	+ offset_mask = ~length_mask
	364	+ maximum_length = (0xFFFF >> bit_count) + 3
	365	+ return length_mask, offset_mask, bit_count, maximum_length
	366	+
	367	+
	368	+def decompress_stream (compressed_container):
	369	+ """
	370	+ Decompress a stream according to MS-OVBA section 2.4.1
	371	+
	372	+ compressed_container: string compressed according to the MS-OVBA 2.4.1.3.6 Compression algorithm
	373	+ return the decompressed container as a string (bytes)
	374	+ """
	375	+ # 2.4.1.2 State Variables
	376	+
	377	+ # The following state is maintained for the CompressedContainer (section 2.4.1.1.1):
	378	+ # CompressedRecordEnd: The location of the byte after the last byte in the CompressedContainer (section 2.4.1.1.1).
	379	+ # CompressedCurrent: The location of the next byte in the CompressedContainer (section 2.4.1.1.1) to be read by
	380	+ # decompression or to be written by compression.
	381	+
	382	+ # The following state is maintained for the current CompressedChunk (section 2.4.1.1.4):
	383	+ # CompressedChunkStart: The location of the first byte of the CompressedChunk (section 2.4.1.1.4) within the
	384	+ # CompressedContainer (section 2.4.1.1.1).
	385	+
	386	+ # The following state is maintained for a DecompressedBuffer (section 2.4.1.1.2):
	387	+ # DecompressedCurrent: The location of the next byte in the DecompressedBuffer (section 2.4.1.1.2) to be written by
	388	+ # decompression or to be read by compression.
	389	+ # DecompressedBufferEnd: The location of the byte after the last byte in the DecompressedBuffer (section 2.4.1.1.2).
	390	+
	391	+ # The following state is maintained for the current DecompressedChunk (section 2.4.1.1.3):
	392	+ # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the
	393	+ # DecompressedBuffer (section 2.4.1.1.2).
	394	+
	395	+ decompressed_container = '' # result
	396	+ compressed_current = 0
	397	+
	398	+ sig_byte = ord(compressed_container[compressed_current])
	399	+ if sig_byte != 0x01:
	400	+ raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))
	401	+
	402	+ compressed_current += 1
	403	+
	404	+ #NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that
	405	+ # CompressedRecordEnd = len(compressed_container)
	406	+ while compressed_current < len(compressed_container):
	407	+ # 2.4.1.1.5
	408	+ compressed_chunk_start = compressed_current
	409	+ # chunk header = first 16 bits
	410	+ compressed_chunk_header = struct.unpack("<H", compressed_container[compressed_chunk_start:compressed_chunk_start + 2])[0]
	411	+ # chunk size = 12 first bits of header + 3
	412	+ chunk_size = (compressed_chunk_header & 0x0FFF) + 3
	413	+ # chunk signature = 3 next bits - should always be 0b011
	414	+ chunk_signature = (compressed_chunk_header >> 12) & 0x07
	415	+ if chunk_signature != 0b011:
	416	+ raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream')
	417	+ # chunk flag = next bit - 1 == compressed, 0 == uncompressed
	418	+ chunk_flag = (compressed_chunk_header >> 15) & 0x01
	419	+ logging.debug("chunk size = {0}, compressed flag = {1}".format(chunk_size, chunk_flag))
	420	+
	421	+ #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096)
	422	+ # The minimum size is 3 bytes
	423	+ # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value
	424	+ # in chunk header before adding 3.
	425	+ # Also the first test is not useful since a 12 bits value cannot be larger than 4095.
	426	+ if chunk_flag == 1 and chunk_size > 4098:
	427	+ raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1')
	428	+ if chunk_flag == 0 and chunk_size != 4098:
	429	+ raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0')
	430	+
	431	+ # check if chunk_size goes beyond the compressed data, instead of silently cutting it:
	432	+ #TODO: raise an exception?
	433	+ if compressed_chunk_start + chunk_size > len(compressed_container):
	434	+ logging.warning('Chunk size is larger than remaining compressed data')
	435	+ compressed_end = min([len(compressed_container), compressed_chunk_start + chunk_size])
	436	+ # read after chunk header:
	437	+ compressed_current = compressed_chunk_start + 2
	438	+
	439	+ if chunk_flag == 0:
	440	+ # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
	441	+ # uncompressed chunk: read the next 4096 bytes as-is
	442	+ #TODO: check if there are at least 4096 bytes left
	443	+ decompressed_container += compressed_container[compressed_current:compressed_current + 4096]
	444	+ compressed_current += 4096
	445	+ else:
	446	+ # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
	447	+ # compressed chunk
	448	+ decompressed_chunk_start = len(decompressed_container)
	449	+ while compressed_current < compressed_end:
	450	+ # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence
	451	+ # logging.debug('compressed_current = %d / compressed_end = %d' % (compressed_current, compressed_end))
	452	+ # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
	453	+ # copy tokens (reference to a previous literal token)
	454	+ flag_byte = ord(compressed_container[compressed_current])
	455	+ compressed_current += 1
	456	+ for bit_index in xrange(0, 8):
	457	+ # logging.debug('bit_index=%d / compressed_current=%d / compressed_end=%d' % (bit_index, compressed_current, compressed_end))
	458	+ if compressed_current >= compressed_end:
	459	+ break
	460	+ # MS-OVBA 2.4.1.3.5 Decompressing a Token
	461	+ # MS-OVBA 2.4.1.3.17 Extract FlagBit
	462	+ flag_bit = (flag_byte >> bit_index) & 1
	463	+ #logging.debug('bit_index=%d: flag_bit=%d' % (bit_index, flag_bit))
	464	+ if flag_bit == 0: # LiteralToken
	465	+ # copy one byte directly to output
	466	+ decompressed_container += compressed_container[compressed_current]
	467	+ compressed_current += 1
	468	+ else: # CopyToken
	469	+ # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
	470	+ copy_token = struct.unpack("<H", compressed_container[compressed_current:compressed_current + 2])[0]
	471	+ #TODO: check this
	472	+ length_mask, offset_mask, bit_count, maximum_length = copytoken_help(
	473	+ len(decompressed_container), decompressed_chunk_start)
	474	+ length = (copy_token & length_mask) + 3
	475	+ temp1 = copy_token & offset_mask
	476	+ temp2 = 16 - bit_count
	477	+ offset = (temp1 >> temp2) + 1
	478	+ #logging.debug('offset=%d length=%d' % (offset, length))
	479	+ copy_source = len(decompressed_container) - offset
	480	+ for index in xrange(copy_source, copy_source + length):
	481	+ decompressed_container += decompressed_container[index]
	482	+ compressed_current += 2
	483	+ return decompressed_container
	484	+
	485	+
	486	+def _extract_vba (ole, vba_root, project_path, dir_path):
	487	+ """
	488	+ Extract VBA macros from an OleFileIO object.
	489	+ Internal function, do not call directly.
	490	+
	491	+ vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream
	492	+ vba_project: path to the PROJECT stream
	493	+ This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream
	494	+ """
	495	+ # Open the PROJECT stream:
	496	+ project = ole.openstream(project_path)
	497	+
	498	+ # sample content of the PROJECT stream:
	499	+
	500	+ ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}"
	501	+ ## Document=ThisDocument/&H00000000
	502	+ ## Module=NewMacros
	503	+ ## Name="Project"
	504	+ ## HelpContextID="0"
	505	+ ## VersionCompatible32="393222000"
	506	+ ## CMG="F1F301E705E705E705E705"
	507	+ ## DPB="8F8D7FE3831F2020202020"
	508	+ ## GC="2D2FDD81E51EE61EE6E1"
	509	+ ##
	510	+ ## [Host Extender Info]
	511	+ ## &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000
	512	+ ## &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000
	513	+ ##
	514	+ ## [Workspace]
	515	+ ## ThisDocument=22, 29, 339, 477, Z
	516	+ ## NewMacros=-4, 42, 832, 510, C
	517	+
	518	+ code_modules = {}
	519	+
	520	+ for line in project:
	521	+ line = line.strip()
	522	+ if '=' in line:
	523	+ # split line at the 1st equal sign:
	524	+ name, value = line.split('=', 1)
	525	+ # looking for code modules
	526	+ # add the code module as a key in the dictionary
	527	+ # the value will be the extension needed later
	528	+ # The value is converted to lowercase, to allow case-insensitive matching (issue #3)
	529	+ value = value.lower()
	530	+ if name == 'Document':
	531	+ # split value at the 1st slash, keep 1st part:
	532	+ value = value.split('/', 1)[0]
	533	+ code_modules[value] = CLASS_EXTENSION
	534	+ elif name == 'Module':
	535	+ code_modules[value] = MODULE_EXTENSION
	536	+ elif name == 'Class':
	537	+ code_modules[value] = CLASS_EXTENSION
	538	+ elif name == 'BaseClass':
	539	+ code_modules[value] = FORM_EXTENSION
	540	+
	541	+ # read data from dir stream (compressed)
	542	+ dir_compressed = ole.openstream(dir_path).read()
	543	+
	544	+ def check_value(name, expected, value):
	545	+ if expected != value:
	546	+ logging.error("invalid value for {0} expected {1:04X} got {2:04X}".format(name, expected, value))
	547	+
	548	+ dir_stream = cStringIO.StringIO(decompress_stream(dir_compressed))
	549	+
	550	+ # PROJECTSYSKIND Record
	551	+ PROJECTSYSKIND_Id = struct.unpack("<H", dir_stream.read(2))[0]
	552	+ check_value('PROJECTSYSKIND_Id', 0x0001, PROJECTSYSKIND_Id)
	553	+ PROJECTSYSKIND_Size = struct.unpack("<L", dir_stream.read(4))[0]
	554	+ check_value('PROJECTSYSKIND_Size', 0x0004, PROJECTSYSKIND_Size)
	555	+ PROJECTSYSKIND_SysKind = struct.unpack("<L", dir_stream.read(4))[0]
	556	+ if PROJECTSYSKIND_SysKind == 0x00:
	557	+ logging.debug("16-bit Windows")
	558	+ elif PROJECTSYSKIND_SysKind == 0x01:
	559	+ logging.debug("32-bit Windows")
	560	+ elif PROJECTSYSKIND_SysKind == 0x02:
	561	+ logging.debug("Macintosh")
	562	+ elif PROJECTSYSKIND_SysKind == 0x03:
	563	+ logging.debug("64-bit Windows")
	564	+ else:
	565	+ logging.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(PROJECTSYSKIND_SysKind))
	566	+
	567	+ # PROJECTLCID Record
	568	+ PROJECTLCID_Id = struct.unpack("<H", dir_stream.read(2))[0]
	569	+ check_value('PROJECTLCID_Id', 0x0002, PROJECTLCID_Id)
	570	+ PROJECTLCID_Size = struct.unpack("<L", dir_stream.read(4))[0]
	571	+ check_value('PROJECTLCID_Size', 0x0004, PROJECTLCID_Size)
	572	+ PROJECTLCID_Lcid = struct.unpack("<L", dir_stream.read(4))[0]
	573	+ check_value('PROJECTLCID_Lcid', 0x409, PROJECTLCID_Lcid)
	574	+
	575	+ # PROJECTLCIDINVOKE Record
	576	+ PROJECTLCIDINVOKE_Id = struct.unpack("<H", dir_stream.read(2))[0]
	577	+ check_value('PROJECTLCIDINVOKE_Id', 0x0014, PROJECTLCIDINVOKE_Id)
	578	+ PROJECTLCIDINVOKE_Size = struct.unpack("<L", dir_stream.read(4))[0]
	579	+ check_value('PROJECTLCIDINVOKE_Size', 0x0004, PROJECTLCIDINVOKE_Size)
	580	+ PROJECTLCIDINVOKE_LcidInvoke = struct.unpack("<L", dir_stream.read(4))[0]
	581	+ check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, PROJECTLCIDINVOKE_LcidInvoke)
	582	+
	583	+ # PROJECTCODEPAGE Record
	584	+ PROJECTCODEPAGE_Id = struct.unpack("<H", dir_stream.read(2))[0]
	585	+ check_value('PROJECTCODEPAGE_Id', 0x0003, PROJECTCODEPAGE_Id)
	586	+ PROJECTCODEPAGE_Size = struct.unpack("<L", dir_stream.read(4))[0]
	587	+ check_value('PROJECTCODEPAGE_Size', 0x0002, PROJECTCODEPAGE_Size)
	588	+ PROJECTCODEPAGE_CodePage = struct.unpack("<H", dir_stream.read(2))[0]
	589	+
	590	+ # PROJECTNAME Record
	591	+ PROJECTNAME_Id = struct.unpack("<H", dir_stream.read(2))[0]
	592	+ check_value('PROJECTNAME_Id', 0x0004, PROJECTNAME_Id)
	593	+ PROJECTNAME_SizeOfProjectName = struct.unpack("<L", dir_stream.read(4))[0]
	594	+ if PROJECTNAME_SizeOfProjectName < 1 or PROJECTNAME_SizeOfProjectName > 128:
	595	+ logging.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(PROJECTNAME_SizeOfProjectName))
	596	+ PROJECTNAME_ProjectName = dir_stream.read(PROJECTNAME_SizeOfProjectName)
	597	+
	598	+ # PROJECTDOCSTRING Record
	599	+ PROJECTDOCSTRING_Id = struct.unpack("<H", dir_stream.read(2))[0]
	600	+ check_value('PROJECTDOCSTRING_Id', 0x0005, PROJECTDOCSTRING_Id)
	601	+ PROJECTDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0]
	602	+ if PROJECTNAME_SizeOfProjectName > 2000:
	603	+ logging.error("PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(PROJECTDOCSTRING_SizeOfDocString))
	604	+ PROJECTDOCSTRING_DocString = dir_stream.read(PROJECTDOCSTRING_SizeOfDocString)
	605	+ PROJECTDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
	606	+ check_value('PROJECTDOCSTRING_Reserved', 0x0040, PROJECTDOCSTRING_Reserved)
	607	+ PROJECTDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0]
	608	+ if PROJECTDOCSTRING_SizeOfDocStringUnicode % 2 != 0:
	609	+ logging.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even")
	610	+ PROJECTDOCSTRING_DocStringUnicode = dir_stream.read(PROJECTDOCSTRING_SizeOfDocStringUnicode)
	611	+
	612	+ # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7
	613	+ PROJECTHELPFILEPATH_Id = struct.unpack("<H", dir_stream.read(2))[0]
	614	+ check_value('PROJECTHELPFILEPATH_Id', 0x0006, PROJECTHELPFILEPATH_Id)
	615	+ PROJECTHELPFILEPATH_SizeOfHelpFile1 = struct.unpack("<L", dir_stream.read(4))[0]
	616	+ if PROJECTHELPFILEPATH_SizeOfHelpFile1 > 260:
	617	+ logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(PROJECTHELPFILEPATH_SizeOfHelpFile1))
	618	+ PROJECTHELPFILEPATH_HelpFile1 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile1)
	619	+ PROJECTHELPFILEPATH_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
	620	+ check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, PROJECTHELPFILEPATH_Reserved)
	621	+ PROJECTHELPFILEPATH_SizeOfHelpFile2 = struct.unpack("<L", dir_stream.read(4))[0]
	622	+ if PROJECTHELPFILEPATH_SizeOfHelpFile2 != PROJECTHELPFILEPATH_SizeOfHelpFile1:
	623	+ logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2")
	624	+ PROJECTHELPFILEPATH_HelpFile2 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile2)
	625	+ if PROJECTHELPFILEPATH_HelpFile2 != PROJECTHELPFILEPATH_HelpFile1:
	626	+ logging.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2")
	627	+
	628	+ # PROJECTHELPCONTEXT Record
	629	+ PROJECTHELPCONTEXT_Id = struct.unpack("<H", dir_stream.read(2))[0]
	630	+ check_value('PROJECTHELPCONTEXT_Id', 0x0007, PROJECTHELPCONTEXT_Id)
	631	+ PROJECTHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0]
	632	+ check_value('PROJECTHELPCONTEXT_Size', 0x0004, PROJECTHELPCONTEXT_Size)
	633	+ PROJECTHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0]
	634	+
	635	+ # PROJECTLIBFLAGS Record
	636	+ PROJECTLIBFLAGS_Id = struct.unpack("<H", dir_stream.read(2))[0]
	637	+ check_value('PROJECTLIBFLAGS_Id', 0x0008, PROJECTLIBFLAGS_Id)
	638	+ PROJECTLIBFLAGS_Size = struct.unpack("<L", dir_stream.read(4))[0]
	639	+ check_value('PROJECTLIBFLAGS_Size', 0x0004, PROJECTLIBFLAGS_Size)
	640	+ PROJECTLIBFLAGS_ProjectLibFlags = struct.unpack("<L", dir_stream.read(4))[0]
	641	+ check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, PROJECTLIBFLAGS_ProjectLibFlags)
	642	+
	643	+ # PROJECTVERSION Record
	644	+ PROJECTVERSION_Id = struct.unpack("<H", dir_stream.read(2))[0]
	645	+ check_value('PROJECTVERSION_Id', 0x0009, PROJECTVERSION_Id)
	646	+ PROJECTVERSION_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
	647	+ check_value('PROJECTVERSION_Reserved', 0x0004, PROJECTVERSION_Reserved)
	648	+ PROJECTVERSION_VersionMajor = struct.unpack("<L", dir_stream.read(4))[0]
	649	+ PROJECTVERSION_VersionMinor = struct.unpack("<H", dir_stream.read(2))[0]
	650	+
	651	+ # PROJECTCONSTANTS Record
	652	+ PROJECTCONSTANTS_Id = struct.unpack("<H", dir_stream.read(2))[0]
	653	+ check_value('PROJECTCONSTANTS_Id', 0x000C, PROJECTCONSTANTS_Id)
	654	+ PROJECTCONSTANTS_SizeOfConstants = struct.unpack("<L", dir_stream.read(4))[0]
	655	+ if PROJECTCONSTANTS_SizeOfConstants > 1015:
	656	+ logging.error("PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(PROJECTCONSTANTS_SizeOfConstants))
	657	+ PROJECTCONSTANTS_Constants = dir_stream.read(PROJECTCONSTANTS_SizeOfConstants)
	658	+ PROJECTCONSTANTS_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
	659	+ check_value('PROJECTCONSTANTS_Reserved', 0x003C, PROJECTCONSTANTS_Reserved)
	660	+ PROJECTCONSTANTS_SizeOfConstantsUnicode = struct.unpack("<L", dir_stream.read(4))[0]
	661	+ if PROJECTCONSTANTS_SizeOfConstantsUnicode % 2 != 0:
	662	+ logging.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even")
	663	+ PROJECTCONSTANTS_ConstantsUnicode = dir_stream.read(PROJECTCONSTANTS_SizeOfConstantsUnicode)
	664	+
	665	+ # array of REFERENCE records
	666	+ check = None
	667	+ while True:
	668	+ check = struct.unpack("<H", dir_stream.read(2))[0]
	669	+ logging.debug("reference type = {0:04X}".format(check))
	670	+ if check == 0x000F:
	671	+ break
	672	+
	673	+ if check == 0x0016:
	674	+ # REFERENCENAME
	675	+ REFERENCE_Id = check
	676	+ REFERENCE_SizeOfName = struct.unpack("<L", dir_stream.read(4))[0]
	677	+ REFERENCE_Name = dir_stream.read(REFERENCE_SizeOfName)
	678	+ REFERENCE_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
	679	+ check_value('REFERENCE_Reserved', 0x003E, REFERENCE_Reserved)
	680	+ REFERENCE_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
	681	+ REFERENCE_NameUnicode = dir_stream.read(REFERENCE_SizeOfNameUnicode)
	682	+ continue
	683	+
	684	+ if check == 0x0033:
	685	+ # REFERENCEORIGINAL (followed by REFERENCECONTROL)
	686	+ REFERENCEORIGINAL_Id = check
	687	+ REFERENCEORIGINAL_SizeOfLibidOriginal = struct.unpack("<L", dir_stream.read(4))[0]
	688	+ REFERENCEORIGINAL_LibidOriginal = dir_stream.read(REFERENCEORIGINAL_SizeOfLibidOriginal)
	689	+ continue
	690	+
	691	+ if check == 0x002F:
	692	+ # REFERENCECONTROL
	693	+ REFERENCECONTROL_Id = check
	694	+ REFERENCECONTROL_SizeTwiddled = struct.unpack("<L", dir_stream.read(4))[0] # ignore
	695	+ REFERENCECONTROL_SizeOfLibidTwiddled = struct.unpack("<L", dir_stream.read(4))[0]
	696	+ REFERENCECONTROL_LibidTwiddled = dir_stream.read(REFERENCECONTROL_SizeOfLibidTwiddled)
	697	+ REFERENCECONTROL_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0] # ignore
	698	+ check_value('REFERENCECONTROL_Reserved1', 0x0000, REFERENCECONTROL_Reserved1)
	699	+ REFERENCECONTROL_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0] # ignore
	700	+ check_value('REFERENCECONTROL_Reserved2', 0x0000, REFERENCECONTROL_Reserved2)
	701	+ # optional field
	702	+ check2 = struct.unpack("<H", dir_stream.read(2))[0]
	703	+ if check2 == 0x0016:
	704	+ REFERENCECONTROL_NameRecordExtended_Id = check
	705	+ REFERENCECONTROL_NameRecordExtended_SizeofName = struct.unpack("<L", dir_stream.read(4))[0]
	706	+ REFERENCECONTROL_NameRecordExtended_Name = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeofName)
	707	+ REFERENCECONTROL_NameRecordExtended_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
	708	+ check_value('REFERENCECONTROL_NameRecordExtended_Reserved', 0x003E, REFERENCECONTROL_NameRecordExtended_Reserved)
	709	+ REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
	710	+ REFERENCECONTROL_NameRecordExtended_NameUnicode = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode)
	711	+ REFERENCECONTROL_Reserved3 = struct.unpack("<H", dir_stream.read(2))[0]
	712	+ else:
	713	+ REFERENCECONTROL_Reserved3 = check2
	714	+
	715	+ check_value('REFERENCECONTROL_Reserved3', 0x0030, REFERENCECONTROL_Reserved3)
	716	+ REFERENCECONTROL_SizeExtended = struct.unpack("<L", dir_stream.read(4))[0]
	717	+ REFERENCECONTROL_SizeOfLibidExtended = struct.unpack("<L", dir_stream.read(4))[0]
	718	+ REFERENCECONTROL_LibidExtended = dir_stream.read(REFERENCECONTROL_SizeOfLibidExtended)
	719	+ REFERENCECONTROL_Reserved4 = struct.unpack("<L", dir_stream.read(4))[0]
	720	+ REFERENCECONTROL_Reserved5 = struct.unpack("<H", dir_stream.read(2))[0]
	721	+ REFERENCECONTROL_OriginalTypeLib = dir_stream.read(16)
	722	+ REFERENCECONTROL_Cookie = struct.unpack("<L", dir_stream.read(4))[0]
	723	+ continue
	724	+
	725	+ if check == 0x000D:
	726	+ # REFERENCEREGISTERED
	727	+ REFERENCEREGISTERED_Id = check
	728	+ REFERENCEREGISTERED_Size = struct.unpack("<L", dir_stream.read(4))[0]
	729	+ REFERENCEREGISTERED_SizeOfLibid = struct.unpack("<L", dir_stream.read(4))[0]
	730	+ REFERENCEREGISTERED_Libid = dir_stream.read(REFERENCEREGISTERED_SizeOfLibid)
	731	+ REFERENCEREGISTERED_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0]
	732	+ check_value('REFERENCEREGISTERED_Reserved1', 0x0000, REFERENCEREGISTERED_Reserved1)
	733	+ REFERENCEREGISTERED_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0]
	734	+ check_value('REFERENCEREGISTERED_Reserved2', 0x0000, REFERENCEREGISTERED_Reserved2)
	735	+ continue
	736	+
	737	+ if check == 0x000E:
	738	+ # REFERENCEPROJECT
	739	+ REFERENCEPROJECT_Id = check
	740	+ REFERENCEPROJECT_Size = struct.unpack("<L", dir_stream.read(4))[0]
	741	+ REFERENCEPROJECT_SizeOfLibidAbsolute = struct.unpack("<L", dir_stream.read(4))[0]
	742	+ REFERENCEPROJECT_LibidAbsolute = dir_stream.read(REFERENCEPROJECT_SizeOfLibidAbsolute)
	743	+ REFERENCEPROJECT_SizeOfLibidRelative = struct.unpack("<L", dir_stream.read(4))[0]
	744	+ REFERENCEPROJECT_LibidRelative = dir_stream.read(REFERENCEPROJECT_SizeOfLibidRelative)
	745	+ REFERENCEPROJECT_MajorVersion = struct.unpack("<L", dir_stream.read(4))[0]
	746	+ REFERENCEPROJECT_MinorVersion = struct.unpack("<H", dir_stream.read(2))[0]
	747	+ continue
	748	+
	749	+ logging.error('invalid or unknown check Id {0:04X}'.format(check))
	750	+ sys.exit(0)
	751	+
	752	+ PROJECTMODULES_Id = check #struct.unpack("<H", dir_stream.read(2))[0]
	753	+ check_value('PROJECTMODULES_Id', 0x000F, PROJECTMODULES_Id)
	754	+ PROJECTMODULES_Size = struct.unpack("<L", dir_stream.read(4))[0]
	755	+ check_value('PROJECTMODULES_Size', 0x0002, PROJECTMODULES_Size)
	756	+ PROJECTMODULES_Count = struct.unpack("<H", dir_stream.read(2))[0]
	757	+ PROJECTMODULES_ProjectCookieRecord_Id = struct.unpack("<H", dir_stream.read(2))[0]
	758	+ check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, PROJECTMODULES_ProjectCookieRecord_Id)
	759	+ PROJECTMODULES_ProjectCookieRecord_Size = struct.unpack("<L", dir_stream.read(4))[0]
	760	+ check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, PROJECTMODULES_ProjectCookieRecord_Size)
	761	+ PROJECTMODULES_ProjectCookieRecord_Cookie = struct.unpack("<H", dir_stream.read(2))[0]
	762	+
	763	+ logging.debug("parsing {0} modules".format(PROJECTMODULES_Count))
	764	+ for x in xrange(0, PROJECTMODULES_Count):
	765	+ MODULENAME_Id = struct.unpack("<H", dir_stream.read(2))[0]
	766	+ check_value('MODULENAME_Id', 0x0019, MODULENAME_Id)
	767	+ MODULENAME_SizeOfModuleName = struct.unpack("<L", dir_stream.read(4))[0]
	768	+ MODULENAME_ModuleName = dir_stream.read(MODULENAME_SizeOfModuleName)
	769	+ # account for optional sections
	770	+ section_id = struct.unpack("<H", dir_stream.read(2))[0]
	771	+ if section_id == 0x0047:
	772	+ MODULENAMEUNICODE_Id = section_id
	773	+ MODULENAMEUNICODE_SizeOfModuleNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
	774	+ MODULENAMEUNICODE_ModuleNameUnicode = dir_stream.read(MODULENAMEUNICODE_SizeOfModuleNameUnicode)
	775	+ section_id = struct.unpack("<H", dir_stream.read(2))[0]
	776	+ if section_id == 0x001A:
	777	+ MODULESTREAMNAME_id = section_id
	778	+ MODULESTREAMNAME_SizeOfStreamName = struct.unpack("<L", dir_stream.read(4))[0]
	779	+ MODULESTREAMNAME_StreamName = dir_stream.read(MODULESTREAMNAME_SizeOfStreamName)
	780	+ MODULESTREAMNAME_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
	781	+ check_value('MODULESTREAMNAME_Reserved', 0x0032, MODULESTREAMNAME_Reserved)
	782	+ MODULESTREAMNAME_SizeOfStreamNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
	783	+ MODULESTREAMNAME_StreamNameUnicode = dir_stream.read(MODULESTREAMNAME_SizeOfStreamNameUnicode)
	784	+ section_id = struct.unpack("<H", dir_stream.read(2))[0]
	785	+ if section_id == 0x001C:
	786	+ MODULEDOCSTRING_Id = section_id
	787	+ check_value('MODULEDOCSTRING_Id', 0x001C, MODULEDOCSTRING_Id)
	788	+ MODULEDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0]
	789	+ MODULEDOCSTRING_DocString = dir_stream.read(MODULEDOCSTRING_SizeOfDocString)
	790	+ MODULEDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
	791	+ check_value('MODULEDOCSTRING_Reserved', 0x0048, MODULEDOCSTRING_Reserved)
	792	+ MODULEDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0]
	793	+ MODULEDOCSTRING_DocStringUnicode = dir_stream.read(MODULEDOCSTRING_SizeOfDocStringUnicode)
	794	+ section_id = struct.unpack("<H", dir_stream.read(2))[0]
	795	+ if section_id == 0x0031:
	796	+ MODULEOFFSET_Id = section_id
	797	+ check_value('MODULEOFFSET_Id', 0x0031, MODULEOFFSET_Id)
	798	+ MODULEOFFSET_Size = struct.unpack("<L", dir_stream.read(4))[0]
	799	+ check_value('MODULEOFFSET_Size', 0x0004, MODULEOFFSET_Size)
	800	+ MODULEOFFSET_TextOffset = struct.unpack("<L", dir_stream.read(4))[0]
	801	+ section_id = struct.unpack("<H", dir_stream.read(2))[0]
	802	+ if section_id == 0x001E:
	803	+ MODULEHELPCONTEXT_Id = section_id
	804	+ check_value('MODULEHELPCONTEXT_Id', 0x001E, MODULEHELPCONTEXT_Id)
	805	+ MODULEHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0]
	806	+ check_value('MODULEHELPCONTEXT_Size', 0x0004, MODULEHELPCONTEXT_Size)
	807	+ MODULEHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0]
	808	+ section_id = struct.unpack("<H", dir_stream.read(2))[0]
	809	+ if section_id == 0x002C:
	810	+ MODULECOOKIE_Id = section_id
	811	+ check_value('MODULECOOKIE_Id', 0x002C, MODULECOOKIE_Id)
	812	+ MODULECOOKIE_Size = struct.unpack("<L", dir_stream.read(4))[0]
	813	+ check_value('MODULECOOKIE_Size', 0x0002, MODULECOOKIE_Size)
	814	+ MODULECOOKIE_Cookie = struct.unpack("<H", dir_stream.read(2))[0]
	815	+ section_id = struct.unpack("<H", dir_stream.read(2))[0]
	816	+ if section_id == 0x0021 or section_id == 0x0022:
	817	+ MODULETYPE_Id = section_id
	818	+ MODULETYPE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
	819	+ section_id = struct.unpack("<H", dir_stream.read(2))[0]
	820	+ if section_id == 0x0025:
	821	+ MODULEREADONLY_Id = section_id
	822	+ check_value('MODULEREADONLY_Id', 0x0025, MODULEREADONLY_Id)
	823	+ MODULEREADONLY_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
	824	+ check_value('MODULEREADONLY_Reserved', 0x0000, MODULEREADONLY_Reserved)
	825	+ section_id = struct.unpack("<H", dir_stream.read(2))[0]
	826	+ if section_id == 0x0028:
	827	+ MODULEPRIVATE_Id = section_id
	828	+ check_value('MODULEPRIVATE_Id', 0x0028, MODULEPRIVATE_Id)
	829	+ MODULEPRIVATE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
	830	+ check_value('MODULEPRIVATE_Reserved', 0x0000, MODULEPRIVATE_Reserved)
	831	+ section_id = struct.unpack("<H", dir_stream.read(2))[0]
	832	+ if section_id == 0x002B: # TERMINATOR
	833	+ MODULE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
	834	+ check_value('MODULE_Reserved', 0x0000, MODULE_Reserved)
	835	+ section_id = None
	836	+ if section_id != None:
	837	+ logging.warning('unknown or invalid module section id {0:04X}'.format(section_id))
	838	+
	839	+ logging.debug('Project CodePage = %d' % PROJECTCODEPAGE_CodePage)
	840	+ vba_codec = 'cp%d' % PROJECTCODEPAGE_CodePage
	841	+ logging.debug("ModuleName = {0}".format(MODULENAME_ModuleName))
	842	+ logging.debug("StreamName = {0}".format(repr(MODULESTREAMNAME_StreamName)))
	843	+ streamname_unicode = MODULESTREAMNAME_StreamName.decode(vba_codec)
	844	+ logging.debug("StreamName.decode('%s') = %s" % (vba_codec, repr(streamname_unicode)))
	845	+ logging.debug("StreamNameUnicode = {0}".format(repr(MODULESTREAMNAME_StreamNameUnicode)))
	846	+ logging.debug("TextOffset = {0}".format(MODULEOFFSET_TextOffset))
	847	+
	848	+ code_path = vba_root + u'VBA/' + streamname_unicode
	849	+ #TODO: test if stream exists
	850	+ logging.debug('opening VBA code stream %s' % repr(code_path))
	851	+ code_data = ole.openstream(code_path).read()
	852	+ logging.debug("length of code_data = {0}".format(len(code_data)))
	853	+ logging.debug("offset of code_data = {0}".format(MODULEOFFSET_TextOffset))
	854	+ code_data = code_data[MODULEOFFSET_TextOffset:]
	855	+ if len(code_data) > 0:
	856	+ code_data = decompress_stream(code_data)
	857	+ # case-insensitive search in the code_modules dict to find the file extension:
	858	+ filext = code_modules.get(MODULENAME_ModuleName.lower(), 'bin')
	859	+ filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext)
	860	+ #TODO: also yield the codepage so that callers can decode it properly
	861	+ yield (code_path, filename, code_data)
	862	+ # print '-'*79
	863	+ # print filename
	864	+ # print ''
	865	+ # print code_data
	866	+ # print ''
	867	+ logging.debug('extracted file {0}'.format(filename))
	868	+ else:
	869	+ logging.warning("module stream {0} has code data length 0".format(MODULESTREAMNAME_StreamName))
	870	+ return
	871	+
	872	+
	873	+def filter_vba(vba_code):
	874	+ """
	875	+ Filter VBA source code to remove the first lines starting with "Attribute VB_",
	876	+ which are automatically added by MS Office and not displayed in the VBA Editor.
	877	+ This should only be used when displaying source code for human analysis.
	878	+
	879	+ Note: lines are not filtered if they contain a colon, because it could be
	880	+ used to hide malicious instructions.
	881	+
	882	+ :param vba_code: str, VBA source code
	883	+ :return: str, filtered VBA source code
	884	+ """
	885	+ vba_lines = vba_code.splitlines()
	886	+ start = 0
	887	+ for line in vba_lines:
	888	+ if line.startswith("Attribute VB_") and not ':' in line:
	889	+ start += 1
	890	+ else:
	891	+ break
	892	+ #TODO: also remove empty lines?
	893	+ vba = '\n'.join(vba_lines[start:])
	894	+ return vba
	895	+
	896	+
	897	+def detect_autoexec(vba_code, obfuscation=None):
	898	+ """
	899	+ Detect if the VBA code contains keywords corresponding to macros running
	900	+ automatically when triggered by specific actions (e.g. when a document is
	901	+ opened or closed).
	902	+
	903	+ :param vba_code: str, VBA source code
	904	+ :param obfuscation: None or str, name of obfuscation to be added to description
	905	+ :return: list of str tuples (keyword, description)
	906	+ """
	907	+ #TODO: merge code with detect_suspicious
	908	+ # case-insensitive search
	909	+ #vba_code = vba_code.lower()
	910	+ results = []
	911	+ obf_text = ''
	912	+ if obfuscation:
	913	+ obf_text = ' (obfuscation: %s)' % obfuscation
	914	+ for description, keywords in AUTOEXEC_KEYWORDS.items():
	915	+ for keyword in keywords:
	916	+ #TODO: if keyword is already a compiled regex, use it as-is
	917	+ # search using regex to detect word boundaries:
	918	+ if re.search(r'(?i)\b'+keyword+r'\b', vba_code):
	919	+ #if keyword.lower() in vba_code:
	920	+ results.append((keyword, description+obf_text))
	921	+ return results
	922	+
	923	+
	924	+def detect_suspicious(vba_code, obfuscation=None):
	925	+ """
	926	+ Detect if the VBA code contains suspicious keywords corresponding to
	927	+ potential malware behaviour.
	928	+
	929	+ :param vba_code: str, VBA source code
	930	+ :param obfuscation: None or str, name of obfuscation to be added to description
	931	+ :return: list of str tuples (keyword, description)
	932	+ """
	933	+ # case-insensitive search
	934	+ #vba_code = vba_code.lower()
	935	+ results = []
	936	+ obf_text = ''
	937	+ if obfuscation:
	938	+ obf_text = ' (obfuscation: %s)' % obfuscation
	939	+ for description, keywords in SUSPICIOUS_KEYWORDS.items():
	940	+ for keyword in keywords:
	941	+ # search using regex to detect word boundaries:
	942	+ if re.search(r'(?i)\b'+keyword+r'\b', vba_code):
	943	+ #if keyword.lower() in vba_code:
	944	+ results.append((keyword, description+obf_text))
	945	+ return results
	946	+
	947	+
	948	+def detect_patterns(vba_code, obfuscation=None):
	949	+ """
	950	+ Detect if the VBA code contains specific patterns such as IP addresses,
	951	+ URLs, e-mail addresses, executable file names, etc.
	952	+
	953	+ :param vba_code: str, VBA source code
	954	+ :return: list of str tuples (pattern type, value)
	955	+ """
	956	+ results = []
	957	+ found = set()
	958	+ obf_text = ''
	959	+ if obfuscation:
	960	+ obf_text = ' (obfuscation: %s)' % obfuscation
	961	+ for pattern_type, pattern_re in RE_PATTERNS:
	962	+ for match in pattern_re.finditer(vba_code):
	963	+ value = match.group()
	964	+ if value not in found:
	965	+ results.append((pattern_type+obf_text, value))
	966	+ found.add(value)
	967	+ return results
	968	+
	969	+
	970	+def detect_hex_strings(vba_code):
	971	+ """
	972	+ Detect if the VBA code contains strings encoded in hexadecimal.
	973	+
	974	+ :param vba_code: str, VBA source code
	975	+ :return: list of str tuples (encoded string, decoded string)
	976	+ """
	977	+ results = []
	978	+ found = set()
	979	+ for match in re_hex_string.finditer(vba_code):
	980	+ value = match.group()
	981	+ if value not in found:
	982	+ decoded = binascii.unhexlify(value)
	983	+ results.append((value, decoded))
	984	+ found.add(value)
	985	+ return results
	986	+
	987	+
	988	+def detect_base64_strings(vba_code):
	989	+ """
	990	+ Detect if the VBA code contains strings encoded in base64.
	991	+
	992	+ :param vba_code: str, VBA source code
	993	+ :return: list of str tuples (encoded string, decoded string)
	994	+ """
	995	+ #TODO: avoid matching simple hex strings as base64?
	996	+ results = []
	997	+ found = set()
	998	+ for match in re_base64_string.finditer(vba_code):
	999	+ # extract the base64 string without quotes:
	1000	+ value = match.group().strip('"')
	1001	+ # check it is not just a hex string:
	1002	+ if not re_nothex_check.search(value):
	1003	+ continue
	1004	+ # only keep new values and not in the whitelist:
	1005	+ if value not in found and value.lower() not in BASE64_WHITELIST:
	1006	+ try:
	1007	+ decoded = base64.b64decode(value)
	1008	+ results.append((value, decoded))
	1009	+ found.add(value)
	1010	+ except:
	1011	+ # if an exception occurs, it is likely not a base64-encoded string
	1012	+ pass
	1013	+ return results
	1014	+
	1015	+
	1016	+def detect_dridex_strings(vba_code):
	1017	+ """
	1018	+ Detect if the VBA code contains strings obfuscated with a specific algorithm found in Dridex samples.
	1019	+
	1020	+ :param vba_code: str, VBA source code
	1021	+ :return: list of str tuples (encoded string, decoded string)
	1022	+ """
	1023	+ from thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode
	1024	+ results = []
	1025	+ found = set()
	1026	+ for match in re_dridex_string.finditer(vba_code):
	1027	+ value = match.group()[1:-1]
	1028	+ # check it is not just a hex string:
	1029	+ if not re_nothex_check.search(value):
	1030	+ continue
	1031	+ if value not in found:
	1032	+ try:
	1033	+ decoded = DridexUrlDecode(value)
	1034	+ results.append((value, decoded))
	1035	+ found.add(value)
	1036	+ except:
	1037	+ # if an exception occurs, it is likely not a dridex-encoded string
	1038	+ pass
	1039	+ return results
	1040	+
	1041	+
	1042	+class VBA_Scanner (object):
	1043	+ """
	1044	+ Class to scan the source code of a VBA module to find obfuscated strings,
	1045	+ suspicious keywords, IOCs, auto-executable macros, etc.
	1046	+ """
	1047	+
	1048	+ def __init__(self, vba_code):
	1049	+ """
	1050	+ VBA_Scanner constructor
	1051	+
	1052	+ :param vba_code: str, VBA source code to be analyzed
	1053	+ """
	1054	+ self.code = vba_code
	1055	+ self.code_hex = ''
	1056	+ self.code_hex_rev = ''
	1057	+ self.code_rev_hex = ''
	1058	+ self.code_base64 = ''
	1059	+ self.code_dridex = ''
	1060	+
	1061	+
	1062	+ def scan(self, include_decoded_strings=False):
	1063	+ """
	1064	+ Analyze the provided VBA code to detect suspicious keywords,
	1065	+ auto-executable macros, IOC patterns, obfuscation patterns
	1066	+ such as hex-encoded strings.
	1067	+
	1068	+ :param include_decoded_strings: bool, if True, all encoded strings will be included with their decoded content.
	1069	+ :return: list of tuples (type, keyword, description)
	1070	+ (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String')
	1071	+ """
	1072	+ # First, detect and extract hex-encoded strings:
	1073	+ self.hex_strings = detect_hex_strings(self.code)
	1074	+ # detect if the code contains StrReverse:
	1075	+ self.strReverse = False
	1076	+ if 'strreverse' in self.code.lower(): self.strReverse = True
	1077	+ # Then append the decoded strings to the VBA code, to detect obfuscated IOCs and keywords:
	1078	+ for encoded, decoded in self.hex_strings:
	1079	+ self.code_hex += '\n'+decoded
	1080	+ # if the code contains "StrReverse", also append the hex strings in reverse order:
	1081	+ if self.strReverse:
	1082	+ # StrReverse after hex decoding:
	1083	+ self.code_hex_rev += '\n'+decoded[::-1]
	1084	+ # StrReverse before hex decoding:
	1085	+ self.code_rev_hex += '\n'+binascii.unhexlify(encoded[::-1])
	1086	+ #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/
	1087	+ #TODO: also append the full code reversed if StrReverse? (risk of false positives?)
	1088	+ # Detect Base64-encoded strings
	1089	+ self.base64_strings = detect_base64_strings(self.code)
	1090	+ for encoded, decoded in self.base64_strings:
	1091	+ self.code_base64 += '\n'+decoded
	1092	+ # Detect Dridex-encoded strings
	1093	+ self.dridex_strings = detect_dridex_strings(self.code)
	1094	+ for encoded, decoded in self.dridex_strings:
	1095	+ self.code_dridex += '\n'+decoded
	1096	+ results = []
	1097	+ self.autoexec_keywords = []
	1098	+ self.suspicious_keywords = []
	1099	+ self.iocs = []
	1100	+
	1101	+ for code, obfuscation in (
	1102	+ (self.code, None),
	1103	+ (self.code_hex, 'Hex'),
	1104	+ (self.code_hex_rev, 'Hex+StrReverse'),
	1105	+ (self.code_rev_hex, 'StrReverse+Hex'),
	1106	+ (self.code_base64, 'Base64'),
	1107	+ (self.code_dridex, 'Dridex'),
	1108	+ ):
	1109	+ self.autoexec_keywords += detect_autoexec(code, obfuscation)
	1110	+ self.suspicious_keywords += detect_suspicious(code, obfuscation)
	1111	+ self.iocs += detect_patterns(code, obfuscation)
	1112	+
	1113	+ # If hex-encoded strings were discovered, add an item to suspicious keywords:
	1114	+ if self.hex_strings:
	1115	+ self.suspicious_keywords.append(('Hex Strings',
	1116	+ 'Hex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
	1117	+ if self.base64_strings:
	1118	+ self.suspicious_keywords.append(('Base64 Strings',
	1119	+ 'Base64-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
	1120	+ if self.dridex_strings:
	1121	+ self.suspicious_keywords.append(('Dridex Strings',
	1122	+ 'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
	1123	+ for keyword, description in self.autoexec_keywords:
	1124	+ results.append(('AutoExec', keyword, description))
	1125	+ for keyword, description in self.suspicious_keywords:
	1126	+ results.append(('Suspicious', keyword, description))
	1127	+ for pattern_type, value in self.iocs:
	1128	+ results.append(('IOC', value, pattern_type))
	1129	+ if include_decoded_strings:
	1130	+ for encoded, decoded in self.hex_strings:
	1131	+ results.append(('Hex String', repr(decoded), encoded))
	1132	+ for encoded, decoded in self.base64_strings:
	1133	+ results.append(('Base64 String', repr(decoded), encoded))
	1134	+ for encoded, decoded in self.dridex_strings:
	1135	+ results.append(('Dridex string', repr(decoded), encoded))
	1136	+ return results
	1137	+
	1138	+ def scan_summary(self):
	1139	+ """
	1140	+ Analyze the provided VBA code to detect suspicious keywords,
	1141	+ auto-executable macros, IOC patterns, obfuscation patterns
	1142	+ such as hex-encoded strings.
	1143	+
	1144	+ :return: tuple with the number of items found for each category:
	1145	+ (autoexec, suspicious, IOCs, hex, base64, dridex)
	1146	+ """
	1147	+ self.scan()
	1148	+ return (len(self.autoexec_keywords), len(self.suspicious_keywords),
	1149	+ len(self.iocs), len(self.hex_strings), len(self.base64_strings),
	1150	+ len(self.dridex_strings))
	1151	+
	1152	+
	1153	+
	1154	+def scan_vba(vba_code, include_decoded_strings):
	1155	+ """
	1156	+ Analyze the provided VBA code to detect suspicious keywords,
	1157	+ auto-executable macros, IOC patterns, obfuscation patterns
	1158	+ such as hex-encoded strings.
	1159	+ (shortcut for VBA_Scanner(vba_code).scan())
	1160	+
	1161	+ :param vba_code: str, VBA source code to be analyzed
	1162	+ :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content.
	1163	+ :return: list of tuples (type, keyword, description)
	1164	+ (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String')
	1165	+ """
	1166	+ return VBA_Scanner(vba_code).scan(include_decoded_strings)
	1167	+
	1168	+
	1169	+#=== CLASSES =================================================================
	1170	+
	1171	+class VBA_Parser(object):
	1172	+ """
	1173	+ Class to parse MS Office files, to detect VBA macros and extract VBA source code
	1174	+ Supported file formats:
	1175	+ - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
	1176	+ - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
	1177	+ - PowerPoint 2007+ (.pptm, .ppsm)
	1178	+ """
	1179	+
	1180	+ def __init__(self, filename, data=None):
	1181	+ """
	1182	+ Constructor for VBA_Parser
	1183	+
	1184	+ :param filename: filename or path of file to parse, or file-like object
	1185	+
	1186	+ :param data: None or bytes str, if None the file will be read from disk (or from the file-like object).
	1187	+ If data is provided as a bytes string, it will be parsed as the content of the file in memory,
	1188	+ and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb').
	1189	+ """
	1190	+ #TODO: filename should only be a string, data should be used for the file-like object
	1191	+ #TODO: filename should be mandatory, optional data is a string or file-like object
	1192	+ #TODO: also support olefile and zipfile as input
	1193	+ if data is None:
	1194	+ # open file from disk:
	1195	+ _file = filename
	1196	+ else:
	1197	+ # file already read in memory, make it a file-like object for zipfile:
	1198	+ _file = cStringIO.StringIO(data)
	1199	+ #self.file = _file
	1200	+ self.ole_file = None
	1201	+ self.ole_subfiles = []
	1202	+ self.filename = filename
	1203	+ self.type = None
	1204	+ self.vba_projects = None
	1205	+ # if filename is None:
	1206	+ # if isinstance(_file, basestring):
	1207	+ # if len(_file) < olefile.MINIMAL_OLEFILE_SIZE:
	1208	+ # self.filename = _file
	1209	+ # else:
	1210	+ # self.filename = '<file in bytes string>'
	1211	+ # else:
	1212	+ # self.filename = '<file-like object>'
	1213	+ if olefile.isOleFile(_file):
	1214	+ # This looks like an OLE file
	1215	+ logging.info('Parsing OLE file %s' % self.filename)
	1216	+ # Open and parse the OLE file, using unicode for path names:
	1217	+ self.ole_file = olefile.OleFileIO(_file, path_encoding=None)
	1218	+ self.type = TYPE_OLE
	1219	+ #TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet
	1220	+ elif zipfile.is_zipfile(_file):
	1221	+ # This looks like a zip file, need to look for vbaProject.bin inside
	1222	+ # It can be any OLE file inside the archive
	1223	+ #...because vbaProject.bin can be renamed:
	1224	+ # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
	1225	+ logging.info('Opening ZIP/OpenXML file %s' % self.filename)
	1226	+ self.type = TYPE_OpenXML
	1227	+ z = zipfile.ZipFile(_file)
	1228	+ #TODO: check if this is actually an OpenXML file
	1229	+ #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically?
	1230	+ # check each file within the zip if it is an OLE file, by reading its magic:
	1231	+ for subfile in z.namelist():
	1232	+ magic = z.open(subfile).read(len(olefile.MAGIC))
	1233	+ if magic == olefile.MAGIC:
	1234	+ logging.debug('Opening OLE file %s within zip' % subfile)
	1235	+ ole_data = z.open(subfile).read()
	1236	+ try:
	1237	+ self.ole_subfiles.append(VBA_Parser(filename=subfile, data=ole_data))
	1238	+ except:
	1239	+ logging.debug('%s is not a valid OLE file' % subfile)
	1240	+ continue
	1241	+ z.close()
	1242	+ else:
	1243	+ # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML,
	1244	+ # or a plain text file containing VBA code
	1245	+ if data is None:
	1246	+ data = open(filename, 'rb').read()
	1247	+ # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
	1248	+ if 'http://schemas.microsoft.com/office/word/2003/wordml' in data:
	1249	+ logging.info('Opening Word 2003 XML file %s' % self.filename)
	1250	+ self.type = TYPE_Word2003_XML
	1251	+ # parse the XML content
	1252	+ et = ET.fromstring(data)
	1253	+ # find all the binData elements:
	1254	+ for bindata in et.getiterator(TAG_BINDATA):
	1255	+ # the binData content is an OLE container for the VBA project, compressed
	1256	+ # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
	1257	+ # get the filename:
	1258	+ fname = bindata.get(ATTR_NAME, 'noname.mso')
	1259	+ # decode the base64 activemime
	1260	+ activemime = binascii.a2b_base64(bindata.text)
	1261	+ # decompress the zlib data starting at offset 0x32, which is the OLE container:
	1262	+ ole_data = zlib.decompress(activemime[0x32:])
	1263	+ try:
	1264	+ self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))
	1265	+ except:
	1266	+ logging.debug('%s is not a valid OLE file' % fname)
	1267	+ continue
	1268	+ #TODO: handle exceptions
	1269	+ #TODO: Excel 2003 XML
	1270	+ #TODO: plain text VBA file
	1271	+ else:
	1272	+ msg = '%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % self.filename
	1273	+ logging.error(msg)
	1274	+ raise TypeError(msg)
	1275	+
	1276	+ def find_vba_projects (self):
	1277	+ """
	1278	+ Finds all the VBA projects stored in an OLE file.
	1279	+
	1280	+ Return None if the file is not OLE but OpenXML.
	1281	+ Return a list of tuples (vba_root, project_path, dir_path) for each VBA project.
	1282	+ vba_root is the path of the root OLE storage containing the VBA project,
	1283	+ including a trailing slash unless it is the root of the OLE file.
	1284	+ project_path is the path of the OLE stream named "PROJECT" within the VBA project.
	1285	+ dir_path is the path of the OLE stream named "VBA/dir" within the VBA project.
	1286	+
	1287	+ If this function returns an empty list for one of the supported formats
	1288	+ (i.e. Word, Excel, Powerpoint except Powerpoint 97-2003), then the
	1289	+ file does not contain VBA macros.
	1290	+
	1291	+ :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path)
	1292	+ for each VBA project found if OLE file
	1293	+ """
	1294	+ # if the file is not OLE but OpenXML, return None:
	1295	+ if self.ole_file is None:
	1296	+ return None
	1297	+
	1298	+ # if this method has already been called, return previous result:
	1299	+ if self.vba_projects is not None:
	1300	+ return self.vba_projects
	1301	+
	1302	+ # Find the VBA project root (different in MS Word, Excel, etc):
	1303	+ # - Word 97-2003: Macros
	1304	+ # - Excel 97-2003: _VBA_PROJECT_CUR
	1305	+ # - PowerPoint 97-2003: not supported yet (different file structure)
	1306	+ # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin.
	1307	+ # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word
	1308	+ # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word
	1309	+ # - Visio 2007: not supported yet (different file structure)
	1310	+
	1311	+ # According to MS-OVBA section 2.2.1:
	1312	+ # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream
	1313	+ # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream
	1314	+ # - all names are case-insensitive
	1315	+
	1316	+ # start with an empty list:
	1317	+ self.vba_projects = []
	1318	+ # Look for any storage containing those storage/streams:
	1319	+ ole = self.ole_file
	1320	+ for storage in ole.listdir(streams=False, storages=True):
	1321	+ # Look for a storage ending with "VBA":
	1322	+ if storage[-1].upper() == 'VBA':
	1323	+ logging.debug('Found VBA storage: %s' % ('/'.join(storage)))
	1324	+ vba_root = '/'.join(storage[:-1])
	1325	+ # Add a trailing slash to vba_root, unless it is the root of the OLE file:
	1326	+ # (used later to append all the child streams/storages)
	1327	+ if vba_root != '':
	1328	+ vba_root += '/'
	1329	+ logging.debug('Checking vba_root="%s"' % vba_root)
	1330	+
	1331	+ def check_vba_stream(ole, vba_root, stream_path):
	1332	+ full_path = vba_root + stream_path
	1333	+ if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM:
	1334	+ logging.debug('Found %s stream: %s' % (stream_path, full_path))
	1335	+ return full_path
	1336	+ else:
	1337	+ logging.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path)
	1338	+ return False
	1339	+
	1340	+ # Check if the VBA root storage also contains a PROJECT stream:
	1341	+ project_path = check_vba_stream(ole, vba_root, 'PROJECT')
	1342	+ if not project_path: continue
	1343	+ # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream:
	1344	+ vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT')
	1345	+ if not vba_project_path: continue
	1346	+ # Check if the VBA root storage also contains a VBA/dir stream:
	1347	+ dir_path = check_vba_stream(ole, vba_root, 'VBA/dir')
	1348	+ if not dir_path: continue
	1349	+ # Now we are pretty sure it is a VBA project structure
	1350	+ logging.debug('VBA root storage: "%s"' % vba_root)
	1351	+ # append the results to the list as a tuple for later use:
	1352	+ self.vba_projects.append((vba_root, project_path, dir_path))
	1353	+ return self.vba_projects
	1354	+
	1355	+ def detect_vba_macros(self):
	1356	+ """
	1357	+ Detect the potential presence of VBA macros in the file, by checking
	1358	+ if it contains VBA projects. Both OLE and OpenXML files are supported.
	1359	+
	1360	+ Important: for now, results are accurate only for Word, Excel and PowerPoint
	1361	+ EXCEPT Powerpoint 97-2003, which has a different structure for VBA.
	1362	+
	1363	+ Note: this method does NOT attempt to check the actual presence or validity
	1364	+ of VBA macro source code, so there might be false positives.
	1365	+ It may also detect VBA macros in files embedded within the main file,
	1366	+ for example an Excel workbook with macros embedded into a Word
	1367	+ document without macros may be detected, without distinction.
	1368	+
	1369	+ :return: bool, True if at least one VBA project has been found, False otherwise
	1370	+ """
	1371	+ #TODO: return None or raise exception if format not supported like PPT 97-2003
	1372	+ #TODO: return the number of VBA projects found instead of True/False?
	1373	+ # if OpenXML, check all the OLE subfiles:
	1374	+ if self.ole_file is None:
	1375	+ for ole_subfile in self.ole_subfiles:
	1376	+ if ole_subfile.detect_vba_macros():
	1377	+ return True
	1378	+ return False
	1379	+ # otherwise it's an OLE file, find VBA projects:
	1380	+ vba_projects = self.find_vba_projects()
	1381	+ if len(vba_projects) == 0:
	1382	+ return False
	1383	+ else:
	1384	+ return True
	1385	+
	1386	+
	1387	+ def extract_macros (self):
	1388	+ """
	1389	+ Extract and decompress source code for each VBA macro found in the file
	1390	+
	1391	+ Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found
	1392	+ If the file is OLE, filename is the path of the file.
	1393	+ If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros
	1394	+ within the zip archive, e.g. word/vbaProject.bin.
	1395	+ """
	1396	+ if self.ole_file is None:
	1397	+ for ole_subfile in self.ole_subfiles:
	1398	+ for results in ole_subfile.extract_macros():
	1399	+ yield results
	1400	+ else:
	1401	+ self.find_vba_projects()
	1402	+ for vba_root, project_path, dir_path in self.vba_projects:
	1403	+ # extract all VBA macros from that VBA root storage:
	1404	+ for stream_path, vba_filename, vba_code in _extract_vba(self.ole_file, vba_root, project_path, dir_path):
	1405	+ yield (self.filename, stream_path, vba_filename, vba_code)
	1406	+
	1407	+
	1408	+ def close(self):
	1409	+ """
	1410	+ Close all the open files. This method must be called after usage, if
	1411	+ the application is opening many files.
	1412	+ """
	1413	+ if self.ole_file is None:
	1414	+ for ole_subfile in self.ole_subfiles:
	1415	+ ole_subfile.close()
	1416	+ else:
	1417	+ self.ole_file.close()
	1418	+
	1419	+
	1420	+def print_analysis(vba_code, show_decoded_strings=False):
	1421	+ """
	1422	+ Analyze the provided VBA code, and print the results in a table
	1423	+
	1424	+ :param vba_code: str, VBA source code to be analyzed
	1425	+ :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
	1426	+ :return: None
	1427	+ """
	1428	+ results = scan_vba(vba_code, show_decoded_strings)
	1429	+ if results:
	1430	+ t = prettytable.PrettyTable(('Type', 'Keyword', 'Description'))
	1431	+ t.align = 'l'
	1432	+ t.max_width['Type'] = 10
	1433	+ t.max_width['Keyword'] = 20
	1434	+ t.max_width['Description'] = 39
	1435	+ for kw_type, keyword, description in results:
	1436	+ t.add_row((kw_type, keyword, description))
	1437	+ print t
	1438	+ else:
	1439	+ print 'No suspicious keyword or IOC found.'
	1440	+
	1441	+
	1442	+
	1443	+def process_file (container, filename, data, show_decoded_strings=False):
	1444	+ """
	1445	+ Process a single file
	1446	+
	1447	+ :param container: str, path and filename of container if the file is within
	1448	+ a zip archive, None otherwise.
	1449	+ :param filename: str, path and filename of file on disk, or within the container.
	1450	+ :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
	1451	+ :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
	1452	+ """
	1453	+ #TODO: replace print by writing to a provided output file (sys.stdout by default)
	1454	+ if container:
	1455	+ display_filename = '%s in %s' % (filename, container)
	1456	+ else:
	1457	+ display_filename = filename
	1458	+ print '='*79
	1459	+ print 'FILE:', display_filename
	1460	+ try:
	1461	+ #TODO: handle olefile errors, when an OLE file is malformed
	1462	+ vba = VBA_Parser(filename, data)
	1463	+ print 'Type:', vba.type
	1464	+ if vba.detect_vba_macros():
	1465	+ #print 'Contains VBA Macros:'
	1466	+ for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
	1467	+ # hide attribute lines:
	1468	+ #TODO: option to disable attribute filtering
	1469	+ vba_code_filtered = filter_vba(vba_code)
	1470	+ print '-'*79
	1471	+ print 'VBA MACRO %s ' % vba_filename
	1472	+ print 'in file: %s - OLE stream: %s' % (subfilename, repr(stream_path))
	1473	+ print '- '*39
	1474	+ # detect empty macros:
	1475	+ if vba_code_filtered.strip() == '':
	1476	+ print '(empty macro)'
	1477	+ else:
	1478	+ print vba_code_filtered
	1479	+ print '- '*39
	1480	+ print 'ANALYSIS:'
	1481	+ # analyse the whole code, filtered to avoid false positives:
	1482	+ print_analysis(vba_code_filtered, show_decoded_strings)
	1483	+ else:
	1484	+ print 'No VBA macros found.'
	1485	+ except: #TypeError:
	1486	+ #raise
	1487	+ #TODO: print more info if debug mode
	1488	+ #print sys.exc_value
	1489	+ # display the exception with full stack trace for debugging, but do not stop:
	1490	+ traceback.print_exc()
	1491	+ print ''
	1492	+
	1493	+
	1494	+def process_file_triage (container, filename, data):
	1495	+ """
	1496	+ Process a single file
	1497	+
	1498	+ :param container: str, path and filename of container if the file is within
	1499	+ a zip archive, None otherwise.
	1500	+ :param filename: str, path and filename of file on disk, or within the container.
	1501	+ :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
	1502	+ """
	1503	+ #TODO: replace print by writing to a provided output file (sys.stdout by default)
	1504	+ nb_macros = 0
	1505	+ nb_autoexec = 0
	1506	+ nb_suspicious = 0
	1507	+ nb_iocs = 0
	1508	+ nb_hexstrings = 0
	1509	+ nb_base64strings = 0
	1510	+ nb_dridexstrings = 0
	1511	+ # ftype = 'Other'
	1512	+ message = ''
	1513	+ try:
	1514	+ #TODO: handle olefile errors, when an OLE file is malformed
	1515	+ vba = VBA_Parser(filename, data)
	1516	+ if vba.detect_vba_macros():
	1517	+ for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
	1518	+ nb_macros += 1
	1519	+ if vba_code.strip() != '':
	1520	+ # analyse the whole code, filtered to avoid false positives:
	1521	+ scanner = VBA_Scanner(filter_vba(vba_code))
	1522	+ autoexec, suspicious, iocs, hexstrings, base64strings, dridex = scanner.scan_summary()
	1523	+ nb_autoexec += autoexec
	1524	+ nb_suspicious += suspicious
	1525	+ nb_iocs += iocs
	1526	+ nb_hexstrings += hexstrings
	1527	+ nb_base64strings += base64strings
	1528	+ nb_dridexstrings += dridex
	1529	+ if vba.type == TYPE_OLE:
	1530	+ flags = 'OLE:'
	1531	+ elif vba.type == TYPE_OpenXML:
	1532	+ flags = 'OpX:'
	1533	+ elif vba.type == TYPE_Word2003_XML:
	1534	+ flags = 'XML:'
	1535	+ macros = autoexec = suspicious = iocs = hexstrings = base64obf = dridex = '-'
	1536	+ if nb_macros: macros = 'M'
	1537	+ if nb_autoexec: autoexec = 'A'
	1538	+ if nb_suspicious: suspicious = 'S'
	1539	+ if nb_iocs: iocs = 'I'
	1540	+ if nb_hexstrings: hexstrings = 'H'
	1541	+ if nb_base64strings: base64obf = 'B'
	1542	+ if nb_dridexstrings: dridex = 'D'
	1543	+ flags += '%s%s%s%s%s%s%s' % (macros, autoexec, suspicious, iocs, hexstrings,
	1544	+ base64obf, dridex)
	1545	+
	1546	+ # macros = autoexec = suspicious = iocs = hexstrings = 'no'
	1547	+ # if nb_macros: macros = 'YES:%d' % nb_macros
	1548	+ # if nb_autoexec: autoexec = 'YES:%d' % nb_autoexec
	1549	+ # if nb_suspicious: suspicious = 'YES:%d' % nb_suspicious
	1550	+ # if nb_iocs: iocs = 'YES:%d' % nb_iocs
	1551	+ # if nb_hexstrings: hexstrings = 'YES:%d' % nb_hexstrings
	1552	+ # # 2nd line = info
	1553	+ # print '%-8s %-7s %-7s %-7s %-7s %-7s' % (vba.type, macros, autoexec, suspicious, iocs, hexstrings)
	1554	+ except TypeError:
	1555	+ # file type not OLE nor OpenXML
	1556	+ flags = '?'
	1557	+ message = 'File format not supported'
	1558	+ except:
	1559	+ # another error occurred
	1560	+ #raise
	1561	+ #TODO: print more info if debug mode
	1562	+ #TODO: distinguish real errors from incorrect file types
	1563	+ flags = '!ERROR'
	1564	+ message = sys.exc_value
	1565	+ line = '%-11s %s' % (flags, filename)
	1566	+ if message:
	1567	+ line += ' - %s' % message
	1568	+ print line
	1569	+
	1570	+ # t = prettytable.PrettyTable(('filename', 'type', 'macros', 'autoexec', 'suspicious', 'ioc', 'hexstrings'),
	1571	+ # header=False, border=False)
	1572	+ # t.align = 'l'
	1573	+ # t.max_width['filename'] = 30
	1574	+ # t.max_width['type'] = 10
	1575	+ # t.max_width['macros'] = 6
	1576	+ # t.max_width['autoexec'] = 6
	1577	+ # t.max_width['suspicious'] = 6
	1578	+ # t.max_width['ioc'] = 6
	1579	+ # t.max_width['hexstrings'] = 6
	1580	+ # t.add_row((filename, ftype, macros, autoexec, suspicious, iocs, hexstrings))
	1581	+ # print t
	1582	+
	1583	+def main_triage_quick():
	1584	+ pass
	1585	+
	1586	+#=== MAIN =====================================================================
	1587	+
	1588	+def main():
	1589	+ """
	1590	+ Main function, called when olevba is run from the command line
	1591	+ """
	1592	+ usage = 'usage: %prog [options] <filename> [filename2 ...]'
	1593	+ parser = optparse.OptionParser(usage=usage)
	1594	+ # parser.add_option('-o', '--outfile', dest='outfile',
	1595	+ # help='output file')
	1596	+ # parser.add_option('-c', '--csv', dest='csv',
	1597	+ # help='export results to a CSV file')
	1598	+ parser.add_option("-r", action="store_true", dest="recursive",
	1599	+ help='find files recursively in subdirectories.')
	1600	+ parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
	1601	+ help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)')
	1602	+ parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
	1603	+ help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
	1604	+ parser.add_option("-t", action="store_true", dest="triage_mode",
	1605	+ help='triage mode, display results as a summary table (default for multiple files)')
	1606	+ parser.add_option("-d", action="store_true", dest="detailed_mode",
	1607	+ help='detailed mode, display full results (default for single file)')
	1608	+ parser.add_option("-i", "--input", dest='input', type='str', default=None,
	1609	+ help='input file containing VBA source code to be analyzed (no parsing)')
	1610	+ parser.add_option("--decode", action="store_true", dest="show_decoded_strings",
	1611	+ help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex).')
	1612	+
	1613	+ (options, args) = parser.parse_args()
	1614	+
	1615	+ # Print help if no arguments are passed
	1616	+ if len(args) == 0 and not options.input:
	1617	+ print __doc__
	1618	+ parser.print_help()
	1619	+ sys.exit()
	1620	+
	1621	+ # print banner with version
	1622	+ print 'olevba %s - http://decalage.info/python/oletools' % __version__
	1623	+
	1624	+ logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) #INFO)
	1625	+ # For now, all logging is disabled:
	1626	+ logging.disable(logging.CRITICAL)
	1627	+
	1628	+ if options.input:
	1629	+ # input file provided with VBA source code to be analyzed directly:
	1630	+ print 'Analysis of VBA source code from %s:' % options.input
	1631	+ vba_code = open(options.input).read()
	1632	+ print_analysis(vba_code, show_decoded_strings=options.show_decoded_strings)
	1633	+ sys.exit()
	1634	+
	1635	+ # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('Type', 'Macros', 'AutoEx', 'Susp.', 'IOCs', 'HexStr')
	1636	+ # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('-'8, '-'7, '-'7, '-'7, '-'7, '-'7)
	1637	+ if not options.detailed_mode or options.triage_mode:
	1638	+ print '%-11s %-65s' % ('Flags', 'Filename')
	1639	+ print '%-11s %-65s' % ('-'11, '-'65)
	1640	+ previous_container = None
	1641	+ count = 0
	1642	+ container = filename = data = None
	1643	+ for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
	1644	+ zip_password=options.zip_password, zip_fname=options.zip_fname):
	1645	+ # ignore directory names stored in zip files:
	1646	+ if container and filename.endswith('/'):
	1647	+ continue
	1648	+ if options.detailed_mode and not options.triage_mode:
	1649	+ # fully detailed output
	1650	+ process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings)
	1651	+ else:
	1652	+ # print container name when it changes:
	1653	+ if container != previous_container:
	1654	+ if container is not None:
	1655	+ print '\nFiles in %s:' % container
	1656	+ previous_container = container
	1657	+ # summarized output for triage:
	1658	+ process_file_triage(container, filename, data)
	1659	+ count += 1
	1660	+ if not options.detailed_mode or options.triage_mode:
	1661	+ print '\n(Flags: OpX=OpenXML, XML=Word2003XML, M=Macros, A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, B=Base64 strings, D=Dridex strings, ?=Unknown)\n'
	1662	+
	1663	+ if count == 1 and not options.triage_mode and not options.detailed_mode:
	1664	+ # if options -t and -d were not specified and it's a single file, print details:
	1665	+ #TODO: avoid doing the analysis twice by storing results
	1666	+ process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings)
	1667	+
# Run the command-line interface only when this file is executed as a script
# (the condition is false when the module is imported):
if __name__ == '__main__':
    main()
	1670	+
1671	1671	# This was coded while listening to "Dust" from I Love You But I've Chosen Darkness
1672	1672	\ No newline at end of file
...	...