Commit a4ffb743f926d59e022f10313ca70d6af9f8c8b7

Authored by Philippe Lagadec
1 parent 41896bcf

olevba: changed line endings from CRLF to LF

Showing 1 changed file with 1670 additions and 1670 deletions
oletools/olevba.py 100644 → 100755
1 -#!/usr/bin/env python  
2 -"""  
3 -olevba.py  
4 -  
5 -olevba is a script to parse OLE and OpenXML files such as MS Office documents  
6 -(e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate  
7 -and analyze malicious macros.  
8 -  
9 -Supported formats:  
10 -- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)  
11 -- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)  
12 -- PowerPoint 2007+ (.pptm, .ppsm)  
13 -- Word 2003 XML (.xml)  
14 -  
15 -Author: Philippe Lagadec - http://www.decalage.info  
16 -License: BSD, see source code or documentation  
17 -  
18 -olevba is part of the python-oletools package:  
19 -http://www.decalage.info/python/oletools  
20 -  
21 -olevba is based on source code from officeparser by John William Davison  
22 -https://github.com/unixfreak0037/officeparser  
23 -"""  
24 -  
25 -#=== LICENSE ==================================================================  
26 -  
27 -# olevba is copyright (c) 2014-2015 Philippe Lagadec (http://www.decalage.info)  
28 -# All rights reserved.  
29 -#  
30 -# Redistribution and use in source and binary forms, with or without modification,  
31 -# are permitted provided that the following conditions are met:  
32 -#  
33 -# * Redistributions of source code must retain the above copyright notice, this  
34 -# list of conditions and the following disclaimer.  
35 -# * Redistributions in binary form must reproduce the above copyright notice,  
36 -# this list of conditions and the following disclaimer in the documentation  
37 -# and/or other materials provided with the distribution.  
38 -#  
39 -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND  
40 -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED  
41 -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE  
42 -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE  
43 -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL  
44 -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR  
45 -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER  
46 -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,  
47 -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE  
48 -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
49 -  
50 -  
51 -# olevba contains modified source code from the officeparser project, published  
52 -# under the following MIT License (MIT):  
53 -#  
54 -# officeparser is copyright (c) 2014 John William Davison  
55 -#  
56 -# Permission is hereby granted, free of charge, to any person obtaining a copy  
57 -# of this software and associated documentation files (the "Software"), to deal  
58 -# in the Software without restriction, including without limitation the rights  
59 -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell  
60 -# copies of the Software, and to permit persons to whom the Software is  
61 -# furnished to do so, subject to the following conditions:  
62 -#  
63 -# The above copyright notice and this permission notice shall be included in all  
64 -# copies or substantial portions of the Software.  
65 -#  
66 -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR  
67 -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,  
68 -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE  
69 -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER  
70 -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,  
71 -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE  
72 -# SOFTWARE.  
73 -  
74 -#------------------------------------------------------------------------------  
75 -# CHANGELOG:  
76 -# 2014-08-05 v0.01 PL: - first version based on officeparser code  
77 -# 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser  
78 -# 2014-08-15 PL: - fixed incorrect value check in PROJECTHELPFILEPATH Record  
79 -# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats  
80 -# and to find the VBA project root anywhere in the file  
81 -# 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL  
82 -# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API  
83 -# - added detect_vba_macros  
84 -# 2014-12-10 v0.06 PL: - hide first lines with VB attributes  
85 -# - detect auto-executable macros  
86 -# - ignore empty macros  
87 -# 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive  
88 -# 2014-12-15 v0.08 PL: - improved display for empty macros  
89 -# - added pattern extraction  
90 -# 2014-12-25 v0.09 PL: - added suspicious keywords detection  
91 -# 2014-12-27 v0.10 PL: - added OptionParser, main and process_file  
92 -# - uses xglob to scan several files with wildcards  
93 -# - option -r to recurse subdirectories  
94 -# - option -z to scan files in password-protected zips  
95 -# 2015-01-02 v0.11 PL: - improved filter_vba to detect colons  
96 -# 2015-01-03 v0.12 PL: - fixed detect_patterns to detect all patterns  
97 -# - process_file: improved display, shows container file  
98 -# - improved list of executable file extensions  
99 -# 2015-01-04 v0.13 PL: - added several suspicious keywords, improved display  
100 -# 2015-01-08 v0.14 PL: - added hex strings detection and decoding  
101 -# - fixed issue #2, decoding VBA stream names using  
102 -# specified codepage and unicode stream names  
103 -# 2015-01-11 v0.15 PL: - added new triage mode, options -t and -d  
104 -# 2015-01-16 v0.16 PL: - fix for issue #3 (exception when module name="text")  
105 -# - added several suspicious keywords  
106 -# - added option -i to analyze VBA source code directly  
107 -# 2015-01-17 v0.17 PL: - removed .com from the list of executable extensions  
108 -# - added scan_vba to run all detection algorithms  
109 -# - decoded hex strings are now also scanned + reversed  
110 -# 2015-01-23 v0.18 PL: - fixed issue #3, case-insensitive search in code_modules  
111 -# 2015-01-24 v0.19 PL: - improved the detection of IOCs obfuscated with hex  
112 -# strings and StrReverse  
113 -# 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded  
114 -# 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding  
115 -# - improved display, shows obfuscation name  
116 -# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename  
117 -# - added Base64 obfuscation decoding (contribution from  
118 -# @JamesHabben)  
119 -# 2015-02-03 v0.23 PL: - triage now uses VBA_Scanner results, shows Base64 and  
120 -# Dridex strings  
121 -# - exception handling in detect_base64_strings  
122 -# 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display  
123 -# - display exceptions with stack trace  
124 -# - added several suspicious keywords  
125 -# - improved Base64 detection and decoding  
126 -# - fixed triage mode not to scan attrib lines  
127 -# 2015-03-04 v0.25 PL: - added support for Word 2003 XML  
128 -  
129 -__version__ = '0.25'  
130 -  
131 -#------------------------------------------------------------------------------  
132 -# TODO:  
133 -# + do not use logging, but a provided logger (null logger by default)  
134 -# + setup logging (common with other oletools)  
135 -# + add xor bruteforcing like bbharvest  
136 -# + add chr() decoding  
137 -  
138 -# TODO later:  
139 -# + performance improvement: instead of searching each keyword separately,  
140 -# first split vba code into a list of words (per line), then check each  
141 -# word against a dict. (or put vba words into a set/dict?)  
142 -# + for regex, maybe combine them into a single re with named groups?  
143 -# + add Yara support, include sample rules? plugins like balbuzard?  
144 -# + add balbuzard support  
145 -# + output to file (replace print by file.write, sys.stdout by default)  
146 -# + look for VBA in embedded documents (e.g. Excel in Word)  
147 -# + support SRP streams (see Lenny's article + links and sample)  
148 -# - python 3.x support  
149 -# - add support for PowerPoint macros (see libclamav, libgsf), use oledump heuristic?  
150 -# - check VBA macros in Visio, Access, Project, etc  
151 -# - extract_macros: convert to a class, split long function into smaller methods  
152 -# - extract_macros: read bytes from stream file objects instead of strings  
153 -# - extract_macros: use combined struct.unpack instead of many calls  
154 -  
155 -#------------------------------------------------------------------------------  
156 -# REFERENCES:  
157 -# - [MS-OVBA]: Microsoft Office VBA File Format Structure  
158 -# http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx  
159 -# - officeparser: https://github.com/unixfreak0037/officeparser  
160 -  
161 -  
162 -#--- IMPORTS ------------------------------------------------------------------  
163 -  
164 -import sys, logging  
165 -import struct  
166 -import cStringIO  
167 -import math  
168 -import zipfile  
169 -import re  
170 -import optparse  
171 -import os.path  
172 -import binascii  
173 -import base64  
174 -import traceback  
175 -import zlib  
176 -  
177 -# import lxml or ElementTree for XML parsing:  
178 -try:  
179 - # lxml: best performance for XML processing  
180 - import lxml.etree as ET  
181 -except ImportError:  
182 - try:  
183 - # Python 2.5+: batteries included  
184 - import xml.etree.cElementTree as ET  
185 - except ImportError:  
186 - try:  
187 - # Python <2.5: standalone ElementTree install  
188 - import elementtree.cElementTree as ET  
189 - except ImportError:  
190 - raise ImportError, "lxml or ElementTree are not installed, "\  
191 - +"see http://codespeak.net/lxml "\  
192 - +"or http://effbot.org/zone/element-index.htm"  
193 -  
194 -import thirdparty.olefile as olefile  
195 -from thirdparty.prettytable import prettytable  
196 -from thirdparty.xglob import xglob  
197 -  
198 -#--- CONSTANTS ----------------------------------------------------------------  
199 -  
# Identifiers for the container formats handled by this module:
TYPE_OLE = 'OLE'
TYPE_OpenXML = 'OpenXML'
TYPE_Word2003_XML = 'Word2003_XML'

# File extensions associated with each kind of VBA code module:
MODULE_EXTENSION = "bas"
CLASS_EXTENSION = "cls"
FORM_EXTENSION = "frm"

# Namespaces and tags for Word2003 XML parsing:
NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}'
# the tag <w:binData w:name="editdata.mso"> contains the VBA macro code:
TAG_BINDATA = '%sbinData' % NS_W
ATTR_NAME = '%sname' % NS_W
213 -  
# Keywords to detect auto-executable macros.
# Each key is a human-readable description of the trigger; each value is the
# tuple of macro/event names associated with that trigger.
AUTOEXEC_KEYWORDS = dict([
    # MS Word:
    ('Runs when the Word document is opened',
        ('AutoExec', 'AutoOpen', 'Document_Open', 'DocumentOpen')),
    ('Runs when the Word document is closed',
        ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose')),
    ('Runs when the Word document is modified',
        ('DocumentChange',)),
    ('Runs when a new Word document is created',
        ('AutoNew', 'Document_New', 'NewDocument')),

    # MS Excel:
    ('Runs when the Excel Workbook is opened',
        ('Auto_Open', 'Workbook_Open')),
    ('Runs when the Excel Workbook is closed',
        ('Auto_Close', 'Workbook_Close')),

    #TODO: full list in MS specs??
])
234 -  
# Suspicious Keywords that may be used by malware.
# Each key is a human-readable description of the risk; each value is the
# tuple of VBA keywords associated with that description.
# See VBA language reference: http://msdn.microsoft.com/en-us/library/office/jj692818%28v=office.15%29.aspx
SUSPICIOUS_KEYWORDS = dict([
    #TODO: use regex to support variable whitespaces
    ('May read system environment variables',
        ('Environ',)),
    ('May open a file',
        ('Open',)),
    #TODO: regex to find Open+Write on same line
    ('May write to a file (if combined with Open)',
        ('Write', 'Put', 'Output', 'Print #')),
    #TODO: regex to find Open+Binary on same line
    ('May read or write a binary file (if combined with Open)',
        ('Binary',)),
    #FileCopy: http://msdn.microsoft.com/en-us/library/office/gg264390%28v=office.15%29.aspx
    #CopyFile: http://msdn.microsoft.com/en-us/library/office/gg264089%28v=office.15%29.aspx
    ('May copy a file',
        ('FileCopy', 'CopyFile')),
    ('May delete a file',
        ('Kill',)),
    #CreateTextFile: http://msdn.microsoft.com/en-us/library/office/gg264617%28v=office.15%29.aspx
    #ADODB.Stream sample: http://pastebin.com/Z4TMyuq6
    ('May create a text file',
        ('CreateTextFile', 'ADODB.Stream', 'WriteText', 'SaveToFile')),
    #Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx
    #WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6
    ('May run an executable file or a system command',
        ('Shell', 'vbNormal', 'vbNormalFocus', 'vbHide', 'vbMinimizedFocus',
         'vbMaximizedFocus', 'vbNormalNoFocus', 'vbMinimizedNoFocus',
         'WScript.Shell', 'Run')),
    ('May hide the application',
        ('Application.Visible', 'ShowWindow', 'SW_HIDE')),
    ('May create a directory',
        ('MkDir',)),
    ('May save the current workbook',
        ('ActiveWorkbook.SaveAs',)),
    #TODO: confirm the actual effect
    ('May change which directory contains files to open at startup',
        ('Application.AltStartupPath',)),
    ('May create an OLE object',
        ('CreateObject',)),
    ('May run an application (if combined with CreateObject)',
        ('Shell.Application',)),
    ('May enumerate application windows (if combined with Shell.Application object)',
        ('Windows', 'FindWindow')),
    #TODO: regex to find declare+lib on same line
    ('May run code from a DLL',
        ('Lib',)),
    #TODO: regex to find urlmon+URLDownloadToFileA on same line
    ('May download files from the Internet',
        ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP')),
    #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx
    ('May control another application by simulating user keystrokes',
        ('SendKeys', 'AppActivate')),
    #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx
    ('May attempt to obfuscate malicious function calls',
        ('CallByName',)),
    #TODO: regex to find several Chr*, not just one
    #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx
    ('May attempt to obfuscate specific strings',
        ('Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor')),
])
296 -  
# Regular Expression for a URL:
# http://en.wikipedia.org/wiki/Uniform_resource_locator
# http://www.w3.org/Addressing/URL/uri-spec.html
#TODO: also support username:password@server
#TODO: other protocols (file, gopher, wais, ...?)
SCHEME = r'\b(?:http|ftp)s?'
# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})'
DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + ')'
#TODO: IPv6 - see https://www.debuggex.com/
# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. [db8:0cec::99:123a]
NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'
IPv4 = r'(?:'+NUMBER_0_255+r'\.){3}'+NUMBER_0_255
# IPv4 must come before the DNS name because it is more specific
SERVER = r'(?:' + IPv4 + '|' + DNS_NAME + ')'
PORT = r'(?:\:[0-9]{1,5})?'
SERVER_PORT = SERVER + PORT
URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?'  # [^\.\,\)\(\s"]
URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH
re_url = re.compile(URL_RE)


# Patterns to be extracted (IP addresses, URLs, etc)
# From patterns.py in balbuzard
# Each entry is a (pattern name, compiled regex) pair.
RE_PATTERNS = (
    ('URL', re_url),
    ('IPv4 address', re.compile(IPv4)),
    # NOTE: the trailing word boundary must be a raw string: a plain '\b' is
    # the backspace character (0x08), which would prevent any e-mail address
    # from matching.
    ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@' + SERVER + r'\b')),
    # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')),
    # Executable file name with known extensions (except .com which is present in many URLs, and .application):
    ("Executable file name", re.compile(r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b")),
    # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
    #TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types
    #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')),
)
332 -  
# Detection of strings encoded in hexadecimal: at least 4 pairs of hex digits
re_hex_string = re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')

# Detection of double-quoted strings encoded in base64.
# Earlier, more permissive version:
#re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?"')
# better version from balbuzard, less false positives:
re_base64_string = re.compile(
    r'"(?:[A-Za-z0-9+/]{4}){1,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)?"')
# white list of common strings matching the base64 regex, but which are not base64 strings (all lowercase):
BASE64_WHITELIST = set((
    'thisdocument',
    'thisworkbook',
    'test',
    'temp',
    'http',
    'open',
    'exit',
))

# Detection of double-quoted strings encoded with a specific Dridex algorithm
# (see https://github.com/JamesHabben/MalwareStuff)
re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"')
# regex to check that it is not just a hex string (any letter outside A-F):
re_nothex_check = re.compile(r'[G-Zg-z]')
348 -  
349 -#--- FUNCTIONS ----------------------------------------------------------------  
350 -  
def copytoken_help(decompressed_current, decompressed_chunk_start):
    """
    compute bit masks to decode a CopyToken according to MS-OVBA 2.4.1.3.19.1 CopyToken Help

    decompressed_current: number of decompressed bytes so far, i.e. len(decompressed_container)
    decompressed_chunk_start: offset of the current chunk in the decompressed container
    return length_mask, offset_mask, bit_count, maximum_length
    raise ValueError if decompressed_current is not strictly greater than
    decompressed_chunk_start (no byte has been decompressed in the chunk yet)
    """
    difference = decompressed_current - decompressed_chunk_start
    if difference < 1:
        # a base-2 logarithm is only defined for strictly positive values
        raise ValueError(
            'invalid CopyToken position: decompressed_current ({0}) must be greater than '
            'decompressed_chunk_start ({1})'.format(decompressed_current, decompressed_chunk_start))
    # bit_count = max(ceil(log2(difference)), 4), computed with integers only
    # so that the result cannot be affected by floating-point rounding:
    # find the smallest bit_count >= 4 such that 2**bit_count >= difference.
    bit_count = 4
    while (1 << bit_count) < difference:
        bit_count += 1
    # the low (16 - bit_count) bits of a CopyToken hold the length,
    # the remaining high bits hold the offset:
    length_mask = 0xFFFF >> bit_count
    offset_mask = ~length_mask
    maximum_length = length_mask + 3
    return length_mask, offset_mask, bit_count, maximum_length
366 -  
367 -  
def decompress_stream(compressed_container):
    """
    Decompress a stream according to MS-OVBA section 2.4.1

    compressed_container: string (bytes) compressed according to the MS-OVBA 2.4.1.3.6 Compression algorithm
    return the decompressed container as a string (bytes)
    raise ValueError if the signature byte, a chunk signature or a chunk size is invalid
    raise IndexError if compressed_container is empty
    raise struct.error if the data ends in the middle of a 16-bit chunk header or CopyToken
    """
    # State variables, named after MS-OVBA 2.4.1.2:
    # - compressed_current: location of the next byte to read in the CompressedContainer
    # - compressed_chunk_start: location of the first byte of the current CompressedChunk
    # - compressed_end: location of the byte after the last byte of the current CompressedChunk
    # - decompressed_chunk_start: location of the first byte of the current DecompressedChunk
    #   within the decompressed output
    # The location of the next byte to write (DecompressedCurrent) is always len(decompressed).

    # Both buffers are bytearrays: indexing yields integers on Python 2 and 3,
    # and appending to the output does not copy it for every decompressed byte.
    data = bytearray(compressed_container)
    data_len = len(data)
    decompressed = bytearray()  # result
    compressed_current = 0

    sig_byte = data[compressed_current]
    if sig_byte != 0x01:
        raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))

    compressed_current += 1

    #NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that
    # CompressedRecordEnd = len(compressed_container)
    while compressed_current < data_len:
        # 2.4.1.1.5
        compressed_chunk_start = compressed_current
        # chunk header = first 16 bits
        compressed_chunk_header = struct.unpack(
            "<H", bytes(data[compressed_chunk_start:compressed_chunk_start + 2]))[0]
        # chunk size = 12 first bits of header + 3
        chunk_size = (compressed_chunk_header & 0x0FFF) + 3
        # chunk signature = 3 next bits - should always be 0b011
        chunk_signature = (compressed_chunk_header >> 12) & 0x07
        if chunk_signature != 0b011:
            raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream')
        # chunk flag = next bit - 1 == compressed, 0 == uncompressed
        chunk_flag = (compressed_chunk_header >> 15) & 0x01
        logging.debug("chunk size = %d, compressed flag = %d", chunk_size, chunk_flag)

        #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096)
        # The minimum size is 3 bytes
        # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value
        # in chunk header before adding 3).
        # Also the first test is not useful since a 12 bits value cannot be larger than 4095.
        if chunk_flag == 1 and chunk_size > 4098:
            raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1')
        if chunk_flag == 0 and chunk_size != 4098:
            raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0')

        # check if chunk_size goes beyond the compressed data, instead of silently cutting it:
        #TODO: raise an exception?
        if compressed_chunk_start + chunk_size > data_len:
            logging.warning('Chunk size is larger than remaining compressed data')
        compressed_end = min(data_len, compressed_chunk_start + chunk_size)
        # read after chunk header:
        compressed_current = compressed_chunk_start + 2

        if chunk_flag == 0:
            # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
            # uncompressed chunk: read the next 4096 bytes as-is
            #TODO: check if there are at least 4096 bytes left
            decompressed += data[compressed_current:compressed_current + 4096]
            compressed_current += 4096
        else:
            # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
            decompressed_chunk_start = len(decompressed)
            while compressed_current < compressed_end:
                # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence
                # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
                # copy tokens (reference to a previous literal token)
                flag_byte = data[compressed_current]
                compressed_current += 1
                for bit_index in range(8):
                    if compressed_current >= compressed_end:
                        break
                    # MS-OVBA 2.4.1.3.5 Decompressing a Token
                    # MS-OVBA 2.4.1.3.17 Extract FlagBit
                    flag_bit = (flag_byte >> bit_index) & 1
                    if flag_bit == 0:  # LiteralToken
                        # copy one byte directly to output
                        decompressed.append(data[compressed_current])
                        compressed_current += 1
                    else:  # CopyToken
                        # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
                        copy_token = struct.unpack(
                            "<H", bytes(data[compressed_current:compressed_current + 2]))[0]
                        #TODO: check this
                        length_mask, offset_mask, bit_count, _maximum_length = copytoken_help(
                            len(decompressed), decompressed_chunk_start)
                        length = (copy_token & length_mask) + 3
                        temp1 = copy_token & offset_mask
                        temp2 = 16 - bit_count
                        offset = (temp1 >> temp2) + 1
                        copy_source = len(decompressed) - offset
                        # copy byte by byte: the source range may overlap the bytes
                        # being appended (length can be greater than offset)
                        for index in range(copy_source, copy_source + length):
                            decompressed.append(decompressed[index])
                        compressed_current += 2
    # bytes is an alias of str on Python 2, so the return type is unchanged there
    return bytes(decompressed)
484 -  
485 -  
486 -def _extract_vba (ole, vba_root, project_path, dir_path):  
487 - """  
488 - Extract VBA macros from an OleFileIO object.  
489 - Internal function, do not call directly.  
490 -  
491 - vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream  
492 - vba_project: path to the PROJECT stream  
493 - This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream  
494 - """  
495 - # Open the PROJECT stream:  
496 - project = ole.openstream(project_path)  
497 -  
498 - # sample content of the PROJECT stream:  
499 -  
500 - ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}"  
501 - ## Document=ThisDocument/&H00000000  
502 - ## Module=NewMacros  
503 - ## Name="Project"  
504 - ## HelpContextID="0"  
505 - ## VersionCompatible32="393222000"  
506 - ## CMG="F1F301E705E705E705E705"  
507 - ## DPB="8F8D7FE3831F2020202020"  
508 - ## GC="2D2FDD81E51EE61EE6E1"  
509 - ##  
510 - ## [Host Extender Info]  
511 - ## &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000  
512 - ## &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000  
513 - ##  
514 - ## [Workspace]  
515 - ## ThisDocument=22, 29, 339, 477, Z  
516 - ## NewMacros=-4, 42, 832, 510, C  
517 -  
518 - code_modules = {}  
519 -  
520 - for line in project:  
521 - line = line.strip()  
522 - if '=' in line:  
523 - # split line at the 1st equal sign:  
524 - name, value = line.split('=', 1)  
525 - # looking for code modules  
526 - # add the code module as a key in the dictionary  
527 - # the value will be the extension needed later  
528 - # The value is converted to lowercase, to allow case-insensitive matching (issue #3)  
529 - value = value.lower()  
530 - if name == 'Document':  
531 - # split value at the 1st slash, keep 1st part:  
532 - value = value.split('/', 1)[0]  
533 - code_modules[value] = CLASS_EXTENSION  
534 - elif name == 'Module':  
535 - code_modules[value] = MODULE_EXTENSION  
536 - elif name == 'Class':  
537 - code_modules[value] = CLASS_EXTENSION  
538 - elif name == 'BaseClass':  
539 - code_modules[value] = FORM_EXTENSION  
540 -  
541 - # read data from dir stream (compressed)  
542 - dir_compressed = ole.openstream(dir_path).read()  
543 -  
544 - def check_value(name, expected, value):  
545 - if expected != value:  
546 - logging.error("invalid value for {0} expected {1:04X} got {2:04X}".format(name, expected, value))  
547 -  
548 - dir_stream = cStringIO.StringIO(decompress_stream(dir_compressed))  
549 -  
550 - # PROJECTSYSKIND Record  
551 - PROJECTSYSKIND_Id = struct.unpack("<H", dir_stream.read(2))[0]  
552 - check_value('PROJECTSYSKIND_Id', 0x0001, PROJECTSYSKIND_Id)  
553 - PROJECTSYSKIND_Size = struct.unpack("<L", dir_stream.read(4))[0]  
554 - check_value('PROJECTSYSKIND_Size', 0x0004, PROJECTSYSKIND_Size)  
555 - PROJECTSYSKIND_SysKind = struct.unpack("<L", dir_stream.read(4))[0]  
556 - if PROJECTSYSKIND_SysKind == 0x00:  
557 - logging.debug("16-bit Windows")  
558 - elif PROJECTSYSKIND_SysKind == 0x01:  
559 - logging.debug("32-bit Windows")  
560 - elif PROJECTSYSKIND_SysKind == 0x02:  
561 - logging.debug("Macintosh")  
562 - elif PROJECTSYSKIND_SysKind == 0x03:  
563 - logging.debug("64-bit Windows")  
564 - else:  
565 - logging.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(PROJECTSYSKIND_SysKind))  
566 -  
567 - # PROJECTLCID Record  
568 - PROJECTLCID_Id = struct.unpack("<H", dir_stream.read(2))[0]  
569 - check_value('PROJECTLCID_Id', 0x0002, PROJECTLCID_Id)  
570 - PROJECTLCID_Size = struct.unpack("<L", dir_stream.read(4))[0]  
571 - check_value('PROJECTLCID_Size', 0x0004, PROJECTLCID_Size)  
572 - PROJECTLCID_Lcid = struct.unpack("<L", dir_stream.read(4))[0]  
573 - check_value('PROJECTLCID_Lcid', 0x409, PROJECTLCID_Lcid)  
574 -  
575 - # PROJECTLCIDINVOKE Record  
576 - PROJECTLCIDINVOKE_Id = struct.unpack("<H", dir_stream.read(2))[0]  
577 - check_value('PROJECTLCIDINVOKE_Id', 0x0014, PROJECTLCIDINVOKE_Id)  
578 - PROJECTLCIDINVOKE_Size = struct.unpack("<L", dir_stream.read(4))[0]  
579 - check_value('PROJECTLCIDINVOKE_Size', 0x0004, PROJECTLCIDINVOKE_Size)  
580 - PROJECTLCIDINVOKE_LcidInvoke = struct.unpack("<L", dir_stream.read(4))[0]  
581 - check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, PROJECTLCIDINVOKE_LcidInvoke)  
582 -  
583 - # PROJECTCODEPAGE Record  
584 - PROJECTCODEPAGE_Id = struct.unpack("<H", dir_stream.read(2))[0]  
585 - check_value('PROJECTCODEPAGE_Id', 0x0003, PROJECTCODEPAGE_Id)  
586 - PROJECTCODEPAGE_Size = struct.unpack("<L", dir_stream.read(4))[0]  
587 - check_value('PROJECTCODEPAGE_Size', 0x0002, PROJECTCODEPAGE_Size)  
588 - PROJECTCODEPAGE_CodePage = struct.unpack("<H", dir_stream.read(2))[0]  
589 -  
590 - # PROJECTNAME Record  
591 - PROJECTNAME_Id = struct.unpack("<H", dir_stream.read(2))[0]  
592 - check_value('PROJECTNAME_Id', 0x0004, PROJECTNAME_Id)  
593 - PROJECTNAME_SizeOfProjectName = struct.unpack("<L", dir_stream.read(4))[0]  
594 - if PROJECTNAME_SizeOfProjectName < 1 or PROJECTNAME_SizeOfProjectName > 128:  
595 - logging.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(PROJECTNAME_SizeOfProjectName))  
596 - PROJECTNAME_ProjectName = dir_stream.read(PROJECTNAME_SizeOfProjectName)  
597 -  
598 - # PROJECTDOCSTRING Record  
599 - PROJECTDOCSTRING_Id = struct.unpack("<H", dir_stream.read(2))[0]  
600 - check_value('PROJECTDOCSTRING_Id', 0x0005, PROJECTDOCSTRING_Id)  
601 - PROJECTDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0]  
602 - if PROJECTNAME_SizeOfProjectName > 2000:  
603 - logging.error("PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(PROJECTDOCSTRING_SizeOfDocString))  
604 - PROJECTDOCSTRING_DocString = dir_stream.read(PROJECTDOCSTRING_SizeOfDocString)  
605 - PROJECTDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0]  
606 - check_value('PROJECTDOCSTRING_Reserved', 0x0040, PROJECTDOCSTRING_Reserved)  
607 - PROJECTDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0]  
608 - if PROJECTDOCSTRING_SizeOfDocStringUnicode % 2 != 0:  
609 - logging.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even")  
610 - PROJECTDOCSTRING_DocStringUnicode = dir_stream.read(PROJECTDOCSTRING_SizeOfDocStringUnicode)  
611 -  
612 - # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7  
613 - PROJECTHELPFILEPATH_Id = struct.unpack("<H", dir_stream.read(2))[0]  
614 - check_value('PROJECTHELPFILEPATH_Id', 0x0006, PROJECTHELPFILEPATH_Id)  
615 - PROJECTHELPFILEPATH_SizeOfHelpFile1 = struct.unpack("<L", dir_stream.read(4))[0]  
616 - if PROJECTHELPFILEPATH_SizeOfHelpFile1 > 260:  
617 - logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(PROJECTHELPFILEPATH_SizeOfHelpFile1))  
618 - PROJECTHELPFILEPATH_HelpFile1 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile1)  
619 - PROJECTHELPFILEPATH_Reserved = struct.unpack("<H", dir_stream.read(2))[0]  
620 - check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, PROJECTHELPFILEPATH_Reserved)  
621 - PROJECTHELPFILEPATH_SizeOfHelpFile2 = struct.unpack("<L", dir_stream.read(4))[0]  
622 - if PROJECTHELPFILEPATH_SizeOfHelpFile2 != PROJECTHELPFILEPATH_SizeOfHelpFile1:  
623 - logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2")  
624 - PROJECTHELPFILEPATH_HelpFile2 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile2)  
625 - if PROJECTHELPFILEPATH_HelpFile2 != PROJECTHELPFILEPATH_HelpFile1:  
626 - logging.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2")  
627 -  
628 - # PROJECTHELPCONTEXT Record  
629 - PROJECTHELPCONTEXT_Id = struct.unpack("<H", dir_stream.read(2))[0]  
630 - check_value('PROJECTHELPCONTEXT_Id', 0x0007, PROJECTHELPCONTEXT_Id)  
631 - PROJECTHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0]  
632 - check_value('PROJECTHELPCONTEXT_Size', 0x0004, PROJECTHELPCONTEXT_Size)  
633 - PROJECTHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0]  
634 -  
635 - # PROJECTLIBFLAGS Record  
636 - PROJECTLIBFLAGS_Id = struct.unpack("<H", dir_stream.read(2))[0]  
637 - check_value('PROJECTLIBFLAGS_Id', 0x0008, PROJECTLIBFLAGS_Id)  
638 - PROJECTLIBFLAGS_Size = struct.unpack("<L", dir_stream.read(4))[0]  
639 - check_value('PROJECTLIBFLAGS_Size', 0x0004, PROJECTLIBFLAGS_Size)  
640 - PROJECTLIBFLAGS_ProjectLibFlags = struct.unpack("<L", dir_stream.read(4))[0]  
641 - check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, PROJECTLIBFLAGS_ProjectLibFlags)  
642 -  
643 - # PROJECTVERSION Record  
644 - PROJECTVERSION_Id = struct.unpack("<H", dir_stream.read(2))[0]  
645 - check_value('PROJECTVERSION_Id', 0x0009, PROJECTVERSION_Id)  
646 - PROJECTVERSION_Reserved = struct.unpack("<L", dir_stream.read(4))[0]  
647 - check_value('PROJECTVERSION_Reserved', 0x0004, PROJECTVERSION_Reserved)  
648 - PROJECTVERSION_VersionMajor = struct.unpack("<L", dir_stream.read(4))[0]  
649 - PROJECTVERSION_VersionMinor = struct.unpack("<H", dir_stream.read(2))[0]  
650 -  
651 - # PROJECTCONSTANTS Record  
652 - PROJECTCONSTANTS_Id = struct.unpack("<H", dir_stream.read(2))[0]  
653 - check_value('PROJECTCONSTANTS_Id', 0x000C, PROJECTCONSTANTS_Id)  
654 - PROJECTCONSTANTS_SizeOfConstants = struct.unpack("<L", dir_stream.read(4))[0]  
655 - if PROJECTCONSTANTS_SizeOfConstants > 1015:  
656 - logging.error("PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(PROJECTCONSTANTS_SizeOfConstants))  
657 - PROJECTCONSTANTS_Constants = dir_stream.read(PROJECTCONSTANTS_SizeOfConstants)  
658 - PROJECTCONSTANTS_Reserved = struct.unpack("<H", dir_stream.read(2))[0]  
659 - check_value('PROJECTCONSTANTS_Reserved', 0x003C, PROJECTCONSTANTS_Reserved)  
660 - PROJECTCONSTANTS_SizeOfConstantsUnicode = struct.unpack("<L", dir_stream.read(4))[0]  
661 - if PROJECTCONSTANTS_SizeOfConstantsUnicode % 2 != 0:  
662 - logging.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even")  
663 - PROJECTCONSTANTS_ConstantsUnicode = dir_stream.read(PROJECTCONSTANTS_SizeOfConstantsUnicode)  
664 -  
665 - # array of REFERENCE records  
666 - check = None  
667 - while True:  
668 - check = struct.unpack("<H", dir_stream.read(2))[0]  
669 - logging.debug("reference type = {0:04X}".format(check))  
670 - if check == 0x000F:  
671 - break  
672 -  
673 - if check == 0x0016:  
674 - # REFERENCENAME  
675 - REFERENCE_Id = check  
676 - REFERENCE_SizeOfName = struct.unpack("<L", dir_stream.read(4))[0]  
677 - REFERENCE_Name = dir_stream.read(REFERENCE_SizeOfName)  
678 - REFERENCE_Reserved = struct.unpack("<H", dir_stream.read(2))[0]  
679 - check_value('REFERENCE_Reserved', 0x003E, REFERENCE_Reserved)  
680 - REFERENCE_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]  
681 - REFERENCE_NameUnicode = dir_stream.read(REFERENCE_SizeOfNameUnicode)  
682 - continue  
683 -  
684 - if check == 0x0033:  
685 - # REFERENCEORIGINAL (followed by REFERENCECONTROL)  
686 - REFERENCEORIGINAL_Id = check  
687 - REFERENCEORIGINAL_SizeOfLibidOriginal = struct.unpack("<L", dir_stream.read(4))[0]  
688 - REFERENCEORIGINAL_LibidOriginal = dir_stream.read(REFERENCEORIGINAL_SizeOfLibidOriginal)  
689 - continue  
690 -  
691 - if check == 0x002F:  
692 - # REFERENCECONTROL  
693 - REFERENCECONTROL_Id = check  
694 - REFERENCECONTROL_SizeTwiddled = struct.unpack("<L", dir_stream.read(4))[0] # ignore  
695 - REFERENCECONTROL_SizeOfLibidTwiddled = struct.unpack("<L", dir_stream.read(4))[0]  
696 - REFERENCECONTROL_LibidTwiddled = dir_stream.read(REFERENCECONTROL_SizeOfLibidTwiddled)  
697 - REFERENCECONTROL_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0] # ignore  
698 - check_value('REFERENCECONTROL_Reserved1', 0x0000, REFERENCECONTROL_Reserved1)  
699 - REFERENCECONTROL_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0] # ignore  
700 - check_value('REFERENCECONTROL_Reserved2', 0x0000, REFERENCECONTROL_Reserved2)  
701 - # optional field  
702 - check2 = struct.unpack("<H", dir_stream.read(2))[0]  
703 - if check2 == 0x0016:  
704 - REFERENCECONTROL_NameRecordExtended_Id = check  
705 - REFERENCECONTROL_NameRecordExtended_SizeofName = struct.unpack("<L", dir_stream.read(4))[0]  
706 - REFERENCECONTROL_NameRecordExtended_Name = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeofName)  
707 - REFERENCECONTROL_NameRecordExtended_Reserved = struct.unpack("<H", dir_stream.read(2))[0]  
708 - check_value('REFERENCECONTROL_NameRecordExtended_Reserved', 0x003E, REFERENCECONTROL_NameRecordExtended_Reserved)  
709 - REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]  
710 - REFERENCECONTROL_NameRecordExtended_NameUnicode = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode)  
711 - REFERENCECONTROL_Reserved3 = struct.unpack("<H", dir_stream.read(2))[0]  
712 - else:  
713 - REFERENCECONTROL_Reserved3 = check2  
714 -  
715 - check_value('REFERENCECONTROL_Reserved3', 0x0030, REFERENCECONTROL_Reserved3)  
716 - REFERENCECONTROL_SizeExtended = struct.unpack("<L", dir_stream.read(4))[0]  
717 - REFERENCECONTROL_SizeOfLibidExtended = struct.unpack("<L", dir_stream.read(4))[0]  
718 - REFERENCECONTROL_LibidExtended = dir_stream.read(REFERENCECONTROL_SizeOfLibidExtended)  
719 - REFERENCECONTROL_Reserved4 = struct.unpack("<L", dir_stream.read(4))[0]  
720 - REFERENCECONTROL_Reserved5 = struct.unpack("<H", dir_stream.read(2))[0]  
721 - REFERENCECONTROL_OriginalTypeLib = dir_stream.read(16)  
722 - REFERENCECONTROL_Cookie = struct.unpack("<L", dir_stream.read(4))[0]  
723 - continue  
724 -  
725 - if check == 0x000D:  
726 - # REFERENCEREGISTERED  
727 - REFERENCEREGISTERED_Id = check  
728 - REFERENCEREGISTERED_Size = struct.unpack("<L", dir_stream.read(4))[0]  
729 - REFERENCEREGISTERED_SizeOfLibid = struct.unpack("<L", dir_stream.read(4))[0]  
730 - REFERENCEREGISTERED_Libid = dir_stream.read(REFERENCEREGISTERED_SizeOfLibid)  
731 - REFERENCEREGISTERED_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0]  
732 - check_value('REFERENCEREGISTERED_Reserved1', 0x0000, REFERENCEREGISTERED_Reserved1)  
733 - REFERENCEREGISTERED_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0]  
734 - check_value('REFERENCEREGISTERED_Reserved2', 0x0000, REFERENCEREGISTERED_Reserved2)  
735 - continue  
736 -  
737 - if check == 0x000E:  
738 - # REFERENCEPROJECT  
739 - REFERENCEPROJECT_Id = check  
740 - REFERENCEPROJECT_Size = struct.unpack("<L", dir_stream.read(4))[0]  
741 - REFERENCEPROJECT_SizeOfLibidAbsolute = struct.unpack("<L", dir_stream.read(4))[0]  
742 - REFERENCEPROJECT_LibidAbsolute = dir_stream.read(REFERENCEPROJECT_SizeOfLibidAbsolute)  
743 - REFERENCEPROJECT_SizeOfLibidRelative = struct.unpack("<L", dir_stream.read(4))[0]  
744 - REFERENCEPROJECT_LibidRelative = dir_stream.read(REFERENCEPROJECT_SizeOfLibidRelative)  
745 - REFERENCEPROJECT_MajorVersion = struct.unpack("<L", dir_stream.read(4))[0]  
746 - REFERENCEPROJECT_MinorVersion = struct.unpack("<H", dir_stream.read(2))[0]  
747 - continue  
748 -  
749 - logging.error('invalid or unknown check Id {0:04X}'.format(check))  
750 - sys.exit(0)  
751 -  
752 - PROJECTMODULES_Id = check #struct.unpack("<H", dir_stream.read(2))[0]  
753 - check_value('PROJECTMODULES_Id', 0x000F, PROJECTMODULES_Id)  
754 - PROJECTMODULES_Size = struct.unpack("<L", dir_stream.read(4))[0]  
755 - check_value('PROJECTMODULES_Size', 0x0002, PROJECTMODULES_Size)  
756 - PROJECTMODULES_Count = struct.unpack("<H", dir_stream.read(2))[0]  
757 - PROJECTMODULES_ProjectCookieRecord_Id = struct.unpack("<H", dir_stream.read(2))[0]  
758 - check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, PROJECTMODULES_ProjectCookieRecord_Id)  
759 - PROJECTMODULES_ProjectCookieRecord_Size = struct.unpack("<L", dir_stream.read(4))[0]  
760 - check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, PROJECTMODULES_ProjectCookieRecord_Size)  
761 - PROJECTMODULES_ProjectCookieRecord_Cookie = struct.unpack("<H", dir_stream.read(2))[0]  
762 -  
763 - logging.debug("parsing {0} modules".format(PROJECTMODULES_Count))  
764 - for x in xrange(0, PROJECTMODULES_Count):  
765 - MODULENAME_Id = struct.unpack("<H", dir_stream.read(2))[0]  
766 - check_value('MODULENAME_Id', 0x0019, MODULENAME_Id)  
767 - MODULENAME_SizeOfModuleName = struct.unpack("<L", dir_stream.read(4))[0]  
768 - MODULENAME_ModuleName = dir_stream.read(MODULENAME_SizeOfModuleName)  
769 - # account for optional sections  
770 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
771 - if section_id == 0x0047:  
772 - MODULENAMEUNICODE_Id = section_id  
773 - MODULENAMEUNICODE_SizeOfModuleNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]  
774 - MODULENAMEUNICODE_ModuleNameUnicode = dir_stream.read(MODULENAMEUNICODE_SizeOfModuleNameUnicode)  
775 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
776 - if section_id == 0x001A:  
777 - MODULESTREAMNAME_id = section_id  
778 - MODULESTREAMNAME_SizeOfStreamName = struct.unpack("<L", dir_stream.read(4))[0]  
779 - MODULESTREAMNAME_StreamName = dir_stream.read(MODULESTREAMNAME_SizeOfStreamName)  
780 - MODULESTREAMNAME_Reserved = struct.unpack("<H", dir_stream.read(2))[0]  
781 - check_value('MODULESTREAMNAME_Reserved', 0x0032, MODULESTREAMNAME_Reserved)  
782 - MODULESTREAMNAME_SizeOfStreamNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]  
783 - MODULESTREAMNAME_StreamNameUnicode = dir_stream.read(MODULESTREAMNAME_SizeOfStreamNameUnicode)  
784 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
785 - if section_id == 0x001C:  
786 - MODULEDOCSTRING_Id = section_id  
787 - check_value('MODULEDOCSTRING_Id', 0x001C, MODULEDOCSTRING_Id)  
788 - MODULEDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0]  
789 - MODULEDOCSTRING_DocString = dir_stream.read(MODULEDOCSTRING_SizeOfDocString)  
790 - MODULEDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0]  
791 - check_value('MODULEDOCSTRING_Reserved', 0x0048, MODULEDOCSTRING_Reserved)  
792 - MODULEDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0]  
793 - MODULEDOCSTRING_DocStringUnicode = dir_stream.read(MODULEDOCSTRING_SizeOfDocStringUnicode)  
794 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
795 - if section_id == 0x0031:  
796 - MODULEOFFSET_Id = section_id  
797 - check_value('MODULEOFFSET_Id', 0x0031, MODULEOFFSET_Id)  
798 - MODULEOFFSET_Size = struct.unpack("<L", dir_stream.read(4))[0]  
799 - check_value('MODULEOFFSET_Size', 0x0004, MODULEOFFSET_Size)  
800 - MODULEOFFSET_TextOffset = struct.unpack("<L", dir_stream.read(4))[0]  
801 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
802 - if section_id == 0x001E:  
803 - MODULEHELPCONTEXT_Id = section_id  
804 - check_value('MODULEHELPCONTEXT_Id', 0x001E, MODULEHELPCONTEXT_Id)  
805 - MODULEHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0]  
806 - check_value('MODULEHELPCONTEXT_Size', 0x0004, MODULEHELPCONTEXT_Size)  
807 - MODULEHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0]  
808 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
809 - if section_id == 0x002C:  
810 - MODULECOOKIE_Id = section_id  
811 - check_value('MODULECOOKIE_Id', 0x002C, MODULECOOKIE_Id)  
812 - MODULECOOKIE_Size = struct.unpack("<L", dir_stream.read(4))[0]  
813 - check_value('MODULECOOKIE_Size', 0x0002, MODULECOOKIE_Size)  
814 - MODULECOOKIE_Cookie = struct.unpack("<H", dir_stream.read(2))[0]  
815 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
816 - if section_id == 0x0021 or section_id == 0x0022:  
817 - MODULETYPE_Id = section_id  
818 - MODULETYPE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]  
819 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
820 - if section_id == 0x0025:  
821 - MODULEREADONLY_Id = section_id  
822 - check_value('MODULEREADONLY_Id', 0x0025, MODULEREADONLY_Id)  
823 - MODULEREADONLY_Reserved = struct.unpack("<L", dir_stream.read(4))[0]  
824 - check_value('MODULEREADONLY_Reserved', 0x0000, MODULEREADONLY_Reserved)  
825 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
826 - if section_id == 0x0028:  
827 - MODULEPRIVATE_Id = section_id  
828 - check_value('MODULEPRIVATE_Id', 0x0028, MODULEPRIVATE_Id)  
829 - MODULEPRIVATE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]  
830 - check_value('MODULEPRIVATE_Reserved', 0x0000, MODULEPRIVATE_Reserved)  
831 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
832 - if section_id == 0x002B: # TERMINATOR  
833 - MODULE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]  
834 - check_value('MODULE_Reserved', 0x0000, MODULE_Reserved)  
835 - section_id = None  
836 - if section_id != None:  
837 - logging.warning('unknown or invalid module section id {0:04X}'.format(section_id))  
838 -  
839 - logging.debug('Project CodePage = %d' % PROJECTCODEPAGE_CodePage)  
840 - vba_codec = 'cp%d' % PROJECTCODEPAGE_CodePage  
841 - logging.debug("ModuleName = {0}".format(MODULENAME_ModuleName))  
842 - logging.debug("StreamName = {0}".format(repr(MODULESTREAMNAME_StreamName)))  
843 - streamname_unicode = MODULESTREAMNAME_StreamName.decode(vba_codec)  
844 - logging.debug("StreamName.decode('%s') = %s" % (vba_codec, repr(streamname_unicode)))  
845 - logging.debug("StreamNameUnicode = {0}".format(repr(MODULESTREAMNAME_StreamNameUnicode)))  
846 - logging.debug("TextOffset = {0}".format(MODULEOFFSET_TextOffset))  
847 -  
848 - code_path = vba_root + u'VBA/' + streamname_unicode  
849 - #TODO: test if stream exists  
850 - logging.debug('opening VBA code stream %s' % repr(code_path))  
851 - code_data = ole.openstream(code_path).read()  
852 - logging.debug("length of code_data = {0}".format(len(code_data)))  
853 - logging.debug("offset of code_data = {0}".format(MODULEOFFSET_TextOffset))  
854 - code_data = code_data[MODULEOFFSET_TextOffset:]  
855 - if len(code_data) > 0:  
856 - code_data = decompress_stream(code_data)  
857 - # case-insensitive search in the code_modules dict to find the file extension:  
858 - filext = code_modules.get(MODULENAME_ModuleName.lower(), 'bin')  
859 - filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext)  
860 - #TODO: also yield the codepage so that callers can decode it properly  
861 - yield (code_path, filename, code_data)  
862 - # print '-'*79  
863 - # print filename  
864 - # print ''  
865 - # print code_data  
866 - # print ''  
867 - logging.debug('extracted file {0}'.format(filename))  
868 - else:  
869 - logging.warning("module stream {0} has code data length 0".format(MODULESTREAMNAME_StreamName))  
870 - return  
871 -  
872 -  
def filter_vba(vba_code):
    """
    Strip the leading "Attribute VB_" lines from VBA source code.

    MS Office adds these lines automatically and the VBA Editor does not
    display them. Only use this when showing source code to a human analyst.

    Note: a line containing a colon is never stripped (and ends the filtering),
    because a colon could be used to hide malicious instructions on that line.
    The remaining lines are joined with '\\n', whatever the original line
    endings were.

    :param vba_code: str, VBA source code
    :return: str, filtered VBA source code
    """
    lines = vba_code.splitlines()
    # index of the first line to keep; defaults to "nothing kept" when every
    # line is a plain attribute line
    first_kept = len(lines)
    for index, text in enumerate(lines):
        if ':' in text or not text.startswith("Attribute VB_"):
            first_kept = index
            break
    #TODO: also remove empty lines?
    return '\n'.join(lines[first_kept:])
895 -  
896 -  
def detect_autoexec(vba_code, obfuscation=None):
    """
    Look for keywords of macros that run automatically when triggered by
    specific actions (e.g. when a document is opened or closed).

    Each keyword from AUTOEXEC_KEYWORDS is searched case-insensitively,
    delimited by word boundaries.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword, description)
    """
    #TODO: merge code with detect_suspicious
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    matches = []
    for description, keywords in AUTOEXEC_KEYWORDS.items():
        for keyword in keywords:
            #TODO: if keyword is already a compiled regex, use it as-is
            # the keyword is inserted as-is into the pattern (not escaped)
            pattern = r'(?i)\b' + keyword + r'\b'
            if re.search(pattern, vba_code) is None:
                continue
            matches.append((keyword, description + suffix))
    return matches
922 -  
923 -  
def detect_suspicious(vba_code, obfuscation=None):
    """
    Look for suspicious keywords corresponding to potential malware behaviour.

    Each keyword from SUSPICIOUS_KEYWORDS is searched case-insensitively,
    delimited by word boundaries.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword, description)
    """
    suffix = ''
    if obfuscation:
        suffix = ' (obfuscation: %s)' % obfuscation
    findings = []
    for description, keywords in SUSPICIOUS_KEYWORDS.items():
        # keep every keyword of this category found as a whole word:
        findings.extend(
            (keyword, description + suffix)
            for keyword in keywords
            if re.search(r'(?i)\b' + keyword + r'\b', vba_code)
        )
    return findings
946 -  
947 -  
def detect_patterns(vba_code, obfuscation=None):
    """
    Search the VBA code for the patterns listed in RE_PATTERNS (such as IP
    addresses, URLs, e-mail addresses, executable file names, etc).

    A given matched value is reported only once, for the first pattern that
    matches it.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to the pattern type
    :return: list of str tuples (pattern type, value)
    """
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    seen = set()
    hits = []
    for pattern_type, pattern_re in RE_PATTERNS:
        for match in pattern_re.finditer(vba_code):
            text = match.group()
            if text in seen:
                # already reported
                continue
            hits.append((pattern_type + suffix, text))
            seen.add(text)
    return hits
968 -  
969 -  
def detect_hex_strings(vba_code):
    """
    Extract the strings encoded in hexadecimal found in the VBA code, as
    matched by re_hex_string. Each distinct string is returned once.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    seen = set()
    pairs = []
    for match in re_hex_string.finditer(vba_code):
        encoded = match.group()
        if encoded in seen:
            continue
        pairs.append((encoded, binascii.unhexlify(encoded)))
        seen.add(encoded)
    return pairs
986 -  
987 -  
def detect_base64_strings(vba_code):
    """
    Detect if the VBA code contains strings encoded in base64.

    Candidates are taken from re_base64_string matches, with surrounding
    double quotes removed. A candidate is ignored when re_nothex_check does
    not match it (i.e. it is just a hex string), when it is listed in
    BASE64_WHITELIST (case-insensitive), or when it cannot be decoded.
    Each distinct string is returned once.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    #TODO: avoid matching simple hex strings as base64?
    results = []
    found = set()
    for match in re_base64_string.finditer(vba_code):
        # extract the base64 string without quotes:
        value = match.group().strip('"')
        # check it is not just a hex string:
        if not re_nothex_check.search(value):
            continue
        # only keep new values and not in the whitelist:
        if value in found or value.lower() in BASE64_WHITELIST:
            continue
        try:
            decoded = base64.b64decode(value)
        except (TypeError, ValueError):
            # Decoding failed, so it is likely not a base64-encoded string.
            # b64decode raises TypeError on Python 2 and binascii.Error
            # (a ValueError subclass) on Python 3. Only these are caught, so
            # that KeyboardInterrupt/SystemExit are no longer swallowed.
            continue
        results.append((value, decoded))
        found.add(value)
    return results
1014 -  
1015 -  
def detect_dridex_strings(vba_code):
    """
    Detect if the VBA code contains strings obfuscated with a specific algorithm found in Dridex samples.

    Candidates are taken from re_dridex_string matches, with their first and
    last characters removed. A candidate is ignored when re_nothex_check does
    not match it, or when the decoder fails on it. Each distinct string is
    returned once.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    # imported here so that the third-party decoder is only loaded when needed
    from thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode
    results = []
    found = set()
    for match in re_dridex_string.finditer(vba_code):
        # drop the first and last characters of the match
        # (presumably the surrounding quotes; depends on re_dridex_string)
        value = match.group()[1:-1]
        # check it is not just a hex string:
        if not re_nothex_check.search(value):
            continue
        if value in found:
            continue
        try:
            decoded = DridexUrlDecode(value)
        except Exception:
            # If the decoder fails, it is likely not a dridex-encoded string.
            # The decoder is third-party code, so any Exception is treated as
            # a decoding failure, but KeyboardInterrupt/SystemExit are no
            # longer swallowed as they were with a bare "except:".
            continue
        results.append((value, decoded))
        found.add(value)
    return results
1040 -  
1041 -  
class VBA_Scanner(object):
    """
    Class to scan the source code of a VBA module to find obfuscated strings,
    suspicious keywords, IOCs, auto-executable macros, etc.
    """

    def __init__(self, vba_code):
        """
        VBA_Scanner constructor

        :param vba_code: str, VBA source code to be analyzed
        """
        self.code = vba_code
        # decoded strings, each one preceded by a newline, rebuilt by scan():
        self.code_hex = ''
        self.code_hex_rev = ''
        self.code_rev_hex = ''
        self.code_base64 = ''
        self.code_dridex = ''

    def scan(self, include_decoded_strings=False):
        """
        Analyze the provided VBA code to detect suspicious keywords,
        auto-executable macros, IOC patterns, obfuscation patterns
        such as hex-encoded strings.

        The detailed findings are also stored on the instance (hex_strings,
        base64_strings, dridex_strings, autoexec_keywords, suspicious_keywords,
        iocs). All of them, as well as the decoded code buffers, are rebuilt
        from scratch on every call, so scan() may safely be called several times.

        :param include_decoded_strings: bool, if True, all encoded strings will be included with their decoded content.
        :return: list of tuples (type, keyword, description)
        (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex string')
        """
        # First, detect and extract hex-encoded strings:
        self.hex_strings = detect_hex_strings(self.code)
        # detect if the code contains StrReverse:
        self.strReverse = 'strreverse' in self.code.lower()
        # Then gather the decoded strings, to detect obfuscated IOCs and keywords.
        # The buffers are rebuilt (not appended to), otherwise each new call to
        # scan() or scan_summary() would duplicate their content.
        self.code_hex = ''.join(
            '\n' + decoded for _encoded, decoded in self.hex_strings)
        self.code_hex_rev = ''
        self.code_rev_hex = ''
        # if the code contains "StrReverse", also gather the hex strings in reverse order:
        if self.strReverse:
            # StrReverse after hex decoding:
            self.code_hex_rev = ''.join(
                '\n' + decoded[::-1] for _encoded, decoded in self.hex_strings)
            # StrReverse before hex decoding:
            self.code_rev_hex = ''.join(
                '\n' + binascii.unhexlify(encoded[::-1])
                for encoded, _decoded in self.hex_strings)
            #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/
        #TODO: also append the full code reversed if StrReverse? (risk of false positives?)
        # Detect Base64-encoded strings
        self.base64_strings = detect_base64_strings(self.code)
        self.code_base64 = ''.join(
            '\n' + decoded for _encoded, decoded in self.base64_strings)
        # Detect Dridex-encoded strings
        self.dridex_strings = detect_dridex_strings(self.code)
        self.code_dridex = ''.join(
            '\n' + decoded for _encoded, decoded in self.dridex_strings)
        results = []
        self.autoexec_keywords = []
        self.suspicious_keywords = []
        self.iocs = []

        # run every detection on the original code and on each decoded buffer:
        for code, obfuscation in (
                (self.code, None),
                (self.code_hex, 'Hex'),
                (self.code_hex_rev, 'Hex+StrReverse'),
                (self.code_rev_hex, 'StrReverse+Hex'),
                (self.code_base64, 'Base64'),
                (self.code_dridex, 'Dridex'),
        ):
            self.autoexec_keywords += detect_autoexec(code, obfuscation)
            self.suspicious_keywords += detect_suspicious(code, obfuscation)
            self.iocs += detect_patterns(code, obfuscation)

        # If hex-encoded strings were discovered, add an item to suspicious keywords:
        if self.hex_strings:
            self.suspicious_keywords.append(('Hex Strings',
                'Hex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.base64_strings:
            self.suspicious_keywords.append(('Base64 Strings',
                'Base64-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.dridex_strings:
            self.suspicious_keywords.append(('Dridex Strings',
                'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        for keyword, description in self.autoexec_keywords:
            results.append(('AutoExec', keyword, description))
        for keyword, description in self.suspicious_keywords:
            results.append(('Suspicious', keyword, description))
        for pattern_type, value in self.iocs:
            results.append(('IOC', value, pattern_type))
        if include_decoded_strings:
            for encoded, decoded in self.hex_strings:
                results.append(('Hex String', repr(decoded), encoded))
            for encoded, decoded in self.base64_strings:
                results.append(('Base64 String', repr(decoded), encoded))
            for encoded, decoded in self.dridex_strings:
                results.append(('Dridex string', repr(decoded), encoded))
        return results

    def scan_summary(self):
        """
        Analyze the provided VBA code to detect suspicious keywords,
        auto-executable macros, IOC patterns, obfuscation patterns
        such as hex-encoded strings.

        This runs a full scan() each time it is called.

        :return: tuple with the number of items found for each category:
            (autoexec, suspicious, IOCs, hex, base64, dridex)
        """
        self.scan()
        return (len(self.autoexec_keywords), len(self.suspicious_keywords),
                len(self.iocs), len(self.hex_strings), len(self.base64_strings),
                len(self.dridex_strings))
1151 -  
1152 -  
1153 -  
def scan_vba(vba_code, include_decoded_strings=False):
    """
    Analyze the provided VBA code to detect suspicious keywords,
    auto-executable macros, IOC patterns, obfuscation patterns
    such as hex-encoded strings.
    (shortcut for VBA_Scanner(vba_code).scan())

    :param vba_code: str, VBA source code to be analyzed
    :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content.
        Defaults to False, like VBA_Scanner.scan().
    :return: list of tuples (type, keyword, description)
    (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex string')
    """
    return VBA_Scanner(vba_code).scan(include_decoded_strings)
1167 -  
1168 -  
1169 -#=== CLASSES =================================================================  
1170 -  
1171 -class VBA_Parser(object):  
1172 - """  
1173 - Class to parse MS Office files, to detect VBA macros and extract VBA source code  
1174 - Supported file formats:  
1175 - - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)  
1176 - - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)  
1177 - - PowerPoint 2007+ (.pptm, .ppsm)  
1178 - """  
1179 -  
def __init__(self, filename, data=None):
    """
    Constructor for VBA_Parser

    The container type is detected from the content: OLE file, ZIP/OpenXML
    archive (each member starting with the OLE magic is parsed as a sub-file),
    or Word 2003 XML (each binData element is decoded and parsed as a sub-file).

    :param filename: filename or path of file to parse, or file-like object

    :param data: None or bytes str, if None the file will be read from disk (or from the file-like object).
        If data is provided as a bytes string, it will be parsed as the content of the file in memory,
        and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb').

    :raises TypeError: if the file is neither an OLE file, a ZIP archive nor a Word 2003 XML file.
    """
    #TODO: filename should only be a string, data should be used for the file-like object
    #TODO: filename should be mandatory, optional data is a string or file-like object
    #TODO: also support olefile and zipfile as input
    if data is None:
        # open file from disk:
        _file = filename
    else:
        # file already read in memory, make it a file-like object for zipfile:
        _file = cStringIO.StringIO(data)
    self.ole_file = None
    self.ole_subfiles = []
    self.filename = filename
    self.type = None
    self.vba_projects = None
    if olefile.isOleFile(_file):
        # This looks like an OLE file
        logging.info('Parsing OLE file %s' % self.filename)
        # Open and parse the OLE file, using unicode for path names:
        self.ole_file = olefile.OleFileIO(_file, path_encoding=None)
        self.type = TYPE_OLE
        #TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet
    elif zipfile.is_zipfile(_file):
        # This looks like a zip file, need to look for vbaProject.bin inside
        # It can be any OLE file inside the archive
        #...because vbaProject.bin can be renamed:
        # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
        logging.info('Opening ZIP/OpenXML file %s' % self.filename)
        self.type = TYPE_OpenXML
        z = zipfile.ZipFile(_file)
        try:
            #TODO: check if this is actually an OpenXML file
            #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically?
            # check each file within the zip if it is an OLE file, by reading its magic:
            for subfile in z.namelist():
                magic = z.open(subfile).read(len(olefile.MAGIC))
                if magic != olefile.MAGIC:
                    continue
                logging.debug('Opening OLE file %s within zip' % subfile)
                ole_data = z.open(subfile).read()
                try:
                    self.ole_subfiles.append(VBA_Parser(filename=subfile, data=ole_data))
                except Exception:
                    # best effort: skip sub-files that cannot be parsed, but do
                    # not swallow KeyboardInterrupt/SystemExit
                    logging.debug('%s is not a valid OLE file' % subfile)
        finally:
            # always release the archive, even if reading a member failed
            z.close()
    else:
        # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML,
        # or a plain text file containing VBA code
        if data is None:
            # close the file handle as soon as the content is read
            with open(filename, 'rb') as f:
                data = f.read()
        # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
        if 'http://schemas.microsoft.com/office/word/2003/wordml' in data:
            logging.info('Opening Word 2003 XML file %s' % self.filename)
            self.type = TYPE_Word2003_XML
            # parse the XML content
            et = ET.fromstring(data)
            # find all the binData elements:
            for bindata in et.getiterator(TAG_BINDATA):
                # the binData content is an OLE container for the VBA project, compressed
                # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
                # get the filename:
                fname = bindata.get(ATTR_NAME, 'noname.mso')
                # decode the base64 activemime
                activemime = binascii.a2b_base64(bindata.text)
                # decompress the zlib data starting at offset 0x32, which is the OLE container:
                ole_data = zlib.decompress(activemime[0x32:])
                try:
                    self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))
                except Exception:
                    # best effort: skip sub-files that cannot be parsed, but do
                    # not swallow KeyboardInterrupt/SystemExit
                    logging.debug('%s is not a valid OLE file' % fname)
        #TODO: handle exceptions
        #TODO: Excel 2003 XML
        #TODO: plain text VBA file
        else:
            msg = '%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % self.filename
            logging.error(msg)
            raise TypeError(msg)
1275 -  
def find_vba_projects (self):
    """
    Finds all the VBA projects stored in an OLE file.

    Return None if this parser has no top-level OLE file (self.ole_file is None,
    e.g. OpenXML).
    Return a list of tuples (vba_root, project_path, dir_path) for each VBA project.
    vba_root is the path of the root OLE storage containing the VBA project,
    including a trailing slash unless it is the root of the OLE file.
    project_path is the path of the OLE stream named "PROJECT" within the VBA project.
    dir_path is the path of the OLE stream named "VBA/dir" within the VBA project.

    If this function returns an empty list for one of the supported formats
    (i.e. Word, Excel, Powerpoint except Powerpoint 97-2003), then the
    file does not contain VBA macros.

    The result is cached in self.vba_projects, but only once the whole OLE
    file has been scanned: if the scan raises an exception, nothing is cached
    and the next call scans again instead of returning a partial list.

    :return: None if there is no top-level OLE file, otherwise the list of
             tuples (vba_root, project_path, dir_path) for each VBA project found
    """
    # if the file is not OLE but OpenXML, return None:
    if self.ole_file is None:
        return None

    # if this method has already been called, return previous result:
    if self.vba_projects is not None:
        return self.vba_projects

    # Find the VBA project root (different in MS Word, Excel, etc):
    # - Word 97-2003: Macros
    # - Excel 97-2003: _VBA_PROJECT_CUR
    # - PowerPoint 97-2003: not supported yet (different file structure)
    # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin.
    # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word
    # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word
    # - Visio 2007: not supported yet (different file structure)

    # According to MS-OVBA section 2.2.1:
    # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream
    # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream
    # - all names are case-insensitive

    ole = self.ole_file

    def check_vba_stream(vba_root, stream_path):
        # Return the full path of vba_root+stream_path if it exists and is a
        # stream, None otherwise. Defined once, outside of the loop below.
        full_path = vba_root + stream_path
        if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM:
            logging.debug('Found %s stream: %s', stream_path, full_path)
            return full_path
        logging.debug('Missing %s stream, this is not a valid VBA project structure', stream_path)
        return None

    # collect results locally, the cache is only updated when the scan is complete:
    projects = []
    # Look for any storage containing those storage/streams:
    for storage in ole.listdir(streams=False, storages=True):
        # Look for a storage ending with "VBA":
        if storage[-1].upper() != 'VBA':
            continue
        logging.debug('Found VBA storage: %s', '/'.join(storage))
        vba_root = '/'.join(storage[:-1])
        # Add a trailing slash to vba_root, unless it is the root of the OLE file:
        # (used later to append all the child streams/storages)
        if vba_root != '':
            vba_root += '/'
        logging.debug('Checking vba_root="%s"', vba_root)

        # Check if the VBA root storage also contains a PROJECT stream:
        project_path = check_vba_stream(vba_root, 'PROJECT')
        if not project_path:
            continue
        # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream:
        if not check_vba_stream(vba_root, 'VBA/_VBA_PROJECT'):
            continue
        # Check if the VBA root storage also contains a VBA/dir stream:
        dir_path = check_vba_stream(vba_root, 'VBA/dir')
        if not dir_path:
            continue
        # Now we are pretty sure it is a VBA project structure
        logging.debug('VBA root storage: "%s"', vba_root)
        # append the results to the list as a tuple for later use:
        projects.append((vba_root, project_path, dir_path))

    self.vba_projects = projects
    return self.vba_projects
1354 -  
def detect_vba_macros(self):
    """
    Detect the potential presence of VBA macros in the file, by checking
    if it contains VBA projects. Both OLE and OpenXML files are supported.

    Important: for now, results are accurate only for Word, Excel and PowerPoint
    EXCEPT Powerpoint 97-2003, which has a different structure for VBA.

    Note: this method does NOT attempt to check the actual presence or validity
    of VBA macro source code, so there might be false positives.
    It may also detect VBA macros in files embedded within the main file,
    for example an Excel workbook with macros embedded into a Word
    document without macros may be detected, without distinction.

    :return: bool, True if at least one VBA project has been found, False otherwise
    """
    #TODO: return None or raise exception if format not supported like PPT 97-2003
    #TODO: return the number of VBA projects found instead of True/False?
    # no top-level OLE file (e.g. OpenXML): check all the OLE subfiles,
    # stopping at the first one which contains a VBA project:
    if self.ole_file is None:
        return any(ole_subfile.detect_vba_macros() for ole_subfile in self.ole_subfiles)
    # otherwise it's an OLE file: macros are present if the list of VBA
    # projects is not empty
    return bool(self.find_vba_projects())
1385 -  
1386 -  
def extract_macros (self):
    """
    Generator over the decompressed source code of every VBA macro in the file.

    Each item is a tuple (filename, stream_path, vba_filename, vba_code).
    For an OLE file, filename is self.filename.
    When there is no top-level OLE file (e.g. OpenXML), the items come from
    the parsers stored in self.ole_subfiles, so filename is the name of the
    OLE subfile containing the macros, e.g. word/vbaProject.bin.
    """
    if self.ole_file is not None:
        # make sure self.vba_projects has been filled in before reading it:
        self.find_vba_projects()
        for root, project_stream, dir_stream in self.vba_projects:
            # extract all VBA macros from that VBA root storage:
            extracted = _extract_vba(self.ole_file, root, project_stream, dir_stream)
            for stream_path, vba_filename, vba_code in extracted:
                yield (self.filename, stream_path, vba_filename, vba_code)
        return
    # no OLE container at this level: delegate to each OLE subfile in turn
    for sub_parser in self.ole_subfiles:
        for macro in sub_parser.extract_macros():
            yield macro
1406 -  
1407 -  
def close(self):
    """
    Release the files held open by this parser: the top-level OLE file when
    there is one, otherwise every parser in self.ole_subfiles.
    Call it after usage when the application opens many files.
    """
    if self.ole_file is not None:
        self.ole_file.close()
        return
    for sub_parser in self.ole_subfiles:
        sub_parser.close()
1418 -  
1419 -  
def print_analysis(vba_code, show_decoded_strings=False):
    """
    Analyze the provided VBA code, and print the results in a table

    The code is scanned with scan_vba; each result (type, keyword, description)
    becomes one row of the table. A fixed message is printed instead when
    the scan returns nothing.

    :param vba_code: str, VBA source code to be analyzed
    :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
    :return: None
    """
    results = scan_vba(vba_code, show_decoded_strings)
    if results:
        t = prettytable.PrettyTable(('Type', 'Keyword', 'Description'))
        t.align = 'l'
        t.max_width['Type'] = 10
        t.max_width['Keyword'] = 20
        t.max_width['Description'] = 39
        for kw_type, keyword, description in results:
            t.add_row((kw_type, keyword, description))
        # single-argument print(...) gives the same output with the Python 2
        # print statement and with the Python 3 print function:
        print(t)
    else:
        print('No suspicious keyword or IOC found.')
1440 -  
1441 -  
1442 -  
def process_file (container, filename, data, show_decoded_strings=False):
    """
    Process a single file in detailed mode: print its type, then the filtered
    source code and the analysis table of each VBA macro found.

    Any error raised while parsing or analysing the file is printed with its
    stack trace and does not stop the caller. KeyboardInterrupt and SystemExit
    are not intercepted, so that the user can still abort a long run.

    :param container: str, path and filename of container if the file is within
    a zip archive, None otherwise.
    :param filename: str, path and filename of file on disk, or within the container.
    :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
    :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
    """
    #TODO: replace print by writing to a provided output file (sys.stdout by default)
    if container:
        display_filename = '%s in %s' % (filename, container)
    else:
        display_filename = filename
    # NOTE: every print below takes a single argument, so that the output is
    # the same with the Python 2 print statement and the Python 3 function.
    print('='*79)
    print('FILE: %s' % (display_filename,))
    try:
        #TODO: handle olefile errors, when an OLE file is malformed
        vba = VBA_Parser(filename, data)
        try:
            print('Type: %s' % (vba.type,))
            if vba.detect_vba_macros():
                for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
                    # hide attribute lines:
                    #TODO: option to disable attribute filtering
                    vba_code_filtered = filter_vba(vba_code)
                    print('-'*79)
                    print('VBA MACRO %s ' % vba_filename)
                    print('in file: %s - OLE stream: %s' % (subfilename, repr(stream_path)))
                    print('- '*39)
                    # detect empty macros:
                    if vba_code_filtered.strip() == '':
                        print('(empty macro)')
                    else:
                        print(vba_code_filtered)
                        print('- '*39)
                        print('ANALYSIS:')
                        # analyse the whole code, filtered to avoid false positives:
                        print_analysis(vba_code_filtered, show_decoded_strings)
            else:
                print('No VBA macros found.')
        finally:
            # always release the files opened by the parser, even on error:
            vba.close()
    except Exception:
        #TODO: print more info if debug mode
        # display the exception with full stack trace for debugging, but do not stop:
        traceback.print_exc()
    print('')
1492 -  
1493 -  
def process_file_triage (container, filename, data):
    """
    Process a single file in triage mode: print one summary line made of a
    flags field followed by the filename, and an error message if any.

    The flags field starts with the file type ('OLE:', 'OpX:' for OpenXML,
    'XML:' for Word 2003 XML), followed by one character per indicator, or
    '-' when the indicator is absent: M=Macros, A=Auto-executable,
    S=Suspicious keywords, I=IOCs, H=Hex strings, B=Base64 strings,
    D=Dridex strings.
    The flags field is '?' when VBA_Parser raises TypeError (file format not
    supported), and '!ERROR' when any other error occurs.

    :param container: str, path and filename of container if the file is within
    a zip archive, None otherwise. (not used by this function)
    :param filename: str, path and filename of file on disk, or within the container.
    :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
    """
    #TODO: replace print by writing to a provided output file (sys.stdout by default)
    nb_macros = 0
    nb_autoexec = 0
    nb_suspicious = 0
    nb_iocs = 0
    nb_hexstrings = 0
    nb_base64strings = 0
    nb_dridexstrings = 0
    message = ''
    try:
        #TODO: handle olefile errors, when an OLE file is malformed
        vba = VBA_Parser(filename, data)
        try:
            if vba.detect_vba_macros():
                for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
                    nb_macros += 1
                    if vba_code.strip() != '':
                        # analyse the whole code, filtered to avoid false positives:
                        scanner = VBA_Scanner(filter_vba(vba_code))
                        autoexec, suspicious, iocs, hexstrings, base64strings, dridex = scanner.scan_summary()
                        nb_autoexec += autoexec
                        nb_suspicious += suspicious
                        nb_iocs += iocs
                        nb_hexstrings += hexstrings
                        nb_base64strings += base64strings
                        nb_dridexstrings += dridex
            file_type = vba.type
        finally:
            # triage mode usually scans many files: always release the files
            # opened by the parser, even when an error occurred
            vba.close()
        if file_type == TYPE_OLE:
            flags = 'OLE:'
        elif file_type == TYPE_OpenXML:
            flags = 'OpX:'
        elif file_type == TYPE_Word2003_XML:
            flags = 'XML:'
        else:
            # unexpected type: keep the summary line instead of failing on an
            # unbound variable
            flags = '???:'
        macros = autoexec = suspicious = iocs = hexstrings = base64obf = dridex = '-'
        if nb_macros: macros = 'M'
        if nb_autoexec: autoexec = 'A'
        if nb_suspicious: suspicious = 'S'
        if nb_iocs: iocs = 'I'
        if nb_hexstrings: hexstrings = 'H'
        if nb_base64strings: base64obf = 'B'
        if nb_dridexstrings: dridex = 'D'
        flags += '%s%s%s%s%s%s%s' % (macros, autoexec, suspicious, iocs, hexstrings,
            base64obf, dridex)
    except TypeError:
        # file type not OLE nor OpenXML
        flags = '?'
        message = 'File format not supported'
    except Exception:
        # another error occurred (KeyboardInterrupt and SystemExit are not
        # intercepted, so that the user can still abort a long run)
        #TODO: print more info if debug mode
        #TODO: distinguish real errors from incorrect file types
        flags = '!ERROR'
        # sys.exc_info() replaces the deprecated sys.exc_value:
        message = sys.exc_info()[1]
    line = '%-11s %s' % (flags, filename)
    if message:
        line += ' - %s' % (message,)
    print(line)
1582 -  
def main_triage_quick():
    """
    Placeholder: not implemented, does nothing and returns None.
    """
    return None
1585 -  
1586 -#=== MAIN =====================================================================  
1587 -  
def main():
    """
    Main function, called when olevba is run from the command line

    Parses the command line options, then either analyzes a VBA source file
    given with -i, or processes each file found by xglob.iter_files:
    - detailed mode (-d without -t): full output with process_file
    - triage mode (default, or -t): one summary line per file with
      process_file_triage
    When neither -t nor -d is specified and exactly one file was processed,
    the detailed output for that file is printed after its triage line.
    """
    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)
    #TODO: options -o/--outfile (output file) and -c/--csv (export results to a CSV file)
    parser.add_option("-r", action="store_true", dest="recursive",
        help='find files recursively in subdirectories.')
    parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
        help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)')
    parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
        help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
    parser.add_option("-t", action="store_true", dest="triage_mode",
        help='triage mode, display results as a summary table (default for multiple files)')
    parser.add_option("-d", action="store_true", dest="detailed_mode",
        help='detailed mode, display full results (default for single file)')
    parser.add_option("-i", "--input", dest='input', type='str', default=None,
        help='input file containing VBA source code to be analyzed (no parsing)')
    parser.add_option("--decode", action="store_true", dest="show_decoded_strings",
        help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex).')

    (options, args) = parser.parse_args()

    # NOTE: every print below takes a single argument, so that the output is
    # the same with the Python 2 print statement and the Python 3 function.

    # Print help if no arguments are passed
    if len(args) == 0 and not options.input:
        print(__doc__)
        parser.print_help()
        sys.exit()

    # print banner with version
    print('olevba %s - http://decalage.info/python/oletools' % __version__)

    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) #INFO)
    # For now, all logging is disabled:
    logging.disable(logging.CRITICAL)

    if options.input:
        # input file provided with VBA source code to be analyzed directly:
        print('Analysis of VBA source code from %s:' % options.input)
        input_file = open(options.input)
        try:
            vba_code = input_file.read()
        finally:
            # do not leave the input file open while the analysis runs
            input_file.close()
        print_analysis(vba_code, show_decoded_strings=options.show_decoded_strings)
        sys.exit()

    # triage output is used unless detailed mode alone was requested:
    triage_output = not options.detailed_mode or options.triage_mode
    if triage_output:
        print('%-11s %-65s' % ('Flags', 'Filename'))
        print('%-11s %-65s' % ('-'*11, '-'*65))
    previous_container = None
    count = 0
    # last file actually processed. The loop variables cannot be used for
    # that after the loop, because they may hold a skipped directory entry.
    last_processed = None
    for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
        zip_password=options.zip_password, zip_fname=options.zip_fname):
        # ignore directory names stored in zip files:
        if container and filename.endswith('/'):
            continue
        if not triage_output:
            # fully detailed output
            process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings)
        else:
            # print container name when it changes:
            if container != previous_container:
                if container is not None:
                    print('\nFiles in %s:' % container)
                previous_container = container
            # summarized output for triage:
            process_file_triage(container, filename, data)
        count += 1
        last_processed = (container, filename, data)
    if triage_output:
        print('\n(Flags: OpX=OpenXML, XML=Word2003XML, M=Macros, A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, B=Base64 strings, D=Dridex strings, ?=Unknown)\n')

    if count == 1 and not options.triage_mode and not options.detailed_mode:
        # if options -t and -d were not specified and it's a single file, print details:
        #TODO: avoid doing the analysis twice by storing results
        container, filename, data = last_processed
        process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings)
1667 -  
1668 -if __name__ == '__main__':  
1669 - main()  
1670 - 1 +#!/usr/bin/env python
  2 +"""
  3 +olevba.py
  4 +
  5 +olevba is a script to parse OLE and OpenXML files such as MS Office documents
  6 +(e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate
  7 +and analyze malicious macros.
  8 +
  9 +Supported formats:
  10 +- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
  11 +- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
  12 +- PowerPoint 2007+ (.pptm, .ppsm)
  13 +- Word 2003 XML (.xml)
  14 +
  15 +Author: Philippe Lagadec - http://www.decalage.info
  16 +License: BSD, see source code or documentation
  17 +
  18 +olevba is part of the python-oletools package:
  19 +http://www.decalage.info/python/oletools
  20 +
  21 +olevba is based on source code from officeparser by John William Davison
  22 +https://github.com/unixfreak0037/officeparser
  23 +"""
  24 +
  25 +#=== LICENSE ==================================================================
  26 +
  27 +# olevba is copyright (c) 2014-2015 Philippe Lagadec (http://www.decalage.info)
  28 +# All rights reserved.
  29 +#
  30 +# Redistribution and use in source and binary forms, with or without modification,
  31 +# are permitted provided that the following conditions are met:
  32 +#
  33 +# * Redistributions of source code must retain the above copyright notice, this
  34 +# list of conditions and the following disclaimer.
  35 +# * Redistributions in binary form must reproduce the above copyright notice,
  36 +# this list of conditions and the following disclaimer in the documentation
  37 +# and/or other materials provided with the distribution.
  38 +#
  39 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  40 +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  41 +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  42 +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  43 +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  44 +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  45 +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  46 +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  47 +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  48 +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  49 +
  50 +
  51 +# olevba contains modified source code from the officeparser project, published
  52 +# under the following MIT License (MIT):
  53 +#
  54 +# officeparser is copyright (c) 2014 John William Davison
  55 +#
  56 +# Permission is hereby granted, free of charge, to any person obtaining a copy
  57 +# of this software and associated documentation files (the "Software"), to deal
  58 +# in the Software without restriction, including without limitation the rights
  59 +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  60 +# copies of the Software, and to permit persons to whom the Software is
  61 +# furnished to do so, subject to the following conditions:
  62 +#
  63 +# The above copyright notice and this permission notice shall be included in all
  64 +# copies or substantial portions of the Software.
  65 +#
  66 +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  67 +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  68 +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  69 +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  70 +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  71 +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  72 +# SOFTWARE.
  73 +
  74 +#------------------------------------------------------------------------------
  75 +# CHANGELOG:
  76 +# 2014-08-05 v0.01 PL: - first version based on officeparser code
  77 +# 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser
  78 +# 2014-08-15 PL: - fixed incorrect value check in PROJECTHELPFILEPATH Record
  79 +# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats
  80 +# and to find the VBA project root anywhere in the file
  81 +# 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL
  82 +# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API
  83 +# - added detect_vba_macros
  84 +# 2014-12-10 v0.06 PL: - hide first lines with VB attributes
  85 +# - detect auto-executable macros
  86 +# - ignore empty macros
  87 +# 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive
  88 +# 2014-12-15 v0.08 PL: - improved display for empty macros
  89 +# - added pattern extraction
  90 +# 2014-12-25 v0.09 PL: - added suspicious keywords detection
  91 +# 2014-12-27 v0.10 PL: - added OptionParser, main and process_file
  92 +# - uses xglob to scan several files with wildcards
  93 +# - option -r to recurse subdirectories
  94 +# - option -z to scan files in password-protected zips
  95 +# 2015-01-02 v0.11 PL: - improved filter_vba to detect colons
  96 +# 2015-01-03 v0.12 PL: - fixed detect_patterns to detect all patterns
  97 +# - process_file: improved display, shows container file
  98 +# - improved list of executable file extensions
  99 +# 2015-01-04 v0.13 PL: - added several suspicious keywords, improved display
  100 +# 2015-01-08 v0.14 PL: - added hex strings detection and decoding
  101 +# - fixed issue #2, decoding VBA stream names using
  102 +# specified codepage and unicode stream names
  103 +# 2015-01-11 v0.15 PL: - added new triage mode, options -t and -d
  104 +# 2015-01-16 v0.16 PL: - fix for issue #3 (exception when module name="text")
  105 +# - added several suspicious keywords
  106 +# - added option -i to analyze VBA source code directly
  107 +# 2015-01-17 v0.17 PL: - removed .com from the list of executable extensions
  108 +# - added scan_vba to run all detection algorithms
  109 +# - decoded hex strings are now also scanned + reversed
  110 +# 2015-01-23 v0.18 PL: - fixed issue #3, case-insensitive search in code_modules
  111 +# 2015-01-24 v0.19 PL: - improved the detection of IOCs obfuscated with hex
  112 +# strings and StrReverse
  113 +# 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded
  114 +# 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding
  115 +# - improved display, shows obfuscation name
  116 +# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename
  117 +# - added Base64 obfuscation decoding (contribution from
  118 +# @JamesHabben)
  119 +# 2015-02-03 v0.23 PL: - triage now uses VBA_Scanner results, shows Base64 and
  120 +# Dridex strings
  121 +# - exception handling in detect_base64_strings
  122 +# 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display
  123 +# - display exceptions with stack trace
  124 +# - added several suspicious keywords
  125 +# - improved Base64 detection and decoding
  126 +# - fixed triage mode not to scan attrib lines
  127 +# 2015-03-04 v0.25 PL: - added support for Word 2003 XML
  128 +
  129 +__version__ = '0.25'
  130 +
  131 +#------------------------------------------------------------------------------
  132 +# TODO:
  133 +# + do not use logging, but a provided logger (null logger by default)
  134 +# + setup logging (common with other oletools)
  135 +# + add xor bruteforcing like bbharvest
  136 +# + add chr() decoding
  137 +
  138 +# TODO later:
  139 +# + performance improvement: instead of searching each keyword separately,
  140 +# first split vba code into a list of words (per line), then check each
  141 +# word against a dict. (or put vba words into a set/dict?)
  142 +# + for regex, maybe combine them into a single re with named groups?
  143 +# + add Yara support, include sample rules? plugins like balbuzard?
  144 +# + add balbuzard support
  145 +# + output to file (replace print by file.write, sys.stdout by default)
  146 +# + look for VBA in embedded documents (e.g. Excel in Word)
  147 +# + support SRP streams (see Lenny's article + links and sample)
  148 +# - python 3.x support
  149 +# - add support for PowerPoint macros (see libclamav, libgsf), use oledump heuristic?
  150 +# - check VBA macros in Visio, Access, Project, etc
  151 +# - extract_macros: convert to a class, split long function into smaller methods
  152 +# - extract_macros: read bytes from stream file objects instead of strings
  153 +# - extract_macros: use combined struct.unpack instead of many calls
  154 +
  155 +#------------------------------------------------------------------------------
  156 +# REFERENCES:
  157 +# - [MS-OVBA]: Microsoft Office VBA File Format Structure
  158 +# http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx
  159 +# - officeparser: https://github.com/unixfreak0037/officeparser
  160 +
  161 +
  162 +#--- IMPORTS ------------------------------------------------------------------
  163 +
  164 +import sys, logging
  165 +import struct
  166 +import cStringIO
  167 +import math
  168 +import zipfile
  169 +import re
  170 +import optparse
  171 +import os.path
  172 +import binascii
  173 +import base64
  174 +import traceback
  175 +import zlib
  176 +
  177 +# import lxml or ElementTree for XML parsing:
  178 +try:
  179 + # lxml: best performance for XML processing
  180 + import lxml.etree as ET
  181 +except ImportError:
  182 + try:
  183 + # Python 2.5+: batteries included
  184 + import xml.etree.cElementTree as ET
  185 + except ImportError:
  186 + try:
  187 + # Python <2.5: standalone ElementTree install
  188 + import elementtree.cElementTree as ET
  189 + except ImportError:
  190 + raise ImportError, "lxml or ElementTree are not installed, "\
  191 + +"see http://codespeak.net/lxml "\
  192 + +"or http://effbot.org/zone/element-index.htm"
  193 +
  194 +import thirdparty.olefile as olefile
  195 +from thirdparty.prettytable import prettytable
  196 +from thirdparty.xglob import xglob
  197 +
  198 +#--- CONSTANTS ----------------------------------------------------------------
  199 +
# Container types: VBA_Parser stores one of these values in its "type"
# attribute once the file format has been identified.
TYPE_OLE = 'OLE'
TYPE_OpenXML = 'OpenXML'
TYPE_Word2003_XML = 'Word2003_XML'

# File extensions for VBA source code.
# NOTE(review): presumably used to name extracted modules, classes and forms;
# confirm against the extraction code.
MODULE_EXTENSION = "bas"
CLASS_EXTENSION = "cls"
FORM_EXTENSION = "frm"

# Namespaces and tags for Word2003 XML parsing:
NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}'
# the tag <w:binData w:name="editdata.mso"> contains the VBA macro code:
TAG_BINDATA = NS_W + 'binData'
# attribute of binData holding the name of the embedded file:
ATTR_NAME = NS_W + 'name'
  213 +
# Keywords to detect auto-executable macros
# Each key is the description of a trigger, each value is the tuple of
# keywords associated with that trigger.
AUTOEXEC_KEYWORDS = {
    # MS Word:
    'Runs when the Word document is opened':
        ('AutoExec', 'AutoOpen', 'Document_Open', 'DocumentOpen'),
    'Runs when the Word document is closed':
        ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose'),
    'Runs when the Word document is modified':
        ('DocumentChange',),
    'Runs when a new Word document is created':
        ('AutoNew', 'Document_New', 'NewDocument'),

    # MS Excel:
    'Runs when the Excel Workbook is opened':
        ('Auto_Open', 'Workbook_Open'),
    'Runs when the Excel Workbook is closed':
        # Workbook_BeforeClose is the handler of the Workbook.BeforeClose
        # event. Workbook_Close is kept for backward compatibility.
        ('Auto_Close', 'Workbook_Close', 'Workbook_BeforeClose'),

    #TODO: full list in MS specs??
}
  234 +
# Keywords that may be used by malware, grouped by the behaviour they hint at.
# Each key is a human-readable description, each value the tuple of keywords
# associated with that description.
# See VBA language reference: http://msdn.microsoft.com/en-us/library/office/jj692818%28v=office.15%29.aspx
#TODO: use regex to support variable whitespaces
SUSPICIOUS_KEYWORDS = {
    'May read system environment variables': ('Environ',),

    'May open a file': ('Open',),

    #TODO: regex to find Open+Write on same line
    'May write to a file (if combined with Open)': ('Write', 'Put', 'Output', 'Print #'),

    #TODO: regex to find Open+Binary on same line
    'May read or write a binary file (if combined with Open)': ('Binary',),

    # FileCopy: http://msdn.microsoft.com/en-us/library/office/gg264390%28v=office.15%29.aspx
    # CopyFile: http://msdn.microsoft.com/en-us/library/office/gg264089%28v=office.15%29.aspx
    'May copy a file': ('FileCopy', 'CopyFile'),

    'May delete a file': ('Kill',),

    # CreateTextFile: http://msdn.microsoft.com/en-us/library/office/gg264617%28v=office.15%29.aspx
    # ADODB.Stream sample: http://pastebin.com/Z4TMyuq6
    'May create a text file': ('CreateTextFile', 'ADODB.Stream', 'WriteText', 'SaveToFile'),

    # Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx
    # WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6
    'May run an executable file or a system command': (
        'Shell',
        'vbNormal',
        'vbNormalFocus',
        'vbHide',
        'vbMinimizedFocus',
        'vbMaximizedFocus',
        'vbNormalNoFocus',
        'vbMinimizedNoFocus',
        'WScript.Shell',
        'Run',
    ),

    'May hide the application': ('Application.Visible', 'ShowWindow', 'SW_HIDE'),

    'May create a directory': ('MkDir',),

    'May save the current workbook': ('ActiveWorkbook.SaveAs',),

    #TODO: confirm the actual effect
    'May change which directory contains files to open at startup': ('Application.AltStartupPath',),

    'May create an OLE object': ('CreateObject',),

    'May run an application (if combined with CreateObject)': ('Shell.Application',),

    'May enumerate application windows (if combined with Shell.Application object)': ('Windows', 'FindWindow'),

    #TODO: regex to find declare+lib on same line
    'May run code from a DLL': ('Lib',),

    #TODO: regex to find urlmon+URLDownloadToFileA on same line
    'May download files from the Internet': ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP'),

    # SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx
    'May control another application by simulating user keystrokes': ('SendKeys', 'AppActivate'),

    # CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx
    'May attempt to obfuscate malicious function calls': ('CallByName',),

    #TODO: regex to find several Chr*, not just one
    # Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx
    'May attempt to obfuscate specific strings': ('Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor'),
}
  296 +
# Regular Expression for a URL:
# http://en.wikipedia.org/wiki/Uniform_resource_locator
# http://www.w3.org/Addressing/URL/uri-spec.html
#TODO: also support username:password@server
#TODO: other protocols (file, gopher, wais, ...?)
# All fragments below are raw strings so that regex escapes such as \b reach
# the re module untouched.
SCHEME = r'\b(?:http|ftp)s?'
# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})'
DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + r')'
#TODO: IPv6 - see https://www.debuggex.com/
# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. [db8:0cec::99:123a]
NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'
IPv4 = r'(?:' + NUMBER_0_255 + r'\.){3}' + NUMBER_0_255
# IPv4 must come before the DNS name because it is more specific
SERVER = r'(?:' + IPv4 + r'|' + DNS_NAME + r')'
PORT = r'(?:\:[0-9]{1,5})?'
SERVER_PORT = SERVER + PORT
URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?' # [^\.\,\)\(\s"]
URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH
re_url = re.compile(URL_RE)


# Patterns to be extracted (IP addresses, URLs, etc)
# From patterns.py in balbuzard
# Each entry is a (description, compiled regex) pair.
RE_PATTERNS = (
    ('URL', re_url),
    ('IPv4 address', re.compile(IPv4)),
    # NOTE: the trailing word boundary must be a raw string: a plain '\b' is a
    # backspace character (0x08), which would prevent any e-mail address in
    # normal text from matching.
    ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@' + SERVER + r'\b')),
    # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)')),
    # Executable file name with known extensions (except .com which is present in many URLs, and .application):
    ("Executable file name", re.compile(r"(?i)\b\w+\.(EXE|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b")),
    # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
    #TODO: https://support.office.com/en-us/article/Blocked-attachments-in-Outlook-3811cddc-17c3-4279-a30c-060ba0207372#__attachment_file_types
    #('Hex string', re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')),
    )
  332 +
# Hexadecimal-encoded strings: at least four pairs of hex digits.
re_hex_string = re.compile(
    r'(?:[0-9A-Fa-f]{2}){4,}'
)

# Base64-encoded strings enclosed in double quotes.
# Earlier, more permissive version:
#re_base64_string = re.compile(r'"(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?"')
# Stricter version taken from balbuzard, producing fewer false positives:
re_base64_string = re.compile(
    r'"(?:[A-Za-z0-9+/]{4}){1,}'
    r'(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)?"'
)
# Common strings (all lowercase) that match the base64 regex without being
# base64-encoded data:
BASE64_WHITELIST = set(
    'thisdocument thisworkbook test temp http open exit'.split()
)

# Strings encoded with a specific Dridex algorithm
# (see https://github.com/JamesHabben/MalwareStuff)
re_dridex_string = re.compile(
    r'"[0-9A-Za-z]{20,}"'
)
# Matches any letter that is not a hex digit, to check that a candidate is
# not just a hex string:
re_nothex_check = re.compile(
    r'[G-Zg-z]'
)
  348 +
  349 +#--- FUNCTIONS ----------------------------------------------------------------
  350 +
def copytoken_help(decompressed_current, decompressed_chunk_start):
    """
    compute bit masks to decode a CopyToken according to MS-OVBA 2.4.1.3.19.1 CopyToken Help

    decompressed_current: number of decompressed bytes so far, i.e. len(decompressed_container)
    decompressed_chunk_start: offset of the current chunk in the decompressed container
    return length_mask, offset_mask, bit_count, maximum_length
    raise ValueError if no byte of the current chunk has been decompressed yet
    (a CopyToken then has nothing to refer to)
    """
    difference = decompressed_current - decompressed_chunk_start
    if difference < 1:
        # math.log() would raise an unhelpful "math domain error" here
        raise ValueError('invalid CopyToken: no decompressed data in the current chunk to copy from')
    # number of bits used for the offset part of the token, at least 4:
    bit_count = max(int(math.ceil(math.log(difference, 2))), 4)
    length_mask = 0xFFFF >> bit_count
    # NOTE: offset_mask is a negative Python integer; ANDing it with a 16-bit
    # token still keeps exactly the upper bit_count bits.
    offset_mask = ~length_mask
    maximum_length = length_mask + 3
    return length_mask, offset_mask, bit_count, maximum_length


def decompress_stream(compressed_container):
    """
    Decompress a stream according to MS-OVBA section 2.4.1

    compressed_container: string (bytes) compressed according to the MS-OVBA 2.4.1.3.6 Compression algorithm
    return the decompressed container as a string (bytes)
    raise ValueError if the container is empty, if the signature byte or a chunk
    signature is invalid, if a chunk size is inconsistent with its flag, or if a
    chunk header or CopyToken is truncated or refers to data before the start of
    the decompressed output
    """
    # State variables, see MS-OVBA 2.4.1.2:
    # - compressed_current: location of the next byte to read in the compressed container
    # - compressed_chunk_start: location of the first byte of the current compressed chunk
    # - compressed_end: location of the byte after the last byte of the current compressed chunk
    # - len(decompressed): location of the next byte to write in the decompressed buffer
    # - decompressed_chunk_start: location of the first byte of the current decompressed chunk

    # bytearray: indexing yields integers whatever the input string type, and
    # appending one byte does not copy the whole buffer as str += would.
    compressed = bytearray(compressed_container)
    compressed_length = len(compressed)
    if compressed_length == 0:
        raise ValueError('empty compressed container')
    decompressed = bytearray()  # result

    sig_byte = compressed[0]
    if sig_byte != 0x01:
        raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))
    compressed_current = 1

    #NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that
    # CompressedRecordEnd = len(compressed_container)
    while compressed_current < compressed_length:
        # 2.4.1.1.5
        compressed_chunk_start = compressed_current
        if compressed_chunk_start + 2 > compressed_length:
            raise ValueError('Truncated CompressedChunkHeader in VBA compressed stream')
        # chunk header = first 16 bits, little endian
        compressed_chunk_header = (compressed[compressed_chunk_start]
                                   | (compressed[compressed_chunk_start + 1] << 8))
        # chunk size = 12 first bits of header + 3
        chunk_size = (compressed_chunk_header & 0x0FFF) + 3
        # chunk signature = 3 next bits - should always be 0b011
        chunk_signature = (compressed_chunk_header >> 12) & 0x07
        if chunk_signature != 0b011:
            raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream')
        # chunk flag = next bit - 1 == compressed, 0 == uncompressed
        chunk_flag = (compressed_chunk_header >> 15) & 0x01
        logging.debug("chunk size = {0}, compressed flag = {1}".format(chunk_size, chunk_flag))

        #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096)
        # The minimum size is 3 bytes
        # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value
        # in chunk header before adding 3).
        # Also the first test can never fail since a 12 bits value cannot be larger than 4095;
        # it is kept to mirror the specification.
        if chunk_flag == 1 and chunk_size > 4098:
            raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1')
        if chunk_flag == 0 and chunk_size != 4098:
            raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0')

        # check if chunk_size goes beyond the compressed data, instead of silently cutting it:
        #TODO: raise an exception?
        if compressed_chunk_start + chunk_size > compressed_length:
            logging.warning('Chunk size is larger than remaining compressed data')
        compressed_end = min(compressed_length, compressed_chunk_start + chunk_size)
        # read after chunk header:
        compressed_current = compressed_chunk_start + 2

        if chunk_flag == 0:
            # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
            # uncompressed chunk: read the next 4096 bytes as-is
            # (fewer if the container is truncated, see the warning above)
            decompressed += compressed[compressed_current:compressed_current + 4096]
            compressed_current += 4096
            continue

        # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
        decompressed_chunk_start = len(decompressed)
        while compressed_current < compressed_end:
            # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence
            # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
            # copy tokens (reference to a previous literal token)
            flag_byte = compressed[compressed_current]
            compressed_current += 1
            for bit_index in range(8):
                if compressed_current >= compressed_end:
                    break
                # MS-OVBA 2.4.1.3.5 Decompressing a Token
                # MS-OVBA 2.4.1.3.17 Extract FlagBit
                flag_bit = (flag_byte >> bit_index) & 1
                if flag_bit == 0:  # LiteralToken
                    # copy one byte directly to output
                    decompressed.append(compressed[compressed_current])
                    compressed_current += 1
                    continue

                # CopyToken - MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
                if compressed_current + 2 > compressed_end:
                    # the 16-bit token must not run past the end of its chunk
                    raise ValueError('Truncated CopyToken in VBA compressed stream')
                copy_token = (compressed[compressed_current]
                              | (compressed[compressed_current + 1] << 8))
                length_mask, offset_mask, bit_count, maximum_length = copytoken_help(
                    len(decompressed), decompressed_chunk_start)
                length = (copy_token & length_mask) + 3
                offset = ((copy_token & offset_mask) >> (16 - bit_count)) + 1
                copy_source = len(decompressed) - offset
                if copy_source < 0:
                    # a negative index would silently wrap around to the end of the buffer
                    raise ValueError('Invalid CopyToken offset in VBA compressed stream')
                # byte by byte on purpose: source and destination may overlap,
                # in which case freshly written bytes are copied again
                for index in range(copy_source, copy_source + length):
                    decompressed.append(decompressed[index])
                compressed_current += 2
    return bytes(decompressed)
  484 +
  485 +
  486 +def _extract_vba (ole, vba_root, project_path, dir_path):
  487 + """
  488 + Extract VBA macros from an OleFileIO object.
  489 + Internal function, do not call directly.
  490 +
  491 + vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream
  492 + vba_project: path to the PROJECT stream
  493 + This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream
  494 + """
  495 + # Open the PROJECT stream:
  496 + project = ole.openstream(project_path)
  497 +
  498 + # sample content of the PROJECT stream:
  499 +
  500 + ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}"
  501 + ## Document=ThisDocument/&H00000000
  502 + ## Module=NewMacros
  503 + ## Name="Project"
  504 + ## HelpContextID="0"
  505 + ## VersionCompatible32="393222000"
  506 + ## CMG="F1F301E705E705E705E705"
  507 + ## DPB="8F8D7FE3831F2020202020"
  508 + ## GC="2D2FDD81E51EE61EE6E1"
  509 + ##
  510 + ## [Host Extender Info]
  511 + ## &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000
  512 + ## &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000
  513 + ##
  514 + ## [Workspace]
  515 + ## ThisDocument=22, 29, 339, 477, Z
  516 + ## NewMacros=-4, 42, 832, 510, C
  517 +
  518 + code_modules = {}
  519 +
  520 + for line in project:
  521 + line = line.strip()
  522 + if '=' in line:
  523 + # split line at the 1st equal sign:
  524 + name, value = line.split('=', 1)
  525 + # looking for code modules
  526 + # add the code module as a key in the dictionary
  527 + # the value will be the extension needed later
  528 + # The value is converted to lowercase, to allow case-insensitive matching (issue #3)
  529 + value = value.lower()
  530 + if name == 'Document':
  531 + # split value at the 1st slash, keep 1st part:
  532 + value = value.split('/', 1)[0]
  533 + code_modules[value] = CLASS_EXTENSION
  534 + elif name == 'Module':
  535 + code_modules[value] = MODULE_EXTENSION
  536 + elif name == 'Class':
  537 + code_modules[value] = CLASS_EXTENSION
  538 + elif name == 'BaseClass':
  539 + code_modules[value] = FORM_EXTENSION
  540 +
  541 + # read data from dir stream (compressed)
  542 + dir_compressed = ole.openstream(dir_path).read()
  543 +
  544 + def check_value(name, expected, value):
  545 + if expected != value:
  546 + logging.error("invalid value for {0} expected {1:04X} got {2:04X}".format(name, expected, value))
  547 +
  548 + dir_stream = cStringIO.StringIO(decompress_stream(dir_compressed))
  549 +
  550 + # PROJECTSYSKIND Record
  551 + PROJECTSYSKIND_Id = struct.unpack("<H", dir_stream.read(2))[0]
  552 + check_value('PROJECTSYSKIND_Id', 0x0001, PROJECTSYSKIND_Id)
  553 + PROJECTSYSKIND_Size = struct.unpack("<L", dir_stream.read(4))[0]
  554 + check_value('PROJECTSYSKIND_Size', 0x0004, PROJECTSYSKIND_Size)
  555 + PROJECTSYSKIND_SysKind = struct.unpack("<L", dir_stream.read(4))[0]
  556 + if PROJECTSYSKIND_SysKind == 0x00:
  557 + logging.debug("16-bit Windows")
  558 + elif PROJECTSYSKIND_SysKind == 0x01:
  559 + logging.debug("32-bit Windows")
  560 + elif PROJECTSYSKIND_SysKind == 0x02:
  561 + logging.debug("Macintosh")
  562 + elif PROJECTSYSKIND_SysKind == 0x03:
  563 + logging.debug("64-bit Windows")
  564 + else:
  565 + logging.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(PROJECTSYSKIND_SysKind))
  566 +
  567 + # PROJECTLCID Record
  568 + PROJECTLCID_Id = struct.unpack("<H", dir_stream.read(2))[0]
  569 + check_value('PROJECTLCID_Id', 0x0002, PROJECTLCID_Id)
  570 + PROJECTLCID_Size = struct.unpack("<L", dir_stream.read(4))[0]
  571 + check_value('PROJECTLCID_Size', 0x0004, PROJECTLCID_Size)
  572 + PROJECTLCID_Lcid = struct.unpack("<L", dir_stream.read(4))[0]
  573 + check_value('PROJECTLCID_Lcid', 0x409, PROJECTLCID_Lcid)
  574 +
  575 + # PROJECTLCIDINVOKE Record
  576 + PROJECTLCIDINVOKE_Id = struct.unpack("<H", dir_stream.read(2))[0]
  577 + check_value('PROJECTLCIDINVOKE_Id', 0x0014, PROJECTLCIDINVOKE_Id)
  578 + PROJECTLCIDINVOKE_Size = struct.unpack("<L", dir_stream.read(4))[0]
  579 + check_value('PROJECTLCIDINVOKE_Size', 0x0004, PROJECTLCIDINVOKE_Size)
  580 + PROJECTLCIDINVOKE_LcidInvoke = struct.unpack("<L", dir_stream.read(4))[0]
  581 + check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, PROJECTLCIDINVOKE_LcidInvoke)
  582 +
  583 + # PROJECTCODEPAGE Record
  584 + PROJECTCODEPAGE_Id = struct.unpack("<H", dir_stream.read(2))[0]
  585 + check_value('PROJECTCODEPAGE_Id', 0x0003, PROJECTCODEPAGE_Id)
  586 + PROJECTCODEPAGE_Size = struct.unpack("<L", dir_stream.read(4))[0]
  587 + check_value('PROJECTCODEPAGE_Size', 0x0002, PROJECTCODEPAGE_Size)
  588 + PROJECTCODEPAGE_CodePage = struct.unpack("<H", dir_stream.read(2))[0]
  589 +
  590 + # PROJECTNAME Record
  591 + PROJECTNAME_Id = struct.unpack("<H", dir_stream.read(2))[0]
  592 + check_value('PROJECTNAME_Id', 0x0004, PROJECTNAME_Id)
  593 + PROJECTNAME_SizeOfProjectName = struct.unpack("<L", dir_stream.read(4))[0]
  594 + if PROJECTNAME_SizeOfProjectName < 1 or PROJECTNAME_SizeOfProjectName > 128:
  595 + logging.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(PROJECTNAME_SizeOfProjectName))
  596 + PROJECTNAME_ProjectName = dir_stream.read(PROJECTNAME_SizeOfProjectName)
  597 +
  598 + # PROJECTDOCSTRING Record
  599 + PROJECTDOCSTRING_Id = struct.unpack("<H", dir_stream.read(2))[0]
  600 + check_value('PROJECTDOCSTRING_Id', 0x0005, PROJECTDOCSTRING_Id)
  601 + PROJECTDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0]
  602 + if PROJECTNAME_SizeOfProjectName > 2000:
  603 + logging.error("PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(PROJECTDOCSTRING_SizeOfDocString))
  604 + PROJECTDOCSTRING_DocString = dir_stream.read(PROJECTDOCSTRING_SizeOfDocString)
  605 + PROJECTDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  606 + check_value('PROJECTDOCSTRING_Reserved', 0x0040, PROJECTDOCSTRING_Reserved)
  607 + PROJECTDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  608 + if PROJECTDOCSTRING_SizeOfDocStringUnicode % 2 != 0:
  609 + logging.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even")
  610 + PROJECTDOCSTRING_DocStringUnicode = dir_stream.read(PROJECTDOCSTRING_SizeOfDocStringUnicode)
  611 +
  612 + # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7
  613 + PROJECTHELPFILEPATH_Id = struct.unpack("<H", dir_stream.read(2))[0]
  614 + check_value('PROJECTHELPFILEPATH_Id', 0x0006, PROJECTHELPFILEPATH_Id)
  615 + PROJECTHELPFILEPATH_SizeOfHelpFile1 = struct.unpack("<L", dir_stream.read(4))[0]
  616 + if PROJECTHELPFILEPATH_SizeOfHelpFile1 > 260:
  617 + logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(PROJECTHELPFILEPATH_SizeOfHelpFile1))
  618 + PROJECTHELPFILEPATH_HelpFile1 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile1)
  619 + PROJECTHELPFILEPATH_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  620 + check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, PROJECTHELPFILEPATH_Reserved)
  621 + PROJECTHELPFILEPATH_SizeOfHelpFile2 = struct.unpack("<L", dir_stream.read(4))[0]
  622 + if PROJECTHELPFILEPATH_SizeOfHelpFile2 != PROJECTHELPFILEPATH_SizeOfHelpFile1:
  623 + logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2")
  624 + PROJECTHELPFILEPATH_HelpFile2 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile2)
  625 + if PROJECTHELPFILEPATH_HelpFile2 != PROJECTHELPFILEPATH_HelpFile1:
  626 + logging.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2")
  627 +
  628 + # PROJECTHELPCONTEXT Record
  629 + PROJECTHELPCONTEXT_Id = struct.unpack("<H", dir_stream.read(2))[0]
  630 + check_value('PROJECTHELPCONTEXT_Id', 0x0007, PROJECTHELPCONTEXT_Id)
  631 + PROJECTHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0]
  632 + check_value('PROJECTHELPCONTEXT_Size', 0x0004, PROJECTHELPCONTEXT_Size)
  633 + PROJECTHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0]
  634 +
  635 + # PROJECTLIBFLAGS Record
  636 + PROJECTLIBFLAGS_Id = struct.unpack("<H", dir_stream.read(2))[0]
  637 + check_value('PROJECTLIBFLAGS_Id', 0x0008, PROJECTLIBFLAGS_Id)
  638 + PROJECTLIBFLAGS_Size = struct.unpack("<L", dir_stream.read(4))[0]
  639 + check_value('PROJECTLIBFLAGS_Size', 0x0004, PROJECTLIBFLAGS_Size)
  640 + PROJECTLIBFLAGS_ProjectLibFlags = struct.unpack("<L", dir_stream.read(4))[0]
  641 + check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, PROJECTLIBFLAGS_ProjectLibFlags)
  642 +
  643 + # PROJECTVERSION Record
  644 + PROJECTVERSION_Id = struct.unpack("<H", dir_stream.read(2))[0]
  645 + check_value('PROJECTVERSION_Id', 0x0009, PROJECTVERSION_Id)
  646 + PROJECTVERSION_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
  647 + check_value('PROJECTVERSION_Reserved', 0x0004, PROJECTVERSION_Reserved)
  648 + PROJECTVERSION_VersionMajor = struct.unpack("<L", dir_stream.read(4))[0]
  649 + PROJECTVERSION_VersionMinor = struct.unpack("<H", dir_stream.read(2))[0]
  650 +
  651 + # PROJECTCONSTANTS Record
  652 + PROJECTCONSTANTS_Id = struct.unpack("<H", dir_stream.read(2))[0]
  653 + check_value('PROJECTCONSTANTS_Id', 0x000C, PROJECTCONSTANTS_Id)
  654 + PROJECTCONSTANTS_SizeOfConstants = struct.unpack("<L", dir_stream.read(4))[0]
  655 + if PROJECTCONSTANTS_SizeOfConstants > 1015:
  656 + logging.error("PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(PROJECTCONSTANTS_SizeOfConstants))
  657 + PROJECTCONSTANTS_Constants = dir_stream.read(PROJECTCONSTANTS_SizeOfConstants)
  658 + PROJECTCONSTANTS_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  659 + check_value('PROJECTCONSTANTS_Reserved', 0x003C, PROJECTCONSTANTS_Reserved)
  660 + PROJECTCONSTANTS_SizeOfConstantsUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  661 + if PROJECTCONSTANTS_SizeOfConstantsUnicode % 2 != 0:
  662 + logging.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even")
  663 + PROJECTCONSTANTS_ConstantsUnicode = dir_stream.read(PROJECTCONSTANTS_SizeOfConstantsUnicode)
  664 +
  665 + # array of REFERENCE records
  666 + check = None
  667 + while True:
  668 + check = struct.unpack("<H", dir_stream.read(2))[0]
  669 + logging.debug("reference type = {0:04X}".format(check))
  670 + if check == 0x000F:
  671 + break
  672 +
  673 + if check == 0x0016:
  674 + # REFERENCENAME
  675 + REFERENCE_Id = check
  676 + REFERENCE_SizeOfName = struct.unpack("<L", dir_stream.read(4))[0]
  677 + REFERENCE_Name = dir_stream.read(REFERENCE_SizeOfName)
  678 + REFERENCE_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  679 + check_value('REFERENCE_Reserved', 0x003E, REFERENCE_Reserved)
  680 + REFERENCE_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  681 + REFERENCE_NameUnicode = dir_stream.read(REFERENCE_SizeOfNameUnicode)
  682 + continue
  683 +
  684 + if check == 0x0033:
  685 + # REFERENCEORIGINAL (followed by REFERENCECONTROL)
  686 + REFERENCEORIGINAL_Id = check
  687 + REFERENCEORIGINAL_SizeOfLibidOriginal = struct.unpack("<L", dir_stream.read(4))[0]
  688 + REFERENCEORIGINAL_LibidOriginal = dir_stream.read(REFERENCEORIGINAL_SizeOfLibidOriginal)
  689 + continue
  690 +
  691 + if check == 0x002F:
  692 + # REFERENCECONTROL
  693 + REFERENCECONTROL_Id = check
  694 + REFERENCECONTROL_SizeTwiddled = struct.unpack("<L", dir_stream.read(4))[0] # ignore
  695 + REFERENCECONTROL_SizeOfLibidTwiddled = struct.unpack("<L", dir_stream.read(4))[0]
  696 + REFERENCECONTROL_LibidTwiddled = dir_stream.read(REFERENCECONTROL_SizeOfLibidTwiddled)
  697 + REFERENCECONTROL_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0] # ignore
  698 + check_value('REFERENCECONTROL_Reserved1', 0x0000, REFERENCECONTROL_Reserved1)
  699 + REFERENCECONTROL_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0] # ignore
  700 + check_value('REFERENCECONTROL_Reserved2', 0x0000, REFERENCECONTROL_Reserved2)
  701 + # optional field
  702 + check2 = struct.unpack("<H", dir_stream.read(2))[0]
  703 + if check2 == 0x0016:
  704 + REFERENCECONTROL_NameRecordExtended_Id = check
  705 + REFERENCECONTROL_NameRecordExtended_SizeofName = struct.unpack("<L", dir_stream.read(4))[0]
  706 + REFERENCECONTROL_NameRecordExtended_Name = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeofName)
  707 + REFERENCECONTROL_NameRecordExtended_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  708 + check_value('REFERENCECONTROL_NameRecordExtended_Reserved', 0x003E, REFERENCECONTROL_NameRecordExtended_Reserved)
  709 + REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  710 + REFERENCECONTROL_NameRecordExtended_NameUnicode = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode)
  711 + REFERENCECONTROL_Reserved3 = struct.unpack("<H", dir_stream.read(2))[0]
  712 + else:
  713 + REFERENCECONTROL_Reserved3 = check2
  714 +
  715 + check_value('REFERENCECONTROL_Reserved3', 0x0030, REFERENCECONTROL_Reserved3)
  716 + REFERENCECONTROL_SizeExtended = struct.unpack("<L", dir_stream.read(4))[0]
  717 + REFERENCECONTROL_SizeOfLibidExtended = struct.unpack("<L", dir_stream.read(4))[0]
  718 + REFERENCECONTROL_LibidExtended = dir_stream.read(REFERENCECONTROL_SizeOfLibidExtended)
  719 + REFERENCECONTROL_Reserved4 = struct.unpack("<L", dir_stream.read(4))[0]
  720 + REFERENCECONTROL_Reserved5 = struct.unpack("<H", dir_stream.read(2))[0]
  721 + REFERENCECONTROL_OriginalTypeLib = dir_stream.read(16)
  722 + REFERENCECONTROL_Cookie = struct.unpack("<L", dir_stream.read(4))[0]
  723 + continue
  724 +
  725 + if check == 0x000D:
  726 + # REFERENCEREGISTERED
  727 + REFERENCEREGISTERED_Id = check
  728 + REFERENCEREGISTERED_Size = struct.unpack("<L", dir_stream.read(4))[0]
  729 + REFERENCEREGISTERED_SizeOfLibid = struct.unpack("<L", dir_stream.read(4))[0]
  730 + REFERENCEREGISTERED_Libid = dir_stream.read(REFERENCEREGISTERED_SizeOfLibid)
  731 + REFERENCEREGISTERED_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0]
  732 + check_value('REFERENCEREGISTERED_Reserved1', 0x0000, REFERENCEREGISTERED_Reserved1)
  733 + REFERENCEREGISTERED_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0]
  734 + check_value('REFERENCEREGISTERED_Reserved2', 0x0000, REFERENCEREGISTERED_Reserved2)
  735 + continue
  736 +
  737 + if check == 0x000E:
  738 + # REFERENCEPROJECT
  739 + REFERENCEPROJECT_Id = check
  740 + REFERENCEPROJECT_Size = struct.unpack("<L", dir_stream.read(4))[0]
  741 + REFERENCEPROJECT_SizeOfLibidAbsolute = struct.unpack("<L", dir_stream.read(4))[0]
  742 + REFERENCEPROJECT_LibidAbsolute = dir_stream.read(REFERENCEPROJECT_SizeOfLibidAbsolute)
  743 + REFERENCEPROJECT_SizeOfLibidRelative = struct.unpack("<L", dir_stream.read(4))[0]
  744 + REFERENCEPROJECT_LibidRelative = dir_stream.read(REFERENCEPROJECT_SizeOfLibidRelative)
  745 + REFERENCEPROJECT_MajorVersion = struct.unpack("<L", dir_stream.read(4))[0]
  746 + REFERENCEPROJECT_MinorVersion = struct.unpack("<H", dir_stream.read(2))[0]
  747 + continue
  748 +
  749 + logging.error('invalid or unknown check Id {0:04X}'.format(check))
  750 + sys.exit(0)
  751 +
  752 + PROJECTMODULES_Id = check #struct.unpack("<H", dir_stream.read(2))[0]
  753 + check_value('PROJECTMODULES_Id', 0x000F, PROJECTMODULES_Id)
  754 + PROJECTMODULES_Size = struct.unpack("<L", dir_stream.read(4))[0]
  755 + check_value('PROJECTMODULES_Size', 0x0002, PROJECTMODULES_Size)
  756 + PROJECTMODULES_Count = struct.unpack("<H", dir_stream.read(2))[0]
  757 + PROJECTMODULES_ProjectCookieRecord_Id = struct.unpack("<H", dir_stream.read(2))[0]
  758 + check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, PROJECTMODULES_ProjectCookieRecord_Id)
  759 + PROJECTMODULES_ProjectCookieRecord_Size = struct.unpack("<L", dir_stream.read(4))[0]
  760 + check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, PROJECTMODULES_ProjectCookieRecord_Size)
  761 + PROJECTMODULES_ProjectCookieRecord_Cookie = struct.unpack("<H", dir_stream.read(2))[0]
  762 +
  763 + logging.debug("parsing {0} modules".format(PROJECTMODULES_Count))
  764 + for x in xrange(0, PROJECTMODULES_Count):
  765 + MODULENAME_Id = struct.unpack("<H", dir_stream.read(2))[0]
  766 + check_value('MODULENAME_Id', 0x0019, MODULENAME_Id)
  767 + MODULENAME_SizeOfModuleName = struct.unpack("<L", dir_stream.read(4))[0]
  768 + MODULENAME_ModuleName = dir_stream.read(MODULENAME_SizeOfModuleName)
  769 + # account for optional sections
  770 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  771 + if section_id == 0x0047:
  772 + MODULENAMEUNICODE_Id = section_id
  773 + MODULENAMEUNICODE_SizeOfModuleNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  774 + MODULENAMEUNICODE_ModuleNameUnicode = dir_stream.read(MODULENAMEUNICODE_SizeOfModuleNameUnicode)
  775 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  776 + if section_id == 0x001A:
  777 + MODULESTREAMNAME_id = section_id
  778 + MODULESTREAMNAME_SizeOfStreamName = struct.unpack("<L", dir_stream.read(4))[0]
  779 + MODULESTREAMNAME_StreamName = dir_stream.read(MODULESTREAMNAME_SizeOfStreamName)
  780 + MODULESTREAMNAME_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  781 + check_value('MODULESTREAMNAME_Reserved', 0x0032, MODULESTREAMNAME_Reserved)
  782 + MODULESTREAMNAME_SizeOfStreamNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  783 + MODULESTREAMNAME_StreamNameUnicode = dir_stream.read(MODULESTREAMNAME_SizeOfStreamNameUnicode)
  784 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  785 + if section_id == 0x001C:
  786 + MODULEDOCSTRING_Id = section_id
  787 + check_value('MODULEDOCSTRING_Id', 0x001C, MODULEDOCSTRING_Id)
  788 + MODULEDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0]
  789 + MODULEDOCSTRING_DocString = dir_stream.read(MODULEDOCSTRING_SizeOfDocString)
  790 + MODULEDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  791 + check_value('MODULEDOCSTRING_Reserved', 0x0048, MODULEDOCSTRING_Reserved)
  792 + MODULEDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  793 + MODULEDOCSTRING_DocStringUnicode = dir_stream.read(MODULEDOCSTRING_SizeOfDocStringUnicode)
  794 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  795 + if section_id == 0x0031:
  796 + MODULEOFFSET_Id = section_id
  797 + check_value('MODULEOFFSET_Id', 0x0031, MODULEOFFSET_Id)
  798 + MODULEOFFSET_Size = struct.unpack("<L", dir_stream.read(4))[0]
  799 + check_value('MODULEOFFSET_Size', 0x0004, MODULEOFFSET_Size)
  800 + MODULEOFFSET_TextOffset = struct.unpack("<L", dir_stream.read(4))[0]
  801 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  802 + if section_id == 0x001E:
  803 + MODULEHELPCONTEXT_Id = section_id
  804 + check_value('MODULEHELPCONTEXT_Id', 0x001E, MODULEHELPCONTEXT_Id)
  805 + MODULEHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0]
  806 + check_value('MODULEHELPCONTEXT_Size', 0x0004, MODULEHELPCONTEXT_Size)
  807 + MODULEHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0]
  808 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  809 + if section_id == 0x002C:
  810 + MODULECOOKIE_Id = section_id
  811 + check_value('MODULECOOKIE_Id', 0x002C, MODULECOOKIE_Id)
  812 + MODULECOOKIE_Size = struct.unpack("<L", dir_stream.read(4))[0]
  813 + check_value('MODULECOOKIE_Size', 0x0002, MODULECOOKIE_Size)
  814 + MODULECOOKIE_Cookie = struct.unpack("<H", dir_stream.read(2))[0]
  815 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  816 + if section_id == 0x0021 or section_id == 0x0022:
  817 + MODULETYPE_Id = section_id
  818 + MODULETYPE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
  819 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  820 + if section_id == 0x0025:
  821 + MODULEREADONLY_Id = section_id
  822 + check_value('MODULEREADONLY_Id', 0x0025, MODULEREADONLY_Id)
  823 + MODULEREADONLY_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
  824 + check_value('MODULEREADONLY_Reserved', 0x0000, MODULEREADONLY_Reserved)
  825 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  826 + if section_id == 0x0028:
  827 + MODULEPRIVATE_Id = section_id
  828 + check_value('MODULEPRIVATE_Id', 0x0028, MODULEPRIVATE_Id)
  829 + MODULEPRIVATE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
  830 + check_value('MODULEPRIVATE_Reserved', 0x0000, MODULEPRIVATE_Reserved)
  831 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  832 + if section_id == 0x002B: # TERMINATOR
  833 + MODULE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
  834 + check_value('MODULE_Reserved', 0x0000, MODULE_Reserved)
  835 + section_id = None
  836 + if section_id != None:
  837 + logging.warning('unknown or invalid module section id {0:04X}'.format(section_id))
  838 +
  839 + logging.debug('Project CodePage = %d' % PROJECTCODEPAGE_CodePage)
  840 + vba_codec = 'cp%d' % PROJECTCODEPAGE_CodePage
  841 + logging.debug("ModuleName = {0}".format(MODULENAME_ModuleName))
  842 + logging.debug("StreamName = {0}".format(repr(MODULESTREAMNAME_StreamName)))
  843 + streamname_unicode = MODULESTREAMNAME_StreamName.decode(vba_codec)
  844 + logging.debug("StreamName.decode('%s') = %s" % (vba_codec, repr(streamname_unicode)))
  845 + logging.debug("StreamNameUnicode = {0}".format(repr(MODULESTREAMNAME_StreamNameUnicode)))
  846 + logging.debug("TextOffset = {0}".format(MODULEOFFSET_TextOffset))
  847 +
  848 + code_path = vba_root + u'VBA/' + streamname_unicode
  849 + #TODO: test if stream exists
  850 + logging.debug('opening VBA code stream %s' % repr(code_path))
  851 + code_data = ole.openstream(code_path).read()
  852 + logging.debug("length of code_data = {0}".format(len(code_data)))
  853 + logging.debug("offset of code_data = {0}".format(MODULEOFFSET_TextOffset))
  854 + code_data = code_data[MODULEOFFSET_TextOffset:]
  855 + if len(code_data) > 0:
  856 + code_data = decompress_stream(code_data)
  857 + # case-insensitive search in the code_modules dict to find the file extension:
  858 + filext = code_modules.get(MODULENAME_ModuleName.lower(), 'bin')
  859 + filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext)
  860 + #TODO: also yield the codepage so that callers can decode it properly
  861 + yield (code_path, filename, code_data)
  862 + # print '-'*79
  863 + # print filename
  864 + # print ''
  865 + # print code_data
  866 + # print ''
  867 + logging.debug('extracted file {0}'.format(filename))
  868 + else:
  869 + logging.warning("module stream {0} has code data length 0".format(MODULESTREAMNAME_StreamName))
  870 + return
  871 +
  872 +
def filter_vba(vba_code):
    """
    Strip the leading "Attribute VB_" lines from VBA source code.

    Those lines are added automatically by MS Office at the top of a module
    and are not displayed in the VBA Editor. This helper is only meant for
    preparing source code shown to a human analyst.

    Note: an attribute line containing a colon is never removed (and ends the
    filtering), because a colon could be used to hide malicious instructions
    on that line.

    :param vba_code: str, VBA source code
    :return: str, filtered VBA source code (remaining lines joined with LF)
    """
    lines = vba_code.splitlines()
    # Index of the first line to keep. If every line is a plain attribute
    # line, nothing is kept.
    first_kept = len(lines)
    for index, text in enumerate(lines):
        if ':' in text or not text.startswith("Attribute VB_"):
            first_kept = index
            break
    #TODO: also remove empty lines?
    return '\n'.join(lines[first_kept:])
  895 +
  896 +
def detect_autoexec(vba_code, obfuscation=None):
    """
    Look for keywords of macros that run automatically when triggered by
    specific actions (e.g. when a document is opened or closed).

    Every keyword listed in AUTOEXEC_KEYWORDS is searched case-insensitively
    as a whole word (regex word boundaries on both sides).

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword, description)
    """
    #TODO: merge code with detect_suspicious
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    matches = []
    for description, keywords in AUTOEXEC_KEYWORDS.items():
        #TODO: if keyword is already a compiled regex, use it as-is
        # the keyword is inserted as-is into the regex, between word boundaries:
        matches.extend(
            (keyword, description + suffix)
            for keyword in keywords
            if re.search(r'(?i)\b'+keyword+r'\b', vba_code)
        )
    return matches
  922 +
  923 +
def detect_suspicious(vba_code, obfuscation=None):
    """
    Look for suspicious keywords corresponding to potential malware
    behaviour in the VBA code.

    Every keyword listed in SUSPICIOUS_KEYWORDS is searched case-insensitively
    as a whole word (regex word boundaries on both sides).

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to description
    :return: list of str tuples (keyword, description)
    """
    if obfuscation:
        suffix = ' (obfuscation: %s)' % obfuscation
    else:
        suffix = ''
    hits = []
    for description, keywords in SUSPICIOUS_KEYWORDS.items():
        for keyword in keywords:
            # the keyword is inserted as-is into the regex, between word boundaries:
            pattern = r'(?i)\b'+keyword+r'\b'
            if not re.search(pattern, vba_code):
                continue
            hits.append((keyword, description + suffix))
    return hits
  946 +
  947 +
def detect_patterns(vba_code, obfuscation=None):
    """
    Look for specific patterns in the VBA code such as IP addresses, URLs,
    e-mail addresses, executable file names, etc.

    Each (pattern type, compiled regex) pair of RE_PATTERNS is applied to the
    code. A given matched value is reported only once, for the first pattern
    that matched it.

    :param vba_code: str, VBA source code
    :param obfuscation: None or str, name of obfuscation to be added to the pattern type
    :return: list of str tuples (pattern type, value)
    """
    suffix = ' (obfuscation: %s)' % obfuscation if obfuscation else ''
    seen = set()
    hits = []
    for pattern_type, pattern_re in RE_PATTERNS:
        for match in pattern_re.finditer(vba_code):
            value = match.group()
            if value in seen:
                # already reported, possibly by an earlier pattern type
                continue
            hits.append((pattern_type + suffix, value))
            seen.add(value)
    return hits
  968 +
  969 +
def detect_hex_strings(vba_code):
    """
    Look for strings encoded in hexadecimal in the VBA code.

    Candidates are the matches of the module-level regex re_hex_string; each
    distinct match is decoded once with binascii.unhexlify.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    seen = set()
    pairs = []
    for match in re_hex_string.finditer(vba_code):
        hex_text = match.group()
        if hex_text in seen:
            continue
        pairs.append((hex_text, binascii.unhexlify(hex_text)))
        seen.add(hex_text)
    return pairs
  986 +
  987 +
def detect_base64_strings(vba_code):
    """
    Detect if the VBA code contains strings encoded in base64.

    Candidates are the matches of the module-level regex re_base64_string,
    stripped of their surrounding double quotes. A candidate is ignored when
    re_nothex_check finds nothing in it (i.e. it is just a hex string), when
    it was already reported, when its lowercase form is in BASE64_WHITELIST,
    or when base64 decoding fails.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    #TODO: avoid matching simple hex strings as base64?
    results = []
    found = set()
    for match in re_base64_string.finditer(vba_code):
        # extract the base64 string without quotes:
        value = match.group().strip('"')
        # check it is not just a hex string:
        if not re_nothex_check.search(value):
            continue
        # only keep new values and not in the whitelist:
        if value in found or value.lower() in BASE64_WHITELIST:
            continue
        try:
            decoded = base64.b64decode(value)
        except Exception:
            # A decoding error means it is likely not a base64-encoded string.
            # Catch Exception rather than everything, so that
            # KeyboardInterrupt and SystemExit are not swallowed.
            continue
        results.append((value, decoded))
        found.add(value)
    return results
  1014 +
  1015 +
def detect_dridex_strings(vba_code):
    """
    Detect if the VBA code contains strings obfuscated with a specific algorithm found in Dridex samples.

    Candidates are the matches of the module-level regex re_dridex_string,
    with their first and last characters removed. A candidate is ignored when
    re_nothex_check finds nothing in it (i.e. it is just a hex string), when
    it was already reported, or when the Dridex decoder fails on it.

    :param vba_code: str, VBA source code
    :return: list of str tuples (encoded string, decoded string)
    """
    # imported here so that the decoder is only loaded when this scan runs
    from thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode
    results = []
    found = set()
    for match in re_dridex_string.finditer(vba_code):
        # drop the first and last characters of the match:
        value = match.group()[1:-1]
        # check it is not just a hex string:
        if not re_nothex_check.search(value):
            continue
        if value in found:
            continue
        try:
            decoded = DridexUrlDecode(value)
        except Exception:
            # A decoding error means it is likely not a dridex-encoded string.
            # Catch Exception rather than everything, so that
            # KeyboardInterrupt and SystemExit are not swallowed.
            continue
        results.append((value, decoded))
        found.add(value)
    return results
  1040 +
  1041 +
class VBA_Scanner (object):
    """
    Class to scan the source code of a VBA module to find obfuscated strings,
    suspicious keywords, IOCs, auto-executable macros, etc.
    """

    def __init__(self, vba_code):
        """
        VBA_Scanner constructor

        :param vba_code: str, VBA source code to be analyzed
        """
        self.code = vba_code
        # Decoded strings, each one prefixed with a newline. They are
        # (re)built by scan() and scanned like additional source code:
        self.code_hex = ''
        self.code_hex_rev = ''
        self.code_rev_hex = ''
        self.code_base64 = ''
        self.code_dridex = ''


    def scan(self, include_decoded_strings=False):
        """
        Analyze the provided VBA code to detect suspicious keywords,
        auto-executable macros, IOC patterns, obfuscation patterns
        such as hex-encoded strings.

        The decoded-string buffers (code_hex, code_hex_rev, code_rev_hex,
        code_base64, code_dridex) are rebuilt from scratch on every call, so
        calling scan() several times does not accumulate duplicates.

        :param include_decoded_strings: bool, if True, all encoded strings will be included with their decoded content.
        :return: list of tuples (type, keyword, description)
        (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex string')
        """
        # First, detect and extract hex-encoded strings:
        self.hex_strings = detect_hex_strings(self.code)
        # detect if the code contains StrReverse:
        self.strReverse = 'strreverse' in self.code.lower()
        # Then gather the decoded strings, to detect obfuscated IOCs and keywords:
        self.code_hex = ''.join(
            '\n'+decoded for encoded, decoded in self.hex_strings)
        if self.strReverse:
            # if the code contains "StrReverse", also keep the hex strings in reverse order.
            # StrReverse after hex decoding:
            self.code_hex_rev = ''.join(
                '\n'+decoded[::-1] for encoded, decoded in self.hex_strings)
            # StrReverse before hex decoding:
            #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/
            self.code_rev_hex = ''.join(
                '\n'+binascii.unhexlify(encoded[::-1])
                for encoded, decoded in self.hex_strings)
        else:
            self.code_hex_rev = ''
            self.code_rev_hex = ''
        #TODO: also append the full code reversed if StrReverse? (risk of false positives?)
        # Detect Base64-encoded strings
        self.base64_strings = detect_base64_strings(self.code)
        self.code_base64 = ''.join(
            '\n'+decoded for encoded, decoded in self.base64_strings)
        # Detect Dridex-encoded strings
        self.dridex_strings = detect_dridex_strings(self.code)
        self.code_dridex = ''.join(
            '\n'+decoded for encoded, decoded in self.dridex_strings)

        self.autoexec_keywords = []
        self.suspicious_keywords = []
        self.iocs = []

        # Scan the clear-text code, then each set of decoded strings, tagging
        # the findings with the name of the obfuscation that was undone:
        for code, obfuscation in (
                (self.code, None),
                (self.code_hex, 'Hex'),
                (self.code_hex_rev, 'Hex+StrReverse'),
                (self.code_rev_hex, 'StrReverse+Hex'),
                (self.code_base64, 'Base64'),
                (self.code_dridex, 'Dridex'),
        ):
            self.autoexec_keywords += detect_autoexec(code, obfuscation)
            self.suspicious_keywords += detect_suspicious(code, obfuscation)
            self.iocs += detect_patterns(code, obfuscation)

        # If encoded strings were discovered, add an item to suspicious keywords:
        if self.hex_strings:
            self.suspicious_keywords.append(('Hex Strings',
                'Hex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.base64_strings:
            self.suspicious_keywords.append(('Base64 Strings',
                'Base64-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))
        if self.dridex_strings:
            self.suspicious_keywords.append(('Dridex Strings',
                'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)'))

        results = []
        results.extend(('AutoExec', keyword, description)
                       for keyword, description in self.autoexec_keywords)
        results.extend(('Suspicious', keyword, description)
                       for keyword, description in self.suspicious_keywords)
        # note: for IOCs the value comes second and the pattern type third
        results.extend(('IOC', value, pattern_type)
                       for pattern_type, value in self.iocs)
        if include_decoded_strings:
            results.extend(('Hex String', repr(decoded), encoded)
                           for encoded, decoded in self.hex_strings)
            results.extend(('Base64 String', repr(decoded), encoded)
                           for encoded, decoded in self.base64_strings)
            results.extend(('Dridex string', repr(decoded), encoded)
                           for encoded, decoded in self.dridex_strings)
        return results

    def scan_summary(self):
        """
        Analyze the provided VBA code to detect suspicious keywords,
        auto-executable macros, IOC patterns, obfuscation patterns
        such as hex-encoded strings.

        This runs a full scan() and only reports the counts.

        :return: tuple with the number of items found for each category:
            (autoexec, suspicious, IOCs, hex, base64, dridex)
        """
        self.scan()
        return (len(self.autoexec_keywords), len(self.suspicious_keywords),
                len(self.iocs), len(self.hex_strings), len(self.base64_strings),
                len(self.dridex_strings))
  1151 +
  1152 +
  1153 +
def scan_vba(vba_code, include_decoded_strings):
    """
    Scan VBA source code for suspicious keywords, auto-executable macros,
    IOC patterns and obfuscation patterns such as hex-encoded strings.
    (convenience wrapper: builds a VBA_Scanner and returns its scan() result)

    :param vba_code: str, VBA source code to be analyzed
    :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content.
    :return: list of tuples (type, keyword, description), as returned by VBA_Scanner.scan
    """
    scanner = VBA_Scanner(vba_code)
    return scanner.scan(include_decoded_strings)
  1167 +
  1168 +
  1169 +#=== CLASSES =================================================================
  1170 +
class VBA_Parser(object):
    """
    Class to parse MS Office files, to detect VBA macros and extract VBA source code
    Supported file formats:
    - Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm)
    - Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb)
    - PowerPoint 2007+ (.pptm, .ppsm)
    - Word 2003 XML (WordProcessingML)
    """

    def __init__(self, filename, data=None):
        """
        Constructor for VBA_Parser

        The container type is detected from the content: OLE file, zip archive
        (OpenXML) or Word 2003 XML. For the last two, every embedded OLE
        container is parsed by a child VBA_Parser stored in self.ole_subfiles.

        :param filename: filename or path of file to parse, or file-like object

        :param data: None or bytes str, if None the file will be read from disk (or from the file-like object).
            If data is provided as a bytes string, it will be parsed as the content of the file in memory,
            and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb').

        :raises TypeError: if the file is not an OLE, zip or Word 2003 XML file
        """
        #TODO: filename should only be a string, data should be used for the file-like object
        #TODO: filename should be mandatory, optional data is a string or file-like object
        #TODO: also support olefile and zipfile as input
        if data is None:
            # open file from disk:
            _file = filename
        else:
            # file already read in memory, make it a file-like object for zipfile:
            _file = cStringIO.StringIO(data)
        self.ole_file = None
        self.ole_subfiles = []
        self.filename = filename
        self.type = None
        self.vba_projects = None
        if olefile.isOleFile(_file):
            # This looks like an OLE file
            logging.info('Parsing OLE file %s' % self.filename)
            # Open and parse the OLE file, using unicode for path names:
            self.ole_file = olefile.OleFileIO(_file, path_encoding=None)
            self.type = TYPE_OLE
            #TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet
        elif zipfile.is_zipfile(_file):
            # This looks like a zip file, need to look for vbaProject.bin inside
            # It can be any OLE file inside the archive
            #...because vbaProject.bin can be renamed:
            # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
            logging.info('Opening ZIP/OpenXML file %s' % self.filename)
            self.type = TYPE_OpenXML
            z = zipfile.ZipFile(_file)
            # try/finally so that the archive is closed even if reading a
            # member fails:
            try:
                #TODO: check if this is actually an OpenXML file
                #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically?
                # check each file within the zip if it is an OLE file, by reading its magic:
                for subfile in z.namelist():
                    magic = z.open(subfile).read(len(olefile.MAGIC))
                    if magic != olefile.MAGIC:
                        continue
                    logging.debug('Opening OLE file %s within zip' % subfile)
                    ole_data = z.open(subfile).read()
                    try:
                        self.ole_subfiles.append(VBA_Parser(filename=subfile, data=ole_data))
                    except Exception:
                        # best effort: skip that member, but let
                        # KeyboardInterrupt/SystemExit propagate
                        logging.debug('%s is not a valid OLE file' % subfile)
                        continue
            finally:
                z.close()
        else:
            # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML,
            # or a plain text file containing VBA code
            if data is None:
                with open(filename, 'rb') as f:
                    data = f.read()
            # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
            if 'http://schemas.microsoft.com/office/word/2003/wordml' in data:
                logging.info('Opening Word 2003 XML file %s' % self.filename)
                self.type = TYPE_Word2003_XML
                # parse the XML content
                et = ET.fromstring(data)
                # find all the binData elements:
                for bindata in et.getiterator(TAG_BINDATA):
                    # the binData content is an OLE container for the VBA project, compressed
                    # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
                    # get the filename:
                    fname = bindata.get(ATTR_NAME, 'noname.mso')
                    # decode the base64 activemime
                    activemime = binascii.a2b_base64(bindata.text)
                    # decompress the zlib data starting at offset 0x32, which is the OLE container:
                    ole_data = zlib.decompress(activemime[0x32:])
                    try:
                        self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))
                    except Exception:
                        # best effort: skip that element, but let
                        # KeyboardInterrupt/SystemExit propagate
                        logging.debug('%s is not a valid OLE file' % fname)
                        continue
                #TODO: handle exceptions
            #TODO: Excel 2003 XML
            #TODO: plain text VBA file
            else:
                msg = '%s is not an OLE nor an OpenXML file, cannot extract VBA Macros.' % self.filename
                logging.error(msg)
                raise TypeError(msg)

    def find_vba_projects (self):
        """
        Finds all the VBA projects stored in an OLE file.

        Return None if the file is not OLE but OpenXML.
        Return a list of tuples (vba_root, project_path, dir_path) for each VBA project.
        vba_root is the path of the root OLE storage containing the VBA project,
        including a trailing slash unless it is the root of the OLE file.
        project_path is the path of the OLE stream named "PROJECT" within the VBA project.
        dir_path is the path of the OLE stream named "VBA/dir" within the VBA project.

        If this function returns an empty list for one of the supported formats
        (i.e. Word, Excel, Powerpoint except Powerpoint 97-2003), then the
        file does not contain VBA macros.

        The result is cached in self.vba_projects: later calls return it
        without scanning the OLE file again.

        :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path)
        for each VBA project found if OLE file
        """
        # if the file is not OLE but OpenXML, return None:
        if self.ole_file is None:
            return None

        # if this method has already been called, return previous result:
        if self.vba_projects is not None:
            return self.vba_projects

        # Find the VBA project root (different in MS Word, Excel, etc):
        # - Word 97-2003: Macros
        # - Excel 97-2003: _VBA_PROJECT_CUR
        # - PowerPoint 97-2003: not supported yet (different file structure)
        # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin.
        # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word
        # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word
        # - Visio 2007: not supported yet (different file structure)

        # According to MS-OVBA section 2.2.1:
        # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream
        # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream
        # - all names are case-insensitive

        def check_vba_stream(ole, vba_root, stream_path):
            # Return the full path of the stream if it exists under vba_root
            # and is an OLE stream, False otherwise.
            full_path = vba_root + stream_path
            if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM:
                logging.debug('Found %s stream: %s' % (stream_path, full_path))
                return full_path
            logging.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path)
            return False

        # start with an empty list:
        self.vba_projects = []
        # Look for any storage containing those storage/streams:
        ole = self.ole_file
        for storage in ole.listdir(streams=False, storages=True):
            # Look for a storage ending with "VBA":
            if storage[-1].upper() != 'VBA':
                continue
            logging.debug('Found VBA storage: %s' % ('/'.join(storage)))
            vba_root = '/'.join(storage[:-1])
            # Add a trailing slash to vba_root, unless it is the root of the OLE file:
            # (used later to append all the child streams/storages)
            if vba_root != '':
                vba_root += '/'
            logging.debug('Checking vba_root="%s"' % vba_root)

            # Check if the VBA root storage also contains a PROJECT stream:
            project_path = check_vba_stream(ole, vba_root, 'PROJECT')
            if not project_path: continue
            # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream:
            vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT')
            if not vba_project_path: continue
            # Check if the VBA root storage also contains a VBA/dir stream:
            dir_path = check_vba_stream(ole, vba_root, 'VBA/dir')
            if not dir_path: continue
            # Now we are pretty sure it is a VBA project structure
            logging.debug('VBA root storage: "%s"' % vba_root)
            # append the results to the list as a tuple for later use:
            self.vba_projects.append((vba_root, project_path, dir_path))
        return self.vba_projects

    def detect_vba_macros(self):
        """
        Detect the potential presence of VBA macros in the file, by checking
        if it contains VBA projects. Both OLE and OpenXML files are supported.

        Important: for now, results are accurate only for Word, Excel and PowerPoint
        EXCEPT Powerpoint 97-2003, which has a different structure for VBA.

        Note: this method does NOT attempt to check the actual presence or validity
        of VBA macro source code, so there might be false positives.
        It may also detect VBA macros in files embedded within the main file,
        for example an Excel workbook with macros embedded into a Word
        document without macros may be detected, without distinction.

        :return: bool, True if at least one VBA project has been found, False otherwise
        """
        #TODO: return None or raise exception if format not supported like PPT 97-2003
        #TODO: return the number of VBA projects found instead of True/False?
        # if OpenXML, check all the OLE subfiles:
        if self.ole_file is None:
            for ole_subfile in self.ole_subfiles:
                if ole_subfile.detect_vba_macros():
                    return True
            return False
        # otherwise it's an OLE file, find VBA projects:
        return bool(self.find_vba_projects())


    def extract_macros (self):
        """
        Extract and decompress source code for each VBA macro found in the file

        Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found
        If the file is OLE, filename is the path of the file.
        If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros
        within the zip archive, e.g. word/vbaProject.bin.
        """
        if self.ole_file is None:
            for ole_subfile in self.ole_subfiles:
                for results in ole_subfile.extract_macros():
                    yield results
        else:
            self.find_vba_projects()
            for vba_root, project_path, dir_path in self.vba_projects:
                # extract all VBA macros from that VBA root storage:
                for stream_path, vba_filename, vba_code in _extract_vba(self.ole_file, vba_root, project_path, dir_path):
                    yield (self.filename, stream_path, vba_filename, vba_code)


    def close(self):
        """
        Close all the open files. This method must be called after usage, if
        the application is opening many files.
        """
        if self.ole_file is None:
            for ole_subfile in self.ole_subfiles:
                ole_subfile.close()
        else:
            self.ole_file.close()
  1418 +
  1419 +
def print_analysis(vba_code, show_decoded_strings=False):
    """
    Scan the provided VBA code with scan_vba and print the findings as a
    table on standard output, or a one-line message when nothing was found.

    :param vba_code: str, VBA source code to be analyzed
    :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
    :return: None
    """
    findings = scan_vba(vba_code, show_decoded_strings)
    if not findings:
        print('No suspicious keyword or IOC found.')
        return
    table = prettytable.PrettyTable(('Type', 'Keyword', 'Description'))
    table.align = 'l'
    # maximum width of each column:
    table.max_width['Type'] = 10
    table.max_width['Keyword'] = 20
    table.max_width['Description'] = 39
    for kw_type, keyword, description in findings:
        table.add_row((kw_type, keyword, description))
    print(table)
  1440 +
  1441 +
  1442 +
def process_file (container, filename, data, show_decoded_strings=False):
    """
    Process a single file: print its type, the source code of each VBA macro
    found and the analysis of that code.

    Errors raised while parsing or analyzing the file are printed with their
    stack trace and do not stop the caller. KeyboardInterrupt and SystemExit
    are not intercepted, so that the user can still abort a long run.
    The parser is always closed before returning.

    :param container: str, path and filename of container if the file is within
        a zip archive, None otherwise.
    :param filename: str, path and filename of file on disk, or within the container.
    :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
    :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
    """
    #TODO: replace print by writing to a provided output file (sys.stdout by default)
    # note: print is always called with a single parenthesised argument, which
    # behaves the same with the Python 2 print statement
    if container:
        display_filename = '%s in %s' % (filename, container)
    else:
        display_filename = filename
    print('='*79)
    print('FILE: %s' % display_filename)
    vba = None
    try:
        #TODO: handle olefile errors, when an OLE file is malformed
        vba = VBA_Parser(filename, data)
        print('Type: %s' % vba.type)
        if vba.detect_vba_macros():
            for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
                # hide attribute lines:
                #TODO: option to disable attribute filtering
                vba_code_filtered = filter_vba(vba_code)
                print('-'*79)
                print('VBA MACRO %s ' % vba_filename)
                print('in file: %s - OLE stream: %s' % (subfilename, repr(stream_path)))
                print('- '*39)
                # detect empty macros:
                if vba_code_filtered.strip() == '':
                    print('(empty macro)')
                else:
                    print(vba_code_filtered)
                    print('- '*39)
                    print('ANALYSIS:')
                    # analyse the whole code, filtered to avoid false positives:
                    print_analysis(vba_code_filtered, show_decoded_strings)
        else:
            print('No VBA macros found.')
    except Exception:
        #TODO: print more info if debug mode
        # display the exception with full stack trace for debugging, but do not stop:
        traceback.print_exc()
    finally:
        # release the file handles, which matters when many files are processed:
        if vba is not None:
            vba.close()
    print('')
  1492 +
  1493 +
def process_file_triage(container, filename, data):
    """
    Process a single file in triage mode: print a one-line summary.

    The printed line is "<flags> <filename>", followed by " - <message>" when
    a message is available. For a parsed file, flags is a type prefix ('OLE:',
    'OpX:', 'XML:', or '???:' for a type not listed here) followed by seven
    characters, one per indicator: M (macros), A (auto-executable),
    S (suspicious keywords), I (IOCs), H (hex strings), B (Base64 strings),
    D (Dridex strings). An indicator that was not found is shown as '-'.
    Flags is '?' when the parser raises TypeError, and '!ERROR' for any other
    error, in which case the exception is used as the message.

    :param container: str, path and filename of container if the file is within
        a zip archive, None otherwise (not used by this function).
    :param filename: str, path and filename of file on disk, or within the container.
    :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
    """
    #TODO: replace print by writing to a provided output file (sys.stdout by default)
    # totals accumulated over all the macros found in the file:
    nb_macros = 0
    nb_autoexec = 0
    nb_suspicious = 0
    nb_iocs = 0
    nb_hexstrings = 0
    nb_base64strings = 0
    nb_dridexstrings = 0
    message = ''
    try:
        #TODO: handle olefile errors, when an OLE file is malformed
        vba = VBA_Parser(filename, data)
        if vba.detect_vba_macros():
            for (subfilename, stream_path, vba_filename, vba_code) in vba.extract_macros():
                nb_macros += 1
                if vba_code.strip() != '':
                    # analyse the whole code, filtered to avoid false positives:
                    scanner = VBA_Scanner(filter_vba(vba_code))
                    (found_autoexec, found_suspicious, found_iocs, found_hex,
                     found_base64, found_dridex) = scanner.scan_summary()
                    nb_autoexec += found_autoexec
                    nb_suspicious += found_suspicious
                    nb_iocs += found_iocs
                    nb_hexstrings += found_hex
                    nb_base64strings += found_base64
                    nb_dridexstrings += found_dridex
        # prefix describing the type of file reported by the parser:
        if vba.type == TYPE_OLE:
            flags = 'OLE:'
        elif vba.type == TYPE_OpenXML:
            flags = 'OpX:'
        elif vba.type == TYPE_Word2003_XML:
            flags = 'XML:'
        else:
            # a type without a prefix defined above: still report the
            # indicators instead of failing on an unassigned variable
            flags = '???:'
        # one character per indicator, in a fixed order:
        indicators = (
            (nb_macros, 'M'),
            (nb_autoexec, 'A'),
            (nb_suspicious, 'S'),
            (nb_iocs, 'I'),
            (nb_hexstrings, 'H'),
            (nb_base64strings, 'B'),
            (nb_dridexstrings, 'D'),
        )
        for total, letter in indicators:
            if total:
                flags += letter
            else:
                flags += '-'
    except TypeError:
        # file type not OLE nor OpenXML
        flags = '?'
        message = 'File format not supported'
    except Exception:
        # another error occurred: report it on the summary line and carry on
        # with the next file. KeyboardInterrupt and SystemExit are not caught
        # here, so that the user can still interrupt a long triage run.
        #TODO: print more info if debug mode
        #TODO: distinguish real errors from incorrect file types
        flags = '!ERROR'
        message = sys.exc_info()[1]
    line = '%-11s %s' % (flags, filename)
    if message:
        line += ' - %s' % message
    # single parenthesized argument: same output with the print statement
    print(line)
  1569 +
  1570 + # t = prettytable.PrettyTable(('filename', 'type', 'macros', 'autoexec', 'suspicious', 'ioc', 'hexstrings'),
  1571 + # header=False, border=False)
  1572 + # t.align = 'l'
  1573 + # t.max_width['filename'] = 30
  1574 + # t.max_width['type'] = 10
  1575 + # t.max_width['macros'] = 6
  1576 + # t.max_width['autoexec'] = 6
  1577 + # t.max_width['suspicious'] = 6
  1578 + # t.max_width['ioc'] = 6
  1579 + # t.max_width['hexstrings'] = 6
  1580 + # t.add_row((filename, ftype, macros, autoexec, suspicious, iocs, hexstrings))
  1581 + # print t
  1582 +
def main_triage_quick():
    """
    Placeholder: not implemented, does nothing and returns None.
    """
    return None
  1585 +
  1586 +#=== MAIN =====================================================================
  1587 +
def main():
    """
    Main function, called when olevba is run from the command line

    Parses the command line options, then either:
    - analyses VBA source code read from the file given with -i/--input, or
    - processes each file found by xglob.iter_files, in detailed mode (-d
      without -t) or in triage mode (any other combination of -d and -t).
    When neither -t nor -d is given and exactly one file was processed in
    triage mode, the detailed results for that file are displayed as well.

    Exits through sys.exit() after displaying the help (no file and no
    --input given) and after analysing a --input file.
    """
    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)
    #TODO: options -o/--outfile (output file) and -c/--csv (export results to a CSV file)
    parser.add_option("-r", action="store_true", dest="recursive",
        help='find files recursively in subdirectories.')
    parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
        help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)')
    parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
        help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
    parser.add_option("-t", action="store_true", dest="triage_mode",
        help='triage mode, display results as a summary table (default for multiple files)')
    parser.add_option("-d", action="store_true", dest="detailed_mode",
        help='detailed mode, display full results (default for single file)')
    parser.add_option("-i", "--input", dest='input', type='str', default=None,
        help='input file containing VBA source code to be analyzed (no parsing)')
    parser.add_option("--decode", action="store_true", dest="show_decoded_strings",
        help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex).')

    (options, args) = parser.parse_args()

    # Print help if no arguments are passed
    if not args and not options.input:
        print(__doc__)
        parser.print_help()
        sys.exit()

    # print banner with version
    print('olevba %s - http://decalage.info/python/oletools' % __version__)

    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) #INFO)
    # For now, all logging is disabled:
    logging.disable(logging.CRITICAL)

    if options.input:
        # input file provided with VBA source code to be analyzed directly:
        print('Analysis of VBA source code from %s:' % options.input)
        # close the file explicitly instead of leaving it to the garbage collector:
        vba_file = open(options.input)
        try:
            vba_code = vba_file.read()
        finally:
            vba_file.close()
        print_analysis(vba_code, show_decoded_strings=options.show_decoded_strings)
        sys.exit()

    # detailed output only when -d is given without -t, triage output otherwise:
    detailed_only = options.detailed_mode and not options.triage_mode
    if not detailed_only:
        print('%-11s %-65s' % ('Flags', 'Filename'))
        print('%-11s %-65s' % ('-'*11, '-'*65))
    previous_container = None
    count = 0
    # last file actually processed: the loop variables cannot be used for that
    # after the loop, because they may refer to a skipped directory entry
    last_processed = None
    for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
        zip_password=options.zip_password, zip_fname=options.zip_fname):
        # ignore directory names stored in zip files:
        if container and filename.endswith('/'):
            continue
        if detailed_only:
            # fully detailed output
            process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings)
        else:
            # print container name when it changes:
            if container != previous_container:
                if container is not None:
                    print('\nFiles in %s:' % container)
                previous_container = container
            # summarized output for triage:
            process_file_triage(container, filename, data)
        last_processed = (container, filename, data)
        count += 1
    if not detailed_only:
        print('\n(Flags: OpX=OpenXML, XML=Word2003XML, M=Macros, A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, B=Base64 strings, D=Dridex strings, ?=Unknown)\n')

    if count == 1 and not options.triage_mode and not options.detailed_mode:
        # if options -t and -d were not specified and it's a single file, print details:
        #TODO: avoid doing the analysis twice by storing results
        container, filename, data = last_processed
        process_file(container, filename, data, show_decoded_strings=options.show_decoded_strings)
  1667 +
# Run main() only when this file is executed as a script, not when it is
# imported as a module:
if __name__ == '__main__':
    main()
  1670 +
1671 # This was coded while listening to "Dust" from I Love You But I've Chosen Darkness 1671 # This was coded while listening to "Dust" from I Love You But I've Chosen Darkness
1672 \ No newline at end of file 1672 \ No newline at end of file