rtfobj.py
16.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
#!/usr/bin/env python
"""
rtfobj.py
rtfobj is a Python module to extract embedded objects from RTF files, such as
OLE ojects. It can be used as a Python library or a command-line tool.
Usage: rtfobj.py <file.rtf>
rtfobj project website: http://www.decalage.info/python/rtfobj
rtfobj is part of the python-oletools package:
http://www.decalage.info/python/oletools
"""
#=== LICENSE =================================================================
# rtfobj is copyright (c) 2012-2016, Philippe Lagadec (http://www.decalage.info)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#------------------------------------------------------------------------------
# CHANGELOG:
# 2012-11-09 v0.01 PL: - first version
# 2013-04-02 v0.02 PL: - fixed bug in main
# 2015-12-09 v0.03 PL: - configurable logging, CLI options
# - extract OLE 1.0 objects
# - extract files from OLE Package objects
# 2016-04-01 v0.04 PL: - fixed logging output to use stdout instead of stderr
# 2016-04-07 v0.45 PL: - improved parsing to handle some malware tricks
__version__ = '0.45'
#------------------------------------------------------------------------------
# TODO:
# - improve regex pattern for better performance?
# - allow semicolon within hex, as found in this sample:
# http://contagiodump.blogspot.nl/2011/10/sep-28-cve-2010-3333-manuscript-with.html
#=== IMPORTS =================================================================
import re, sys, string, binascii, logging, optparse
from thirdparty.xglob import xglob
from oleobj import OleObject, OleNativeStream
import oleobj
# === LOGGING =================================================================
class NullHandler(logging.Handler):
"""
Log Handler without output, to avoid printing messages if logging is not
configured by the main application.
Python 2.7 has logging.NullHandler, but this is necessary for 2.6:
see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library
"""
def emit(self, record):
pass
def get_logger(name, level=logging.CRITICAL+1):
"""
Create a suitable logger object for this module.
The goal is not to change settings of the root logger, to avoid getting
other modules' logs on the screen.
If a logger exists with same name, reuse it. (Else it would have duplicate
handlers and messages would be doubled.)
The level is set to CRITICAL+1 by default, to avoid any logging.
"""
# First, test if there is already a logger with the same name, else it
# will generate duplicate messages (due to duplicate handlers):
if name in logging.Logger.manager.loggerDict:
#NOTE: another less intrusive but more "hackish" solution would be to
# use getLogger then test if its effective level is not default.
logger = logging.getLogger(name)
# make sure level is OK:
logger.setLevel(level)
return logger
# get a new logger:
logger = logging.getLogger(name)
# only add a NullHandler for this logger, it is up to the application
# to configure its own logging:
logger.addHandler(NullHandler())
logger.setLevel(level)
return logger
# a global logger object used for debugging:
log = get_logger('rtfobj')
#=== CONSTANTS=================================================================
# REGEX pattern to extract embedded OLE objects in hexadecimal format:
# alphanum digit: [0-9A-Fa-f]
HEX_DIGIT = r'[0-9A-Fa-f]'
# hex char = two alphanum digits: [0-9A-Fa-f]{2}
# HEX_CHAR = r'[0-9A-Fa-f]{2}'
# in fact MS Word allows whitespaces in between the hex digits!
# HEX_CHAR = r'[0-9A-Fa-f]\s*[0-9A-Fa-f]'
# Even worse, MS Word also allows ANY RTF-style tag {*} in between!!
# AND the tags can be nested...
SINGLE_RTF_TAG = r'[{][^{}]*[}]'
# Nested tags, two levels (because Python's re does not support nested matching):
NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]'
# ignored whitespaces and tags within a hex block:
IGNORED = r'(?:\s|'+NESTED_RTF_TAG+r')*'
HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT
# several hex chars, at least 4: (?:[0-9A-Fa-f]{2}){4,}
# + word boundaries
HEX_CHARS_4orMORE = r'\b(?:' + HEX_CHAR + r'){4,}\b'
# at least 1 hex char:
HEX_CHARS_1orMORE = r'(?:' + HEX_CHAR + r')+'
# at least 1 hex char, followed by whitespace or CR/LF:
HEX_CHARS_1orMORE_WHITESPACES = r'(?:' + HEX_CHAR + r')+\s+'
# + word boundaries around hex block
# HEX_CHARS_1orMORE_WHITESPACES = r'\b(?:' + HEX_CHAR + r')+\b\s*'
# at least one block of hex and whitespace chars, followed by closing curly bracket:
# HEX_BLOCK_CURLY_BRACKET = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')+\}'
# PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE
PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b'
# at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s*
# PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}'
# improved pattern, allowing semicolons within hex:
#PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}'
# a dummy translation table for str.translate, which does not change anythying:
TRANSTABLE_NOCHANGE = string.maketrans('', '')
re_hexblock = re.compile(PATTERN)
re_embedded_tags = re.compile(IGNORED)
re_decimal = re.compile(r'\d+')
re_delimiter = re.compile(r'[ \t\r\n\f\v]')
DELIMITER = r'[ \t\r\n\f\v]'
DELIMITERS_ZeroOrMore = r'[ \t\r\n\f\v]*'
BACKSLASH_BIN = r'\\bin'
# According to my tests, Word accepts up to 250 digits (leading zeroes)
DECIMAL_GROUP = r'(\d{1,250})'
re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + BACKSLASH_BIN
+ DECIMAL_GROUP + DELIMITER)
re_delim_hexblock = re.compile(DELIMITER + PATTERN)
#=== FUNCTIONS ===============================================================
def rtf_iter_objects_old (filename, min_size=32):
"""
Open a RTF file, extract each embedded object encoded in hexadecimal of
size > min_size, yield the index of the object in the RTF file and its data
in binary format.
This is an iterator.
"""
data = open(filename, 'rb').read()
for m in re.finditer(PATTERN, data):
found = m.group(0)
orig_len = len(found)
# remove all whitespace and line feeds:
#NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v}')
found = binascii.unhexlify(found)
#print repr(found)
if len(found)>min_size:
yield m.start(), orig_len, found
# TODO: backward-compatible API?
def search_hex_block(data, pos=0, min_size=32, first=True):
if first:
# Search 1st occurence of a hex block:
match = re_hexblock.search(data, pos=pos)
else:
# Match next occurences of a hex block, from the current position only:
match = re_hexblock.match(data, pos=pos)
def rtf_iter_objects (data, min_size=32):
"""
Open a RTF file, extract each embedded object encoded in hexadecimal of
size > min_size, yield the index of the object in the RTF file and its data
in binary format.
This is an iterator.
"""
# Search 1st occurence of a hex block:
match = re_hexblock.search(data)
if match is None:
# no hex block found
return
while match is not None:
found = match.group(0)
# start index
start = match.start()
# current position
current = match.end()
if len(found) < min_size:
match = re_hexblock.search(data, pos=current)
continue
log.debug('Found hex block starting at %08X, end %08X' % (start, current))
log.debug('Match: %s' % found)
# remove all whitespace and line feeds:
#NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
# TODO: make it a function
# Also remove embedded RTF tags:
found = re_embedded_tags.sub('', found)
# object data extracted from the RTF file
# MS Word accepts an extra hex digit, so we need to trim it if present:
if len(found) & 1:
found = found[:-1]
log.debug('Cleaned match: %s' % found)
objdata = binascii.unhexlify(found)
# Detect the "\bin" control word, which is sometimes used for obfuscation:
bin_match = re_delims_bin_decimal.match(data, pos=current)
while bin_match is not None:
log.debug('Found \\bin block starting at %08X : %r'
% (bin_match.start(), bin_match.group(0)))
# extract the decimal integer following '\bin'
bin_len = int(bin_match.group(1))
log.debug('\\bin block length = %d' % bin_len)
if current+bin_len > len(data):
log.error('\\bin block length is larger than the remaining data')
# move the current index, ignore the \bin block
current += len(bin_match.group(0))
break
# read that number of bytes:
objdata += data[current:current+bin_len]
# TODO: handle exception
current += len(bin_match.group(0)) + bin_len
# TODO: check if current is out of range
# TODO: is Word limiting the \bin length to a number of digits?
log.debug('Current position = %08X' % current)
match = re_delim_hexblock.match(data, pos=current)
if match is not None:
log.debug('Found next hex block starting at %08X, end %08X'
% (match.start(), match.end()))
found = match.group(0)
log.debug('Match: %s' % found)
# remove all whitespace and line feeds:
#NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
# Also remove embedded RTF tags:
found = re_embedded_tags.sub(found, '')
objdata += binascii.unhexlify(found)
current = match.end()
bin_match = re_delims_bin_decimal.match(data, pos=current)
# print repr(found)
if len(objdata)>min_size:
yield start, current-start, objdata
# Search next occurence of a hex block:
match = re_hexblock.search(data, pos=current)
def process_file(container, filename, data):
# TODO: option to extract objects to files (false by default)
if data is None:
data = open(filename, 'rb').read()
print '-'*79
print 'File: %r - %d bytes' % (filename, len(data))
for index, orig_len, objdata in rtf_iter_objects(data):
print 'found object size %d at index %08X - end %08X' % (len(objdata), index, index+orig_len)
fname = '%s_object_%08X.raw' % (filename, index)
print 'saving object to file %s' % fname
open(fname, 'wb').write(objdata)
# TODO: check if all hex data is extracted properly
obj = OleObject()
try:
obj.parse(objdata)
print 'extract file embedded in OLE object:'
print 'format_id = %d' % obj.format_id
print 'class name = %r' % obj.class_name
print 'data size = %d' % obj.data_size
# set a file extension according to the class name:
class_name = obj.class_name.lower()
if class_name.startswith('word'):
ext = 'doc'
elif class_name.startswith('package'):
ext = 'package'
else:
ext = 'bin'
fname = '%s_object_%08X.%s' % (filename, index, ext)
print 'saving to file %s' % fname
open(fname, 'wb').write(obj.data)
if obj.class_name.lower() == 'package':
print 'Parsing OLE Package'
opkg = OleNativeStream(bindata=obj.data)
print 'Filename = %r' % opkg.filename
print 'Source path = %r' % opkg.src_path
print 'Temp path = %r' % opkg.temp_path
if opkg.filename:
fname = '%s_%s' % (filename, opkg.filename)
else:
fname = '%s_object_%08X.noname' % (filename, index)
print 'saving to file %s' % fname
open(fname, 'wb').write(opkg.data)
except:
pass
# log.exception('*** Not an OLE 1.0 Object')
#=== MAIN =================================================================
if __name__ == '__main__':
# print banner with version
print ('rtfobj %s - http://decalage.info/python/oletools' % __version__)
print ('THIS IS WORK IN PROGRESS - Check updates regularly!')
print ('Please report any issue at https://bitbucket.org/decalage/oletools/issues')
print ('')
DEFAULT_LOG_LEVEL = "warning" # Default log level
LOG_LEVELS = {'debug': logging.DEBUG,
'info': logging.INFO,
'warning': logging.WARNING,
'error': logging.ERROR,
'critical': logging.CRITICAL
}
usage = 'usage: %prog [options] <filename> [filename2 ...]'
parser = optparse.OptionParser(usage=usage)
# parser.add_option('-o', '--outfile', dest='outfile',
# help='output file')
# parser.add_option('-c', '--csv', dest='csv',
# help='export results to a CSV file')
parser.add_option("-r", action="store_true", dest="recursive",
help='find files recursively in subdirectories.')
parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)')
parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
help="logging level debug/info/warning/error/critical (default=%default)")
(options, args) = parser.parse_args()
# Print help if no arguments are passed
if len(args) == 0:
print __doc__
parser.print_help()
sys.exit()
# Setup logging to the console:
# here we use stdout instead of stderr by default, so that the output
# can be redirected properly.
logging.basicConfig(level=LOG_LEVELS[options.loglevel], stream=sys.stdout,
format='%(levelname)-8s %(message)s')
# enable logging in the modules:
log.setLevel(logging.NOTSET)
oleobj.log.setLevel(logging.NOTSET)
for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
zip_password=options.zip_password, zip_fname=options.zip_fname):
# ignore directory names stored in zip files:
if container and filename.endswith('/'):
continue
process_file(container, filename, data)