From 383ae415084903920c31ef787cb343b8572187e5 Mon Sep 17 00:00:00 2001 From: Philippe Lagadec Date: Wed, 9 Dec 2015 21:22:16 +0100 Subject: [PATCH] rtfobj: extract OLE 1.0 objects and files from OLE Package objects, improved CLI options and logging. Added new module oleobj to parse OLE structures. --- oletools/oleobj.py | 287 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ oletools/rtfobj.py | 282 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 554 insertions(+), 15 deletions(-) create mode 100644 oletools/oleobj.py diff --git a/oletools/oleobj.py b/oletools/oleobj.py new file mode 100644 index 0000000..6424649 --- /dev/null +++ b/oletools/oleobj.py @@ -0,0 +1,287 @@ +#!/usr/bin/env python +""" +oleobj.py + +oleobj is a Python script and module to parse OLE objects and files stored +into various file formats such as RTF or MS Office documents (e.g. Word, Excel). + +Author: Philippe Lagadec - http://www.decalage.info +License: BSD, see source code or documentation + +oleobj is part of the python-oletools package: +http://www.decalage.info/python/oletools +""" + +# === LICENSE ================================================================== + +# oleobj is copyright (c) 2015 Philippe Lagadec (http://www.decalage.info) +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +#------------------------------------------------------------------------------ +# CHANGELOG: +# 2015-12-05 v0.01 PL: - first version + +__version__ = '0.01' + +#------------------------------------------------------------------------------ +# TODO: +# + setup logging (common with other oletools) + + +#------------------------------------------------------------------------------ +# REFERENCES: + +# Reference for the storage of embedded OLE objects/files: +# [MS-OLEDS]: Object Linking and Embedding (OLE) Data Structures +# https://msdn.microsoft.com/en-us/library/dd942265.aspx + +# - officeparser: https://github.com/unixfreak0037/officeparser +# TODO: oledump + + +#--- IMPORTS ------------------------------------------------------------------ + +import logging, struct + + +# === LOGGING ================================================================= + +class NullHandler(logging.Handler): + """ + Log Handler without output, to avoid printing messages if logging is not + configured by the main application. + Python 2.7 has logging.NullHandler, but this is necessary for 2.6: + see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library + """ + def emit(self, record): + pass + +def get_logger(name, level=logging.CRITICAL+1): + """ + Create a suitable logger object for this module. + The goal is not to change settings of the root logger, to avoid getting + other modules' logs on the screen. + If a logger exists with same name, reuse it. (Else it would have duplicate + handlers and messages would be doubled.) + The level is set to CRITICAL+1 by default, to avoid any logging. + """ + # First, test if there is already a logger with the same name, else it + # will generate duplicate messages (due to duplicate handlers): + if name in logging.Logger.manager.loggerDict: + #NOTE: another less intrusive but more "hackish" solution would be to + # use getLogger then test if its effective level is not default. 
class OleObject (object):
    """
    OLE 1.0 Object structure, used to store an embedded or linked OLE object
    (e.g. inside an RTF file or an OLE "native" stream).

    see MS-OLEDS 2.2 OLE1.0 Format Structures
    """

    # constants for the format_id attribute:
    # see MS-OLEDS 2.2.4 ObjectHeader
    TYPE_LINKED = 0x01
    TYPE_EMBEDDED = 0x02

    def __init__(self, bindata=None):
        """
        Constructor for OleObject.
        If bindata is provided, it will be parsed using the parse() method.

        :param bindata: bytes, OLE 1.0 Object structure containing an OLE object
        """
        self.ole_version = None
        self.format_id = None
        self.class_name = None
        self.topic_name = None
        self.item_name = None
        self.data = None
        self.data_size = None
        # extra bytes found after the declared data (slack space):
        self.extra_data = None
        # BUGFIX: the docstring promised that bindata would be parsed, but the
        # original constructor never called parse(). Do it, matching the
        # behavior of OleNativeStream(bindata=...):
        if bindata is not None:
            self.parse(bindata)

    def parse(self, data):
        """
        Parse binary data containing an OLE 1.0 Object structure,
        to extract the OLE object it contains.
        (see MS-OLEDS 2.2 OLE1.0 Format Structures)

        :param data: bytes, OLE 1.0 Object structure containing an OLE object
        :return: None
        :raises ValueError: if the data is not a valid OLE 1.0 Object
            (unknown FormatID, or declared size larger than available data)
        """
        # Header: see MS-OLEDS 2.2.4 ObjectHeader
        self.ole_version, data = read_uint32(data)
        self.format_id, data = read_uint32(data)
        log.debug('OLE version=%08X - Format ID=%08X' % (self.ole_version, self.format_id))
        # explicit exception instead of assert, so the check is not stripped
        # when Python runs with -O:
        if self.format_id not in (self.TYPE_EMBEDDED, self.TYPE_LINKED):
            raise ValueError('Unknown FormatID %08X: not an OLE 1.0 Object' % self.format_id)
        self.class_name, data = read_LengthPrefixedAnsiString(data)
        self.topic_name, data = read_LengthPrefixedAnsiString(data)
        self.item_name, data = read_LengthPrefixedAnsiString(data)
        log.debug('Class name=%r - Topic name=%r - Item name=%r'
                  % (self.class_name, self.topic_name, self.item_name))
        if self.format_id == self.TYPE_EMBEDDED:
            # Embedded object: see MS-OLEDS 2.2.5 EmbeddedObject
            self.data_size, data = read_uint32(data)
            log.debug('Declared data size=%d - remaining size=%d' % (self.data_size, len(data)))
            self.data = data[:self.data_size]
            # raise instead of failing with an opaque AssertionError when the
            # declared size exceeds the remaining data (original TODO):
            if len(self.data) != self.data_size:
                raise ValueError('Declared data size (%d) is larger than the '
                                 'remaining data (%d)' % (self.data_size, len(self.data)))
            self.extra_data = data[self.data_size:]
# === LOGGING =================================================================

class NullHandler(logging.Handler):
    """
    Log Handler without output, to avoid printing messages if logging is not
    configured by the main application.
    Python 2.7 has logging.NullHandler, but this is necessary for 2.6:
    see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library
    """
    def emit(self, record):
        # deliberately discard every record
        pass


def get_logger(name, level=logging.CRITICAL+1):
    """
    Return a logger named *name*, silenced by default.

    The root logger's settings are never touched, so other modules' logs do
    not appear on screen. The level defaults to CRITICAL+1, i.e. nothing is
    ever emitted unless the application lowers it.

    If a logger with the same name already exists it is reused, because
    attaching a second handler to it would double every message.
    """
    # check for an existing logger BEFORE getLogger() creates one:
    already_registered = name in logging.Logger.manager.loggerDict
    logger = logging.getLogger(name)
    if not already_registered:
        # brand-new logger: give it a single NullHandler, leaving any real
        # handler configuration up to the application
        logger.addHandler(NullHandler())
    # in both cases, (re)apply the requested level:
    logger.setLevel(level)
    return logger


# a global logger object used for debugging:
log = get_logger('rtfobj')
def rtf_iter_objects_old (filename, min_size=32):
    """
    Open a RTF file, extract each embedded object encoded in hexadecimal of
    size > min_size, yield the index of the object in the RTF file, the
    length of the matched hex block, and its data in binary format.
    This is an iterator. (superseded by rtf_iter_objects)
    """
    data = open(filename, 'rb').read()
    for hex_match in re.finditer(PATTERN, data):
        hexblock = hex_match.group(0)
        matched_len = len(hexblock)
        # strip whitespace, line feeds and curly brackets before decoding:
        #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
        hexblock = hexblock.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v}')
        objdata = binascii.unhexlify(hexblock)
        # only report objects large enough to be interesting:
        if len(objdata) > min_size:
            yield hex_match.start(), matched_len, objdata
def search_hex_block(data, pos=0, min_size=32, first=True):
    """
    Search for a block of hexadecimal characters in data, starting at pos.

    NOTE(review): unfinished helper from the original commit - it computes a
    match object but never returns or uses it, so this function always
    returns None. Kept with identical (non-)behavior to preserve the module
    interface; see rtf_iter_objects for the working logic.
    """
    if first:
        # Search 1st occurence of a hex block:
        match = re_hexblock.search(data, pos=pos)
    else:
        # Match next occurences of a hex block, from the current position only:
        match = re_hexblock.match(data, pos=pos)


def rtf_iter_objects (data, min_size=32):
    """
    Extract each embedded object encoded in hexadecimal from RTF data,
    handling the \\bin control word (raw binary runs) sometimes used for
    obfuscation. Yield (start_index, length_in_rtf, object_data) for each
    object whose decoded size is > min_size.
    This is an iterator.

    :param data: str, content of the RTF file
    :param min_size: int, minimum decoded size for an object to be reported
    """
    # Search 1st occurence of a hex block:
    match = re_hexblock.search(data)
    if match is None:
        # no hex block found
        return
    while match is not None:
        found = match.group(0)
        # start index
        start = match.start()
        # current position
        current = match.end()
        if len(found) < min_size:
            # too small to be an embedded object: skip to the next hex block
            match = re_hexblock.search(data, pos=current)
            continue
        log.debug('Found hex block starting at %08X, end %08X' % (start, current))
        # remove all whitespace and line feeds:
        #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
        found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
        # object data extracted from the RTF file
        objdata = binascii.unhexlify(found)
        # Detect the "\bin" control word, which is sometimes used for obfuscation:
        bin_match = re_delims_bin_decimal.match(data, pos=current)
        while bin_match is not None:
            log.debug('Found \\bin block starting at %08X : %r'
                      % (bin_match.start(), bin_match.group(0)))
            # extract the decimal integer following '\bin'
            bin_len = int(bin_match.group(1))
            log.debug('\\bin block length = %d' % bin_len)
            if current+bin_len > len(data):
                log.error('\\bin block length is larger than the remaining data')
                # move the current index, ignore the \bin block
                current += len(bin_match.group(0))
                break
            # the \bin payload is raw binary: append it as-is
            objdata += data[current:current+bin_len]
            current += len(bin_match.group(0)) + bin_len
            # TODO: is Word limiting the \bin length to a number of digits?
            log.debug('Current position = %08X' % current)
            # after a \bin run, hex data may continue immediately:
            match = re_delim_hexblock.match(data, pos=current)
            if match is not None:
                log.debug('Found next hex block starting at %08X, end %08X'
                          % (match.start(), match.end()))
                found = match.group(0)
                # remove all whitespace and line feeds:
                found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
                objdata += binascii.unhexlify(found)
                current = match.end()
            # another \bin may follow: loop again
            bin_match = re_delims_bin_decimal.match(data, pos=current)
        if len(objdata) > min_size:
            yield start, current-start, objdata
        # Search next occurence of a hex block:
        match = re_hexblock.search(data, pos=current)


def _read_file(filename):
    # read a whole file in binary mode, making sure the handle is closed
    # (the original open(...).read() leaked the handle until GC)
    f = open(filename, 'rb')
    try:
        return f.read()
    finally:
        f.close()


def _write_file(fname, filedata):
    # write binary data to a file, making sure the handle is closed
    # (the original open(...).write() leaked the handle until GC)
    f = open(fname, 'wb')
    try:
        f.write(filedata)
    finally:
        f.close()


def process_file(container, filename, data):
    """
    Extract and display all embedded objects found in a single RTF file,
    saving each of them to a file next to the input file.

    :param container: str, path of the container (e.g. zip archive) or None
    :param filename: str, path/name of the RTF file
    :param data: str, content of the file, or None to read it from filename
    """
    # TODO: option to extract objects to files (false by default)
    if data is None:
        data = _read_file(filename)
    print('-'*79)
    print('File: %r - %d bytes' % (filename, len(data)))
    for index, orig_len, objdata in rtf_iter_objects(data):
        print('found object size %d at index %08X - end %08X' % (len(objdata), index, index+orig_len))
        fname = '%s_object_%08X.raw' % (filename, index)
        print('saving object to file %s' % fname)
        _write_file(fname, objdata)
        # TODO: check if all hex data is extracted properly

        obj = OleObject()
        try:
            obj.parse(objdata)
            print('extract file embedded in OLE object:')
            print('format_id = %d' % obj.format_id)
            print('class name = %r' % obj.class_name)
            print('data size = %d' % obj.data_size)
            # set a file extension according to the class name:
            class_name = obj.class_name.lower()
            if class_name.startswith('word'):
                ext = 'doc'
            elif class_name.startswith('package'):
                ext = 'package'
            else:
                ext = 'bin'
            fname = '%s_object_%08X.%s' % (filename, index, ext)
            print('saving to file %s' % fname)
            _write_file(fname, obj.data)
            if obj.class_name.lower() == 'package':
                print('Parsing OLE Package')
                opkg = OleNativeStream(bindata=obj.data)
                print('Filename = %r' % opkg.filename)
                print('Source path = %r' % opkg.src_path)
                print('Temp path = %r' % opkg.temp_path)
                if opkg.filename:
                    fname = '%s_%s' % (filename, opkg.filename)
                else:
                    fname = '%s_object_%08X.noname' % (filename, index)
                print('saving to file %s' % fname)
                _write_file(fname, opkg.data)
        except Exception:
            # not every hex block is a well-formed OLE 1.0 Object: log quietly
            # and move on. (Narrowed from a bare "except: pass", which also
            # swallowed KeyboardInterrupt and SystemExit.)
            log.debug('*** Not an OLE 1.0 Object (index %08X)' % index, exc_info=True)


#=== MAIN =====================================================================

if __name__ == '__main__':
    # print banner with version
    print('rtfobj %s - http://decalage.info/python/oletools' % __version__)
    print('THIS IS WORK IN PROGRESS - Check updates regularly!')
    print('Please report any issue at https://bitbucket.org/decalage/oletools/issues')
    print('')

    DEFAULT_LOG_LEVEL = "warning" # Default log level
    LOG_LEVELS = {'debug':    logging.DEBUG,
                  'info':     logging.INFO,
                  'warning':  logging.WARNING,
                  'error':    logging.ERROR,
                  'critical': logging.CRITICAL
                 }

    # NOTE(review): '<filename>' restored here - it was lost to angle-bracket
    # stripping in the transmitted patch text.
    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("-r", action="store_true", dest="recursive",
                      help='find files recursively in subdirectories.')
    parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
                      help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)')
    parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
                      help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
    parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
                      help="logging level debug/info/warning/error/critical (default=%default)")

    (options, args) = parser.parse_args()

    # Print help if no arguments are passed
    if len(args) == 0:
        print(__doc__)
        parser.print_help()
        sys.exit()

    # setup logging to the console
    logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s')
    # enable logging in the modules (their own levels default to CRITICAL+1):
    log.setLevel(logging.NOTSET)
    oleobj.log.setLevel(logging.NOTSET)

    for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
                                                      zip_password=options.zip_password, zip_fname=options.zip_fname):
        # ignore directory names stored in zip files:
        if container and filename.endswith('/'):
            continue
        process_file(container, filename, data)