rtfobj.py 16.1 KB

Edit Raw Blame History

#!/usr/bin/env python
"""
rtfobj.py

rtfobj is a Python module to extract embedded objects from RTF files, such as
OLE objects. It can be used as a Python library or a command-line tool.

Usage: rtfobj.py [options] <filename> [filename2 ...]

rtfobj project website: http://www.decalage.info/python/rtfobj

rtfobj is part of the python-oletools package:
http://www.decalage.info/python/oletools
"""

#=== LICENSE =================================================================

# rtfobj is copyright (c) 2012-2016, Philippe Lagadec (http://www.decalage.info)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
#  * Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#------------------------------------------------------------------------------
# CHANGELOG:
# 2012-11-09 v0.01 PL: - first version
# 2013-04-02 v0.02 PL: - fixed bug in main
# 2015-12-09 v0.03 PL: - configurable logging, CLI options
#                      - extract OLE 1.0 objects
#                      - extract files from OLE Package objects
# 2016-04-01 v0.04 PL: - fixed logging output to use stdout instead of stderr
# 2016-04-07 v0.45 PL: - improved parsing to handle some malware tricks

__version__ = '0.45'

#------------------------------------------------------------------------------
# TODO:
# - improve regex pattern for better performance?
# - allow semicolon within hex, as found in  this sample:
#   http://contagiodump.blogspot.nl/2011/10/sep-28-cve-2010-3333-manuscript-with.html


#=== IMPORTS =================================================================

import re, sys, string, binascii, logging, optparse

from thirdparty.xglob import xglob
from oleobj import OleObject, OleNativeStream
import oleobj

# === LOGGING =================================================================

class NullHandler(logging.Handler):
    """
    Logging handler that silently discards every record it receives.

    Attaching it to a library logger prevents any output when the main
    application has not configured logging itself.
    Python 2.7 ships logging.NullHandler, but Python 2.6 does not, hence this
    local equivalent:
    see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library
    """

    def emit(self, record):
        """Ignore the given log record (nothing is written anywhere)."""
        return None

def get_logger(name, level=logging.CRITICAL+1):
    """
    Return a logger suitable for use inside this module.

    The root logger is deliberately left untouched, so that messages from
    other modules do not end up on the screen.

    :param name: name of the logger to create or reuse.
    :param level: level applied to the logger. The default, CRITICAL+1, is
        above every standard level and therefore disables all output.
    :return: the logging.Logger object registered under that name.
    """
    # This lookup has to happen BEFORE logging.getLogger(), because getLogger
    # registers the name as a side effect.
    already_registered = name in logging.Logger.manager.loggerDict
    #NOTE: another less intrusive but more "hackish" solution would be to
    # use getLogger then test if its effective level is not default.
    module_logger = logging.getLogger(name)
    if not already_registered:
        # A brand new logger only gets a NullHandler: it is up to the
        # application to configure its own logging. A logger that already
        # existed is reused as-is, otherwise it would collect duplicate
        # handlers and every message would be doubled.
        module_logger.addHandler(NullHandler())
    # In both cases, make sure the requested level is applied:
    module_logger.setLevel(level)
    return module_logger

# Module-level logger used by the functions of this file for debugging.
# get_logger() leaves it at CRITICAL+1 unless another level is given, so
# nothing is emitted until the level is lowered (e.g. with log.setLevel).
log = get_logger('rtfobj')


#=== CONSTANTS=================================================================

# REGEX patterns to extract embedded OLE objects in hexadecimal format.
# The patterns are assembled bottom-up from small string fragments, so the
# order of the definitions below matters.

# a single hexadecimal digit: [0-9A-Fa-f]
HEX_DIGIT = r'[0-9A-Fa-f]'

# History of the "hex char" (one byte = two hex digits) pattern:
# the naive version was two adjacent hex digits:
# HEX_CHAR = r'[0-9A-Fa-f]{2}'
# in fact MS Word allows whitespaces in between the hex digits!
# HEX_CHAR = r'[0-9A-Fa-f]\s*[0-9A-Fa-f]'
# Even worse, MS Word also allows ANY RTF-style tag {*} in between!!
# AND the tags can be nested...

# one RTF tag without any nested tag: "{", anything but curly brackets, "}"
SINGLE_RTF_TAG = r'[{][^{}]*[}]'
# Nested tags, two levels (because Python's re does not support nested matching):
NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]'
# ignored whitespaces and tags within a hex block (zero or more of them):
IGNORED = r'(?:\s|'+NESTED_RTF_TAG+r')*'

# one byte: two hex digits, possibly separated by ignored whitespaces/tags
HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT

# several hex chars, at least 4: (?:[0-9A-Fa-f]{2}){4,}
# + word boundaries
# (the three HEX_CHARS_* patterns below are not referenced elsewhere in the
# visible code)
HEX_CHARS_4orMORE = r'\b(?:' + HEX_CHAR + r'){4,}\b'
# at least 1 hex char:
HEX_CHARS_1orMORE = r'(?:' + HEX_CHAR + r')+'
# at least 1 hex char, followed by whitespace or CR/LF:
HEX_CHARS_1orMORE_WHITESPACES = r'(?:' + HEX_CHAR + r')+\s+'
# Earlier attempts, kept for reference:
# + word boundaries around hex block
# HEX_CHARS_1orMORE_WHITESPACES = r'\b(?:' + HEX_CHAR + r')+\b\s*'
# at least one block of hex and whitespace chars, followed by closing curly bracket:
# HEX_BLOCK_CURLY_BRACKET = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')+\}'
# PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE

# The pattern actually used to find a hex block: at least 4 hex chars, each
# optionally followed by ignored whitespaces/tags, delimited by word boundaries.
PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b'

# Older versions of PATTERN, kept for reference:
# at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s*
# PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}'
# improved pattern, allowing semicolons within hex:
#PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}'

# a dummy translation table for str.translate, which does not change anything
# (it is only passed so that the "deletechars" argument can be used):
TRANSTABLE_NOCHANGE = string.maketrans('', '')

# compiled versions of the patterns above:
re_hexblock = re.compile(PATTERN)
re_embedded_tags = re.compile(IGNORED)
# a run of decimal digits:
re_decimal = re.compile(r'\d+')

# a single delimiter character (same character class as DELIMITER below):
re_delimiter = re.compile(r'[ \t\r\n\f\v]')

# Building blocks for the "\bin" control word:
# optional delimiters, "\bin", a decimal number, then exactly one delimiter.
DELIMITER = r'[ \t\r\n\f\v]'
DELIMITERS_ZeroOrMore = r'[ \t\r\n\f\v]*'
BACKSLASH_BIN = r'\\bin'
# According to my tests, Word accepts up to 250 digits (leading zeroes)
# The parentheses make the number available as group 1 of the match.
DECIMAL_GROUP = r'(\d{1,250})'

re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + BACKSLASH_BIN
                                   + DECIMAL_GROUP + DELIMITER)
# a hex block immediately preceded by one delimiter character:
re_delim_hexblock = re.compile(DELIMITER + PATTERN)


#=== FUNCTIONS ===============================================================

def rtf_iter_objects_old(filename, min_size=32):
    """
    Legacy iterator over the hexadecimal-encoded objects of a RTF file.

    Open the RTF file, find each block matching PATTERN and decode it to
    binary. Unlike rtf_iter_objects, this version does not handle the "\\bin"
    control word.

    :param filename: path of the RTF file to read.
    :param min_size: only objects whose decoded size is strictly larger than
        this number of bytes are yielded.
    :return: iterator yielding tuples (start, orig_len, objdata) where start
        is the index of the hex block in the file, orig_len the length of the
        hex block as found in the file, and objdata the decoded binary data.
    """
    # read the whole file, and make sure it is closed right away:
    with open(filename, 'rb') as f:
        data = f.read()
    for m in re_hexblock.finditer(data):
        found = m.group(0)
        orig_len = len(found)
        # remove all whitespace and line feeds:
        #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
        found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
        # PATTERN also accepts RTF tags embedded within the hex digits: they
        # must be removed too, otherwise unhexlify fails on the curly brackets.
        found = re_embedded_tags.sub('', found)
        # unhexlify requires an even number of digits (same safety net as in
        # rtf_iter_objects):
        if len(found) & 1:
            found = found[:-1]
        found = binascii.unhexlify(found)
        if len(found) > min_size:
            yield m.start(), orig_len, found

# TODO: backward-compatible API?


def search_hex_block(data, pos=0, min_size=32, first=True):
    """
    Look for a hex block in data, starting at index pos.

    :param data: content of the RTF file.
    :param pos: index in data where the search starts.
    :param min_size: currently unused, kept so that the signature stays
        unchanged for callers.
    :param first: if True, search the first occurrence of a hex block anywhere
        from pos onwards; if False, only accept a hex block starting exactly
        at pos.
    :return: the regex match object, or None if no hex block was found.
        (The match used to be computed and then dropped, so the function
        always returned None.)
    """
    if first:
        # Search 1st occurrence of a hex block:
        return re_hexblock.search(data, pos=pos)
    # Match next occurrences of a hex block, from the current position only:
    return re_hexblock.match(data, pos=pos)


def _hex_block_to_bin(hex_block):
    """
    Decode a hex block matched by PATTERN (or DELIMITER + PATTERN) to binary.

    Whitespaces and embedded RTF tags are removed first, and a trailing odd
    hex digit is dropped so that unhexlify always gets complete bytes.
    """
    # remove all whitespace and line feeds:
    #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
    cleaned = hex_block.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
    # Also remove embedded RTF tags (replacement first, then the string):
    cleaned = re_embedded_tags.sub('', cleaned)
    # MS Word accepts an extra hex digit, so we need to trim it if present:
    if len(cleaned) & 1:
        cleaned = cleaned[:-1]
    log.debug('Cleaned match: %s', cleaned)
    return binascii.unhexlify(cleaned)


def rtf_iter_objects(data, min_size=32):
    """
    Iterate over the objects embedded in hexadecimal format in RTF data.

    Each object starts with a hex block matching PATTERN. It may be followed
    by any number of "\\bin<N>" control words, each introducing N raw bytes,
    themselves optionally followed by another hex block: all these parts are
    concatenated into a single object.

    :param data: content of the RTF file, already read in memory.
    :param min_size: only objects whose decoded size is strictly larger than
        this number of bytes are yielded.
    :return: iterator yielding tuples (start, length, objdata) where start is
        the index of the object in data, length the number of characters it
        spans in data, and objdata the decoded binary data.
    """
    # Search 1st occurrence of a hex block:
    match = re_hexblock.search(data)
    while match is not None:
        found = match.group(0)
        # start index
        start = match.start()
        # current position
        current = match.end()
        # cheap pre-filter on the length of the raw hex text, before decoding:
        if len(found) < min_size:
            match = re_hexblock.search(data, pos=current)
            continue
        log.debug('Found hex block starting at %08X, end %08X', start, current)
        log.debug('Match: %s', found)
        # object data extracted from the RTF file
        objdata = _hex_block_to_bin(found)
        # Detect the "\bin" control word, which is sometimes used for obfuscation:
        bin_match = re_delims_bin_decimal.match(data, pos=current)
        while bin_match is not None:
            log.debug('Found \\bin block starting at %08X : %r',
                      bin_match.start(), bin_match.group(0))
            # extract the decimal integer following '\bin'
            bin_len = int(bin_match.group(1))
            log.debug('\\bin block length = %d', bin_len)
            # The raw bytes start right AFTER the control word and its
            # delimiter, not at the control word itself:
            bin_start = bin_match.end()
            if bin_start + bin_len > len(data):
                log.error('\\bin block length is larger than the remaining data')
                # move the current index, ignore the \bin block
                current = bin_start
                break
            # read that number of bytes:
            objdata += data[bin_start:bin_start+bin_len]
            current = bin_start + bin_len
            # TODO: is Word limiting the \bin length to a number of digits?
            log.debug('Current position = %08X', current)
            # the binary data may be followed by a delimiter and more hex data:
            next_match = re_delim_hexblock.match(data, pos=current)
            if next_match is not None:
                log.debug('Found next hex block starting at %08X, end %08X',
                          next_match.start(), next_match.end())
                log.debug('Match: %s', next_match.group(0))
                objdata += _hex_block_to_bin(next_match.group(0))
                current = next_match.end()
            # ... and then by yet another \bin control word:
            bin_match = re_delims_bin_decimal.match(data, pos=current)

        if len(objdata) > min_size:
            yield start, current-start, objdata
        # Search next occurrence of a hex block:
        match = re_hexblock.search(data, pos=current)


def _sanitize_filename(name, replacement='_'):
    """
    Turn a file name found inside an untrusted document into a harmless one.

    Only the last path component is kept (both "/" and "\\" are treated as
    separators), and every character which is not a letter, a digit, an
    underscore, a dot, a dash or a space is replaced.
    """
    basename = name.replace('\\', '/').split('/')[-1]
    return re.sub(r'[^\w.\- ]', replacement, basename)


def _write_file(fname, data):
    """Write data to the file fname in binary mode, and close it."""
    with open(fname, 'wb') as f:
        f.write(data)


def process_file(container, filename, data):
    """
    Extract all the objects embedded in one RTF file and save them to disk.

    Each object found by rtf_iter_objects is saved as
    "<filename>_object_<index>.raw". If it can be parsed as an OLE 1.0 object,
    its payload is also saved with an extension derived from its class name,
    and if it is an OLE Package, the file it contains is saved as well.
    Progress is reported with print.

    :param container: not used by this function (it is the first item
        yielded by xglob.iter_files in the main program).
    :param filename: name of the RTF file, also used as prefix for all the
        output files.
    :param data: content of the file, or None to read it from filename.
    """
    # TODO: option to extract objects to files (false by default)
    if data is None:
        with open(filename, 'rb') as f:
            data = f.read()
    print('-'*79)
    print('File: %r - %d bytes' % (filename, len(data)))
    for index, orig_len, objdata in rtf_iter_objects(data):
        print('found object size %d at index %08X - end %08X' % (len(objdata), index, index+orig_len))
        fname = '%s_object_%08X.raw' % (filename, index)
        print('saving object to file %s' % fname)
        _write_file(fname, objdata)
        # TODO: check if all hex data is extracted properly

        obj = OleObject()
        # Best effort: a hex block that is not a valid OLE object must not stop
        # the processing of the following ones.
        try:
            obj.parse(objdata)
            print('extract file embedded in OLE object:')
            print('format_id  = %d' % obj.format_id)
            print('class name = %r' % obj.class_name)
            print('data size  = %d' % obj.data_size)
            # set a file extension according to the class name:
            class_name = obj.class_name.lower()
            if class_name.startswith('word'):
                ext = 'doc'
            elif class_name.startswith('package'):
                ext = 'package'
            else:
                ext = 'bin'
            fname = '%s_object_%08X.%s' % (filename, index, ext)
            print('saving to file %s' % fname)
            _write_file(fname, obj.data)
            if class_name == 'package':
                print('Parsing OLE Package')
                opkg = OleNativeStream(bindata=obj.data)
                print('Filename = %r' % opkg.filename)
                print('Source path = %r' % opkg.src_path)
                print('Temp path = %r' % opkg.temp_path)
                # The file name comes from the analysed (possibly malicious)
                # document: never use it as-is to build a path, it could
                # contain directory separators or other unsafe characters.
                safe_name = ''
                if opkg.filename:
                    safe_name = _sanitize_filename(opkg.filename)
                if safe_name:
                    fname = '%s_%s' % (filename, safe_name)
                else:
                    fname = '%s_object_%08X.noname' % (filename, index)
                print('saving to file %s' % fname)
                _write_file(fname, opkg.data)
        except Exception:
            # Exception (not a bare except) so that Ctrl+C and sys.exit still
            # work; the reason is only visible at debug level to keep the
            # normal output unchanged.
            log.debug('*** Not an OLE 1.0 Object', exc_info=True)


#=== MAIN =================================================================

if __name__ == '__main__':
    # Command-line entry point: parse the options, set up logging to the
    # console, then process every file matching the arguments.

    # print banner with version
    print ('rtfobj %s - http://decalage.info/python/oletools' % __version__)
    print ('THIS IS WORK IN PROGRESS - Check updates regularly!')
    print ('Please report any issue at https://bitbucket.org/decalage/oletools/issues')
    print ('')

    DEFAULT_LOG_LEVEL = "warning" # Default log level
    # names accepted by the --loglevel option, and the matching logging levels:
    LOG_LEVELS = {'debug':    logging.DEBUG,
              'info':     logging.INFO,
              'warning':  logging.WARNING,
              'error':    logging.ERROR,
              'critical': logging.CRITICAL
             }

    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)
    # parser.add_option('-o', '--outfile', dest='outfile',
    #     help='output file')
    # parser.add_option('-c', '--csv', dest='csv',
    #     help='export results to a CSV file')
    parser.add_option("-r", action="store_true", dest="recursive",
        help='find files recursively in subdirectories.')
    parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
        help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)')
    parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
        help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
    parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
                            help="logging level debug/info/warning/error/critical (default=%default)")

    (options, args) = parser.parse_args()

    # Print help if no arguments are passed
    if not args:
        # single-argument print(...) behaves the same as the print statement
        print (__doc__)
        parser.print_help()
        sys.exit()

    # Reject an unknown log level with a proper usage error (exit status 2)
    # instead of a KeyError traceback:
    if options.loglevel not in LOG_LEVELS:
        parser.error('invalid log level %r (choose from %s)'
                     % (options.loglevel,
                        ', '.join(sorted(LOG_LEVELS, key=LOG_LEVELS.get))))

    # Setup logging to the console:
    # here we use stdout instead of stderr by default, so that the output
    # can be redirected properly.
    logging.basicConfig(level=LOG_LEVELS[options.loglevel], stream=sys.stdout,
                        format='%(levelname)-8s %(message)s')
    # enable logging in the modules:
    # (NOTSET makes these loggers defer to the level configured just above)
    log.setLevel(logging.NOTSET)
    oleobj.log.setLevel(logging.NOTSET)


    for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
        zip_password=options.zip_password, zip_fname=options.zip_fname):
        # ignore directory names stored in zip files:
        if container and filename.endswith('/'):
            continue
        process_file(container, filename, data)