From 5268bc721ad2ff42905f39d13e4ba4cf0bae414c Mon Sep 17 00:00:00 2001 From: Philippe Lagadec Date: Tue, 12 Apr 2016 21:17:22 +0200 Subject: [PATCH] rtfobj: improved parsing to handle some malware tricks --- oletools/rtfobj.py | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/oletools/rtfobj.py b/oletools/rtfobj.py index 8855ea2..4208948 100755 --- a/oletools/rtfobj.py +++ b/oletools/rtfobj.py @@ -47,8 +47,9 @@ http://www.decalage.info/python/oletools # - extract OLE 1.0 objects # - extract files from OLE Package objects # 2016-04-01 v0.04 PL: - fixed logging output to use stdout instead of stderr +# 2016-04-07 v0.45 PL: - improved parsing to handle some malware tricks -__version__ = '0.04' +__version__ = '0.45' #------------------------------------------------------------------------------ # TODO: @@ -110,9 +111,24 @@ log = get_logger('rtfobj') #=== CONSTANTS================================================================= # REGEX pattern to extract embedded OLE objects in hexadecimal format: + # alphanum digit: [0-9A-Fa-f] +HEX_DIGIT = r'[0-9A-Fa-f]' + # hex char = two alphanum digits: [0-9A-Fa-f]{2} -HEX_CHAR = r'[0-9A-Fa-f]{2}' +# HEX_CHAR = r'[0-9A-Fa-f]{2}' +# in fact MS Word allows whitespaces in between the hex digits! +# HEX_CHAR = r'[0-9A-Fa-f]\s*[0-9A-Fa-f]' +# Even worse, MS Word also allows ANY RTF-style tag {*} in between!! +# AND the tags can be nested... 
+SINGLE_RTF_TAG = r'[{][^{}]*[}]' +# Nested tags, two levels (because Python's re does not support nested matching): +NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]' +# ignored whitespaces and tags within a hex block: +IGNORED = r'(?:\s|'+NESTED_RTF_TAG+r')*' + +HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT + # several hex chars, at least 4: (?:[0-9A-Fa-f]{2}){4,} # + word boundaries HEX_CHARS_4orMORE = r'\b(?:' + HEX_CHAR + r'){4,}\b' @@ -124,7 +140,9 @@ HEX_CHARS_1orMORE_WHITESPACES = r'(?:' + HEX_CHAR + r')+\s+' # HEX_CHARS_1orMORE_WHITESPACES = r'\b(?:' + HEX_CHAR + r')+\b\s*' # at least one block of hex and whitespace chars, followed by closing curly bracket: # HEX_BLOCK_CURLY_BRACKET = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')+\}' -PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE +# PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE + +PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b' # at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s* # PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}' @@ -135,18 +153,19 @@ PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE TRANSTABLE_NOCHANGE = string.maketrans('', '') re_hexblock = re.compile(PATTERN) +re_embedded_tags = re.compile(IGNORED) re_decimal = re.compile(r'\d+') re_delimiter = re.compile(r'[ \t\r\n\f\v]') DELIMITER = r'[ \t\r\n\f\v]' DELIMITERS_ZeroOrMore = r'[ \t\r\n\f\v]*' -ANTISLASH_BIN = r'\\bin' +BACKSLASH_BIN = r'\\bin' # According to my tests, Word accepts up to 250 digits (leading zeroes) DECIMAL_GROUP = r'(\d{1,250})' -re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + ANTISLASH_BIN - + DECIMAL_GROUP + DELIMITER) +re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + BACKSLASH_BIN + + DECIMAL_GROUP + DELIMITER) re_delim_hexblock = re.compile(DELIMITER + PATTERN) @@ -206,10 +225,18 @@ def rtf_iter_objects (data, min_size=32): match = re_hexblock.search(data, 
pos=current) continue log.debug('Found hex block starting at %08X, end %08X' % (start, current)) + log.debug('Match: %s' % found) # remove all whitespace and line feeds: #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') + # TODO: make it a function + # Also remove embedded RTF tags: + found = re_embedded_tags.sub('', found) # object data extracted from the RTF file + # MS Word accepts an extra hex digit, so we need to trim it if present: + if len(found) & 1: + found = found[:-1] + log.debug('Cleaned match: %s' % found) objdata = binascii.unhexlify(found) # Detect the "\bin" control word, which is sometimes used for obfuscation: bin_match = re_delims_bin_decimal.match(data, pos=current) @@ -236,9 +263,12 @@ def rtf_iter_objects (data, min_size=32): log.debug('Found next hex block starting at %08X, end %08X' % (match.start(), match.end())) found = match.group(0) + log.debug('Match: %s' % found) # remove all whitespace and line feeds: #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') + # Also remove embedded RTF tags: + found = re_embedded_tags.sub('', found) objdata += binascii.unhexlify(found) current = match.end() bin_match = re_delims_bin_decimal.match(data, pos=current) -- libgit2 0.21.4