Commit 5268bc721ad2ff42905f39d13e4ba4cf0bae414c

Authored by Philippe Lagadec
1 parent 2363247a

rtfobj: improved parsing to handle some malware tricks

Showing 1 changed file with 36 additions and 6 deletions
oletools/rtfobj.py
@@ -47,8 +47,9 @@ http://www.decalage.info/python/oletools @@ -47,8 +47,9 @@ http://www.decalage.info/python/oletools
47 # - extract OLE 1.0 objects 47 # - extract OLE 1.0 objects
48 # - extract files from OLE Package objects 48 # - extract files from OLE Package objects
49 # 2016-04-01 v0.04 PL: - fixed logging output to use stdout instead of stderr 49 # 2016-04-01 v0.04 PL: - fixed logging output to use stdout instead of stderr
  50 +# 2016-04-07 v0.45 PL: - improved parsing to handle some malware tricks
50 51
51 -__version__ = '0.04' 52 +__version__ = '0.45'
52 53
53 #------------------------------------------------------------------------------ 54 #------------------------------------------------------------------------------
54 # TODO: 55 # TODO:
@@ -110,9 +111,24 @@ log = get_logger('rtfobj') @@ -110,9 +111,24 @@ log = get_logger('rtfobj')
110 #=== CONSTANTS================================================================= 111 #=== CONSTANTS=================================================================
111 112
112 # REGEX pattern to extract embedded OLE objects in hexadecimal format: 113 # REGEX pattern to extract embedded OLE objects in hexadecimal format:
  114 +
113 # hex digit: [0-9A-Fa-f] 115 # hex digit: [0-9A-Fa-f]
  116 +HEX_DIGIT = r'[0-9A-Fa-f]'
  117 +
114 # hex char = two hex digits: [0-9A-Fa-f]{2} 118 # hex char = two hex digits: [0-9A-Fa-f]{2}
115 -HEX_CHAR = r'[0-9A-Fa-f]{2}' 119 +# HEX_CHAR = r'[0-9A-Fa-f]{2}'
  120 +# in fact MS Word allows whitespace in between the hex digits!
  121 +# HEX_CHAR = r'[0-9A-Fa-f]\s*[0-9A-Fa-f]'
  122 +# Even worse, MS Word also allows ANY RTF-style tag {*} in between!!
  123 +# AND the tags can be nested...
  124 +SINGLE_RTF_TAG = r'[{][^{}]*[}]'
  125 +# Nested tags, two levels (because Python's re does not support nested matching):
  126 +NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]'
  127 +# ignored whitespaces and tags within a hex block:
  128 +IGNORED = r'(?:\s|'+NESTED_RTF_TAG+r')*'
  129 +
  130 +HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT
  131 +
116 # several hex chars, at least 4: (?:[0-9A-Fa-f]{2}){4,} 132 # several hex chars, at least 4: (?:[0-9A-Fa-f]{2}){4,}
117 # + word boundaries 133 # + word boundaries
118 HEX_CHARS_4orMORE = r'\b(?:' + HEX_CHAR + r'){4,}\b' 134 HEX_CHARS_4orMORE = r'\b(?:' + HEX_CHAR + r'){4,}\b'
@@ -124,7 +140,9 @@ HEX_CHARS_1orMORE_WHITESPACES = r'(?:' + HEX_CHAR + r')+\s+' @@ -124,7 +140,9 @@ HEX_CHARS_1orMORE_WHITESPACES = r'(?:' + HEX_CHAR + r')+\s+'
124 # HEX_CHARS_1orMORE_WHITESPACES = r'\b(?:' + HEX_CHAR + r')+\b\s*' 140 # HEX_CHARS_1orMORE_WHITESPACES = r'\b(?:' + HEX_CHAR + r')+\b\s*'
125 # at least one block of hex and whitespace chars, followed by closing curly bracket: 141 # at least one block of hex and whitespace chars, followed by closing curly bracket:
126 # HEX_BLOCK_CURLY_BRACKET = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')+\}' 142 # HEX_BLOCK_CURLY_BRACKET = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')+\}'
127 -PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE 143 +# PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE
  144 +
  145 +PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b'
128 146
129 # at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s* 147 # at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s*
130 # PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}' 148 # PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}'
@@ -135,18 +153,19 @@ PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE @@ -135,18 +153,19 @@ PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE
135 TRANSTABLE_NOCHANGE = string.maketrans('', '') 153 TRANSTABLE_NOCHANGE = string.maketrans('', '')
136 154
137 re_hexblock = re.compile(PATTERN) 155 re_hexblock = re.compile(PATTERN)
  156 +re_embedded_tags = re.compile(IGNORED)
138 re_decimal = re.compile(r'\d+') 157 re_decimal = re.compile(r'\d+')
139 158
140 re_delimiter = re.compile(r'[ \t\r\n\f\v]') 159 re_delimiter = re.compile(r'[ \t\r\n\f\v]')
141 160
142 DELIMITER = r'[ \t\r\n\f\v]' 161 DELIMITER = r'[ \t\r\n\f\v]'
143 DELIMITERS_ZeroOrMore = r'[ \t\r\n\f\v]*' 162 DELIMITERS_ZeroOrMore = r'[ \t\r\n\f\v]*'
144 -ANTISLASH_BIN = r'\bin' 163 +BACKSLASH_BIN = r'\bin'
145 # According to my tests, Word accepts up to 250 digits (leading zeroes) 164 # According to my tests, Word accepts up to 250 digits (leading zeroes)
146 DECIMAL_GROUP = r'(\d{1,250})' 165 DECIMAL_GROUP = r'(\d{1,250})'
147 166
148 -re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + ANTISLASH_BIN  
149 - + DECIMAL_GROUP + DELIMITER) 167 +re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + BACKSLASH_BIN
  168 + + DECIMAL_GROUP + DELIMITER)
150 re_delim_hexblock = re.compile(DELIMITER + PATTERN) 169 re_delim_hexblock = re.compile(DELIMITER + PATTERN)
151 170
152 171
@@ -206,10 +225,18 @@ def rtf_iter_objects (data, min_size=32): @@ -206,10 +225,18 @@ def rtf_iter_objects (data, min_size=32):
206 match = re_hexblock.search(data, pos=current) 225 match = re_hexblock.search(data, pos=current)
207 continue 226 continue
208 log.debug('Found hex block starting at %08X, end %08X' % (start, current)) 227 log.debug('Found hex block starting at %08X, end %08X' % (start, current))
  228 + log.debug('Match: %s' % found)
209 # remove all whitespace and line feeds: 229 # remove all whitespace and line feeds:
210 #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE 230 #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
211 found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') 231 found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
  232 + # TODO: make it a function
  233 + # Also remove embedded RTF tags:
  234 + found = re_embedded_tags.sub('', found)
212 # object data extracted from the RTF file 235 # object data extracted from the RTF file
  236 + # MS Word accepts an extra hex digit, so we need to trim it if present:
  237 + if len(found) & 1:
  238 + found = found[:-1]
  239 + log.debug('Cleaned match: %s' % found)
213 objdata = binascii.unhexlify(found) 240 objdata = binascii.unhexlify(found)
214 # Detect the "\bin" control word, which is sometimes used for obfuscation: 241 # Detect the "\bin" control word, which is sometimes used for obfuscation:
215 bin_match = re_delims_bin_decimal.match(data, pos=current) 242 bin_match = re_delims_bin_decimal.match(data, pos=current)
@@ -236,9 +263,12 @@ def rtf_iter_objects (data, min_size=32): @@ -236,9 +263,12 @@ def rtf_iter_objects (data, min_size=32):
236 log.debug('Found next hex block starting at %08X, end %08X' 263 log.debug('Found next hex block starting at %08X, end %08X'
237 % (match.start(), match.end())) 264 % (match.start(), match.end()))
238 found = match.group(0) 265 found = match.group(0)
  266 + log.debug('Match: %s' % found)
239 # remove all whitespace and line feeds: 267 # remove all whitespace and line feeds:
240 #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE 268 #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
241 found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') 269 found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
  270 + # Also remove embedded RTF tags:
  271 + found = re_embedded_tags.sub(found, '')
242 objdata += binascii.unhexlify(found) 272 objdata += binascii.unhexlify(found)
243 current = match.end() 273 current = match.end()
244 bin_match = re_delims_bin_decimal.match(data, pos=current) 274 bin_match = re_delims_bin_decimal.match(data, pos=current)