Commit 5268bc721ad2ff42905f39d13e4ba4cf0bae414c

Authored by Philippe Lagadec
1 parent 2363247a

rtfobj: improved parsing to handle some malware tricks

Showing 1 changed file with 36 additions and 6 deletions
oletools/rtfobj.py
... ... @@ -47,8 +47,9 @@ http://www.decalage.info/python/oletools
47 47 # - extract OLE 1.0 objects
48 48 # - extract files from OLE Package objects
49 49 # 2016-04-01 v0.04 PL: - fixed logging output to use stdout instead of stderr
  50 +# 2016-04-07 v0.45 PL: - improved parsing to handle some malware tricks
50 51  
51   -__version__ = '0.04'
  52 +__version__ = '0.45'
52 53  
53 54 #------------------------------------------------------------------------------
54 55 # TODO:
... ... @@ -110,9 +111,24 @@ log = get_logger('rtfobj')
110 111 #=== CONSTANTS=================================================================
111 112  
112 113 # REGEX pattern to extract embedded OLE objects in hexadecimal format:
  114 +
113 115 # hex digit: [0-9A-Fa-f]
  116 +HEX_DIGIT = r'[0-9A-Fa-f]'
  117 +
114 118 # hex char = two hex digits: [0-9A-Fa-f]{2}
115   -HEX_CHAR = r'[0-9A-Fa-f]{2}'
  119 +# HEX_CHAR = r'[0-9A-Fa-f]{2}'
  120 +# In fact, MS Word allows whitespace between the hex digits!
  121 +# HEX_CHAR = r'[0-9A-Fa-f]\s*[0-9A-Fa-f]'
  122 +# Even worse, MS Word also allows ANY RTF-style tag {*} in between!!
  123 +# AND the tags can be nested...
  124 +SINGLE_RTF_TAG = r'[{][^{}]*[}]'
  125 +# Nested tags, limited to two levels (because Python's re module does not support recursive patterns):
  126 +NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]'
  127 +# ignored whitespaces and tags within a hex block:
  128 +IGNORED = r'(?:\s|'+NESTED_RTF_TAG+r')*'
  129 +
  130 +HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT
  131 +
116 132 # several hex chars, at least 4: (?:[0-9A-Fa-f]{2}){4,}
117 133 # + word boundaries
118 134 HEX_CHARS_4orMORE = r'\b(?:' + HEX_CHAR + r'){4,}\b'
... ... @@ -124,7 +140,9 @@ HEX_CHARS_1orMORE_WHITESPACES = r'(?:' + HEX_CHAR + r')+\s+'
124 140 # HEX_CHARS_1orMORE_WHITESPACES = r'\b(?:' + HEX_CHAR + r')+\b\s*'
125 141 # at least one block of hex and whitespace chars, followed by closing curly bracket:
126 142 # HEX_BLOCK_CURLY_BRACKET = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')+\}'
127   -PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE
  143 +# PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE
  144 +
  145 +PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b'
128 146  
129 147 # at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s*
130 148 # PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}'
... ... @@ -135,18 +153,19 @@ PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE
135 153 TRANSTABLE_NOCHANGE = string.maketrans('', '')
136 154  
137 155 re_hexblock = re.compile(PATTERN)
  156 +re_embedded_tags = re.compile(IGNORED)
138 157 re_decimal = re.compile(r'\d+')
139 158  
140 159 re_delimiter = re.compile(r'[ \t\r\n\f\v]')
141 160  
142 161 DELIMITER = r'[ \t\r\n\f\v]'
143 162 DELIMITERS_ZeroOrMore = r'[ \t\r\n\f\v]*'
144   -ANTISLASH_BIN = r'\bin'
  163 +BACKSLASH_BIN = r'\bin'
145 164 # According to my tests, Word accepts up to 250 digits (leading zeroes)
146 165 DECIMAL_GROUP = r'(\d{1,250})'
147 166  
148   -re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + ANTISLASH_BIN
149   - + DECIMAL_GROUP + DELIMITER)
  167 +re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + BACKSLASH_BIN
  168 + + DECIMAL_GROUP + DELIMITER)
150 169 re_delim_hexblock = re.compile(DELIMITER + PATTERN)
151 170  
152 171  
... ... @@ -206,10 +225,18 @@ def rtf_iter_objects (data, min_size=32):
206 225 match = re_hexblock.search(data, pos=current)
207 226 continue
208 227 log.debug('Found hex block starting at %08X, end %08X' % (start, current))
  228 + log.debug('Match: %s' % found)
209 229 # remove all whitespace and line feeds:
210 230 #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
211 231 found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
  232 + # TODO: make it a function
  233 + # Also remove embedded RTF tags:
  234 + found = re_embedded_tags.sub('', found)
212 235 # object data extracted from the RTF file
  236 + # MS Word accepts an extra hex digit, so we need to trim it if present:
  237 + if len(found) & 1:
  238 + found = found[:-1]
  239 + log.debug('Cleaned match: %s' % found)
213 240 objdata = binascii.unhexlify(found)
214 241 # Detect the "\bin" control word, which is sometimes used for obfuscation:
215 242 bin_match = re_delims_bin_decimal.match(data, pos=current)
... ... @@ -236,9 +263,12 @@ def rtf_iter_objects (data, min_size=32):
236 263 log.debug('Found next hex block starting at %08X, end %08X'
237 264 % (match.start(), match.end()))
238 265 found = match.group(0)
  266 + log.debug('Match: %s' % found)
239 267 # remove all whitespace and line feeds:
240 268 #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
241 269 found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
  270 + # Also remove embedded RTF tags:
  271 + found = re_embedded_tags.sub(found, '')
242 272 objdata += binascii.unhexlify(found)
243 273 current = match.end()
244 274 bin_match = re_delims_bin_decimal.match(data, pos=current)
... ...