Commit 23ffafb33dc68e43b281045300ff219120f82ff9

Authored by decalage2
1 parent 5f8e0b88

rtfobj: improved parsing, fixed issue #42

Showing 1 changed file with 16 additions and 9 deletions
oletools/rtfobj.py
@@ -51,6 +51,7 @@ http://www.decalage.info/python/oletools @@ -51,6 +51,7 @@ http://www.decalage.info/python/oletools
51 # 2016-05-06 v0.47 TJ: - added option -d to set the output directory 51 # 2016-05-06 v0.47 TJ: - added option -d to set the output directory
52 # (contribution by Thomas Jarosch) 52 # (contribution by Thomas Jarosch)
53 # TJ: - sanitize filenames to avoid special characters 53 # TJ: - sanitize filenames to avoid special characters
  54 +# 2016-05-29 PL: - improved parsing, fixed issue #42
54 55
55 __version__ = '0.47' 56 __version__ = '0.47'
56 57
@@ -129,23 +130,26 @@ SINGLE_RTF_TAG = r'[{][^{}]*[}]' @@ -129,23 +130,26 @@ SINGLE_RTF_TAG = r'[{][^{}]*[}]'
129 NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]' 130 NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]'
130 # ignored whitespaces and tags within a hex block: 131 # ignored whitespaces and tags within a hex block:
131 IGNORED = r'(?:\s|'+NESTED_RTF_TAG+r')*' 132 IGNORED = r'(?:\s|'+NESTED_RTF_TAG+r')*'
  133 +#IGNORED = r'\s*'
132 134
133 -HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT 135 +# HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT
134 136
135 # several hex chars, at least 4: (?:[0-9A-Fa-f]{2}){4,} 137 # several hex chars, at least 4: (?:[0-9A-Fa-f]{2}){4,}
136 # + word boundaries 138 # + word boundaries
137 -HEX_CHARS_4orMORE = r'\b(?:' + HEX_CHAR + r'){4,}\b' 139 +# HEX_CHARS_4orMORE = r'\b(?:' + HEX_CHAR + r'){4,}\b'
138 # at least 1 hex char: 140 # at least 1 hex char:
139 -HEX_CHARS_1orMORE = r'(?:' + HEX_CHAR + r')+' 141 +# HEX_CHARS_1orMORE = r'(?:' + HEX_CHAR + r')+'
140 # at least 1 hex char, followed by whitespace or CR/LF: 142 # at least 1 hex char, followed by whitespace or CR/LF:
141 -HEX_CHARS_1orMORE_WHITESPACES = r'(?:' + HEX_CHAR + r')+\s+' 143 +# HEX_CHARS_1orMORE_WHITESPACES = r'(?:' + HEX_CHAR + r')+\s+'
142 # + word boundaries around hex block 144 # + word boundaries around hex block
143 # HEX_CHARS_1orMORE_WHITESPACES = r'\b(?:' + HEX_CHAR + r')+\b\s*' 145 # HEX_CHARS_1orMORE_WHITESPACES = r'\b(?:' + HEX_CHAR + r')+\b\s*'
144 # at least one block of hex and whitespace chars, followed by closing curly bracket: 146 # at least one block of hex and whitespace chars, followed by closing curly bracket:
145 # HEX_BLOCK_CURLY_BRACKET = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')+\}' 147 # HEX_BLOCK_CURLY_BRACKET = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')+\}'
146 # PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE 148 # PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE
147 149
148 -PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b' 150 +#TODO PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b'
  151 +# PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}' #+ HEX_CHAR + r'\b'
  152 +PATTERN = r'\b(?:' + HEX_DIGIT + IGNORED + r'){7,}' + HEX_DIGIT + r'\b'
149 153
150 # at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s* 154 # at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s*
151 # PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}' 155 # PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}'
@@ -216,6 +220,7 @@ def rtf_iter_objects (data, min_size=32): @@ -216,6 +220,7 @@ def rtf_iter_objects (data, min_size=32):
216 # Search 1st occurrence of a hex block: 220 # Search 1st occurrence of a hex block:
217 match = re_hexblock.search(data) 221 match = re_hexblock.search(data)
218 if match is None: 222 if match is None:
  223 + log.debug('No hex block found.')
219 # no hex block found 224 # no hex block found
220 return 225 return
221 while match is not None: 226 while match is not None:
@@ -224,11 +229,12 @@ def rtf_iter_objects (data, min_size=32): @@ -224,11 +229,12 @@ def rtf_iter_objects (data, min_size=32):
224 start = match.start() 229 start = match.start()
225 # current position 230 # current position
226 current = match.end() 231 current = match.end()
  232 + log.debug('Found hex block starting at %08X, end %08X, size=%d' % (start, current, len(found)))
227 if len(found) < min_size: 233 if len(found) < min_size:
  234 + log.debug('Too small - size<%d, ignored.' % min_size)
228 match = re_hexblock.search(data, pos=current) 235 match = re_hexblock.search(data, pos=current)
229 continue 236 continue
230 - log.debug('Found hex block starting at %08X, end %08X' % (start, current))  
231 - log.debug('Match: %s' % found) 237 + #log.debug('Match: %s' % found)
232 # remove all whitespace and line feeds: 238 # remove all whitespace and line feeds:
233 #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE 239 #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
234 found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') 240 found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
@@ -238,8 +244,9 @@ def rtf_iter_objects (data, min_size=32): @@ -238,8 +244,9 @@ def rtf_iter_objects (data, min_size=32):
238 # object data extracted from the RTF file 244 # object data extracted from the RTF file
239 # MS Word accepts an extra hex digit, so we need to trim it if present: 245 # MS Word accepts an extra hex digit, so we need to trim it if present:
240 if len(found) & 1: 246 if len(found) & 1:
  247 + log.debug('Odd length, trimmed last byte.')
241 found = found[:-1] 248 found = found[:-1]
242 - log.debug('Cleaned match: %s' % found) 249 + #log.debug('Cleaned match: %s' % found)
243 objdata = binascii.unhexlify(found) 250 objdata = binascii.unhexlify(found)
244 # Detect the "\bin" control word, which is sometimes used for obfuscation: 251 # Detect the "\bin" control word, which is sometimes used for obfuscation:
245 bin_match = re_delims_bin_decimal.match(data, pos=current) 252 bin_match = re_delims_bin_decimal.match(data, pos=current)
@@ -365,7 +372,7 @@ def process_file(container, filename, data, output_dir=None): @@ -365,7 +372,7 @@ def process_file(container, filename, data, output_dir=None):
365 open(fname, 'wb').write(opkg.data) 372 open(fname, 'wb').write(opkg.data)
366 except: 373 except:
367 pass 374 pass
368 - # log.exception('*** Not an OLE 1.0 Object') 375 + log.exception('*** Not an OLE 1.0 Object')
369 376
370 377
371 378