Commit 23ffafb33dc68e43b281045300ff219120f82ff9

Authored by decalage2
1 parent 5f8e0b88

rtfobj: improved parsing, fixed issue #42

Showing 1 changed file with 16 additions and 9 deletions
oletools/rtfobj.py
... ... @@ -51,6 +51,7 @@ http://www.decalage.info/python/oletools
51 51 # 2016-05-06 v0.47 TJ: - added option -d to set the output directory
52 52 # (contribution by Thomas Jarosch)
53 53 # TJ: - sanitize filenames to avoid special characters
  54 +# 2016-05-29 PL: - improved parsing, fixed issue #42
54 55  
55 56 __version__ = '0.47'
56 57  
... ... @@ -129,23 +130,26 @@ SINGLE_RTF_TAG = r'[{][^{}]*[}]'
129 130 NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]'
130 131 # ignored whitespaces and tags within a hex block:
131 132 IGNORED = r'(?:\s|'+NESTED_RTF_TAG+r')*'
  133 +#IGNORED = r'\s*'
132 134  
133   -HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT
  135 +# HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT
134 136  
135 137 # several hex chars, at least 4: (?:[0-9A-Fa-f]{2}){4,}
136 138 # + word boundaries
137   -HEX_CHARS_4orMORE = r'\b(?:' + HEX_CHAR + r'){4,}\b'
  139 +# HEX_CHARS_4orMORE = r'\b(?:' + HEX_CHAR + r'){4,}\b'
138 140 # at least 1 hex char:
139   -HEX_CHARS_1orMORE = r'(?:' + HEX_CHAR + r')+'
  141 +# HEX_CHARS_1orMORE = r'(?:' + HEX_CHAR + r')+'
140 142 # at least 1 hex char, followed by whitespace or CR/LF:
141   -HEX_CHARS_1orMORE_WHITESPACES = r'(?:' + HEX_CHAR + r')+\s+'
  143 +# HEX_CHARS_1orMORE_WHITESPACES = r'(?:' + HEX_CHAR + r')+\s+'
142 144 # + word boundaries around hex block
143 145 # HEX_CHARS_1orMORE_WHITESPACES = r'\b(?:' + HEX_CHAR + r')+\b\s*'
144 146 # at least one block of hex and whitespace chars, followed by closing curly bracket:
145 147 # HEX_BLOCK_CURLY_BRACKET = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')+\}'
146 148 # PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE
147 149  
148   -PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b'
  150 +#TODO PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b'
  151 +# PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}' #+ HEX_CHAR + r'\b'
  152 +PATTERN = r'\b(?:' + HEX_DIGIT + IGNORED + r'){7,}' + HEX_DIGIT + r'\b'
149 153  
150 154 # at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s*
151 155 # PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}'
... ... @@ -216,6 +220,7 @@ def rtf_iter_objects (data, min_size=32):
216 220 # Search 1st occurrence of a hex block:
217 221 match = re_hexblock.search(data)
218 222 if match is None:
  223 + log.debug('No hex block found.')
219 224 # no hex block found
220 225 return
221 226 while match is not None:
... ... @@ -224,11 +229,12 @@ def rtf_iter_objects (data, min_size=32):
224 229 start = match.start()
225 230 # current position
226 231 current = match.end()
  232 + log.debug('Found hex block starting at %08X, end %08X, size=%d' % (start, current, len(found)))
227 233 if len(found) < min_size:
  234 + log.debug('Too small - size<%d, ignored.' % min_size)
228 235 match = re_hexblock.search(data, pos=current)
229 236 continue
230   - log.debug('Found hex block starting at %08X, end %08X' % (start, current))
231   - log.debug('Match: %s' % found)
  237 + #log.debug('Match: %s' % found)
232 238 # remove all whitespace and line feeds:
233 239 #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
234 240 found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
... ... @@ -238,8 +244,9 @@ def rtf_iter_objects (data, min_size=32):
238 244 # object data extracted from the RTF file
239 245 # MS Word accepts an extra hex digit, so we need to trim it if present:
240 246 if len(found) & 1:
  247 + log.debug('Odd length, trimmed last byte.')
241 248 found = found[:-1]
242   - log.debug('Cleaned match: %s' % found)
  249 + #log.debug('Cleaned match: %s' % found)
243 250 objdata = binascii.unhexlify(found)
244 251 # Detect the "\bin" control word, which is sometimes used for obfuscation:
245 252 bin_match = re_delims_bin_decimal.match(data, pos=current)
... ... @@ -365,7 +372,7 @@ def process_file(container, filename, data, output_dir=None):
365 372 open(fname, 'wb').write(opkg.data)
366 373 except:
367 374 pass
368   - # log.exception('*** Not an OLE 1.0 Object')
  375 + log.exception('*** Not an OLE 1.0 Object')
369 376  
370 377  
371 378  
... ...