Commit 94fe13c76a3ccd4d274d254c2804b399d4393a9f

Authored by decalage2
1 parent d5f0e1a9

rtfobj: fixed RtfParser to handle issue #152 (control word with long parameter)

oletools/oleobj.py
... ... @@ -290,6 +290,9 @@ class OleObject (object):
290 290 :param data: bytes, OLE 1.0 Object structure containing an OLE object
291 291 :return:
292 292 """
  293 + # from ezhexviewer import hexdump3
  294 + # print("Parsing OLE object data:")
  295 + # print(hexdump3(data, length=16))
293 296 # Header: see MS-OLEDS 2.2.4 ObjectHeader
294 297 self.ole_version, data = read_uint32(data)
295 298 self.format_id, data = read_uint32(data)
... ...
oletools/rtfobj.py
... ... @@ -17,7 +17,7 @@ http://www.decalage.info/python/oletools
17 17  
18 18 #=== LICENSE =================================================================
19 19  
20   -# rtfobj is copyright (c) 2012-2016, Philippe Lagadec (http://www.decalage.info)
  20 +# rtfobj is copyright (c) 2012-2017, Philippe Lagadec (http://www.decalage.info)
21 21 # All rights reserved.
22 22 #
23 23 # Redistribution and use in source and binary forms, with or without modification,
... ... @@ -67,8 +67,10 @@ http://www.decalage.info/python/oletools
67 67 # 2016-11-17 v0.51 PL: - updated call to oleobj.OleNativeStream
68 68 # 2017-03-12 PL: - fixed imports for Python 2+3
69 69 # - fixed hex decoding bug in RtfObjParser (issue #103)
  70 +# 2017-03-29 PL: - fixed RtfParser to handle issue #152 (control word with
  71 +# long parameter)
70 72  
71   -__version__ = '0.51dev2'
  73 +__version__ = '0.51dev4'
72 74  
73 75 # ------------------------------------------------------------------------------
74 76 # TODO:
... ... @@ -186,8 +188,8 @@ ASCII_NAME = b'([a-zA-Z]{1,250})'
186 188 # SIGNED_INTEGER = r'(-?\d{1,250})'
187 189 SIGNED_INTEGER = b'(-?\\d+)'
188 190  
189   -# Note for issue #78: need to match "\A-" not followed by digits
190   -CONTROL_WORD = b'(?:\\\\' + ASCII_NAME + b'(?:' + SIGNED_INTEGER + b'(?=[^0-9])|(?=[^a-zA-Z0-9])))'
  191 + # Note for issue #78: need to match "\A-" not followed by digits; a control word ending exactly at the end of the string must match too
  192 +CONTROL_WORD = b'(?:\\\\' + ASCII_NAME + b'(?:' + SIGNED_INTEGER + b'(?=[^0-9])|(?=[^a-zA-Z0-9])|$))'
191 193  
192 194 re_control_word = re.compile(CONTROL_WORD)
193 195  
... ... @@ -323,10 +325,16 @@ class Destination(object):
323 325  
324 326 class RtfParser(object):
325 327 """
326   - Very simple generic RTF parser
  328 + Very simple but robust generic RTF parser, designed to handle
  329 + malformed malicious RTF as MS Word does
327 330 """
328 331  
329 332 def __init__(self, data):
  333 + """
  334 + RtfParser constructor.
  335 +
  336 + :param data: bytes object containing the RTF data to be parsed
  337 + """
330 338 self.data = data
331 339 self.index = 0
332 340 self.size = len(data)
... ... @@ -337,18 +345,38 @@ class RtfParser(object):
337 345 self.current_destination = document_destination
338 346  
339 347 def parse(self):
  348 + """
  349 + Parse the RTF data
  350 +
  351 + :return: nothing
  352 + """
  353 + # Start at beginning of data
340 354 self.index = 0
  355 + # Loop until the end
341 356 while self.index < self.size:
342 357 if self.data[self.index] == BRACE_OPEN:
  358 + # Found an opening brace "{": Start of a group
343 359 self._open_group()
344 360 self.index += 1
345 361 continue
346 362 if self.data[self.index] == BRACE_CLOSE:
  363 + # Found a closing brace "}": End of a group
347 364 self._close_group()
348 365 self.index += 1
349 366 continue
350 367 if self.data[self.index] == BACKSLASH:
351   - m = re_control_word.match(self.data, self.index)
  368 + # Found a backslash "\": Start of a control word or control symbol
  369 + # Use a regex to extract the control word name if present:
  370 + # NOTE: MS Word limits the full length of a control word plus its optional
  371 + # integer parameter to 253 characters, so we have to run the regex
  372 + # on a cropped string:
  373 + data_cropped = self.data[self.index:]
  374 + if len(data_cropped)>253:
  375 + data_cropped = data_cropped[:254]
  376 + # append a space so that the regex can check the following character:
  377 + data_cropped += b' '
  378 + # m = re_control_word.match(self.data, self.index, self.index+253)
  379 + m = re_control_word.match(data_cropped)
352 380 if m:
353 381 cword = m.group(1)
354 382 param = None
... ... @@ -361,11 +389,14 @@ class RtfParser(object):
361 389 if cword == b'bin':
362 390 self._bin(m, param)
363 391 continue
  392 + # Otherwise, it may be a control symbol:
364 393 m = re_control_symbol.match(self.data, self.index)
365 394 if m:
366 395 self.control_symbol(m)
367 396 self.index += len(m.group())
368 397 continue
  398 + # Otherwise, this is plain text:
  399 + # Use a regex to match all characters until the next brace or backslash:
369 400 m = re_text.match(self.data, self.index)
370 401 if m:
371 402 self._text(m)
... ...
setup.py
... ... @@ -41,7 +41,7 @@ import os, fnmatch
41 41 #--- METADATA -----------------------------------------------------------------
42 42  
43 43 name = "oletools"
44   -version = '0.51dev3'
  44 +version = '0.51dev4'
45 45 desc = "Python tools to analyze security characteristics of MS Office and OLE files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), for Malware Analysis and Incident Response #DFIR"
46 46 long_desc = open('oletools/README.rst').read()
47 47 author = "Philippe Lagadec"
... ...