Commit 94fe13c76a3ccd4d274d254c2804b399d4393a9f

Authored by decalage2
1 parent d5f0e1a9

rtfobj: fixed RtfParser to handle issue #152 (control word with long parameter)

oletools/oleobj.py
@@ -290,6 +290,9 @@ class OleObject (object): @@ -290,6 +290,9 @@ class OleObject (object):
290 :param data: bytes, OLE 1.0 Object structure containing an OLE object 290 :param data: bytes, OLE 1.0 Object structure containing an OLE object
291 :return: 291 :return:
292 """ 292 """
  293 + # from ezhexviewer import hexdump3
  294 + # print("Parsing OLE object data:")
  295 + # print(hexdump3(data, length=16))
293 # Header: see MS-OLEDS 2.2.4 ObjectHeader 296 # Header: see MS-OLEDS 2.2.4 ObjectHeader
294 self.ole_version, data = read_uint32(data) 297 self.ole_version, data = read_uint32(data)
295 self.format_id, data = read_uint32(data) 298 self.format_id, data = read_uint32(data)
oletools/rtfobj.py
@@ -17,7 +17,7 @@ http://www.decalage.info/python/oletools @@ -17,7 +17,7 @@ http://www.decalage.info/python/oletools
17 17
18 #=== LICENSE ================================================================= 18 #=== LICENSE =================================================================
19 19
20 -# rtfobj is copyright (c) 2012-2016, Philippe Lagadec (http://www.decalage.info) 20 +# rtfobj is copyright (c) 2012-2017, Philippe Lagadec (http://www.decalage.info)
21 # All rights reserved. 21 # All rights reserved.
22 # 22 #
23 # Redistribution and use in source and binary forms, with or without modification, 23 # Redistribution and use in source and binary forms, with or without modification,
@@ -67,8 +67,10 @@ http://www.decalage.info/python/oletools @@ -67,8 +67,10 @@ http://www.decalage.info/python/oletools
67 # 2016-11-17 v0.51 PL: - updated call to oleobj.OleNativeStream 67 # 2016-11-17 v0.51 PL: - updated call to oleobj.OleNativeStream
68 # 2017-03-12 PL: - fixed imports for Python 2+3 68 # 2017-03-12 PL: - fixed imports for Python 2+3
69 # - fixed hex decoding bug in RtfObjParser (issue #103) 69 # - fixed hex decoding bug in RtfObjParser (issue #103)
  70 +# 2017-03-29 PL: - fixed RtfParser to handle issue #152 (control word with
  71 +# long parameter)
70 72
71 -__version__ = '0.51dev2' 73 +__version__ = '0.51dev4'
72 74
73 # ------------------------------------------------------------------------------ 75 # ------------------------------------------------------------------------------
74 # TODO: 76 # TODO:
@@ -186,8 +188,8 @@ ASCII_NAME = b'([a-zA-Z]{1,250})' @@ -186,8 +188,8 @@ ASCII_NAME = b'([a-zA-Z]{1,250})'
186 # SIGNED_INTEGER = r'(-?\d{1,250})' 188 # SIGNED_INTEGER = r'(-?\d{1,250})'
187 SIGNED_INTEGER = b'(-?\\d+)' 189 SIGNED_INTEGER = b'(-?\\d+)'
188 190
189 -# Note for issue #78: need to match "\A-" not followed by digits  
190 -CONTROL_WORD = b'(?:\\\\' + ASCII_NAME + b'(?:' + SIGNED_INTEGER + b'(?=[^0-9])|(?=[^a-zA-Z0-9])))' 191 +# Note for issue #78: need to match "\A-" not followed by digits (or the end of string)
  192 +CONTROL_WORD = b'(?:\\\\' + ASCII_NAME + b'(?:' + SIGNED_INTEGER + b'(?=[^0-9])|(?=[^a-zA-Z0-9])|$))'
191 193
192 re_control_word = re.compile(CONTROL_WORD) 194 re_control_word = re.compile(CONTROL_WORD)
193 195
@@ -323,10 +325,16 @@ class Destination(object): @@ -323,10 +325,16 @@ class Destination(object):
323 325
324 class RtfParser(object): 326 class RtfParser(object):
325 """ 327 """
326 - Very simple generic RTF parser 328 + Very simple but robust generic RTF parser, designed to handle
  329 + malformed malicious RTF as MS Word does
327 """ 330 """
328 331
329 def __init__(self, data): 332 def __init__(self, data):
  333 + """
  334 + RtfParser constructor.
  335 +
  336 + :param data: bytes object containing the RTF data to be parsed
  337 + """
330 self.data = data 338 self.data = data
331 self.index = 0 339 self.index = 0
332 self.size = len(data) 340 self.size = len(data)
@@ -337,18 +345,38 @@ class RtfParser(object): @@ -337,18 +345,38 @@ class RtfParser(object):
337 self.current_destination = document_destination 345 self.current_destination = document_destination
338 346
339 def parse(self): 347 def parse(self):
  348 + """
  349 + Parse the RTF data
  350 +
  351 + :return: nothing
  352 + """
  353 + # Start at beginning of data
340 self.index = 0 354 self.index = 0
  355 + # Loop until the end
341 while self.index < self.size: 356 while self.index < self.size:
342 if self.data[self.index] == BRACE_OPEN: 357 if self.data[self.index] == BRACE_OPEN:
  358 + # Found an opening brace "{": Start of a group
343 self._open_group() 359 self._open_group()
344 self.index += 1 360 self.index += 1
345 continue 361 continue
346 if self.data[self.index] == BRACE_CLOSE: 362 if self.data[self.index] == BRACE_CLOSE:
  363 + # Found a closing brace "}": End of a group
347 self._close_group() 364 self._close_group()
348 self.index += 1 365 self.index += 1
349 continue 366 continue
350 if self.data[self.index] == BACKSLASH: 367 if self.data[self.index] == BACKSLASH:
351 - m = re_control_word.match(self.data, self.index) 368 + # Found a backslash "\": Start of a control word or control symbol
  369 + # Use a regex to extract the control word name if present:
  370 + # NOTE: the full length of the control word + its optional integer parameter
  371 + # is limited by MS Word to 253 characters, so we have to run the regex
  372 + # on a cropped string:
  373 + data_cropped = self.data[self.index:]
  374 + if len(data_cropped)>253:
  375 + data_cropped = data_cropped[:254]
  376 + # append a space so that the regex can check the following character:
  377 + data_cropped += b' '
  378 + # m = re_control_word.match(self.data, self.index, self.index+253)
  379 + m = re_control_word.match(data_cropped)
352 if m: 380 if m:
353 cword = m.group(1) 381 cword = m.group(1)
354 param = None 382 param = None
@@ -361,11 +389,14 @@ class RtfParser(object): @@ -361,11 +389,14 @@ class RtfParser(object):
361 if cword == b'bin': 389 if cword == b'bin':
362 self._bin(m, param) 390 self._bin(m, param)
363 continue 391 continue
  392 + # Otherwise, it may be a control symbol:
364 m = re_control_symbol.match(self.data, self.index) 393 m = re_control_symbol.match(self.data, self.index)
365 if m: 394 if m:
366 self.control_symbol(m) 395 self.control_symbol(m)
367 self.index += len(m.group()) 396 self.index += len(m.group())
368 continue 397 continue
  398 + # Otherwise, this is plain text:
  399 + # Use a regex to match all characters until the next brace or backslash:
369 m = re_text.match(self.data, self.index) 400 m = re_text.match(self.data, self.index)
370 if m: 401 if m:
371 self._text(m) 402 self._text(m)
setup.py
@@ -41,7 +41,7 @@ import os, fnmatch @@ -41,7 +41,7 @@ import os, fnmatch
41 #--- METADATA ----------------------------------------------------------------- 41 #--- METADATA -----------------------------------------------------------------
42 42
43 name = "oletools" 43 name = "oletools"
44 -version = '0.51dev3' 44 +version = '0.51dev4'
45 desc = "Python tools to analyze security characteristics of MS Office and OLE files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), for Malware Analysis and Incident Response #DFIR" 45 desc = "Python tools to analyze security characteristics of MS Office and OLE files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), for Malware Analysis and Incident Response #DFIR"
46 long_desc = open('oletools/README.rst').read() 46 long_desc = open('oletools/README.rst').read()
47 author = "Philippe Lagadec" 47 author = "Philippe Lagadec"