Commit 94fe13c76a3ccd4d274d254c2804b399d4393a9f
1 parent: d5f0e1a9
rtfobj: fixed RtfParser to handle issue #152 (control word with long parameter)
Showing 3 changed files with 41 additions and 7 deletions
oletools/oleobj.py
| ... | ... | @@ -290,6 +290,9 @@ class OleObject (object): |
| 290 | 290 | :param data: bytes, OLE 1.0 Object structure containing an OLE object |
| 291 | 291 | :return: |
| 292 | 292 | """ |
| 293 | + # from ezhexviewer import hexdump3 | |
| 294 | + # print("Parsing OLE object data:") | |
| 295 | + # print(hexdump3(data, length=16)) | |
| 293 | 296 | # Header: see MS-OLEDS 2.2.4 ObjectHeader |
| 294 | 297 | self.ole_version, data = read_uint32(data) |
| 295 | 298 | self.format_id, data = read_uint32(data) | ... | ... |
oletools/rtfobj.py
| ... | ... | @@ -17,7 +17,7 @@ http://www.decalage.info/python/oletools |
| 17 | 17 | |
| 18 | 18 | #=== LICENSE ================================================================= |
| 19 | 19 | |
| 20 | -# rtfobj is copyright (c) 2012-2016, Philippe Lagadec (http://www.decalage.info) | |
| 20 | +# rtfobj is copyright (c) 2012-2017, Philippe Lagadec (http://www.decalage.info) | |
| 21 | 21 | # All rights reserved. |
| 22 | 22 | # |
| 23 | 23 | # Redistribution and use in source and binary forms, with or without modification, |
| ... | ... | @@ -67,8 +67,10 @@ http://www.decalage.info/python/oletools |
| 67 | 67 | # 2016-11-17 v0.51 PL: - updated call to oleobj.OleNativeStream |
| 68 | 68 | # 2017-03-12 PL: - fixed imports for Python 2+3 |
| 69 | 69 | # - fixed hex decoding bug in RtfObjParser (issue #103) |
| 70 | +# 2017-03-29 PL: - fixed RtfParser to handle issue #152 (control word with | |
| 71 | +# long parameter) | |
| 70 | 72 | |
| 71 | -__version__ = '0.51dev2' | |
| 73 | +__version__ = '0.51dev4' | |
| 72 | 74 | |
| 73 | 75 | # ------------------------------------------------------------------------------ |
| 74 | 76 | # TODO: |
| ... | ... | @@ -186,8 +188,8 @@ ASCII_NAME = b'([a-zA-Z]{1,250})' |
| 186 | 188 | # SIGNED_INTEGER = r'(-?\d{1,250})' |
| 187 | 189 | SIGNED_INTEGER = b'(-?\\d+)' |
| 188 | 190 | |
| 189 | -# Note for issue #78: need to match "\A-" not followed by digits | |
| 190 | -CONTROL_WORD = b'(?:\\\\' + ASCII_NAME + b'(?:' + SIGNED_INTEGER + b'(?=[^0-9])|(?=[^a-zA-Z0-9])))' | |
| 191 | +# Note for issue #78: need to match "\A-" not followed by digits (or the end of string) | |
| 192 | +CONTROL_WORD = b'(?:\\\\' + ASCII_NAME + b'(?:' + SIGNED_INTEGER + b'(?=[^0-9])|(?=[^a-zA-Z0-9])|$))' | |
| 191 | 193 | |
| 192 | 194 | re_control_word = re.compile(CONTROL_WORD) |
| 193 | 195 | |
| ... | ... | @@ -323,10 +325,16 @@ class Destination(object): |
| 323 | 325 | |
| 324 | 326 | class RtfParser(object): |
| 325 | 327 | """ |
| 326 | - Very simple generic RTF parser | |
| 328 | + Very simple but robust generic RTF parser, designed to handle | |
| 329 | + malformed malicious RTF as MS Word does | |
| 327 | 330 | """ |
| 328 | 331 | |
| 329 | 332 | def __init__(self, data): |
| 333 | + """ | |
| 334 | + RtfParser constructor. | |
| 335 | + | |
| 336 | + :param data: bytes object containing the RTF data to be parsed | |
| 337 | + """ | |
| 330 | 338 | self.data = data |
| 331 | 339 | self.index = 0 |
| 332 | 340 | self.size = len(data) |
| ... | ... | @@ -337,18 +345,38 @@ class RtfParser(object): |
| 337 | 345 | self.current_destination = document_destination |
| 338 | 346 | |
| 339 | 347 | def parse(self): |
| 348 | + """ | |
| 349 | + Parse the RTF data | |
| 350 | + | |
| 351 | + :return: nothing | |
| 352 | + """ | |
| 353 | + # Start at beginning of data | |
| 340 | 354 | self.index = 0 |
| 355 | + # Loop until the end | |
| 341 | 356 | while self.index < self.size: |
| 342 | 357 | if self.data[self.index] == BRACE_OPEN: |
| 358 | + # Found an opening brace "{": Start of a group | |
| 343 | 359 | self._open_group() |
| 344 | 360 | self.index += 1 |
| 345 | 361 | continue |
| 346 | 362 | if self.data[self.index] == BRACE_CLOSE: |
| 363 | + # Found a closing brace "}": End of a group | |
| 347 | 364 | self._close_group() |
| 348 | 365 | self.index += 1 |
| 349 | 366 | continue |
| 350 | 367 | if self.data[self.index] == BACKSLASH: |
| 351 | - m = re_control_word.match(self.data, self.index) | |
| 368 | + # Found a backslash "\": Start of a control word or control symbol | |
| 369 | + # Use a regex to extract the control word name if present: | |
| 370 | + # NOTE: the full length of the control word + its optional integer parameter | |
| 371 | + # is limited by MS Word at 253 characters, so we have to run the regex | |
| 372 | + # on a cropped string: | |
| 373 | + data_cropped = self.data[self.index:] | |
| 374 | + if len(data_cropped)>253: | |
| 375 | + data_cropped = data_cropped[:254] | |
| 376 | + # append a space so that the regex can check the following character: | |
| 377 | + data_cropped += b' ' | |
| 378 | + # m = re_control_word.match(self.data, self.index, self.index+253) | |
| 379 | + m = re_control_word.match(data_cropped) | |
| 352 | 380 | if m: |
| 353 | 381 | cword = m.group(1) |
| 354 | 382 | param = None |
| ... | ... | @@ -361,11 +389,14 @@ class RtfParser(object): |
| 361 | 389 | if cword == b'bin': |
| 362 | 390 | self._bin(m, param) |
| 363 | 391 | continue |
| 392 | + # Otherwise, it may be a control symbol: | |
| 364 | 393 | m = re_control_symbol.match(self.data, self.index) |
| 365 | 394 | if m: |
| 366 | 395 | self.control_symbol(m) |
| 367 | 396 | self.index += len(m.group()) |
| 368 | 397 | continue |
| 398 | + # Otherwise, this is plain text: | |
| 399 | + # Use a regex to match all characters until the next brace or backslash: | |
| 369 | 400 | m = re_text.match(self.data, self.index) |
| 370 | 401 | if m: |
| 371 | 402 | self._text(m) | ... | ... |
setup.py
| ... | ... | @@ -41,7 +41,7 @@ import os, fnmatch |
| 41 | 41 | #--- METADATA ----------------------------------------------------------------- |
| 42 | 42 | |
| 43 | 43 | name = "oletools" |
| 44 | -version = '0.51dev3' | |
| 44 | +version = '0.51dev4' | |
| 45 | 45 | desc = "Python tools to analyze security characteristics of MS Office and OLE files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), for Malware Analysis and Incident Response #DFIR" |
| 46 | 46 | long_desc = open('oletools/README.rst').read() |
| 47 | 47 | author = "Philippe Lagadec" | ... | ... |