Commit 579c1386f3010ddd6eb7004af9428f083f2bcc45

Authored by decalage2
1 parent 29276cd0

rtfobj: added support for RTF files (issue #223)

Showing 2 changed files with 63 additions and 2 deletions
oletools/msodde.py
@@ -57,8 +57,9 @@ from __future__ import print_function @@ -57,8 +57,9 @@ from __future__ import print_function
57 # 2017-11-23 CH: - added support for xlsx files 57 # 2017-11-23 CH: - added support for xlsx files
58 # 2017-11-24 CH: - added support for xls files 58 # 2017-11-24 CH: - added support for xls files
59 # 2017-11-29 CH: - added support for xlsb files 59 # 2017-11-29 CH: - added support for xlsb files
  60 +# 2017-11-29 PL: - added support for RTF files (issue #223)
60 61
61 -__version__ = '0.52dev8' 62 +__version__ = '0.52dev9'
62 63
63 #------------------------------------------------------------------------------ 64 #------------------------------------------------------------------------------
64 # TODO: field codes can be in headers/footers/comments - parse these 65 # TODO: field codes can be in headers/footers/comments - parse these
@@ -100,6 +101,7 @@ if not _parent_dir in sys.path: @@ -100,6 +101,7 @@ if not _parent_dir in sys.path:
100 from oletools.thirdparty import olefile 101 from oletools.thirdparty import olefile
101 from oletools import ooxml 102 from oletools import ooxml
102 from oletools import xls_parser 103 from oletools import xls_parser
  104 +from oletools import rtfobj
103 105
104 # === PYTHON 2+3 SUPPORT ====================================================== 106 # === PYTHON 2+3 SUPPORT ======================================================
105 107
@@ -748,6 +750,62 @@ def process_xlsx(filepath, filed_filter_mode=None): @@ -748,6 +750,62 @@ def process_xlsx(filepath, filed_filter_mode=None):
748 return u'\n'.join(dde_links) 750 return u'\n'.join(dde_links)
749 751
750 752
class RtfFieldParser(rtfobj.RtfParser):
    """
    Specialized RTF parser to extract fields such as DDEAUTO

    The text of every ``fldinst`` destination is cleaned up (CR/LF removed,
    surrounding whitespace stripped) and appended to ``self.fields`` when the
    destination is closed.
    """

    def __init__(self, data):
        """
        :param data: RTF data to parse, passed unchanged to rtfobj.RtfParser
        """
        super(RtfFieldParser, self).__init__(data)
        # list of cleaned field instruction texts found so far
        # (raw byte strings, exactly as collected by the parser)
        self.fields = []

    def open_destination(self, destination):
        """
        Log the start of a field instruction destination.

        :param destination: destination object being opened by the parser
        """
        if destination.cword == b'fldinst':
            log.debug('*** Start field data at index %Xh', destination.start)

    def close_destination(self, destination):
        """
        Store the cleaned text of a field instruction destination.

        :param destination: destination object being closed by the parser
        """
        if destination.cword == b'fldinst':
            log.debug('*** Close field data at index %Xh', self.index)
            log.debug('Field text: %r', destination.data)
            # remove extra spaces and newline chars:
            field_clean = destination.data.translate(None, b'\r\n').strip()
            log.debug('Cleaned Field text: %r', field_clean)
            self.fields.append(field_clean)

    def control_symbol(self, matchobject):
        """
        Inject a control symbol (such as '\\\\') as-is into the current text.

        :param matchobject: regex match covering the backslash and the symbol
        """
        # required to handle control symbols such as '\\'
        # inject the symbol as-is in the text:
        # TODO: handle special symbols properly
        # Slice instead of index: on Python 3, indexing a bytes object gives
        # an int, which cannot be appended to the destination data. The slice
        # gives a one-character string on both Python 2 and 3.
        self.current_destination.data += matchobject.group()[1:2]
  781 +
  782 +
  783 +
def process_rtf(filepath, field_filter_mode=None):
    """
    Extract field instructions (such as DDE/DDEAUTO) from an RTF file

    :param filepath: path of the RTF file to parse
    :param field_filter_mode: one of FIELD_FILTER_ALL, FIELD_FILTER_DDE or
        FIELD_FILTER_BLACKLIST; None is treated like FIELD_FILTER_ALL
    :return: unicode text with the selected fields, one per line
    :raises ValueError: if field_filter_mode is none of the values above
    """
    log.debug('process_rtf')
    # read the complete file into memory; the context manager guarantees the
    # file handle is closed, even if reading fails
    with open(filepath, 'rb') as file_handle:
        data = file_handle.read()
    rtfparser = RtfFieldParser(data)
    rtfparser.parse()
    # The parser collects fields as byte strings. Decode them once here so
    # that the filters below and the final unicode join also work on
    # Python 3, where bytes and text cannot be mixed. Undecodable bytes are
    # replaced rather than aborting the whole analysis.
    all_fields = [field.decode('ascii', 'replace')
                  for field in rtfparser.fields]
    # apply field command filter
    log.debug('filtering with mode "{0}"'.format(field_filter_mode))
    if field_filter_mode in (FIELD_FILTER_ALL, None):
        clean_fields = all_fields
    elif field_filter_mode == FIELD_FILTER_DDE:
        clean_fields = [field for field in all_fields
                        if FIELD_DDE_REGEX.match(field)]
    elif field_filter_mode == FIELD_FILTER_BLACKLIST:
        # check if fields are acceptable and should not be returned
        clean_fields = [field for field in all_fields
                        if not field_is_blacklisted(field.strip())]
    else:
        raise ValueError('Unexpected field_filter_mode: "{0}"'
                         .format(field_filter_mode))

    return u'\n'.join(clean_fields)
  807 +
  808 +
751 def process_file(filepath, field_filter_mode=None): 809 def process_file(filepath, field_filter_mode=None):
752 """ decides which of process_doc/x or process_xls/x to call """ 810 """ decides which of process_doc/x or process_xls/x to call """
753 if olefile.isOleFile(filepath): 811 if olefile.isOleFile(filepath):
@@ -756,6 +814,9 @@ def process_file(filepath, field_filter_mode=None): @@ -756,6 +814,9 @@ def process_file(filepath, field_filter_mode=None):
756 return process_xls(filepath) 814 return process_xls(filepath)
757 else: 815 else:
758 return process_doc(filepath) 816 return process_doc(filepath)
  817 + elif open(filepath, 'rb').read(4) == b'{\\rt':
  818 + # This is a RTF file
  819 + return process_rtf(filepath, field_filter_mode)
759 try: 820 try:
760 doctype = ooxml.get_type(filepath) 821 doctype = ooxml.get_type(filepath)
761 log.debug('Detected file type: {0}'.format(doctype)) 822 log.debug('Detected file type: {0}'.format(doctype))
setup.py
@@ -42,7 +42,7 @@ import os, fnmatch @@ -42,7 +42,7 @@ import os, fnmatch
42 #--- METADATA ----------------------------------------------------------------- 42 #--- METADATA -----------------------------------------------------------------
43 43
44 name = "oletools" 44 name = "oletools"
45 -version = '0.52dev5' 45 +version = '0.52dev9'
46 desc = "Python tools to analyze security characteristics of MS Office and OLE files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), for Malware Analysis and Incident Response #DFIR" 46 desc = "Python tools to analyze security characteristics of MS Office and OLE files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), for Malware Analysis and Incident Response #DFIR"
47 long_desc = open('oletools/README.rst').read() 47 long_desc = open('oletools/README.rst').read()
48 author = "Philippe Lagadec" 48 author = "Philippe Lagadec"