From 2685c6f5b70d40c7606fb55773f7ca8159130520 Mon Sep 17 00:00:00 2001 From: decalage2 Date: Fri, 15 Jul 2016 16:36:40 +0200 Subject: [PATCH] rtfobj: new RtfParser and RtfObjParser classes - a more complete RTF parser to support tricky edge cases exploited by malware --- oletools/rtfobj.py | 454 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------------------------------- 1 file changed, 403 insertions(+), 51 deletions(-) diff --git a/oletools/rtfobj.py b/oletools/rtfobj.py index 051701e..7da5110 100755 --- a/oletools/rtfobj.py +++ b/oletools/rtfobj.py @@ -1,4 +1,6 @@ #!/usr/bin/env python +from __future__ import print_function + """ rtfobj.py @@ -52,12 +54,12 @@ http://www.decalage.info/python/oletools # (contribution by Thomas Jarosch) # TJ: - sanitize filenames to avoid special characters # 2016-05-29 PL: - improved parsing, fixed issue #42 +# 2016-07-13 v0.48 PL: - new RtfParser and RtfObjParser classes -__version__ = '0.47' +__version__ = '0.48' #------------------------------------------------------------------------------ # TODO: -# - improve regex pattern for better performance? # - allow semicolon within hex, as found in this sample: # http://contagiodump.blogspot.nl/2011/10/sep-28-cve-2010-3333-manuscript-with.html @@ -70,6 +72,7 @@ from thirdparty.xglob import xglob from oleobj import OleObject, OleNativeStream import oleobj + # === LOGGING ================================================================= class NullHandler(logging.Handler): @@ -125,11 +128,47 @@ HEX_DIGIT = r'[0-9A-Fa-f]' # HEX_CHAR = r'[0-9A-Fa-f]\s*[0-9A-Fa-f]' # Even worse, MS Word also allows ANY RTF-style tag {*} in between!! # AND the tags can be nested... -SINGLE_RTF_TAG = r'[{][^{}]*[}]' +#SINGLE_RTF_TAG = r'[{][^{}]*[}]' +# Actually RTF tags may contain braces escaped with backslash (\{ \}): +SINGLE_RTF_TAG = r'[{](?:\\.|[^{}\\])*[}]' + # Nested tags, two levels (because Python's re does not support nested matching): -NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]' +# NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]' +NESTED_RTF_TAG = r'[{](?:\\.|[^{}\\]|'+SINGLE_RTF_TAG+r')*[}]' + +# AND it is also allowed to insert ANY control word or control symbol (ignored) +# According to Rich Text Format (RTF) Specification Version 1.9.1, +# section "Control Word": +# control word = \ +# delimiter = space, OR signed integer followed by any non-digit, +# OR any character except letter and digit +# examples of valid control words: +# "\AnyThing " "\AnyThing123z" ""\AnyThing-456{" "\AnyThing{" +# control symbol = \ (followed by anything) + +ASCII_NAME = r'([a-zA-Z]{1,250})' + +# using Python's re lookahead assumption: +# (?=...) Matches if ... matches next, but doesn't consume any of the string. +# This is called a lookahead assertion. For example, Isaac (?=Asimov) will +# match 'Isaac ' only if it's followed by 'Asimov'. + +# TODO: Find the actual limit on the number of digits for Word +# SIGNED_INTEGER = r'(-?\d{1,250})' +SIGNED_INTEGER = r'(-?\d+)' + +CONTROL_WORD = r'(?:\\' + ASCII_NAME + r'(?:(?=[^a-zA-Z0-9-])|' + SIGNED_INTEGER + r'(?=[^0-9])))' +re_control_word = re.compile(CONTROL_WORD) + +CONTROL_SYMBOL = r'(?:\\[^a-zA-Z0-9])' +re_control_symbol = re.compile(CONTROL_SYMBOL) + +# Text that is not a control word/symbol or a group: +TEXT = r'[^{}\\]+' +re_text = re.compile(TEXT) + # ignored whitespaces and tags within a hex block: -IGNORED = r'(?:\s|'+NESTED_RTF_TAG+r')*' +IGNORED = r'(?:\s|'+NESTED_RTF_TAG+'|'+CONTROL_SYMBOL+'|'+CONTROL_WORD+r')*' #IGNORED = r'\s*' # HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT @@ -175,6 +214,316 @@ re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + BACKSLASH_BIN + DECIMAL_GROUP + DELIMITER) re_delim_hexblock = re.compile(DELIMITER + PATTERN) +# Destination Control Words, according to MS RTF Specifications v1.9.1: +DESTINATION_CONTROL_WORDS = frozenset(( + "aftncn", "aftnsep", "aftnsepc", "annotation", "atnauthor", "atndate", "atnicn", "atnid", "atnparent", "atnref", + "atntime", "atrfend", "atrfstart", "author", "background", "bkmkend", "bkmkstart", "blipuid", "buptim", "category", + "colorschememapping", "colortbl", "comment", "company", "creatim", "datafield", "datastore", "defchp", "defpap", + "do", "doccomm", "docvar", "dptxbxtext", "ebcend", "ebcstart", "factoidname", "falt", "fchars", "ffdeftext", + "ffentrymcr", "ffexitmcr", "ffformat", "ffhelptext", "ffl", "ffname", "ffstattext", "field", "file", "filetbl", + "fldinst", "fldrslt", "fldtype", "fname", "fontemb", "fontfile", "fonttbl", "footer", "footerf", "footerl", + "footerr", "footnote", "formfield", "ftncn", "ftnsep", "ftnsepc", "g", "generator", "gridtbl", "header", "headerf", + "headerl", "headerr", "hl", "hlfr", "hlinkbase", "hlloc", "hlsrc", "hsv", "htmltag", "info", "keycode", "keywords", + "latentstyles", "lchars", "levelnumbers", "leveltext", "lfolevel", "linkval", "list", "listlevel", "listname", + "listoverride", "listoverridetable", "listpicture", "liststylename", "listtable", "listtext", "lsdlockedexcept", + "macc", "maccPr", "mailmerge", "maln", "malnScr", "manager", "margPr", "mbar", "mbarPr", "mbaseJc", "mbegChr", + "mborderBox", "mborderBoxPr", "mbox", "mboxPr", "mchr", "mcount", "mctrlPr", "md", "mdeg", "mdegHide", "mden", + "mdiff", "mdPr", "me", "mendChr", "meqArr", "meqArrPr", "mf", "mfName", "mfPr", "mfunc", "mfuncPr", "mgroupChr", + "mgroupChrPr", "mgrow", "mhideBot", "mhideLeft", "mhideRight", "mhideTop", "mhtmltag", "mlim", "mlimloc", "mlimlow", + "mlimlowPr", "mlimupp", "mlimuppPr", "mm", "mmaddfieldname", "mmath", "mmathPict", "mmathPr", "mmaxdist", "mmc", + "mmcJc", "mmconnectstr", "mmconnectstrdata", "mmcPr", "mmcs", "mmdatasource", "mmheadersource", "mmmailsubject", + "mmodso", "mmodsofilter", "mmodsofldmpdata", "mmodsomappedname", "mmodsoname", "mmodsorecipdata", "mmodsosort", + "mmodsosrc", "mmodsotable", "mmodsoudl", "mmodsoudldata", "mmodsouniquetag", "mmPr", "mmquery", "mmr", "mnary", + "mnaryPr", "mnoBreak", "mnum", "mobjDist", "moMath", "moMathPara", "moMathParaPr", "mopEmu", "mphant", "mphantPr", + "mplcHide", "mpos", "mr", "mrad", "mradPr", "mrPr", "msepChr", "mshow", "mshp", "msPre", "msPrePr", "msSub", + "msSubPr", "msSubSup", "msSubSupPr", "msSup", "msSupPr", "mstrikeBLTR", "mstrikeH", "mstrikeTLBR", "mstrikeV", + "msub", "msubHide", "msup", "msupHide", "mtransp", "mtype", "mvertJc", "mvfmf", "mvfml", "mvtof", "mvtol", + "mzeroAsc", "mzeroDesc", "mzeroWid", "nesttableprops", "nextfile", "nonesttables", "objalias", "objclass", + "objdata", "object", "objname", "objsect", "objtime", "oldcprops", "oldpprops", "oldsprops", "oldtprops", + "oleclsid", "operator", "panose", "password", "passwordhash", "pgp", "pgptbl", "picprop", "pict", "pn", "pnseclvl", + "pntext", "pntxta", "pntxtb", "printim", "private", "propname", "protend", "protstart", "protusertbl", "pxe", + "result", "revtbl", "revtim", "rsidtbl", "rtf", "rxe", "shp", "shpgrp", "shpinst", "shppict", "shprslt", "shptxt", + "sn", "sp", "staticval", "stylesheet", "subject", "sv", "svb", "tc", "template", "themedata", "title", "txe", "ud", + "upr", "userprops", "wgrffmtfilter", "windowcaption", "writereservation", "writereservhash", "xe", "xform", + "xmlattrname", "xmlattrvalue", "xmlclose", "xmlname", "xmlnstbl", "xmlopen" + )) + + + +#=== CLASSES ================================================================= + +class Destination(object): + """ + Stores the data associated with a destination control word + """ + def __init__(self, cword=None): + self.cword = cword + self.data = '' + self.start = None + self.end = None + self.group_level = 0 + + +# class Group(object): +# """ +# Stores the data associated with a group between braces {...} +# """ +# def __init__(self, cword=None): +# self.start = None +# self.end = None +# self.level = None + + + +class RtfParser(object): + """ + Very simple generic RTF parser + """ + + def __init__(self, data): + self.data = data + self.index = 0 + self.size = len(data) + self.group_level = 0 + # default destination for the document text: + document_destination = Destination() + self.destinations = [document_destination] + self.current_destination = document_destination + + def parse(self): + self.index = 0 + while self.index < self.size: + if self.data[self.index] == '{': + self._open_group() + self.index += 1 + continue + if self.data[self.index] == '}': + self._close_group() + self.index += 1 + continue + if self.data[self.index] == '\\': + m = re_control_word.match(self.data, self.index) + if m: + cword = m.group(1) + param = None + if len(m.groups()) > 1: + param = m.group(2) + # log.debug('control word %r at index %Xh - cword=%r param=%r' % (m.group(), self.index, cword, param)) + self._control_word(m, cword, param) + self.index += len(m.group()) + # if it's \bin, call _bin after updating index + if cword == 'bin': + self._bin(m, param) + continue + m = re_control_symbol.match(self.data, self.index) + if m: + self.control_symbol(m) + self.index += len(m.group()) + continue + m = re_text.match(self.data, self.index) + if m: + self._text(m) + self.index += len(m.group()) + continue + raise RuntimeError('Should not have reached this point - index=%Xh' % self.index) + self.end_of_file() + + + def _open_group(self): + self.group_level += 1 + log.debug('{ Open Group at index %Xh - level=%d' % (self.index, self.group_level)) + # call user method AFTER increasing the level: + self.open_group() + + def open_group(self): + #log.debug('open group at index %Xh' % self.index) + pass + + def _close_group(self): + log.debug('} Close Group at index %Xh - level=%d' % (self.index, self.group_level)) + # call user method BEFORE decreasing the level: + self.close_group() + # if the destination level is the same as the group level, close the destination: + if self.group_level == self.current_destination.group_level: + log.debug('Current Destination %r level = %d => Close Destination' % ( + self.current_destination.cword, self.current_destination.group_level)) + self._close_destination() + else: + log.debug('Current Destination %r level = %d => Continue with same Destination' % ( + self.current_destination.cword, self.current_destination.group_level)) + self.group_level -= 1 + log.debug('Decreased group level to %d' % self.group_level) + + def close_group(self): + #log.debug('close group at index %Xh' % self.index) + pass + + def _open_destination(self, matchobject, cword): + # if the current destination is at the same group level, close it first: + if self.current_destination.group_level == self.group_level: + self._close_destination() + new_dest = Destination(cword) + new_dest.group_level = self.group_level + self.destinations.append(new_dest) + self.current_destination = new_dest + # start of the destination is right after the control word: + new_dest.start = self.index + len(matchobject.group()) + log.debug("Open Destination %r start=%Xh - level=%d" % (cword, new_dest.start, new_dest.group_level)) + # call the corresponding user method for additional processing: + self.open_destination(self.current_destination) + + def open_destination(self, destination): + pass + + def _close_destination(self): + log.debug("Close Destination %r end=%Xh - level=%d" % (self.current_destination.cword, + self.index, self.current_destination.group_level)) + self.current_destination.end = self.index + # call the corresponding user method for additional processing: + self.close_destination(self.current_destination) + if len(self.destinations)>0: + # remove the current destination from the stack, and go back to the previous one: + self.destinations.pop() + if len(self.destinations) > 0: + self.current_destination = self.destinations[-1] + else: + log.debug('All destinations are closed, keeping the document destination open') + + def close_destination(self, destination): + pass + + def _control_word(self, matchobject, cword, param): + #log.debug('control word %r at index %Xh' % (matchobject.group(), self.index)) + if cword in DESTINATION_CONTROL_WORDS: + # log.debug('%r is a destination control word: starting a new destination' % cword) + self._open_destination(matchobject, cword) + # call the corresponding user method for additional processing: + self.control_word(matchobject, cword, param) + + def control_word(self, matchobject, cword, param): + pass + + def control_symbol(self, matchobject): + #log.debug('control symbol %r at index %Xh' % (matchobject.group(), self.index)) + pass + + def _text(self, matchobject): + text = matchobject.group() + self.current_destination.data += text + self.text(matchobject, text) + + def text(self, matchobject, text): + #log.debug('text %r at index %Xh' % (matchobject.group(), self.index)) + pass + + def _bin(self, matchobject, param): + binlen = int(param) + log.debug('\\bin: reading %d bytes of binary data' % binlen) + # TODO: handle optional space? + # TODO: handle negative length, and length greater than data + bindata = self.data[self.index:self.index + binlen] + self.index += binlen + self.bin(bindata) + + def bin(self, bindata): + pass + + def _end_of_file(self): + log.debug('%Xh Reached End of File') + # close any group/destination that is still open: + while self.group_level > 0: + log.debug('Group Level = %d, closing group' % self.group_level) + self._close_group() + self.end_of_file() + + def end_of_file(self): + pass + + +class RtfObjParser(RtfParser): + """ + Specialized RTF parser to extract OLE objects + """ + + def __init__(self, data, fname_prefix='rtf'): + super(RtfObjParser, self).__init__(data) + self.fname_prefix = fname_prefix + + def open_destination(self, destination): + if destination.cword == 'objdata': + log.debug('*** Start object data at index %Xh' % destination.start) + + def close_destination(self, destination): + if destination.cword == 'objdata': + log.debug('*** Close object data at index %Xh' % self.index) + # Filter out all whitespaces first (just ignored): + hexdata1 = destination.data.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') + # Then filter out any other non-hex character: + hexdata = re.sub(r'[^a-hA-H0-9]', '', hexdata1) + if len(hexdata) < len(hexdata1): + # this is only for debugging: + nonhex = re.sub(r'[a-hA-H0-9]', '', hexdata1) + log.debug('Found non-hex chars in hexdata: %r' % nonhex) + # MS Word accepts an extra hex digit, so we need to trim it if present: + if len(hexdata) & 1: + log.debug('Odd length, trimmed last byte.') + hexdata = hexdata[:-1] + object_data = binascii.unhexlify(hexdata) + print('found object size %d at index %08X - end %08X' % (len(object_data), + destination.start, self.index)) + fname = '%s_object_%08X.raw' % (self.fname_prefix, destination.start) + print('saving object to file %s' % fname) + open(fname, 'wb').write(object_data) + # TODO: check if all hex data is extracted properly + + obj = OleObject() + try: + obj.parse(object_data) + print('extract file embedded in OLE object:') + print('format_id = %d' % obj.format_id) + print('class name = %r' % obj.class_name) + print('data size = %d' % obj.data_size) + # set a file extension according to the class name: + class_name = obj.class_name.lower() + if class_name.startswith('word'): + ext = 'doc' + elif class_name.startswith('package'): + ext = 'package' + else: + ext = 'bin' + + fname = '%s_object_%08X.%s' % (self.fname_prefix, destination.start, ext) + print('saving to file %s' % fname) + open(fname, 'wb').write(obj.data) + if obj.class_name.lower() == 'package': + print('Parsing OLE Package') + opkg = OleNativeStream(bindata=obj.data) + print('Filename = %r' % opkg.filename) + print('Source path = %r' % opkg.src_path) + print('Temp path = %r' % opkg.temp_path) + if opkg.filename: + fname = '%s_%s' % (self.fname_prefix, + sanitize_filename(opkg.filename)) + else: + fname = '%s_object_%08X.noname' % (self.fname_prefix, destination.start) + print('saving to file %s' % fname) + open(fname, 'wb').write(opkg.data) + except: + pass + log.exception('*** Not an OLE 1.0 Object') + + def bin(self, bindata): + if self.current_destination.cword == 'objdata': + # TODO: keep track of this, because it is unusual and indicates potential obfuscation + # trick: hexlify binary data, add it to hex data + self.current_destination.data += binascii.hexlify(bindata) + + def control_word(self, matchobject, cword, param): + # TODO: extract useful cwords such as objclass + # TODO: keep track of cwords inside objdata, because it is unusual and indicates potential obfuscation + # TODO: same with control symbols, and opening bracket + pass + #=== FUNCTIONS =============================================================== @@ -329,50 +678,53 @@ def process_file(container, filename, data, output_dir=None): # TODO: option to extract objects to files (false by default) if data is None: data = open(filename, 'rb').read() - print '-'*79 - print 'File: %r - %d bytes' % (filename, len(data)) - for index, orig_len, objdata in rtf_iter_objects(data): - print 'found object size %d at index %08X - end %08X' % (len(objdata), index, index+orig_len) - fname = '%s_object_%08X.raw' % (fname_prefix, index) - print 'saving object to file %s' % fname - open(fname, 'wb').write(objdata) - # TODO: check if all hex data is extracted properly - - obj = OleObject() - try: - obj.parse(objdata) - print 'extract file embedded in OLE object:' - print 'format_id = %d' % obj.format_id - print 'class name = %r' % obj.class_name - print 'data size = %d' % obj.data_size - # set a file extension according to the class name: - class_name = obj.class_name.lower() - if class_name.startswith('word'): - ext = 'doc' - elif class_name.startswith('package'): - ext = 'package' - else: - ext = 'bin' - - fname = '%s_object_%08X.%s' % (fname_prefix, index, ext) - print 'saving to file %s' % fname - open(fname, 'wb').write(obj.data) - if obj.class_name.lower() == 'package': - print 'Parsing OLE Package' - opkg = OleNativeStream(bindata=obj.data) - print 'Filename = %r' % opkg.filename - print 'Source path = %r' % opkg.src_path - print 'Temp path = %r' % opkg.temp_path - if opkg.filename: - fname = '%s_%s' % (fname_prefix, - sanitize_filename(opkg.filename)) - else: - fname = '%s_object_%08X.noname' % (fname_prefix, index) - print 'saving to file %s' % fname - open(fname, 'wb').write(opkg.data) - except: - pass - log.exception('*** Not an OLE 1.0 Object') + rtfp = RtfObjParser(data, fname_prefix) + rtfp.parse() + + # print '-'*79 + # print 'File: %r - %d bytes' % (filename, len(data)) + # for index, orig_len, objdata in rtf_iter_objects(data): + # print 'found object size %d at index %08X - end %08X' % (len(objdata), index, index+orig_len) + # fname = '%s_object_%08X.raw' % (fname_prefix, index) + # print 'saving object to file %s' % fname + # open(fname, 'wb').write(objdata) + # # TODO: check if all hex data is extracted properly + # + # obj = OleObject() + # try: + # obj.parse(objdata) + # print 'extract file embedded in OLE object:' + # print 'format_id = %d' % obj.format_id + # print 'class name = %r' % obj.class_name + # print 'data size = %d' % obj.data_size + # # set a file extension according to the class name: + # class_name = obj.class_name.lower() + # if class_name.startswith('word'): + # ext = 'doc' + # elif class_name.startswith('package'): + # ext = 'package' + # else: + # ext = 'bin' + # + # fname = '%s_object_%08X.%s' % (fname_prefix, index, ext) + # print 'saving to file %s' % fname + # open(fname, 'wb').write(obj.data) + # if obj.class_name.lower() == 'package': + # print 'Parsing OLE Package' + # opkg = OleNativeStream(bindata=obj.data) + # print 'Filename = %r' % opkg.filename + # print 'Source path = %r' % opkg.src_path + # print 'Temp path = %r' % opkg.temp_path + # if opkg.filename: + # fname = '%s_%s' % (fname_prefix, + # sanitize_filename(opkg.filename)) + # else: + # fname = '%s_object_%08X.noname' % (fname_prefix, index) + # print 'saving to file %s' % fname + # open(fname, 'wb').write(opkg.data) + # except: + # pass + # log.exception('*** Not an OLE 1.0 Object') @@ -414,7 +766,7 @@ if __name__ == '__main__': # Print help if no arguments are passed if len(args) == 0: - print __doc__ + print (__doc__) parser.print_help() sys.exit() @@ -436,5 +788,5 @@ if __name__ == '__main__': process_file(container, filename, data, options.output_dir) - +# This code was developed while listening to The Mary Onettes "Lost" -- libgit2 0.21.4