Commit 2685c6f5b70d40c7606fb55773f7ca8159130520
1 parent
44ec0bd8
rtfobj: new RtfParser and RtfObjParser classes - a more complete RTF parser to s…
…upport tricky edge cases exploited by malware
Showing
1 changed file
with
403 additions
and
51 deletions
oletools/rtfobj.py
| 1 | 1 | #!/usr/bin/env python |
| 2 | +from __future__ import print_function | |
| 3 | + | |
| 2 | 4 | """ |
| 3 | 5 | rtfobj.py |
| 4 | 6 | |
| ... | ... | @@ -52,12 +54,12 @@ http://www.decalage.info/python/oletools |
| 52 | 54 | # (contribution by Thomas Jarosch) |
| 53 | 55 | # TJ: - sanitize filenames to avoid special characters |
| 54 | 56 | # 2016-05-29 PL: - improved parsing, fixed issue #42 |
| 57 | +# 2016-07-13 v0.48 PL: - new RtfParser and RtfObjParser classes | |
| 55 | 58 | |
| 56 | -__version__ = '0.47' | |
| 59 | +__version__ = '0.48' | |
| 57 | 60 | |
| 58 | 61 | #------------------------------------------------------------------------------ |
| 59 | 62 | # TODO: |
| 60 | -# - improve regex pattern for better performance? | |
| 61 | 63 | # - allow semicolon within hex, as found in this sample: |
| 62 | 64 | # http://contagiodump.blogspot.nl/2011/10/sep-28-cve-2010-3333-manuscript-with.html |
| 63 | 65 | |
| ... | ... | @@ -70,6 +72,7 @@ from thirdparty.xglob import xglob |
| 70 | 72 | from oleobj import OleObject, OleNativeStream |
| 71 | 73 | import oleobj |
| 72 | 74 | |
| 75 | + | |
| 73 | 76 | # === LOGGING ================================================================= |
| 74 | 77 | |
| 75 | 78 | class NullHandler(logging.Handler): |
| ... | ... | @@ -125,11 +128,47 @@ HEX_DIGIT = r'[0-9A-Fa-f]' |
| 125 | 128 | # HEX_CHAR = r'[0-9A-Fa-f]\s*[0-9A-Fa-f]' |
| 126 | 129 | # Even worse, MS Word also allows ANY RTF-style tag {*} in between!! |
| 127 | 130 | # AND the tags can be nested... |
| 128 | -SINGLE_RTF_TAG = r'[{][^{}]*[}]' | |
| 131 | +#SINGLE_RTF_TAG = r'[{][^{}]*[}]' | |
| 132 | +# Actually RTF tags may contain braces escaped with backslash (\{ \}): | |
| 133 | +SINGLE_RTF_TAG = r'[{](?:\\.|[^{}\\])*[}]' | |
| 134 | + | |
| 129 | 135 | # Nested tags, two levels (because Python's re does not support nested matching): |
| 130 | -NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]' | |
| 136 | +# NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]' | |
| 137 | +NESTED_RTF_TAG = r'[{](?:\\.|[^{}\\]|'+SINGLE_RTF_TAG+r')*[}]' | |
| 138 | + | |
| 139 | +# AND it is also allowed to insert ANY control word or control symbol (ignored) | |
| 140 | +# According to Rich Text Format (RTF) Specification Version 1.9.1, | |
| 141 | +# section "Control Word": | |
| 142 | +# control word = \<ASCII Letter [a-zA-Z] Sequence max 32><Delimiter> | |
| 143 | +# delimiter = space, OR signed integer followed by any non-digit, | |
| 144 | +# OR any character except letter and digit | |
| 145 | +# examples of valid control words: | |
| 146 | +# "\AnyThing " "\AnyThing123z" "\AnyThing-456{" "\AnyThing{" | |
| 147 | +# control symbol = \<any char except letter or digit> (followed by anything) | |
| 148 | + | |
| 149 | +ASCII_NAME = r'([a-zA-Z]{1,250})' | |
| 150 | + | |
| 151 | +# using Python's re lookahead assertion: | |
| 152 | +# (?=...) Matches if ... matches next, but doesn't consume any of the string. | |
| 153 | +# This is called a lookahead assertion. For example, Isaac (?=Asimov) will | |
| 154 | +# match 'Isaac ' only if it's followed by 'Asimov'. | |
| 155 | + | |
| 156 | +# TODO: Find the actual limit on the number of digits for Word | |
| 157 | +# SIGNED_INTEGER = r'(-?\d{1,250})' | |
| 158 | +SIGNED_INTEGER = r'(-?\d+)' | |
| 159 | + | |
| 160 | +CONTROL_WORD = r'(?:\\' + ASCII_NAME + r'(?:(?=[^a-zA-Z0-9-])|' + SIGNED_INTEGER + r'(?=[^0-9])))' | |
| 161 | +re_control_word = re.compile(CONTROL_WORD) | |
| 162 | + | |
| 163 | +CONTROL_SYMBOL = r'(?:\\[^a-zA-Z0-9])' | |
| 164 | +re_control_symbol = re.compile(CONTROL_SYMBOL) | |
| 165 | + | |
| 166 | +# Text that is not a control word/symbol or a group: | |
| 167 | +TEXT = r'[^{}\\]+' | |
| 168 | +re_text = re.compile(TEXT) | |
| 169 | + | |
| 131 | 170 | # ignored whitespaces and tags within a hex block: |
| 132 | -IGNORED = r'(?:\s|'+NESTED_RTF_TAG+r')*' | |
| 171 | +IGNORED = r'(?:\s|'+NESTED_RTF_TAG+'|'+CONTROL_SYMBOL+'|'+CONTROL_WORD+r')*' | |
| 133 | 172 | #IGNORED = r'\s*' |
| 134 | 173 | |
| 135 | 174 | # HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT |
| ... | ... | @@ -175,6 +214,316 @@ re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + BACKSLASH_BIN |
| 175 | 214 | + DECIMAL_GROUP + DELIMITER) |
| 176 | 215 | re_delim_hexblock = re.compile(DELIMITER + PATTERN) |
| 177 | 216 | |
# Destination Control Words, according to MS RTF Specifications v1.9.1.
# RtfParser._control_word() tests each control word name for membership in this
# set, and opens a new Destination when the name is found. The names are
# compared exactly as written here (case-sensitive, without the backslash).
DESTINATION_CONTROL_WORDS = frozenset((
    "aftncn", "aftnsep", "aftnsepc", "annotation", "atnauthor", "atndate", "atnicn", "atnid", "atnparent", "atnref",
    "atntime", "atrfend", "atrfstart", "author", "background", "bkmkend", "bkmkstart", "blipuid", "buptim", "category",
    "colorschememapping", "colortbl", "comment", "company", "creatim", "datafield", "datastore", "defchp", "defpap",
    "do", "doccomm", "docvar", "dptxbxtext", "ebcend", "ebcstart", "factoidname", "falt", "fchars", "ffdeftext",
    "ffentrymcr", "ffexitmcr", "ffformat", "ffhelptext", "ffl", "ffname", "ffstattext", "field", "file", "filetbl",
    "fldinst", "fldrslt", "fldtype", "fname", "fontemb", "fontfile", "fonttbl", "footer", "footerf", "footerl",
    "footerr", "footnote", "formfield", "ftncn", "ftnsep", "ftnsepc", "g", "generator", "gridtbl", "header", "headerf",
    "headerl", "headerr", "hl", "hlfr", "hlinkbase", "hlloc", "hlsrc", "hsv", "htmltag", "info", "keycode", "keywords",
    "latentstyles", "lchars", "levelnumbers", "leveltext", "lfolevel", "linkval", "list", "listlevel", "listname",
    "listoverride", "listoverridetable", "listpicture", "liststylename", "listtable", "listtext", "lsdlockedexcept",
    "macc", "maccPr", "mailmerge", "maln", "malnScr", "manager", "margPr", "mbar", "mbarPr", "mbaseJc", "mbegChr",
    "mborderBox", "mborderBoxPr", "mbox", "mboxPr", "mchr", "mcount", "mctrlPr", "md", "mdeg", "mdegHide", "mden",
    "mdiff", "mdPr", "me", "mendChr", "meqArr", "meqArrPr", "mf", "mfName", "mfPr", "mfunc", "mfuncPr", "mgroupChr",
    "mgroupChrPr", "mgrow", "mhideBot", "mhideLeft", "mhideRight", "mhideTop", "mhtmltag", "mlim", "mlimloc", "mlimlow",
    "mlimlowPr", "mlimupp", "mlimuppPr", "mm", "mmaddfieldname", "mmath", "mmathPict", "mmathPr", "mmaxdist", "mmc",
    "mmcJc", "mmconnectstr", "mmconnectstrdata", "mmcPr", "mmcs", "mmdatasource", "mmheadersource", "mmmailsubject",
    "mmodso", "mmodsofilter", "mmodsofldmpdata", "mmodsomappedname", "mmodsoname", "mmodsorecipdata", "mmodsosort",
    "mmodsosrc", "mmodsotable", "mmodsoudl", "mmodsoudldata", "mmodsouniquetag", "mmPr", "mmquery", "mmr", "mnary",
    "mnaryPr", "mnoBreak", "mnum", "mobjDist", "moMath", "moMathPara", "moMathParaPr", "mopEmu", "mphant", "mphantPr",
    "mplcHide", "mpos", "mr", "mrad", "mradPr", "mrPr", "msepChr", "mshow", "mshp", "msPre", "msPrePr", "msSub",
    "msSubPr", "msSubSup", "msSubSupPr", "msSup", "msSupPr", "mstrikeBLTR", "mstrikeH", "mstrikeTLBR", "mstrikeV",
    "msub", "msubHide", "msup", "msupHide", "mtransp", "mtype", "mvertJc", "mvfmf", "mvfml", "mvtof", "mvtol",
    "mzeroAsc", "mzeroDesc", "mzeroWid", "nesttableprops", "nextfile", "nonesttables", "objalias", "objclass",
    "objdata", "object", "objname", "objsect", "objtime", "oldcprops", "oldpprops", "oldsprops", "oldtprops",
    "oleclsid", "operator", "panose", "password", "passwordhash", "pgp", "pgptbl", "picprop", "pict", "pn", "pnseclvl",
    "pntext", "pntxta", "pntxtb", "printim", "private", "propname", "protend", "protstart", "protusertbl", "pxe",
    "result", "revtbl", "revtim", "rsidtbl", "rtf", "rxe", "shp", "shpgrp", "shpinst", "shppict", "shprslt", "shptxt",
    "sn", "sp", "staticval", "stylesheet", "subject", "sv", "svb", "tc", "template", "themedata", "title", "txe", "ud",
    "upr", "userprops", "wgrffmtfilter", "windowcaption", "writereservation", "writereservhash", "xe", "xform",
    "xmlattrname", "xmlattrvalue", "xmlclose", "xmlname", "xmlnstbl", "xmlopen"
    ))
| 250 | + | |
| 251 | + | |
| 252 | + | |
| 253 | +#=== CLASSES ================================================================= | |
| 254 | + | |
class Destination(object):
    """
    Record of one RTF destination: the control word that opened it, the
    text collected for it and its position in the parsed data.
    """
    def __init__(self, cword=None):
        # group nesting level at which the destination was opened
        # (0 until the parser sets it):
        self.group_level = 0
        # indexes in the RTF data where the destination starts and ends,
        # unknown until the parser fills them in:
        self.start = self.end = None
        # text accumulated for this destination:
        self.data = ''
        # name of the destination control word (None for the default
        # document destination):
        self.cword = cword
| 265 | + | |
| 266 | + | |
| 267 | +# class Group(object): | |
| 268 | +# """ | |
| 269 | +# Stores the data associated with a group between braces {...} | |
| 270 | +# """ | |
| 271 | +# def __init__(self, cword=None): | |
| 272 | +# self.start = None | |
| 273 | +# self.end = None | |
| 274 | +# self.level = None | |
| 275 | + | |
| 276 | + | |
| 277 | + | |
class RtfParser(object):
    """
    Very simple generic RTF parser

    The data is scanned once from start to end. For each item found (group
    opening/closing brace, control word, control symbol, text, binary data)
    an internal method whose name starts with an underscore updates the
    parser state, then calls the public method of the same name. The public
    methods (open_group, close_group, open_destination, close_destination,
    control_word, control_symbol, text, bin, end_of_file) do nothing here:
    they are hooks meant to be overridden in subclasses.
    """

    def __init__(self, data):
        """
        :param data: content of the RTF file to be parsed, as a string
        """
        self.data = data
        self.index = 0
        self.size = len(data)
        self.group_level = 0
        # default destination for the document text:
        document_destination = Destination()
        # stack of open destinations, the current one is the last item:
        self.destinations = [document_destination]
        self.current_destination = document_destination

    def parse(self):
        """
        Parse the whole RTF data, calling the hook methods along the way.

        When the end of the data is reached, every group still open is
        closed (which also closes the corresponding destinations) and the
        end_of_file hook is called.

        :raises RuntimeError: if the data at the current index matches none
            of the known items, for example a backslash which starts neither
            a control word nor a control symbol.
        """
        self.index = 0
        while self.index < self.size:
            char = self.data[self.index]
            if char == '{':
                self._open_group()
                self.index += 1
                continue
            if char == '}':
                self._close_group()
                self.index += 1
                continue
            if char == '\\':
                m = re_control_word.match(self.data, self.index)
                if m:
                    cword = m.group(1)
                    # group 2 is the optional numeric parameter, None when
                    # the control word does not have one:
                    param = m.group(2)
                    # log.debug('control word %r at index %Xh - cword=%r param=%r' % (m.group(), self.index, cword, param))
                    self._control_word(m, cword, param)
                    self.index += len(m.group())
                    # if it's \bin, call _bin after updating index
                    if cword == 'bin':
                        self._bin(m, param)
                    continue
                m = re_control_symbol.match(self.data, self.index)
                if m:
                    self.control_symbol(m)
                    self.index += len(m.group())
                    continue
            m = re_text.match(self.data, self.index)
            if m:
                self._text(m)
                self.index += len(m.group())
                continue
            raise RuntimeError('Should not have reached this point - index=%Xh' % self.index)
        # Use the internal method, so that groups and destinations which are
        # still open get closed before the end_of_file hook is called:
        self._end_of_file()

    def _open_group(self):
        """
        Handle an opening brace: increase the group level, call open_group.
        """
        self.group_level += 1
        log.debug('{ Open Group at index %Xh - level=%d' % (self.index, self.group_level))
        # call user method AFTER increasing the level:
        self.open_group()

    def open_group(self):
        """
        Hook called after a group has been opened.
        """
        #log.debug('open group at index %Xh' % self.index)
        pass

    def _close_group(self):
        """
        Handle a closing brace: call close_group, close the current
        destination if it was opened at this group level, then decrease the
        group level.
        """
        log.debug('} Close Group at index %Xh - level=%d' % (self.index, self.group_level))
        # call user method BEFORE decreasing the level:
        self.close_group()
        # if the destination level is the same as the group level, close the destination:
        if self.group_level == self.current_destination.group_level:
            log.debug('Current Destination %r level = %d => Close Destination' % (
                self.current_destination.cword, self.current_destination.group_level))
            self._close_destination()
        else:
            log.debug('Current Destination %r level = %d => Continue with same Destination' % (
                self.current_destination.cword, self.current_destination.group_level))
        self.group_level -= 1
        log.debug('Decreased group level to %d' % self.group_level)

    def close_group(self):
        """
        Hook called before a group is closed.
        """
        #log.debug('close group at index %Xh' % self.index)
        pass

    def _open_destination(self, matchobject, cword):
        """
        Start a new destination at the current group level, push it on the
        stack of destinations and call open_destination.

        :param matchobject: regex match object of the destination control word
        :param cword: name of the destination control word
        """
        # if the current destination is at the same group level, close it first:
        if self.current_destination.group_level == self.group_level:
            self._close_destination()
        new_dest = Destination(cword)
        new_dest.group_level = self.group_level
        self.destinations.append(new_dest)
        self.current_destination = new_dest
        # start of the destination is right after the control word:
        new_dest.start = self.index + len(matchobject.group())
        log.debug("Open Destination %r start=%Xh - level=%d" % (cword, new_dest.start, new_dest.group_level))
        # call the corresponding user method for additional processing:
        self.open_destination(self.current_destination)

    def open_destination(self, destination):
        """
        Hook called after a destination has been opened.

        :param destination: the new Destination object
        """
        pass

    def _close_destination(self):
        """
        Set the end index of the current destination, call close_destination,
        then go back to the previous destination on the stack (if any).
        """
        log.debug("Close Destination %r end=%Xh - level=%d" % (self.current_destination.cword,
            self.index, self.current_destination.group_level))
        self.current_destination.end = self.index
        # call the corresponding user method for additional processing:
        self.close_destination(self.current_destination)
        if self.destinations:
            # remove the current destination from the stack, and go back to the previous one:
            self.destinations.pop()
        if self.destinations:
            self.current_destination = self.destinations[-1]
        else:
            log.debug('All destinations are closed, keeping the document destination open')

    def close_destination(self, destination):
        """
        Hook called when a destination is closed, after its end index is set.

        :param destination: the Destination object being closed
        """
        pass

    def _control_word(self, matchobject, cword, param):
        """
        Handle a control word: open a new destination if it is a destination
        control word, then call control_word.

        :param matchobject: regex match object of the control word
        :param cword: name of the control word, without the backslash
        :param param: numeric parameter as a string, or None
        """
        #log.debug('control word %r at index %Xh' % (matchobject.group(), self.index))
        if cword in DESTINATION_CONTROL_WORDS:
            # log.debug('%r is a destination control word: starting a new destination' % cword)
            self._open_destination(matchobject, cword)
        # call the corresponding user method for additional processing:
        self.control_word(matchobject, cword, param)

    def control_word(self, matchobject, cword, param):
        """
        Hook called for each control word (same parameters as _control_word).
        """
        pass

    def control_symbol(self, matchobject):
        """
        Hook called for each control symbol.

        :param matchobject: regex match object of the control symbol
        """
        #log.debug('control symbol %r at index %Xh' % (matchobject.group(), self.index))
        pass

    def _text(self, matchobject):
        """
        Handle text: append it to the data of the current destination, then
        call text.

        :param matchobject: regex match object of the text
        """
        text = matchobject.group()
        self.current_destination.data += text
        self.text(matchobject, text)

    def text(self, matchobject, text):
        """
        Hook called for each run of text.

        :param matchobject: regex match object of the text
        :param text: the matched text
        """
        #log.debug('text %r at index %Xh' % (matchobject.group(), self.index))
        pass

    def _bin(self, matchobject, param):
        """
        Handle a \\bin control word: read the binary data which follows it,
        move the index after that data, then call bin.

        A missing or negative length is handled as zero, so that the index
        never moves backwards (which would make parse() loop forever).
        If the length is greater than the remaining data, only the remaining
        data is read.

        :param matchobject: regex match object of the \\bin control word
        :param param: length of the binary data as a string, or None
        """
        if param is None:
            log.debug('\\bin without length at index %Xh, using zero' % self.index)
            binlen = 0
        else:
            binlen = int(param)
        if binlen < 0:
            log.debug('\\bin with negative length %d at index %Xh, using zero' % (binlen, self.index))
            binlen = 0
        log.debug('\\bin: reading %d bytes of binary data' % binlen)
        # TODO: handle optional space?
        bindata = self.data[self.index:self.index + binlen]
        self.index += binlen
        self.bin(bindata)

    def bin(self, bindata):
        """
        Hook called with the binary data found after a \\bin control word.

        :param bindata: the binary data
        """
        pass

    def _end_of_file(self):
        """
        Handle the end of the data: close all the groups which are still
        open, then call end_of_file.
        """
        log.debug('%Xh Reached End of File' % self.index)
        # close any group/destination that is still open:
        while self.group_level > 0:
            log.debug('Group Level = %d, closing group' % self.group_level)
            self._close_group()
        self.end_of_file()

    def end_of_file(self):
        """
        Hook called once the end of the data has been reached.
        """
        pass
| 441 | + | |
| 442 | + | |
class RtfObjParser(RtfParser):
    """
    Specialized RTF parser to extract OLE objects

    Each time an objdata destination is closed, the hexadecimal data
    collected for it is decoded and saved to a file, then parsed as an
    OLE 1.0 object in order to save the data it contains.
    """

    def __init__(self, data, fname_prefix='rtf'):
        """
        :param data: content of the RTF file to be parsed, as a string
        :param fname_prefix: prefix of the names of all the files written
        """
        super(RtfObjParser, self).__init__(data)
        self.fname_prefix = fname_prefix

    @staticmethod
    def _write_file(fname, data):
        """
        Write data to a new binary file, and make sure the file is closed.
        """
        with open(fname, 'wb') as f:
            f.write(data)

    def open_destination(self, destination):
        """
        Log the start of each objdata destination.
        """
        if destination.cword == 'objdata':
            log.debug('*** Start object data at index %Xh' % destination.start)

    def close_destination(self, destination):
        """
        When an objdata destination is closed, decode its hex data, save the
        raw object to a file and try to extract the embedded OLE object.

        :param destination: the Destination object being closed
        """
        if destination.cword != 'objdata':
            return
        log.debug('*** Close object data at index %Xh' % self.index)
        # Filter out all whitespaces first (just ignored):
        hexdata1 = destination.data.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
        # Then filter out any other non-hex character
        # (hex digits are 0-9 and A-F only, anything else would make
        # unhexlify fail):
        hexdata = re.sub(r'[^a-fA-F0-9]', '', hexdata1)
        if len(hexdata) < len(hexdata1):
            # this is only for debugging:
            nonhex = re.sub(r'[a-fA-F0-9]', '', hexdata1)
            log.debug('Found non-hex chars in hexdata: %r' % nonhex)
        # MS Word accepts an extra hex digit, so we need to trim it if present:
        if len(hexdata) & 1:
            log.debug('Odd length, trimmed last byte.')
            hexdata = hexdata[:-1]
        object_data = binascii.unhexlify(hexdata)
        print('found object size %d at index %08X - end %08X' % (len(object_data),
              destination.start, self.index))
        fname = '%s_object_%08X.raw' % (self.fname_prefix, destination.start)
        print('saving object to file %s' % fname)
        self._write_file(fname, object_data)
        # TODO: check if all hex data is extracted properly

        obj = OleObject()
        try:
            obj.parse(object_data)
            print('extract file embedded in OLE object:')
            print('format_id  = %d' % obj.format_id)
            print('class name = %r' % obj.class_name)
            print('data size  = %d' % obj.data_size)
            # set a file extension according to the class name:
            class_name = obj.class_name.lower()
            if class_name.startswith('word'):
                ext = 'doc'
            elif class_name.startswith('package'):
                ext = 'package'
            else:
                ext = 'bin'

            fname = '%s_object_%08X.%s' % (self.fname_prefix, destination.start, ext)
            print('saving to file %s' % fname)
            self._write_file(fname, obj.data)
            if class_name == 'package':
                print('Parsing OLE Package')
                opkg = OleNativeStream(bindata=obj.data)
                print('Filename = %r' % opkg.filename)
                print('Source path = %r' % opkg.src_path)
                print('Temp path = %r' % opkg.temp_path)
                if opkg.filename:
                    fname = '%s_%s' % (self.fname_prefix,
                                       sanitize_filename(opkg.filename))
                else:
                    fname = '%s_object_%08X.noname' % (self.fname_prefix, destination.start)
                print('saving to file %s' % fname)
                self._write_file(fname, opkg.data)
        except Exception:
            # best effort: the raw object has already been saved above, so
            # only log the error (with its traceback) and carry on parsing.
            log.exception('*** Not an OLE 1.0 Object')

    def bin(self, bindata):
        """
        Add binary data found within objdata to the hex data of the object.

        :param bindata: the binary data found after a \\bin control word
        """
        if self.current_destination.cword == 'objdata':
            # TODO: keep track of this, because it is unusual and indicates potential obfuscation
            # trick: hexlify binary data, add it to hex data
            self.current_destination.data += binascii.hexlify(bindata)

    def control_word(self, matchobject, cword, param):
        """
        Nothing is done with control words for now, see the TODOs below.
        """
        # TODO: extract useful cwords such as objclass
        # TODO: keep track of cwords inside objdata, because it is unusual and indicates potential obfuscation
        # TODO: same with control symbols, and opening bracket
        pass
| 526 | + | |
| 178 | 527 | |
| 179 | 528 | #=== FUNCTIONS =============================================================== |
| 180 | 529 | |
| ... | ... | @@ -329,50 +678,53 @@ def process_file(container, filename, data, output_dir=None): |
| 329 | 678 | # TODO: option to extract objects to files (false by default) |
| 330 | 679 | if data is None: |
| 331 | 680 | data = open(filename, 'rb').read() |
| 332 | - print '-'*79 | |
| 333 | - print 'File: %r - %d bytes' % (filename, len(data)) | |
| 334 | - for index, orig_len, objdata in rtf_iter_objects(data): | |
| 335 | - print 'found object size %d at index %08X - end %08X' % (len(objdata), index, index+orig_len) | |
| 336 | - fname = '%s_object_%08X.raw' % (fname_prefix, index) | |
| 337 | - print 'saving object to file %s' % fname | |
| 338 | - open(fname, 'wb').write(objdata) | |
| 339 | - # TODO: check if all hex data is extracted properly | |
| 340 | - | |
| 341 | - obj = OleObject() | |
| 342 | - try: | |
| 343 | - obj.parse(objdata) | |
| 344 | - print 'extract file embedded in OLE object:' | |
| 345 | - print 'format_id = %d' % obj.format_id | |
| 346 | - print 'class name = %r' % obj.class_name | |
| 347 | - print 'data size = %d' % obj.data_size | |
| 348 | - # set a file extension according to the class name: | |
| 349 | - class_name = obj.class_name.lower() | |
| 350 | - if class_name.startswith('word'): | |
| 351 | - ext = 'doc' | |
| 352 | - elif class_name.startswith('package'): | |
| 353 | - ext = 'package' | |
| 354 | - else: | |
| 355 | - ext = 'bin' | |
| 356 | - | |
| 357 | - fname = '%s_object_%08X.%s' % (fname_prefix, index, ext) | |
| 358 | - print 'saving to file %s' % fname | |
| 359 | - open(fname, 'wb').write(obj.data) | |
| 360 | - if obj.class_name.lower() == 'package': | |
| 361 | - print 'Parsing OLE Package' | |
| 362 | - opkg = OleNativeStream(bindata=obj.data) | |
| 363 | - print 'Filename = %r' % opkg.filename | |
| 364 | - print 'Source path = %r' % opkg.src_path | |
| 365 | - print 'Temp path = %r' % opkg.temp_path | |
| 366 | - if opkg.filename: | |
| 367 | - fname = '%s_%s' % (fname_prefix, | |
| 368 | - sanitize_filename(opkg.filename)) | |
| 369 | - else: | |
| 370 | - fname = '%s_object_%08X.noname' % (fname_prefix, index) | |
| 371 | - print 'saving to file %s' % fname | |
| 372 | - open(fname, 'wb').write(opkg.data) | |
| 373 | - except: | |
| 374 | - pass | |
| 375 | - log.exception('*** Not an OLE 1.0 Object') | |
| 681 | + rtfp = RtfObjParser(data, fname_prefix) | |
| 682 | + rtfp.parse() | |
| 683 | + | |
| 684 | + # print '-'*79 | |
| 685 | + # print 'File: %r - %d bytes' % (filename, len(data)) | |
| 686 | + # for index, orig_len, objdata in rtf_iter_objects(data): | |
| 687 | + # print 'found object size %d at index %08X - end %08X' % (len(objdata), index, index+orig_len) | |
| 688 | + # fname = '%s_object_%08X.raw' % (fname_prefix, index) | |
| 689 | + # print 'saving object to file %s' % fname | |
| 690 | + # open(fname, 'wb').write(objdata) | |
| 691 | + # # TODO: check if all hex data is extracted properly | |
| 692 | + # | |
| 693 | + # obj = OleObject() | |
| 694 | + # try: | |
| 695 | + # obj.parse(objdata) | |
| 696 | + # print 'extract file embedded in OLE object:' | |
| 697 | + # print 'format_id = %d' % obj.format_id | |
| 698 | + # print 'class name = %r' % obj.class_name | |
| 699 | + # print 'data size = %d' % obj.data_size | |
| 700 | + # # set a file extension according to the class name: | |
| 701 | + # class_name = obj.class_name.lower() | |
| 702 | + # if class_name.startswith('word'): | |
| 703 | + # ext = 'doc' | |
| 704 | + # elif class_name.startswith('package'): | |
| 705 | + # ext = 'package' | |
| 706 | + # else: | |
| 707 | + # ext = 'bin' | |
| 708 | + # | |
| 709 | + # fname = '%s_object_%08X.%s' % (fname_prefix, index, ext) | |
| 710 | + # print 'saving to file %s' % fname | |
| 711 | + # open(fname, 'wb').write(obj.data) | |
| 712 | + # if obj.class_name.lower() == 'package': | |
| 713 | + # print 'Parsing OLE Package' | |
| 714 | + # opkg = OleNativeStream(bindata=obj.data) | |
| 715 | + # print 'Filename = %r' % opkg.filename | |
| 716 | + # print 'Source path = %r' % opkg.src_path | |
| 717 | + # print 'Temp path = %r' % opkg.temp_path | |
| 718 | + # if opkg.filename: | |
| 719 | + # fname = '%s_%s' % (fname_prefix, | |
| 720 | + # sanitize_filename(opkg.filename)) | |
| 721 | + # else: | |
| 722 | + # fname = '%s_object_%08X.noname' % (fname_prefix, index) | |
| 723 | + # print 'saving to file %s' % fname | |
| 724 | + # open(fname, 'wb').write(opkg.data) | |
| 725 | + # except: | |
| 726 | + # pass | |
| 727 | + # log.exception('*** Not an OLE 1.0 Object') | |
| 376 | 728 | |
| 377 | 729 | |
| 378 | 730 | |
| ... | ... | @@ -414,7 +766,7 @@ if __name__ == '__main__': |
| 414 | 766 | |
| 415 | 767 | # Print help if no arguments are passed |
| 416 | 768 | if len(args) == 0: |
| 417 | - print __doc__ | |
| 769 | + print (__doc__) | |
| 418 | 770 | parser.print_help() |
| 419 | 771 | sys.exit() |
| 420 | 772 | |
| ... | ... | @@ -436,5 +788,5 @@ if __name__ == '__main__': |
| 436 | 788 | process_file(container, filename, data, options.output_dir) |
| 437 | 789 | |
| 438 | 790 | |
| 439 | - | |
| 791 | +# This code was developed while listening to The Mary Onettes "Lost" | |
| 440 | 792 | ... | ... |