Commit 2685c6f5b70d40c7606fb55773f7ca8159130520
1 parent
44ec0bd8
rtfobj: new RtfParser and RtfObjParser classes - a more complete RTF parser to support tricky edge cases exploited by malware
Showing 1 changed file with 403 additions and 51 deletions
oletools/rtfobj.py
| 1 | #!/usr/bin/env python | 1 | #!/usr/bin/env python |
| 2 | +from __future__ import print_function | ||
| 3 | + | ||
| 2 | """ | 4 | """ |
| 3 | rtfobj.py | 5 | rtfobj.py |
| 4 | 6 | ||
| @@ -52,12 +54,12 @@ http://www.decalage.info/python/oletools | @@ -52,12 +54,12 @@ http://www.decalage.info/python/oletools | ||
| 52 | # (contribution by Thomas Jarosch) | 54 | # (contribution by Thomas Jarosch) |
| 53 | # TJ: - sanitize filenames to avoid special characters | 55 | # TJ: - sanitize filenames to avoid special characters |
| 54 | # 2016-05-29 PL: - improved parsing, fixed issue #42 | 56 | # 2016-05-29 PL: - improved parsing, fixed issue #42 |
| 57 | +# 2016-07-13 v0.48 PL: - new RtfParser and RtfObjParser classes | ||
| 55 | 58 | ||
| 56 | -__version__ = '0.47' | 59 | +__version__ = '0.48' |
| 57 | 60 | ||
| 58 | #------------------------------------------------------------------------------ | 61 | #------------------------------------------------------------------------------ |
| 59 | # TODO: | 62 | # TODO: |
| 60 | -# - improve regex pattern for better performance? | ||
| 61 | # - allow semicolon within hex, as found in this sample: | 63 | # - allow semicolon within hex, as found in this sample: |
| 62 | # http://contagiodump.blogspot.nl/2011/10/sep-28-cve-2010-3333-manuscript-with.html | 64 | # http://contagiodump.blogspot.nl/2011/10/sep-28-cve-2010-3333-manuscript-with.html |
| 63 | 65 | ||
| @@ -70,6 +72,7 @@ from thirdparty.xglob import xglob | @@ -70,6 +72,7 @@ from thirdparty.xglob import xglob | ||
| 70 | from oleobj import OleObject, OleNativeStream | 72 | from oleobj import OleObject, OleNativeStream |
| 71 | import oleobj | 73 | import oleobj |
| 72 | 74 | ||
| 75 | + | ||
| 73 | # === LOGGING ================================================================= | 76 | # === LOGGING ================================================================= |
| 74 | 77 | ||
| 75 | class NullHandler(logging.Handler): | 78 | class NullHandler(logging.Handler): |
| @@ -125,11 +128,47 @@ HEX_DIGIT = r'[0-9A-Fa-f]' | @@ -125,11 +128,47 @@ HEX_DIGIT = r'[0-9A-Fa-f]' | ||
| 125 | # HEX_CHAR = r'[0-9A-Fa-f]\s*[0-9A-Fa-f]' | 128 | # HEX_CHAR = r'[0-9A-Fa-f]\s*[0-9A-Fa-f]' |
| 126 | # Even worse, MS Word also allows ANY RTF-style tag {*} in between!! | 129 | # Even worse, MS Word also allows ANY RTF-style tag {*} in between!! |
| 127 | # AND the tags can be nested... | 130 | # AND the tags can be nested... |
| 128 | -SINGLE_RTF_TAG = r'[{][^{}]*[}]' | 131 | +#SINGLE_RTF_TAG = r'[{][^{}]*[}]' |
| 132 | +# Actually RTF tags may contain braces escaped with backslash (\{ \}): | ||
| 133 | +SINGLE_RTF_TAG = r'[{](?:\\.|[^{}\\])*[}]' | ||
| 134 | + | ||
| 129 | # Nested tags, two levels (because Python's re does not support nested matching): | 135 | # Nested tags, two levels (because Python's re does not support nested matching): |
| 130 | -NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]' | 136 | +# NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]' |
| 137 | +NESTED_RTF_TAG = r'[{](?:\\.|[^{}\\]|'+SINGLE_RTF_TAG+r')*[}]' | ||
| 138 | + | ||
| 139 | +# AND it is also allowed to insert ANY control word or control symbol (ignored) | ||
| 140 | +# According to Rich Text Format (RTF) Specification Version 1.9.1, | ||
| 141 | +# section "Control Word": | ||
| 142 | +# control word = \<ASCII Letter [a-zA-Z] Sequence max 32><Delimiter> | ||
| 143 | +# delimiter = space, OR signed integer followed by any non-digit, | ||
| 144 | +# OR any character except letter and digit | ||
| 145 | +# examples of valid control words: | ||
| 146 | +#     "\AnyThing "  "\AnyThing123z"  "\AnyThing-456{"  "\AnyThing{" | ||
| 147 | +# control symbol = \<any char except letter or digit> (followed by anything) | ||
| 148 | + | ||
| 149 | +ASCII_NAME = r'([a-zA-Z]{1,250})' | ||
| 150 | + | ||
| 151 | +# using Python's re lookahead assertion: | ||
| 152 | +# (?=...) Matches if ... matches next, but doesn't consume any of the string. | ||
| 153 | +# This is called a lookahead assertion. For example, Isaac (?=Asimov) will | ||
| 154 | +# match 'Isaac ' only if it's followed by 'Asimov'. | ||
| 155 | + | ||
| 156 | +# TODO: Find the actual limit on the number of digits for Word | ||
| 157 | +# SIGNED_INTEGER = r'(-?\d{1,250})' | ||
| 158 | +SIGNED_INTEGER = r'(-?\d+)' | ||
| 159 | + | ||
| 160 | +CONTROL_WORD = r'(?:\\' + ASCII_NAME + r'(?:(?=[^a-zA-Z0-9-])|' + SIGNED_INTEGER + r'(?=[^0-9])))' | ||
| 161 | +re_control_word = re.compile(CONTROL_WORD) | ||
| 162 | + | ||
| 163 | +CONTROL_SYMBOL = r'(?:\\[^a-zA-Z0-9])' | ||
| 164 | +re_control_symbol = re.compile(CONTROL_SYMBOL) | ||
| 165 | + | ||
| 166 | +# Text that is not a control word/symbol or a group: | ||
| 167 | +TEXT = r'[^{}\\]+' | ||
| 168 | +re_text = re.compile(TEXT) | ||
| 169 | + | ||
| 131 | # ignored whitespaces and tags within a hex block: | 170 | # ignored whitespaces and tags within a hex block: |
| 132 | -IGNORED = r'(?:\s|'+NESTED_RTF_TAG+r')*' | 171 | +IGNORED = r'(?:\s|'+NESTED_RTF_TAG+'|'+CONTROL_SYMBOL+'|'+CONTROL_WORD+r')*' |
| 133 | #IGNORED = r'\s*' | 172 | #IGNORED = r'\s*' |
| 134 | 173 | ||
| 135 | # HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT | 174 | # HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT |
| @@ -175,6 +214,316 @@ re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + BACKSLASH_BIN | @@ -175,6 +214,316 @@ re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + BACKSLASH_BIN | ||
| 175 | + DECIMAL_GROUP + DELIMITER) | 214 | + DECIMAL_GROUP + DELIMITER) |
| 176 | re_delim_hexblock = re.compile(DELIMITER + PATTERN) | 215 | re_delim_hexblock = re.compile(DELIMITER + PATTERN) |
| 177 | 216 | ||
| 217 | +# Destination Control Words, according to MS RTF Specifications v1.9.1: | ||
| 218 | +DESTINATION_CONTROL_WORDS = frozenset(( | ||
| 219 | + "aftncn", "aftnsep", "aftnsepc", "annotation", "atnauthor", "atndate", "atnicn", "atnid", "atnparent", "atnref", | ||
| 220 | + "atntime", "atrfend", "atrfstart", "author", "background", "bkmkend", "bkmkstart", "blipuid", "buptim", "category", | ||
| 221 | + "colorschememapping", "colortbl", "comment", "company", "creatim", "datafield", "datastore", "defchp", "defpap", | ||
| 222 | + "do", "doccomm", "docvar", "dptxbxtext", "ebcend", "ebcstart", "factoidname", "falt", "fchars", "ffdeftext", | ||
| 223 | + "ffentrymcr", "ffexitmcr", "ffformat", "ffhelptext", "ffl", "ffname", "ffstattext", "field", "file", "filetbl", | ||
| 224 | + "fldinst", "fldrslt", "fldtype", "fname", "fontemb", "fontfile", "fonttbl", "footer", "footerf", "footerl", | ||
| 225 | + "footerr", "footnote", "formfield", "ftncn", "ftnsep", "ftnsepc", "g", "generator", "gridtbl", "header", "headerf", | ||
| 226 | + "headerl", "headerr", "hl", "hlfr", "hlinkbase", "hlloc", "hlsrc", "hsv", "htmltag", "info", "keycode", "keywords", | ||
| 227 | + "latentstyles", "lchars", "levelnumbers", "leveltext", "lfolevel", "linkval", "list", "listlevel", "listname", | ||
| 228 | + "listoverride", "listoverridetable", "listpicture", "liststylename", "listtable", "listtext", "lsdlockedexcept", | ||
| 229 | + "macc", "maccPr", "mailmerge", "maln", "malnScr", "manager", "margPr", "mbar", "mbarPr", "mbaseJc", "mbegChr", | ||
| 230 | + "mborderBox", "mborderBoxPr", "mbox", "mboxPr", "mchr", "mcount", "mctrlPr", "md", "mdeg", "mdegHide", "mden", | ||
| 231 | + "mdiff", "mdPr", "me", "mendChr", "meqArr", "meqArrPr", "mf", "mfName", "mfPr", "mfunc", "mfuncPr", "mgroupChr", | ||
| 232 | + "mgroupChrPr", "mgrow", "mhideBot", "mhideLeft", "mhideRight", "mhideTop", "mhtmltag", "mlim", "mlimloc", "mlimlow", | ||
| 233 | + "mlimlowPr", "mlimupp", "mlimuppPr", "mm", "mmaddfieldname", "mmath", "mmathPict", "mmathPr", "mmaxdist", "mmc", | ||
| 234 | + "mmcJc", "mmconnectstr", "mmconnectstrdata", "mmcPr", "mmcs", "mmdatasource", "mmheadersource", "mmmailsubject", | ||
| 235 | + "mmodso", "mmodsofilter", "mmodsofldmpdata", "mmodsomappedname", "mmodsoname", "mmodsorecipdata", "mmodsosort", | ||
| 236 | + "mmodsosrc", "mmodsotable", "mmodsoudl", "mmodsoudldata", "mmodsouniquetag", "mmPr", "mmquery", "mmr", "mnary", | ||
| 237 | + "mnaryPr", "mnoBreak", "mnum", "mobjDist", "moMath", "moMathPara", "moMathParaPr", "mopEmu", "mphant", "mphantPr", | ||
| 238 | + "mplcHide", "mpos", "mr", "mrad", "mradPr", "mrPr", "msepChr", "mshow", "mshp", "msPre", "msPrePr", "msSub", | ||
| 239 | + "msSubPr", "msSubSup", "msSubSupPr", "msSup", "msSupPr", "mstrikeBLTR", "mstrikeH", "mstrikeTLBR", "mstrikeV", | ||
| 240 | + "msub", "msubHide", "msup", "msupHide", "mtransp", "mtype", "mvertJc", "mvfmf", "mvfml", "mvtof", "mvtol", | ||
| 241 | + "mzeroAsc", "mzeroDesc", "mzeroWid", "nesttableprops", "nextfile", "nonesttables", "objalias", "objclass", | ||
| 242 | + "objdata", "object", "objname", "objsect", "objtime", "oldcprops", "oldpprops", "oldsprops", "oldtprops", | ||
| 243 | + "oleclsid", "operator", "panose", "password", "passwordhash", "pgp", "pgptbl", "picprop", "pict", "pn", "pnseclvl", | ||
| 244 | + "pntext", "pntxta", "pntxtb", "printim", "private", "propname", "protend", "protstart", "protusertbl", "pxe", | ||
| 245 | + "result", "revtbl", "revtim", "rsidtbl", "rtf", "rxe", "shp", "shpgrp", "shpinst", "shppict", "shprslt", "shptxt", | ||
| 246 | + "sn", "sp", "staticval", "stylesheet", "subject", "sv", "svb", "tc", "template", "themedata", "title", "txe", "ud", | ||
| 247 | + "upr", "userprops", "wgrffmtfilter", "windowcaption", "writereservation", "writereservhash", "xe", "xform", | ||
| 248 | + "xmlattrname", "xmlattrvalue", "xmlclose", "xmlname", "xmlnstbl", "xmlopen" | ||
| 249 | + )) | ||
| 250 | + | ||
| 251 | + | ||
| 252 | + | ||
| 253 | +#=== CLASSES ================================================================= | ||
| 254 | + | ||
| 255 | +class Destination(object): | ||
| 256 | + """ | ||
| 257 | + Stores the data associated with a destination control word | ||
| 258 | + """ | ||
| 259 | + def __init__(self, cword=None): | ||
| 260 | + self.cword = cword | ||
| 261 | + self.data = '' | ||
| 262 | + self.start = None | ||
| 263 | + self.end = None | ||
| 264 | + self.group_level = 0 | ||
| 265 | + | ||
| 266 | + | ||
| 267 | +# class Group(object): | ||
| 268 | +# """ | ||
| 269 | +# Stores the data associated with a group between braces {...} | ||
| 270 | +# """ | ||
| 271 | +# def __init__(self, cword=None): | ||
| 272 | +# self.start = None | ||
| 273 | +# self.end = None | ||
| 274 | +# self.level = None | ||
| 275 | + | ||
| 276 | + | ||
| 277 | + | ||
| 278 | +class RtfParser(object): | ||
| 279 | + """ | ||
| 280 | + Very simple generic RTF parser | ||
| 281 | + """ | ||
| 282 | + | ||
| 283 | + def __init__(self, data): | ||
| 284 | + self.data = data | ||
| 285 | + self.index = 0 | ||
| 286 | + self.size = len(data) | ||
| 287 | + self.group_level = 0 | ||
| 288 | + # default destination for the document text: | ||
| 289 | + document_destination = Destination() | ||
| 290 | + self.destinations = [document_destination] | ||
| 291 | + self.current_destination = document_destination | ||
| 292 | + | ||
| 293 | + def parse(self): | ||
| 294 | + self.index = 0 | ||
| 295 | + while self.index < self.size: | ||
| 296 | + if self.data[self.index] == '{': | ||
| 297 | + self._open_group() | ||
| 298 | + self.index += 1 | ||
| 299 | + continue | ||
| 300 | + if self.data[self.index] == '}': | ||
| 301 | + self._close_group() | ||
| 302 | + self.index += 1 | ||
| 303 | + continue | ||
| 304 | + if self.data[self.index] == '\\': | ||
| 305 | + m = re_control_word.match(self.data, self.index) | ||
| 306 | + if m: | ||
| 307 | + cword = m.group(1) | ||
| 308 | + param = None | ||
| 309 | + if len(m.groups()) > 1: | ||
| 310 | + param = m.group(2) | ||
| 311 | + # log.debug('control word %r at index %Xh - cword=%r param=%r' % (m.group(), self.index, cword, param)) | ||
| 312 | + self._control_word(m, cword, param) | ||
| 313 | + self.index += len(m.group()) | ||
| 314 | + # if it's \bin, call _bin after updating index | ||
| 315 | + if cword == 'bin': | ||
| 316 | + self._bin(m, param) | ||
| 317 | + continue | ||
| 318 | + m = re_control_symbol.match(self.data, self.index) | ||
| 319 | + if m: | ||
| 320 | + self.control_symbol(m) | ||
| 321 | + self.index += len(m.group()) | ||
| 322 | + continue | ||
| 323 | + m = re_text.match(self.data, self.index) | ||
| 324 | + if m: | ||
| 325 | + self._text(m) | ||
| 326 | + self.index += len(m.group()) | ||
| 327 | + continue | ||
| 328 | + raise RuntimeError('Should not have reached this point - index=%Xh' % self.index) | ||
| 329 | + self.end_of_file() | ||
| 330 | + | ||
| 331 | + | ||
| 332 | + def _open_group(self): | ||
| 333 | + self.group_level += 1 | ||
| 334 | + log.debug('{ Open Group at index %Xh - level=%d' % (self.index, self.group_level)) | ||
| 335 | + # call user method AFTER increasing the level: | ||
| 336 | + self.open_group() | ||
| 337 | + | ||
| 338 | + def open_group(self): | ||
| 339 | + #log.debug('open group at index %Xh' % self.index) | ||
| 340 | + pass | ||
| 341 | + | ||
| 342 | + def _close_group(self): | ||
| 343 | + log.debug('} Close Group at index %Xh - level=%d' % (self.index, self.group_level)) | ||
| 344 | + # call user method BEFORE decreasing the level: | ||
| 345 | + self.close_group() | ||
| 346 | + # if the destination level is the same as the group level, close the destination: | ||
| 347 | + if self.group_level == self.current_destination.group_level: | ||
| 348 | + log.debug('Current Destination %r level = %d => Close Destination' % ( | ||
| 349 | + self.current_destination.cword, self.current_destination.group_level)) | ||
| 350 | + self._close_destination() | ||
| 351 | + else: | ||
| 352 | + log.debug('Current Destination %r level = %d => Continue with same Destination' % ( | ||
| 353 | + self.current_destination.cword, self.current_destination.group_level)) | ||
| 354 | + self.group_level -= 1 | ||
| 355 | + log.debug('Decreased group level to %d' % self.group_level) | ||
| 356 | + | ||
| 357 | + def close_group(self): | ||
| 358 | + #log.debug('close group at index %Xh' % self.index) | ||
| 359 | + pass | ||
| 360 | + | ||
| 361 | + def _open_destination(self, matchobject, cword): | ||
| 362 | + # if the current destination is at the same group level, close it first: | ||
| 363 | + if self.current_destination.group_level == self.group_level: | ||
| 364 | + self._close_destination() | ||
| 365 | + new_dest = Destination(cword) | ||
| 366 | + new_dest.group_level = self.group_level | ||
| 367 | + self.destinations.append(new_dest) | ||
| 368 | + self.current_destination = new_dest | ||
| 369 | + # start of the destination is right after the control word: | ||
| 370 | + new_dest.start = self.index + len(matchobject.group()) | ||
| 371 | + log.debug("Open Destination %r start=%Xh - level=%d" % (cword, new_dest.start, new_dest.group_level)) | ||
| 372 | + # call the corresponding user method for additional processing: | ||
| 373 | + self.open_destination(self.current_destination) | ||
| 374 | + | ||
| 375 | + def open_destination(self, destination): | ||
| 376 | + pass | ||
| 377 | + | ||
| 378 | + def _close_destination(self): | ||
| 379 | + log.debug("Close Destination %r end=%Xh - level=%d" % (self.current_destination.cword, | ||
| 380 | + self.index, self.current_destination.group_level)) | ||
| 381 | + self.current_destination.end = self.index | ||
| 382 | + # call the corresponding user method for additional processing: | ||
| 383 | + self.close_destination(self.current_destination) | ||
| 384 | + if len(self.destinations)>0: | ||
| 385 | + # remove the current destination from the stack, and go back to the previous one: | ||
| 386 | + self.destinations.pop() | ||
| 387 | + if len(self.destinations) > 0: | ||
| 388 | + self.current_destination = self.destinations[-1] | ||
| 389 | + else: | ||
| 390 | + log.debug('All destinations are closed, keeping the document destination open') | ||
| 391 | + | ||
| 392 | + def close_destination(self, destination): | ||
| 393 | + pass | ||
| 394 | + | ||
| 395 | + def _control_word(self, matchobject, cword, param): | ||
| 396 | + #log.debug('control word %r at index %Xh' % (matchobject.group(), self.index)) | ||
| 397 | + if cword in DESTINATION_CONTROL_WORDS: | ||
| 398 | + # log.debug('%r is a destination control word: starting a new destination' % cword) | ||
| 399 | + self._open_destination(matchobject, cword) | ||
| 400 | + # call the corresponding user method for additional processing: | ||
| 401 | + self.control_word(matchobject, cword, param) | ||
| 402 | + | ||
| 403 | + def control_word(self, matchobject, cword, param): | ||
| 404 | + pass | ||
| 405 | + | ||
| 406 | + def control_symbol(self, matchobject): | ||
| 407 | + #log.debug('control symbol %r at index %Xh' % (matchobject.group(), self.index)) | ||
| 408 | + pass | ||
| 409 | + | ||
| 410 | + def _text(self, matchobject): | ||
| 411 | + text = matchobject.group() | ||
| 412 | + self.current_destination.data += text | ||
| 413 | + self.text(matchobject, text) | ||
| 414 | + | ||
| 415 | + def text(self, matchobject, text): | ||
| 416 | + #log.debug('text %r at index %Xh' % (matchobject.group(), self.index)) | ||
| 417 | + pass | ||
| 418 | + | ||
| 419 | + def _bin(self, matchobject, param): | ||
| 420 | + binlen = int(param) | ||
| 421 | + log.debug('\\bin: reading %d bytes of binary data' % binlen) | ||
| 422 | + # TODO: handle optional space? | ||
| 423 | + # TODO: handle negative length, and length greater than data | ||
| 424 | + bindata = self.data[self.index:self.index + binlen] | ||
| 425 | + self.index += binlen | ||
| 426 | + self.bin(bindata) | ||
| 427 | + | ||
| 428 | + def bin(self, bindata): | ||
| 429 | + pass | ||
| 430 | + | ||
| 431 | + def _end_of_file(self): | ||
| 432 | + log.debug('%Xh Reached End of File') | ||
| 433 | + # close any group/destination that is still open: | ||
| 434 | + while self.group_level > 0: | ||
| 435 | + log.debug('Group Level = %d, closing group' % self.group_level) | ||
| 436 | + self._close_group() | ||
| 437 | + self.end_of_file() | ||
| 438 | + | ||
| 439 | + def end_of_file(self): | ||
| 440 | + pass | ||
| 441 | + | ||
| 442 | + | ||
| 443 | +class RtfObjParser(RtfParser): | ||
| 444 | + """ | ||
| 445 | + Specialized RTF parser to extract OLE objects | ||
| 446 | + """ | ||
| 447 | + | ||
| 448 | + def __init__(self, data, fname_prefix='rtf'): | ||
| 449 | + super(RtfObjParser, self).__init__(data) | ||
| 450 | + self.fname_prefix = fname_prefix | ||
| 451 | + | ||
| 452 | + def open_destination(self, destination): | ||
| 453 | + if destination.cword == 'objdata': | ||
| 454 | + log.debug('*** Start object data at index %Xh' % destination.start) | ||
| 455 | + | ||
| 456 | + def close_destination(self, destination): | ||
| 457 | + if destination.cword == 'objdata': | ||
| 458 | + log.debug('*** Close object data at index %Xh' % self.index) | ||
| 459 | + # Filter out all whitespaces first (just ignored): | ||
| 460 | + hexdata1 = destination.data.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') | ||
| 461 | + # Then filter out any other non-hex character: | ||
| 462 | + hexdata = re.sub(r'[^a-hA-H0-9]', '', hexdata1) | ||
| 463 | + if len(hexdata) < len(hexdata1): | ||
| 464 | + # this is only for debugging: | ||
| 465 | + nonhex = re.sub(r'[a-hA-H0-9]', '', hexdata1) | ||
| 466 | + log.debug('Found non-hex chars in hexdata: %r' % nonhex) | ||
| 467 | + # MS Word accepts an extra hex digit, so we need to trim it if present: | ||
| 468 | + if len(hexdata) & 1: | ||
| 469 | + log.debug('Odd length, trimmed last byte.') | ||
| 470 | + hexdata = hexdata[:-1] | ||
| 471 | + object_data = binascii.unhexlify(hexdata) | ||
| 472 | + print('found object size %d at index %08X - end %08X' % (len(object_data), | ||
| 473 | + destination.start, self.index)) | ||
| 474 | + fname = '%s_object_%08X.raw' % (self.fname_prefix, destination.start) | ||
| 475 | + print('saving object to file %s' % fname) | ||
| 476 | + open(fname, 'wb').write(object_data) | ||
| 477 | + # TODO: check if all hex data is extracted properly | ||
| 478 | + | ||
| 479 | + obj = OleObject() | ||
| 480 | + try: | ||
| 481 | + obj.parse(object_data) | ||
| 482 | + print('extract file embedded in OLE object:') | ||
| 483 | + print('format_id = %d' % obj.format_id) | ||
| 484 | + print('class name = %r' % obj.class_name) | ||
| 485 | + print('data size = %d' % obj.data_size) | ||
| 486 | + # set a file extension according to the class name: | ||
| 487 | + class_name = obj.class_name.lower() | ||
| 488 | + if class_name.startswith('word'): | ||
| 489 | + ext = 'doc' | ||
| 490 | + elif class_name.startswith('package'): | ||
| 491 | + ext = 'package' | ||
| 492 | + else: | ||
| 493 | + ext = 'bin' | ||
| 494 | + | ||
| 495 | + fname = '%s_object_%08X.%s' % (self.fname_prefix, destination.start, ext) | ||
| 496 | + print('saving to file %s' % fname) | ||
| 497 | + open(fname, 'wb').write(obj.data) | ||
| 498 | + if obj.class_name.lower() == 'package': | ||
| 499 | + print('Parsing OLE Package') | ||
| 500 | + opkg = OleNativeStream(bindata=obj.data) | ||
| 501 | + print('Filename = %r' % opkg.filename) | ||
| 502 | + print('Source path = %r' % opkg.src_path) | ||
| 503 | + print('Temp path = %r' % opkg.temp_path) | ||
| 504 | + if opkg.filename: | ||
| 505 | + fname = '%s_%s' % (self.fname_prefix, | ||
| 506 | + sanitize_filename(opkg.filename)) | ||
| 507 | + else: | ||
| 508 | + fname = '%s_object_%08X.noname' % (self.fname_prefix, destination.start) | ||
| 509 | + print('saving to file %s' % fname) | ||
| 510 | + open(fname, 'wb').write(opkg.data) | ||
| 511 | + except: | ||
| 512 | + pass | ||
| 513 | + log.exception('*** Not an OLE 1.0 Object') | ||
| 514 | + | ||
| 515 | + def bin(self, bindata): | ||
| 516 | + if self.current_destination.cword == 'objdata': | ||
| 517 | + # TODO: keep track of this, because it is unusual and indicates potential obfuscation | ||
| 518 | + # trick: hexlify binary data, add it to hex data | ||
| 519 | + self.current_destination.data += binascii.hexlify(bindata) | ||
| 520 | + | ||
| 521 | + def control_word(self, matchobject, cword, param): | ||
| 522 | + # TODO: extract useful cwords such as objclass | ||
| 523 | + # TODO: keep track of cwords inside objdata, because it is unusual and indicates potential obfuscation | ||
| 524 | + # TODO: same with control symbols, and opening bracket | ||
| 525 | + pass | ||
| 526 | + | ||
| 178 | 527 | ||
| 179 | #=== FUNCTIONS =============================================================== | 528 | #=== FUNCTIONS =============================================================== |
| 180 | 529 | ||
| @@ -329,50 +678,53 @@ def process_file(container, filename, data, output_dir=None): | @@ -329,50 +678,53 @@ def process_file(container, filename, data, output_dir=None): | ||
| 329 | # TODO: option to extract objects to files (false by default) | 678 | # TODO: option to extract objects to files (false by default) |
| 330 | if data is None: | 679 | if data is None: |
| 331 | data = open(filename, 'rb').read() | 680 | data = open(filename, 'rb').read() |
| 332 | - print '-'*79 | ||
| 333 | - print 'File: %r - %d bytes' % (filename, len(data)) | ||
| 334 | - for index, orig_len, objdata in rtf_iter_objects(data): | ||
| 335 | - print 'found object size %d at index %08X - end %08X' % (len(objdata), index, index+orig_len) | ||
| 336 | - fname = '%s_object_%08X.raw' % (fname_prefix, index) | ||
| 337 | - print 'saving object to file %s' % fname | ||
| 338 | - open(fname, 'wb').write(objdata) | ||
| 339 | - # TODO: check if all hex data is extracted properly | ||
| 340 | - | ||
| 341 | - obj = OleObject() | ||
| 342 | - try: | ||
| 343 | - obj.parse(objdata) | ||
| 344 | - print 'extract file embedded in OLE object:' | ||
| 345 | - print 'format_id = %d' % obj.format_id | ||
| 346 | - print 'class name = %r' % obj.class_name | ||
| 347 | - print 'data size = %d' % obj.data_size | ||
| 348 | - # set a file extension according to the class name: | ||
| 349 | - class_name = obj.class_name.lower() | ||
| 350 | - if class_name.startswith('word'): | ||
| 351 | - ext = 'doc' | ||
| 352 | - elif class_name.startswith('package'): | ||
| 353 | - ext = 'package' | ||
| 354 | - else: | ||
| 355 | - ext = 'bin' | ||
| 356 | - | ||
| 357 | - fname = '%s_object_%08X.%s' % (fname_prefix, index, ext) | ||
| 358 | - print 'saving to file %s' % fname | ||
| 359 | - open(fname, 'wb').write(obj.data) | ||
| 360 | - if obj.class_name.lower() == 'package': | ||
| 361 | - print 'Parsing OLE Package' | ||
| 362 | - opkg = OleNativeStream(bindata=obj.data) | ||
| 363 | - print 'Filename = %r' % opkg.filename | ||
| 364 | - print 'Source path = %r' % opkg.src_path | ||
| 365 | - print 'Temp path = %r' % opkg.temp_path | ||
| 366 | - if opkg.filename: | ||
| 367 | - fname = '%s_%s' % (fname_prefix, | ||
| 368 | - sanitize_filename(opkg.filename)) | ||
| 369 | - else: | ||
| 370 | - fname = '%s_object_%08X.noname' % (fname_prefix, index) | ||
| 371 | - print 'saving to file %s' % fname | ||
| 372 | - open(fname, 'wb').write(opkg.data) | ||
| 373 | - except: | ||
| 374 | - pass | ||
| 375 | - log.exception('*** Not an OLE 1.0 Object') | 681 | + rtfp = RtfObjParser(data, fname_prefix) |
| 682 | + rtfp.parse() | ||
| 683 | + | ||
| 684 | + # print '-'*79 | ||
| 685 | + # print 'File: %r - %d bytes' % (filename, len(data)) | ||
| 686 | + # for index, orig_len, objdata in rtf_iter_objects(data): | ||
| 687 | + # print 'found object size %d at index %08X - end %08X' % (len(objdata), index, index+orig_len) | ||
| 688 | + # fname = '%s_object_%08X.raw' % (fname_prefix, index) | ||
| 689 | + # print 'saving object to file %s' % fname | ||
| 690 | + # open(fname, 'wb').write(objdata) | ||
| 691 | + # # TODO: check if all hex data is extracted properly | ||
| 692 | + # | ||
| 693 | + # obj = OleObject() | ||
| 694 | + # try: | ||
| 695 | + # obj.parse(objdata) | ||
| 696 | + # print 'extract file embedded in OLE object:' | ||
| 697 | + # print 'format_id = %d' % obj.format_id | ||
| 698 | + # print 'class name = %r' % obj.class_name | ||
| 699 | + # print 'data size = %d' % obj.data_size | ||
| 700 | + # # set a file extension according to the class name: | ||
| 701 | + # class_name = obj.class_name.lower() | ||
| 702 | + # if class_name.startswith('word'): | ||
| 703 | + # ext = 'doc' | ||
| 704 | + # elif class_name.startswith('package'): | ||
| 705 | + # ext = 'package' | ||
| 706 | + # else: | ||
| 707 | + # ext = 'bin' | ||
| 708 | + # | ||
| 709 | + # fname = '%s_object_%08X.%s' % (fname_prefix, index, ext) | ||
| 710 | + # print 'saving to file %s' % fname | ||
| 711 | + # open(fname, 'wb').write(obj.data) | ||
| 712 | + # if obj.class_name.lower() == 'package': | ||
| 713 | + # print 'Parsing OLE Package' | ||
| 714 | + # opkg = OleNativeStream(bindata=obj.data) | ||
| 715 | + # print 'Filename = %r' % opkg.filename | ||
| 716 | + # print 'Source path = %r' % opkg.src_path | ||
| 717 | + # print 'Temp path = %r' % opkg.temp_path | ||
| 718 | + # if opkg.filename: | ||
| 719 | + # fname = '%s_%s' % (fname_prefix, | ||
| 720 | + # sanitize_filename(opkg.filename)) | ||
| 721 | + # else: | ||
| 722 | + # fname = '%s_object_%08X.noname' % (fname_prefix, index) | ||
| 723 | + # print 'saving to file %s' % fname | ||
| 724 | + # open(fname, 'wb').write(opkg.data) | ||
| 725 | + # except: | ||
| 726 | + # pass | ||
| 727 | + # log.exception('*** Not an OLE 1.0 Object') | ||
| 376 | 728 | ||
| 377 | 729 | ||
| 378 | 730 | ||
| @@ -414,7 +766,7 @@ if __name__ == '__main__': | @@ -414,7 +766,7 @@ if __name__ == '__main__': | ||
| 414 | 766 | ||
| 415 | # Print help if no arguments are passed | 767 | # Print help if no arguments are passed |
| 416 | if len(args) == 0: | 768 | if len(args) == 0: |
| 417 | - print __doc__ | 769 | + print (__doc__) |
| 418 | parser.print_help() | 770 | parser.print_help() |
| 419 | sys.exit() | 771 | sys.exit() |
| 420 | 772 | ||
| @@ -436,5 +788,5 @@ if __name__ == '__main__': | @@ -436,5 +788,5 @@ if __name__ == '__main__': | ||
| 436 | process_file(container, filename, data, options.output_dir) | 788 | process_file(container, filename, data, options.output_dir) |
| 437 | 789 | ||
| 438 | 790 | ||
| 439 | - | 791 | +# This code was developed while listening to The Mary Onettes "Lost" |
| 440 | 792 |