Commit 9153fb6cb8cebdb8116787cb28156fe96f8065cf

Authored by decalage2
1 parent d154c483

rtfobj: new API with class RtfObject, file output moved to process_file

Showing 1 changed file with 108 additions and 133 deletions
oletools/rtfobj.py
... ... @@ -54,11 +54,12 @@ http://www.decalage.info/python/oletools
54 54 # (contribution by Thomas Jarosch)
55 55 # TJ: - sanitize filenames to avoid special characters
56 56 # 2016-05-29 PL: - improved parsing, fixed issue #42
57   -# 2016-07-13 v0.48 PL: - new RtfParser and RtfObjParser classes
  57 +# 2016-07-13 v0.50 PL: - new RtfParser and RtfObjParser classes
58 58 # 2016-07-18 SL: - added Python 3.5 support
59 59 # 2016-07-19 PL: - fixed Python 2.6-2.7 support
  60 +# 2016-07-30 PL: - new API with class RtfObject
60 61  
61   -__version__ = '0.48'
  62 +__version__ = '0.50'
62 63  
63 64 # ------------------------------------------------------------------------------
64 65 # TODO:
... ... @@ -455,14 +456,45 @@ class RtfParser(object):
455 456 pass
456 457  
457 458  
  459 +class RtfObject(object):
  460 + """
  461 + An object or a file (OLE Package) embedded into an RTF document
  462 + """
  463 + def __init__(self):
  464 + """
  465 + RtfObject constructor
  466 + """
  467 + # start and end index in the RTF file:
  468 + self.start = None
  469 + self.end = None
  470 + # raw object data encoded in hexadecimal, as found in the RTF file:
  471 + self.hexdata = None
  472 + # raw object data in binary form, decoded from hexadecimal
  473 + self.rawdata = None
  474 + # OLE object data (extracted from rawdata)
  475 + self.is_ole = False
  476 + self.oledata = None
  477 + self.format_id = None
  478 + self.class_name = None
  479 + self.oledata_size = None
  480 + # OLE Package data (extracted from oledata)
  481 + self.is_package = False
  482 + self.olepkgdata = None
  483 + self.filename = None
  484 + self.src_path = None
  485 + self.temp_path = None
  486 +
  487 +
  488 +
458 489 class RtfObjParser(RtfParser):
459 490 """
460 491 Specialized RTF parser to extract OLE objects
461 492 """
462 493  
463   - def __init__(self, data, fname_prefix='rtf'):
  494 + def __init__(self, data):
464 495 super(RtfObjParser, self).__init__(data)
465   - self.fname_prefix = fname_prefix
  496 + # list of RtfObjects found
  497 + self.objects = []
466 498  
467 499 def open_destination(self, destination):
468 500 if destination.cword == b'objdata':
... ... @@ -471,6 +503,10 @@ class RtfObjParser(RtfParser):
471 503 def close_destination(self, destination):
472 504 if destination.cword == b'objdata':
473 505 log.debug('*** Close object data at index %Xh' % self.index)
  506 + rtfobj = RtfObject()
  507 + self.objects.append(rtfobj)
  508 + rtfobj.start = destination.start
  509 + rtfobj.end = destination.end
474 510 # Filter out all whitespaces first (just ignored):
475 511 hexdata1 = destination.data.translate(None, b' \t\r\n\f\v')
476 512 # Then filter out any other non-hex character:
... ... @@ -483,46 +519,26 @@ class RtfObjParser(RtfParser):
483 519 if len(hexdata) & 1:
484 520 log.debug('Odd length, trimmed last byte.')
485 521 hexdata = hexdata[:-1]
  522 + rtfobj.hexdata = hexdata
486 523 object_data = binascii.unhexlify(hexdata)
487   - print('found object size %d at index %08X - end %08X' % (len(object_data),
488   - destination.start, self.index))
489   - fname = '%s_object_%08X.raw' % (self.fname_prefix, destination.start)
490   - print('saving object to file %s' % fname)
491   - open(fname, 'wb').write(object_data)
  524 + rtfobj.rawdata = object_data
492 525 # TODO: check if all hex data is extracted properly
493 526  
494 527 obj = OleObject()
495 528 try:
496 529 obj.parse(object_data)
497   - print('extract file embedded in OLE object:')
498   - print('format_id = %d' % obj.format_id)
499   - print('class name = %r' % obj.class_name)
500   - print('data size = %d' % obj.data_size)
501   - # set a file extension according to the class name:
502   - class_name = obj.class_name.lower()
503   - if class_name.startswith(b'word'):
504   - ext = 'doc'
505   - elif class_name.startswith(b'package'):
506   - ext = 'package'
507   - else:
508   - ext = 'bin'
509   -
510   - fname = '%s_object_%08X.%s' % (self.fname_prefix, destination.start, ext)
511   - print('saving to file %s' % fname)
512   - open(fname, 'wb').write(obj.data)
  530 + rtfobj.format_id = obj.format_id
  531 + rtfobj.class_name = obj.class_name
  532 + rtfobj.oledata_size = obj.data_size
  533 + rtfobj.oledata = obj.data
  534 + rtfobj.is_ole = True
513 535 if obj.class_name.lower() == 'package':
514   - print('Parsing OLE Package')
515 536 opkg = OleNativeStream(bindata=obj.data)
516   - print('Filename = %r' % opkg.filename)
517   - print('Source path = %r' % opkg.src_path)
518   - print('Temp path = %r' % opkg.temp_path)
519   - if opkg.filename:
520   - fname = '%s_%s' % (self.fname_prefix,
521   - sanitize_filename(opkg.filename))
522   - else:
523   - fname = '%s_object_%08X.noname' % (self.fname_prefix, destination.start)
524   - print('saving to file %s' % fname)
525   - open(fname, 'wb').write(opkg.data)
  537 + rtfobj.filename = opkg.filename
  538 + rtfobj.src_path = opkg.src_path
  539 + rtfobj.temp_path = opkg.temp_path
  540 + rtfobj.olepkgdata = opkg.data
  541 + rtfobj.is_package = True
526 542 except:
527 543 pass
528 544 log.exception('*** Not an OLE 1.0 Object')
... ... @@ -564,94 +580,6 @@ class RtfObjParser(RtfParser):
564 580 # TODO: backward-compatible API?
565 581  
566 582  
567   -# def search_hex_block(data, pos=0, min_size=32, first=True):
568   -# if first:
569   -# # Search 1st occurence of a hex block:
570   -# match = re_hexblock.search(data, pos=pos)
571   -# else:
572   -# # Match next occurences of a hex block, from the current position only:
573   -# match = re_hexblock.match(data, pos=pos)
574   -#
575   -#
576   -#
577   -# def rtf_iter_objects (data, min_size=32):
578   -# """
579   -# Open a RTF file, extract each embedded object encoded in hexadecimal of
580   -# size > min_size, yield the index of the object in the RTF file and its data
581   -# in binary format.
582   -# This is an iterator.
583   -# """
584   -# # Search 1st occurence of a hex block:
585   -# match = re_hexblock.search(data)
586   -# if match is None:
587   -# log.debug('No hex block found.')
588   -# # no hex block found
589   -# return
590   -# while match is not None:
591   -# found = match.group(0)
592   -# # start index
593   -# start = match.start()
594   -# # current position
595   -# current = match.end()
596   -# log.debug('Found hex block starting at %08X, end %08X, size=%d' % (start, current, len(found)))
597   -# if len(found) < min_size:
598   -# log.debug('Too small - size<%d, ignored.' % min_size)
599   -# match = re_hexblock.search(data, pos=current)
600   -# continue
601   -# #log.debug('Match: %s' % found)
602   -# # remove all whitespace and line feeds:
603   -# #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
604   -# found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
605   -# # TODO: make it a function
606   -# # Also remove embedded RTF tags:
607   -# found = re_embedded_tags.sub('', found)
608   -# # object data extracted from the RTF file
609   -# # MS Word accepts an extra hex digit, so we need to trim it if present:
610   -# if len(found) & 1:
611   -# log.debug('Odd length, trimmed last byte.')
612   -# found = found[:-1]
613   -# #log.debug('Cleaned match: %s' % found)
614   -# objdata = binascii.unhexlify(found)
615   -# # Detect the "\bin" control word, which is sometimes used for obfuscation:
616   -# bin_match = re_delims_bin_decimal.match(data, pos=current)
617   -# while bin_match is not None:
618   -# log.debug('Found \\bin block starting at %08X : %r'
619   -# % (bin_match.start(), bin_match.group(0)))
620   -# # extract the decimal integer following '\bin'
621   -# bin_len = int(bin_match.group(1))
622   -# log.debug('\\bin block length = %d' % bin_len)
623   -# if current+bin_len > len(data):
624   -# log.error('\\bin block length is larger than the remaining data')
625   -# # move the current index, ignore the \bin block
626   -# current += len(bin_match.group(0))
627   -# break
628   -# # read that number of bytes:
629   -# objdata += data[current:current+bin_len]
630   -# # TODO: handle exception
631   -# current += len(bin_match.group(0)) + bin_len
632   -# # TODO: check if current is out of range
633   -# # TODO: is Word limiting the \bin length to a number of digits?
634   -# log.debug('Current position = %08X' % current)
635   -# match = re_delim_hexblock.match(data, pos=current)
636   -# if match is not None:
637   -# log.debug('Found next hex block starting at %08X, end %08X'
638   -# % (match.start(), match.end()))
639   -# found = match.group(0)
640   -# log.debug('Match: %s' % found)
641   -# # remove all whitespace and line feeds:
642   -# #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
643   -# found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
644   -# # Also remove embedded RTF tags:
645   -# found = re_embedded_tags.sub(found, '')
646   -# objdata += binascii.unhexlify(found)
647   -# current = match.end()
648   -# bin_match = re_delims_bin_decimal.match(data, pos=current)
649   -#
650   -# # print repr(found)
651   -# if len(objdata)>min_size:
652   -# yield start, current-start, objdata
653   -# # Search next occurence of a hex block:
654   -# match = re_hexblock.search(data, pos=current)
655 583  
656 584  
657 585  
... ... @@ -693,10 +621,53 @@ def process_file(container, filename, data, output_dir=None):
693 621 # TODO: option to extract objects to files (false by default)
694 622 if data is None:
695 623 data = open(filename, 'rb').read()
696   - rtfp = RtfObjParser(data, fname_prefix)
  624 + print('='*79)
  625 + print('File: %r - %d bytes' % (filename, len(data)))
  626 + rtfp = RtfObjParser(data)
697 627 rtfp.parse()
  628 + for rtfobj in rtfp.objects:
  629 + print('-'*79)
  630 + print('found object size %d at index %08X - end %08X'
  631 + % (len(rtfobj.rawdata), rtfobj.start, rtfobj.end))
  632 + fname = '%s_object_%08X.raw' % (fname_prefix, rtfobj.start)
  633 + print('saving object to file %s' % fname)
  634 + open(fname, 'wb').write(rtfobj.rawdata)
  635 + if rtfobj.is_ole:
  636 + print('extract file embedded in OLE object:')
  637 + print('format_id = %d' % rtfobj.format_id)
  638 + print('class name = %r' % rtfobj.class_name)
  639 + print('data size = %d' % rtfobj.oledata_size)
  640 + # set a file extension according to the class name:
  641 + class_name = rtfobj.class_name.lower()
  642 + if class_name.startswith(b'word'):
  643 + ext = 'doc'
  644 + elif class_name.startswith(b'package'):
  645 + ext = 'package'
  646 + else:
  647 + ext = 'bin'
  648 + fname = '%s_object_%08X.%s' % (fname_prefix, rtfobj.start, ext)
  649 + print('saving to file %s' % fname)
  650 + open(fname, 'wb').write(rtfobj.oledata)
  651 + if rtfobj.is_package:
  652 + print('Parsing OLE Package')
  653 + print('Filename = %r' % rtfobj.filename)
  654 + print('Source path = %r' % rtfobj.src_path)
  655 + print('Temp path = %r' % rtfobj.temp_path)
  656 + if rtfobj.filename:
  657 + fname = '%s_%s' % (fname_prefix,
  658 + sanitize_filename(rtfobj.filename))
  659 + else:
  660 + fname = '%s_object_%08X.noname' % (fname_prefix, rtfobj.start)
  661 + print('saving to file %s' % fname)
  662 + open(fname, 'wb').write(rtfobj.olepkgdata)
  663 + else:
  664 + print('Not an OLE Package')
  665 + else:
  666 + print('Not a well-formed OLE object')
  667 +
  668 +
698 669  
699   - # print '-'*79
  670 + # print '-'*79
700 671 # print 'File: %r - %d bytes' % (filename, len(data))
701 672 # for index, orig_len, objdata in rtf_iter_objects(data):
702 673 # print 'found object size %d at index %08X - end %08X' % (len(objdata), index, index+orig_len)
... ... @@ -745,7 +716,7 @@ def process_file(container, filename, data, output_dir=None):
745 716  
746 717 #=== MAIN =================================================================
747 718  
748   -if __name__ == '__main__':
  719 +def main():
749 720 # print banner with version
750 721 print ('rtfobj %s - http://decalage.info/python/oletools' % __version__)
751 722 print ('THIS IS WORK IN PROGRESS - Check updates regularly!')
... ... @@ -753,12 +724,13 @@ if __name__ == '__main__':
753 724 print ('')
754 725  
755 726 DEFAULT_LOG_LEVEL = "warning" # Default log level
756   - LOG_LEVELS = {'debug': logging.DEBUG,
757   - 'info': logging.INFO,
758   - 'warning': logging.WARNING,
759   - 'error': logging.ERROR,
760   - 'critical': logging.CRITICAL
761   - }
  727 + LOG_LEVELS = {
  728 + 'debug': logging.DEBUG,
  729 + 'info': logging.INFO,
  730 + 'warning': logging.WARNING,
  731 + 'error': logging.ERROR,
  732 + 'critical': logging.CRITICAL
  733 + }
762 734  
763 735 usage = 'usage: %prog [options] <filename> [filename2 ...]'
764 736 parser = optparse.OptionParser(usage=usage)
... ... @@ -803,5 +775,8 @@ if __name__ == '__main__':
803 775 process_file(container, filename, data, options.output_dir)
804 776  
805 777  
  778 +if __name__ == '__main__':
  779 + main()
  780 +
806 781 # This code was developed while listening to The Mary Onettes "Lost"
807 782  
... ...