Commit 9153fb6cb8cebdb8116787cb28156fe96f8065cf

Authored by decalage2
1 parent d154c483

rtfobj: new API with class RtfObject, file output moved to process_file

Showing 1 changed file with 108 additions and 133 deletions
oletools/rtfobj.py
@@ -54,11 +54,12 @@ http://www.decalage.info/python/oletools @@ -54,11 +54,12 @@ http://www.decalage.info/python/oletools
54 # (contribution by Thomas Jarosch) 54 # (contribution by Thomas Jarosch)
55 # TJ: - sanitize filenames to avoid special characters 55 # TJ: - sanitize filenames to avoid special characters
56 # 2016-05-29 PL: - improved parsing, fixed issue #42 56 # 2016-05-29 PL: - improved parsing, fixed issue #42
57 -# 2016-07-13 v0.48 PL: - new RtfParser and RtfObjParser classes 57 +# 2016-07-13 v0.50 PL: - new RtfParser and RtfObjParser classes
58 # 2016-07-18 SL: - added Python 3.5 support 58 # 2016-07-18 SL: - added Python 3.5 support
59 # 2016-07-19 PL: - fixed Python 2.6-2.7 support 59 # 2016-07-19 PL: - fixed Python 2.6-2.7 support
  60 +# 2016-07-30 PL: - new API with class RtfObject
60 61
61 -__version__ = '0.48' 62 +__version__ = '0.50'
62 63
63 # ------------------------------------------------------------------------------ 64 # ------------------------------------------------------------------------------
64 # TODO: 65 # TODO:
@@ -455,14 +456,45 @@ class RtfParser(object): @@ -455,14 +456,45 @@ class RtfParser(object):
455 pass 456 pass
456 457
457 458
  459 +class RtfObject(object):
  460 + """
  461 + An object or a file (OLE Package) embedded into an RTF document
  462 + """
  463 + def __init__(self):
  464 + """
  465 + RtfObject constructor
  466 + """
  467 + # start and end index in the RTF file:
  468 + self.start = None
  469 + self.end = None
  470 + # raw object data encoded in hexadecimal, as found in the RTF file:
  471 + self.hexdata = None
  472 + # raw object data in binary form, decoded from hexadecimal
  473 + self.rawdata = None
  474 + # OLE object data (extracted from rawdata)
  475 + self.is_ole = False
  476 + self.oledata = None
  477 + self.format_id = None
  478 + self.class_name = None
  479 + self.oledata_size = None
  480 + # OLE Package data (extracted from oledata)
  481 + self.is_package = False
  482 + self.olepkgdata = None
  483 + self.filename = None
  484 + self.src_path = None
  485 + self.temp_path = None
  486 +
  487 +
  488 +
458 class RtfObjParser(RtfParser): 489 class RtfObjParser(RtfParser):
459 """ 490 """
460 Specialized RTF parser to extract OLE objects 491 Specialized RTF parser to extract OLE objects
461 """ 492 """
462 493
463 - def __init__(self, data, fname_prefix='rtf'): 494 + def __init__(self, data):
464 super(RtfObjParser, self).__init__(data) 495 super(RtfObjParser, self).__init__(data)
465 - self.fname_prefix = fname_prefix 496 + # list of RtfObjects found
  497 + self.objects = []
466 498
467 def open_destination(self, destination): 499 def open_destination(self, destination):
468 if destination.cword == b'objdata': 500 if destination.cword == b'objdata':
@@ -471,6 +503,10 @@ class RtfObjParser(RtfParser): @@ -471,6 +503,10 @@ class RtfObjParser(RtfParser):
471 def close_destination(self, destination): 503 def close_destination(self, destination):
472 if destination.cword == b'objdata': 504 if destination.cword == b'objdata':
473 log.debug('*** Close object data at index %Xh' % self.index) 505 log.debug('*** Close object data at index %Xh' % self.index)
  506 + rtfobj = RtfObject()
  507 + self.objects.append(rtfobj)
  508 + rtfobj.start = destination.start
  509 + rtfobj.end = destination.end
474 # Filter out all whitespaces first (just ignored): 510 # Filter out all whitespaces first (just ignored):
475 hexdata1 = destination.data.translate(None, b' \t\r\n\f\v') 511 hexdata1 = destination.data.translate(None, b' \t\r\n\f\v')
476 # Then filter out any other non-hex character: 512 # Then filter out any other non-hex character:
@@ -483,46 +519,26 @@ class RtfObjParser(RtfParser): @@ -483,46 +519,26 @@ class RtfObjParser(RtfParser):
483 if len(hexdata) & 1: 519 if len(hexdata) & 1:
484 log.debug('Odd length, trimmed last byte.') 520 log.debug('Odd length, trimmed last byte.')
485 hexdata = hexdata[:-1] 521 hexdata = hexdata[:-1]
  522 + rtfobj.hexdata = hexdata
486 object_data = binascii.unhexlify(hexdata) 523 object_data = binascii.unhexlify(hexdata)
487 - print('found object size %d at index %08X - end %08X' % (len(object_data),  
488 - destination.start, self.index))  
489 - fname = '%s_object_%08X.raw' % (self.fname_prefix, destination.start)  
490 - print('saving object to file %s' % fname)  
491 - open(fname, 'wb').write(object_data) 524 + rtfobj.rawdata = object_data
492 # TODO: check if all hex data is extracted properly 525 # TODO: check if all hex data is extracted properly
493 526
494 obj = OleObject() 527 obj = OleObject()
495 try: 528 try:
496 obj.parse(object_data) 529 obj.parse(object_data)
497 - print('extract file embedded in OLE object:')  
498 - print('format_id = %d' % obj.format_id)  
499 - print('class name = %r' % obj.class_name)  
500 - print('data size = %d' % obj.data_size)  
501 - # set a file extension according to the class name:  
502 - class_name = obj.class_name.lower()  
503 - if class_name.startswith(b'word'):  
504 - ext = 'doc'  
505 - elif class_name.startswith(b'package'):  
506 - ext = 'package'  
507 - else:  
508 - ext = 'bin'  
509 -  
510 - fname = '%s_object_%08X.%s' % (self.fname_prefix, destination.start, ext)  
511 - print('saving to file %s' % fname)  
512 - open(fname, 'wb').write(obj.data) 530 + rtfobj.format_id = obj.format_id
  531 + rtfobj.class_name = obj.class_name
  532 + rtfobj.oledata_size = obj.data_size
  533 + rtfobj.oledata = obj.data
  534 + rtfobj.is_ole = True
513 if obj.class_name.lower() == 'package': 535 if obj.class_name.lower() == 'package':
514 - print('Parsing OLE Package')  
515 opkg = OleNativeStream(bindata=obj.data) 536 opkg = OleNativeStream(bindata=obj.data)
516 - print('Filename = %r' % opkg.filename)  
517 - print('Source path = %r' % opkg.src_path)  
518 - print('Temp path = %r' % opkg.temp_path)  
519 - if opkg.filename:  
520 - fname = '%s_%s' % (self.fname_prefix,  
521 - sanitize_filename(opkg.filename))  
522 - else:  
523 - fname = '%s_object_%08X.noname' % (self.fname_prefix, destination.start)  
524 - print('saving to file %s' % fname)  
525 - open(fname, 'wb').write(opkg.data) 537 + rtfobj.filename = opkg.filename
  538 + rtfobj.src_path = opkg.src_path
  539 + rtfobj.temp_path = opkg.temp_path
  540 + rtfobj.olepkgdata = opkg.data
  541 + rtfobj.is_package = True
526 except: 542 except:
527 pass 543 pass
528 log.exception('*** Not an OLE 1.0 Object') 544 log.exception('*** Not an OLE 1.0 Object')
@@ -564,94 +580,6 @@ class RtfObjParser(RtfParser): @@ -564,94 +580,6 @@ class RtfObjParser(RtfParser):
564 # TODO: backward-compatible API? 580 # TODO: backward-compatible API?
565 581
566 582
567 -# def search_hex_block(data, pos=0, min_size=32, first=True):  
568 -# if first:  
569 -# # Search 1st occurence of a hex block:  
570 -# match = re_hexblock.search(data, pos=pos)  
571 -# else:  
572 -# # Match next occurences of a hex block, from the current position only:  
573 -# match = re_hexblock.match(data, pos=pos)  
574 -#  
575 -#  
576 -#  
577 -# def rtf_iter_objects (data, min_size=32):  
578 -# """  
579 -# Open a RTF file, extract each embedded object encoded in hexadecimal of  
580 -# size > min_size, yield the index of the object in the RTF file and its data  
581 -# in binary format.  
582 -# This is an iterator.  
583 -# """  
584 -# # Search 1st occurence of a hex block:  
585 -# match = re_hexblock.search(data)  
586 -# if match is None:  
587 -# log.debug('No hex block found.')  
588 -# # no hex block found  
589 -# return  
590 -# while match is not None:  
591 -# found = match.group(0)  
592 -# # start index  
593 -# start = match.start()  
594 -# # current position  
595 -# current = match.end()  
596 -# log.debug('Found hex block starting at %08X, end %08X, size=%d' % (start, current, len(found)))  
597 -# if len(found) < min_size:  
598 -# log.debug('Too small - size<%d, ignored.' % min_size)  
599 -# match = re_hexblock.search(data, pos=current)  
600 -# continue  
601 -# #log.debug('Match: %s' % found)  
602 -# # remove all whitespace and line feeds:  
603 -# #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE  
604 -# found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')  
605 -# # TODO: make it a function  
606 -# # Also remove embedded RTF tags:  
607 -# found = re_embedded_tags.sub('', found)  
608 -# # object data extracted from the RTF file  
609 -# # MS Word accepts an extra hex digit, so we need to trim it if present:  
610 -# if len(found) & 1:  
611 -# log.debug('Odd length, trimmed last byte.')  
612 -# found = found[:-1]  
613 -# #log.debug('Cleaned match: %s' % found)  
614 -# objdata = binascii.unhexlify(found)  
615 -# # Detect the "\bin" control word, which is sometimes used for obfuscation:  
616 -# bin_match = re_delims_bin_decimal.match(data, pos=current)  
617 -# while bin_match is not None:  
618 -# log.debug('Found \\bin block starting at %08X : %r'  
619 -# % (bin_match.start(), bin_match.group(0)))  
620 -# # extract the decimal integer following '\bin'  
621 -# bin_len = int(bin_match.group(1))  
622 -# log.debug('\\bin block length = %d' % bin_len)  
623 -# if current+bin_len > len(data):  
624 -# log.error('\\bin block length is larger than the remaining data')  
625 -# # move the current index, ignore the \bin block  
626 -# current += len(bin_match.group(0))  
627 -# break  
628 -# # read that number of bytes:  
629 -# objdata += data[current:current+bin_len]  
630 -# # TODO: handle exception  
631 -# current += len(bin_match.group(0)) + bin_len  
632 -# # TODO: check if current is out of range  
633 -# # TODO: is Word limiting the \bin length to a number of digits?  
634 -# log.debug('Current position = %08X' % current)  
635 -# match = re_delim_hexblock.match(data, pos=current)  
636 -# if match is not None:  
637 -# log.debug('Found next hex block starting at %08X, end %08X'  
638 -# % (match.start(), match.end()))  
639 -# found = match.group(0)  
640 -# log.debug('Match: %s' % found)  
641 -# # remove all whitespace and line feeds:  
642 -# #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE  
643 -# found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')  
644 -# # Also remove embedded RTF tags:  
645 -# found = re_embedded_tags.sub(found, '')  
646 -# objdata += binascii.unhexlify(found)  
647 -# current = match.end()  
648 -# bin_match = re_delims_bin_decimal.match(data, pos=current)  
649 -#  
650 -# # print repr(found)  
651 -# if len(objdata)>min_size:  
652 -# yield start, current-start, objdata  
653 -# # Search next occurence of a hex block:  
654 -# match = re_hexblock.search(data, pos=current)  
655 583
656 584
657 585
@@ -693,10 +621,53 @@ def process_file(container, filename, data, output_dir=None): @@ -693,10 +621,53 @@ def process_file(container, filename, data, output_dir=None):
693 # TODO: option to extract objects to files (false by default) 621 # TODO: option to extract objects to files (false by default)
694 if data is None: 622 if data is None:
695 data = open(filename, 'rb').read() 623 data = open(filename, 'rb').read()
696 - rtfp = RtfObjParser(data, fname_prefix) 624 + print('='*79)
  625 + print('File: %r - %d bytes' % (filename, len(data)))
  626 + rtfp = RtfObjParser(data)
697 rtfp.parse() 627 rtfp.parse()
  628 + for rtfobj in rtfp.objects:
  629 + print('-'*79)
  630 + print('found object size %d at index %08X - end %08X'
  631 + % (len(rtfobj.rawdata), rtfobj.start, rtfobj.end))
  632 + fname = '%s_object_%08X.raw' % (fname_prefix, rtfobj.start)
  633 + print('saving object to file %s' % fname)
  634 + open(fname, 'wb').write(rtfobj.rawdata)
  635 + if rtfobj.is_ole:
  636 + print('extract file embedded in OLE object:')
  637 + print('format_id = %d' % rtfobj.format_id)
  638 + print('class name = %r' % rtfobj.class_name)
  639 + print('data size = %d' % rtfobj.oledata_size)
  640 + # set a file extension according to the class name:
  641 + class_name = rtfobj.class_name.lower()
  642 + if class_name.startswith(b'word'):
  643 + ext = 'doc'
  644 + elif class_name.startswith(b'package'):
  645 + ext = 'package'
  646 + else:
  647 + ext = 'bin'
  648 + fname = '%s_object_%08X.%s' % (fname_prefix, rtfobj.start, ext)
  649 + print('saving to file %s' % fname)
  650 + open(fname, 'wb').write(rtfobj.oledata)
  651 + if rtfobj.is_package:
  652 + print('Parsing OLE Package')
  653 + print('Filename = %r' % rtfobj.filename)
  654 + print('Source path = %r' % rtfobj.src_path)
  655 + print('Temp path = %r' % rtfobj.temp_path)
  656 + if rtfobj.filename:
  657 + fname = '%s_%s' % (fname_prefix,
  658 + sanitize_filename(rtfobj.filename))
  659 + else:
  660 + fname = '%s_object_%08X.noname' % (fname_prefix, rtfobj.start)
  661 + print('saving to file %s' % fname)
  662 + open(fname, 'wb').write(rtfobj.olepkgdata)
  663 + else:
  664 + print('Not an OLE Package')
  665 + else:
  666 + print('Not a well-formed OLE object')
  667 +
  668 +
698 669
699 - # print '-'*79 670 + # print '-'*79
700 # print 'File: %r - %d bytes' % (filename, len(data)) 671 # print 'File: %r - %d bytes' % (filename, len(data))
701 # for index, orig_len, objdata in rtf_iter_objects(data): 672 # for index, orig_len, objdata in rtf_iter_objects(data):
702 # print 'found object size %d at index %08X - end %08X' % (len(objdata), index, index+orig_len) 673 # print 'found object size %d at index %08X - end %08X' % (len(objdata), index, index+orig_len)
@@ -745,7 +716,7 @@ def process_file(container, filename, data, output_dir=None): @@ -745,7 +716,7 @@ def process_file(container, filename, data, output_dir=None):
745 716
746 #=== MAIN ================================================================= 717 #=== MAIN =================================================================
747 718
748 -if __name__ == '__main__': 719 +def main():
749 # print banner with version 720 # print banner with version
750 print ('rtfobj %s - http://decalage.info/python/oletools' % __version__) 721 print ('rtfobj %s - http://decalage.info/python/oletools' % __version__)
751 print ('THIS IS WORK IN PROGRESS - Check updates regularly!') 722 print ('THIS IS WORK IN PROGRESS - Check updates regularly!')
@@ -753,12 +724,13 @@ if __name__ == '__main__': @@ -753,12 +724,13 @@ if __name__ == '__main__':
753 print ('') 724 print ('')
754 725
755 DEFAULT_LOG_LEVEL = "warning" # Default log level 726 DEFAULT_LOG_LEVEL = "warning" # Default log level
756 - LOG_LEVELS = {'debug': logging.DEBUG,  
757 - 'info': logging.INFO,  
758 - 'warning': logging.WARNING,  
759 - 'error': logging.ERROR,  
760 - 'critical': logging.CRITICAL  
761 - } 727 + LOG_LEVELS = {
  728 + 'debug': logging.DEBUG,
  729 + 'info': logging.INFO,
  730 + 'warning': logging.WARNING,
  731 + 'error': logging.ERROR,
  732 + 'critical': logging.CRITICAL
  733 + }
762 734
763 usage = 'usage: %prog [options] <filename> [filename2 ...]' 735 usage = 'usage: %prog [options] <filename> [filename2 ...]'
764 parser = optparse.OptionParser(usage=usage) 736 parser = optparse.OptionParser(usage=usage)
@@ -803,5 +775,8 @@ if __name__ == '__main__': @@ -803,5 +775,8 @@ if __name__ == '__main__':
803 process_file(container, filename, data, options.output_dir) 775 process_file(container, filename, data, options.output_dir)
804 776
805 777
  778 +if __name__ == '__main__':
  779 + main()
  780 +
806 # This code was developed while listening to The Mary Onettes "Lost" 781 # This code was developed while listening to The Mary Onettes "Lost"
807 782