Commit 9153fb6cb8cebdb8116787cb28156fe96f8065cf
1 parent: d154c483
rtfobj: new API with class RtfObject, file output moved to process_file
Showing 1 changed file with 108 additions and 133 deletions
oletools/rtfobj.py
| ... | ... | @@ -54,11 +54,12 @@ http://www.decalage.info/python/oletools |
| 54 | 54 | # (contribution by Thomas Jarosch) |
| 55 | 55 | # TJ: - sanitize filenames to avoid special characters |
| 56 | 56 | # 2016-05-29 PL: - improved parsing, fixed issue #42 |
| 57 | -# 2016-07-13 v0.48 PL: - new RtfParser and RtfObjParser classes | |
| 57 | +# 2016-07-13 v0.50 PL: - new RtfParser and RtfObjParser classes | |
| 58 | 58 | # 2016-07-18 SL: - added Python 3.5 support |
| 59 | 59 | # 2016-07-19 PL: - fixed Python 2.6-2.7 support |
| 60 | +# 2016-07-30 PL: - new API with class RtfObject | |
| 60 | 61 | |
| 61 | -__version__ = '0.48' | |
| 62 | +__version__ = '0.50' | |
| 62 | 63 | |
| 63 | 64 | # ------------------------------------------------------------------------------ |
| 64 | 65 | # TODO: |
| ... | ... | @@ -455,14 +456,45 @@ class RtfParser(object): |
| 455 | 456 | pass |
| 456 | 457 | |
| 457 | 458 | |
| 459 | +class RtfObject(object): | |
| 460 | + """ | |
| 461 | + An object or a file (OLE Package) embedded into an RTF document | |
| 462 | + """ | |
| 463 | + def __init__(self): | |
| 464 | + """ | |
| 465 | + RtfObject constructor | |
| 466 | + """ | |
| 467 | + # start and end index in the RTF file: | |
| 468 | + self.start = None | |
| 469 | + self.end = None | |
| 470 | + # raw object data encoded in hexadecimal, as found in the RTF file: | |
| 471 | + self.hexdata = None | |
| 472 | + # raw object data in binary form, decoded from hexadecimal | |
| 473 | + self.rawdata = None | |
| 474 | + # OLE object data (extracted from rawdata) | |
| 475 | + self.is_ole = False | |
| 476 | + self.oledata = None | |
| 477 | + self.format_id = None | |
| 478 | + self.class_name = None | |
| 479 | + self.oledata_size = None | |
| 480 | + # OLE Package data (extracted from oledata) | |
| 481 | + self.is_package = False | |
| 482 | + self.olepkgdata = None | |
| 483 | + self.filename = None | |
| 484 | + self.src_path = None | |
| 485 | + self.temp_path = None | |
| 486 | + | |
| 487 | + | |
| 488 | + | |
| 458 | 489 | class RtfObjParser(RtfParser): |
| 459 | 490 | """ |
| 460 | 491 | Specialized RTF parser to extract OLE objects |
| 461 | 492 | """ |
| 462 | 493 | |
| 463 | - def __init__(self, data, fname_prefix='rtf'): | |
| 494 | + def __init__(self, data): | |
| 464 | 495 | super(RtfObjParser, self).__init__(data) |
| 465 | - self.fname_prefix = fname_prefix | |
| 496 | + # list of RtfObjects found | |
| 497 | + self.objects = [] | |
| 466 | 498 | |
| 467 | 499 | def open_destination(self, destination): |
| 468 | 500 | if destination.cword == b'objdata': |
| ... | ... | @@ -471,6 +503,10 @@ class RtfObjParser(RtfParser): |
| 471 | 503 | def close_destination(self, destination): |
| 472 | 504 | if destination.cword == b'objdata': |
| 473 | 505 | log.debug('*** Close object data at index %Xh' % self.index) |
| 506 | + rtfobj = RtfObject() | |
| 507 | + self.objects.append(rtfobj) | |
| 508 | + rtfobj.start = destination.start | |
| 509 | + rtfobj.end = destination.end | |
| 474 | 510 | # Filter out all whitespaces first (just ignored): |
| 475 | 511 | hexdata1 = destination.data.translate(None, b' \t\r\n\f\v') |
| 476 | 512 | # Then filter out any other non-hex character: |
| ... | ... | @@ -483,46 +519,26 @@ class RtfObjParser(RtfParser): |
| 483 | 519 | if len(hexdata) & 1: |
| 484 | 520 | log.debug('Odd length, trimmed last byte.') |
| 485 | 521 | hexdata = hexdata[:-1] |
| 522 | + rtfobj.hexdata = hexdata | |
| 486 | 523 | object_data = binascii.unhexlify(hexdata) |
| 487 | - print('found object size %d at index %08X - end %08X' % (len(object_data), | |
| 488 | - destination.start, self.index)) | |
| 489 | - fname = '%s_object_%08X.raw' % (self.fname_prefix, destination.start) | |
| 490 | - print('saving object to file %s' % fname) | |
| 491 | - open(fname, 'wb').write(object_data) | |
| 524 | + rtfobj.rawdata = object_data | |
| 492 | 525 | # TODO: check if all hex data is extracted properly |
| 493 | 526 | |
| 494 | 527 | obj = OleObject() |
| 495 | 528 | try: |
| 496 | 529 | obj.parse(object_data) |
| 497 | - print('extract file embedded in OLE object:') | |
| 498 | - print('format_id = %d' % obj.format_id) | |
| 499 | - print('class name = %r' % obj.class_name) | |
| 500 | - print('data size = %d' % obj.data_size) | |
| 501 | - # set a file extension according to the class name: | |
| 502 | - class_name = obj.class_name.lower() | |
| 503 | - if class_name.startswith(b'word'): | |
| 504 | - ext = 'doc' | |
| 505 | - elif class_name.startswith(b'package'): | |
| 506 | - ext = 'package' | |
| 507 | - else: | |
| 508 | - ext = 'bin' | |
| 509 | - | |
| 510 | - fname = '%s_object_%08X.%s' % (self.fname_prefix, destination.start, ext) | |
| 511 | - print('saving to file %s' % fname) | |
| 512 | - open(fname, 'wb').write(obj.data) | |
| 530 | + rtfobj.format_id = obj.format_id | |
| 531 | + rtfobj.class_name = obj.class_name | |
| 532 | + rtfobj.oledata_size = obj.data_size | |
| 533 | + rtfobj.oledata = obj.data | |
| 534 | + rtfobj.is_ole = True | |
| 513 | 535 | if obj.class_name.lower() == 'package': |
| 514 | - print('Parsing OLE Package') | |
| 515 | 536 | opkg = OleNativeStream(bindata=obj.data) |
| 516 | - print('Filename = %r' % opkg.filename) | |
| 517 | - print('Source path = %r' % opkg.src_path) | |
| 518 | - print('Temp path = %r' % opkg.temp_path) | |
| 519 | - if opkg.filename: | |
| 520 | - fname = '%s_%s' % (self.fname_prefix, | |
| 521 | - sanitize_filename(opkg.filename)) | |
| 522 | - else: | |
| 523 | - fname = '%s_object_%08X.noname' % (self.fname_prefix, destination.start) | |
| 524 | - print('saving to file %s' % fname) | |
| 525 | - open(fname, 'wb').write(opkg.data) | |
| 537 | + rtfobj.filename = opkg.filename | |
| 538 | + rtfobj.src_path = opkg.src_path | |
| 539 | + rtfobj.temp_path = opkg.temp_path | |
| 540 | + rtfobj.olepkgdata = opkg.data | |
| 541 | + rtfobj.is_package = True | |
| 526 | 542 | except: |
| 527 | 543 | pass |
| 528 | 544 | log.exception('*** Not an OLE 1.0 Object') |
| ... | ... | @@ -564,94 +580,6 @@ class RtfObjParser(RtfParser): |
| 564 | 580 | # TODO: backward-compatible API? |
| 565 | 581 | |
| 566 | 582 | |
| 567 | -# def search_hex_block(data, pos=0, min_size=32, first=True): | |
| 568 | -# if first: | |
| 569 | -# # Search 1st occurence of a hex block: | |
| 570 | -# match = re_hexblock.search(data, pos=pos) | |
| 571 | -# else: | |
| 572 | -# # Match next occurences of a hex block, from the current position only: | |
| 573 | -# match = re_hexblock.match(data, pos=pos) | |
| 574 | -# | |
| 575 | -# | |
| 576 | -# | |
| 577 | -# def rtf_iter_objects (data, min_size=32): | |
| 578 | -# """ | |
| 579 | -# Open a RTF file, extract each embedded object encoded in hexadecimal of | |
| 580 | -# size > min_size, yield the index of the object in the RTF file and its data | |
| 581 | -# in binary format. | |
| 582 | -# This is an iterator. | |
| 583 | -# """ | |
| 584 | -# # Search 1st occurence of a hex block: | |
| 585 | -# match = re_hexblock.search(data) | |
| 586 | -# if match is None: | |
| 587 | -# log.debug('No hex block found.') | |
| 588 | -# # no hex block found | |
| 589 | -# return | |
| 590 | -# while match is not None: | |
| 591 | -# found = match.group(0) | |
| 592 | -# # start index | |
| 593 | -# start = match.start() | |
| 594 | -# # current position | |
| 595 | -# current = match.end() | |
| 596 | -# log.debug('Found hex block starting at %08X, end %08X, size=%d' % (start, current, len(found))) | |
| 597 | -# if len(found) < min_size: | |
| 598 | -# log.debug('Too small - size<%d, ignored.' % min_size) | |
| 599 | -# match = re_hexblock.search(data, pos=current) | |
| 600 | -# continue | |
| 601 | -# #log.debug('Match: %s' % found) | |
| 602 | -# # remove all whitespace and line feeds: | |
| 603 | -# #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE | |
| 604 | -# found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') | |
| 605 | -# # TODO: make it a function | |
| 606 | -# # Also remove embedded RTF tags: | |
| 607 | -# found = re_embedded_tags.sub('', found) | |
| 608 | -# # object data extracted from the RTF file | |
| 609 | -# # MS Word accepts an extra hex digit, so we need to trim it if present: | |
| 610 | -# if len(found) & 1: | |
| 611 | -# log.debug('Odd length, trimmed last byte.') | |
| 612 | -# found = found[:-1] | |
| 613 | -# #log.debug('Cleaned match: %s' % found) | |
| 614 | -# objdata = binascii.unhexlify(found) | |
| 615 | -# # Detect the "\bin" control word, which is sometimes used for obfuscation: | |
| 616 | -# bin_match = re_delims_bin_decimal.match(data, pos=current) | |
| 617 | -# while bin_match is not None: | |
| 618 | -# log.debug('Found \\bin block starting at %08X : %r' | |
| 619 | -# % (bin_match.start(), bin_match.group(0))) | |
| 620 | -# # extract the decimal integer following '\bin' | |
| 621 | -# bin_len = int(bin_match.group(1)) | |
| 622 | -# log.debug('\\bin block length = %d' % bin_len) | |
| 623 | -# if current+bin_len > len(data): | |
| 624 | -# log.error('\\bin block length is larger than the remaining data') | |
| 625 | -# # move the current index, ignore the \bin block | |
| 626 | -# current += len(bin_match.group(0)) | |
| 627 | -# break | |
| 628 | -# # read that number of bytes: | |
| 629 | -# objdata += data[current:current+bin_len] | |
| 630 | -# # TODO: handle exception | |
| 631 | -# current += len(bin_match.group(0)) + bin_len | |
| 632 | -# # TODO: check if current is out of range | |
| 633 | -# # TODO: is Word limiting the \bin length to a number of digits? | |
| 634 | -# log.debug('Current position = %08X' % current) | |
| 635 | -# match = re_delim_hexblock.match(data, pos=current) | |
| 636 | -# if match is not None: | |
| 637 | -# log.debug('Found next hex block starting at %08X, end %08X' | |
| 638 | -# % (match.start(), match.end())) | |
| 639 | -# found = match.group(0) | |
| 640 | -# log.debug('Match: %s' % found) | |
| 641 | -# # remove all whitespace and line feeds: | |
| 642 | -# #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE | |
| 643 | -# found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') | |
| 644 | -# # Also remove embedded RTF tags: | |
| 645 | -# found = re_embedded_tags.sub(found, '') | |
| 646 | -# objdata += binascii.unhexlify(found) | |
| 647 | -# current = match.end() | |
| 648 | -# bin_match = re_delims_bin_decimal.match(data, pos=current) | |
| 649 | -# | |
| 650 | -# # print repr(found) | |
| 651 | -# if len(objdata)>min_size: | |
| 652 | -# yield start, current-start, objdata | |
| 653 | -# # Search next occurence of a hex block: | |
| 654 | -# match = re_hexblock.search(data, pos=current) | |
| 655 | 583 | |
| 656 | 584 | |
| 657 | 585 | |
| ... | ... | @@ -693,10 +621,53 @@ def process_file(container, filename, data, output_dir=None): |
| 693 | 621 | # TODO: option to extract objects to files (false by default) |
| 694 | 622 | if data is None: |
| 695 | 623 | data = open(filename, 'rb').read() |
| 696 | - rtfp = RtfObjParser(data, fname_prefix) | |
| 624 | + print('='*79) | |
| 625 | + print('File: %r - %d bytes' % (filename, len(data))) | |
| 626 | + rtfp = RtfObjParser(data) | |
| 697 | 627 | rtfp.parse() |
| 628 | + for rtfobj in rtfp.objects: | |
| 629 | + print('-'*79) | |
| 630 | + print('found object size %d at index %08X - end %08X' | |
| 631 | + % (len(rtfobj.rawdata), rtfobj.start, rtfobj.end)) | |
| 632 | + fname = '%s_object_%08X.raw' % (fname_prefix, rtfobj.start) | |
| 633 | + print('saving object to file %s' % fname) | |
| 634 | + open(fname, 'wb').write(rtfobj.rawdata) | |
| 635 | + if rtfobj.is_ole: | |
| 636 | + print('extract file embedded in OLE object:') | |
| 637 | + print('format_id = %d' % rtfobj.format_id) | |
| 638 | + print('class name = %r' % rtfobj.class_name) | |
| 639 | + print('data size = %d' % rtfobj.oledata_size) | |
| 640 | + # set a file extension according to the class name: | |
| 641 | + class_name = rtfobj.class_name.lower() | |
| 642 | + if class_name.startswith(b'word'): | |
| 643 | + ext = 'doc' | |
| 644 | + elif class_name.startswith(b'package'): | |
| 645 | + ext = 'package' | |
| 646 | + else: | |
| 647 | + ext = 'bin' | |
| 648 | + fname = '%s_object_%08X.%s' % (fname_prefix, rtfobj.start, ext) | |
| 649 | + print('saving to file %s' % fname) | |
| 650 | + open(fname, 'wb').write(rtfobj.oledata) | |
| 651 | + if rtfobj.is_package: | |
| 652 | + print('Parsing OLE Package') | |
| 653 | + print('Filename = %r' % rtfobj.filename) | |
| 654 | + print('Source path = %r' % rtfobj.src_path) | |
| 655 | + print('Temp path = %r' % rtfobj.temp_path) | |
| 656 | + if rtfobj.filename: | |
| 657 | + fname = '%s_%s' % (fname_prefix, | |
| 658 | + sanitize_filename(rtfobj.filename)) | |
| 659 | + else: | |
| 660 | + fname = '%s_object_%08X.noname' % (fname_prefix, rtfobj.start) | |
| 661 | + print('saving to file %s' % fname) | |
| 662 | + open(fname, 'wb').write(rtfobj.olepkgdata) | |
| 663 | + else: | |
| 664 | + print('Not an OLE Package') | |
| 665 | + else: | |
| 666 | + print('Not a well-formed OLE object') | |
| 667 | + | |
| 668 | + | |
| 698 | 669 | |
| 699 | - # print '-'*79 | |
| 670 | + # print '-'*79 | |
| 700 | 671 | # print 'File: %r - %d bytes' % (filename, len(data)) |
| 701 | 672 | # for index, orig_len, objdata in rtf_iter_objects(data): |
| 702 | 673 | # print 'found object size %d at index %08X - end %08X' % (len(objdata), index, index+orig_len) |
| ... | ... | @@ -745,7 +716,7 @@ def process_file(container, filename, data, output_dir=None): |
| 745 | 716 | |
| 746 | 717 | #=== MAIN ================================================================= |
| 747 | 718 | |
| 748 | -if __name__ == '__main__': | |
| 719 | +def main(): | |
| 749 | 720 | # print banner with version |
| 750 | 721 | print ('rtfobj %s - http://decalage.info/python/oletools' % __version__) |
| 751 | 722 | print ('THIS IS WORK IN PROGRESS - Check updates regularly!') |
| ... | ... | @@ -753,12 +724,13 @@ if __name__ == '__main__': |
| 753 | 724 | print ('') |
| 754 | 725 | |
| 755 | 726 | DEFAULT_LOG_LEVEL = "warning" # Default log level |
| 756 | - LOG_LEVELS = {'debug': logging.DEBUG, | |
| 757 | - 'info': logging.INFO, | |
| 758 | - 'warning': logging.WARNING, | |
| 759 | - 'error': logging.ERROR, | |
| 760 | - 'critical': logging.CRITICAL | |
| 761 | - } | |
| 727 | + LOG_LEVELS = { | |
| 728 | + 'debug': logging.DEBUG, | |
| 729 | + 'info': logging.INFO, | |
| 730 | + 'warning': logging.WARNING, | |
| 731 | + 'error': logging.ERROR, | |
| 732 | + 'critical': logging.CRITICAL | |
| 733 | + } | |
| 762 | 734 | |
| 763 | 735 | usage = 'usage: %prog [options] <filename> [filename2 ...]' |
| 764 | 736 | parser = optparse.OptionParser(usage=usage) |
| ... | ... | @@ -803,5 +775,8 @@ if __name__ == '__main__': |
| 803 | 775 | process_file(container, filename, data, options.output_dir) |
| 804 | 776 | |
| 805 | 777 | |
| 778 | +if __name__ == '__main__': | |
| 779 | + main() | |
| 780 | + | |
| 806 | 781 | # This code was developed while listening to The Mary Onettes "Lost" |
| 807 | 782 | ... | ... |