Commit 9153fb6cb8cebdb8116787cb28156fe96f8065cf
1 parent: d154c483
rtfobj: new API with class RtfObject, file output moved to process_file
Showing 1 changed file with 108 additions and 133 deletions
oletools/rtfobj.py
| @@ -54,11 +54,12 @@ http://www.decalage.info/python/oletools | @@ -54,11 +54,12 @@ http://www.decalage.info/python/oletools | ||
| 54 | # (contribution by Thomas Jarosch) | 54 | # (contribution by Thomas Jarosch) |
| 55 | # TJ: - sanitize filenames to avoid special characters | 55 | # TJ: - sanitize filenames to avoid special characters |
| 56 | # 2016-05-29 PL: - improved parsing, fixed issue #42 | 56 | # 2016-05-29 PL: - improved parsing, fixed issue #42 |
| 57 | -# 2016-07-13 v0.48 PL: - new RtfParser and RtfObjParser classes | 57 | +# 2016-07-13 v0.50 PL: - new RtfParser and RtfObjParser classes |
| 58 | # 2016-07-18 SL: - added Python 3.5 support | 58 | # 2016-07-18 SL: - added Python 3.5 support |
| 59 | # 2016-07-19 PL: - fixed Python 2.6-2.7 support | 59 | # 2016-07-19 PL: - fixed Python 2.6-2.7 support |
| 60 | +# 2016-07-30 PL: - new API with class RtfObject | ||
| 60 | 61 | ||
| 61 | -__version__ = '0.48' | 62 | +__version__ = '0.50' |
| 62 | 63 | ||
| 63 | # ------------------------------------------------------------------------------ | 64 | # ------------------------------------------------------------------------------ |
| 64 | # TODO: | 65 | # TODO: |
| @@ -455,14 +456,45 @@ class RtfParser(object): | @@ -455,14 +456,45 @@ class RtfParser(object): | ||
| 455 | pass | 456 | pass |
| 456 | 457 | ||
| 457 | 458 | ||
| 459 | +class RtfObject(object): | ||
| 460 | + """ | ||
| 461 | + An object or a file (OLE Package) embedded into an RTF document | ||
| 462 | + """ | ||
| 463 | + def __init__(self): | ||
| 464 | + """ | ||
| 465 | + RtfObject constructor | ||
| 466 | + """ | ||
| 467 | + # start and end index in the RTF file: | ||
| 468 | + self.start = None | ||
| 469 | + self.end = None | ||
| 470 | + # raw object data encoded in hexadecimal, as found in the RTF file: | ||
| 471 | + self.hexdata = None | ||
| 472 | + # raw object data in binary form, decoded from hexadecimal | ||
| 473 | + self.rawdata = None | ||
| 474 | + # OLE object data (extracted from rawdata) | ||
| 475 | + self.is_ole = False | ||
| 476 | + self.oledata = None | ||
| 477 | + self.format_id = None | ||
| 478 | + self.class_name = None | ||
| 479 | + self.oledata_size = None | ||
| 480 | + # OLE Package data (extracted from oledata) | ||
| 481 | + self.is_package = False | ||
| 482 | + self.olepkgdata = None | ||
| 483 | + self.filename = None | ||
| 484 | + self.src_path = None | ||
| 485 | + self.temp_path = None | ||
| 486 | + | ||
| 487 | + | ||
| 488 | + | ||
| 458 | class RtfObjParser(RtfParser): | 489 | class RtfObjParser(RtfParser): |
| 459 | """ | 490 | """ |
| 460 | Specialized RTF parser to extract OLE objects | 491 | Specialized RTF parser to extract OLE objects |
| 461 | """ | 492 | """ |
| 462 | 493 | ||
| 463 | - def __init__(self, data, fname_prefix='rtf'): | 494 | + def __init__(self, data): |
| 464 | super(RtfObjParser, self).__init__(data) | 495 | super(RtfObjParser, self).__init__(data) |
| 465 | - self.fname_prefix = fname_prefix | 496 | + # list of RtfObjects found |
| 497 | + self.objects = [] | ||
| 466 | 498 | ||
| 467 | def open_destination(self, destination): | 499 | def open_destination(self, destination): |
| 468 | if destination.cword == b'objdata': | 500 | if destination.cword == b'objdata': |
| @@ -471,6 +503,10 @@ class RtfObjParser(RtfParser): | @@ -471,6 +503,10 @@ class RtfObjParser(RtfParser): | ||
| 471 | def close_destination(self, destination): | 503 | def close_destination(self, destination): |
| 472 | if destination.cword == b'objdata': | 504 | if destination.cword == b'objdata': |
| 473 | log.debug('*** Close object data at index %Xh' % self.index) | 505 | log.debug('*** Close object data at index %Xh' % self.index) |
| 506 | + rtfobj = RtfObject() | ||
| 507 | + self.objects.append(rtfobj) | ||
| 508 | + rtfobj.start = destination.start | ||
| 509 | + rtfobj.end = destination.end | ||
| 474 | # Filter out all whitespaces first (just ignored): | 510 | # Filter out all whitespaces first (just ignored): |
| 475 | hexdata1 = destination.data.translate(None, b' \t\r\n\f\v') | 511 | hexdata1 = destination.data.translate(None, b' \t\r\n\f\v') |
| 476 | # Then filter out any other non-hex character: | 512 | # Then filter out any other non-hex character: |
| @@ -483,46 +519,26 @@ class RtfObjParser(RtfParser): | @@ -483,46 +519,26 @@ class RtfObjParser(RtfParser): | ||
| 483 | if len(hexdata) & 1: | 519 | if len(hexdata) & 1: |
| 484 | log.debug('Odd length, trimmed last byte.') | 520 | log.debug('Odd length, trimmed last byte.') |
| 485 | hexdata = hexdata[:-1] | 521 | hexdata = hexdata[:-1] |
| 522 | + rtfobj.hexdata = hexdata | ||
| 486 | object_data = binascii.unhexlify(hexdata) | 523 | object_data = binascii.unhexlify(hexdata) |
| 487 | - print('found object size %d at index %08X - end %08X' % (len(object_data), | ||
| 488 | - destination.start, self.index)) | ||
| 489 | - fname = '%s_object_%08X.raw' % (self.fname_prefix, destination.start) | ||
| 490 | - print('saving object to file %s' % fname) | ||
| 491 | - open(fname, 'wb').write(object_data) | 524 | + rtfobj.rawdata = object_data |
| 492 | # TODO: check if all hex data is extracted properly | 525 | # TODO: check if all hex data is extracted properly |
| 493 | 526 | ||
| 494 | obj = OleObject() | 527 | obj = OleObject() |
| 495 | try: | 528 | try: |
| 496 | obj.parse(object_data) | 529 | obj.parse(object_data) |
| 497 | - print('extract file embedded in OLE object:') | ||
| 498 | - print('format_id = %d' % obj.format_id) | ||
| 499 | - print('class name = %r' % obj.class_name) | ||
| 500 | - print('data size = %d' % obj.data_size) | ||
| 501 | - # set a file extension according to the class name: | ||
| 502 | - class_name = obj.class_name.lower() | ||
| 503 | - if class_name.startswith(b'word'): | ||
| 504 | - ext = 'doc' | ||
| 505 | - elif class_name.startswith(b'package'): | ||
| 506 | - ext = 'package' | ||
| 507 | - else: | ||
| 508 | - ext = 'bin' | ||
| 509 | - | ||
| 510 | - fname = '%s_object_%08X.%s' % (self.fname_prefix, destination.start, ext) | ||
| 511 | - print('saving to file %s' % fname) | ||
| 512 | - open(fname, 'wb').write(obj.data) | 530 | + rtfobj.format_id = obj.format_id |
| 531 | + rtfobj.class_name = obj.class_name | ||
| 532 | + rtfobj.oledata_size = obj.data_size | ||
| 533 | + rtfobj.oledata = obj.data | ||
| 534 | + rtfobj.is_ole = True | ||
| 513 | if obj.class_name.lower() == 'package': | 535 | if obj.class_name.lower() == 'package': |
| 514 | - print('Parsing OLE Package') | ||
| 515 | opkg = OleNativeStream(bindata=obj.data) | 536 | opkg = OleNativeStream(bindata=obj.data) |
| 516 | - print('Filename = %r' % opkg.filename) | ||
| 517 | - print('Source path = %r' % opkg.src_path) | ||
| 518 | - print('Temp path = %r' % opkg.temp_path) | ||
| 519 | - if opkg.filename: | ||
| 520 | - fname = '%s_%s' % (self.fname_prefix, | ||
| 521 | - sanitize_filename(opkg.filename)) | ||
| 522 | - else: | ||
| 523 | - fname = '%s_object_%08X.noname' % (self.fname_prefix, destination.start) | ||
| 524 | - print('saving to file %s' % fname) | ||
| 525 | - open(fname, 'wb').write(opkg.data) | 537 | + rtfobj.filename = opkg.filename |
| 538 | + rtfobj.src_path = opkg.src_path | ||
| 539 | + rtfobj.temp_path = opkg.temp_path | ||
| 540 | + rtfobj.olepkgdata = opkg.data | ||
| 541 | + rtfobj.is_package = True | ||
| 526 | except: | 542 | except: |
| 527 | pass | 543 | pass |
| 528 | log.exception('*** Not an OLE 1.0 Object') | 544 | log.exception('*** Not an OLE 1.0 Object') |
| @@ -564,94 +580,6 @@ class RtfObjParser(RtfParser): | @@ -564,94 +580,6 @@ class RtfObjParser(RtfParser): | ||
| 564 | # TODO: backward-compatible API? | 580 | # TODO: backward-compatible API? |
| 565 | 581 | ||
| 566 | 582 | ||
| 567 | -# def search_hex_block(data, pos=0, min_size=32, first=True): | ||
| 568 | -# if first: | ||
| 569 | -# # Search 1st occurence of a hex block: | ||
| 570 | -# match = re_hexblock.search(data, pos=pos) | ||
| 571 | -# else: | ||
| 572 | -# # Match next occurences of a hex block, from the current position only: | ||
| 573 | -# match = re_hexblock.match(data, pos=pos) | ||
| 574 | -# | ||
| 575 | -# | ||
| 576 | -# | ||
| 577 | -# def rtf_iter_objects (data, min_size=32): | ||
| 578 | -# """ | ||
| 579 | -# Open a RTF file, extract each embedded object encoded in hexadecimal of | ||
| 580 | -# size > min_size, yield the index of the object in the RTF file and its data | ||
| 581 | -# in binary format. | ||
| 582 | -# This is an iterator. | ||
| 583 | -# """ | ||
| 584 | -# # Search 1st occurence of a hex block: | ||
| 585 | -# match = re_hexblock.search(data) | ||
| 586 | -# if match is None: | ||
| 587 | -# log.debug('No hex block found.') | ||
| 588 | -# # no hex block found | ||
| 589 | -# return | ||
| 590 | -# while match is not None: | ||
| 591 | -# found = match.group(0) | ||
| 592 | -# # start index | ||
| 593 | -# start = match.start() | ||
| 594 | -# # current position | ||
| 595 | -# current = match.end() | ||
| 596 | -# log.debug('Found hex block starting at %08X, end %08X, size=%d' % (start, current, len(found))) | ||
| 597 | -# if len(found) < min_size: | ||
| 598 | -# log.debug('Too small - size<%d, ignored.' % min_size) | ||
| 599 | -# match = re_hexblock.search(data, pos=current) | ||
| 600 | -# continue | ||
| 601 | -# #log.debug('Match: %s' % found) | ||
| 602 | -# # remove all whitespace and line feeds: | ||
| 603 | -# #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE | ||
| 604 | -# found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') | ||
| 605 | -# # TODO: make it a function | ||
| 606 | -# # Also remove embedded RTF tags: | ||
| 607 | -# found = re_embedded_tags.sub('', found) | ||
| 608 | -# # object data extracted from the RTF file | ||
| 609 | -# # MS Word accepts an extra hex digit, so we need to trim it if present: | ||
| 610 | -# if len(found) & 1: | ||
| 611 | -# log.debug('Odd length, trimmed last byte.') | ||
| 612 | -# found = found[:-1] | ||
| 613 | -# #log.debug('Cleaned match: %s' % found) | ||
| 614 | -# objdata = binascii.unhexlify(found) | ||
| 615 | -# # Detect the "\bin" control word, which is sometimes used for obfuscation: | ||
| 616 | -# bin_match = re_delims_bin_decimal.match(data, pos=current) | ||
| 617 | -# while bin_match is not None: | ||
| 618 | -# log.debug('Found \\bin block starting at %08X : %r' | ||
| 619 | -# % (bin_match.start(), bin_match.group(0))) | ||
| 620 | -# # extract the decimal integer following '\bin' | ||
| 621 | -# bin_len = int(bin_match.group(1)) | ||
| 622 | -# log.debug('\\bin block length = %d' % bin_len) | ||
| 623 | -# if current+bin_len > len(data): | ||
| 624 | -# log.error('\\bin block length is larger than the remaining data') | ||
| 625 | -# # move the current index, ignore the \bin block | ||
| 626 | -# current += len(bin_match.group(0)) | ||
| 627 | -# break | ||
| 628 | -# # read that number of bytes: | ||
| 629 | -# objdata += data[current:current+bin_len] | ||
| 630 | -# # TODO: handle exception | ||
| 631 | -# current += len(bin_match.group(0)) + bin_len | ||
| 632 | -# # TODO: check if current is out of range | ||
| 633 | -# # TODO: is Word limiting the \bin length to a number of digits? | ||
| 634 | -# log.debug('Current position = %08X' % current) | ||
| 635 | -# match = re_delim_hexblock.match(data, pos=current) | ||
| 636 | -# if match is not None: | ||
| 637 | -# log.debug('Found next hex block starting at %08X, end %08X' | ||
| 638 | -# % (match.start(), match.end())) | ||
| 639 | -# found = match.group(0) | ||
| 640 | -# log.debug('Match: %s' % found) | ||
| 641 | -# # remove all whitespace and line feeds: | ||
| 642 | -# #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE | ||
| 643 | -# found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') | ||
| 644 | -# # Also remove embedded RTF tags: | ||
| 645 | -# found = re_embedded_tags.sub(found, '') | ||
| 646 | -# objdata += binascii.unhexlify(found) | ||
| 647 | -# current = match.end() | ||
| 648 | -# bin_match = re_delims_bin_decimal.match(data, pos=current) | ||
| 649 | -# | ||
| 650 | -# # print repr(found) | ||
| 651 | -# if len(objdata)>min_size: | ||
| 652 | -# yield start, current-start, objdata | ||
| 653 | -# # Search next occurence of a hex block: | ||
| 654 | -# match = re_hexblock.search(data, pos=current) | ||
| 655 | 583 | ||
| 656 | 584 | ||
| 657 | 585 | ||
| @@ -693,10 +621,53 @@ def process_file(container, filename, data, output_dir=None): | @@ -693,10 +621,53 @@ def process_file(container, filename, data, output_dir=None): | ||
| 693 | # TODO: option to extract objects to files (false by default) | 621 | # TODO: option to extract objects to files (false by default) |
| 694 | if data is None: | 622 | if data is None: |
| 695 | data = open(filename, 'rb').read() | 623 | data = open(filename, 'rb').read() |
| 696 | - rtfp = RtfObjParser(data, fname_prefix) | 624 | + print('='*79) |
| 625 | + print('File: %r - %d bytes' % (filename, len(data))) | ||
| 626 | + rtfp = RtfObjParser(data) | ||
| 697 | rtfp.parse() | 627 | rtfp.parse() |
| 628 | + for rtfobj in rtfp.objects: | ||
| 629 | + print('-'*79) | ||
| 630 | + print('found object size %d at index %08X - end %08X' | ||
| 631 | + % (len(rtfobj.rawdata), rtfobj.start, rtfobj.end)) | ||
| 632 | + fname = '%s_object_%08X.raw' % (fname_prefix, rtfobj.start) | ||
| 633 | + print('saving object to file %s' % fname) | ||
| 634 | + open(fname, 'wb').write(rtfobj.rawdata) | ||
| 635 | + if rtfobj.is_ole: | ||
| 636 | + print('extract file embedded in OLE object:') | ||
| 637 | + print('format_id = %d' % rtfobj.format_id) | ||
| 638 | + print('class name = %r' % rtfobj.class_name) | ||
| 639 | + print('data size = %d' % rtfobj.oledata_size) | ||
| 640 | + # set a file extension according to the class name: | ||
| 641 | + class_name = rtfobj.class_name.lower() | ||
| 642 | + if class_name.startswith(b'word'): | ||
| 643 | + ext = 'doc' | ||
| 644 | + elif class_name.startswith(b'package'): | ||
| 645 | + ext = 'package' | ||
| 646 | + else: | ||
| 647 | + ext = 'bin' | ||
| 648 | + fname = '%s_object_%08X.%s' % (fname_prefix, rtfobj.start, ext) | ||
| 649 | + print('saving to file %s' % fname) | ||
| 650 | + open(fname, 'wb').write(rtfobj.oledata) | ||
| 651 | + if rtfobj.is_package: | ||
| 652 | + print('Parsing OLE Package') | ||
| 653 | + print('Filename = %r' % rtfobj.filename) | ||
| 654 | + print('Source path = %r' % rtfobj.src_path) | ||
| 655 | + print('Temp path = %r' % rtfobj.temp_path) | ||
| 656 | + if rtfobj.filename: | ||
| 657 | + fname = '%s_%s' % (fname_prefix, | ||
| 658 | + sanitize_filename(rtfobj.filename)) | ||
| 659 | + else: | ||
| 660 | + fname = '%s_object_%08X.noname' % (fname_prefix, rtfobj.start) | ||
| 661 | + print('saving to file %s' % fname) | ||
| 662 | + open(fname, 'wb').write(rtfobj.olepkgdata) | ||
| 663 | + else: | ||
| 664 | + print('Not an OLE Package') | ||
| 665 | + else: | ||
| 666 | + print('Not a well-formed OLE object') | ||
| 667 | + | ||
| 668 | + | ||
| 698 | 669 | ||
| 699 | - # print '-'*79 | 670 | + # print '-'*79 |
| 700 | # print 'File: %r - %d bytes' % (filename, len(data)) | 671 | # print 'File: %r - %d bytes' % (filename, len(data)) |
| 701 | # for index, orig_len, objdata in rtf_iter_objects(data): | 672 | # for index, orig_len, objdata in rtf_iter_objects(data): |
| 702 | # print 'found object size %d at index %08X - end %08X' % (len(objdata), index, index+orig_len) | 673 | # print 'found object size %d at index %08X - end %08X' % (len(objdata), index, index+orig_len) |
| @@ -745,7 +716,7 @@ def process_file(container, filename, data, output_dir=None): | @@ -745,7 +716,7 @@ def process_file(container, filename, data, output_dir=None): | ||
| 745 | 716 | ||
| 746 | #=== MAIN ================================================================= | 717 | #=== MAIN ================================================================= |
| 747 | 718 | ||
| 748 | -if __name__ == '__main__': | 719 | +def main(): |
| 749 | # print banner with version | 720 | # print banner with version |
| 750 | print ('rtfobj %s - http://decalage.info/python/oletools' % __version__) | 721 | print ('rtfobj %s - http://decalage.info/python/oletools' % __version__) |
| 751 | print ('THIS IS WORK IN PROGRESS - Check updates regularly!') | 722 | print ('THIS IS WORK IN PROGRESS - Check updates regularly!') |
| @@ -753,12 +724,13 @@ if __name__ == '__main__': | @@ -753,12 +724,13 @@ if __name__ == '__main__': | ||
| 753 | print ('') | 724 | print ('') |
| 754 | 725 | ||
| 755 | DEFAULT_LOG_LEVEL = "warning" # Default log level | 726 | DEFAULT_LOG_LEVEL = "warning" # Default log level |
| 756 | - LOG_LEVELS = {'debug': logging.DEBUG, | ||
| 757 | - 'info': logging.INFO, | ||
| 758 | - 'warning': logging.WARNING, | ||
| 759 | - 'error': logging.ERROR, | ||
| 760 | - 'critical': logging.CRITICAL | ||
| 761 | - } | 727 | + LOG_LEVELS = { |
| 728 | + 'debug': logging.DEBUG, | ||
| 729 | + 'info': logging.INFO, | ||
| 730 | + 'warning': logging.WARNING, | ||
| 731 | + 'error': logging.ERROR, | ||
| 732 | + 'critical': logging.CRITICAL | ||
| 733 | + } | ||
| 762 | 734 | ||
| 763 | usage = 'usage: %prog [options] <filename> [filename2 ...]' | 735 | usage = 'usage: %prog [options] <filename> [filename2 ...]' |
| 764 | parser = optparse.OptionParser(usage=usage) | 736 | parser = optparse.OptionParser(usage=usage) |
| @@ -803,5 +775,8 @@ if __name__ == '__main__': | @@ -803,5 +775,8 @@ if __name__ == '__main__': | ||
| 803 | process_file(container, filename, data, options.output_dir) | 775 | process_file(container, filename, data, options.output_dir) |
| 804 | 776 | ||
| 805 | 777 | ||
| 778 | +if __name__ == '__main__': | ||
| 779 | + main() | ||
| 780 | + | ||
| 806 | # This code was developed while listening to The Mary Onettes "Lost" | 781 | # This code was developed while listening to The Mary Onettes "Lost" |
| 807 | 782 |