From 7e150ed3635bbf18437aa16e04a6bc78d63783fd Mon Sep 17 00:00:00 2001 From: Philippe Lagadec Date: Thu, 18 Apr 2013 23:51:58 +0200 Subject: [PATCH] oletools v0.04: Fixed bug in rtfobj, added documentation for rtfobj --- README.md | 24 +++++++++++++++++++++++- oletools/README.txt | 34 +++++++++++++++++++++++++++++++++- oletools/rtfobj.py | 13 +++++++++---- 3 files changed, 65 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 019cb87..f845319 100644 --- a/README.md +++ b/README.md @@ -14,11 +14,13 @@ Tools in python-oletools: - **pyxswf**: a tool to detect, extract and analyze Flash objects (SWF) that may be embedded in files such as MS Office documents (e.g. Word, Excel) and RTF, which is especially useful for malware analysis. +- **rtfobj**: a tool and python module to extract embedded objects from RTF files. - and a few others (coming soon) News ---- +- 2013-04-18 v0.04: fixed bug in rtfobj, added documentation for rtfobj - 2012-11-09 v0.03: Improved pyxswf to extract Flash objects from RTF - 2012-10-29 v0.02: Added oleid - 2012-10-09 v0.01: Initial version of olebrowse and pyxswf @@ -137,6 +139,26 @@ Example 2 - detecting and extracting a SWF file from a RTF document on Windows: For more info, see [http://www.decalage.info/python/pyxswf](http://www.decalage.info/python/pyxswf) +rtfobj +------ + +rtfobj is a Python module to extract embedded objects from RTF files, such as +OLE objects. It can be used as a Python library or a command-line tool. + + Usage: rtfobj.py <file.rtf> + +It extracts and decodes all the data blocks encoded as hexadecimal in the RTF document, and saves them as files named "object_xxxx.bin", xxxx being the location of the object in the RTF file. + +Usage as python module: rtf_iter_objects(filename) is an iterator which yields a tuple (index, object) providing the index of each hexadecimal stream in the RTF file, and the corresponding decoded object. 
Example: + + import rtfobj + for index, data in rtfobj.rtf_iter_objects("myfile.rtf"): + print 'found object size %d at index %08X' % (len(data), index) + + +For more info, see [http://www.decalage.info/python/rtfobj](http://www.decalage.info/python/rtfobj) + + How to contribute: ------------------ @@ -154,7 +176,7 @@ License This license applies to the python-oletools package, apart from the thirdparty folder which contains third-party files published with their own license. -The python-oletools package is copyright (c) 2012, Philippe Lagadec (http://www.decalage.info) +The python-oletools package is copyright (c) 2012-2013, Philippe Lagadec (http://www.decalage.info) All rights reserved. Redistribution and use in source and binary forms, with or without modification, diff --git a/oletools/README.txt b/oletools/README.txt index 601594a..0e25065 100644 --- a/oletools/README.txt +++ b/oletools/README.txt @@ -26,11 +26,14 @@ Tools in python-oletools: - **pyxswf**: a tool to detect, extract and analyze Flash objects (SWF) that may be embedded in files such as MS Office documents (e.g. Word, Excel) and RTF, which is especially useful for malware analysis. +- **rtfobj**: a tool and python module to extract embedded objects from + RTF files. - and a few others (coming soon) News ---- +- 2013-04-18 v0.04: fixed bug in rtfobj, added documentation for rtfobj - 2012-11-09 v0.03: Improved pyxswf to extract Flash objects from RTF - 2012-10-29 v0.02: Added oleid - 2012-10-09 v0.01: Initial version of olebrowse and pyxswf @@ -174,6 +177,35 @@ Windows: For more info, see `http://www.decalage.info/python/pyxswf <http://www.decalage.info/python/pyxswf>`_ +rtfobj +------ + +rtfobj is a Python module to extract embedded objects from RTF files, +such as OLE objects. It can be used as a Python library or a command-line +tool. 
+ +:: + + Usage: rtfobj.py <file.rtf> + +It extracts and decodes all the data blocks encoded as hexadecimal in +the RTF document, and saves them as files named "object\_xxxx.bin", xxxx +being the location of the object in the RTF file. + +Usage as python module: rtf\_iter\_objects(filename) is an iterator +which yields a tuple (index, object) providing the index of each +hexadecimal stream in the RTF file, and the corresponding decoded +object. Example: + +:: + + import rtfobj + for index, data in rtfobj.rtf_iter_objects("myfile.rtf"): + print 'found object size %d at index %08X' % (len(data), index) + +For more info, see +`http://www.decalage.info/python/rtfobj <http://www.decalage.info/python/rtfobj>`_ + How to contribute: ------------------ @@ -199,7 +231,7 @@ This license applies to the python-oletools package, apart from the thirdparty folder which contains third-party files published with their own license. -The python-oletools package is copyright (c) 2012, Philippe Lagadec +The python-oletools package is copyright (c) 2012-2013, Philippe Lagadec (http://www.decalage.info) All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/oletools/rtfobj.py b/oletools/rtfobj.py index 96539bc..8e5efd3 100644 --- a/oletools/rtfobj.py +++ b/oletools/rtfobj.py @@ -1,6 +1,6 @@ #!/usr/bin/env python """ -rtfobj.py - Philippe Lagadec 2012-11-09 +rtfobj.py - Philippe Lagadec 2013-04-02 rtfobj is a Python module to extract embedded objects from RTF files, such as OLE ojects. It can be used as a Python library or a command-line tool. @@ -12,7 +12,7 @@ rtfobj project website: http://www.decalage.info/python/rtfobj rtfobj is part of the python-oletools package: http://www.decalage.info/python/oletools -rtfobj is copyright (c) 2012, Philippe Lagadec (http://www.decalage.info) +rtfobj is copyright (c) 2012-2013, Philippe Lagadec (http://www.decalage.info) All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, @@ -36,15 +36,18 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ -__version__ = '0.01' +__version__ = '0.02' #------------------------------------------------------------------------------ # CHANGELOG: # 2012-11-09 v0.01 PL: - first version +# 2013-04-02 v0.02 PL: - fixed bug in main #------------------------------------------------------------------------------ # TODO: # - improve regex pattern for better performance? +# - allow semicolon within hex, as found in this sample: +# http://contagiodump.blogspot.nl/2011/10/sep-28-cve-2010-3333-manuscript-with.html import re, sys, string, binascii @@ -54,6 +57,8 @@ import re, sys, string, binascii # several hex chars, at least 4: (?:[0-9A-Fa-f]{2}){4,} # at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s* PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}' +# improved pattern, allowing semicolons within hex: +#PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}' # a dummy translation table for str.translate, which does not change anythying: TRANSTABLE_NOCHANGE = string.maketrans('', '') @@ -78,7 +83,7 @@ def rtf_iter_objects (filename, min_size=32): yield m.start(), found if __name__ == '__main__': - if len(sys.argv<2): + if len(sys.argv)<2: sys.exit(__doc__) for index, data in rtf_iter_objects(sys.argv[1]): print 'found object size %d at index %08X' % (len(data), index) -- libgit2 0.21.4