diff --git a/README.md b/README.md index 48cc9d6..b1ee2ef 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,8 @@ Tools in python-oletools: - **olebrowse**: A simple GUI to browse OLE files (e.g. MS Word, Excel, Powerpoint documents), to view and extract individual data streams. -- **pyxswf**: a script to detect, extract and analyze Flash objects (SWF) that may +- **oleid**: a tool to analyze OLE files to detect specific characteristics that could potentially indicate that the file is suspicious or malicious. +- **pyxswf**: a tool to detect, extract and analyze Flash objects (SWF) that may be embedded in files such as MS Office documents (e.g. Word, Excel), which is especially useful for malware analysis. - and a few others (coming soon) @@ -18,7 +19,8 @@ Tools in python-oletools: News ---- -- 2012-10-09: Initial version of olebrowse and pyxswf +- 2012-10-29 v0.02: Added oleid +- 2012-10-09 v0.01: Initial version of olebrowse and pyxswf - see changelog in source code for more info. Download: @@ -39,6 +41,35 @@ If you provide a file it will be opened, else a dialog will allow you to browse For screenshots and other info, see [http://www.decalage.info/python/olebrowse](http://www.decalage.info/python/olebrowse) +oleid: +------ + +oleid is a script to analyze OLE files such as MS Office documents (e.g. Word, +Excel), to detect specific characteristics that could potentially indicate that +the file is suspicious or malicious, in terms of security (e.g. malware). +For example it can detect VBA macros, embedded Flash objects, fragmentation. 
+ + Usage: oleid.py <file> + +Example - analyzing a Word document containing a Flash object and VBA macros: + + C:\oletools>oleid.py word_flash_vba.doc + Filename: word_flash_vba.doc + OLE format: True + Has SummaryInformation stream: True + Application name: Microsoft Office Word + Encrypted: False + Word Document: True + VBA Macros: True + Excel Workbook: False + PowerPoint Presentation: False + Visio Drawing: False + ObjectPool: True + Flash objects: 1 + +oleid project website: [http://www.decalage.info/python/oleid](http://www.decalage.info/python/oleid) + + pyxswf: -------- diff --git a/oletools/README.txt b/oletools/README.txt index e0296d9..73b4c51 100644 --- a/oletools/README.txt +++ b/oletools/README.txt @@ -20,15 +20,19 @@ Tools in python-oletools: - **olebrowse**: A simple GUI to browse OLE files (e.g. MS Word, Excel, Powerpoint documents), to view and extract individual data streams. -- **pyxswf**: a script to detect, extract and analyze Flash objects - (SWF) that may be embedded in files such as MS Office documents (e.g. - Word, Excel), which is especially useful for malware analysis. +- **oleid**: a tool to analyze OLE files to detect specific + characteristics that could potentially indicate that the file is + suspicious or malicious. +- **pyxswf**: a tool to detect, extract and analyze Flash objects (SWF) + that may be embedded in files such as MS Office documents (e.g. Word, + Excel), which is especially useful for malware analysis. - and a few others (coming soon) News ---- -- 2012-10-09: Initial version of olebrowse and pyxswf +- 2012-10-29 v0.02: Added oleid +- 2012-10-09 v0.01: Initial version of olebrowse and pyxswf - see changelog in source code for more info. Download: @@ -56,6 +60,41 @@ file for further analysis. For screenshots and other info, see `http://www.decalage.info/python/olebrowse <http://www.decalage.info/python/olebrowse>`_ +oleid: +------ + +oleid is a script to analyze OLE files such as MS Office documents (e.g. 
+Word, Excel), to detect specific characteristics that could potentially +indicate that the file is suspicious or malicious, in terms of security +(e.g. malware). For example it can detect VBA macros, embedded Flash +objects, fragmentation. + +:: + + Usage: oleid.py <file> + +Example - analyzing a Word document containing a Flash object and VBA +macros: + +:: + + C:\oletools>oleid.py word_flash_vba.doc + Filename: word_flash_vba.doc + OLE format: True + Has SummaryInformation stream: True + Application name: Microsoft Office Word + Encrypted: False + Word Document: True + VBA Macros: True + Excel Workbook: False + PowerPoint Presentation: False + Visio Drawing: False + ObjectPool: True + Flash objects: 1 + +oleid project website: +`http://www.decalage.info/python/oleid <http://www.decalage.info/python/oleid>`_ + pyxswf: ------- diff --git a/oletools/oleid.py b/oletools/oleid.py new file mode 100644 index 0000000..a9d29d7 --- /dev/null +++ b/oletools/oleid.py @@ -0,0 +1,266 @@ +#!/usr/bin/env python +""" +oleid.py - Philippe Lagadec 2012-10-18 + +oleid is a script to analyze OLE files such as MS Office documents (e.g. Word, +Excel), to detect specific characteristics that could potentially indicate that +the file is suspicious or malicious, in terms of security (e.g. malware). +For example it can detect VBA macros, embedded Flash objects, fragmentation. +The results can be displayed or returned as XML for further processing. + +Usage: oleid.py <file> + +oleid project website: http://www.decalage.info/python/oleid + +oleid is part of the python-oletools package: +http://www.decalage.info/python/oletools + +oleid is copyright (c) 2012, Philippe Lagadec (http://www.decalage.info) +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" + +__version__ = '0.01' + +#------------------------------------------------------------------------------ +# CHANGELOG: +# 2012-10-29 v0.01 PL: - first version + +#------------------------------------------------------------------------------ +# TODO: +# - detect RTF and OpenXML +# - fragmentation +# - OLE package +# - entropy +# - detect PE header? +# - detect NOPs? +# - list type of each object in object pool? +# - criticality for each indicator?: info, low, medium, high +# - support wildcards with glob? 
# - verbose option
# - csv, xml output

import optparse
import sys
import os
import re
import zlib
import struct

from thirdparty.OleFileIO_PL import OleFileIO_PL


class Indicator(object):
    """
    A single characteristic measured on a file (e.g. "VBA Macros: True").

    Attributes:
        id: short machine-friendly identifier of the indicator.
        value: current value of the indicator (may be updated by later checks).
        type: expected Python type of the value (bool by default).
        name: human-readable label; falls back to the id when not provided.
        description: optional longer explanation of the indicator.
    """

    def __init__(self, _id, value=None, _type=bool, name=None, description=None):
        self.id = _id
        self.value = value
        self.type = _type
        # use the identifier as display name when no explicit name is given
        self.name = _id if name is None else name
        self.description = description


def detect_flash(data):
    """
    Detect Flash objects (SWF files) within a binary string of data.

    Every occurrence of a SWF signature ("CWS" or "FWS") is treated as a
    candidate header: signature (3 bytes), version (1 byte) and declared file
    length (4 bytes). Candidates are rejected when the header does not fit in
    the remaining data, the version is above 20, the declared size is smaller
    than 1024 bytes or larger than the remaining data, or (compressed SWF
    only) the payload cannot be decompressed with zlib. Uncompressed SWF are
    not validated further, so false positives are possible.

    Code inspired from xxxswf.py by Alexander Hanel (but significantly reworked)
    http://hooked-on-mnemonics.blogspot.nl/2011/12/xxxswfpy.html

    :param data: binary string (str on Python 2, bytes on Python 3) to scan.
    :return: list of (start_index, length, compressed) tuples, or [] if
        nothing is found.
    """
    #TODO: report
    found = []
    for match in re.finditer(b'CWS|FWS', data):
        start = match.start()
        if start + 8 > len(data):
            # header size larger than remaining data, this is not a SWF
            continue
        # Read the 3-byte signature
        header = data[start:start + 3]
        # Read version (1 unsigned byte) and declared size (4 bytes,
        # little-endian) in one go. Slicing (instead of indexing) keeps this
        # working for both str and bytes input.
        ver, size = struct.unpack('<BI', data[start + 3:start + 8])
        # Error check for version above 20
        #TODO: is this accurate? (check SWF specifications)
        if ver > 20:
            continue
        if start + size > len(data) or size < 1024:
            # declared size larger than remaining data, this is not a SWF
            # or declared size too small for a usual SWF
            continue
        # Candidate SWF as declared by its header
        swf = data[start:start + size]
        compressed = header == b'CWS'
        if compressed:
            # compressed SWF: data after header (8 bytes) until the end is
            # compressed with zlib. Attempt to decompress it to check if it is
            # valid
            try:
                zlib.decompress(swf[8:])
            except zlib.error:
                continue
        # else we don't check anything at this stage, we only assume it is a
        # valid SWF. So there might be false positives for uncompressed SWF.
        found.append((start, size, compressed))
    return found


class OleID(object):
    """
    Run a series of checks on one file and collect the results as a list of
    Indicator objects.

    Typical use: ``indicators = OleID(filename).check()``.
    """

    def __init__(self, filename):
        self.filename = filename
        self.indicators = []

    def check(self):
        """
        Open the file, run all the checks and return the list of indicators.

        If the file is not an OLE file, only the 'ole_format' indicator
        (set to False) is returned. The OLE file is always closed, even when
        one of the checks raises an exception.

        :return: list of Indicator objects (also stored in self.indicators).
        """
        # start from a fresh list so that calling check() twice does not
        # duplicate every indicator
        self.indicators = []
        # check if it is actually an OLE file:
        oleformat = Indicator('ole_format', True, name='OLE format')
        self.indicators.append(oleformat)
        if not OleFileIO_PL.isOleFile(self.filename):
            oleformat.value = False
            return self.indicators
        # parse file:
        self.ole = OleFileIO_PL.OleFileIO(self.filename)
        try:
            # checks (order matters: check_properties fills self.suminfo used
            # by check_encrypted, which creates self.encrypted used by
            # check_word, which creates self.macros used by check_excel):
            self.check_properties()
            self.check_encrypted()
            self.check_word()
            self.check_excel()
            self.check_powerpoint()
            self.check_visio()
            self.check_ObjectPool()
            self.check_flash()
        finally:
            self.ole.close()
        return self.indicators

    def check_properties(self):
        """
        Add the 'has_suminfo' and 'appname' indicators, and store the
        properties of the SummaryInformation stream (if any) in self.suminfo.
        """
        suminfo = Indicator('has_suminfo', False, name='Has SummaryInformation stream')
        self.indicators.append(suminfo)
        appname = Indicator('appname', 'unknown', _type=str, name='Application name')
        self.indicators.append(appname)
        self.suminfo = {}
        # check stream SummaryInformation
        if self.ole.exists("\x05SummaryInformation"):
            suminfo.value = True
            self.suminfo = self.ole.getproperties("\x05SummaryInformation")
            # check application name (property 0x12):
            appname.value = self.suminfo.get(0x12, 'unknown')

    def check_encrypted(self):
        """
        Add the 'encrypted' indicator, based on the security property (0x13)
        read by check_properties.
        """
        # we keep the pointer to the indicator, can be modified by other checks:
        self.encrypted = Indicator('encrypted', False, name='Encrypted')
        self.indicators.append(self.encrypted)
        # check if bit 1 of security field = 1:
        # (this field may be missing for Powerpoint2000, for example)
        if self.suminfo.get(0x13, 0) & 1:
            self.encrypted.value = True

    def check_word(self):
        """
        Add the 'word' and 'vba_macros' indicators. For Word documents, also
        update the 'encrypted' indicator from the Word-specific encryption
        flag found in the WordDocument stream.
        """
        word = Indicator('word', False, name='Word Document',
            description='Contains a WordDocument stream, very likely to be a Microsoft Word Document.')
        self.indicators.append(word)
        self.macros = Indicator('vba_macros', False, name='VBA Macros')
        self.indicators.append(self.macros)
        if self.ole.exists('WordDocument'):
            word.value = True
            # check for Word-specific encryption flag:
            stream = self.ole.openstream(["WordDocument"])
            try:
                # pass header 10 bytes
                stream.read(10)
                # read flag structure (16 bits, little-endian as everywhere
                # in the file format, whatever the byte order of the host):
                raw_flags = stream.read(2)
            finally:
                stream.close()
            # a truncated stream yields fewer than 2 bytes: no flag to test
            if len(raw_flags) == 2:
                flags = struct.unpack('<H', raw_flags)[0]
                f_encrypted = (flags & 0x0100) >> 8
                if f_encrypted:
                    self.encrypted.value = True
            # check for VBA macros:
            if self.ole.exists('Macros'):
                self.macros.value = True

    def check_excel(self):
        """
        Add the 'excel' indicator, and set the 'vba_macros' indicator (created
        by check_word) when the workbook contains a VBA project storage.
        """
        excel = Indicator('excel', False, name='Excel Workbook',
            description='Contains a Workbook or Book stream, very likely to be a Microsoft Excel Workbook.')
        self.indicators.append(excel)
        if self.ole.exists('Workbook') or self.ole.exists('Book'):
            excel.value = True
            # check for VBA macros:
            if self.ole.exists('_VBA_PROJECT_CUR'):
                self.macros.value = True

    def check_powerpoint(self):
        """Add the 'ppt' indicator (presence of a 'PowerPoint Document' stream)."""
        ppt = Indicator('ppt', False, name='PowerPoint Presentation',
            description='Contains a PowerPoint Document stream, very likely to be a Microsoft PowerPoint Presentation.')
        self.indicators.append(ppt)
        if self.ole.exists('PowerPoint Document'):
            ppt.value = True

    def check_visio(self):
        """Add the 'visio' indicator (presence of a 'VisioDocument' stream)."""
        visio = Indicator('visio', False, name='Visio Drawing',
            description='Contains a VisioDocument stream, very likely to be a Microsoft Visio Drawing.')
        self.indicators.append(visio)
        if self.ole.exists('VisioDocument'):
            visio.value = True

    def check_ObjectPool(self):
        """Add the 'ObjectPool' indicator (presence of an 'ObjectPool' entry)."""
        objpool = Indicator('ObjectPool', False, name='ObjectPool',
            description='Contains an ObjectPool stream, very likely to contain embedded OLE objects or files.')
        self.indicators.append(objpool)
        if self.ole.exists('ObjectPool'):
            objpool.value = True

    def check_flash(self):
        """
        Add the 'flash' indicator: total number of Flash objects found by
        detect_flash in all the streams of the OLE file.
        """
        flash = Indicator('flash', 0, _type=int, name='Flash objects',
            description='Number of embedded Flash objects (SWF files) detected in OLE streams. Not 100% accurate, there may be false positives.')
        self.indicators.append(flash)
        for stream_path in self.ole.listdir():
            stream = self.ole.openstream(stream_path)
            try:
                data = stream.read()
            finally:
                stream.close()
            # just add to the count of Flash objects:
            flash.value += len(detect_flash(data))


def main():
    """
    Command-line entry point: analyze each file given as argument and print
    the name and value of every indicator.
    """
    usage = 'usage: %prog [options] <file>'
    parser = optparse.OptionParser(usage=usage)

    (options, args) = parser.parse_args()

    # Print help if no arguments are passed
    if not args:
        parser.print_help()
        return

    for filename in args:
        # single-argument print(): same output on Python 2 and Python 3
        print('\nFilename: %s' % filename)
        oleid = OleID(filename)
        indicators = oleid.check()
        for indicator in indicators:
            print('%s: %s' % (indicator.name, indicator.value))


if __name__ == '__main__':
    main()