diff --git a/oletools/common/errors.py b/oletools/common/errors.py new file mode 100644 index 0000000..4ee4cb1 --- /dev/null +++ b/oletools/common/errors.py @@ -0,0 +1,13 @@ +""" +Errors used in several tools to avoid duplication + +.. codeauthor:: Intra2net AG +""" + +class FileIsEncryptedError(ValueError): + """Exception thrown if file is encrypted and cannot deal with it.""" + # see also: same class in olevba[3] and record_base + def __init__(self, filename=None): + super(FileIsEncryptedError, self).__init__( + 'Office file {}is encrypted, not yet supported' + .format('' if filename is None else filename + ' ')) diff --git a/oletools/msodde.py b/oletools/msodde.py index 69eac6c..fe335a6 100644 --- a/oletools/msodde.py +++ b/oletools/msodde.py @@ -11,6 +11,7 @@ Supported formats: - RTF - CSV (exported from / imported into Excel) - XML (exported from Word 2003, Word 2007+, Excel 2003, (Excel 2007+?) +- raises an error if run with files encrypted using MS Crypto API RC4 Author: Philippe Lagadec - http://www.decalage.info License: BSD, see source code or documentation @@ -61,7 +62,9 @@ import olefile from oletools import ooxml from oletools import xls_parser from oletools import rtfobj +from oletools import oleid from oletools.common.log_helper import log_helper +from oletools.common.errors import FileIsEncryptedError # ----------------------------------------------------------------------------- # CHANGELOG: @@ -84,6 +87,7 @@ from oletools.common.log_helper import log_helper # 2018-01-10 CH: - add single-xml files (Word 2003/2007+ / Excel 2003) # 2018-03-21 CH: - added detection for various CSV formulas (issue #259) # 2018-09-11 v0.54 PL: - olefile is now a dependency +# 2018-10-25 CH: - detect encryption and raise error if detected __version__ = '0.54dev1' @@ -438,17 +442,18 @@ def process_doc_stream(stream): return result_parts -def process_doc(filepath): +def process_doc(ole): """ find dde links in word ole (.doc/.dot) file + Checks whether files is ppt and 
returns empty immediately in that case + (ppt files cannot contain DDE-links to my knowledge) + like process_xml, returns a concatenated unicode string of dde links or empty if none were found. dde-links will still begin with the dde[auto] key word (possibly after some whitespace) """ logger.debug('process_doc') - ole = olefile.OleFileIO(filepath, path_encoding=None) - links = [] for sid, direntry in enumerate(ole.direntries): is_orphan = direntry is None @@ -703,8 +708,8 @@ def process_xlsx(filepath): log_func = logger.debug else: # default log_func = logger.info - log_func('Failed to parse {0} of content type {1}' - .format(subfile, content_type)) + log_func('Failed to parse {0} of content type {1} ("{2}")' + .format(subfile, content_type, str(exc))) # in any case: continue with next return u'\n'.join(dde_links) @@ -886,9 +891,20 @@ def process_file(filepath, field_filter_mode=None): if xls_parser.is_xls(filepath): logger.debug('Process file as excel 2003 (xls)') return process_xls(filepath) + + # encrypted files also look like ole, even if office 2007+ (xml-based) + # so check for encryption, first + ole = olefile.OleFileIO(filepath, path_encoding=None) + oid = oleid.OleID(ole) + if oid.check_encrypted().value: + logger.debug('is encrypted - raise error') + raise FileIsEncryptedError(filepath) + elif oid.check_powerpoint().value: + logger.debug('is ppt - cannot have DDE') + return u'' else: logger.debug('Process file as word 2003 (doc)') - return process_doc(filepath) + return process_doc(ole) with open(filepath, 'rb') as file_handle: if file_handle.read(4) == RTF_START: diff --git a/oletools/oleid.py b/oletools/oleid.py index 7e828b9..0a6cba1 100644 --- a/oletools/oleid.py +++ b/oletools/oleid.py @@ -6,9 +6,8 @@ oleid is a script to analyze OLE files such as MS Office documents (e.g. Word, Excel), to detect specific characteristics that could potentially indicate that the file is suspicious or malicious, in terms of security (e.g. malware). 
For example it can detect VBA macros, embedded Flash objects, fragmentation. -The results can be displayed or returned as XML for further processing. - -Usage: oleid.py +The results are displayed as an ASCII table (but could be returned or printed +in other formats like CSV, XML or JSON in future). oleid project website: http://www.decalage.info/python/oleid @@ -21,8 +20,8 @@ http://www.decalage.info/python/oletools # oleid is copyright (c) 2012-2018, Philippe Lagadec (http://www.decalage.info) # All rights reserved. # -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. @@ -30,16 +29,17 @@ http://www.decalage.info/python/oletools # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. # To improve Python 2+3 compatibility: from __future__ import print_function @@ -56,6 +56,8 @@ from __future__ import print_function # 2017-04-26 PL: - fixed absolute imports (issue #141) # 2017-09-01 SA: - detect OpenXML encryption # 2018-09-11 v0.54 PL: - olefile is now a dependency +# 2018-10-19 CH: - accept olefile as well as filename, return Indicators, +# improve encryption detection for ppt __version__ = '0.54dev1' @@ -78,28 +80,27 @@ __version__ = '0.54dev1' #=== IMPORTS ================================================================= -import optparse, sys, os, re, zlib, struct +import argparse, sys, re, zlib, struct +from os.path import dirname, abspath -# IMPORTANT: it should be possible to run oletools directly as scripts -# in any directory without installing them with pip or setup.py. -# In that case, relative imports are NOT usable. 
-# And to enable Python 2+3 compatibility, we need to use absolute imports, -# so we add the oletools parent folder to sys.path (absolute+normalized path): -_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) -# print('_thismodule_dir = %r' % _thismodule_dir) -_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) -# print('_parent_dir = %r' % _thirdparty_dir) -if not _parent_dir in sys.path: - sys.path.insert(0, _parent_dir) +# little hack to allow absolute imports even if oletools is not installed +# (required to run oletools directly as scripts in any directory). +try: + from oletools.thirdparty import prettytable +except ImportError: + PARENT_DIR = dirname(dirname(abspath(__file__))) + if PARENT_DIR not in sys.path: + sys.path.insert(0, PARENT_DIR) + del PARENT_DIR + from oletools.thirdparty import prettytable import olefile -from oletools.thirdparty.prettytable import prettytable #=== FUNCTIONS =============================================================== -def detect_flash (data): +def detect_flash(data): """ Detect Flash objects (SWF files) within a binary string of data return a list of (start_index, length, compressed) tuples, or [] if nothing @@ -141,7 +142,7 @@ def detect_flash (data): compressed_data = swf[8:] try: zlib.decompress(compressed_data) - except: + except Exception: continue # else we don't check anything at this stage, we only assume it is a # valid SWF. So there might be false positives for uncompressed SWF. @@ -152,9 +153,15 @@ def detect_flash (data): #=== CLASSES ================================================================= -class Indicator (object): +class Indicator(object): + """ + Piece of information of an :py:class:`OleID` object. + + Contains an ID, value, type, name and description. No other functionality. 
+ """ - def __init__(self, _id, value=None, _type=bool, name=None, description=None): + def __init__(self, _id, value=None, _type=bool, name=None, + description=None): self.id = _id self.value = value self.type = _type @@ -164,21 +171,55 @@ class Indicator (object): self.description = description -class OleID: +class OleID(object): + """ + Summary of information about an OLE file - def __init__(self, filename): - self.filename = filename + Call :py:meth:`OleID.check` to gather all info on a given file or run one + of the `check_` functions to just get a specific piece of info. + """ + + def __init__(self, input_file): + """ + Create an OleID object + + This does not run any checks yet nor open the file. + + Can either give just a filename (as str), so OleID will check whether + that is a valid OLE file and create a :py:class:`olefile.OleFileIO` + object for it. Or you can give an already opened + :py:class:`olefile.OleFileIO` as argument to avoid re-opening (e.g. if + called from other oletools). + + If filename is given, only :py:meth:`OleID.check` opens the file. Other + functions will return None + """ + if isinstance(input_file, olefile.OleFileIO): + self.ole = input_file + self.filename = None + else: + self.filename = input_file + self.ole = None self.indicators = [] + self.suminfo_data = None def check(self): + """ + Open file and run all checks on it. 
+ + :returns: list of all :py:class:`Indicator`s created + """ # check if it is actually an OLE file: oleformat = Indicator('ole_format', True, name='OLE format') self.indicators.append(oleformat) - if not olefile.isOleFile(self.filename): + if self.ole: + oleformat.value = True + elif not olefile.isOleFile(self.filename): oleformat.value = False return self.indicators - # parse file: - self.ole = olefile.OleFileIO(self.filename) + else: + # parse file: + self.ole = olefile.OleFileIO(self.filename) # checks: self.check_properties() self.check_encrypted() @@ -186,143 +227,274 @@ class OleID: self.check_excel() self.check_powerpoint() self.check_visio() - self.check_ObjectPool() + self.check_object_pool() self.check_flash() self.ole.close() return self.indicators - def check_properties (self): - suminfo = Indicator('has_suminfo', False, name='Has SummaryInformation stream') + def check_properties(self): + """ + Read summary information required for other check_* functions + + :returns: 2 :py:class:`Indicator`s (for presence of summary info and + application name) or None if file was not opened + """ + suminfo = Indicator('has_suminfo', False, + name='Has SummaryInformation stream') self.indicators.append(suminfo) - appname = Indicator('appname', 'unknown', _type=str, name='Application name') + appname = Indicator('appname', 'unknown', _type=str, + name='Application name') self.indicators.append(appname) - self.suminfo = {} - # check stream SummaryInformation + if not self.ole: + return None, None + self.suminfo_data = {} + # check stream SummaryInformation (not present e.g. 
in encrypted ppt) if self.ole.exists("\x05SummaryInformation"): suminfo.value = True - self.suminfo = self.ole.getproperties("\x05SummaryInformation") + self.suminfo_data = self.ole.getproperties("\x05SummaryInformation") # check application name: - appname.value = self.suminfo.get(0x12, 'unknown') - - def check_encrypted (self): + appname.value = self.suminfo_data.get(0x12, 'unknown') + return suminfo, appname + + def get_indicator(self, indicator_id): + """Helper function: returns an indicator if present (or None)""" + result = [indicator for indicator in self.indicators + if indicator.id == indicator_id] + if result: + return result[0] + else: + return None + + def check_encrypted(self): + """ + Check whether this file is encrypted. + + Might call check_properties. + + :returns: :py:class:`Indicator` for encryption or None if file was not + opened + """ # we keep the pointer to the indicator, can be modified by other checks: - self.encrypted = Indicator('encrypted', False, name='Encrypted') - self.indicators.append(self.encrypted) + encrypted = Indicator('encrypted', False, name='Encrypted') + self.indicators.append(encrypted) + if not self.ole: + return None # check if bit 1 of security field = 1: # (this field may be missing for Powerpoint2000, for example) - if 0x13 in self.suminfo: - if self.suminfo[0x13] & 1: - self.encrypted.value = True + if self.suminfo_data is None: + self.check_properties() + if 0x13 in self.suminfo_data: + if self.suminfo_data[0x13] & 1: + encrypted.value = True # check if this is an OpenXML encrypted file elif self.ole.exists('EncryptionInfo'): - self.encrypted.value = True - - def check_word (self): - word = Indicator('word', False, name='Word Document', - description='Contains a WordDocument stream, very likely to be a Microsoft Word Document.') + encrypted.value = True + # or an encrypted ppt file + if self.ole.exists('EncryptedSummary') and \ + not self.ole.exists('SummaryInformation'): + encrypted.value = True + return encrypted 
+ + def check_word(self): + """ + Check whether this file is a word document + + If this finds evidence of encryption, will correct/add encryption + indicator. + + :returns: 2 :py:class:`Indicator`s (for word and vba_macro) or + (None, None) if file was not opened + """ + word = Indicator( + 'word', False, name='Word Document', + description='Contains a WordDocument stream, very likely to be a ' + 'Microsoft Word Document.') self.indicators.append(word) - self.macros = Indicator('vba_macros', False, name='VBA Macros') - self.indicators.append(self.macros) + macros = Indicator('vba_macros', False, name='VBA Macros') + self.indicators.append(macros) + if not self.ole: + return None, None if self.ole.exists('WordDocument'): word.value = True # check for Word-specific encryption flag: - s = self.ole.openstream(["WordDocument"]) - # pass header 10 bytes - s.read(10) - # read flag structure: - temp16 = struct.unpack("H", s.read(2))[0] - fEncrypted = (temp16 & 0x0100) >> 8 - if fEncrypted: - self.encrypted.value = True - s.close() + stream = None + try: + stream = self.ole.openstream(["WordDocument"]) + # pass header 10 bytes + stream.read(10) + # read flag structure: + temp16 = struct.unpack("H", stream.read(2))[0] + f_encrypted = (temp16 & 0x0100) >> 8 + if f_encrypted: + # correct encrypted indicator if present or add one + encrypt_ind = self.get_indicator('encrypted') + if encrypt_ind: + encrypt_ind.value = True + else: + self.indicators.append(Indicator('encrypted', True, name='Encrypted')) + except Exception: + raise + finally: + if stream is not None: + stream.close() # check for VBA macros: if self.ole.exists('Macros'): - self.macros.value = True + macros.value = True + return word, macros + + def check_excel(self): + """ + Check whether this file is an excel workbook. + + If this finds macros, will add/correct macro indicator. 
- def check_excel (self): - excel = Indicator('excel', False, name='Excel Workbook', - description='Contains a Workbook or Book stream, very likely to be a Microsoft Excel Workbook.') + see also: :py:func:`xls_parser.is_xls` + + :returns: :py:class:`Indicator` for excel or None if file was + not opened + """ + excel = Indicator( + 'excel', False, name='Excel Workbook', + description='Contains a Workbook or Book stream, very likely to be ' + 'a Microsoft Excel Workbook.') self.indicators.append(excel) + if not self.ole: + return None #self.macros = Indicator('vba_macros', False, name='VBA Macros') #self.indicators.append(self.macros) if self.ole.exists('Workbook') or self.ole.exists('Book'): excel.value = True # check for VBA macros: if self.ole.exists('_VBA_PROJECT_CUR'): - self.macros.value = True - - def check_powerpoint (self): - ppt = Indicator('ppt', False, name='PowerPoint Presentation', - description='Contains a PowerPoint Document stream, very likely to be a Microsoft PowerPoint Presentation.') + # correct macro indicator if present or add one + macro_ind = self.get_indicator('vba_macros') + if macro_ind: + macro_ind.value = True + else: + self.indicators.append(Indicator('vba_macros', True, + name='VBA Macros')) + return excel + + def check_powerpoint(self): + """ + Check whether this file is a powerpoint presentation + + see also: :py:func:`ppt_record_parser.is_ppt` + + :returns: :py:class:`Indicator` for whether this is a powerpoint + presentation or not or None if file was not opened + """ + ppt = Indicator( + 'ppt', False, name='PowerPoint Presentation', + description='Contains a PowerPoint Document stream, very likely to ' + 'be a Microsoft PowerPoint Presentation.') self.indicators.append(ppt) + if not self.ole: + return None if self.ole.exists('PowerPoint Document'): ppt.value = True - - def check_visio (self): - visio = Indicator('visio', False, name='Visio Drawing', - description='Contains a VisioDocument stream, very likely to be a Microsoft Visio 
Drawing.') + return ppt + + def check_visio(self): + """Check whether this file is a visio drawing""" + visio = Indicator( + 'visio', False, name='Visio Drawing', + description='Contains a VisioDocument stream, very likely to be a ' + 'Microsoft Visio Drawing.') self.indicators.append(visio) + if not self.ole: + return None if self.ole.exists('VisioDocument'): visio.value = True + return visio + + def check_object_pool(self): + """ + Check whether this file contains an ObjectPool stream. + + Such a stream would be a strong indicator for embedded objects or files. - def check_ObjectPool (self): - objpool = Indicator('ObjectPool', False, name='ObjectPool', - description='Contains an ObjectPool stream, very likely to contain embedded OLE objects or files.') + :returns: :py:class:`Indicator` for ObjectPool stream or None if file + was not opened + """ + objpool = Indicator( + 'ObjectPool', False, name='ObjectPool', + description='Contains an ObjectPool stream, very likely to contain ' + 'embedded OLE objects or files.') self.indicators.append(objpool) + if not self.ole: + return None if self.ole.exists('ObjectPool'): objpool.value = True - - - def check_flash (self): - flash = Indicator('flash', 0, _type=int, name='Flash objects', - description='Number of embedded Flash objects (SWF files) detected in OLE streams. Not 100% accurate, there may be false positives.') + return objpool + + def check_flash(self): + """ + Check whether this file contains flash objects + + :returns: :py:class:`Indicator` for count of flash objects or None if + file was not opened + """ + flash = Indicator( + 'flash', 0, _type=int, name='Flash objects', + description='Number of embedded Flash objects (SWF files) detected ' + 'in OLE streams. 
Not 100% accurate, there may be false ' + 'positives.') self.indicators.append(flash) + if not self.ole: + return None for stream in self.ole.listdir(): data = self.ole.openstream(stream).read() found = detect_flash(data) # just add to the count of Flash objects: flash.value += len(found) #print stream, found + return flash #=== MAIN ================================================================= def main(): + """Called when running this file as script. Shows all info on input file.""" # print banner with version - print ('oleid %s - http://decalage.info/oletools' % __version__) - print ('THIS IS WORK IN PROGRESS - Check updates regularly!') - print ('Please report any issue at https://github.com/decalage2/oletools/issues') - print ('') + print('oleid %s - http://decalage.info/oletools' % __version__) + print('THIS IS WORK IN PROGRESS - Check updates regularly!') + print('Please report any issue at ' + 'https://github.com/decalage2/oletools/issues') + print('') - usage = 'usage: %prog [options] ' - parser = optparse.OptionParser(usage=__doc__ + '\n' + usage) -## parser.add_option('-o', '--ole', action='store_true', dest='ole', help='Parse an OLE file (e.g. Word, Excel) to look for SWF in each stream') + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('input', type=str, nargs='*', metavar='FILE', + help='Name of files to process') + # parser.add_argument('-o', '--ole', action='store_true', dest='ole', + # help='Parse an OLE file (e.g. 
Word, Excel) to look for ' + # 'SWF in each stream') - (options, args) = parser.parse_args() + args = parser.parse_args() # Print help if no argurments are passed - if len(args) == 0: + if len(args.input) == 0: parser.print_help() return - for filename in args: + for filename in args.input: print('Filename:', filename) oleid = OleID(filename) indicators = oleid.check() #TODO: add description #TODO: highlight suspicious indicators - t = prettytable.PrettyTable(['Indicator', 'Value']) - t.align = 'l' - t.max_width = 39 - #t.border = False + table = prettytable.PrettyTable(['Indicator', 'Value']) + table.align = 'l' + table.max_width = 39 + table.border = False for indicator in indicators: #print '%s: %s' % (indicator.name, indicator.value) - t.add_row((indicator.name, indicator.value)) + table.add_row((indicator.name, indicator.value)) - print(t) - print ('') + print(table) + print('') if __name__ == '__main__': main() diff --git a/oletools/olevba.py b/oletools/olevba.py index cf180d8..1f32efd 100644 --- a/oletools/olevba.py +++ b/oletools/olevba.py @@ -14,6 +14,7 @@ Supported formats: - Word 2003 XML (.xml) - Word/Excel Single File Web Page / MHTML (.mht) - Publisher (.pub) +- raises an error if run with files encrypted using MS Crypto API RC4 Author: Philippe Lagadec - http://www.decalage.info License: BSD, see source code or documentation @@ -208,6 +209,7 @@ from __future__ import print_function # (issue #283) # 2018-09-11 v0.54 PL: - olefile is now a dependency # 2018-10-08 PL: - replace backspace before printing to console (issue #358) +# 2018-10-25 CH: - detect encryption and raise error if detected __version__ = '0.54dev2' @@ -309,6 +311,8 @@ from pyparsing import \ from oletools import ppt_parser from oletools import oleform from oletools import rtfobj +from oletools import oleid +from oletools.common.errors import FileIsEncryptedError # monkeypatch email to fix issue #32: @@ -472,6 +476,7 @@ RETURN_OPEN_ERROR = 5 RETURN_PARSE_ERROR = 6 RETURN_SEVERAL_ERRS = 
7 RETURN_UNEXPECTED = 8 +RETURN_ENCRYPTED = 9 # MAC codepages (from http://stackoverflow.com/questions/1592925/decoding-mac-os-text-in-python) MAC_CODEPAGES = { @@ -2367,6 +2372,12 @@ class VBA_Parser(object): # This looks like an OLE file self.open_ole(_file) + # check whether file is encrypted (need to do this before try ppt) + log.debug('Check encryption of ole file') + crypt_indicator = oleid.OleID(self.ole_file).check_encrypted() + if crypt_indicator.value: + raise FileIsEncryptedError(filename) + # if this worked, try whether it is a ppt file (special ole file) self.open_ppt() if self.type is None and is_zipfile(_file): @@ -3634,6 +3645,16 @@ def main(cmd_line_args=None): % (filename, exc.orig_exc)) return_code = RETURN_PARSE_ERROR if return_code == 0 \ else RETURN_SEVERAL_ERRS + except FileIsEncryptedError as exc: + if options.output_mode in ('triage', 'unspecified'): + print('%-12s %s - File is encrypted' % ('!ERROR', filename)) + elif options.output_mode == 'json': + print_json(file=filename, type='error', + error=type(exc).__name__, message=str(exc)) + else: + log.exception('File %s is encrypted!' % (filename)) + return_code = RETURN_ENCRYPTED if return_code == 0 \ + else RETURN_SEVERAL_ERRS # Here we do not close the vba_parser, because process_file may need it below. 
if options.output_mode == 'triage': diff --git a/oletools/olevba3.py b/oletools/olevba3.py index 8e855cb..1c48ab6 100644 --- a/oletools/olevba3.py +++ b/oletools/olevba3.py @@ -16,6 +16,7 @@ Supported formats: - Word 2003 XML (.xml) - Word/Excel Single File Web Page / MHTML (.mht) - Publisher (.pub) +- raises an error if run with files encrypted using MS Crypto API RC4 Author: Philippe Lagadec - http://www.decalage.info License: BSD, see source code or documentation @@ -207,6 +208,7 @@ from __future__ import print_function # 2018-06-11 v0.53.1 MHW: - fixed #320: chr instead of unichr on python 3 # 2018-06-12 MHW: - fixed #322: import reduce from functools # 2018-09-11 v0.54 PL: - olefile is now a dependency +# 2018-10-25 CH: - detect encryption and raise error if detected __version__ = '0.54dev1' @@ -247,7 +249,6 @@ import os import logging import struct from _io import StringIO,BytesIO -from oletools import rtfobj import math import zipfile import re @@ -298,6 +299,9 @@ from pyparsing import \ alphanums, alphas, hexnums,nums, opAssoc, srange, \ infixNotation, ParserElement import oletools.ppt_parser as ppt_parser +from oletools import rtfobj +from oletools import oleid +from oletools.common.errors import FileIsEncryptedError # monkeypatch email to fix issue #32: # allow header lines without ":" @@ -479,6 +483,7 @@ RETURN_OPEN_ERROR = 5 RETURN_PARSE_ERROR = 6 RETURN_SEVERAL_ERRS = 7 RETURN_UNEXPECTED = 8 +RETURN_ENCRYPTED = 9 # MAC codepages (from http://stackoverflow.com/questions/1592925/decoding-mac-os-text-in-python) MAC_CODEPAGES = { @@ -2360,6 +2365,12 @@ class VBA_Parser(object): # This looks like an OLE file self.open_ole(_file) + # check whether file is encrypted (need to do this before try ppt) + log.debug('Check encryption of ole file') + crypt_indicator = oleid.OleID(self.ole_file).check_encrypted() + if crypt_indicator.value: + raise FileIsEncryptedError(filename) + # if this worked, try whether it is a ppt file (special ole file) self.open_ppt() if 
self.type is None and is_zipfile(_file): @@ -3594,6 +3605,18 @@ def main(cmd_line_args=None): % (filename, exc.orig_exc)) return_code = RETURN_PARSE_ERROR if return_code == 0 \ else RETURN_SEVERAL_ERRS + except FileIsEncryptedError as exc: + if options.output_mode in ('triage', 'unspecified'): + print('%-12s %s - File is encrypted' % ('!ERROR', filename)) + elif options.output_mode == 'json': + print_json(file=filename, type='error', + error=type(exc).__name__, message=str(exc)) + else: + log.exception('File %s is encrypted!' % (filename)) + return_code = RETURN_ENCRYPTED if return_code == 0 \ + else RETURN_SEVERAL_ERRS + # Here we do not close the vba_parser, because process_file may need it below. + finally: if vba_parser is not None: vba_parser.close() diff --git a/oletools/ooxml.py b/oletools/ooxml.py index 78ef489..d33828b 100644 --- a/oletools/ooxml.py +++ b/oletools/ooxml.py @@ -9,6 +9,8 @@ See also: Notes on Microsoft's implementation of ECMA-376: [MS-0E376] TODO: may have to tell apart single xml types: office2003 looks much different than 2006+ --> DOCTYPE_*_XML2003 +TODO: check what is duplicate here with oleid, maybe merge some day? +TODO: "xml2003" == "flatopc"? .. codeauthor:: Intra2net AG """ diff --git a/oletools/ppt_record_parser.py b/oletools/ppt_record_parser.py index 487d5a8..acdc0dd 100644 --- a/oletools/ppt_record_parser.py +++ b/oletools/ppt_record_parser.py @@ -63,6 +63,7 @@ except ImportError: sys.path.insert(0, PARENT_DIR) del PARENT_DIR from oletools import record_base +from oletools.common.errors import FileIsEncryptedError # types of relevant records (there are much more than listed here) @@ -147,13 +148,17 @@ def is_ppt(filename): Param filename can be anything that OleFileIO constructor accepts: name of file or file data or data stream. 
+ + see also: oleid.OleID.check_powerpoint """ have_current_user = False have_user_edit = False have_persist_dir = False have_document_container = False + ppt_file = None try: - for stream in PptFile(filename).iter_streams(): + ppt_file = PptFile(filename) + for stream in ppt_file.iter_streams(): if stream.name == 'Current User': for record in stream.iter_records(): if isinstance(record, PptRecordCurrentUser): @@ -176,6 +181,11 @@ def is_ppt(filename): return True else: # ignore other streams/storages since they are optional continue + except FileIsEncryptedError: + assert ppt_file is not None, \ + 'Encryption error should not be raised from just opening OLE file.' + # just rely on stream names, copied from oleid + return ppt_file.exists('PowerPoint Document') except Exception: pass return False diff --git a/oletools/record_base.py b/oletools/record_base.py index 0983ffc..49b3cb5 100644 --- a/oletools/record_base.py +++ b/oletools/record_base.py @@ -44,6 +44,7 @@ __version__ = '0.54dev1' # TODO: # - read DocumentSummaryInformation first to get more info about streams # (maybe content type or so; identify streams that are never record-based) +# Or use oleid to avoid same functionality in several files # - think about integrating this with olefile itself # ----------------------------------------------------------------------------- @@ -62,6 +63,18 @@ import logging import olefile +try: + from oletools.common.errors import FileIsEncryptedError +except ImportError: + # little hack to allow absolute imports even if oletools is not installed. 
+ PARENT_DIR = os.path.normpath(os.path.dirname(os.path.dirname( + os.path.abspath(__file__)))) + if PARENT_DIR not in sys.path: + sys.path.insert(0, PARENT_DIR) + del PARENT_DIR + from oletools.common.errors import FileIsEncryptedError +from oletools import oleid + ############################################################################### # Helpers @@ -111,6 +124,12 @@ class OleRecordFile(olefile.OleFileIO): Subclass of OleFileIO! """ + def open(self, filename, *args, **kwargs): + """Call OleFileIO.open, remember whether file is encrypted.""" + #super(OleRecordFile, self).open(filename, *args, **kwargs) + olefile.OleFileIO.open(self, filename, *args, **kwargs) + self.is_encrypted = oleid.OleID(self).check_encrypted().value + @classmethod def stream_class_for_name(cls, stream_name): """ helper for iter_streams, must be overwritten in subclasses @@ -142,7 +161,8 @@ class OleRecordFile(olefile.OleFileIO): stream = clz(self._open(direntry.isectStart, direntry.size), direntry.size, None if is_orphan else direntry.name, - direntry.entry_type) + direntry.entry_type, + self.is_encrypted) yield stream stream.close() @@ -155,13 +175,14 @@ class OleRecordStream(object): abstract base class """ - def __init__(self, stream, size, name, stream_type): + def __init__(self, stream, size, name, stream_type, is_encrypted=False): self.stream = stream self.size = size self.name = name if stream_type not in ENTRY_TYPE2STR: raise ValueError('Unknown stream type: {0}'.format(stream_type)) self.stream_type = stream_type + self.is_encrypted = is_encrypted def read_record_head(self): """ read first few bytes of record to determine size and type @@ -190,6 +211,9 @@ class OleRecordStream(object): Stream must be positioned at start of records (e.g. start of stream). """ + if self.is_encrypted: + raise FileIsEncryptedError() + while True: # unpacking as in olevba._extract_vba pos = self.stream.tell() @@ -234,6 +258,8 @@ class OleSummaryInformationStream(OleRecordStream): Do nothing so far. 
OleFileIO reads quite some info from this. For more info see [MS-OSHARED] 2.3.3 and [MS-OLEPS] 2.21 and references therein. + + See also: info read in oleid.py. """ def iter_records(self, fill_data=False): """ yields nothing, stops at once """ diff --git a/oletools/xls_parser.py b/oletools/xls_parser.py index 15f026e..52575a7 100644 --- a/oletools/xls_parser.py +++ b/oletools/xls_parser.py @@ -86,14 +86,16 @@ def is_xls(filename): returns True if given file is an ole file and contains a Workbook stream todo: could further check that workbook stream starts with a globals - substream + substream. + See also: oleid.OleID.check_excel """ try: for stream in XlsFile(filename).iter_streams(): if isinstance(stream, WorkbookStream): return True except Exception: - return False + pass + return False def read_unicode(data, start_idx, n_chars): @@ -130,6 +132,8 @@ class XlsFile(record_base.OleRecordFile): @classmethod def stream_class_for_name(cls, stream_name): """ helper for iter_streams """ + if stream_name == 'Workbook': + return WorkbookStream return XlsStream diff --git a/tests/msodde/test_basic.py b/tests/msodde/test_basic.py index ac3121c..3386462 100644 --- a/tests/msodde/test_basic.py +++ b/tests/msodde/test_basic.py @@ -11,6 +11,7 @@ from __future__ import print_function import unittest from oletools import msodde from tests.test_utils import DATA_BASE_DIR as BASE_DIR +import os from os.path import join from traceback import print_exc @@ -55,6 +56,20 @@ class TestReturnCode(unittest.TestCase): """ check that text file argument leads to non-zero exit status """ self.do_test_validity(join(BASE_DIR, 'basic/text'), True) + def test_encrypted(self): + """ + check that encrypted files lead to non-zero exit status + + Currently, only the encryption applied by Office 2010 (CryptoApi RC4 + Encryption) is tested. 
+ """ + CRYPT_DIR = join(BASE_DIR, 'encrypted') + ADD_ARGS = '', '-j', '-d', '-f', '-a' + for filename in os.listdir(CRYPT_DIR): + full_name = join(CRYPT_DIR, filename) + for args in ADD_ARGS: + self.do_test_validity(args + ' ' + full_name, True) + def do_test_validity(self, args, expect_error=False): """ helper for test_valid_doc[x] """ have_exception = False diff --git a/tests/oleid/test_basic.py b/tests/oleid/test_basic.py new file mode 100644 index 0000000..e527fa2 --- /dev/null +++ b/tests/oleid/test_basic.py @@ -0,0 +1,155 @@ +""" +Test basic functionality of oleid + +Should work with python2 and python3! +""" + +import unittest +import os +from os.path import join, relpath, splitext +from oletools import oleid + +# Directory with test data, independent of current working directory +from tests.test_utils import DATA_BASE_DIR + + +class TestOleIDBasic(unittest.TestCase): + """Test basic functionality of OleID""" + + def test_all(self): + """Run all file in test-data through oleid and compare to known ouput""" + # this relies on order of indicators being constant, could relax that + # Also requires that files have the correct suffixes (no rtf in doc) + NON_OLE_SUFFIXES = ('.xml', '.csv', '.rtf', '') + NON_OLE_VALUES = (False, ) + WORD = b'Microsoft Office Word' + PPT = b'Microsoft Office PowerPoint' + EXCEL = b'Microsoft Excel' + CRYPT = (True, False, 'unknown', True, False, False, False, False, + False, False, 0) + OLE_VALUES = { + 'oleobj/sample_with_lnk_file.doc': (True, True, WORD, False, True, + False, False, False, False, + True, 0), + 'oleobj/embedded-simple-2007.xlsb': (False,), + 'oleobj/embedded-simple-2007.docm': (False,), + 'oleobj/embedded-simple-2007.xltx': (False,), + 'oleobj/embedded-simple-2007.xlam': (False,), + 'oleobj/embedded-simple-2007.dotm': (False,), + 'oleobj/sample_with_lnk_file.ppt': (True, True, PPT, False, False, + False, False, True, False, + False, 0), + 'oleobj/embedded-simple-2007.xlsx': (False,), + 
'oleobj/embedded-simple-2007.xlsm': (False,), + 'oleobj/embedded-simple-2007.ppsx': (False,), + 'oleobj/embedded-simple-2007.pps': (True, True, PPT, False, False, + False, False, True, False, + False, 0), + 'oleobj/embedded-simple-2007.xla': (True, True, EXCEL, False, + False, False, True, False, + False, False, 0), + 'oleobj/sample_with_calc_embedded.doc': (True, True, WORD, False, + True, False, False, False, + False, True, 0), + 'oleobj/embedded-unicode-2007.docx': (False,), + 'oleobj/embedded-unicode.doc': (True, True, WORD, False, True, + False, False, False, False, True, + 0), + 'oleobj/embedded-simple-2007.doc': (True, True, WORD, False, True, + False, False, False, False, + True, 0), + 'oleobj/embedded-simple-2007.xls': (True, True, EXCEL, False, + False, False, True, False, + False, False, 0), + 'oleobj/embedded-simple-2007.dot': (True, True, WORD, False, True, + False, False, False, False, + True, 0), + 'oleobj/sample_with_lnk_to_calc.doc': (True, True, WORD, False, + True, False, False, False, + False, True, 0), + 'oleobj/embedded-simple-2007.ppt': (True, True, PPT, False, False, + False, False, True, False, + False, 0), + 'oleobj/sample_with_lnk_file.pps': (True, True, PPT, False, False, + False, False, True, False, + False, 0), + 'oleobj/embedded-simple-2007.pptx': (False,), + 'oleobj/embedded-simple-2007.ppsm': (False,), + 'oleobj/embedded-simple-2007.dotx': (False,), + 'oleobj/embedded-simple-2007.pptm': (False,), + 'oleobj/embedded-simple-2007.xlt': (True, True, EXCEL, False, + False, False, True, False, + False, False, 0), + 'oleobj/embedded-simple-2007.docx': (False,), + 'oleobj/embedded-simple-2007.potx': (False,), + 'oleobj/embedded-simple-2007.pot': (True, True, PPT, False, False, + False, False, True, False, + False, 0), + 'oleobj/embedded-simple-2007.xltm': (False,), + 'oleobj/embedded-simple-2007.potm': (False,), + 'encrypted/encrypted.xlsx': CRYPT, + 'encrypted/encrypted.docm': CRYPT, + 'encrypted/encrypted.docx': CRYPT, + 
'encrypted/encrypted.pptm': CRYPT, + 'encrypted/encrypted.xlsb': CRYPT, + 'encrypted/encrypted.xls': (True, True, EXCEL, True, False, False, + True, False, False, False, 0), + 'encrypted/encrypted.ppt': (True, False, 'unknown', True, False, + False, False, True, False, False, 0), + 'encrypted/encrypted.pptx': CRYPT, + 'encrypted/encrypted.xlsm': CRYPT, + 'encrypted/encrypted.doc': (True, True, WORD, True, True, False, + False, False, False, False, 0), + 'msodde/harmless-clean.docm': (False,), + 'msodde/dde-in-csv.csv': (False,), + 'msodde/dde-test-from-office2013-utf_16le-korean.doc': + (True, True, WORD, False, True, False, False, False, False, + False, 0), + 'msodde/harmless-clean.doc': (True, True, WORD, False, True, False, + False, False, False, False, 0), + 'msodde/dde-test.docm': (False,), + 'msodde/dde-test.xlsb': (False,), + 'msodde/dde-test.xlsm': (False,), + 'msodde/dde-test.docx': (False,), + 'msodde/dde-test.xlsx': (False,), + 'msodde/dde-test-from-office2003.doc': (True, True, WORD, False, + True, False, False, False, + False, False, 0), + 'msodde/dde-test-from-office2016.doc': (True, True, WORD, False, + True, False, False, False, + False, False, 0), + 'msodde/harmless-clean.docx': (False,), + 'oleform/oleform-PR314.docm': (False,), + 'basic/encrypted.docx': CRYPT, + } + + indicator_names = [] + for base_dir, _, files in os.walk(DATA_BASE_DIR): + for filename in files: + full_path = join(base_dir, filename) + name = relpath(full_path, DATA_BASE_DIR) + values = tuple(indicator.value for indicator in + oleid.OleID(full_path).check()) + if len(indicator_names) < 2: # not initialized with ole yet + indicator_names = tuple(indicator.name for indicator in + oleid.OleID(full_path).check()) + suffix = splitext(filename)[1] + if suffix in NON_OLE_SUFFIXES: + self.assertEqual(values, NON_OLE_VALUES, + msg='For non-ole file {} expected {}, ' + 'not {}'.format(name, NON_OLE_VALUES, + values)) + continue + try: + self.assertEqual(values, OLE_VALUES[name], + 
msg='Wrong detail values for {}:\n' + ' Names {}\n Found {}\n Expect {}' + .format(name, indicator_names, values, + OLE_VALUES[name])) + except KeyError: + print('Should add oleid output for {} to {} ({})' + .format(name, __name__, values[3:])) + +# just in case somebody calls this file as a script +if __name__ == '__main__': + unittest.main() diff --git a/tests/olevba/__init__.py b/tests/olevba/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/tests/olevba/__init__.py diff --git a/tests/olevba/test_basic.py b/tests/olevba/test_basic.py new file mode 100644 index 0000000..d319a12 --- /dev/null +++ b/tests/olevba/test_basic.py @@ -0,0 +1,45 @@ +""" +Test basic functionality of olevba[3] +""" + +import unittest +import sys +if sys.version_info.major <= 2: + from oletools import olevba +else: + from oletools import olevba3 as olevba +import os +from os.path import join + +# Directory with test data, independent of current working directory +from tests.test_utils import DATA_BASE_DIR + + +class TestOlevbaBasic(unittest.TestCase): + """Tests olevba basic functionality""" + + def test_crypt_return(self): + """ + Tests that encrypted files give a certain return code. + + Currently, only the encryption applied by Office 2010 (CryptoApi RC4 + Encryption) is tested. 
+ """ + CRYPT_DIR = join(DATA_BASE_DIR, 'encrypted') + CRYPT_RETURN_CODE = 9 + ADD_ARGS = [], ['-d', ], ['-a', ], ['-j', ], ['-t', ] + for filename in os.listdir(CRYPT_DIR): + full_name = join(CRYPT_DIR, filename) + for args in ADD_ARGS: + try: + ret_code = olevba.main(args + [full_name, ]) + except SystemExit as se: + ret_code = se.code or 0 # se.code can be None + self.assertEqual(ret_code, CRYPT_RETURN_CODE, + msg='Wrong return code {} for args {}' + .format(ret_code, args + [filename, ])) + + +# just in case somebody calls this file as a script +if __name__ == '__main__': + unittest.main() diff --git a/tests/test-data/encrypted/encrypted.doc b/tests/test-data/encrypted/encrypted.doc new file mode 100644 index 0000000..cf553d7 --- /dev/null +++ b/tests/test-data/encrypted/encrypted.doc diff --git a/tests/test-data/encrypted/encrypted.docm b/tests/test-data/encrypted/encrypted.docm new file mode 100644 index 0000000..92d608a --- /dev/null +++ b/tests/test-data/encrypted/encrypted.docm diff --git a/tests/test-data/encrypted/encrypted.docx b/tests/test-data/encrypted/encrypted.docx new file mode 100644 index 0000000..06d9e87 --- /dev/null +++ b/tests/test-data/encrypted/encrypted.docx diff --git a/tests/test-data/encrypted/encrypted.ppt b/tests/test-data/encrypted/encrypted.ppt new file mode 100644 index 0000000..8671044 --- /dev/null +++ b/tests/test-data/encrypted/encrypted.ppt diff --git a/tests/test-data/encrypted/encrypted.pptm b/tests/test-data/encrypted/encrypted.pptm new file mode 100644 index 0000000..f26e0ff --- /dev/null +++ b/tests/test-data/encrypted/encrypted.pptm diff --git a/tests/test-data/encrypted/encrypted.pptx b/tests/test-data/encrypted/encrypted.pptx new file mode 100644 index 0000000..108057e --- /dev/null +++ b/tests/test-data/encrypted/encrypted.pptx diff --git a/tests/test-data/encrypted/encrypted.xls b/tests/test-data/encrypted/encrypted.xls new file mode 100644 index 0000000..75d010a --- /dev/null +++ 
b/tests/test-data/encrypted/encrypted.xls diff --git a/tests/test-data/encrypted/encrypted.xlsb b/tests/test-data/encrypted/encrypted.xlsb new file mode 100644 index 0000000..10fa81e --- /dev/null +++ b/tests/test-data/encrypted/encrypted.xlsb diff --git a/tests/test-data/encrypted/encrypted.xlsm b/tests/test-data/encrypted/encrypted.xlsm new file mode 100644 index 0000000..e43e0b0 --- /dev/null +++ b/tests/test-data/encrypted/encrypted.xlsm diff --git a/tests/test-data/encrypted/encrypted.xlsx b/tests/test-data/encrypted/encrypted.xlsx new file mode 100644 index 0000000..1666857 --- /dev/null +++ b/tests/test-data/encrypted/encrypted.xlsx