diff --git a/oletools/record_base.py b/oletools/record_base.py new file mode 100644 index 0000000..a245341 --- /dev/null +++ b/oletools/record_base.py @@ -0,0 +1,286 @@ +#!/usr/bin/env python + +""" +record_base.py + +Common stuff for ole files whose streams are a sequence of record structures. +This is the case for xls and ppt, so classes are bases for xls_parser.py and +ppt_parser.py . +""" + +# === LICENSE ================================================================= +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ +from __future__ import print_function + +#------------------------------------------------------------------------------ +# CHANGELOG: +# 2017-11-30 v0.01 CH: - first version based on xls_parser + +#------------------------------------------------------------------------------ +# TODO: +# - read DocumentSummaryInformation first to get more info about streams +# (maybe content type or so; identify streams that are never record-based) +# - think about integrating this with olefile itself + +# ----------------------------------------------------------------------------- +# REFERENCES: +# - [MS-XLS]: Excel Binary File Format (.xls) Structure Specification +# https://msdn.microsoft.com/en-us/library/office/cc313154(v=office.14).aspx +# - Understanding the Excel .xls Binary File Format +# https://msdn.microsoft.com/en-us/library/office/gg615597(v=office.14).aspx +# - [MS-PPT] + + +import sys +import os.path +from io import SEEK_CUR +import logging + +# little hack to allow absolute imports even if oletools is not installed. 
# Copied from olevba.py
_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__)))  # pylint: disable=invalid-name
_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..'))  # pylint: disable=invalid-name
del _thismodule_dir
if _parent_dir not in sys.path:
    sys.path.insert(0, _parent_dir)
del _parent_dir

from oletools.thirdparty import olefile


###############################################################################
# Helpers
###############################################################################


#: human-readable names for the directory entry types defined by olefile;
#: only used to build log messages
ENTRY_TYPE2STR = {
    olefile.STGTY_EMPTY: 'empty',
    olefile.STGTY_STORAGE: 'storage',
    olefile.STGTY_STREAM: 'stream',
    olefile.STGTY_LOCKBYTES: 'lock-bytes',
    olefile.STGTY_PROPERTY: 'property',
    olefile.STGTY_ROOT: 'root'
}


###############################################################################
# Base Classes
###############################################################################


class OleRecordFile(olefile.OleFileIO):
    """ an OLE compound file whose streams have (mostly) record structure

    'record structure' meaning that streams are a sequence of records. Records
    are structures with information about type and size in their first bytes
    and type-dependent data of given size after that.

    Subclass of OleFileIO!
    """

    @classmethod
    def stream_class_for_name(cls, stream_name):
        """ helper for iter_streams, must be overwritten in subclasses

        :param stream_name: name of the directory entry of the stream
        :returns: class used by iter_streams to wrap the stream; the base
                  implementation always returns the abstract OleRecordStream
        """
        return OleRecordStream    # this is an abstract class!

    def iter_streams(self):
        """ find all streams, including orphans

        Goes through all directory entries of this file. Entries that are
        `None` in `self.direntries` (i.e. not part of the directory tree) are
        loaded explicitly and treated as orphans.

        Yields one instance of the class returned by `stream_class_for_name`
        per entry of type stream; orphans get `None` as name.
        """
        logging.debug('Finding streams in ole file')

        for sid, direntry in enumerate(self.direntries):
            is_orphan = direntry is None
            if is_orphan:
                # this direntry is not part of the tree --> unused or orphan
                direntry = self._load_direntry(sid)
            is_stream = direntry.entry_type == olefile.STGTY_STREAM

            if is_stream:
                description = 'is stream of size {0}'.format(direntry.size)
            else:
                # Orphaned or damaged entries may carry a type that is not in
                # the table; a mere log message must not abort the iteration
                # with a KeyError in that case.
                type_str = ENTRY_TYPE2STR.get(
                    direntry.entry_type,
                    'unknown type {0}'.format(direntry.entry_type))
                description = 'no stream ({0})'.format(type_str)
            logging.debug('direntry {0:2d} {1}: {2}'.format(
                sid, '[orphan]' if is_orphan else direntry.name, description))

            if is_stream:
                clz = self.stream_class_for_name(direntry.name)
                yield clz(self._open(direntry.isectStart, direntry.size),
                          None if is_orphan else direntry.name)


class OleRecordStream(object):
    """ a stream found in an OleRecordFile

    Always has a name and a size (both read-only). Has an OleFileStream handle.

    abstract base class
    """

    def __init__(self, stream, name):
        """ remember stream handle and name, take the size from the stream

        :param stream: stream handle; must offer attribute `size` and methods
                       `tell`, `read` and `seek`
        :param name: name of the stream or None for orphans
        """
        self.stream = stream
        self.name = name
        self.size = stream.size

    def read_record_head(self):
        """ read first few bytes of record to determine size and type

        Abstract base method, to be implemented in subclasses.

        returns (rec_type, rec_size, other) where other will be forwarded to
        record constructors
        """
        raise NotImplementedError('Abstract method '
                                  'OleRecordStream.read_record_head called')

    @classmethod
    def record_class_for_type(cls, rec_type):
        """ determine a class for given record type

        Only a base implementation. Create subclasses of OleRecordBase and
        return those when appropriate.

        returns (clz, force_read); if force_read is True, iter_records reads
        the record data even if it was called with fill_data=False
        """
        return OleRecordBase, False

    def iter_records(self, fill_data=False):
        """ yield all records in this stream

        Stream must be positioned at start of records (e.g. start of stream).

        :param bool fill_data: if True, read the data of every record; if
                               False, record data is skipped (and records get
                               data=None) unless the record class forces
                               reading
        :raises IOError: if data is read and the stream ends before rec_size
                         bytes could be read
        """
        while True:
            # unpacking as in olevba._extract_vba
            pos = self.stream.tell()
            if pos >= self.size:
                break

            # read first few bytes, determine record type and size
            rec_type, rec_size, other = self.read_record_head()
            logging.debug('Record type %s of size %s', rec_type, rec_size)

            # determine what class to wrap this into
            rec_clz, force_read = self.record_class_for_type(rec_type)

            if fill_data or force_read:
                data = self.stream.read(rec_size)
                if len(data) != rec_size:
                    raise IOError('Not enough data in stream ({0} < {1})'
                                  .format(len(data), rec_size))
            else:
                # only skip over the data; note that this path does not
                # notice a record that extends beyond the end of the stream
                self.stream.seek(rec_size, SEEK_CUR)
                data = None
            yield rec_clz(rec_type, rec_size, other, pos, data)

    def __str__(self):
        """ short textual representation: class name, stream name, size """
        return '[{2} {0} (size {1})]' \
            .format(self.name or '[orphan]', self.size,
                    self.__class__.__name__)


class OleRecordBase(object):
    """ a record found in an OleRecordStream

    always has a type and a size, also pos and data
    """

    # for subclasses with a fixed type
    TYPE = None

    # (max) size of subclasses
    MAX_SIZE = None
    SIZE = None

    def __init__(self, type, size, more_data, pos, data):
        """ create a record; more_data is only forwarded to parse()

        :param type: record type; must equal TYPE if the subclass defines one
        :param size: size of record data; must equal SIZE or not exceed
                     MAX_SIZE if the subclass defines one of them
        :param more_data: third value returned by
                          OleRecordStream.read_record_head
        :param pos: stream position at which the record head starts
        :param data: record data or None if it was not read
        :raises ValueError: if type or size do not fit to the subclass
        """
        if self.TYPE is not None and type != self.TYPE:
            raise ValueError('Wrong subclass {0} for type {1}'
                             .format(self.__class__.__name__, type))
        self.type = type
        if self.SIZE is not None and size != self.SIZE:
            raise ValueError('Wrong size {0} for record type {1}'
                             .format(size, type))
        elif self.MAX_SIZE is not None and size > self.MAX_SIZE:
            raise ValueError('Wrong size: {0} > MAX_SIZE for record type {1}'
                             .format(size, type))
        self.size = size
        self.pos = pos
        self.data = data
        # all attributes are set, so subclasses can rely on them in parse
        self.parse(more_data)

    def parse(self, more_data):
        """ finish constructing this record

        Can save more_data from OleRecordStream.read_record_head and/or parse
        data (if it was read).

        Base implementation, does nothing. To be overwritten in subclasses.
        """
        pass

    def _type_str(self):
        """ helper for __str__, base implementation """
        return '{0} type {1}'.format(self.__class__.__name__, self.type)

    def __str__(self):
        """ create a short but informative textual representation of self """
        return '[' + self._type_str() + \
               ' (size {0} from {1})]'.format(self.size, self.pos)


###############################################################################
# TESTING
###############################################################################


def test(filenames, ole_file_class=OleRecordFile,
         must_parse=None):
    """ parse all given file names and print rough structure

    If must_parse is not given (or empty), every error that occurs while
    parsing a stream is raised. Otherwise, errors are only raised for streams
    that are instances of must_parse; for all other streams the failure is
    logged and parsing continues with the next stream.

    :param filenames: names of files to check; non-ole files are skipped
    :param ole_file_class: class used to open the files, must offer
                           iter_streams() and close()
    :param must_parse: class (or tuple of classes) as accepted by isinstance
    :returns: 2 if no file name was given, 0 otherwise
    """
    logging.basicConfig(level=logging.DEBUG)
    if not filenames:
        logging.info('need file name[s]')
        return 2
    for filename in filenames:
        logging.info('checking file {0}'.format(filename))
        if not olefile.isOleFile(filename):
            logging.info('not an ole file - skip')
            continue
        ole = ole_file_class(filename)

        try:
            for stream in ole.iter_streams():
                logging.info(stream)
                try:
                    for record in stream.iter_records():
                        logging.info('  {0}'.format(record))
                except Exception:
                    if not must_parse:
                        raise
                    elif isinstance(stream, must_parse):
                        raise
                    else:
                        logging.info('  failed to parse', exc_info=True)
        finally:
            # release the file handle even if parsing raised
            ole.close()
    return 0


if __name__ == '__main__':
    sys.exit(test(sys.argv[1:]))
+__version__ = '0.2' # ----------------------------------------------------------------------------- # TODO: @@ -52,17 +54,8 @@ __version__ = '0.1' import sys import os.path from struct import unpack -from io import SEEK_CUR import logging - -# little hack to allow absolute imports even if oletools is not installed. -# Copied from olevba.py -_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) # pylint: disable=invalid-name -_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) # pylint: disable=invalid-name -if _parent_dir not in sys.path: - sys.path.insert(0, _parent_dir) - -from oletools.thirdparty import olefile +from record_base import OleRecordFile, OleRecordStream, OleRecordBase, test # === PYTHON 2+3 SUPPORT ====================================================== @@ -75,16 +68,6 @@ if sys.version_info[0] >= 3: ############################################################################### -ENTRY_TYPE2STR = { - olefile.STGTY_EMPTY: 'empty', - olefile.STGTY_STORAGE: 'storage', - olefile.STGTY_STREAM: 'stream', - olefile.STGTY_LOCKBYTES: 'lock-bytes', - olefile.STGTY_PROPERTY: 'property', - olefile.STGTY_ROOT: 'root' -} - - def is_xls(filename): """ determine whether a given file is an excel ole file @@ -95,7 +78,7 @@ def is_xls(filename): substream """ try: - for stream in XlsFile(filename).get_streams(): + for stream in XlsFile(filename).iter_streams(): if isinstance(stream, WorkbookStream): return True except Exception: @@ -122,7 +105,7 @@ def read_unicode_2byte(data, start_idx, n_chars): unpack('<' + 'H'*n_chars, data[start_idx:end_idx])) else: # slower version but less memory-extensive unichars = (unichr(unpack(' unused or orphan - direntry = self._load_direntry(sid) - is_stream = direntry.entry_type == olefile.STGTY_STREAM - logging.debug('direntry {:2d} {}: {}'.format( - sid, '[orphan]' if is_orphan else direntry.name, - 'is stream of size {}'.format(direntry.size) if is_stream else - 'no stream 
({})'.format(ENTRY_TYPE2STR[direntry.entry_type]))) - if is_stream: - if direntry.name == 'Workbook': - clz = WorkbookStream - else: - clz = XlsStream - yield clz(self._open(direntry.isectStart, direntry.size), - None if is_orphan else direntry.name) +class XlsStream(OleRecordStream): + """ most streams in xls file consist of records """ + def read_record_head(self): + """ read first few bytes of record to determine size and type -class XlsStream(object): - """ specialization of an OLE stream - - Currently not much use, but may be interesting for further sub-classing - when extending this code. - - stream argument can be oleile.OleStream or ooxml.ZipSubFile - """ + returns (type, size, other) where other is None + """ + rec_type, rec_size = unpack('= self.size: - break - type = unpack('= self.size: - break - val = ord(self.stream.read(1)) - if val & self.HIGH_BIT_MASK: # high bit of the low byte is 1 - val2 = ord(self.stream.read(1)) # need another byte - # combine 7 low bits of each byte - type = (val & self.LOW7_BIT_MASK) + \ + val = ord(self.stream.read(1)) + if val & self.HIGH_BIT_MASK: # high bit of the low byte is 1 + val2 = ord(self.stream.read(1)) # need another byte + # combine 7 low bits of each byte + rec_type = (val & self.LOW7_BIT_MASK) + \ ((val2 & self.LOW7_BIT_MASK) << 7) - else: - type = val - - size = 0 - shift = 0 - for _ in range(4): # size needs up to 4 byte - val = ord(self.stream.read(1)) - size += (val & self.LOW7_BIT_MASK) << shift - shift += 7 - if (val & self.HIGH_BIT_MASK) == 0: # high-bit is 0 --> done - break - - if pos + size > self.size: - raise ValueError('Stream does not seem to have record ' - 'structure or is incomplete (record size {0})' - .format(size)) - data = self.stream.read(size) - - clz = XlsbRecord - if type == XlsbBeginSupBook.TYPE: - clz = XlsbBeginSupBook - yield clz(type, size, pos, data) + else: + rec_type = val + + rec_size = 0 + shift = 0 + for _ in range(4): # rec_size needs up to 4 byte + val = 
ord(self.stream.read(1)) + rec_size += (val & self.LOW7_BIT_MASK) << shift + shift += 7 + if (val & self.HIGH_BIT_MASK) == 0: # high-bit is 0 --> done + break + return rec_type, rec_size, None + + @classmethod + def record_class_for_type(cls, type): + """ determine a class for given record type + + returns (clz, force_read) + """ + if type == XlsbBeginSupBook.TYPE: + return XlsbBeginSupBook, True + else: + return XlsbRecord, False ############################################################################### @@ -309,7 +253,6 @@ FREQUENT_RECORDS = dict([ #: records found in xlsb binary parts FREQUENT_RECORDS_XLSB = dict([ - (360, 'BrtBeginSupBook'), (588, 'BrtEndSupBook'), (667, 'BrtSupAddin'), (355, 'BrtSupBookSrc'), @@ -330,36 +273,12 @@ FREQUENT_RECORDS_XLSB = dict([ ]) -class XlsRecord(object): +class XlsRecord(OleRecordBase): """ basic building block of data in workbook stream """ #: max size of a record in xls stream (does not apply to xlsb) MAX_SIZE = 8224 - # to be overwritten in subclasses that have fixed type/size - TYPE = None - SIZE = None - - def __init__(self, type, size, pos, data=None): - """ create a record """ - self.type = type - if self.MAX_SIZE is not None and size > self.MAX_SIZE: - logging.warning('record size {0} exceeds max size' - .format(size)) - elif self.SIZE is not None and size != self.SIZE: - raise ValueError('size {0} is not as expected for this type' - .format(size)) - self.size = size - self.pos = pos - self.data = data - if data is not None and len(data) != size: - raise ValueError('data size {0} is not expected size {1}' - .format(len(data), size)) - - def read_data(self, stream): - """ read data from stream if up to now only pos was known """ - raise NotImplementedError() - def _type_str(self): """ simplification for subclasses to create their own __str__ """ try: @@ -367,10 +286,6 @@ class XlsRecord(object): except KeyError: return 'XlsRecord type {0}'.format(self.type) - def __str__(self): - return '[' + self._type_str() + \ 
- ' (size {0} from {1})]'.format(self.size, self.pos) - class XlsRecordBof(XlsRecord): """ record found at beginning of substreams """ @@ -380,8 +295,7 @@ class XlsRecordBof(XlsRecord): DOCTYPES = dict([(0x5, 'workbook'), (0x10, 'dialog/worksheet'), (0x20, 'chart'), (0x40, 'macro')]) - def __init__(self, *args, **kwargs): - super(XlsRecordBof, self).__init__(*args, **kwargs) + def parse(self, _): if self.data is None: self.doctype = None return @@ -420,9 +334,7 @@ class XlsRecordSupBook(XlsRecord): LINK_TYPE_OLE_DDE = 'ole/dde data source' LINK_TYPE_EXTERNAL = 'external workbook' - def __init__(self, *args, **kwargs): - super(XlsRecordSupBook, self).__init__(*args, **kwargs) - + def parse(self, _): # set defaults self.ctab = None self.cch = None @@ -461,7 +373,7 @@ class XlsRecordSupBook(XlsRecord): return 'SupBook Record ({0})'.format(self.support_link_type) -class XlsbRecord(XlsRecord): +class XlsbRecord(OleRecordBase): """ like an xls record, but from binary part of xlsb file has no MAX_SIZE and types have different meanings @@ -491,8 +403,7 @@ class XlsbBeginSupBook(XlsbRecord): LINK_TYPE_UNEXPECTED = 'unexpected' LINK_TYPE_UNKNOWN = 'unknown' - def __init__(self, *args, **kwargs): - super(XlsbBeginSupBook, self).__init__(*args, **kwargs) + def parse(self, _): self.link_type = self.LINK_TYPE_UNKNOWN self.string1 = '' self.string2 = '' @@ -540,6 +451,7 @@ class XlsbBeginSupBook(XlsbRecord): # XLSB Binary Parts ############################################################################### + def parse_xlsb_part(stream, _, filename): """ Excel xlsb files also have a record structure. 
iter records """ for record in XlsbStream(stream, filename).iter_records(): @@ -551,26 +463,5 @@ def parse_xlsb_part(stream, _, filename): ############################################################################### -def test(*filenames): - """ parse all given file names and print rough structure """ - logging.basicConfig(level=logging.DEBUG) - if not filenames: - logging.info('need file name[s]') - return 2 - for filename in filenames: - logging.info('checking file {0}'.format(filename)) - if not olefile.isOleFile(filename): - logging.info('not an ole file - skip') - continue - xls = XlsFile(filename) - - for stream in xls.get_streams(): - logging.info(stream) - if isinstance(stream, WorkbookStream): - for record in stream.iter_records(): - logging.info(' {0}'.format(record)) - return 0 - - if __name__ == '__main__': - sys.exit(test(*sys.argv[1:])) + sys.exit(test(sys.argv[1:], XlsFile, WorkbookStream))