diff --git a/oletools/xls_parser.py b/oletools/xls_parser.py index a2587de..f26d1bf 100644 --- a/oletools/xls_parser.py +++ b/oletools/xls_parser.py @@ -36,33 +36,42 @@ from __future__ import print_function __version__ = '0.1' -#------------------------------------------------------------------------------ -# TODO: -# everything +# ----------------------------------------------------------------------------- +# TODO: +# - parse more record types (ExternName, ...) +# - check what bad stuff can be in other storages: Embedded ("MBD..."), Linked +# ("LNK..."), "MsoDataStore" and OleStream ('\001Ole') # -#------------------------------------------------------------------------------ -# REFERENCES: -# - [MS-XLS]: Excel Binary File Format (.xls) Structure Specification -# https://msdn.microsoft.com/en-us/library/office/cc313154(v=office.14).aspx -# - Understanding the Excel .xls Binary File Format -# https://msdn.microsoft.com/en-us/library/office/gg615597(v=office.14).aspx +# ----------------------------------------------------------------------------- +# REFERENCES: +# - [MS-XLS]: Excel Binary File Format (.xls) Structure Specification +# https://msdn.microsoft.com/en-us/library/office/cc313154(v=office.14).aspx +# - Understanding the Excel .xls Binary File Format +# https://msdn.microsoft.com/en-us/library/office/gg615597(v=office.14).aspx # -#--- IMPORTS ------------------------------------------------------------------ +# -- IMPORTS ------------------------------------------------------------------ import sys import os.path +from struct import unpack +from io import SEEK_CUR # little hack to allow absolute imports even if oletools is not installed. # Copied from olevba.py -_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) -_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) -if not _parent_dir in sys.path: +_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) # pylint: disable=invalid-name +_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) # pylint: disable=invalid-name +if _parent_dir not in sys.path: sys.path.insert(0, _parent_dir) -from oletools.thirdparty import olefile +from oletools.thirdparty import olefile # pylint: disable=wrong-import-position + + +############################################################################### +# Helpers +############################################################################### -entry_type2str = { +ENTRY_TYPE2STR = { olefile.STGTY_EMPTY: 'empty', olefile.STGTY_STORAGE: 'storage', olefile.STGTY_STREAM: 'stream', @@ -71,6 +80,29 @@ entry_type2str = { olefile.STGTY_ROOT: 'root' } + +def is_xls(filename): + """ + determine whether a given file is an excel ole file + + returns True if given file is an ole file and contains a Workbook stream + + todo: could further check that workbook stream starts with a globals + substream + """ + try: + for stream in XlsFile(filename).get_streams(): + if isinstance(stream, WorkbookStream): + return True + except Exception: + return False + + +############################################################################### +# File, Storage, Stream +############################################################################### + + class XlsFile(olefile.OleFileIO): """ specialization of an OLE compound file """ @@ -81,7 +113,7 @@ class XlsFile(olefile.OleFileIO): for sid, direntry in enumerate(self.direntries): is_orphan = direntry is None if is_orphan: - # this direntry is not part of the tree: either unused or an orphan + # this direntry is not part of the tree --> unused or orphan direntry = self._load_direntry(sid) is_stream = direntry.entry_type == olefile.STGTY_STREAM print('direntry {:2d} {}: {}' @@ -89,30 +121,286 @@ class XlsFile(olefile.OleFileIO): 'is stream of size {}'.format(direntry.size) if is_stream else 'no stream ({})' - .format(entry_type2str[direntry.entry_type]))) + .format(ENTRY_TYPE2STR[direntry.entry_type]))) if is_stream: - yield XlsStream(self._open(direntry.isectStart, direntry.size)) + if direntry.name == 'Workbook': + clz = WorkbookStream + else: + clz = XlsStream + yield clz(self._open(direntry.isectStart, direntry.size), + None if is_orphan else direntry.name) + +class XlsStream(object): + """ specialization of an OLE stream -class XlsStream: - """ specialization of an OLE (sub-)stream """ + Currently not much use, but may be interesting for further sub-classing + when extending this code. + """ - def __init__(self, stream): + def __init__(self, stream, name): self.stream = stream + self.size = stream.size + self.name = name + def __str__(self): + return '[XlsStream {0} (size {1})' \ + .format(self.name or '[orphan]', self.size) -def test(filename): - """ parse given file and print rough structure """ - try: + +class WorkbookStream(XlsStream): + """ the workbook stream which contains records """ + + def iter_records(self, fill_data=False): + """ iterate over records in streams""" + if self.stream.tell() != 0: + print('have to jump to start') + self.stream.seek(0) + + while True: + # unpacking as in olevba._extract_vba + pos = self.stream.tell() + if pos >= self.size: + break + type = unpack(' self.MAX_SIZE: + raise ValueError('size {0} exceeds max size'.format(size)) + elif self.SIZE is not None and size != self.SIZE: + raise ValueError('size {0} is not as expected for this type' + .format(size)) + self.size = size + self.pos = pos + self.data = data + if data is not None and len(data) != size: + raise ValueError('data size {0} is not expected size {1}' + .format(len(data), size)) + + def read_data(self, stream): + """ read data from stream if up to now only pos was known """ + raise NotImplementedError() + + def _type_str(self): + """ simplification for subclasses to create their own __str__ """ + try: + return FREQUENT_RECORDS[self.type] + except KeyError: + return 'XlsRecord type {0}'.format(self.type) + + def __str__(self): + return '[' + self._type_str() + \ + ' (size {0} from {1})]'.format(self.size, self.pos) + + +class XlsRecordBof(XlsRecord): + """ record found at beginning of substreams """ + TYPE = 2057 + SIZE = 16 + # types of substreams + DOCTYPES = dict([(0x5, 'workbook'), (0x10, 'dialog/worksheet'), + (0x20, 'chart'), (0x40, 'macro')]) + + def __init__(self, *args, **kwargs): + super(XlsRecordBof, self).__init__(*args, **kwargs) + if self.data is None: + self.doctype = None + return + # parse data (only doctype, ignore rest) + self.doctype = unpack('= 4)' + .format(self.size)) + self.ctab, self.cch = unpack(' 0 and self.virt_path: + self.support_link_type = self.LINK_TYPE_EXTERNAL + + def _type_str(self): + return 'SupBook Record ({0})'.format(self.support_link_type) + + +def read_unicode(data, start_idx, n_chars): + """ read a unicode string from a XLUnicodeStringNoCch structure """ + # first bit 0x0 --> only low-bytes are saved, all high bytes are 0 + # first bit 0x1 --> 2 bytes per character + low_bytes_only = (ord(data[start_idx]) == 0) + if low_bytes_only: + end_idx = start_idx + 1 + n_chars + return data[start_idx+1:end_idx].decode('ascii'), end_idx + end_idx = start_idx + 1 + n_chars * 2 + return u''.join(unichr(val) for val in + unpack('<' + 'H'*n_chars, data[start_idx+1:end_idx])), \ + end_idx + + +############################################################################### +# TESTING +############################################################################### + + +def test(*filenames): + """ parse all given file names and print rough structure """ + if not filenames: + print('need file name[s]') + return 2 + for filename in filenames: + if not olefile.isOleFile(filename): + continue xls = XlsFile(filename) - except Exception as exc: - print('{}: {}'.format(filename, exc)) - return - for stream in xls.get_streams(): - pass + for stream in xls.get_streams(): + print(stream) + if isinstance(stream, WorkbookStream): + for record in stream.iter_records(): + print(' {0}'.format(record)) + return 0 + if __name__ == '__main__': - """ parse all given file names and print rough structure """ - for filename in sys.argv[1:]: - test(filename) + sys.exit(test(sys.argv[1:]))