""" Parse xls up to some point Read storages, (sub-)streams, records from xls file """ # # === LICENSE ================================================================== # xls_parser is copyright (c) 2014-2019 Philippe Lagadec (http://www.decalage.info) # All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, # are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #------------------------------------------------------------------------------ # CHANGELOG: # 2017-11-02 v0.1 CH: - first version # 2017-11-02 v0.2 CH: - move some code to record_base.py # (to avoid copy-and-paste in ppt_parser.py) # 2019-01-30 v0.54 PL: - fixed import to avoid mixing installed oletools # and dev version __version__ = '0.54' # ----------------------------------------------------------------------------- # TODO: # - parse more record types (ExternName, ...) # - check what bad stuff can be in other storages: Embedded ("MBD..."), Linked # ("LNK..."), "MsoDataStore" and OleStream ('\001Ole') # # ----------------------------------------------------------------------------- # REFERENCES: # - [MS-XLS]: Excel Binary File Format (.xls) Structure Specification # https://msdn.microsoft.com/en-us/library/office/cc313154(v=office.14).aspx # - Understanding the Excel .xls Binary File Format # https://msdn.microsoft.com/en-us/library/office/gg615597(v=office.14).aspx # # -- IMPORTS ------------------------------------------------------------------ import sys import os.path from struct import unpack import logging # little hack to allow absolute imports even if oletools is not installed. # Copied from olevba.py PARENT_DIR = os.path.normpath(os.path.dirname(os.path.dirname( os.path.abspath(__file__)))) if PARENT_DIR not in sys.path: sys.path.insert(0, PARENT_DIR) del PARENT_DIR from oletools import record_base # === PYTHON 2+3 SUPPORT ====================================================== if sys.version_info[0] >= 3: unichr = chr ############################################################################### # Helpers ############################################################################### def is_xls(filename): """ determine whether a given file is an excel ole file returns True if given file is an ole file and contains a Workbook stream todo: could further check that workbook stream starts with a globals substream. See also: oleid.OleID.check_excel """ xls_file = None try: xls_file = XlsFile(filename) for stream in xls_file.iter_streams(): if isinstance(stream, WorkbookStream): return True except Exception: logging.debug('Ignoring exception in is_xls, assume is not xls', exc_info=True) finally: if xls_file is not None: xls_file.close() return False def read_unicode(data, start_idx, n_chars): """ read a unicode string from a XLUnicodeStringNoCch structure """ # first bit 0x0 --> only low-bytes are saved, all high bytes are 0 # first bit 0x1 --> 2 bytes per character low_bytes_only = (ord(data[start_idx:start_idx+1]) == 0) if low_bytes_only: end_idx = start_idx + 1 + n_chars return data[start_idx+1:end_idx].decode('ascii'), end_idx else: return read_unicode_2byte(data, start_idx+1, n_chars) def read_unicode_2byte(data, start_idx, n_chars): """ read a unicode string with characters encoded by 2 bytes """ end_idx = start_idx + n_chars * 2 if n_chars < 256: # faster version, long format string for unpack unichars = (unichr(val) for val in unpack('<' + 'H'*n_chars, data[start_idx:end_idx])) else: # slower version but less memory-extensive unichars = (unichr(unpack(' done break return rec_type, rec_size, None @classmethod def record_class_for_type(cls, rec_type): """ determine a class for given record type returns (clz, force_read) """ if rec_type == XlsbBeginSupBook.TYPE: return XlsbBeginSupBook, True else: return XlsbRecord, False ############################################################################### # RECORDS ############################################################################### # records that appear often but do not need their own XlsRecord subclass (yet) FREQUENT_RECORDS = dict([ ( 156, 'BuiltInFnGroupCount'), # pylint: disable=bad-whitespace (2147, 'BookExt'), # pylint: disable=bad-whitespace ( 442, 'CodeName'), # pylint: disable=bad-whitespace ( 66, 'CodePage'), # pylint: disable=bad-whitespace (4195, 'Dat'), # pylint: disable=bad-whitespace (2154, 'DataLabExt'), # pylint: disable=bad-whitespace (2155, 'DataLabExtContents'), # pylint: disable=bad-whitespace ( 215, 'DBCell'), # pylint: disable=bad-whitespace ( 220, 'DbOrParmQry'), # pylint: disable=bad-whitespace (2051, 'DBQueryExt'), # pylint: disable=bad-whitespace (2166, 'DConn'), # pylint: disable=bad-whitespace ( 35, 'ExternName'), # pylint: disable=bad-whitespace ( 23, 'ExternSheet'), # pylint: disable=bad-whitespace ( 255, 'ExtSST'), # pylint: disable=bad-whitespace (2052, 'ExtString'), # pylint: disable=bad-whitespace (2151, 'FeatHdr'), # pylint: disable=bad-whitespace ( 91, 'FileSharing'), # pylint: disable=bad-whitespace (1054, 'Format'), # pylint: disable=bad-whitespace ( 49, 'Font'), # pylint: disable=bad-whitespace (2199, 'GUIDTypeLib'), # pylint: disable=bad-whitespace ( 440, 'HLink'), # pylint: disable=bad-whitespace ( 225, 'InterfaceHdr'), # pylint: disable=bad-whitespace ( 226, 'InterfaceEnd'), # pylint: disable=bad-whitespace ( 523, 'Index'), # pylint: disable=bad-whitespace ( 24, 'Lbl'), # pylint: disable=bad-whitespace ( 193, 'Mms'), # pylint: disable=bad-whitespace ( 93, 'Obj'), # pylint: disable=bad-whitespace (4135, 'ObjectLink'), # pylint: disable=bad-whitespace (2058, 'OleDbConn'), # pylint: disable=bad-whitespace ( 222, 'OleObjectSize'), # pylint: disable=bad-whitespace (2214, 'RichTextStream'), # pylint: disable=bad-whitespace (2146, 'SheetExt'), # pylint: disable=bad-whitespace (1212, 'ShrFmla'), # pylint: disable=bad-whitespace (2060, 'SxViewExt'), # pylint: disable=bad-whitespace (2136, 'SxViewLink'), # pylint: disable=bad-whitespace (2049, 'WebPub'), # pylint: disable=bad-whitespace ( 224, 'XF (formatting)'), # pylint: disable=bad-whitespace (2173, 'XFExt (formatting)'), # pylint: disable=bad-whitespace ( 659, 'Style'), # pylint: disable=bad-whitespace (2194, 'StyleExt') # pylint: disable=bad-whitespace ]) #: records found in xlsb binary parts FREQUENT_RECORDS_XLSB = dict([ (588, 'BrtEndSupBook'), (667, 'BrtSupAddin'), (355, 'BrtSupBookSrc'), (586, 'BrtSupNameBits'), (584, 'BrtSupNameBool'), (587, 'BrtSupNameEnd'), (581, 'BrtSupNameErr'), (585, 'BrtSupNameFmla'), (583, 'BrtSupNameNil'), (580, 'BrtSupNameNum'), (582, 'BrtSupNameSt'), (577, 'BrtSupNameStart'), (579, 'BrtSupNameValueEnd'), (578, 'BrtSupNameValueStart'), (358, 'BrtSupSame'), (357, 'BrtSupSelf'), (359, 'BrtSupTabs'), ]) class XlsRecord(record_base.OleRecordBase): """ basic building block of data in workbook stream """ #: max size of a record in xls stream (does not apply to xlsb) MAX_SIZE = 8224 def _type_str(self): """ simplification for subclasses to create their own __str__ """ try: return FREQUENT_RECORDS[self.type] except KeyError: return 'XlsRecord type {0}'.format(self.type) class XlsRecordBof(XlsRecord): """ record found at beginning of substreams """ TYPE = 2057 SIZE = 16 # types of substreams DOCTYPES = dict([(0x5, 'workbook'), (0x10, 'dialog/worksheet'), (0x20, 'chart'), (0x40, 'macro')]) def finish_constructing(self, _): if self.data is None: self.doctype = None return # parse data (only doctype, ignore rest) self.doctype = unpack('= 4)' .format(self.size)) self.ctab, self.cch = unpack(' 0 and self.virt_path: self.support_link_type = self.LINK_TYPE_EXTERNAL def _type_str(self): return 'SupBook Record ({0})'.format(self.support_link_type) class XlsbRecord(record_base.OleRecordBase): """ like an xls record, but from binary part of xlsb file has no MAX_SIZE and types have different meanings """ MAX_SIZE = None def _type_str(self): """ simplification for subclasses to create their own __str__ """ try: return FREQUENT_RECORDS_XLSB[self.type] except KeyError: return 'XlsbRecord type {0}'.format(self.type) class XlsbBeginSupBook(XlsbRecord): """ Record beginning an external link in xlsb file contains information about the link itself (e.g. for DDE the link is string1 + ' ' + string2) """ TYPE = 360 LINK_TYPE_WORKBOOK = 'workbook' LINK_TYPE_DDE = 'DDE' LINK_TYPE_OLE = 'OLE' LINK_TYPE_UNEXPECTED = 'unexpected' LINK_TYPE_UNKNOWN = 'unknown' def finish_constructing(self, _): self.link_type = self.LINK_TYPE_UNKNOWN self.string1 = '' self.string2 = '' if self.data is None: return self.sbt = unpack('