Commit d397edb5fe4149bf8278b16e63da33db0e6357da
1 parent
27dc5360
xls_parser: move code to new record_base for re-use with ppt files
Parsing streams record by record seems to make sense, so the next step is to do the same for ppt files. To avoid copy-and-paste, move the code needed by both parsers into a common base module, record_base.py.
Showing
2 changed files
with
367 additions
and
190 deletions
oletools/record_base.py
0 → 100644
| 1 | +#!/usr/bin/env python | ||
| 2 | + | ||
| 3 | +""" | ||
| 4 | +record_base.py | ||
| 5 | + | ||
| 6 | +Common stuff for ole files whose streams are a sequence of record structures. | ||
| 7 | +This is the case for xls and ppt, so classes are bases for xls_parser.py and | ||
| 8 | +ppt_parser.py . | ||
| 9 | +""" | ||
| 10 | + | ||
| 11 | +# === LICENSE ================================================================= | ||
| 12 | +# | ||
| 13 | +# Redistribution and use in source and binary forms, with or without | ||
| 14 | +# modification, are permitted provided that the following conditions are met: | ||
| 15 | +# | ||
| 16 | +# * Redistributions of source code must retain the above copyright notice, | ||
| 17 | +# this list of conditions and the following disclaimer. | ||
| 18 | +# * Redistributions in binary form must reproduce the above copyright notice, | ||
| 19 | +# this list of conditions and the following disclaimer in the documentation | ||
| 20 | +# and/or other materials provided with the distribution. | ||
| 21 | +# | ||
| 22 | +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
| 23 | +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 24 | +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 25 | +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE | ||
| 26 | +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | ||
| 27 | +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | ||
| 28 | +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | ||
| 29 | +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | ||
| 30 | +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 31 | +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | ||
| 32 | +# POSSIBILITY OF SUCH DAMAGE. | ||
| 33 | + | ||
| 34 | +from __future__ import print_function | ||
| 35 | + | ||
| 36 | +#------------------------------------------------------------------------------ | ||
| 37 | +# CHANGELOG: | ||
| 38 | +# 2017-11-30 v0.01 CH: - first version based on xls_parser | ||
| 39 | + | ||
| 40 | +#------------------------------------------------------------------------------ | ||
| 41 | +# TODO: | ||
| 42 | +# - read DocumentSummaryInformation first to get more info about streams | ||
| 43 | +# (maybe content type or so; identify streams that are never record-based) | ||
| 44 | +# - think about integrating this with olefile itself | ||
| 45 | + | ||
| 46 | +# ----------------------------------------------------------------------------- | ||
| 47 | +# REFERENCES: | ||
| 48 | +# - [MS-XLS]: Excel Binary File Format (.xls) Structure Specification | ||
| 49 | +# https://msdn.microsoft.com/en-us/library/office/cc313154(v=office.14).aspx | ||
| 50 | +# - Understanding the Excel .xls Binary File Format | ||
| 51 | +# https://msdn.microsoft.com/en-us/library/office/gg615597(v=office.14).aspx | ||
| 52 | +# - [MS-PPT] | ||
| 53 | + | ||
| 54 | + | ||
| 55 | +import sys | ||
| 56 | +import os.path | ||
| 57 | +from io import SEEK_CUR | ||
| 58 | +import logging | ||
| 59 | + | ||
| 60 | +# little hack to allow absolute imports even if oletools is not installed. | ||
| 61 | +# Copied from olevba.py | ||
| 62 | +_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) # pylint: disable=invalid-name | ||
| 63 | +_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) # pylint: disable=invalid-name | ||
| 64 | +del _thismodule_dir | ||
| 65 | +if _parent_dir not in sys.path: | ||
| 66 | + sys.path.insert(0, _parent_dir) | ||
| 67 | +del _parent_dir | ||
| 68 | + | ||
| 69 | +from oletools.thirdparty import olefile | ||
| 70 | + | ||
| 71 | + | ||
| 72 | +############################################################################### | ||
| 73 | +# Helpers | ||
| 74 | +############################################################################### | ||
| 75 | + | ||
| 76 | + | ||
# Human-readable label for each olefile directory-entry type constant.
# Used by OleRecordFile.iter_streams when logging entries that are no streams.
ENTRY_TYPE2STR = dict([
    (olefile.STGTY_EMPTY, 'empty'),
    (olefile.STGTY_STORAGE, 'storage'),
    (olefile.STGTY_STREAM, 'stream'),
    (olefile.STGTY_LOCKBYTES, 'lock-bytes'),
    (olefile.STGTY_PROPERTY, 'property'),
    (olefile.STGTY_ROOT, 'root'),
])
| 85 | + | ||
| 86 | + | ||
| 87 | +############################################################################### | ||
| 88 | +# Base Classes | ||
| 89 | +############################################################################### | ||
| 90 | + | ||
| 91 | + | ||
class OleRecordFile(olefile.OleFileIO):
    """ an OLE compound file whose streams have (mostly) record structure

    'record structure' meaning that streams are a sequence of records. Records
    are structures with information about type and size in their first bytes
    and type-dependent data of given size after that.

    Subclass of OleFileIO!
    """

    @classmethod
    def stream_class_for_name(cls, stream_name):
        """ helper for iter_streams, must be overwritten in subclasses

        :param stream_name: name of the directory entry holding the stream
        :returns: class that iter_streams uses to wrap the stream. This base
                  implementation ignores the name and always returns
                  OleRecordStream
        """
        return OleRecordStream    # this is an abstract class!

    def iter_streams(self):
        """ find all streams, including orphans

        Goes through all entries in self.direntries. An entry that is None is
        not part of the directory tree; it is loaded explicitly through
        _load_direntry and treated as orphan.

        Yields one object per directory entry of type stream. Its class is
        chosen by stream_class_for_name, its name is None for orphans.
        """
        logging.debug('Finding streams in ole file')

        for sid, direntry in enumerate(self.direntries):
            is_orphan = direntry is None
            if is_orphan:
                # this direntry is not part of the tree --> unused or orphan
                direntry = self._load_direntry(sid)
            is_stream = direntry.entry_type == olefile.STGTY_STREAM

            if is_stream:
                description = 'is stream of size {0}'.format(direntry.size)
            else:
                # a plain dict lookup would raise KeyError for any entry type
                # missing from ENTRY_TYPE2STR and thereby abort the iteration
                # just because of a log message --> fall back to generic text
                type_name = ENTRY_TYPE2STR.get(
                    direntry.entry_type,
                    'unknown type {0}'.format(direntry.entry_type))
                description = 'no stream ({0})'.format(type_name)
            logging.debug('direntry {0:2d} {1}: {2}'.format(
                sid, '[orphan]' if is_orphan else direntry.name, description))

            if is_stream:
                clz = self.stream_class_for_name(direntry.name)
                yield clz(self._open(direntry.isectStart, direntry.size),
                          None if is_orphan else direntry.name)
| 125 | + | ||
| 126 | + | ||
| 127 | +class OleRecordStream(object): | ||
| 128 | + """ a stream found in an OleRecordFile | ||
| 129 | + | ||
| 130 | + Always has a name and a size (both read-only). Has an OleFileStream handle. | ||
| 131 | + | ||
| 132 | + abstract base class | ||
| 133 | + """ | ||
| 134 | + | ||
| 135 | + def __init__(self, stream, name): | ||
| 136 | + self.stream = stream | ||
| 137 | + self.name = name | ||
| 138 | + self.size = stream.size | ||
| 139 | + | ||
| 140 | + def read_record_head(self): | ||
| 141 | + """ read first few bytes of record to determine size and type | ||
| 142 | + | ||
| 143 | + Abstract base method, to be implemented in subclasses. | ||
| 144 | + | ||
| 145 | + returns (rec_type, rec_size, other) where other will be forwarded to | ||
| 146 | + record constructors | ||
| 147 | + """ | ||
| 148 | + raise NotImplementedError('Abstract method ' | ||
| 149 | + 'OleRecordStream.read_record_head called') | ||
| 150 | + | ||
| 151 | + @classmethod | ||
| 152 | + def record_class_for_type(cls, rec_type): | ||
| 153 | + """ determine a class for given record type | ||
| 154 | + | ||
| 155 | + Only a base implementation. Create subclasses of OleRecordBase and | ||
| 156 | + return those when appropriate. | ||
| 157 | + | ||
| 158 | + returns (clz, force_read) | ||
| 159 | + """ | ||
| 160 | + return OleRecordBase, False | ||
| 161 | + | ||
| 162 | + def iter_records(self, fill_data=False): | ||
| 163 | + """ yield all records in this stream | ||
| 164 | + | ||
| 165 | + Stream must be positioned at start of records (e.g. start of stream). | ||
| 166 | + """ | ||
| 167 | + while True: | ||
| 168 | + # unpacking as in olevba._extract_vba | ||
| 169 | + pos = self.stream.tell() | ||
| 170 | + if pos >= self.size: | ||
| 171 | + break | ||
| 172 | + | ||
| 173 | + # read first few bytes, determine record type and size | ||
| 174 | + rec_type, rec_size, other = self.read_record_head() | ||
| 175 | + logging.debug('Record type {0} of size {1}' | ||
| 176 | + .format(rec_type, rec_size)) | ||
| 177 | + | ||
| 178 | + # determine what class to wrap this into | ||
| 179 | + rec_clz, force_read = self.record_class_for_type(rec_type) | ||
| 180 | + | ||
| 181 | + if fill_data or force_read: | ||
| 182 | + data = self.stream.read(rec_size) | ||
| 183 | + if len(data) != rec_size: | ||
| 184 | + raise IOError('Not enough data in stream ({0} < {1})' | ||
| 185 | + .format(len(data), rec_size)) | ||
| 186 | + else: | ||
| 187 | + self.stream.seek(rec_size, SEEK_CUR) | ||
| 188 | + data = None | ||
| 189 | + yield rec_clz(rec_type, rec_size, other, pos, data) | ||
| 190 | + | ||
| 191 | + def __str__(self): | ||
| 192 | + return '[{2} {0} (size {1})' \ | ||
| 193 | + .format(self.name or '[orphan]', self.size, | ||
| 194 | + self.__class__.__name__) | ||
| 195 | + | ||
| 196 | + | ||
class OleRecordBase(object):
    """ a single record taken from an OleRecordStream

    Every record knows its type, its size, its position and its data (data is
    whatever was handed to the constructor, possibly None).
    """

    # subclasses that represent exactly one record type set this
    TYPE = None

    # subclasses may restrict the record size: upper limit or exact value
    MAX_SIZE = None
    SIZE = None

    def __init__(self, type, size, more_data, pos, data):
        """ create a record; more_data is only handed on to parse()

        Raises ValueError if type does not match a fixed TYPE of the class or
        if size violates SIZE / MAX_SIZE of the class.
        """
        expected_type = self.TYPE
        if expected_type is not None and type != expected_type:
            raise ValueError('Wrong subclass {0} for type {1}'
                             .format(self.__class__.__name__, type))
        self.type = type

        exact_size = self.SIZE
        size_limit = self.MAX_SIZE
        if exact_size is not None and size != exact_size:
            raise ValueError('Wrong size {0} for record type {1}'
                             .format(size, type))
        if size_limit is not None and size > size_limit:
            raise ValueError('Wrong size: {0} > MAX_SIZE for record type {1}'
                             .format(size, type))

        self.size = size
        self.pos = pos
        self.data = data
        self.parse(more_data)

    def parse(self, more_data):
        """ hook called at the end of the constructor

        Subclasses can overwrite this to keep more_data (third value returned
        by OleRecordStream.read_record_head) and/or to interpret self.data if
        it was read.

        This base implementation does nothing.
        """
        return None

    def _type_str(self):
        """ describe type of this record; helper for __str__ """
        class_name = self.__class__.__name__
        return '{0} type {1}'.format(class_name, self.type)

    def __str__(self):
        """ short text with type, size and position of this record """
        head = self._type_str()
        tail = ' (size {0} from {1})]'.format(self.size, self.pos)
        return '[' + head + tail
| 245 | + | ||
| 246 | + | ||
| 247 | +############################################################################### | ||
| 248 | +# TESTING | ||
| 249 | +############################################################################### | ||
| 250 | + | ||
| 251 | + | ||
def test(filenames, ole_file_class=OleRecordFile,
         must_parse=None):
    """ parse all given file names and print rough structure

    :param filenames: iterable of names of files to check; files that are no
                      ole files are skipped
    :param ole_file_class: class used to open each file; must offer
                           iter_streams() and close()
    :param must_parse: stream class (or tuple of classes) as accepted by
                       isinstance, or None
    :returns: 2 if no file name was given, 0 otherwise

    If an error occurs while parsing a stream whose type is in must_parse,
    the error will be raised. For other streams only a message is logged.
    If must_parse is not given, every parsing error is raised.
    """
    logging.basicConfig(level=logging.DEBUG)
    if not filenames:
        logging.info('need file name[s]')
        return 2
    for filename in filenames:
        logging.info('checking file {0}'.format(filename))
        if not olefile.isOleFile(filename):
            logging.info('not an ole file - skip')
            continue
        ole = ole_file_class(filename)

        # make sure the file is closed again, also if parsing raises
        try:
            for stream in ole.iter_streams():
                logging.info(stream)
                try:
                    for record in stream.iter_records():
                        logging.info(' {0}'.format(record))
                except Exception:
                    if not must_parse or isinstance(stream, must_parse):
                        raise
                    logging.info(' failed to parse', exc_info=True)
        finally:
            ole.close()
    return 0
| 283 | + | ||
| 284 | + | ||
# when run as script: treat all command line arguments as file names and use
# the return value of test() as exit status
if __name__ == '__main__':
    exit_status = test(sys.argv[1:])
    sys.exit(exit_status)
oletools/xls_parser.py
| @@ -30,9 +30,11 @@ Read storages, (sub-)streams, records from xls file | @@ -30,9 +30,11 @@ Read storages, (sub-)streams, records from xls file | ||
| 30 | 30 | ||
| 31 | #------------------------------------------------------------------------------ | 31 | #------------------------------------------------------------------------------ |
| 32 | # CHANGELOG: | 32 | # CHANGELOG: |
| 33 | -# 2017-11-02 v0.01 CH: - first version | 33 | +# 2017-11-02 v0.1 CH: - first version |
| 34 | +# 2017-11-30 v0.2 CH: - move some code to record_base.py | ||
| 35 | +# (to avoid copy-and-paste in ppt_parser.py) | ||
| 34 | 36 | ||
| 35 | -__version__ = '0.1' | 37 | +__version__ = '0.2' |
| 36 | 38 | ||
| 37 | # ----------------------------------------------------------------------------- | 39 | # ----------------------------------------------------------------------------- |
| 38 | # TODO: | 40 | # TODO: |
| @@ -52,17 +54,8 @@ __version__ = '0.1' | @@ -52,17 +54,8 @@ __version__ = '0.1' | ||
| 52 | import sys | 54 | import sys |
| 53 | import os.path | 55 | import os.path |
| 54 | from struct import unpack | 56 | from struct import unpack |
| 55 | -from io import SEEK_CUR | ||
| 56 | import logging | 57 | import logging |
| 57 | - | ||
| 58 | -# little hack to allow absolute imports even if oletools is not installed. | ||
| 59 | -# Copied from olevba.py | ||
| 60 | -_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) # pylint: disable=invalid-name | ||
| 61 | -_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) # pylint: disable=invalid-name | ||
| 62 | -if _parent_dir not in sys.path: | ||
| 63 | - sys.path.insert(0, _parent_dir) | ||
| 64 | - | ||
| 65 | -from oletools.thirdparty import olefile | 58 | +from record_base import OleRecordFile, OleRecordStream, OleRecordBase, test |
| 66 | 59 | ||
| 67 | 60 | ||
| 68 | # === PYTHON 2+3 SUPPORT ====================================================== | 61 | # === PYTHON 2+3 SUPPORT ====================================================== |
| @@ -75,16 +68,6 @@ if sys.version_info[0] >= 3: | @@ -75,16 +68,6 @@ if sys.version_info[0] >= 3: | ||
| 75 | ############################################################################### | 68 | ############################################################################### |
| 76 | 69 | ||
| 77 | 70 | ||
| 78 | -ENTRY_TYPE2STR = { | ||
| 79 | - olefile.STGTY_EMPTY: 'empty', | ||
| 80 | - olefile.STGTY_STORAGE: 'storage', | ||
| 81 | - olefile.STGTY_STREAM: 'stream', | ||
| 82 | - olefile.STGTY_LOCKBYTES: 'lock-bytes', | ||
| 83 | - olefile.STGTY_PROPERTY: 'property', | ||
| 84 | - olefile.STGTY_ROOT: 'root' | ||
| 85 | -} | ||
| 86 | - | ||
| 87 | - | ||
| 88 | def is_xls(filename): | 71 | def is_xls(filename): |
| 89 | """ | 72 | """ |
| 90 | determine whether a given file is an excel ole file | 73 | determine whether a given file is an excel ole file |
| @@ -95,7 +78,7 @@ def is_xls(filename): | @@ -95,7 +78,7 @@ def is_xls(filename): | ||
| 95 | substream | 78 | substream |
| 96 | """ | 79 | """ |
| 97 | try: | 80 | try: |
| 98 | - for stream in XlsFile(filename).get_streams(): | 81 | + for stream in XlsFile(filename).iter_streams(): |
| 99 | if isinstance(stream, WorkbookStream): | 82 | if isinstance(stream, WorkbookStream): |
| 100 | return True | 83 | return True |
| 101 | except Exception: | 84 | except Exception: |
| @@ -122,7 +105,7 @@ def read_unicode_2byte(data, start_idx, n_chars): | @@ -122,7 +105,7 @@ def read_unicode_2byte(data, start_idx, n_chars): | ||
| 122 | unpack('<' + 'H'*n_chars, data[start_idx:end_idx])) | 105 | unpack('<' + 'H'*n_chars, data[start_idx:end_idx])) |
| 123 | else: # slower version but less memory-extensive | 106 | else: # slower version but less memory-extensive |
| 124 | unichars = (unichr(unpack('<H', data[data_idx:data_idx+2])[0]) | 107 | unichars = (unichr(unpack('<H', data[data_idx:data_idx+2])[0]) |
| 125 | - for data_idx in xrange(start_idx, end_idx, 2)) | 108 | + for data_idx in range(start_idx, end_idx, 2)) |
| 126 | return u''.join(unichars), end_idx | 109 | return u''.join(unichars), end_idx |
| 127 | 110 | ||
| 128 | 111 | ||
| @@ -130,133 +113,94 @@ def read_unicode_2byte(data, start_idx, n_chars): | @@ -130,133 +113,94 @@ def read_unicode_2byte(data, start_idx, n_chars): | ||
| 130 | # File, Storage, Stream | 113 | # File, Storage, Stream |
| 131 | ############################################################################### | 114 | ############################################################################### |
| 132 | 115 | ||
class XlsFile(OleRecordFile):
    """ An xls file has most streams made up of records """

    @classmethod
    def stream_class_for_name(cls, stream_name):
        """ helper for iter_streams

        The stream named 'Workbook' is wrapped in a WorkbookStream, all other
        streams in a plain XlsStream. Without this distinction no stream is
        ever a WorkbookStream, so the isinstance check in is_xls could never
        succeed.
        """
        if stream_name == 'Workbook':
            return WorkbookStream
        return XlsStream
| 136 | 123 | ||
| 137 | - def get_streams(self): | ||
| 138 | - """ find all streams, including orphans """ | ||
| 139 | - logging.debug('Finding streams in ole file') | ||
| 140 | 124 | ||
| 141 | - for sid, direntry in enumerate(self.direntries): | ||
| 142 | - is_orphan = direntry is None | ||
| 143 | - if is_orphan: | ||
| 144 | - # this direntry is not part of the tree --> unused or orphan | ||
| 145 | - direntry = self._load_direntry(sid) | ||
| 146 | - is_stream = direntry.entry_type == olefile.STGTY_STREAM | ||
| 147 | - logging.debug('direntry {:2d} {}: {}'.format( | ||
| 148 | - sid, '[orphan]' if is_orphan else direntry.name, | ||
| 149 | - 'is stream of size {}'.format(direntry.size) if is_stream else | ||
| 150 | - 'no stream ({})'.format(ENTRY_TYPE2STR[direntry.entry_type]))) | ||
| 151 | - if is_stream: | ||
| 152 | - if direntry.name == 'Workbook': | ||
| 153 | - clz = WorkbookStream | ||
| 154 | - else: | ||
| 155 | - clz = XlsStream | ||
| 156 | - yield clz(self._open(direntry.isectStart, direntry.size), | ||
| 157 | - None if is_orphan else direntry.name) | 125 | +class XlsStream(OleRecordStream): |
| 126 | + """ most streams in xls file consist of records """ | ||
| 158 | 127 | ||
| 128 | + def read_record_head(self): | ||
| 129 | + """ read first few bytes of record to determine size and type | ||
| 159 | 130 | ||
| 160 | -class XlsStream(object): | ||
| 161 | - """ specialization of an OLE stream | ||
| 162 | - | ||
| 163 | - Currently not much use, but may be interesting for further sub-classing | ||
| 164 | - when extending this code. | ||
| 165 | - | ||
| 166 | - stream argument can be oleile.OleStream or ooxml.ZipSubFile | ||
| 167 | - """ | 131 | + returns (type, size, other) where other is None |
| 132 | + """ | ||
| 133 | + rec_type, rec_size = unpack('<HH', self.stream.read(4)) | ||
| 134 | + return rec_type, rec_size, None | ||
| 168 | 135 | ||
| 169 | - def __init__(self, stream, name): | ||
| 170 | - self.stream = stream | ||
| 171 | - self.size = stream.size | ||
| 172 | - self.name = name | 136 | + @classmethod |
| 137 | + def record_class_for_type(cls, type): | ||
| 138 | + """ determine a class for given record type | ||
| 173 | 139 | ||
| 174 | - def __str__(self): | ||
| 175 | - return '[XlsStream {0} (size {1})' \ | ||
| 176 | - .format(self.name or '[orphan]', self.size) | 140 | + returns (clz, force_read) |
| 141 | + """ | ||
| 142 | + return XlsRecord, False | ||
| 177 | 143 | ||
| 178 | 144 | ||
| 179 | class WorkbookStream(XlsStream): | 145 | class WorkbookStream(XlsStream): |
| 180 | - """ the workbook stream which contains records """ | 146 | + """ Stream in excel file that holds most info """ |
| 181 | 147 | ||
| 182 | - def iter_records(self, fill_data=False): | ||
| 183 | - """ iterate over records in streams | 148 | + @classmethod |
| 149 | + def record_class_for_type(cls, type): | ||
| 150 | + """ determine a class for given record type | ||
| 184 | 151 | ||
| 185 | - Stream must be positioned at start of records (e.g. start of stream). | 152 | + returns (clz, force_read) |
| 186 | """ | 153 | """ |
| 187 | - while True: | ||
| 188 | - # unpacking as in olevba._extract_vba | ||
| 189 | - pos = self.stream.tell() | ||
| 190 | - if pos >= self.size: | ||
| 191 | - break | ||
| 192 | - type = unpack('<H', self.stream.read(2))[0] | ||
| 193 | - size = unpack('<H', self.stream.read(2))[0] | ||
| 194 | - force_read = False | ||
| 195 | - if type == XlsRecordBof.TYPE: | ||
| 196 | - clz = XlsRecordBof | ||
| 197 | - force_read = True | ||
| 198 | - elif type == XlsRecordEof.TYPE: | ||
| 199 | - clz = XlsRecordEof | ||
| 200 | - elif type == XlsRecordSupBook.TYPE: | ||
| 201 | - clz = XlsRecordSupBook | ||
| 202 | - force_read = True | ||
| 203 | - else: | ||
| 204 | - clz = XlsRecord | ||
| 205 | - data = None | ||
| 206 | - if fill_data or force_read: | ||
| 207 | - data = self.stream.read(size) | ||
| 208 | - else: | ||
| 209 | - self.stream.seek(size, SEEK_CUR) | ||
| 210 | - yield clz(type, size, pos, data) | ||
| 211 | - | ||
| 212 | - def __str__(self): | ||
| 213 | - return '[Workbook Stream (size {0})'.format(self.size) | ||
| 214 | - | ||
| 215 | - | ||
| 216 | -class XlsbStream(XlsStream): | 154 | + if type == XlsRecordBof.TYPE: |
| 155 | + return XlsRecordBof, True | ||
| 156 | + elif type == XlsRecordEof.TYPE: | ||
| 157 | + return XlsRecordEof, False | ||
| 158 | + elif type == XlsRecordSupBook.TYPE: | ||
| 159 | + return XlsRecordSupBook, True | ||
| 160 | + else: | ||
| 161 | + return XlsRecord, False | ||
| 162 | + | ||
| 163 | + | ||
| 164 | +class XlsbStream(OleRecordStream): | ||
| 217 | """ binary stream of an xlsb file, usually have a record structure """ | 165 | """ binary stream of an xlsb file, usually have a record structure """ |
| 218 | 166 | ||
| 219 | HIGH_BIT_MASK = 0b10000000 | 167 | HIGH_BIT_MASK = 0b10000000 |
| 220 | LOW7_BIT_MASK = 0b01111111 | 168 | LOW7_BIT_MASK = 0b01111111 |
| 221 | 169 | ||
| 222 | - def iter_records(self): | ||
| 223 | - """ iterate over records in stream | 170 | + def read_record_head(self): |
| 171 | + """ read first few bytes of record to determine size and type | ||
| 224 | 172 | ||
| 225 | - Record type and size are encoded differently than in xls streams. | ||
| 226 | - (c.f. [MS-XLSB, Paragraph 2.1.4: Record) | 173 | + returns (type, size, other) where other is None |
| 227 | """ | 174 | """ |
| 228 | - while True: | ||
| 229 | - pos = self.stream.tell() | ||
| 230 | - if pos >= self.size: | ||
| 231 | - break | ||
| 232 | - val = ord(self.stream.read(1)) | ||
| 233 | - if val & self.HIGH_BIT_MASK: # high bit of the low byte is 1 | ||
| 234 | - val2 = ord(self.stream.read(1)) # need another byte | ||
| 235 | - # combine 7 low bits of each byte | ||
| 236 | - type = (val & self.LOW7_BIT_MASK) + \ | 175 | + val = ord(self.stream.read(1)) |
| 176 | + if val & self.HIGH_BIT_MASK: # high bit of the low byte is 1 | ||
| 177 | + val2 = ord(self.stream.read(1)) # need another byte | ||
| 178 | + # combine 7 low bits of each byte | ||
| 179 | + rec_type = (val & self.LOW7_BIT_MASK) + \ | ||
| 237 | ((val2 & self.LOW7_BIT_MASK) << 7) | 180 | ((val2 & self.LOW7_BIT_MASK) << 7) |
| 238 | - else: | ||
| 239 | - type = val | ||
| 240 | - | ||
| 241 | - size = 0 | ||
| 242 | - shift = 0 | ||
| 243 | - for _ in range(4): # size needs up to 4 byte | ||
| 244 | - val = ord(self.stream.read(1)) | ||
| 245 | - size += (val & self.LOW7_BIT_MASK) << shift | ||
| 246 | - shift += 7 | ||
| 247 | - if (val & self.HIGH_BIT_MASK) == 0: # high-bit is 0 --> done | ||
| 248 | - break | ||
| 249 | - | ||
| 250 | - if pos + size > self.size: | ||
| 251 | - raise ValueError('Stream does not seem to have record ' | ||
| 252 | - 'structure or is incomplete (record size {0})' | ||
| 253 | - .format(size)) | ||
| 254 | - data = self.stream.read(size) | ||
| 255 | - | ||
| 256 | - clz = XlsbRecord | ||
| 257 | - if type == XlsbBeginSupBook.TYPE: | ||
| 258 | - clz = XlsbBeginSupBook | ||
| 259 | - yield clz(type, size, pos, data) | 181 | + else: |
| 182 | + rec_type = val | ||
| 183 | + | ||
| 184 | + rec_size = 0 | ||
| 185 | + shift = 0 | ||
| 186 | + for _ in range(4): # rec_size needs up to 4 byte | ||
| 187 | + val = ord(self.stream.read(1)) | ||
| 188 | + rec_size += (val & self.LOW7_BIT_MASK) << shift | ||
| 189 | + shift += 7 | ||
| 190 | + if (val & self.HIGH_BIT_MASK) == 0: # high-bit is 0 --> done | ||
| 191 | + break | ||
| 192 | + return rec_type, rec_size, None | ||
| 193 | + | ||
| 194 | + @classmethod | ||
| 195 | + def record_class_for_type(cls, type): | ||
| 196 | + """ determine a class for given record type | ||
| 197 | + | ||
| 198 | + returns (clz, force_read) | ||
| 199 | + """ | ||
| 200 | + if type == XlsbBeginSupBook.TYPE: | ||
| 201 | + return XlsbBeginSupBook, True | ||
| 202 | + else: | ||
| 203 | + return XlsbRecord, False | ||
| 260 | 204 | ||
| 261 | 205 | ||
| 262 | ############################################################################### | 206 | ############################################################################### |
| @@ -309,7 +253,6 @@ FREQUENT_RECORDS = dict([ | @@ -309,7 +253,6 @@ FREQUENT_RECORDS = dict([ | ||
| 309 | 253 | ||
| 310 | #: records found in xlsb binary parts | 254 | #: records found in xlsb binary parts |
| 311 | FREQUENT_RECORDS_XLSB = dict([ | 255 | FREQUENT_RECORDS_XLSB = dict([ |
| 312 | - (360, 'BrtBeginSupBook'), | ||
| 313 | (588, 'BrtEndSupBook'), | 256 | (588, 'BrtEndSupBook'), |
| 314 | (667, 'BrtSupAddin'), | 257 | (667, 'BrtSupAddin'), |
| 315 | (355, 'BrtSupBookSrc'), | 258 | (355, 'BrtSupBookSrc'), |
| @@ -330,36 +273,12 @@ FREQUENT_RECORDS_XLSB = dict([ | @@ -330,36 +273,12 @@ FREQUENT_RECORDS_XLSB = dict([ | ||
| 330 | ]) | 273 | ]) |
| 331 | 274 | ||
| 332 | 275 | ||
| 333 | -class XlsRecord(object): | 276 | +class XlsRecord(OleRecordBase): |
| 334 | """ basic building block of data in workbook stream """ | 277 | """ basic building block of data in workbook stream """ |
| 335 | 278 | ||
| 336 | #: max size of a record in xls stream (does not apply to xlsb) | 279 | #: max size of a record in xls stream (does not apply to xlsb) |
| 337 | MAX_SIZE = 8224 | 280 | MAX_SIZE = 8224 |
| 338 | 281 | ||
| 339 | - # to be overwritten in subclasses that have fixed type/size | ||
| 340 | - TYPE = None | ||
| 341 | - SIZE = None | ||
| 342 | - | ||
| 343 | - def __init__(self, type, size, pos, data=None): | ||
| 344 | - """ create a record """ | ||
| 345 | - self.type = type | ||
| 346 | - if self.MAX_SIZE is not None and size > self.MAX_SIZE: | ||
| 347 | - logging.warning('record size {0} exceeds max size' | ||
| 348 | - .format(size)) | ||
| 349 | - elif self.SIZE is not None and size != self.SIZE: | ||
| 350 | - raise ValueError('size {0} is not as expected for this type' | ||
| 351 | - .format(size)) | ||
| 352 | - self.size = size | ||
| 353 | - self.pos = pos | ||
| 354 | - self.data = data | ||
| 355 | - if data is not None and len(data) != size: | ||
| 356 | - raise ValueError('data size {0} is not expected size {1}' | ||
| 357 | - .format(len(data), size)) | ||
| 358 | - | ||
| 359 | - def read_data(self, stream): | ||
| 360 | - """ read data from stream if up to now only pos was known """ | ||
| 361 | - raise NotImplementedError() | ||
| 362 | - | ||
| 363 | def _type_str(self): | 282 | def _type_str(self): |
| 364 | """ simplification for subclasses to create their own __str__ """ | 283 | """ simplification for subclasses to create their own __str__ """ |
| 365 | try: | 284 | try: |
| @@ -367,10 +286,6 @@ class XlsRecord(object): | @@ -367,10 +286,6 @@ class XlsRecord(object): | ||
| 367 | except KeyError: | 286 | except KeyError: |
| 368 | return 'XlsRecord type {0}'.format(self.type) | 287 | return 'XlsRecord type {0}'.format(self.type) |
| 369 | 288 | ||
| 370 | - def __str__(self): | ||
| 371 | - return '[' + self._type_str() + \ | ||
| 372 | - ' (size {0} from {1})]'.format(self.size, self.pos) | ||
| 373 | - | ||
| 374 | 289 | ||
| 375 | class XlsRecordBof(XlsRecord): | 290 | class XlsRecordBof(XlsRecord): |
| 376 | """ record found at beginning of substreams """ | 291 | """ record found at beginning of substreams """ |
| @@ -380,8 +295,7 @@ class XlsRecordBof(XlsRecord): | @@ -380,8 +295,7 @@ class XlsRecordBof(XlsRecord): | ||
| 380 | DOCTYPES = dict([(0x5, 'workbook'), (0x10, 'dialog/worksheet'), | 295 | DOCTYPES = dict([(0x5, 'workbook'), (0x10, 'dialog/worksheet'), |
| 381 | (0x20, 'chart'), (0x40, 'macro')]) | 296 | (0x20, 'chart'), (0x40, 'macro')]) |
| 382 | 297 | ||
| 383 | - def __init__(self, *args, **kwargs): | ||
| 384 | - super(XlsRecordBof, self).__init__(*args, **kwargs) | 298 | + def parse(self, _): |
| 385 | if self.data is None: | 299 | if self.data is None: |
| 386 | self.doctype = None | 300 | self.doctype = None |
| 387 | return | 301 | return |
| @@ -420,9 +334,7 @@ class XlsRecordSupBook(XlsRecord): | @@ -420,9 +334,7 @@ class XlsRecordSupBook(XlsRecord): | ||
| 420 | LINK_TYPE_OLE_DDE = 'ole/dde data source' | 334 | LINK_TYPE_OLE_DDE = 'ole/dde data source' |
| 421 | LINK_TYPE_EXTERNAL = 'external workbook' | 335 | LINK_TYPE_EXTERNAL = 'external workbook' |
| 422 | 336 | ||
| 423 | - def __init__(self, *args, **kwargs): | ||
| 424 | - super(XlsRecordSupBook, self).__init__(*args, **kwargs) | ||
| 425 | - | 337 | + def parse(self, _): |
| 426 | # set defaults | 338 | # set defaults |
| 427 | self.ctab = None | 339 | self.ctab = None |
| 428 | self.cch = None | 340 | self.cch = None |
| @@ -461,7 +373,7 @@ class XlsRecordSupBook(XlsRecord): | @@ -461,7 +373,7 @@ class XlsRecordSupBook(XlsRecord): | ||
| 461 | return 'SupBook Record ({0})'.format(self.support_link_type) | 373 | return 'SupBook Record ({0})'.format(self.support_link_type) |
| 462 | 374 | ||
| 463 | 375 | ||
| 464 | -class XlsbRecord(XlsRecord): | 376 | +class XlsbRecord(OleRecordBase): |
| 465 | """ like an xls record, but from binary part of xlsb file | 377 | """ like an xls record, but from binary part of xlsb file |
| 466 | 378 | ||
| 467 | has no MAX_SIZE and types have different meanings | 379 | has no MAX_SIZE and types have different meanings |
| @@ -491,8 +403,7 @@ class XlsbBeginSupBook(XlsbRecord): | @@ -491,8 +403,7 @@ class XlsbBeginSupBook(XlsbRecord): | ||
| 491 | LINK_TYPE_UNEXPECTED = 'unexpected' | 403 | LINK_TYPE_UNEXPECTED = 'unexpected' |
| 492 | LINK_TYPE_UNKNOWN = 'unknown' | 404 | LINK_TYPE_UNKNOWN = 'unknown' |
| 493 | 405 | ||
| 494 | - def __init__(self, *args, **kwargs): | ||
| 495 | - super(XlsbBeginSupBook, self).__init__(*args, **kwargs) | 406 | + def parse(self, _): |
| 496 | self.link_type = self.LINK_TYPE_UNKNOWN | 407 | self.link_type = self.LINK_TYPE_UNKNOWN |
| 497 | self.string1 = '' | 408 | self.string1 = '' |
| 498 | self.string2 = '' | 409 | self.string2 = '' |
| @@ -540,6 +451,7 @@ class XlsbBeginSupBook(XlsbRecord): | @@ -540,6 +451,7 @@ class XlsbBeginSupBook(XlsbRecord): | ||
| 540 | # XLSB Binary Parts | 451 | # XLSB Binary Parts |
| 541 | ############################################################################### | 452 | ############################################################################### |
| 542 | 453 | ||
| 454 | + | ||
| 543 | def parse_xlsb_part(stream, _, filename): | 455 | def parse_xlsb_part(stream, _, filename): |
| 544 | """ Excel xlsb files also have a record structure. iter records """ | 456 | """ Excel xlsb files also have a record structure. iter records """ |
| 545 | for record in XlsbStream(stream, filename).iter_records(): | 457 | for record in XlsbStream(stream, filename).iter_records(): |
| @@ -551,26 +463,5 @@ def parse_xlsb_part(stream, _, filename): | @@ -551,26 +463,5 @@ def parse_xlsb_part(stream, _, filename): | ||
| 551 | ############################################################################### | 463 | ############################################################################### |
| 552 | 464 | ||
| 553 | 465 | ||
| 554 | -def test(*filenames): | ||
| 555 | - """ parse all given file names and print rough structure """ | ||
| 556 | - logging.basicConfig(level=logging.DEBUG) | ||
| 557 | - if not filenames: | ||
| 558 | - logging.info('need file name[s]') | ||
| 559 | - return 2 | ||
| 560 | - for filename in filenames: | ||
| 561 | - logging.info('checking file {0}'.format(filename)) | ||
| 562 | - if not olefile.isOleFile(filename): | ||
| 563 | - logging.info('not an ole file - skip') | ||
| 564 | - continue | ||
| 565 | - xls = XlsFile(filename) | ||
| 566 | - | ||
| 567 | - for stream in xls.get_streams(): | ||
| 568 | - logging.info(stream) | ||
| 569 | - if isinstance(stream, WorkbookStream): | ||
| 570 | - for record in stream.iter_records(): | ||
| 571 | - logging.info(' {0}'.format(record)) | ||
| 572 | - return 0 | ||
| 573 | - | ||
| 574 | - | ||
| 575 | if __name__ == '__main__': | 466 | if __name__ == '__main__': |
| 576 | - sys.exit(test(*sys.argv[1:])) | 467 | + sys.exit(test(sys.argv[1:], XlsFile, WorkbookStream)) |