Commit d397edb5fe4149bf8278b16e63da33db0e6357da
1 parent
27dc5360
xls_parser: move code to new record_base for re-use with ppt files
Parsing through records seems to make sense. Try to repeat the same with ppt files next. To avoid copy-and-paste, move the code used by both into a common base module, record_base.py
Showing
2 changed files
with
367 additions
and
190 deletions
oletools/record_base.py
0 → 100644
| 1 | +#!/usr/bin/env python | |
| 2 | + | |
| 3 | +""" | |
| 4 | +record_base.py | |
| 5 | + | |
| 6 | +Common stuff for ole files whose streams are a sequence of record structures. | |
| 7 | +This is the case for xls and ppt, so classes are bases for xls_parser.py and | |
| 8 | +ppt_parser.py . | |
| 9 | +""" | |
| 10 | + | |
| 11 | +# === LICENSE ================================================================= | |
| 12 | +# | |
| 13 | +# Redistribution and use in source and binary forms, with or without | |
| 14 | +# modification, are permitted provided that the following conditions are met: | |
| 15 | +# | |
| 16 | +# * Redistributions of source code must retain the above copyright notice, | |
| 17 | +# this list of conditions and the following disclaimer. | |
| 18 | +# * Redistributions in binary form must reproduce the above copyright notice, | |
| 19 | +# this list of conditions and the following disclaimer in the documentation | |
| 20 | +# and/or other materials provided with the distribution. | |
| 21 | +# | |
| 22 | +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
| 23 | +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 24 | +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 25 | +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE | |
| 26 | +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
| 27 | +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
| 28 | +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
| 29 | +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
| 30 | +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
| 31 | +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
| 32 | +# POSSIBILITY OF SUCH DAMAGE. | |
| 33 | + | |
| 34 | +from __future__ import print_function | |
| 35 | + | |
| 36 | +#------------------------------------------------------------------------------ | |
| 37 | +# CHANGELOG: | |
| 38 | +# 2017-11-30 v0.01 CH: - first version based on xls_parser | |
| 39 | + | |
| 40 | +#------------------------------------------------------------------------------ | |
| 41 | +# TODO: | |
| 42 | +# - read DocumentSummaryInformation first to get more info about streams | |
| 43 | +# (maybe content type or so; identify streams that are never record-based) | |
| 44 | +# - think about integrating this with olefile itself | |
| 45 | + | |
| 46 | +# ----------------------------------------------------------------------------- | |
| 47 | +# REFERENCES: | |
| 48 | +# - [MS-XLS]: Excel Binary File Format (.xls) Structure Specification | |
| 49 | +# https://msdn.microsoft.com/en-us/library/office/cc313154(v=office.14).aspx | |
| 50 | +# - Understanding the Excel .xls Binary File Format | |
| 51 | +# https://msdn.microsoft.com/en-us/library/office/gg615597(v=office.14).aspx | |
| 52 | +# - [MS-PPT] | |
| 53 | + | |
| 54 | + | |
| 55 | +import sys | |
| 56 | +import os.path | |
| 57 | +from io import SEEK_CUR | |
| 58 | +import logging | |
| 59 | + | |
| 60 | +# little hack to allow absolute imports even if oletools is not installed. | |
| 61 | +# Copied from olevba.py | |
| 62 | +_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) # pylint: disable=invalid-name | |
| 63 | +_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) # pylint: disable=invalid-name | |
| 64 | +del _thismodule_dir | |
| 65 | +if _parent_dir not in sys.path: | |
| 66 | + sys.path.insert(0, _parent_dir) | |
| 67 | +del _parent_dir | |
| 68 | + | |
| 69 | +from oletools.thirdparty import olefile | |
| 70 | + | |
| 71 | + | |
| 72 | +############################################################################### | |
| 73 | +# Helpers | |
| 74 | +############################################################################### | |
| 75 | + | |
| 76 | + | |
| 77 | +ENTRY_TYPE2STR = { | |
| 78 | + olefile.STGTY_EMPTY: 'empty', | |
| 79 | + olefile.STGTY_STORAGE: 'storage', | |
| 80 | + olefile.STGTY_STREAM: 'stream', | |
| 81 | + olefile.STGTY_LOCKBYTES: 'lock-bytes', | |
| 82 | + olefile.STGTY_PROPERTY: 'property', | |
| 83 | + olefile.STGTY_ROOT: 'root' | |
| 84 | +} | |
| 85 | + | |
| 86 | + | |
| 87 | +############################################################################### | |
| 88 | +# Base Classes | |
| 89 | +############################################################################### | |
| 90 | + | |
| 91 | + | |
| 92 | +class OleRecordFile(olefile.OleFileIO): | |
| 93 | + """ an OLE compound file whose streams have (mostly) record structure | |
| 94 | + | |
| 95 | + 'record structure' meaning that streams are a sequence of records. Records | |
| 96 | + are structures with information about type and size in their first bytes | |
| 97 | + and type-dependent data of given size after that. | |
| 98 | + | |
| 99 | + Subclass of OleFileIO! | |
| 100 | + """ | |
| 101 | + | |
| 102 | + @classmethod | |
| 103 | + def stream_class_for_name(cls, stream_name): | |
| 104 | + """ helper for iter_streams, must be overridden in subclasses """ | |
| 105 | + return OleRecordStream # this is an abstract class! | |
| 106 | + | |
| 107 | + def iter_streams(self): | |
| 108 | + """ find all streams, including orphans """ | |
| 109 | + logging.debug('Finding streams in ole file') | |
| 110 | + | |
| 111 | + for sid, direntry in enumerate(self.direntries): | |
| 112 | + is_orphan = direntry is None | |
| 113 | + if is_orphan: | |
| 114 | + # this direntry is not part of the tree --> unused or orphan | |
| 115 | + direntry = self._load_direntry(sid) | |
| 116 | + is_stream = direntry.entry_type == olefile.STGTY_STREAM | |
| 117 | + logging.debug('direntry {:2d} {}: {}'.format( | |
| 118 | + sid, '[orphan]' if is_orphan else direntry.name, | |
| 119 | + 'is stream of size {}'.format(direntry.size) if is_stream else | |
| 120 | + 'no stream ({})'.format(ENTRY_TYPE2STR[direntry.entry_type]))) | |
| 121 | + if is_stream: | |
| 122 | + clz = self.stream_class_for_name(direntry.name) | |
| 123 | + yield clz(self._open(direntry.isectStart, direntry.size), | |
| 124 | + None if is_orphan else direntry.name) | |
| 125 | + | |
| 126 | + | |
| 127 | +class OleRecordStream(object): | |
| 128 | + """ a stream found in an OleRecordFile | |
| 129 | + | |
| 130 | + Always has a name and a size (both read-only). Has an OleFileStream handle. | |
| 131 | + | |
| 132 | + abstract base class | |
| 133 | + """ | |
| 134 | + | |
| 135 | + def __init__(self, stream, name): | |
| 136 | + self.stream = stream | |
| 137 | + self.name = name | |
| 138 | + self.size = stream.size | |
| 139 | + | |
| 140 | + def read_record_head(self): | |
| 141 | + """ read first few bytes of record to determine size and type | |
| 142 | + | |
| 143 | + Abstract base method, to be implemented in subclasses. | |
| 144 | + | |
| 145 | + returns (rec_type, rec_size, other) where other will be forwarded to | |
| 146 | + record constructors | |
| 147 | + """ | |
| 148 | + raise NotImplementedError('Abstract method ' | |
| 149 | + 'OleRecordStream.read_record_head called') | |
| 150 | + | |
| 151 | + @classmethod | |
| 152 | + def record_class_for_type(cls, rec_type): | |
| 153 | + """ determine a class for given record type | |
| 154 | + | |
| 155 | + Only a base implementation. Create subclasses of OleRecordBase and | |
| 156 | + return those when appropriate. | |
| 157 | + | |
| 158 | + returns (clz, force_read) | |
| 159 | + """ | |
| 160 | + return OleRecordBase, False | |
| 161 | + | |
| 162 | + def iter_records(self, fill_data=False): | |
| 163 | + """ yield all records in this stream | |
| 164 | + | |
| 165 | + Stream must be positioned at start of records (e.g. start of stream). | |
| 166 | + """ | |
| 167 | + while True: | |
| 168 | + # unpacking as in olevba._extract_vba | |
| 169 | + pos = self.stream.tell() | |
| 170 | + if pos >= self.size: | |
| 171 | + break | |
| 172 | + | |
| 173 | + # read first few bytes, determine record type and size | |
| 174 | + rec_type, rec_size, other = self.read_record_head() | |
| 175 | + logging.debug('Record type {0} of size {1}' | |
| 176 | + .format(rec_type, rec_size)) | |
| 177 | + | |
| 178 | + # determine what class to wrap this into | |
| 179 | + rec_clz, force_read = self.record_class_for_type(rec_type) | |
| 180 | + | |
| 181 | + if fill_data or force_read: | |
| 182 | + data = self.stream.read(rec_size) | |
| 183 | + if len(data) != rec_size: | |
| 184 | + raise IOError('Not enough data in stream ({0} < {1})' | |
| 185 | + .format(len(data), rec_size)) | |
| 186 | + else: | |
| 187 | + self.stream.seek(rec_size, SEEK_CUR) | |
| 188 | + data = None | |
| 189 | + yield rec_clz(rec_type, rec_size, other, pos, data) | |
| 190 | + | |
| 191 | + def __str__(self): | |
| 192 | + return '[{2} {0} (size {1})' \ | |
| 193 | + .format(self.name or '[orphan]', self.size, | |
| 194 | + self.__class__.__name__) | |
| 195 | + | |
| 196 | + | |
| 197 | +class OleRecordBase(object): | |
| 198 | + """ a record found in an OleRecordStream | |
| 199 | + | |
| 200 | + always has a type and a size, also pos and data | |
| 201 | + """ | |
| 202 | + | |
| 203 | + # for subclasses with a fixed type | |
| 204 | + TYPE = None | |
| 205 | + | |
| 206 | + # (max) size of subclasses | |
| 207 | + MAX_SIZE = None | |
| 208 | + SIZE = None | |
| 209 | + | |
| 210 | + def __init__(self, type, size, more_data, pos, data): | |
| 211 | + """ create a record; more_data is discarded """ | |
| 212 | + if self.TYPE is not None and type != self.TYPE: | |
| 213 | + raise ValueError('Wrong subclass {0} for type {1}' | |
| 214 | + .format(self.__class__.__name__, type)) | |
| 215 | + self.type = type | |
| 216 | + if self.SIZE is not None and size != self.SIZE: | |
| 217 | + raise ValueError('Wrong size {0} for record type {1}' | |
| 218 | + .format(size, type)) | |
| 219 | + elif self.MAX_SIZE is not None and size > self.MAX_SIZE: | |
| 220 | + raise ValueError('Wrong size: {0} > MAX_SIZE for record type {1}' | |
| 221 | + .format(size, type)) | |
| 222 | + self.size = size | |
| 223 | + self.pos = pos | |
| 224 | + self.data = data | |
| 225 | + self.parse(more_data) | |
| 226 | + | |
| 227 | + def parse(self, more_data): | |
| 228 | + """ finish constructing this record | |
| 229 | + | |
| 230 | + Can save more_data from OleRecordStream.read_record_head and/or parse | |
| 231 | + data (if it was read). | |
| 232 | + | |
| 233 | + Base implementation, does nothing. To be overwritten in subclasses. | |
| 234 | + """ | |
| 235 | + pass | |
| 236 | + | |
| 237 | + def _type_str(self): | |
| 238 | + """ helper for __str__, base implementation """ | |
| 239 | + return '{0} type {1}'.format(self.__class__.__name__, self.type) | |
| 240 | + | |
| 241 | + def __str__(self): | |
| 242 | + """ create a short but informative textual representation of self """ | |
| 243 | + return '[' + self._type_str() + \ | |
| 244 | + ' (size {0} from {1})]'.format(self.size, self.pos) | |
| 245 | + | |
| 246 | + | |
| 247 | +############################################################################### | |
| 248 | +# TESTING | |
| 249 | +############################################################################### | |
| 250 | + | |
| 251 | + | |
| 252 | +def test(filenames, ole_file_class=OleRecordFile, | |
| 253 | + must_parse=None): | |
| 254 | + """ parse all given file names and print rough structure | |
| 255 | + | |
| 256 | + If must_parse is not given, or an error occurs while parsing a stream of a | |
| 257 | + type in must_parse, the error is re-raised. Otherwise the failure is logged. | |
| 258 | + """ | |
| 259 | + logging.basicConfig(level=logging.DEBUG) | |
| 260 | + if not filenames: | |
| 261 | + logging.info('need file name[s]') | |
| 262 | + return 2 | |
| 263 | + for filename in filenames: | |
| 264 | + logging.info('checking file {0}'.format(filename)) | |
| 265 | + if not olefile.isOleFile(filename): | |
| 266 | + logging.info('not an ole file - skip') | |
| 267 | + continue | |
| 268 | + ole = ole_file_class(filename) | |
| 269 | + | |
| 270 | + for stream in ole.iter_streams(): | |
| 271 | + logging.info(stream) | |
| 272 | + try: | |
| 273 | + for record in stream.iter_records(): | |
| 274 | + logging.info(' {0}'.format(record)) | |
| 275 | + except Exception: | |
| 276 | + if not must_parse: | |
| 277 | + raise | |
| 278 | + elif isinstance(stream, must_parse): | |
| 279 | + raise | |
| 280 | + else: | |
| 281 | + logging.info(' failed to parse', exc_info=True) | |
| 282 | + return 0 | |
| 283 | + | |
| 284 | + | |
| 285 | +if __name__ == '__main__': | |
| 286 | + sys.exit(test(sys.argv[1:])) | ... | ... |
oletools/xls_parser.py
| ... | ... | @@ -30,9 +30,11 @@ Read storages, (sub-)streams, records from xls file |
| 30 | 30 | |
| 31 | 31 | #------------------------------------------------------------------------------ |
| 32 | 32 | # CHANGELOG: |
| 33 | -# 2017-11-02 v0.01 CH: - first version | |
| 33 | +# 2017-11-02 v0.1 CH: - first version | |
| 34 | +# 2017-11-02 v0.2 CH: - move some code to record_base.py | |
| 35 | +# (to avoid copy-and-paste in ppt_parser.py) | |
| 34 | 36 | |
| 35 | -__version__ = '0.1' | |
| 37 | +__version__ = '0.2' | |
| 36 | 38 | |
| 37 | 39 | # ----------------------------------------------------------------------------- |
| 38 | 40 | # TODO: |
| ... | ... | @@ -52,17 +54,8 @@ __version__ = '0.1' |
| 52 | 54 | import sys |
| 53 | 55 | import os.path |
| 54 | 56 | from struct import unpack |
| 55 | -from io import SEEK_CUR | |
| 56 | 57 | import logging |
| 57 | - | |
| 58 | -# little hack to allow absolute imports even if oletools is not installed. | |
| 59 | -# Copied from olevba.py | |
| 60 | -_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) # pylint: disable=invalid-name | |
| 61 | -_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) # pylint: disable=invalid-name | |
| 62 | -if _parent_dir not in sys.path: | |
| 63 | - sys.path.insert(0, _parent_dir) | |
| 64 | - | |
| 65 | -from oletools.thirdparty import olefile | |
| 58 | +from record_base import OleRecordFile, OleRecordStream, OleRecordBase, test | |
| 66 | 59 | |
| 67 | 60 | |
| 68 | 61 | # === PYTHON 2+3 SUPPORT ====================================================== |
| ... | ... | @@ -75,16 +68,6 @@ if sys.version_info[0] >= 3: |
| 75 | 68 | ############################################################################### |
| 76 | 69 | |
| 77 | 70 | |
| 78 | -ENTRY_TYPE2STR = { | |
| 79 | - olefile.STGTY_EMPTY: 'empty', | |
| 80 | - olefile.STGTY_STORAGE: 'storage', | |
| 81 | - olefile.STGTY_STREAM: 'stream', | |
| 82 | - olefile.STGTY_LOCKBYTES: 'lock-bytes', | |
| 83 | - olefile.STGTY_PROPERTY: 'property', | |
| 84 | - olefile.STGTY_ROOT: 'root' | |
| 85 | -} | |
| 86 | - | |
| 87 | - | |
| 88 | 71 | def is_xls(filename): |
| 89 | 72 | """ |
| 90 | 73 | determine whether a given file is an excel ole file |
| ... | ... | @@ -95,7 +78,7 @@ def is_xls(filename): |
| 95 | 78 | substream |
| 96 | 79 | """ |
| 97 | 80 | try: |
| 98 | - for stream in XlsFile(filename).get_streams(): | |
| 81 | + for stream in XlsFile(filename).iter_streams(): | |
| 99 | 82 | if isinstance(stream, WorkbookStream): |
| 100 | 83 | return True |
| 101 | 84 | except Exception: |
| ... | ... | @@ -122,7 +105,7 @@ def read_unicode_2byte(data, start_idx, n_chars): |
| 122 | 105 | unpack('<' + 'H'*n_chars, data[start_idx:end_idx])) |
| 123 | 106 | else: # slower version but less memory-extensive |
| 124 | 107 | unichars = (unichr(unpack('<H', data[data_idx:data_idx+2])[0]) |
| 125 | - for data_idx in xrange(start_idx, end_idx, 2)) | |
| 108 | + for data_idx in range(start_idx, end_idx, 2)) | |
| 126 | 109 | return u''.join(unichars), end_idx |
| 127 | 110 | |
| 128 | 111 | |
| ... | ... | @@ -130,133 +113,94 @@ def read_unicode_2byte(data, start_idx, n_chars): |
| 130 | 113 | # File, Storage, Stream |
| 131 | 114 | ############################################################################### |
| 132 | 115 | |
| 116 | +class XlsFile(OleRecordFile): | |
| 117 | + """ An xls file has most streams made up of records """ | |
| 133 | 118 | |
| 134 | -class XlsFile(olefile.OleFileIO): | |
| 135 | - """ specialization of an OLE compound file """ | |
| 119 | + @classmethod | |
| 120 | + def stream_class_for_name(self, stream_name): | |
| 121 | + """ helper for iter_streams """ | |
| 122 | + return XlsStream | |
| 136 | 123 | |
| 137 | - def get_streams(self): | |
| 138 | - """ find all streams, including orphans """ | |
| 139 | - logging.debug('Finding streams in ole file') | |
| 140 | 124 | |
| 141 | - for sid, direntry in enumerate(self.direntries): | |
| 142 | - is_orphan = direntry is None | |
| 143 | - if is_orphan: | |
| 144 | - # this direntry is not part of the tree --> unused or orphan | |
| 145 | - direntry = self._load_direntry(sid) | |
| 146 | - is_stream = direntry.entry_type == olefile.STGTY_STREAM | |
| 147 | - logging.debug('direntry {:2d} {}: {}'.format( | |
| 148 | - sid, '[orphan]' if is_orphan else direntry.name, | |
| 149 | - 'is stream of size {}'.format(direntry.size) if is_stream else | |
| 150 | - 'no stream ({})'.format(ENTRY_TYPE2STR[direntry.entry_type]))) | |
| 151 | - if is_stream: | |
| 152 | - if direntry.name == 'Workbook': | |
| 153 | - clz = WorkbookStream | |
| 154 | - else: | |
| 155 | - clz = XlsStream | |
| 156 | - yield clz(self._open(direntry.isectStart, direntry.size), | |
| 157 | - None if is_orphan else direntry.name) | |
| 125 | +class XlsStream(OleRecordStream): | |
| 126 | + """ most streams in xls file consist of records """ | |
| 158 | 127 | |
| 128 | + def read_record_head(self): | |
| 129 | + """ read first few bytes of record to determine size and type | |
| 159 | 130 | |
| 160 | -class XlsStream(object): | |
| 161 | - """ specialization of an OLE stream | |
| 162 | - | |
| 163 | - Currently not much use, but may be interesting for further sub-classing | |
| 164 | - when extending this code. | |
| 165 | - | |
| 166 | - stream argument can be oleile.OleStream or ooxml.ZipSubFile | |
| 167 | - """ | |
| 131 | + returns (type, size, other) where other is None | |
| 132 | + """ | |
| 133 | + rec_type, rec_size = unpack('<HH', self.stream.read(4)) | |
| 134 | + return rec_type, rec_size, None | |
| 168 | 135 | |
| 169 | - def __init__(self, stream, name): | |
| 170 | - self.stream = stream | |
| 171 | - self.size = stream.size | |
| 172 | - self.name = name | |
| 136 | + @classmethod | |
| 137 | + def record_class_for_type(cls, type): | |
| 138 | + """ determine a class for given record type | |
| 173 | 139 | |
| 174 | - def __str__(self): | |
| 175 | - return '[XlsStream {0} (size {1})' \ | |
| 176 | - .format(self.name or '[orphan]', self.size) | |
| 140 | + returns (clz, force_read) | |
| 141 | + """ | |
| 142 | + return XlsRecord, False | |
| 177 | 143 | |
| 178 | 144 | |
| 179 | 145 | class WorkbookStream(XlsStream): |
| 180 | - """ the workbook stream which contains records """ | |
| 146 | + """ Stream in excel file that holds most info """ | |
| 181 | 147 | |
| 182 | - def iter_records(self, fill_data=False): | |
| 183 | - """ iterate over records in streams | |
| 148 | + @classmethod | |
| 149 | + def record_class_for_type(cls, type): | |
| 150 | + """ determine a class for given record type | |
| 184 | 151 | |
| 185 | - Stream must be positioned at start of records (e.g. start of stream). | |
| 152 | + returns (clz, force_read) | |
| 186 | 153 | """ |
| 187 | - while True: | |
| 188 | - # unpacking as in olevba._extract_vba | |
| 189 | - pos = self.stream.tell() | |
| 190 | - if pos >= self.size: | |
| 191 | - break | |
| 192 | - type = unpack('<H', self.stream.read(2))[0] | |
| 193 | - size = unpack('<H', self.stream.read(2))[0] | |
| 194 | - force_read = False | |
| 195 | - if type == XlsRecordBof.TYPE: | |
| 196 | - clz = XlsRecordBof | |
| 197 | - force_read = True | |
| 198 | - elif type == XlsRecordEof.TYPE: | |
| 199 | - clz = XlsRecordEof | |
| 200 | - elif type == XlsRecordSupBook.TYPE: | |
| 201 | - clz = XlsRecordSupBook | |
| 202 | - force_read = True | |
| 203 | - else: | |
| 204 | - clz = XlsRecord | |
| 205 | - data = None | |
| 206 | - if fill_data or force_read: | |
| 207 | - data = self.stream.read(size) | |
| 208 | - else: | |
| 209 | - self.stream.seek(size, SEEK_CUR) | |
| 210 | - yield clz(type, size, pos, data) | |
| 211 | - | |
| 212 | - def __str__(self): | |
| 213 | - return '[Workbook Stream (size {0})'.format(self.size) | |
| 214 | - | |
| 215 | - | |
| 216 | -class XlsbStream(XlsStream): | |
| 154 | + if type == XlsRecordBof.TYPE: | |
| 155 | + return XlsRecordBof, True | |
| 156 | + elif type == XlsRecordEof.TYPE: | |
| 157 | + return XlsRecordEof, False | |
| 158 | + elif type == XlsRecordSupBook.TYPE: | |
| 159 | + return XlsRecordSupBook, True | |
| 160 | + else: | |
| 161 | + return XlsRecord, False | |
| 162 | + | |
| 163 | + | |
| 164 | +class XlsbStream(OleRecordStream): | |
| 217 | 165 | """ binary stream of an xlsb file, usually have a record structure """ |
| 218 | 166 | |
| 219 | 167 | HIGH_BIT_MASK = 0b10000000 |
| 220 | 168 | LOW7_BIT_MASK = 0b01111111 |
| 221 | 169 | |
| 222 | - def iter_records(self): | |
| 223 | - """ iterate over records in stream | |
| 170 | + def read_record_head(self): | |
| 171 | + """ read first few bytes of record to determine size and type | |
| 224 | 172 | |
| 225 | - Record type and size are encoded differently than in xls streams. | |
| 226 | - (c.f. [MS-XLSB, Paragraph 2.1.4: Record) | |
| 173 | + returns (type, size, other) where other is None | |
| 227 | 174 | """ |
| 228 | - while True: | |
| 229 | - pos = self.stream.tell() | |
| 230 | - if pos >= self.size: | |
| 231 | - break | |
| 232 | - val = ord(self.stream.read(1)) | |
| 233 | - if val & self.HIGH_BIT_MASK: # high bit of the low byte is 1 | |
| 234 | - val2 = ord(self.stream.read(1)) # need another byte | |
| 235 | - # combine 7 low bits of each byte | |
| 236 | - type = (val & self.LOW7_BIT_MASK) + \ | |
| 175 | + val = ord(self.stream.read(1)) | |
| 176 | + if val & self.HIGH_BIT_MASK: # high bit of the low byte is 1 | |
| 177 | + val2 = ord(self.stream.read(1)) # need another byte | |
| 178 | + # combine 7 low bits of each byte | |
| 179 | + rec_type = (val & self.LOW7_BIT_MASK) + \ | |
| 237 | 180 | ((val2 & self.LOW7_BIT_MASK) << 7) |
| 238 | - else: | |
| 239 | - type = val | |
| 240 | - | |
| 241 | - size = 0 | |
| 242 | - shift = 0 | |
| 243 | - for _ in range(4): # size needs up to 4 byte | |
| 244 | - val = ord(self.stream.read(1)) | |
| 245 | - size += (val & self.LOW7_BIT_MASK) << shift | |
| 246 | - shift += 7 | |
| 247 | - if (val & self.HIGH_BIT_MASK) == 0: # high-bit is 0 --> done | |
| 248 | - break | |
| 249 | - | |
| 250 | - if pos + size > self.size: | |
| 251 | - raise ValueError('Stream does not seem to have record ' | |
| 252 | - 'structure or is incomplete (record size {0})' | |
| 253 | - .format(size)) | |
| 254 | - data = self.stream.read(size) | |
| 255 | - | |
| 256 | - clz = XlsbRecord | |
| 257 | - if type == XlsbBeginSupBook.TYPE: | |
| 258 | - clz = XlsbBeginSupBook | |
| 259 | - yield clz(type, size, pos, data) | |
| 181 | + else: | |
| 182 | + rec_type = val | |
| 183 | + | |
| 184 | + rec_size = 0 | |
| 185 | + shift = 0 | |
| 186 | + for _ in range(4): # rec_size needs up to 4 byte | |
| 187 | + val = ord(self.stream.read(1)) | |
| 188 | + rec_size += (val & self.LOW7_BIT_MASK) << shift | |
| 189 | + shift += 7 | |
| 190 | + if (val & self.HIGH_BIT_MASK) == 0: # high-bit is 0 --> done | |
| 191 | + break | |
| 192 | + return rec_type, rec_size, None | |
| 193 | + | |
| 194 | + @classmethod | |
| 195 | + def record_class_for_type(cls, type): | |
| 196 | + """ determine a class for given record type | |
| 197 | + | |
| 198 | + returns (clz, force_read) | |
| 199 | + """ | |
| 200 | + if type == XlsbBeginSupBook.TYPE: | |
| 201 | + return XlsbBeginSupBook, True | |
| 202 | + else: | |
| 203 | + return XlsbRecord, False | |
| 260 | 204 | |
| 261 | 205 | |
| 262 | 206 | ############################################################################### |
| ... | ... | @@ -309,7 +253,6 @@ FREQUENT_RECORDS = dict([ |
| 309 | 253 | |
| 310 | 254 | #: records found in xlsb binary parts |
| 311 | 255 | FREQUENT_RECORDS_XLSB = dict([ |
| 312 | - (360, 'BrtBeginSupBook'), | |
| 313 | 256 | (588, 'BrtEndSupBook'), |
| 314 | 257 | (667, 'BrtSupAddin'), |
| 315 | 258 | (355, 'BrtSupBookSrc'), |
| ... | ... | @@ -330,36 +273,12 @@ FREQUENT_RECORDS_XLSB = dict([ |
| 330 | 273 | ]) |
| 331 | 274 | |
| 332 | 275 | |
| 333 | -class XlsRecord(object): | |
| 276 | +class XlsRecord(OleRecordBase): | |
| 334 | 277 | """ basic building block of data in workbook stream """ |
| 335 | 278 | |
| 336 | 279 | #: max size of a record in xls stream (does not apply to xlsb) |
| 337 | 280 | MAX_SIZE = 8224 |
| 338 | 281 | |
| 339 | - # to be overwritten in subclasses that have fixed type/size | |
| 340 | - TYPE = None | |
| 341 | - SIZE = None | |
| 342 | - | |
| 343 | - def __init__(self, type, size, pos, data=None): | |
| 344 | - """ create a record """ | |
| 345 | - self.type = type | |
| 346 | - if self.MAX_SIZE is not None and size > self.MAX_SIZE: | |
| 347 | - logging.warning('record size {0} exceeds max size' | |
| 348 | - .format(size)) | |
| 349 | - elif self.SIZE is not None and size != self.SIZE: | |
| 350 | - raise ValueError('size {0} is not as expected for this type' | |
| 351 | - .format(size)) | |
| 352 | - self.size = size | |
| 353 | - self.pos = pos | |
| 354 | - self.data = data | |
| 355 | - if data is not None and len(data) != size: | |
| 356 | - raise ValueError('data size {0} is not expected size {1}' | |
| 357 | - .format(len(data), size)) | |
| 358 | - | |
| 359 | - def read_data(self, stream): | |
| 360 | - """ read data from stream if up to now only pos was known """ | |
| 361 | - raise NotImplementedError() | |
| 362 | - | |
| 363 | 282 | def _type_str(self): |
| 364 | 283 | """ simplification for subclasses to create their own __str__ """ |
| 365 | 284 | try: |
| ... | ... | @@ -367,10 +286,6 @@ class XlsRecord(object): |
| 367 | 286 | except KeyError: |
| 368 | 287 | return 'XlsRecord type {0}'.format(self.type) |
| 369 | 288 | |
| 370 | - def __str__(self): | |
| 371 | - return '[' + self._type_str() + \ | |
| 372 | - ' (size {0} from {1})]'.format(self.size, self.pos) | |
| 373 | - | |
| 374 | 289 | |
| 375 | 290 | class XlsRecordBof(XlsRecord): |
| 376 | 291 | """ record found at beginning of substreams """ |
| ... | ... | @@ -380,8 +295,7 @@ class XlsRecordBof(XlsRecord): |
| 380 | 295 | DOCTYPES = dict([(0x5, 'workbook'), (0x10, 'dialog/worksheet'), |
| 381 | 296 | (0x20, 'chart'), (0x40, 'macro')]) |
| 382 | 297 | |
| 383 | - def __init__(self, *args, **kwargs): | |
| 384 | - super(XlsRecordBof, self).__init__(*args, **kwargs) | |
| 298 | + def parse(self, _): | |
| 385 | 299 | if self.data is None: |
| 386 | 300 | self.doctype = None |
| 387 | 301 | return |
| ... | ... | @@ -420,9 +334,7 @@ class XlsRecordSupBook(XlsRecord): |
| 420 | 334 | LINK_TYPE_OLE_DDE = 'ole/dde data source' |
| 421 | 335 | LINK_TYPE_EXTERNAL = 'external workbook' |
| 422 | 336 | |
| 423 | - def __init__(self, *args, **kwargs): | |
| 424 | - super(XlsRecordSupBook, self).__init__(*args, **kwargs) | |
| 425 | - | |
| 337 | + def parse(self, _): | |
| 426 | 338 | # set defaults |
| 427 | 339 | self.ctab = None |
| 428 | 340 | self.cch = None |
| ... | ... | @@ -461,7 +373,7 @@ class XlsRecordSupBook(XlsRecord): |
| 461 | 373 | return 'SupBook Record ({0})'.format(self.support_link_type) |
| 462 | 374 | |
| 463 | 375 | |
| 464 | -class XlsbRecord(XlsRecord): | |
| 376 | +class XlsbRecord(OleRecordBase): | |
| 465 | 377 | """ like an xls record, but from binary part of xlsb file |
| 466 | 378 | |
| 467 | 379 | has no MAX_SIZE and types have different meanings |
| ... | ... | @@ -491,8 +403,7 @@ class XlsbBeginSupBook(XlsbRecord): |
| 491 | 403 | LINK_TYPE_UNEXPECTED = 'unexpected' |
| 492 | 404 | LINK_TYPE_UNKNOWN = 'unknown' |
| 493 | 405 | |
| 494 | - def __init__(self, *args, **kwargs): | |
| 495 | - super(XlsbBeginSupBook, self).__init__(*args, **kwargs) | |
| 406 | + def parse(self, _): | |
| 496 | 407 | self.link_type = self.LINK_TYPE_UNKNOWN |
| 497 | 408 | self.string1 = '' |
| 498 | 409 | self.string2 = '' |
| ... | ... | @@ -540,6 +451,7 @@ class XlsbBeginSupBook(XlsbRecord): |
| 540 | 451 | # XLSB Binary Parts |
| 541 | 452 | ############################################################################### |
| 542 | 453 | |
| 454 | + | |
| 543 | 455 | def parse_xlsb_part(stream, _, filename): |
| 544 | 456 | """ Excel xlsb files also have a record structure. iter records """ |
| 545 | 457 | for record in XlsbStream(stream, filename).iter_records(): |
| ... | ... | @@ -551,26 +463,5 @@ def parse_xlsb_part(stream, _, filename): |
| 551 | 463 | ############################################################################### |
| 552 | 464 | |
| 553 | 465 | |
| 554 | -def test(*filenames): | |
| 555 | - """ parse all given file names and print rough structure """ | |
| 556 | - logging.basicConfig(level=logging.DEBUG) | |
| 557 | - if not filenames: | |
| 558 | - logging.info('need file name[s]') | |
| 559 | - return 2 | |
| 560 | - for filename in filenames: | |
| 561 | - logging.info('checking file {0}'.format(filename)) | |
| 562 | - if not olefile.isOleFile(filename): | |
| 563 | - logging.info('not an ole file - skip') | |
| 564 | - continue | |
| 565 | - xls = XlsFile(filename) | |
| 566 | - | |
| 567 | - for stream in xls.get_streams(): | |
| 568 | - logging.info(stream) | |
| 569 | - if isinstance(stream, WorkbookStream): | |
| 570 | - for record in stream.iter_records(): | |
| 571 | - logging.info(' {0}'.format(record)) | |
| 572 | - return 0 | |
| 573 | - | |
| 574 | - | |
| 575 | 466 | if __name__ == '__main__': |
| 576 | - sys.exit(test(*sys.argv[1:])) | |
| 467 | + sys.exit(test(sys.argv[1:], XlsFile, WorkbookStream)) | ... | ... |