Commit d397edb5fe4149bf8278b16e63da33db0e6357da

Authored by Christian Herdtweck
1 parent 27dc5360

xls_parser: move code to new record_base for re-use with ppt files

Parsing streams as a sequence of records seems to make sense for xls files;
the next step is to try the same with ppt files. To avoid copy-and-paste, move
the code needed by both parsers into a common base module, record_base.py.
oletools/record_base.py 0 → 100644
  1 +#!/usr/bin/env python
  2 +
  3 +"""
  4 +record_base.py
  5 +
  6 +Common code for ole files whose streams are a sequence of record structures.
  7 +This is the case for xls and ppt, so the classes defined here are the bases
  8 +for xls_parser.py and ppt_parser.py.
  9 +"""
  10 +
  11 +# === LICENSE =================================================================
  12 +#
  13 +# Redistribution and use in source and binary forms, with or without
  14 +# modification, are permitted provided that the following conditions are met:
  15 +#
  16 +# * Redistributions of source code must retain the above copyright notice,
  17 +# this list of conditions and the following disclaimer.
  18 +# * Redistributions in binary form must reproduce the above copyright notice,
  19 +# this list of conditions and the following disclaimer in the documentation
  20 +# and/or other materials provided with the distribution.
  21 +#
  22 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  23 +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24 +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25 +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  26 +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  27 +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  28 +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  29 +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  30 +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  31 +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  32 +# POSSIBILITY OF SUCH DAMAGE.
  33 +
  34 +from __future__ import print_function
  35 +
  36 +#------------------------------------------------------------------------------
  37 +# CHANGELOG:
  38 +# 2017-11-30 v0.01 CH: - first version based on xls_parser
  39 +
  40 +#------------------------------------------------------------------------------
  41 +# TODO:
  42 +# - read DocumentSummaryInformation first to get more info about streams
  43 +# (maybe content type or so; identify streams that are never record-based)
  44 +# - think about integrating this with olefile itself
  45 +
  46 +# -----------------------------------------------------------------------------
  47 +# REFERENCES:
  48 +# - [MS-XLS]: Excel Binary File Format (.xls) Structure Specification
  49 +# https://msdn.microsoft.com/en-us/library/office/cc313154(v=office.14).aspx
  50 +# - Understanding the Excel .xls Binary File Format
  51 +# https://msdn.microsoft.com/en-us/library/office/gg615597(v=office.14).aspx
  52 +# - [MS-PPT]
  53 +
  54 +
  55 +import sys
  56 +import os.path
  57 +from io import SEEK_CUR
  58 +import logging
  59 +
  60 +# little hack to allow absolute imports even if oletools is not installed.
  61 +# Copied from olevba.py
  62 +_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) # pylint: disable=invalid-name
  63 +_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) # pylint: disable=invalid-name
  64 +del _thismodule_dir
  65 +if _parent_dir not in sys.path:
  66 + sys.path.insert(0, _parent_dir)
  67 +del _parent_dir
  68 +
  69 +from oletools.thirdparty import olefile
  70 +
  71 +
  72 +###############################################################################
  73 +# Helpers
  74 +###############################################################################
  75 +
  76 +
# Maps the olefile directory-entry type constants to short readable names;
# OleRecordFile.iter_streams uses it to describe non-stream entries in its
# debug output.
ENTRY_TYPE2STR = dict([
    (olefile.STGTY_EMPTY, 'empty'),
    (olefile.STGTY_STORAGE, 'storage'),
    (olefile.STGTY_STREAM, 'stream'),
    (olefile.STGTY_LOCKBYTES, 'lock-bytes'),
    (olefile.STGTY_PROPERTY, 'property'),
    (olefile.STGTY_ROOT, 'root'),
])
  85 +
  86 +
  87 +###############################################################################
  88 +# Base Classes
  89 +###############################################################################
  90 +
  91 +
class OleRecordFile(olefile.OleFileIO):
    """ an OLE compound file whose streams have (mostly) record structure

    'record structure' meaning that streams are a sequence of records. Records
    are structures with information about type and size in their first bytes
    and type-dependent data of given size after that.

    Subclass of OleFileIO!
    """

    @classmethod
    def stream_class_for_name(cls, stream_name):
        """ helper for iter_streams, must be overwritten in subclasses

        :param stream_name: name of the directory entry that holds the stream
        :returns: the class that iter_streams uses to wrap that stream; this
                  base implementation returns OleRecordStream for every name
        """
        return OleRecordStream    # this is an abstract class!

    def iter_streams(self):
        """ find all streams, including orphans

        Goes through all directory entries of this file. Entries that are
        missing from self.direntries (value None) are loaded explicitly and
        treated as orphans. For every entry of type stream, yields one object
        of the class returned by stream_class_for_name, constructed with the
        opened stream and the entry name (None for orphans).
        """
        logging.debug('Finding streams in ole file')

        for sid, direntry in enumerate(self.direntries):
            is_orphan = direntry is None
            if is_orphan:
                # this direntry is not part of the tree --> unused or orphan
                direntry = self._load_direntry(sid)
            is_stream = direntry.entry_type == olefile.STGTY_STREAM

            if is_stream:
                description = 'is stream of size {}'.format(direntry.size)
            else:
                # use .get(): an entry type that is not in the table must not
                # abort the whole iteration with a KeyError just because a
                # debug message is being built
                type_name = ENTRY_TYPE2STR.get(
                    direntry.entry_type,
                    'unknown type {}'.format(direntry.entry_type))
                description = 'no stream ({})'.format(type_name)
            logging.debug('direntry {:2d} {}: {}'.format(
                sid, '[orphan]' if is_orphan else direntry.name, description))

            if is_stream:
                clz = self.stream_class_for_name(direntry.name)
                yield clz(self._open(direntry.isectStart, direntry.size),
                          None if is_orphan else direntry.name)
  125 +
  126 +
class OleRecordStream(object):
    """ a stream found in an OleRecordFile

    Holds the underlying stream handle together with its name and its size
    (taken from the handle's `size` attribute). The name may be None, which is
    shown as '[orphan]' in the textual representation.

    Abstract base class: subclasses have to implement read_record_head and
    will usually override record_class_for_type.
    """

    def __init__(self, stream, name):
        """ remember stream handle and name; size is read from stream.size """
        self.stream = stream
        self.name = name
        self.size = stream.size

    def read_record_head(self):
        """ read first few bytes of record to determine size and type

        Abstract base method, to be implemented in subclasses.

        returns (rec_type, rec_size, other) where other will be forwarded to
        record constructors
        """
        raise NotImplementedError('Abstract method '
                                  'OleRecordStream.read_record_head called')

    @classmethod
    def record_class_for_type(cls, rec_type):
        """ determine a class for given record type

        Only a base implementation. Create subclasses of OleRecordBase and
        return those when appropriate.

        returns (clz, force_read); if force_read is True, iter_records reads
        the record data even if it was called with fill_data=False
        """
        return OleRecordBase, False

    def iter_records(self, fill_data=False):
        """ yield all records in this stream

        Stream must be positioned at start of records (e.g. start of stream).

        :param bool fill_data: if True, read the data of every record and hand
                               it to the record constructor; otherwise data is
                               skipped (and None is handed over) unless
                               record_class_for_type requests a forced read
        :raises IOError: if data is read and the stream delivers fewer bytes
                         than the record head announced
        """
        while True:
            # unpacking as in olevba._extract_vba
            pos = self.stream.tell()
            if pos >= self.size:
                break

            # read first few bytes, determine record type and size
            rec_type, rec_size, other = self.read_record_head()
            logging.debug('Record type {0} of size {1}'
                          .format(rec_type, rec_size))

            # determine what class to wrap this into
            rec_clz, force_read = self.record_class_for_type(rec_type)

            if fill_data or force_read:
                data = self.stream.read(rec_size)
                if len(data) != rec_size:
                    raise IOError('Not enough data in stream ({0} < {1})'
                                  .format(len(data), rec_size))
            else:
                # data not needed: just jump to the head of the next record
                self.stream.seek(rec_size, SEEK_CUR)
                data = None
            yield rec_clz(rec_type, rec_size, other, pos, data)

    def __str__(self):
        """ short textual representation: class name, stream name and size """
        # closing bracket was missing, leaving the output unbalanced
        return '[{2} {0} (size {1})]' \
            .format(self.name or '[orphan]', self.size,
                    self.__class__.__name__)
  195 +
  196 +
class OleRecordBase(object):
    """ a record found in an OleRecordStream

    Every record carries its type, its size, its position (pos) and its data
    (data may be None).
    """

    # subclasses that represent exactly one record type set this
    TYPE = None

    # subclasses may restrict the record size: upper bound or exact value
    MAX_SIZE = None
    SIZE = None

    def __init__(self, type, size, more_data, pos, data):
        """ create a record

        type and size are checked against the class constants TYPE, SIZE and
        MAX_SIZE (where those are not None); a mismatch raises ValueError.
        more_data is not stored here, it is only handed on to parse().
        """
        expected_type = self.TYPE
        if expected_type is not None and type != expected_type:
            raise ValueError('Wrong subclass {0} for type {1}'
                             .format(self.__class__.__name__, type))
        self.type = type

        exact_size = self.SIZE
        size_limit = self.MAX_SIZE
        if exact_size is not None and size != exact_size:
            raise ValueError('Wrong size {0} for record type {1}'
                             .format(size, type))
        if exact_size is None and size_limit is not None \
                and size > size_limit:
            raise ValueError('Wrong size: {0} > MAX_SIZE for record type {1}'
                             .format(size, type))

        self.size = size
        self.pos = pos
        self.data = data
        self.parse(more_data)

    def parse(self, more_data):
        """ finish constructing this record

        Hook called at the end of __init__. Subclasses can keep more_data
        (the third value returned by OleRecordStream.read_record_head) and/or
        interpret self.data (if it was read).

        This base implementation does nothing.
        """
        return None

    def _type_str(self):
        """ helper for __str__: class name and record type """
        class_name = self.__class__.__name__
        return '{0} type {1}'.format(class_name, self.type)

    def __str__(self):
        """ create a short but informative textual representation of self """
        size_and_pos = ' (size {0} from {1})]'.format(self.size, self.pos)
        return '[' + self._type_str() + size_and_pos
  245 +
  246 +
  247 +###############################################################################
  248 +# TESTING
  249 +###############################################################################
  250 +
  251 +
def test(filenames, ole_file_class=OleRecordFile,
         must_parse=None):
    """ parse all given file names and print rough structure

    Logs every stream found in each ole file and every record found in each
    stream. Files that are not ole files are skipped.

    :param filenames: iterable of names of files to check
    :param ole_file_class: class used to open each file; needs to provide
                           iter_streams() and close()
    :param must_parse: class (or tuple of classes) of streams that have to be
                       parsable: an error while iterating the records of such
                       a stream is re-raised, for other streams it is only
                       logged. If must_parse is empty/None, every error is
                       re-raised.
    :returns: 2 if no file name was given, 0 otherwise
    """
    logging.basicConfig(level=logging.DEBUG)
    if not filenames:
        logging.info('need file name[s]')
        return 2
    for filename in filenames:
        logging.info('checking file {0}'.format(filename))
        if not olefile.isOleFile(filename):
            logging.info('not an ole file - skip')
            continue
        ole = ole_file_class(filename)

        # make sure the file handle is released again, also if parsing fails
        try:
            for stream in ole.iter_streams():
                logging.info(stream)
                try:
                    for record in stream.iter_records():
                        logging.info(' {0}'.format(record))
                except Exception:
                    # only errors in streams that need not parse are tolerated
                    if not must_parse or isinstance(stream, must_parse):
                        raise
                    logging.info(' failed to parse', exc_info=True)
        finally:
            ole.close()
    return 0
  283 +
  284 +
  285 +if __name__ == '__main__':
  286 + sys.exit(test(sys.argv[1:]))
oletools/xls_parser.py
@@ -30,9 +30,11 @@ Read storages, (sub-)streams, records from xls file @@ -30,9 +30,11 @@ Read storages, (sub-)streams, records from xls file
30 30
31 #------------------------------------------------------------------------------ 31 #------------------------------------------------------------------------------
32 # CHANGELOG: 32 # CHANGELOG:
33 -# 2017-11-02 v0.01 CH: - first version 33 +# 2017-11-02 v0.1 CH: - first version
  34 +# 2017-11-02 v0.2 CH: - move some code to record_base.py
  35 +# (to avoid copy-and-paste in ppt_parser.py)
34 36
35 -__version__ = '0.1' 37 +__version__ = '0.2'
36 38
37 # ----------------------------------------------------------------------------- 39 # -----------------------------------------------------------------------------
38 # TODO: 40 # TODO:
@@ -52,17 +54,8 @@ __version__ = &#39;0.1&#39; @@ -52,17 +54,8 @@ __version__ = &#39;0.1&#39;
52 import sys 54 import sys
53 import os.path 55 import os.path
54 from struct import unpack 56 from struct import unpack
55 -from io import SEEK_CUR  
56 import logging 57 import logging
57 -  
58 -# little hack to allow absolute imports even if oletools is not installed.  
59 -# Copied from olevba.py  
60 -_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) # pylint: disable=invalid-name  
61 -_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) # pylint: disable=invalid-name  
62 -if _parent_dir not in sys.path:  
63 - sys.path.insert(0, _parent_dir)  
64 -  
65 -from oletools.thirdparty import olefile 58 +from record_base import OleRecordFile, OleRecordStream, OleRecordBase, test
66 59
67 60
68 # === PYTHON 2+3 SUPPORT ====================================================== 61 # === PYTHON 2+3 SUPPORT ======================================================
@@ -75,16 +68,6 @@ if sys.version_info[0] &gt;= 3: @@ -75,16 +68,6 @@ if sys.version_info[0] &gt;= 3:
75 ############################################################################### 68 ###############################################################################
76 69
77 70
78 -ENTRY_TYPE2STR = {  
79 - olefile.STGTY_EMPTY: 'empty',  
80 - olefile.STGTY_STORAGE: 'storage',  
81 - olefile.STGTY_STREAM: 'stream',  
82 - olefile.STGTY_LOCKBYTES: 'lock-bytes',  
83 - olefile.STGTY_PROPERTY: 'property',  
84 - olefile.STGTY_ROOT: 'root'  
85 -}  
86 -  
87 -  
88 def is_xls(filename): 71 def is_xls(filename):
89 """ 72 """
90 determine whether a given file is an excel ole file 73 determine whether a given file is an excel ole file
@@ -95,7 +78,7 @@ def is_xls(filename): @@ -95,7 +78,7 @@ def is_xls(filename):
95 substream 78 substream
96 """ 79 """
97 try: 80 try:
98 - for stream in XlsFile(filename).get_streams(): 81 + for stream in XlsFile(filename).iter_streams():
99 if isinstance(stream, WorkbookStream): 82 if isinstance(stream, WorkbookStream):
100 return True 83 return True
101 except Exception: 84 except Exception:
@@ -122,7 +105,7 @@ def read_unicode_2byte(data, start_idx, n_chars): @@ -122,7 +105,7 @@ def read_unicode_2byte(data, start_idx, n_chars):
122 unpack('<' + 'H'*n_chars, data[start_idx:end_idx])) 105 unpack('<' + 'H'*n_chars, data[start_idx:end_idx]))
123 else: # slower version but less memory-extensive 106 else: # slower version but less memory-extensive
124 unichars = (unichr(unpack('<H', data[data_idx:data_idx+2])[0]) 107 unichars = (unichr(unpack('<H', data[data_idx:data_idx+2])[0])
125 - for data_idx in xrange(start_idx, end_idx, 2)) 108 + for data_idx in range(start_idx, end_idx, 2))
126 return u''.join(unichars), end_idx 109 return u''.join(unichars), end_idx
127 110
128 111
@@ -130,133 +113,94 @@ def read_unicode_2byte(data, start_idx, n_chars): @@ -130,133 +113,94 @@ def read_unicode_2byte(data, start_idx, n_chars):
130 # File, Storage, Stream 113 # File, Storage, Stream
131 ############################################################################### 114 ###############################################################################
132 115
  116 +class XlsFile(OleRecordFile):
  117 + """ An xls file has most streams made up of records """
133 118
134 -class XlsFile(olefile.OleFileIO):  
135 - """ specialization of an OLE compound file """ 119 + @classmethod
  120 + def stream_class_for_name(self, stream_name):
  121 + """ helper for iter_streams """
  122 + return XlsStream
136 123
137 - def get_streams(self):  
138 - """ find all streams, including orphans """  
139 - logging.debug('Finding streams in ole file')  
140 124
141 - for sid, direntry in enumerate(self.direntries):  
142 - is_orphan = direntry is None  
143 - if is_orphan:  
144 - # this direntry is not part of the tree --> unused or orphan  
145 - direntry = self._load_direntry(sid)  
146 - is_stream = direntry.entry_type == olefile.STGTY_STREAM  
147 - logging.debug('direntry {:2d} {}: {}'.format(  
148 - sid, '[orphan]' if is_orphan else direntry.name,  
149 - 'is stream of size {}'.format(direntry.size) if is_stream else  
150 - 'no stream ({})'.format(ENTRY_TYPE2STR[direntry.entry_type])))  
151 - if is_stream:  
152 - if direntry.name == 'Workbook':  
153 - clz = WorkbookStream  
154 - else:  
155 - clz = XlsStream  
156 - yield clz(self._open(direntry.isectStart, direntry.size),  
157 - None if is_orphan else direntry.name) 125 +class XlsStream(OleRecordStream):
  126 + """ most streams in xls file consist of records """
158 127
  128 + def read_record_head(self):
  129 + """ read first few bytes of record to determine size and type
159 130
160 -class XlsStream(object):  
161 - """ specialization of an OLE stream  
162 -  
163 - Currently not much use, but may be interesting for further sub-classing  
164 - when extending this code.  
165 -  
166 - stream argument can be oleile.OleStream or ooxml.ZipSubFile  
167 - """ 131 + returns (type, size, other) where other is None
  132 + """
  133 + rec_type, rec_size = unpack('<HH', self.stream.read(4))
  134 + return rec_type, rec_size, None
168 135
169 - def __init__(self, stream, name):  
170 - self.stream = stream  
171 - self.size = stream.size  
172 - self.name = name 136 + @classmethod
  137 + def record_class_for_type(cls, type):
  138 + """ determine a class for given record type
173 139
174 - def __str__(self):  
175 - return '[XlsStream {0} (size {1})' \  
176 - .format(self.name or '[orphan]', self.size) 140 + returns (clz, force_read)
  141 + """
  142 + return XlsRecord, False
177 143
178 144
179 class WorkbookStream(XlsStream): 145 class WorkbookStream(XlsStream):
180 - """ the workbook stream which contains records """ 146 + """ Stream in excel file that holds most info """
181 147
182 - def iter_records(self, fill_data=False):  
183 - """ iterate over records in streams 148 + @classmethod
  149 + def record_class_for_type(cls, type):
  150 + """ determine a class for given record type
184 151
185 - Stream must be positioned at start of records (e.g. start of stream). 152 + returns (clz, force_read)
186 """ 153 """
187 - while True:  
188 - # unpacking as in olevba._extract_vba  
189 - pos = self.stream.tell()  
190 - if pos >= self.size:  
191 - break  
192 - type = unpack('<H', self.stream.read(2))[0]  
193 - size = unpack('<H', self.stream.read(2))[0]  
194 - force_read = False  
195 - if type == XlsRecordBof.TYPE:  
196 - clz = XlsRecordBof  
197 - force_read = True  
198 - elif type == XlsRecordEof.TYPE:  
199 - clz = XlsRecordEof  
200 - elif type == XlsRecordSupBook.TYPE:  
201 - clz = XlsRecordSupBook  
202 - force_read = True  
203 - else:  
204 - clz = XlsRecord  
205 - data = None  
206 - if fill_data or force_read:  
207 - data = self.stream.read(size)  
208 - else:  
209 - self.stream.seek(size, SEEK_CUR)  
210 - yield clz(type, size, pos, data)  
211 -  
212 - def __str__(self):  
213 - return '[Workbook Stream (size {0})'.format(self.size)  
214 -  
215 -  
216 -class XlsbStream(XlsStream): 154 + if type == XlsRecordBof.TYPE:
  155 + return XlsRecordBof, True
  156 + elif type == XlsRecordEof.TYPE:
  157 + return XlsRecordEof, False
  158 + elif type == XlsRecordSupBook.TYPE:
  159 + return XlsRecordSupBook, True
  160 + else:
  161 + return XlsRecord, False
  162 +
  163 +
  164 +class XlsbStream(OleRecordStream):
217 """ binary stream of an xlsb file, usually have a record structure """ 165 """ binary stream of an xlsb file, usually have a record structure """
218 166
219 HIGH_BIT_MASK = 0b10000000 167 HIGH_BIT_MASK = 0b10000000
220 LOW7_BIT_MASK = 0b01111111 168 LOW7_BIT_MASK = 0b01111111
221 169
222 - def iter_records(self):  
223 - """ iterate over records in stream 170 + def read_record_head(self):
  171 + """ read first few bytes of record to determine size and type
224 172
225 - Record type and size are encoded differently than in xls streams.  
226 - (c.f. [MS-XLSB, Paragraph 2.1.4: Record) 173 + returns (type, size, other) where other is None
227 """ 174 """
228 - while True:  
229 - pos = self.stream.tell()  
230 - if pos >= self.size:  
231 - break  
232 - val = ord(self.stream.read(1))  
233 - if val & self.HIGH_BIT_MASK: # high bit of the low byte is 1  
234 - val2 = ord(self.stream.read(1)) # need another byte  
235 - # combine 7 low bits of each byte  
236 - type = (val & self.LOW7_BIT_MASK) + \ 175 + val = ord(self.stream.read(1))
  176 + if val & self.HIGH_BIT_MASK: # high bit of the low byte is 1
  177 + val2 = ord(self.stream.read(1)) # need another byte
  178 + # combine 7 low bits of each byte
  179 + rec_type = (val & self.LOW7_BIT_MASK) + \
237 ((val2 & self.LOW7_BIT_MASK) << 7) 180 ((val2 & self.LOW7_BIT_MASK) << 7)
238 - else:  
239 - type = val  
240 -  
241 - size = 0  
242 - shift = 0  
243 - for _ in range(4): # size needs up to 4 byte  
244 - val = ord(self.stream.read(1))  
245 - size += (val & self.LOW7_BIT_MASK) << shift  
246 - shift += 7  
247 - if (val & self.HIGH_BIT_MASK) == 0: # high-bit is 0 --> done  
248 - break  
249 -  
250 - if pos + size > self.size:  
251 - raise ValueError('Stream does not seem to have record '  
252 - 'structure or is incomplete (record size {0})'  
253 - .format(size))  
254 - data = self.stream.read(size)  
255 -  
256 - clz = XlsbRecord  
257 - if type == XlsbBeginSupBook.TYPE:  
258 - clz = XlsbBeginSupBook  
259 - yield clz(type, size, pos, data) 181 + else:
  182 + rec_type = val
  183 +
  184 + rec_size = 0
  185 + shift = 0
  186 + for _ in range(4): # rec_size needs up to 4 byte
  187 + val = ord(self.stream.read(1))
  188 + rec_size += (val & self.LOW7_BIT_MASK) << shift
  189 + shift += 7
  190 + if (val & self.HIGH_BIT_MASK) == 0: # high-bit is 0 --> done
  191 + break
  192 + return rec_type, rec_size, None
  193 +
  194 + @classmethod
  195 + def record_class_for_type(cls, type):
  196 + """ determine a class for given record type
  197 +
  198 + returns (clz, force_read)
  199 + """
  200 + if type == XlsbBeginSupBook.TYPE:
  201 + return XlsbBeginSupBook, True
  202 + else:
  203 + return XlsbRecord, False
260 204
261 205
262 ############################################################################### 206 ###############################################################################
@@ -309,7 +253,6 @@ FREQUENT_RECORDS = dict([ @@ -309,7 +253,6 @@ FREQUENT_RECORDS = dict([
309 253
310 #: records found in xlsb binary parts 254 #: records found in xlsb binary parts
311 FREQUENT_RECORDS_XLSB = dict([ 255 FREQUENT_RECORDS_XLSB = dict([
312 - (360, 'BrtBeginSupBook'),  
313 (588, 'BrtEndSupBook'), 256 (588, 'BrtEndSupBook'),
314 (667, 'BrtSupAddin'), 257 (667, 'BrtSupAddin'),
315 (355, 'BrtSupBookSrc'), 258 (355, 'BrtSupBookSrc'),
@@ -330,36 +273,12 @@ FREQUENT_RECORDS_XLSB = dict([ @@ -330,36 +273,12 @@ FREQUENT_RECORDS_XLSB = dict([
330 ]) 273 ])
331 274
332 275
333 -class XlsRecord(object): 276 +class XlsRecord(OleRecordBase):
334 """ basic building block of data in workbook stream """ 277 """ basic building block of data in workbook stream """
335 278
336 #: max size of a record in xls stream (does not apply to xlsb) 279 #: max size of a record in xls stream (does not apply to xlsb)
337 MAX_SIZE = 8224 280 MAX_SIZE = 8224
338 281
339 - # to be overwritten in subclasses that have fixed type/size  
340 - TYPE = None  
341 - SIZE = None  
342 -  
343 - def __init__(self, type, size, pos, data=None):  
344 - """ create a record """  
345 - self.type = type  
346 - if self.MAX_SIZE is not None and size > self.MAX_SIZE:  
347 - logging.warning('record size {0} exceeds max size'  
348 - .format(size))  
349 - elif self.SIZE is not None and size != self.SIZE:  
350 - raise ValueError('size {0} is not as expected for this type'  
351 - .format(size))  
352 - self.size = size  
353 - self.pos = pos  
354 - self.data = data  
355 - if data is not None and len(data) != size:  
356 - raise ValueError('data size {0} is not expected size {1}'  
357 - .format(len(data), size))  
358 -  
359 - def read_data(self, stream):  
360 - """ read data from stream if up to now only pos was known """  
361 - raise NotImplementedError()  
362 -  
363 def _type_str(self): 282 def _type_str(self):
364 """ simplification for subclasses to create their own __str__ """ 283 """ simplification for subclasses to create their own __str__ """
365 try: 284 try:
@@ -367,10 +286,6 @@ class XlsRecord(object): @@ -367,10 +286,6 @@ class XlsRecord(object):
367 except KeyError: 286 except KeyError:
368 return 'XlsRecord type {0}'.format(self.type) 287 return 'XlsRecord type {0}'.format(self.type)
369 288
370 - def __str__(self):  
371 - return '[' + self._type_str() + \  
372 - ' (size {0} from {1})]'.format(self.size, self.pos)  
373 -  
374 289
375 class XlsRecordBof(XlsRecord): 290 class XlsRecordBof(XlsRecord):
376 """ record found at beginning of substreams """ 291 """ record found at beginning of substreams """
@@ -380,8 +295,7 @@ class XlsRecordBof(XlsRecord): @@ -380,8 +295,7 @@ class XlsRecordBof(XlsRecord):
380 DOCTYPES = dict([(0x5, 'workbook'), (0x10, 'dialog/worksheet'), 295 DOCTYPES = dict([(0x5, 'workbook'), (0x10, 'dialog/worksheet'),
381 (0x20, 'chart'), (0x40, 'macro')]) 296 (0x20, 'chart'), (0x40, 'macro')])
382 297
383 - def __init__(self, *args, **kwargs):  
384 - super(XlsRecordBof, self).__init__(*args, **kwargs) 298 + def parse(self, _):
385 if self.data is None: 299 if self.data is None:
386 self.doctype = None 300 self.doctype = None
387 return 301 return
@@ -420,9 +334,7 @@ class XlsRecordSupBook(XlsRecord): @@ -420,9 +334,7 @@ class XlsRecordSupBook(XlsRecord):
420 LINK_TYPE_OLE_DDE = 'ole/dde data source' 334 LINK_TYPE_OLE_DDE = 'ole/dde data source'
421 LINK_TYPE_EXTERNAL = 'external workbook' 335 LINK_TYPE_EXTERNAL = 'external workbook'
422 336
423 - def __init__(self, *args, **kwargs):  
424 - super(XlsRecordSupBook, self).__init__(*args, **kwargs)  
425 - 337 + def parse(self, _):
426 # set defaults 338 # set defaults
427 self.ctab = None 339 self.ctab = None
428 self.cch = None 340 self.cch = None
@@ -461,7 +373,7 @@ class XlsRecordSupBook(XlsRecord): @@ -461,7 +373,7 @@ class XlsRecordSupBook(XlsRecord):
461 return 'SupBook Record ({0})'.format(self.support_link_type) 373 return 'SupBook Record ({0})'.format(self.support_link_type)
462 374
463 375
464 -class XlsbRecord(XlsRecord): 376 +class XlsbRecord(OleRecordBase):
465 """ like an xls record, but from binary part of xlsb file 377 """ like an xls record, but from binary part of xlsb file
466 378
467 has no MAX_SIZE and types have different meanings 379 has no MAX_SIZE and types have different meanings
@@ -491,8 +403,7 @@ class XlsbBeginSupBook(XlsbRecord): @@ -491,8 +403,7 @@ class XlsbBeginSupBook(XlsbRecord):
491 LINK_TYPE_UNEXPECTED = 'unexpected' 403 LINK_TYPE_UNEXPECTED = 'unexpected'
492 LINK_TYPE_UNKNOWN = 'unknown' 404 LINK_TYPE_UNKNOWN = 'unknown'
493 405
494 - def __init__(self, *args, **kwargs):  
495 - super(XlsbBeginSupBook, self).__init__(*args, **kwargs) 406 + def parse(self, _):
496 self.link_type = self.LINK_TYPE_UNKNOWN 407 self.link_type = self.LINK_TYPE_UNKNOWN
497 self.string1 = '' 408 self.string1 = ''
498 self.string2 = '' 409 self.string2 = ''
@@ -540,6 +451,7 @@ class XlsbBeginSupBook(XlsbRecord): @@ -540,6 +451,7 @@ class XlsbBeginSupBook(XlsbRecord):
540 # XLSB Binary Parts 451 # XLSB Binary Parts
541 ############################################################################### 452 ###############################################################################
542 453
  454 +
543 def parse_xlsb_part(stream, _, filename): 455 def parse_xlsb_part(stream, _, filename):
544 """ Excel xlsb files also have a record structure. iter records """ 456 """ Excel xlsb files also have a record structure. iter records """
545 for record in XlsbStream(stream, filename).iter_records(): 457 for record in XlsbStream(stream, filename).iter_records():
@@ -551,26 +463,5 @@ def parse_xlsb_part(stream, _, filename): @@ -551,26 +463,5 @@ def parse_xlsb_part(stream, _, filename):
551 ############################################################################### 463 ###############################################################################
552 464
553 465
554 -def test(*filenames):  
555 - """ parse all given file names and print rough structure """  
556 - logging.basicConfig(level=logging.DEBUG)  
557 - if not filenames:  
558 - logging.info('need file name[s]')  
559 - return 2  
560 - for filename in filenames:  
561 - logging.info('checking file {0}'.format(filename))  
562 - if not olefile.isOleFile(filename):  
563 - logging.info('not an ole file - skip')  
564 - continue  
565 - xls = XlsFile(filename)  
566 -  
567 - for stream in xls.get_streams():  
568 - logging.info(stream)  
569 - if isinstance(stream, WorkbookStream):  
570 - for record in stream.iter_records():  
571 - logging.info(' {0}'.format(record))  
572 - return 0  
573 -  
574 -  
575 if __name__ == '__main__': 466 if __name__ == '__main__':
576 - sys.exit(test(*sys.argv[1:])) 467 + sys.exit(test(sys.argv[1:], XlsFile, WorkbookStream))