Commit d397edb5fe4149bf8278b16e63da33db0e6357da

Authored by Christian Herdtweck
1 parent 27dc5360

xls_parser: move code to new record_base for re-use with ppt files

Parsing streams record by record seems to make sense; the next step is to try
the same with ppt files. To avoid copy-and-paste, move the code used by both
parsers into a common base module, record_base.py
oletools/record_base.py 0 → 100644
  1 +#!/usr/bin/env python
  2 +
  3 +"""
  4 +record_base.py
  5 +
  6 +Common stuff for ole files whose streams are a sequence of record structures.
  7 +This is the case for xls and ppt, so classes are bases for xls_parser.py and
  8 +ppt_parser.py .
  9 +"""
  10 +
  11 +# === LICENSE =================================================================
  12 +#
  13 +# Redistribution and use in source and binary forms, with or without
  14 +# modification, are permitted provided that the following conditions are met:
  15 +#
  16 +# * Redistributions of source code must retain the above copyright notice,
  17 +# this list of conditions and the following disclaimer.
  18 +# * Redistributions in binary form must reproduce the above copyright notice,
  19 +# this list of conditions and the following disclaimer in the documentation
  20 +# and/or other materials provided with the distribution.
  21 +#
  22 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  23 +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24 +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25 +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  26 +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  27 +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  28 +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  29 +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  30 +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  31 +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  32 +# POSSIBILITY OF SUCH DAMAGE.
  33 +
  34 +from __future__ import print_function
  35 +
  36 +#------------------------------------------------------------------------------
  37 +# CHANGELOG:
  38 +# 2017-11-30 v0.01 CH: - first version based on xls_parser
  39 +
  40 +#------------------------------------------------------------------------------
  41 +# TODO:
  42 +# - read DocumentSummaryInformation first to get more info about streams
  43 +# (maybe content type or so; identify streams that are never record-based)
  44 +# - think about integrating this with olefile itself
  45 +
  46 +# -----------------------------------------------------------------------------
  47 +# REFERENCES:
  48 +# - [MS-XLS]: Excel Binary File Format (.xls) Structure Specification
  49 +# https://msdn.microsoft.com/en-us/library/office/cc313154(v=office.14).aspx
  50 +# - Understanding the Excel .xls Binary File Format
  51 +# https://msdn.microsoft.com/en-us/library/office/gg615597(v=office.14).aspx
  52 +# - [MS-PPT]
  53 +
  54 +
  55 +import sys
  56 +import os.path
  57 +from io import SEEK_CUR
  58 +import logging
  59 +
  60 +# little hack to allow absolute imports even if oletools is not installed.
  61 +# Copied from olevba.py
  62 +_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) # pylint: disable=invalid-name
  63 +_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) # pylint: disable=invalid-name
  64 +del _thismodule_dir
  65 +if _parent_dir not in sys.path:
  66 + sys.path.insert(0, _parent_dir)
  67 +del _parent_dir
  68 +
  69 +from oletools.thirdparty import olefile
  70 +
  71 +
  72 +###############################################################################
  73 +# Helpers
  74 +###############################################################################
  75 +
  76 +
# Human-readable labels for olefile's directory entry type constants;
# used by OleRecordFile.iter_streams when logging entries that are not streams.
ENTRY_TYPE2STR = dict((
    (olefile.STGTY_EMPTY, 'empty'),
    (olefile.STGTY_STORAGE, 'storage'),
    (olefile.STGTY_STREAM, 'stream'),
    (olefile.STGTY_LOCKBYTES, 'lock-bytes'),
    (olefile.STGTY_PROPERTY, 'property'),
    (olefile.STGTY_ROOT, 'root'),
))
  85 +
  86 +
  87 +###############################################################################
  88 +# Base Classes
  89 +###############################################################################
  90 +
  91 +
class OleRecordFile(olefile.OleFileIO):
    """ an OLE compound file whose streams have (mostly) record structure

    'record structure' meaning that streams are a sequence of records. Records
    are structures with information about type and size in their first bytes
    and type-dependent data of given size after that.

    Subclass of OleFileIO!
    """

    @classmethod
    def stream_class_for_name(cls, stream_name):
        """ helper for iter_streams, must be overwritten in subclasses

        This base implementation ignores stream_name and always returns the
        abstract OleRecordStream.
        """
        return OleRecordStream      # this is an abstract class!

    def iter_streams(self):
        """ find all streams, including orphans

        Goes through all directory entries of this file. Entries that are None
        in self.direntries are loaded explicitly and treated as orphans.

        Yields one object per entry of type stream; its class is determined by
        stream_class_for_name. Orphan streams get None as their name.
        """
        logging.debug('Finding streams in ole file')

        for sid, direntry in enumerate(self.direntries):
            is_orphan = direntry is None
            if is_orphan:
                # this direntry is not part of the tree --> unused or orphan
                direntry = self._load_direntry(sid)
            is_stream = direntry.entry_type == olefile.STGTY_STREAM

            if is_stream:
                description = 'is stream of size {}'.format(direntry.size)
            else:
                # The entry type is read from the file, so it may be a value
                # that ENTRY_TYPE2STR does not list. Fall back to a generic
                # label instead of failing with KeyError while only logging.
                type_str = ENTRY_TYPE2STR.get(
                    direntry.entry_type,
                    'unknown type {}'.format(direntry.entry_type))
                description = 'no stream ({})'.format(type_str)
            logging.debug('direntry {:2d} {}: {}'.format(
                sid, '[orphan]' if is_orphan else direntry.name, description))

            if is_stream:
                clz = self.stream_class_for_name(direntry.name)
                yield clz(self._open(direntry.isectStart, direntry.size),
                          None if is_orphan else direntry.name)
  125 +
  126 +
class OleRecordStream(object):
    """ a stream found in an OleRecordFile

    Always has a name and a size (both read-only). Has an OleFileStream handle.

    abstract base class: subclasses have to implement read_record_head and
    will usually override record_class_for_type.
    """

    def __init__(self, stream, name):
        """ remember stream handle and name; size is taken from stream.size

        name may be None (OleRecordFile.iter_streams does that for orphans)
        """
        self.stream = stream
        self.name = name
        self.size = stream.size

    def read_record_head(self):
        """ read first few bytes of record to determine size and type

        Abstract base method, to be implemented in subclasses.

        returns (rec_type, rec_size, other) where other will be forwarded to
        record constructors
        """
        raise NotImplementedError('Abstract method '
                                  'OleRecordStream.read_record_head called')

    @classmethod
    def record_class_for_type(cls, rec_type):
        """ determine a class for given record type

        Only a base implementation. Create subclasses of OleRecordBase and
        return those when appropriate.

        returns (clz, force_read); force_read=True makes iter_records read the
        record data even if it was called with fill_data=False
        """
        return OleRecordBase, False

    def iter_records(self, fill_data=False):
        """ yield all records in this stream

        Stream must be positioned at start of records (e.g. start of stream).

        If fill_data is True (or the record class requests it), record data is
        read from the stream and given to the record; otherwise the data is
        skipped and the record is created with data=None.

        raises IOError if data is read and the stream ends before the record
        """
        while True:
            # unpacking as in olevba._extract_vba
            pos = self.stream.tell()
            if pos >= self.size:
                break

            # read first few bytes, determine record type and size
            rec_type, rec_size, other = self.read_record_head()
            logging.debug('Record type {0} of size {1}'
                          .format(rec_type, rec_size))

            # determine what class to wrap this into
            rec_clz, force_read = self.record_class_for_type(rec_type)

            if fill_data or force_read:
                data = self.stream.read(rec_size)
                if len(data) != rec_size:
                    raise IOError('Not enough data in stream ({0} < {1})'
                                  .format(len(data), rec_size))
            else:
                # data not needed: just jump to the start of the next record
                self.stream.seek(rec_size, SEEK_CUR)
                data = None
            yield rec_clz(rec_type, rec_size, other, pos, data)

    def __str__(self):
        """ short textual representation: class name, stream name and size """
        # closing bracket was missing; now matches OleRecordBase.__str__
        return '[{2} {0} (size {1})]' \
            .format(self.name or '[orphan]', self.size,
                    self.__class__.__name__)
  195 +
  196 +
class OleRecordBase(object):
    """ a record found in an OleRecordStream

    Every record has a type, a size, a position (pos) and data; data is None
    if it was not read from the stream.
    """

    # set in subclasses that only represent one fixed record type
    TYPE = None

    # set in subclasses whose records have a size limit / a fixed size
    MAX_SIZE = None
    SIZE = None

    def __init__(self, type, size, more_data, pos, data):
        """ create a record; more_data is only forwarded to parse()

        raises ValueError if type or size do not fit the TYPE, SIZE or
        MAX_SIZE of this class
        """
        wrong_type = self.TYPE is not None and type != self.TYPE
        if wrong_type:
            raise ValueError('Wrong subclass {0} for type {1}'
                             .format(self.__class__.__name__, type))
        self.type = type

        # both checks raise, so two separate guards behave like if/elif
        if self.SIZE is not None and size != self.SIZE:
            raise ValueError('Wrong size {0} for record type {1}'
                             .format(size, type))
        if self.MAX_SIZE is not None and size > self.MAX_SIZE:
            raise ValueError('Wrong size: {0} > MAX_SIZE for record type {1}'
                             .format(size, type))

        self.size = size
        self.pos = pos
        self.data = data
        self.parse(more_data)

    def parse(self, more_data):
        """ hook called at the end of the constructor

        Subclasses can override this to keep more_data (the third value
        returned by OleRecordStream.read_record_head) and/or to interpret
        self.data (if it was read).

        This base implementation does nothing.
        """
        pass

    def _type_str(self):
        """ helper for __str__: class name and record type """
        class_name = self.__class__.__name__
        return '{0} type {1}'.format(class_name, self.type)

    def __str__(self):
        """ short but informative text: type info, size and position """
        return '[{0} (size {1} from {2})]'.format(self._type_str(), self.size,
                                                  self.pos)
  245 +
  246 +
  247 +###############################################################################
  248 +# TESTING
  249 +###############################################################################
  250 +
  251 +
def test(filenames, ole_file_class=OleRecordFile,
         must_parse=None):
    """ parse all given file names and print rough structure

    filenames: iterable of names of files to check; files that are not ole
               files are skipped
    ole_file_class: class used to open each file (an OleRecordFile subclass)
    must_parse: stream class (or tuple of classes) whose streams must parse

    If an error occurs while parsing a stream that is an instance of
    must_parse, the error will be raised. For other streams only a message is
    logged. If must_parse is empty/None, every parsing error is raised.

    returns 2 if no file names were given, 0 otherwise
    """
    logging.basicConfig(level=logging.DEBUG)
    if not filenames:
        logging.info('need file name[s]')
        return 2
    for filename in filenames:
        logging.info('checking file {0}'.format(filename))
        if not olefile.isOleFile(filename):
            logging.info('not an ole file - skip')
            continue
        ole = ole_file_class(filename)

        # make sure the file is closed again, even if parsing raises
        try:
            for stream in ole.iter_streams():
                logging.info(stream)
                try:
                    for record in stream.iter_records():
                        logging.info('  {0}'.format(record))
                except Exception:
                    if not must_parse or isinstance(stream, must_parse):
                        raise
                    logging.info('  failed to parse', exc_info=True)
        finally:
            ole.close()
    return 0
  283 +
  284 +
# when run as a script: treat all command line arguments as file names
if __name__ == '__main__':
    exit_code = test(sys.argv[1:])
    sys.exit(exit_code)
... ...
oletools/xls_parser.py
... ... @@ -30,9 +30,11 @@ Read storages, (sub-)streams, records from xls file
30 30  
31 31 #------------------------------------------------------------------------------
32 32 # CHANGELOG:
33   -# 2017-11-02 v0.01 CH: - first version
  33 +# 2017-11-02 v0.1 CH: - first version
  34 +# 2017-11-30 v0.2 CH: - move some code to record_base.py
  35 +# (to avoid copy-and-paste in ppt_parser.py)
34 36  
35   -__version__ = '0.1'
  37 +__version__ = '0.2'
36 38  
37 39 # -----------------------------------------------------------------------------
38 40 # TODO:
... ... @@ -52,17 +54,8 @@ __version__ = &#39;0.1&#39;
52 54 import sys
53 55 import os.path
54 56 from struct import unpack
55   -from io import SEEK_CUR
56 57 import logging
57   -
58   -# little hack to allow absolute imports even if oletools is not installed.
59   -# Copied from olevba.py
60   -_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) # pylint: disable=invalid-name
61   -_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) # pylint: disable=invalid-name
62   -if _parent_dir not in sys.path:
63   - sys.path.insert(0, _parent_dir)
64   -
65   -from oletools.thirdparty import olefile
  58 +from record_base import OleRecordFile, OleRecordStream, OleRecordBase, test
66 59  
67 60  
68 61 # === PYTHON 2+3 SUPPORT ======================================================
... ... @@ -75,16 +68,6 @@ if sys.version_info[0] &gt;= 3:
75 68 ###############################################################################
76 69  
77 70  
78   -ENTRY_TYPE2STR = {
79   - olefile.STGTY_EMPTY: 'empty',
80   - olefile.STGTY_STORAGE: 'storage',
81   - olefile.STGTY_STREAM: 'stream',
82   - olefile.STGTY_LOCKBYTES: 'lock-bytes',
83   - olefile.STGTY_PROPERTY: 'property',
84   - olefile.STGTY_ROOT: 'root'
85   -}
86   -
87   -
88 71 def is_xls(filename):
89 72 """
90 73 determine whether a given file is an excel ole file
... ... @@ -95,7 +78,7 @@ def is_xls(filename):
95 78 substream
96 79 """
97 80 try:
98   - for stream in XlsFile(filename).get_streams():
  81 + for stream in XlsFile(filename).iter_streams():
99 82 if isinstance(stream, WorkbookStream):
100 83 return True
101 84 except Exception:
... ... @@ -122,7 +105,7 @@ def read_unicode_2byte(data, start_idx, n_chars):
122 105 unpack('<' + 'H'*n_chars, data[start_idx:end_idx]))
123 106 else: # slower version but less memory-extensive
124 107 unichars = (unichr(unpack('<H', data[data_idx:data_idx+2])[0])
125   - for data_idx in xrange(start_idx, end_idx, 2))
  108 + for data_idx in range(start_idx, end_idx, 2))
126 109 return u''.join(unichars), end_idx
127 110  
128 111  
... ... @@ -130,133 +113,94 @@ def read_unicode_2byte(data, start_idx, n_chars):
130 113 # File, Storage, Stream
131 114 ###############################################################################
132 115  
  116 +class XlsFile(OleRecordFile):
  117 + """ An xls file has most streams made up of records """
133 118  
134   -class XlsFile(olefile.OleFileIO):
135   - """ specialization of an OLE compound file """
  119 + @classmethod
  120 + def stream_class_for_name(self, stream_name):
  121 + """ helper for iter_streams """
  122 + return XlsStream
136 123  
137   - def get_streams(self):
138   - """ find all streams, including orphans """
139   - logging.debug('Finding streams in ole file')
140 124  
141   - for sid, direntry in enumerate(self.direntries):
142   - is_orphan = direntry is None
143   - if is_orphan:
144   - # this direntry is not part of the tree --> unused or orphan
145   - direntry = self._load_direntry(sid)
146   - is_stream = direntry.entry_type == olefile.STGTY_STREAM
147   - logging.debug('direntry {:2d} {}: {}'.format(
148   - sid, '[orphan]' if is_orphan else direntry.name,
149   - 'is stream of size {}'.format(direntry.size) if is_stream else
150   - 'no stream ({})'.format(ENTRY_TYPE2STR[direntry.entry_type])))
151   - if is_stream:
152   - if direntry.name == 'Workbook':
153   - clz = WorkbookStream
154   - else:
155   - clz = XlsStream
156   - yield clz(self._open(direntry.isectStart, direntry.size),
157   - None if is_orphan else direntry.name)
  125 +class XlsStream(OleRecordStream):
  126 + """ most streams in xls file consist of records """
158 127  
  128 + def read_record_head(self):
  129 + """ read first few bytes of record to determine size and type
159 130  
160   -class XlsStream(object):
161   - """ specialization of an OLE stream
162   -
163   - Currently not much use, but may be interesting for further sub-classing
164   - when extending this code.
165   -
166   - stream argument can be oleile.OleStream or ooxml.ZipSubFile
167   - """
  131 + returns (type, size, other) where other is None
  132 + """
  133 + rec_type, rec_size = unpack('<HH', self.stream.read(4))
  134 + return rec_type, rec_size, None
168 135  
169   - def __init__(self, stream, name):
170   - self.stream = stream
171   - self.size = stream.size
172   - self.name = name
  136 + @classmethod
  137 + def record_class_for_type(cls, type):
  138 + """ determine a class for given record type
173 139  
174   - def __str__(self):
175   - return '[XlsStream {0} (size {1})' \
176   - .format(self.name or '[orphan]', self.size)
  140 + returns (clz, force_read)
  141 + """
  142 + return XlsRecord, False
177 143  
178 144  
179 145 class WorkbookStream(XlsStream):
180   - """ the workbook stream which contains records """
  146 + """ Stream in excel file that holds most info """
181 147  
182   - def iter_records(self, fill_data=False):
183   - """ iterate over records in streams
  148 + @classmethod
  149 + def record_class_for_type(cls, type):
  150 + """ determine a class for given record type
184 151  
185   - Stream must be positioned at start of records (e.g. start of stream).
  152 + returns (clz, force_read)
186 153 """
187   - while True:
188   - # unpacking as in olevba._extract_vba
189   - pos = self.stream.tell()
190   - if pos >= self.size:
191   - break
192   - type = unpack('<H', self.stream.read(2))[0]
193   - size = unpack('<H', self.stream.read(2))[0]
194   - force_read = False
195   - if type == XlsRecordBof.TYPE:
196   - clz = XlsRecordBof
197   - force_read = True
198   - elif type == XlsRecordEof.TYPE:
199   - clz = XlsRecordEof
200   - elif type == XlsRecordSupBook.TYPE:
201   - clz = XlsRecordSupBook
202   - force_read = True
203   - else:
204   - clz = XlsRecord
205   - data = None
206   - if fill_data or force_read:
207   - data = self.stream.read(size)
208   - else:
209   - self.stream.seek(size, SEEK_CUR)
210   - yield clz(type, size, pos, data)
211   -
212   - def __str__(self):
213   - return '[Workbook Stream (size {0})'.format(self.size)
214   -
215   -
216   -class XlsbStream(XlsStream):
  154 + if type == XlsRecordBof.TYPE:
  155 + return XlsRecordBof, True
  156 + elif type == XlsRecordEof.TYPE:
  157 + return XlsRecordEof, False
  158 + elif type == XlsRecordSupBook.TYPE:
  159 + return XlsRecordSupBook, True
  160 + else:
  161 + return XlsRecord, False
  162 +
  163 +
  164 +class XlsbStream(OleRecordStream):
217 165 """ binary stream of an xlsb file, usually have a record structure """
218 166  
219 167 HIGH_BIT_MASK = 0b10000000
220 168 LOW7_BIT_MASK = 0b01111111
221 169  
222   - def iter_records(self):
223   - """ iterate over records in stream
  170 + def read_record_head(self):
  171 + """ read first few bytes of record to determine size and type
224 172  
225   - Record type and size are encoded differently than in xls streams.
226   - (c.f. [MS-XLSB, Paragraph 2.1.4: Record)
  173 + returns (type, size, other) where other is None
227 174 """
228   - while True:
229   - pos = self.stream.tell()
230   - if pos >= self.size:
231   - break
232   - val = ord(self.stream.read(1))
233   - if val & self.HIGH_BIT_MASK: # high bit of the low byte is 1
234   - val2 = ord(self.stream.read(1)) # need another byte
235   - # combine 7 low bits of each byte
236   - type = (val & self.LOW7_BIT_MASK) + \
  175 + val = ord(self.stream.read(1))
  176 + if val & self.HIGH_BIT_MASK: # high bit of the low byte is 1
  177 + val2 = ord(self.stream.read(1)) # need another byte
  178 + # combine 7 low bits of each byte
  179 + rec_type = (val & self.LOW7_BIT_MASK) + \
237 180 ((val2 & self.LOW7_BIT_MASK) << 7)
238   - else:
239   - type = val
240   -
241   - size = 0
242   - shift = 0
243   - for _ in range(4): # size needs up to 4 byte
244   - val = ord(self.stream.read(1))
245   - size += (val & self.LOW7_BIT_MASK) << shift
246   - shift += 7
247   - if (val & self.HIGH_BIT_MASK) == 0: # high-bit is 0 --> done
248   - break
249   -
250   - if pos + size > self.size:
251   - raise ValueError('Stream does not seem to have record '
252   - 'structure or is incomplete (record size {0})'
253   - .format(size))
254   - data = self.stream.read(size)
255   -
256   - clz = XlsbRecord
257   - if type == XlsbBeginSupBook.TYPE:
258   - clz = XlsbBeginSupBook
259   - yield clz(type, size, pos, data)
  181 + else:
  182 + rec_type = val
  183 +
  184 + rec_size = 0
  185 + shift = 0
  186 + for _ in range(4): # rec_size needs up to 4 byte
  187 + val = ord(self.stream.read(1))
  188 + rec_size += (val & self.LOW7_BIT_MASK) << shift
  189 + shift += 7
  190 + if (val & self.HIGH_BIT_MASK) == 0: # high-bit is 0 --> done
  191 + break
  192 + return rec_type, rec_size, None
  193 +
  194 + @classmethod
  195 + def record_class_for_type(cls, type):
  196 + """ determine a class for given record type
  197 +
  198 + returns (clz, force_read)
  199 + """
  200 + if type == XlsbBeginSupBook.TYPE:
  201 + return XlsbBeginSupBook, True
  202 + else:
  203 + return XlsbRecord, False
260 204  
261 205  
262 206 ###############################################################################
... ... @@ -309,7 +253,6 @@ FREQUENT_RECORDS = dict([
309 253  
310 254 #: records found in xlsb binary parts
311 255 FREQUENT_RECORDS_XLSB = dict([
312   - (360, 'BrtBeginSupBook'),
313 256 (588, 'BrtEndSupBook'),
314 257 (667, 'BrtSupAddin'),
315 258 (355, 'BrtSupBookSrc'),
... ... @@ -330,36 +273,12 @@ FREQUENT_RECORDS_XLSB = dict([
330 273 ])
331 274  
332 275  
333   -class XlsRecord(object):
  276 +class XlsRecord(OleRecordBase):
334 277 """ basic building block of data in workbook stream """
335 278  
336 279 #: max size of a record in xls stream (does not apply to xlsb)
337 280 MAX_SIZE = 8224
338 281  
339   - # to be overwritten in subclasses that have fixed type/size
340   - TYPE = None
341   - SIZE = None
342   -
343   - def __init__(self, type, size, pos, data=None):
344   - """ create a record """
345   - self.type = type
346   - if self.MAX_SIZE is not None and size > self.MAX_SIZE:
347   - logging.warning('record size {0} exceeds max size'
348   - .format(size))
349   - elif self.SIZE is not None and size != self.SIZE:
350   - raise ValueError('size {0} is not as expected for this type'
351   - .format(size))
352   - self.size = size
353   - self.pos = pos
354   - self.data = data
355   - if data is not None and len(data) != size:
356   - raise ValueError('data size {0} is not expected size {1}'
357   - .format(len(data), size))
358   -
359   - def read_data(self, stream):
360   - """ read data from stream if up to now only pos was known """
361   - raise NotImplementedError()
362   -
363 282 def _type_str(self):
364 283 """ simplification for subclasses to create their own __str__ """
365 284 try:
... ... @@ -367,10 +286,6 @@ class XlsRecord(object):
367 286 except KeyError:
368 287 return 'XlsRecord type {0}'.format(self.type)
369 288  
370   - def __str__(self):
371   - return '[' + self._type_str() + \
372   - ' (size {0} from {1})]'.format(self.size, self.pos)
373   -
374 289  
375 290 class XlsRecordBof(XlsRecord):
376 291 """ record found at beginning of substreams """
... ... @@ -380,8 +295,7 @@ class XlsRecordBof(XlsRecord):
380 295 DOCTYPES = dict([(0x5, 'workbook'), (0x10, 'dialog/worksheet'),
381 296 (0x20, 'chart'), (0x40, 'macro')])
382 297  
383   - def __init__(self, *args, **kwargs):
384   - super(XlsRecordBof, self).__init__(*args, **kwargs)
  298 + def parse(self, _):
385 299 if self.data is None:
386 300 self.doctype = None
387 301 return
... ... @@ -420,9 +334,7 @@ class XlsRecordSupBook(XlsRecord):
420 334 LINK_TYPE_OLE_DDE = 'ole/dde data source'
421 335 LINK_TYPE_EXTERNAL = 'external workbook'
422 336  
423   - def __init__(self, *args, **kwargs):
424   - super(XlsRecordSupBook, self).__init__(*args, **kwargs)
425   -
  337 + def parse(self, _):
426 338 # set defaults
427 339 self.ctab = None
428 340 self.cch = None
... ... @@ -461,7 +373,7 @@ class XlsRecordSupBook(XlsRecord):
461 373 return 'SupBook Record ({0})'.format(self.support_link_type)
462 374  
463 375  
464   -class XlsbRecord(XlsRecord):
  376 +class XlsbRecord(OleRecordBase):
465 377 """ like an xls record, but from binary part of xlsb file
466 378  
467 379 has no MAX_SIZE and types have different meanings
... ... @@ -491,8 +403,7 @@ class XlsbBeginSupBook(XlsbRecord):
491 403 LINK_TYPE_UNEXPECTED = 'unexpected'
492 404 LINK_TYPE_UNKNOWN = 'unknown'
493 405  
494   - def __init__(self, *args, **kwargs):
495   - super(XlsbBeginSupBook, self).__init__(*args, **kwargs)
  406 + def parse(self, _):
496 407 self.link_type = self.LINK_TYPE_UNKNOWN
497 408 self.string1 = ''
498 409 self.string2 = ''
... ... @@ -540,6 +451,7 @@ class XlsbBeginSupBook(XlsbRecord):
540 451 # XLSB Binary Parts
541 452 ###############################################################################
542 453  
  454 +
543 455 def parse_xlsb_part(stream, _, filename):
544 456 """ Excel xlsb files also have a record structure. iter records """
545 457 for record in XlsbStream(stream, filename).iter_records():
... ... @@ -551,26 +463,5 @@ def parse_xlsb_part(stream, _, filename):
551 463 ###############################################################################
552 464  
553 465  
554   -def test(*filenames):
555   - """ parse all given file names and print rough structure """
556   - logging.basicConfig(level=logging.DEBUG)
557   - if not filenames:
558   - logging.info('need file name[s]')
559   - return 2
560   - for filename in filenames:
561   - logging.info('checking file {0}'.format(filename))
562   - if not olefile.isOleFile(filename):
563   - logging.info('not an ole file - skip')
564   - continue
565   - xls = XlsFile(filename)
566   -
567   - for stream in xls.get_streams():
568   - logging.info(stream)
569   - if isinstance(stream, WorkbookStream):
570   - for record in stream.iter_records():
571   - logging.info(' {0}'.format(record))
572   - return 0
573   -
574   -
575 466 if __name__ == '__main__':
576   - sys.exit(test(*sys.argv[1:]))
  467 + sys.exit(test(sys.argv[1:], XlsFile, WorkbookStream))
... ...