Commit ed6b0de40f893a7539b8d302bec4ed31a6f2ac72

Authored by Christian Herdtweck
Committed by Philippe Lagadec
1 parent 1360ca04

xls_parser: parse records from xls workbook stream

Showing 1 changed file with 320 additions and 32 deletions
oletools/xls_parser.py
... ... @@ -36,33 +36,42 @@ from __future__ import print_function
36 36  
37 37 __version__ = '0.1'
38 38  
39   -#------------------------------------------------------------------------------
40   -# TODO:
41   -# everything
  39 +# -----------------------------------------------------------------------------
  40 +# TODO:
  41 +# - parse more record types (ExternName, ...)
  42 +# - check what bad stuff can be in other storages: Embedded ("MBD..."), Linked
  43 +# ("LNK..."), "MsoDataStore" and OleStream ('\001Ole')
42 44 #
43   -#------------------------------------------------------------------------------
44   -# REFERENCES:
45   -# - [MS-XLS]: Excel Binary File Format (.xls) Structure Specification
46   -# https://msdn.microsoft.com/en-us/library/office/cc313154(v=office.14).aspx
47   -# - Understanding the Excel .xls Binary File Format
48   -# https://msdn.microsoft.com/en-us/library/office/gg615597(v=office.14).aspx
  45 +# -----------------------------------------------------------------------------
  46 +# REFERENCES:
  47 +# - [MS-XLS]: Excel Binary File Format (.xls) Structure Specification
  48 +# https://msdn.microsoft.com/en-us/library/office/cc313154(v=office.14).aspx
  49 +# - Understanding the Excel .xls Binary File Format
  50 +# https://msdn.microsoft.com/en-us/library/office/gg615597(v=office.14).aspx
49 51 #
50   -#--- IMPORTS ------------------------------------------------------------------
  52 +# -- IMPORTS ------------------------------------------------------------------
51 53  
52 54 import sys
53 55 import os.path
  56 +from struct import unpack
  57 +from io import SEEK_CUR
54 58  
55 59 # little hack to allow absolute imports even if oletools is not installed.
56 60 # Copied from olevba.py
57   -_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__)))
58   -_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..'))
59   -if not _parent_dir in sys.path:
  61 +_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) # pylint: disable=invalid-name
  62 +_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) # pylint: disable=invalid-name
  63 +if _parent_dir not in sys.path:
60 64 sys.path.insert(0, _parent_dir)
61 65  
62   -from oletools.thirdparty import olefile
  66 +from oletools.thirdparty import olefile # pylint: disable=wrong-import-position
  67 +
  68 +
  69 +###############################################################################
  70 +# Helpers
  71 +###############################################################################
63 72  
64 73  
65   -entry_type2str = {
  74 +ENTRY_TYPE2STR = {
66 75 olefile.STGTY_EMPTY: 'empty',
67 76 olefile.STGTY_STORAGE: 'storage',
68 77 olefile.STGTY_STREAM: 'stream',
... ... @@ -71,6 +80,29 @@ entry_type2str = {
71 80 olefile.STGTY_ROOT: 'root'
72 81 }
73 82  
  83 +
def is_xls(filename):
    """
    determine whether a given file is an excel ole file

    returns True if given file is an ole file and contains a Workbook stream,
    False otherwise. Any error while opening or parsing the file (e.g. file
    missing or not an ole file) is deliberately treated as "not an xls file"
    and also results in False.

    todo: could further check that workbook stream starts with a globals
    substream
    """
    try:
        xls = XlsFile(filename)
    except Exception:  # pylint: disable=broad-except
        return False

    try:
        # any() stops at the first workbook stream, and always yields a real
        # bool (never an implicit None if no stream matches)
        return any(isinstance(stream, WorkbookStream)
                   for stream in xls.get_streams())
    except Exception:  # pylint: disable=broad-except
        return False
    finally:
        # do not leak the file handle held by the ole file object
        xls.close()
  99 +
  100 +
  101 +###############################################################################
  102 +# File, Storage, Stream
  103 +###############################################################################
  104 +
  105 +
74 106 class XlsFile(olefile.OleFileIO):
75 107 """ specialization of an OLE compound file """
76 108  
... ... @@ -81,7 +113,7 @@ class XlsFile(olefile.OleFileIO):
81 113 for sid, direntry in enumerate(self.direntries):
82 114 is_orphan = direntry is None
83 115 if is_orphan:
84   - # this direntry is not part of the tree: either unused or an orphan
  116 + # this direntry is not part of the tree --> unused or orphan
85 117 direntry = self._load_direntry(sid)
86 118 is_stream = direntry.entry_type == olefile.STGTY_STREAM
87 119 print('direntry {:2d} {}: {}'
... ... @@ -89,30 +121,286 @@ class XlsFile(olefile.OleFileIO):
89 121 'is stream of size {}'.format(direntry.size)
90 122 if is_stream else
91 123 'no stream ({})'
92   - .format(entry_type2str[direntry.entry_type])))
  124 + .format(ENTRY_TYPE2STR[direntry.entry_type])))
93 125 if is_stream:
94   - yield XlsStream(self._open(direntry.isectStart, direntry.size))
  126 + if direntry.name == 'Workbook':
  127 + clz = WorkbookStream
  128 + else:
  129 + clz = XlsStream
  130 + yield clz(self._open(direntry.isectStart, direntry.size),
  131 + None if is_orphan else direntry.name)
  132 +
95 133  
class XlsStream(object):
    """ specialization of an OLE stream

    Thin wrapper that keeps the underlying stream object together with its
    size and its name.

    Currently not much use, but may be interesting for further sub-classing
    when extending this code.
    """

    def __init__(self, stream, name):
        """ create a stream wrapper

        :param stream: stream object; must provide an attribute `size`
        :param name: name of the stream or None (shown as orphan in str())
        """
        self.stream = stream
        self.size = stream.size
        self.name = name

    def __str__(self):
        # closing bracket added so output is balanced like XlsRecord.__str__
        return '[XlsStream {0} (size {1})]' \
            .format(self.name or '[orphan]', self.size)
103 149  
104   -def test(filename):
105   - """ parse given file and print rough structure """
106   - try:
  150 +
class WorkbookStream(XlsStream):
    """ the workbook stream which contains records

    The stream is a plain sequence of records, each consisting of a 4-byte
    header (2 bytes record type, 2 bytes data size, both little-endian)
    followed by `size` bytes of data.
    """

    def iter_records(self, fill_data=False):
        """ iterate over records in the stream

        Always starts at the beginning of the stream. Yields one XlsRecord
        (or matching subclass) per record found.

        :param bool fill_data: if True, the data of every record is read and
                               given to the record; otherwise data is only
                               read for record types that parse their data
                               (BOF, SupBook) and skipped for all others
        :raises ValueError: from the record constructors if a record has an
                            invalid size or its data is cut short
        """
        if self.stream.tell() != 0:
            print('have to jump to start')
            self.stream.seek(0)

        while True:
            # unpacking as in olevba._extract_vba
            pos = self.stream.tell()
            if pos >= self.size:
                break
            header = self.stream.read(4)
            if len(header) < 4:
                # trailing bytes that cannot form a record header: stop
                # cleanly instead of failing with a struct.error
                break
            rec_type, rec_size = unpack('<HH', header)

            # select record class; some classes parse their data in the
            # constructor, so for those the data must always be read
            force_read = False
            if rec_type == XlsRecordBof.TYPE:
                clz = XlsRecordBof
                force_read = True
            elif rec_type == XlsRecordEof.TYPE:
                clz = XlsRecordEof
            elif rec_type == XlsRecordSupBook.TYPE:
                clz = XlsRecordSupBook
                force_read = True
            else:
                clz = XlsRecord

            data = None
            if fill_data or force_read:
                data = self.stream.read(rec_size)
            else:
                # not interested in contents --> just jump to next record
                self.stream.seek(rec_size, SEEK_CUR)
            yield clz(rec_type, rec_size, pos, data)

    def __str__(self):
        # closing bracket added so output is balanced like XlsRecord.__str__
        return '[Workbook Stream (size {0})]'.format(self.size)
  187 +
  188 +
  189 +###############################################################################
  190 +# RECORDS
  191 +###############################################################################
  192 +
  193 +# records that appear often but do not need their own XlsRecord subclass (yet)
# maps numeric record type --> record name, used for printing only
FREQUENT_RECORDS = {
    23: 'ExternSheet',
    24: 'Lbl',
    35: 'ExternName',
    49: 'Font',
    66: 'CodePage',
    91: 'FileSharing',
    93: 'Obj',
    156: 'BuiltInFnGroupCount',
    193: 'Mms',
    215: 'DBCell',
    220: 'DbOrParmQry',
    222: 'OleObjectSize',
    224: 'XF (formatting)',
    225: 'InterfaceHdr',
    226: 'InterfaceEnd',
    255: 'ExtSST',
    440: 'HLink',
    442: 'CodeName',
    523: 'Index',
    659: 'Style',
    1054: 'Format',
    1212: 'ShrFmla',
    2049: 'WebPub',
    2051: 'DBQueryExt',
    2052: 'ExtString',
    2058: 'OleDbConn',
    2060: 'SxViewExt',
    2136: 'SxViewLink',
    2146: 'SheetExt',
    2147: 'BookExt',
    2151: 'FeatHdr',
    2154: 'DataLabExt',
    2155: 'DataLabExtContents',
    2166: 'DConn',
    2173: 'XFExt (formatting)',
    2194: 'StyleExt',
    2199: 'GUIDTypeLib',
    2214: 'RichTextStream',
    4135: 'ObjectLink',
    4195: 'Dat',
}
  236 +
  237 +
class XlsRecord(object):
    """ basic building block of data in workbook stream

    A record knows its numeric type, the size of its data, its position in
    the stream and, optionally, the data itself.
    """

    #: max size of a record
    MAX_SIZE = 8224

    # to be overwritten in subclasses that have fixed type/size
    TYPE = None
    SIZE = None

    def __init__(self, type, size, pos, data=None):
        """ create a record

        :param type: numeric record type
        :param size: size of record data
        :param pos: position of the record in the stream
        :param data: record data or None if data was not read
        :raises ValueError: if size is above MAX_SIZE, differs from a fixed
                            SIZE of the class, or does not match len(data)
        """
        self.type = type

        # validate size before accepting it
        if size > self.MAX_SIZE:
            raise ValueError('size {0} exceeds max size'.format(size))
        has_fixed_size = self.SIZE is not None
        if has_fixed_size and size != self.SIZE:
            raise ValueError('size {0} is not as expected for this type'
                             .format(size))

        self.size = size
        self.pos = pos
        self.data = data

        # if data was given, it must be complete
        if data is None:
            return
        if len(data) != size:
            raise ValueError('data size {0} is not expected size {1}'
                             .format(len(data), size))

    def read_data(self, stream):
        """ read data from stream if up to now only pos was known

        Not implemented yet, always raises NotImplementedError.
        """
        raise NotImplementedError()

    def _type_str(self):
        """ simplification for subclasses to create their own __str__

        Returns the name from FREQUENT_RECORDS if the type is listed there,
        a generic text containing the numeric type otherwise.
        """
        if self.type in FREQUENT_RECORDS:
            return FREQUENT_RECORDS[self.type]
        return 'XlsRecord type {0}'.format(self.type)

    def __str__(self):
        return '[{0} (size {1} from {2})]'.format(self._type_str(), self.size,
                                                  self.pos)
  277 +
  278 +
class XlsRecordBof(XlsRecord):
    """ record found at beginning of substreams

    Only the type of the substream (doctype) is parsed from the record data,
    everything else is ignored.
    """
    TYPE = 2057
    SIZE = 16
    # types of substreams
    DOCTYPES = {
        0x5: 'workbook',
        0x10: 'dialog/worksheet',
        0x20: 'chart',
        0x40: 'macro',
    }

    def __init__(self, *args, **kwargs):
        """ create a BOF record; arguments are the same as for XlsRecord """
        super(XlsRecordBof, self).__init__(*args, **kwargs)
        self.doctype = None
        if self.data is not None:
            # parse data (only doctype in bytes 2 and 3, ignore rest)
            self.doctype = unpack('<H', self.data[2:4])[0]

    def _type_str(self):
        """ name of record including the kind of substream that it starts """
        substream = self.DOCTYPES.get(self.doctype, 'unknown')
        return 'BOF Record ({0} substream)'.format(substream)
  299 +
  300 +
class XlsRecordEof(XlsRecord):
    """ record found at end of substreams; has a fixed data size of 0 """

    TYPE = 10
    SIZE = 0

    def _type_str(self):
        """ fixed name of this record for use in __str__ """
        return 'EOF Record'
  308 +
  309 +
class XlsRecordSupBook(XlsRecord):
    """ The SupBook record specifies a supporting link

    "... The collection of records specifies the contents of an external
    workbook, DDE data source, or OLE data source." (MS-XLS, paragraph 2.4.271)

    After construction with data, attribute `support_link_type` holds one of
    the LINK_TYPE_* constants and `virt_path` the decoded path (may be empty).
    Without data all parsed attributes are None and the link type is
    LINK_TYPE_UNKNOWN.
    """

    TYPE = 430

    LINK_TYPE_UNKNOWN = 'unknown'
    LINK_TYPE_SELF = 'self-referencing'
    LINK_TYPE_ADDIN = 'addin-referencing'
    LINK_TYPE_UNUSED = 'unused'
    LINK_TYPE_SAMESHEET = 'same-sheet'
    LINK_TYPE_OLE_DDE = 'ole/dde data source'
    LINK_TYPE_EXTERNAL = 'external workbook'

    def __init__(self, *args, **kwargs):
        """ create a SupBook record; arguments are the same as for XlsRecord

        :raises ValueError: if data is given but has less than 4 bytes
        """
        super(XlsRecordSupBook, self).__init__(*args, **kwargs)

        # set defaults
        self.ctab = None
        self.cch = None
        self.virt_path = None
        self.support_link_type = self.LINK_TYPE_UNKNOWN
        if self.data is None:
            return

        # parse data
        if self.size < 4:
            raise ValueError('not enough data (size is {0} but need >= 4)'
                             .format(self.size))
        self.ctab, self.cch = unpack('<HH', self.data[:4])
        if 0 < self.cch <= 0xff:
            # this is the length of virt_path
            self.virt_path, _ = read_unicode(self.data, 4, self.cch)
        else:
            # cch is a special marker (or 0), there is no path to read
            self.virt_path = u''
        # ignore variable rgst

        # virt_path is a unicode string, so compare it with unicode literals:
        # in python2 a plain '\u0020' is a 6-character byte string that would
        # never match
        if self.cch == 0x401:    # ctab is undefined and to be ignored
            self.support_link_type = self.LINK_TYPE_SELF
        elif self.ctab == 0x1 and self.cch == 0x3A01:
            self.support_link_type = self.LINK_TYPE_ADDIN
            # next records must be ExternName with all add-in functions
        elif self.virt_path == u'\u0020':    # space ; ctab can be anything
            self.support_link_type = self.LINK_TYPE_UNUSED
        elif self.virt_path == u'\u0000':
            self.support_link_type = self.LINK_TYPE_SAMESHEET
        elif self.ctab == 0x0 and self.virt_path:
            self.support_link_type = self.LINK_TYPE_OLE_DDE
        elif self.ctab > 0 and self.virt_path:
            self.support_link_type = self.LINK_TYPE_EXTERNAL

    def _type_str(self):
        """ name of record including the type of link that was found """
        return 'SupBook Record ({0})'.format(self.support_link_type)
  366 +
  367 +
def read_unicode(data, start_idx, n_chars):
    """ read a unicode string from a XLUnicodeStringNoCch structure

    The structure starts with one flag byte; only its lowest bit is used:
    bit 0x0 --> only low-bytes are saved, all high bytes are 0
    bit 0x1 --> 2 bytes per character (little-endian)
    Works with python2 str as well as with python3 bytes.

    :param data: bytes containing the structure
    :param int start_idx: index of the flag byte within data
    :param int n_chars: number of characters in the string
    :returns: 2-tuple (decoded unicode string, index of first byte after
              the structure)
    :raises ValueError: if data is too short to contain the whole structure
    """
    # slicing + bytearray gives an int for py2 str as well as for py3 bytes
    flags = bytearray(data[start_idx:start_idx+1])
    if not flags:
        raise ValueError('no data at index {0}'.format(start_idx))

    if flags[0] & 0x01:
        n_bytes = 2 * n_chars
        codec = 'utf-16-le'
    else:
        # high bytes are all 0 --> bytes are code points 0..255, which is
        # exactly what latin-1 decodes to (ascii would fail for >= 0x80)
        n_bytes = n_chars
        codec = 'latin-1'

    end_idx = start_idx + 1 + n_bytes
    raw = data[start_idx+1:end_idx]
    if len(raw) != n_bytes:
        raise ValueError('need {0} bytes for string but only have {1}'
                         .format(n_bytes, len(raw)))
    # 'replace' so a lone surrogate in malformed input is tolerated
    return raw.decode(codec, 'replace'), end_idx
  380 +
  381 +
  382 +###############################################################################
  383 +# TESTING
  384 +###############################################################################
  385 +
  386 +
def test(*filenames):
    """ parse all given file names and print rough structure

    Files that are not ole files are skipped. For every other file, all
    streams are printed and, for workbook streams, all records they contain.

    :param filenames: names of files to parse, each as separate argument
    :returns: 2 if no file name was given, 0 otherwise (usable as exit code)
    """
    if not filenames:
        print('need file name[s]')
        return 2
    for filename in filenames:
        if not olefile.isOleFile(filename):
            continue
        xls = XlsFile(filename)
        try:
            for stream in xls.get_streams():
                print(stream)
                if isinstance(stream, WorkbookStream):
                    for record in stream.iter_records():
                        print(' {0}'.format(record))
        finally:
            # release file handle also if parsing fails half-way
            xls.close()
    return 0
  403 +
114 404  
if __name__ == '__main__':
    # test() expects each file name as a separate positional argument, so the
    # argument list has to be unpacked (passing the list itself would make
    # test() treat the whole list as one single "file name")
    sys.exit(test(*sys.argv[1:]))
... ...