Commit ed6b0de40f893a7539b8d302bec4ed31a6f2ac72
Committed by
Philippe Lagadec
1 parent
1360ca04
xls_parser: parse records from xls workbook stream
Showing
1 changed file
with
320 additions
and
32 deletions
oletools/xls_parser.py
| ... | ... | @@ -36,33 +36,42 @@ from __future__ import print_function |
| 36 | 36 | |
| 37 | 37 | __version__ = '0.1' |
| 38 | 38 | |
| 39 | -#------------------------------------------------------------------------------ | |
| 40 | -# TODO: | |
| 41 | -# everything | |
| 39 | +# ----------------------------------------------------------------------------- | |
| 40 | +# TODO: | |
| 41 | +# - parse more record types (ExternName, ...) | |
| 42 | +# - check what bad stuff can be in other storages: Embedded ("MBD..."), Linked | |
| 43 | +# ("LNK..."), "MsoDataStore" and OleStream ('\001Ole') | |
| 42 | 44 | # |
| 43 | -#------------------------------------------------------------------------------ | |
| 44 | -# REFERENCES: | |
| 45 | -# - [MS-XLS]: Excel Binary File Format (.xls) Structure Specification | |
| 46 | -# https://msdn.microsoft.com/en-us/library/office/cc313154(v=office.14).aspx | |
| 47 | -# - Understanding the Excel .xls Binary File Format | |
| 48 | -# https://msdn.microsoft.com/en-us/library/office/gg615597(v=office.14).aspx | |
| 45 | +# ----------------------------------------------------------------------------- | |
| 46 | +# REFERENCES: | |
| 47 | +# - [MS-XLS]: Excel Binary File Format (.xls) Structure Specification | |
| 48 | +# https://msdn.microsoft.com/en-us/library/office/cc313154(v=office.14).aspx | |
| 49 | +# - Understanding the Excel .xls Binary File Format | |
| 50 | +# https://msdn.microsoft.com/en-us/library/office/gg615597(v=office.14).aspx | |
| 49 | 51 | # |
| 50 | -#--- IMPORTS ------------------------------------------------------------------ | |
| 52 | +# -- IMPORTS ------------------------------------------------------------------ | |
| 51 | 53 | |
| 52 | 54 | import sys |
| 53 | 55 | import os.path |
| 56 | +from struct import unpack | |
| 57 | +from io import SEEK_CUR | |
| 54 | 58 | |
| 55 | 59 | # little hack to allow absolute imports even if oletools is not installed. |
| 56 | 60 | # Copied from olevba.py |
| 57 | -_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) | |
| 58 | -_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) | |
| 59 | -if not _parent_dir in sys.path: | |
| 61 | +_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) # pylint: disable=invalid-name | |
| 62 | +_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) # pylint: disable=invalid-name | |
| 63 | +if _parent_dir not in sys.path: | |
| 60 | 64 | sys.path.insert(0, _parent_dir) |
| 61 | 65 | |
| 62 | -from oletools.thirdparty import olefile | |
| 66 | +from oletools.thirdparty import olefile # pylint: disable=wrong-import-position | |
| 67 | + | |
| 68 | + | |
| 69 | +############################################################################### | |
| 70 | +# Helpers | |
| 71 | +############################################################################### | |
| 63 | 72 | |
| 64 | 73 | |
| 65 | -entry_type2str = { | |
| 74 | +ENTRY_TYPE2STR = { | |
| 66 | 75 | olefile.STGTY_EMPTY: 'empty', |
| 67 | 76 | olefile.STGTY_STORAGE: 'storage', |
| 68 | 77 | olefile.STGTY_STREAM: 'stream', |
| ... | ... | @@ -71,6 +80,29 @@ entry_type2str = { |
| 71 | 80 | olefile.STGTY_ROOT: 'root' |
| 72 | 81 | } |
| 73 | 82 | |
| 83 | + | |
def is_xls(filename):
    """
    determine whether a given file is an excel ole file

    :param filename: file to check, passed on unchanged to XlsFile
    :returns: True if given file is an ole file and contains a Workbook
              stream, False in all other cases (including any error while
              opening or parsing the file)

    todo: could further check that workbook stream starts with a globals
    substream
    """
    try:
        xls = XlsFile(filename)
    except Exception:   # pylint: disable=broad-except
        # anything that cannot be opened as ole file is "not xls"
        return False

    try:
        for stream in xls.get_streams():
            if isinstance(stream, WorkbookStream):
                return True
        # explicit result instead of falling off the end (which gave None)
        return False
    except Exception:   # pylint: disable=broad-except
        return False
    finally:
        # do not leave the ole file open behind us
        xls.close()
| 99 | + | |
| 100 | + | |
| 101 | +############################################################################### | |
| 102 | +# File, Storage, Stream | |
| 103 | +############################################################################### | |
| 104 | + | |
| 105 | + | |
| 74 | 106 | class XlsFile(olefile.OleFileIO): |
| 75 | 107 | """ specialization of an OLE compound file """ |
| 76 | 108 | |
| ... | ... | @@ -81,7 +113,7 @@ class XlsFile(olefile.OleFileIO): |
| 81 | 113 | for sid, direntry in enumerate(self.direntries): |
| 82 | 114 | is_orphan = direntry is None |
| 83 | 115 | if is_orphan: |
| 84 | - # this direntry is not part of the tree: either unused or an orphan | |
| 116 | + # this direntry is not part of the tree --> unused or orphan | |
| 85 | 117 | direntry = self._load_direntry(sid) |
| 86 | 118 | is_stream = direntry.entry_type == olefile.STGTY_STREAM |
| 87 | 119 | print('direntry {:2d} {}: {}' |
| ... | ... | @@ -89,30 +121,286 @@ class XlsFile(olefile.OleFileIO): |
| 89 | 121 | 'is stream of size {}'.format(direntry.size) |
| 90 | 122 | if is_stream else |
| 91 | 123 | 'no stream ({})' |
| 92 | - .format(entry_type2str[direntry.entry_type]))) | |
| 124 | + .format(ENTRY_TYPE2STR[direntry.entry_type]))) | |
| 93 | 125 | if is_stream: |
| 94 | - yield XlsStream(self._open(direntry.isectStart, direntry.size)) | |
| 126 | + if direntry.name == 'Workbook': | |
| 127 | + clz = WorkbookStream | |
| 128 | + else: | |
| 129 | + clz = XlsStream | |
| 130 | + yield clz(self._open(direntry.isectStart, direntry.size), | |
| 131 | + None if is_orphan else direntry.name) | |
| 132 | + | |
| 95 | 133 | |
class XlsStream(object):
    """ specialization of an OLE stream

    Currently not much use, but may be interesting for further sub-classing
    when extending this code.

    Attributes:
    - stream: the underlying stream object
    - size: taken from the `size` attribute of the stream
    - name: name of the stream; XlsFile.get_streams passes None for streams
            that are not part of the directory tree (orphans)
    """

    def __init__(self, stream, name):
        """ remember stream, its size and its name """
        self.stream = stream
        self.size = stream.size
        self.name = name

    def __str__(self):
        # closing bracket added so output is balanced like XlsRecord.__str__
        return '[XlsStream {0} (size {1})]' \
            .format(self.name or '[orphan]', self.size)
| 103 | 149 | |
| 104 | -def test(filename): | |
| 105 | - """ parse given file and print rough structure """ | |
| 106 | - try: | |
| 150 | + | |
class WorkbookStream(XlsStream):
    """ the workbook stream which contains records """

    def iter_records(self, fill_data=False):
        """ iterate over records in streams

        Each record starts with a 4-byte header: 2 bytes type and 2 bytes
        size of the payload (both little-endian), followed by the payload.

        :param fill_data: if True, the payload of every record is read and
                          handed to the record; otherwise payload is only
                          read for BOF and SupBook records and skipped for
                          all others
        :returns: generator over XlsRecord objects (or subclasses thereof)
        """
        if self.stream.tell() != 0:
            print('have to jump to start')
            self.stream.seek(0)

        # record types with an own class: type --> (class, always read data)
        special_records = {
            XlsRecordBof.TYPE: (XlsRecordBof, True),
            XlsRecordEof.TYPE: (XlsRecordEof, False),
            XlsRecordSupBook.TYPE: (XlsRecordSupBook, True),
        }

        while True:
            # unpacking as in olevba._extract_vba
            pos = self.stream.tell()
            if pos >= self.size:
                break
            # "rec_type" instead of "type" to not shadow the builtin
            rec_type, size = unpack('<HH', self.stream.read(4))
            clz, force_read = special_records.get(rec_type,
                                                  (XlsRecord, False))
            if fill_data or force_read:
                data = self.stream.read(size)
            else:
                # payload not needed, just jump to the next record header
                data = None
                self.stream.seek(size, SEEK_CUR)
            yield clz(rec_type, size, pos, data)

    def __str__(self):
        # closing bracket added so output is balanced like XlsRecord.__str__
        return '[Workbook Stream (size {0})]'.format(self.size)
| 187 | + | |
| 188 | + | |
| 189 | +############################################################################### | |
| 190 | +# RECORDS | |
| 191 | +############################################################################### | |
| 192 | + | |
# Names of record types that show up often in workbook streams but have no
# XlsRecord subclass of their own (yet); maps record type number --> name
FREQUENT_RECORDS = {
    156: 'BuiltInFnGroupCount',
    2147: 'BookExt',
    442: 'CodeName',
    66: 'CodePage',
    4195: 'Dat',
    2154: 'DataLabExt',
    2155: 'DataLabExtContents',
    215: 'DBCell',
    220: 'DbOrParmQry',
    2051: 'DBQueryExt',
    2166: 'DConn',
    35: 'ExternName',
    23: 'ExternSheet',
    255: 'ExtSST',
    2052: 'ExtString',
    2151: 'FeatHdr',
    91: 'FileSharing',
    1054: 'Format',
    49: 'Font',
    2199: 'GUIDTypeLib',
    440: 'HLink',
    225: 'InterfaceHdr',
    226: 'InterfaceEnd',
    523: 'Index',
    24: 'Lbl',
    193: 'Mms',
    93: 'Obj',
    4135: 'ObjectLink',
    2058: 'OleDbConn',
    222: 'OleObjectSize',
    2214: 'RichTextStream',
    2146: 'SheetExt',
    1212: 'ShrFmla',
    2060: 'SxViewExt',
    2136: 'SxViewLink',
    2049: 'WebPub',
    224: 'XF (formatting)',
    2173: 'XFExt (formatting)',
    659: 'Style',
    2194: 'StyleExt',
}
| 236 | + | |
| 237 | + | |
class XlsRecord(object):
    """ one record of a workbook stream: type, size, position, maybe data """

    #: upper limit for the size of a record
    MAX_SIZE = 8224

    # subclasses with a fixed type and/or fixed size overwrite these
    TYPE = None
    SIZE = None

    def __init__(self, type, size, pos, data=None):
        """ create a record

        :param type: numeric record type
        :param size: size of record data; must not exceed MAX_SIZE and must
                     equal SIZE if the class defines one
        :param pos: position of the record
        :param data: record data or None; if given, must have length `size`
        :raises ValueError: if size or length of data are not as expected
        """
        self.type = type

        if size > self.MAX_SIZE:
            raise ValueError('size {0} exceeds max size'.format(size))
        fixed_size = self.SIZE
        if fixed_size is not None and size != fixed_size:
            raise ValueError('size {0} is not as expected for this type'
                             .format(size))

        self.size = size
        self.pos = pos
        self.data = data

        if data is None:
            return
        if len(data) != size:
            raise ValueError('data size {0} is not expected size {1}'
                             .format(len(data), size))

    def read_data(self, stream):
        """ read data from stream if so far only pos was known

        not implemented yet, always raises NotImplementedError
        """
        raise NotImplementedError()

    def _type_str(self):
        """ name of record type; hook for subclasses to customize __str__ """
        fallback = 'XlsRecord type {0}'.format(self.type)
        return FREQUENT_RECORDS.get(self.type, fallback)

    def __str__(self):
        label = self._type_str()
        location = ' (size {0} from {1})]'.format(self.size, self.pos)
        return '[' + label + location
| 277 | + | |
| 278 | + | |
class XlsRecordBof(XlsRecord):
    """ BOF record: marks the beginning of a substream """

    TYPE = 2057
    SIZE = 16

    # substream types, indexed by the doctype value found in the record data
    DOCTYPES = {
        0x5: 'workbook',
        0x10: 'dialog/worksheet',
        0x20: 'chart',
        0x40: 'macro',
    }

    def __init__(self, *args, **kwargs):
        """ create record; doctype is parsed if data is given, else None """
        super(XlsRecordBof, self).__init__(*args, **kwargs)
        self.doctype = None
        if self.data is not None:
            # only bytes 2 and 3 (the doctype) are of interest, ignore rest
            (self.doctype, ) = unpack('<H', self.data[2:4])

    def _type_str(self):
        substream = self.DOCTYPES.get(self.doctype, 'unknown')
        return 'BOF Record ({0} substream)'.format(substream)
| 299 | + | |
| 300 | + | |
class XlsRecordEof(XlsRecord):
    """ EOF record: marks the end of a substream; carries no data """

    TYPE = 10
    SIZE = 0

    def _type_str(self):
        """ fixed name, there is nothing else to tell about this record """
        return 'EOF Record'
| 308 | + | |
| 309 | + | |
class XlsRecordSupBook(XlsRecord):
    """ The SupBook record specifies a supporting link

    "... The collection of records specifies the contents of an external
    workbook, DDE data source, or OLE data source." (MS-XLS, paragraph 2.4.271)

    Attributes set in addition to those of XlsRecord (all but
    support_link_type are None if record was created without data):
    - ctab, cch: first two 16bit-values of record data
    - virt_path: unicode string read from data if cch is in 1..0xff,
                 otherwise empty
    - support_link_type: one of the LINK_TYPE_* constants
    """

    TYPE = 430

    LINK_TYPE_UNKNOWN = 'unknown'
    LINK_TYPE_SELF = 'self-referencing'
    LINK_TYPE_ADDIN = 'addin-referencing'
    LINK_TYPE_UNUSED = 'unused'
    LINK_TYPE_SAMESHEET = 'same-sheet'
    LINK_TYPE_OLE_DDE = 'ole/dde data source'
    LINK_TYPE_EXTERNAL = 'external workbook'

    def __init__(self, *args, **kwargs):
        """ create record and, if data was given, determine type of link

        :raises ValueError: if data is given but record size is below 4
        """
        super(XlsRecordSupBook, self).__init__(*args, **kwargs)

        # set defaults
        self.ctab = None
        self.cch = None
        self.virt_path = None
        self.support_link_type = self.LINK_TYPE_UNKNOWN
        if self.data is None:
            return

        # parse data
        if self.size < 4:
            raise ValueError('not enough data (size is {0} but need >= 4)'
                             .format(self.size))
        self.ctab, self.cch = unpack('<HH', self.data[:4])
        if 0 < self.cch <= 0xff:
            # this is the length of virt_path
            self.virt_path, _ = read_unicode(self.data, 4, self.cch)
        else:
            # cch is a special marker value (or 0), there is no path to read
            self.virt_path = u''
        # ignore variable rgst

        # Comparison strings need the u-prefix: without it, '\u0020' is on
        # python2 a byte string of 6 characters that never equals virt_path
        if self.cch == 0x401:    # ctab is undefined and to be ignored
            self.support_link_type = self.LINK_TYPE_SELF
        elif self.ctab == 0x1 and self.cch == 0x3A01:
            self.support_link_type = self.LINK_TYPE_ADDIN
            # next records must be ExternName with all add-in functions
        elif self.virt_path == u'\u0020':    # space ; ctab can be anything
            self.support_link_type = self.LINK_TYPE_UNUSED
        elif self.virt_path == u'\u0000':
            self.support_link_type = self.LINK_TYPE_SAMESHEET
        elif self.ctab == 0x0 and self.virt_path:
            self.support_link_type = self.LINK_TYPE_OLE_DDE
        elif self.ctab > 0 and self.virt_path:
            self.support_link_type = self.LINK_TYPE_EXTERNAL

    def _type_str(self):
        return 'SupBook Record ({0})'.format(self.support_link_type)
| 366 | + | |
| 367 | + | |
def read_unicode(data, start_idx, n_chars):
    """ read a unicode string from a XLUnicodeStringNoCch structure

    The structure starts with one byte of flags, followed by the characters.
    Only the lowest bit of the flags is relevant:
    - bit is 0 --> only low-bytes are saved, all high bytes are 0
    - bit is 1 --> 2 bytes per character (little-endian)

    Works with python2 (str) and python3 (bytes) input.

    :param data: binary data containing the structure
    :param start_idx: index in data of the flags byte
    :param n_chars: number of characters in the string
    :returns: tuple (text, end_idx), where end_idx is the index of the first
              byte in data after the string
    :raises IndexError: if there is no byte at start_idx
    :raises struct.error: if 2-byte characters are announced but data does
                          not contain n_chars of them
    """
    # indexing a bytearray gives an int on python2 and python3, whereas
    # indexing bytes gives a str on python2 but an int on python3
    flags = bytearray(data[start_idx:start_idx + 1])
    if not flags:
        raise IndexError('no string flags at index {0}'.format(start_idx))
    text_start = start_idx + 1

    # the 7 upper bits are reserved, so only look at the lowest one
    if not flags[0] & 0x01:
        end_idx = text_start + n_chars
        # high bytes are all 0 but low bytes may be anything from 0 to 255,
        # which is exactly what latin-1 maps to the first 256 code points
        return data[text_start:end_idx].decode('latin-1'), end_idx

    end_idx = text_start + n_chars * 2
    code_units = unpack('<{0}H'.format(n_chars), data[text_start:end_idx])
    try:
        to_char = unichr     # python2
    except NameError:
        to_char = chr        # python3
    return u''.join(to_char(unit) for unit in code_units), end_idx
| 380 | + | |
| 381 | + | |
| 382 | +############################################################################### | |
| 383 | +# TESTING | |
| 384 | +############################################################################### | |
| 385 | + | |
| 386 | + | |
def test(*filenames):
    """ parse all given file names and print rough structure

    Prints every stream of every file and, for workbook streams, also every
    record. Files for which olefile.isOleFile gives False are skipped without
    notice.

    :param filenames: names of files to parse, each as a separate argument
    :returns: 2 if no file name was given, 0 otherwise (usable as exit code)
    """
    if not filenames:
        print('need file name[s]')
        return 2
    for filename in filenames:
        if not olefile.isOleFile(filename):
            continue
        xls = XlsFile(filename)
        try:
            for stream in xls.get_streams():
                print(stream)
                if isinstance(stream, WorkbookStream):
                    for record in stream.iter_records():
                        print(' {0}'.format(record))
        finally:
            # close file even if parsing fails, there may be many files
            xls.close()
    return 0
| 403 | + | |
| 114 | 404 | |
if __name__ == '__main__':
    # test() expects each file name as a separate positional argument, so
    # the argument list has to be unpacked; handing over the list itself
    # would make test() treat the whole list as one single "file name"
    sys.exit(test(*sys.argv[1:]))