Commit 6af5e38e63c076a43e8cdce848ea2b1da55e81ea

Authored by Christian Herdtweck
Committed by Philippe Lagadec
1 parent 17ae3fc3

msodde: Start parsing non-xml parts in xlsb files. Unfinished!

Showing 1 changed file with 65 additions and 8 deletions
oletools/msodde.py
... ... @@ -71,13 +71,6 @@ __version__ = '0.52dev7'
71 71  
72 72 #--- IMPORTS ------------------------------------------------------------------
73 73  
74   -# import lxml or ElementTree for XML parsing:
75   -try:
76   - # lxml: best performance for XML processing
77   - import lxml.etree as ET
78   -except ImportError:
79   - import xml.etree.cElementTree as ET
80   -
81 74 import argparse
82 75 import zipfile
83 76 import os
... ... @@ -85,6 +78,14 @@ import sys
85 78 import json
86 79 import logging
87 80 import re
  81 +from struct import unpack
  82 +
  83 +# import lxml or ElementTree for XML parsing:
  84 +try:
  85 + # lxml: best performance for XML processing
  86 + import lxml.etree as ET
  87 +except ImportError:
  88 + import xml.etree.cElementTree as ET
88 89  
89 90 # little hack to allow absolute imports even if oletools is not installed
90 91 # Copied from olevba.py
... ... @@ -709,7 +710,8 @@ def field_is_blacklisted(contents):
709 710 def process_xlsx(filepath, filed_filter_mode=None):
710 711 """ process an OOXML excel file (e.g. .xlsx or .xlsb or .xlsm) """
711 712 dde_links = []
712   - for subfile, elem, _ in ooxml.iter_xml(filepath):
  713 + parser = ooxml.XmlParser(filepath)
  714 + for subfile, elem, _ in parser.iter_xml():
713 715 tag = elem.tag.lower()
714 716 if tag == 'ddelink' or tag.endswith('}ddelink'):
715 717 # we have found a dde link. Try to get more info about it
... ... @@ -719,9 +721,63 @@ def process_xlsx(filepath, filed_filter_mode=None):
719 721 if 'ddeTopic' in elem.attrib:
720 722 link_info.append(elem.attrib['ddeTopic'])
721 723 dde_links.append(' '.join(link_info))
  724 +
  725 + for subfile, content_type, handle in parser.iter_non_xml():
  726 + log.warning('File contains non-xml part {0} that could not be parsed'
  727 + .format(subfile))
  728 +
  729 + if content_type.startswith('application/vnd.ms-excel.'):
  730 + dde_links.extend(process_xlsb(subfile, content_type, handle))
  731 + raise NotImplementedError('Continue reverse-engineering')
  732 + else:
  733 + magic = handle.read(len(olefile.MAGIC))
  734 + if magic == olefile.MAGIC:
  735 + log.debug('found ole file {0} in excel ooxml'.format(subfile))
  736 + raise NotImplementedError('continue. need to reset stream')
  737 +
722 738 return u'\n'.join(dde_links)
723 739  
724 740  
def process_xlsb(subfile, content_type, stream):
    """ Process data contained in a binary part of an OOXML excel file

    lots of these in xlsb files

    Work in progress, always returns []

    Format of these streams seems to roughly have record-like structure like
    xls files (see xls_parser.py), but have to guess a lot since I could not
    find proper description in [MS-XLSB] nor [ECMA-376] nor [MS-OE376]. The
    code here is reverse-engineered from comparing dde-test.xlsb and
    dde-test.xlsx

    The author of
    https://www.codeproject.com/Articles/15216/Office-bin-file-format seems to
    have tried to reverse-engineer several .bin streams based on the assumption
    they contain BIFF data.

    Anyway, need more test samples to get any reliable results from this.

    The stream is read as a sequence of records, each assumed to consist of a
    3-byte header (record type, one byte of unknown meaning, payload size)
    followed by `size` bytes of payload. Every record is only logged at debug
    level; nothing is extracted yet.

    NOTE(review): [MS-XLSB] appears to describe records whose type and size
    fields have variable length -- confirm against the specification before
    relying on the fixed 3-byte header guessed here.

    :param subfile: name of the part inside the archive; only used in log
                    messages
    :param content_type: content type of the part; currently unused
    :param stream: binary file-like object providing `read(n)`
    :returns: always an empty list (no dde links are extracted yet)
    """
    header_size = 3
    log.debug('Trying to parse subfile {0}'.format(subfile))
    while True:
        # bytearray yields integers on item access in python 2 and python 3,
        # whereas ord(data[0]) fails on python 3 where data[0] is an int
        header = bytearray(stream.read(header_size))
        if not header:
            break    # end of stream
        if len(header) != header_size:
            # stream ends in the middle of a header: cannot be unpacked
            log.warning('Stream in {0} does not seem to fit record structure. '
                        .format(subfile) +
                        '(read {0} bytes but expected {1})'
                        .format(len(header), header_size))
            break
        rec_type, unknown, size = header
        data = stream.read(size)

        log.debug('Record of type {0} unknown part {1} and size {2}: {3}'
                  .format(rec_type, unknown, size, data[:64]))
        if len(data) != size:
            log.warning('Stream in {0} does not seem to fit record structure. '
                        .format(subfile) +
                        '(read {0} bytes but expected {1})'
                        .format(len(data), size))
            # a short read means the stream is exhausted or out of sync, so
            # further parsing would only produce garbage
            break
    return []
  779 +
  780 +
725 781 def process_file(filepath, field_filter_mode=None):
726 782 """ decides which of process_doc/x or process_xls/x to call """
727 783 if olefile.isOleFile(filepath):
... ... @@ -738,6 +794,7 @@ def process_file(filepath, field_filter_mode=None):
738 794 else:
739 795 return process_docx(filepath, field_filter_mode)
740 796 except Exception:
  797 + log.debug('Exception trying to xml-parse file', exc_info=True)
741 798 return process_docx(filepath, field_filter_mode)
742 799  
743 800  
... ...