Commit 6af5e38e63c076a43e8cdce848ea2b1da55e81ea

Authored by Christian Herdtweck
Committed by Philippe Lagadec
1 parent 17ae3fc3

msodde: Start parsing non-xml parts in xlsb files. Unfinished!

Showing 1 changed file with 65 additions and 8 deletions
oletools/msodde.py
@@ -71,13 +71,6 @@ __version__ = '0.52dev7' @@ -71,13 +71,6 @@ __version__ = '0.52dev7'
71 71
72 #--- IMPORTS ------------------------------------------------------------------ 72 #--- IMPORTS ------------------------------------------------------------------
73 73
74 -# import lxml or ElementTree for XML parsing:  
75 -try:  
76 - # lxml: best performance for XML processing  
77 - import lxml.etree as ET  
78 -except ImportError:  
79 - import xml.etree.cElementTree as ET  
80 -  
81 import argparse 74 import argparse
82 import zipfile 75 import zipfile
83 import os 76 import os
@@ -85,6 +78,14 @@ import sys @@ -85,6 +78,14 @@ import sys
85 import json 78 import json
86 import logging 79 import logging
87 import re 80 import re
  81 +from struct import unpack
  82 +
  83 +# import lxml or ElementTree for XML parsing:
  84 +try:
  85 + # lxml: best performance for XML processing
  86 + import lxml.etree as ET
  87 +except ImportError:
  88 + import xml.etree.cElementTree as ET
88 89
89 # little hack to allow absolute imports even if oletools is not installed 90 # little hack to allow absolute imports even if oletools is not installed
90 # Copied from olevba.py 91 # Copied from olevba.py
@@ -709,7 +710,8 @@ def field_is_blacklisted(contents): @@ -709,7 +710,8 @@ def field_is_blacklisted(contents):
709 def process_xlsx(filepath, filed_filter_mode=None): 710 def process_xlsx(filepath, filed_filter_mode=None):
710 """ process an OOXML excel file (e.g. .xlsx or .xlsb or .xlsm) """ 711 """ process an OOXML excel file (e.g. .xlsx or .xlsb or .xlsm) """
711 dde_links = [] 712 dde_links = []
712 - for subfile, elem, _ in ooxml.iter_xml(filepath): 713 + parser = ooxml.XmlParser(filepath)
  714 + for subfile, elem, _ in parser.iter_xml():
713 tag = elem.tag.lower() 715 tag = elem.tag.lower()
714 if tag == 'ddelink' or tag.endswith('}ddelink'): 716 if tag == 'ddelink' or tag.endswith('}ddelink'):
715 # we have found a dde link. Try to get more info about it 717 # we have found a dde link. Try to get more info about it
@@ -719,9 +721,63 @@ def process_xlsx(filepath, filed_filter_mode=None): @@ -719,9 +721,63 @@ def process_xlsx(filepath, filed_filter_mode=None):
719 if 'ddeTopic' in elem.attrib: 721 if 'ddeTopic' in elem.attrib:
720 link_info.append(elem.attrib['ddeTopic']) 722 link_info.append(elem.attrib['ddeTopic'])
721 dde_links.append(' '.join(link_info)) 723 dde_links.append(' '.join(link_info))
  724 +
  725 + for subfile, content_type, handle in parser.iter_non_xml():
  726 + log.warning('File contains non-xml part {0} that could not be parsed'
  727 + .format(subfile))
  728 +
  729 + if content_type.startswith('application/vnd.ms-excel.'):
  730 + dde_links.extend(process_xlsb(subfile, content_type, handle))
  731 + raise NotImplementedError('Continue reverse-engineering')
  732 + else:
  733 + magic = handle.read(len(olefile.MAGIC))
  734 + if magic == olefile.MAGIC:
  735 + log.debug('found ole file {0} in excel ooxml'.format(subfile))
  736 + raise NotImplementedError('continue. need to reset stream')
  737 +
722 return u'\n'.join(dde_links) 738 return u'\n'.join(dde_links)
723 739
724 740
def process_xlsb(subfile, content_type, stream):
    """ Process data contained in a binary part of an OOXML excel file

    There are lots of these parts in xlsb files.

    Work in progress, always returns []

    Format of these streams seems to roughly have record-like structure like
    xls files (see xls_parser.py), but have to guess a lot since I could not
    find proper description in [MS-XLSB] nor [ECMA-376] nor [MS-OE376]. The
    code here is reverse-engineered from comparing dde-test.xlsb and
    dde-test.xlsx

    The author of
    https://www.codeproject.com/Articles/15216/Office-bin-file-format seems to
    have tried to reverse-engineer several .bin streams based on the assumption
    they contain BIFF data.

    Anyway, need more test samples to get any reliable results from this.

    The stream is read as a sequence of records, each assumed to consist of a
    3-byte header (record type, one byte of unknown meaning, payload size)
    followed by `size` bytes of payload. Records are only logged, not
    interpreted.
    NOTE(review): the fixed 3-byte header is a guess (see above); confirm
    against the record definition in [MS-XLSB] before relying on it.

    :param subfile: name of the part inside the ooxml container; only used in
                    log messages
    :param content_type: content type of the part; currently unused
    :param stream: binary file-like object to read the part from; must offer
                   `read(n)` returning bytes
    :returns: always an empty list (no dde links are extracted yet)
    """
    header_len = 3    # record type, unknown byte, payload size

    log.debug('Trying to parse subfile {0}'.format(subfile))
    while True:
        # bytearray yields ints on indexing/unpacking in python 2 and 3 alike,
        # whereas ord(data[i]) fails on python 3 bytes
        header = bytearray(stream.read(header_len))
        if not header:
            break    # end of stream
        if len(header) < header_len:
            # stream ends in the middle of a header: nothing left to parse
            log.warning('Stream in {0} does not seem to fit record structure. '
                        .format(subfile) +
                        '(read {0} header bytes but expected {1})'
                        .format(len(header), header_len))
            break

        rec_type, unknown, size = header
        data = stream.read(size)

        log.debug('Record of type {0} unknown part {1} and size {2}: {3}'
                  .format(rec_type, unknown, size, data[:64]))
        if len(data) != size:
            log.warning('Stream in {0} does not seem to fit record structure. '
                        .format(subfile) +
                        '(read {0} bytes but expected {1})'
                        .format(len(data), size))
            break    # short read means the stream is exhausted
    return []
  779 +
  780 +
725 def process_file(filepath, field_filter_mode=None): 781 def process_file(filepath, field_filter_mode=None):
726 """ decides which of process_doc/x or process_xls/x to call """ 782 """ decides which of process_doc/x or process_xls/x to call """
727 if olefile.isOleFile(filepath): 783 if olefile.isOleFile(filepath):
@@ -738,6 +794,7 @@ def process_file(filepath, field_filter_mode=None): @@ -738,6 +794,7 @@ def process_file(filepath, field_filter_mode=None):
738 else: 794 else:
739 return process_docx(filepath, field_filter_mode) 795 return process_docx(filepath, field_filter_mode)
740 except Exception: 796 except Exception:
  797 + log.debug('Exception trying to xml-parse file', exc_info=True)
741 return process_docx(filepath, field_filter_mode) 798 return process_docx(filepath, field_filter_mode)
742 799
743 800