Commit 6af5e38e63c076a43e8cdce848ea2b1da55e81ea
Committed by Philippe Lagadec
1 parent: 17ae3fc3
msodde: Start parsing non-xml parts in xlsb files. Unfinished!
Showing 1 changed file with 65 additions and 8 deletions
oletools/msodde.py
| ... | ... | @@ -71,13 +71,6 @@ __version__ = '0.52dev7' |
| 71 | 71 | |
| 72 | 72 | #--- IMPORTS ------------------------------------------------------------------ |
| 73 | 73 | |
| 74 | -# import lxml or ElementTree for XML parsing: | |
| 75 | -try: | |
| 76 | - # lxml: best performance for XML processing | |
| 77 | - import lxml.etree as ET | |
| 78 | -except ImportError: | |
| 79 | - import xml.etree.cElementTree as ET | |
| 80 | - | |
| 81 | 74 | import argparse |
| 82 | 75 | import zipfile |
| 83 | 76 | import os |
| ... | ... | @@ -85,6 +78,14 @@ import sys |
| 85 | 78 | import json |
| 86 | 79 | import logging |
| 87 | 80 | import re |
| 81 | +from struct import unpack | |
| 82 | + | |
| 83 | +# import lxml or ElementTree for XML parsing: | |
| 84 | +try: | |
| 85 | + # lxml: best performance for XML processing | |
| 86 | + import lxml.etree as ET | |
| 87 | +except ImportError: | |
| 88 | + import xml.etree.cElementTree as ET | |
| 88 | 89 | |
| 89 | 90 | # little hack to allow absolute imports even if oletools is not installed |
| 90 | 91 | # Copied from olevba.py |
| ... | ... | @@ -709,7 +710,8 @@ def field_is_blacklisted(contents): |
| 709 | 710 | def process_xlsx(filepath, filed_filter_mode=None): |
| 710 | 711 | """ process an OOXML excel file (e.g. .xlsx or .xlsb or .xlsm) """ |
| 711 | 712 | dde_links = [] |
| 712 | - for subfile, elem, _ in ooxml.iter_xml(filepath): | |
| 713 | + parser = ooxml.XmlParser(filepath) | |
| 714 | + for subfile, elem, _ in parser.iter_xml(): | |
| 713 | 715 | tag = elem.tag.lower() |
| 714 | 716 | if tag == 'ddelink' or tag.endswith('}ddelink'): |
| 715 | 717 | # we have found a dde link. Try to get more info about it |
| ... | ... | @@ -719,9 +721,63 @@ def process_xlsx(filepath, filed_filter_mode=None): |
| 719 | 721 | if 'ddeTopic' in elem.attrib: |
| 720 | 722 | link_info.append(elem.attrib['ddeTopic']) |
| 721 | 723 | dde_links.append(' '.join(link_info)) |
| 724 | + | |
| 725 | + for subfile, content_type, handle in parser.iter_non_xml(): | |
| 726 | + log.warning('File contains non-xml part {0} that could not be parsed' | |
| 727 | + .format(subfile)) | |
| 728 | + | |
| 729 | + if content_type.startswith('application/vnd.ms-excel.'): | |
| 730 | + dde_links.extend(process_xlsb(subfile, content_type, handle)) | |
| 731 | + raise NotImplementedError('Continue reverse-engineering') | |
| 732 | + else: | |
| 733 | + magic = handle.read(len(olefile.MAGIC)) | |
| 734 | + if magic == olefile.MAGIC: | |
| 735 | + log.debug('found ole file {0} in excel ooxml'.format(subfile)) | |
| 736 | + raise NotImplementedError('continue. need to reset stream') | |
| 737 | + | |
| 722 | 738 | return u'\n'.join(dde_links) |
| 723 | 739 | |
| 724 | 740 | |
| 741 | +def process_xlsb(subfile, content_type, stream): | |
| 742 | + """ Process data contained in a binary part of an OOXML excel file | |
| 743 | + | |
| 744 | + lots of these in xlsb files | |
| 745 | + | |
| 746 | + Work in progress, always returns [] | |
| 747 | + | |
| 748 | + Format of these streams seems to roughly have record-like structure like | |
| 749 | + xls files (see xls_parser.py), but have to guess a lot since I could not | |
| 750 | + find proper description in [MS-XLSB] nor [ECMA-376] nor [MS-OE376]. The | |
| 751 | + code here is reverse-engineered from comparing dde-test.xlsb and | |
| 752 | + dde-test.xlsx | |
| 753 | + | |
| 754 | + The author of | |
| 755 | + https://www.codeproject.com/Articles/15216/Office-bin-file-format seems to | |
| 756 | + have tried to reverse-engineer several .bin streams based on the assumption | |
| 757 | + they contain BIFF data. | |
| 758 | + | |
| 759 | + Anyway, need more test samples to get any reliable results from this. | |
| 760 | + """ | |
| 761 | + log.debug('Trying to parse subfile {0}'.format(subfile)) | |
| 762 | + while True: | |
| 763 | + data = stream.read(3) | |
| 764 | + if not data: | |
| 765 | + break # end of stream | |
| 766 | + type = ord(data[0]) | |
| 767 | + unknown = ord(data[1]) | |
| 768 | + size = ord(data[2]) | |
| 769 | + data = stream.read(size) | |
| 770 | + | |
| 771 | + log.debug('Record of type {0} unknown part {1} and size {2}: {3}' | |
| 772 | + .format(type, unknown, size, data[:64])) | |
| 773 | + if len(data) != size: | |
| 774 | + log.warning('Stream in {0} does not seem to fit record structure. ' | |
| 775 | + .format(subfile) + | |
| 776 | + '(read {0} bytes but expected {1})' | |
| 777 | + .format(len(data), size)) | |
| 778 | + return [] | |
| 779 | + | |
| 780 | + | |
| 725 | 781 | def process_file(filepath, field_filter_mode=None): |
| 726 | 782 | """ decides which of process_doc/x or process_xls/x to call """ |
| 727 | 783 | if olefile.isOleFile(filepath): |
| ... | ... | @@ -738,6 +794,7 @@ def process_file(filepath, field_filter_mode=None): |
| 738 | 794 | else: |
| 739 | 795 | return process_docx(filepath, field_filter_mode) |
| 740 | 796 | except Exception: |
| 797 | + log.debug('Exception trying to xml-parse file', exc_info=True) | |
| 741 | 798 | return process_docx(filepath, field_filter_mode) |
| 742 | 799 | |
| 743 | 800 | ... | ... |