Commit caed0b2ad17a5318201fc1bed96746c2ab626346

Authored by decalage2
1 parent 40718f9d

added msodde.py

Showing 1 changed file with 132 additions and 0 deletions
oletools/msodde.py 0 → 100644
  1 +#!/usr/bin/env python
  2 +"""
  3 +msodde.py
  4 +
  5 +msodde is a script to parse MS Office documents
  6 +(e.g. Word, Excel), to detect and extract DDE links.
  7 +
  8 +Supported formats:
  9 +- Word 2007+ (.docx, .dotx, .docm, .dotm)
  10 +
  11 +Author: Philippe Lagadec - http://www.decalage.info
  12 +License: BSD, see source code or documentation
  13 +
  14 +msodde is part of the python-oletools package:
  15 +http://www.decalage.info/python/oletools
  16 +"""
  17 +
  18 +# === LICENSE ==================================================================
  19 +
  20 +# msodde is copyright (c) 2017 Philippe Lagadec (http://www.decalage.info)
  21 +# All rights reserved.
  22 +#
  23 +# Redistribution and use in source and binary forms, with or without modification,
  24 +# are permitted provided that the following conditions are met:
  25 +#
  26 +# * Redistributions of source code must retain the above copyright notice, this
  27 +# list of conditions and the following disclaimer.
  28 +# * Redistributions in binary form must reproduce the above copyright notice,
  29 +# this list of conditions and the following disclaimer in the documentation
  30 +# and/or other materials provided with the distribution.
  31 +#
  32 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  33 +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  34 +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  35 +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  36 +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  37 +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  38 +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  39 +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  40 +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  41 +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  42 +
  43 +from __future__ import print_function
  44 +
  45 +#------------------------------------------------------------------------------
  46 +# CHANGELOG:
  47 +# 2017-10-18 v0.52 PL: - first version
  48 +
  49 +__version__ = '0.52dev1'
  50 +
  51 +#------------------------------------------------------------------------------
  52 +# TODO: detect beginning/end of fields, to separate each field
  53 +# TODO: test if DDE links can also appear in headers, footers and other places
  54 +# TODO: add xlsx support
  55 +
  56 +#------------------------------------------------------------------------------
  57 +# REFERENCES:
  58 +
  59 +
  60 +#--- IMPORTS ------------------------------------------------------------------
  61 +
  62 +# import lxml or ElementTree for XML parsing:
  63 +try:
  64 + # lxml: best performance for XML processing
  65 + import lxml.etree as ET
  66 +except ImportError:
  67 + import xml.etree.cElementTree as ET
  68 +
  69 +import argparse
  70 +import zipfile
  71 +import os
  72 +import sys
  73 +
  74 +
  75 +# === CONSTANTS ==============================================================
  76 +
  77 +
# XML namespace URI bound to the 'w:' prefix in WordprocessingML documents
NS_WORD = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'

# Qualified name of the 'w:instrText' element, written in the
# '{namespace-uri}localname' form expected by ElementTree/lxml lookups
TAG_W_INSTRTEXT = '{' + NS_WORD + '}instrText'
  82 +
  83 +
  84 +# === FUNCTIONS ==============================================================
  85 +
def process_args():
    """
    Parse the command line and validate the file to be analyzed.

    The command line (sys.argv) is parsed with argparse; the only argument
    is the positional "filepath". If that path does not exist, or exists but
    is not a regular file (e.g. a directory), a message is printed and the
    process exits with status 1.

    :return: the argparse namespace, with the attribute 'filepath'
    :raises SystemExit: on invalid arguments (raised by argparse) or when
        filepath is not an existing regular file (status 1)
    """
    parser = argparse.ArgumentParser(description='A python tool to detect and extract DDE links in MS Office files')
    parser.add_argument("filepath", help="path of the file to be analyzed")

    args = parser.parse_args()

    if not os.path.exists(args.filepath):
        print('File {} does not exist.'.format(args.filepath))
        sys.exit(1)

    # A directory passes the existence check but cannot be opened as a
    # document: reject it here instead of failing later with a traceback.
    if not os.path.isfile(args.filepath):
        print('{} is not a file.'.format(args.filepath))
        sys.exit(1)

    return args
  97 +
  98 +
  99 +
def process_file(filepath):
    """
    Extract the field instruction text from a Word 2007+ document.

    The file is opened as a zip archive and its 'word/document.xml' part is
    parsed as XML. The text of every 'w:instrText' element is concatenated
    in document order: each element holds a chunk of a field instruction,
    which is where DDE links are stored.

    :param filepath: path of the Word 2007+ file (zip archive) to analyze
    :return: unicode string, concatenation of the text of all w:instrText
        elements (empty string if there is none)
    :raises KeyError: if the archive has no 'word/document.xml' member
        (raised by ZipFile.read)
    """
    # "with" guarantees the archive is closed even if read() raises
    with zipfile.ZipFile(filepath) as z:
        data = z.read('word/document.xml')
    # parse the XML data:
    root = ET.fromstring(data)
    # find all the tags 'w:instrText' (each is a chunk of a DDE link).
    # An empty element (<w:instrText/>) has text None: skip it, otherwise
    # the concatenation would raise TypeError.
    chunks = [elem.text for elem in root.iter(TAG_W_INSTRTEXT) if elem.text]
    # join once instead of growing the string chunk by chunk
    return u''.join(chunks)
  113 +
  114 +
  115 +#=== MAIN =================================================================
  116 +
def main():
    """
    Command-line entry point.

    Prints the banner (tool name, version and project links), parses the
    command line with process_args(), runs process_file() on the given path
    and prints the extracted text under a 'DDE Links:' heading.
    """
    # banner lines, printed one per line; the last one is a blank separator
    banner = (
        'msodde %s - http://decalage.info/python/oletools' % __version__,
        'THIS IS WORK IN PROGRESS - Check updates regularly!',
        'Please report any issue at https://github.com/decalage2/oletools/issues',
        '',
    )
    for banner_line in banner:
        print(banner_line)

    options = process_args()
    print('Opening file: %s' % options.filepath)
    links = process_file(options.filepath)
    print('DDE Links:')
    print(links)
  129 +
  130 +
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()