Commit a8209d750fe07e3c2dae3883c26c5779b52b75d3

Authored by Philippe Lagadec
Committed by GitHub
2 parents 15743bfd d4b8e77b

Merge branch 'master' into ddedev

oletools/msodde.py
... ... @@ -6,7 +6,7 @@ msodde is a script to parse MS Office documents
6 6 (e.g. Word, Excel), to detect and extract DDE links.
7 7  
8 8 Supported formats:
9   -- Word 2007+ (.docx, .dotx, .docm, .dotm)
  9 +- Word 97-2003 (.doc, .dot), Word 2007+ (.docx, .dotx, .docm, .dotm)
10 10  
11 11 Author: Philippe Lagadec - http://www.decalage.info
12 12 License: BSD, see source code or documentation
... ... @@ -46,14 +46,14 @@ from __future__ import print_function
46 46 # CHANGELOG:
47 47 # 2017-10-18 v0.52 PL: - first version
48 48 # 2017-10-20 PL: - fixed issue #202 (handling empty xml tags)
49   -# 2017-10-23 PL: - add check for fldSimple codes
50   -# 2017-10-24 PL: - group tags and track begin/end tags to keep DDE strings together
  49 +# 2017-10-23 ES: - add check for fldSimple codes
  50 +# 2017-10-24 ES: - group tags and track begin/end tags to keep DDE strings together
  51 +# 2017-10-25 CH: - add json output
  52 +# 2017-10-25 CH: - parse doc
51 53  
52   -__version__ = '0.52dev2'
  54 +__version__ = '0.52dev4'
53 55  
54 56 #------------------------------------------------------------------------------
55   -# TODO: detect beginning/end of fields, to separate each field
56   -# TODO: test if DDE links can also appear in headers, footers and other places
57 57 # TODO: field codes can be in headers/footers/comments - parse these
58 58 # TODO: add xlsx support
59 59  
... ... @@ -74,7 +74,14 @@ import argparse
74 74 import zipfile
75 75 import os
76 76 import sys
  77 +import json
77 78  
  79 +from oletools.thirdparty import olefile
  80 +
# === PYTHON 2+3 SUPPORT ======================================================

if sys.version_info[0] >= 3:
    # Python 3 has no builtin unichr(); alias it to chr() so that the rest of
    # this module can call unichr() under both major versions
    unichr = chr
78 85  
79 86 # === CONSTANTS ==============================================================
80 87  
... ... @@ -91,24 +98,212 @@ ATTR_W_INSTR = '{%s}instr' % NS_WORD
# fully qualified (namespaced) name of the fldCharType attribute
ATTR_W_FLDCHARTYPE = '{%s}fldCharType' % NS_WORD
# names of the zip archive members that process_openxml passes to process_xml;
# all other members of the archive are ignored
LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/header1.xml','word/footer1.xml','word/header2.xml','word/footer2.xml','word/comments.xml']
93 100  
94   -# === FUNCTIONS ==============================================================
95   -
96   -def process_args():
97   - parser = argparse.ArgumentParser(description='A python tool to detect and extract DDE links in MS Office files')
98   - parser.add_argument("filepath", help="path of the file to be analyzed")
# banner to be printed at program start (text output mode) and before
# argument parser errors (see ArgParserWithBanner)
BANNER = """msodde %s - http://decalage.info/python/oletools
THIS IS WORK IN PROGRESS - Check updates regularly!
Please report any issue at https://github.com/decalage2/oletools/issues
""" % __version__

# the same information as a dict; main() uses it as the first element of the
# list that is dumped in json output mode
BANNER_JSON = dict(type='meta', version=__version__, name='msodde',
                   link='http://decalage.info/python/oletools',
                   message='THIS IS WORK IN PROGRESS - Check updates regularly! '
                           'Please report any issue at '
                           'https://github.com/decalage2/oletools/issues')
  112 +
  113 +# === ARGUMENT PARSING =======================================================
  114 +
class ArgParserWithBanner(argparse.ArgumentParser):
    """ Argument parser that shows the program banner ahead of any error """

    def error(self, message):
        """ print BANNER to stdout, then hand over to the regular handler

        The inherited argparse handler reports `message` together with the
        usage text and terminates the program.
        """
        print(BANNER)
        super(ArgParserWithBanner, self).error(message)
  120 +
  121 +
def existing_file(filename):
    """ argparse `type` callback: accept only paths that exist

    Returns `filename` unchanged if os.path.exists() is true for it, raises
    argparse.ArgumentTypeError otherwise (argparse turns that into a regular
    command line error).
    """
    if os.path.exists(filename):
        return filename
    error_text = 'File {0} does not exist.'.format(filename)
    raise argparse.ArgumentTypeError(error_text)
  128 +
  129 +
  130 +def process_args(cmd_line_args=None):
  131 + parser = ArgParserWithBanner(description='A python tool to detect and extract DDE links in MS Office files')
  132 + parser.add_argument("filepath", help="path of the file to be analyzed",
  133 + type=existing_file, metavar='FILE')
  134 + parser.add_argument("--json", '-j', action='store_true',
  135 + help="Output in json format")
99 136 parser.add_argument("--nounquote", help="don't unquote values",action='store_true')
100   - args = parser.parse_args()
101 137  
102   - if not os.path.exists(args.filepath):
103   - print('File {} does not exist.'.format(args.filepath))
104   - sys.exit(1)
105   -
106   - return args
  138 + return parser.parse_args(cmd_line_args)
107 139  
108 140  
  141 +# === FUNCTIONS ==============================================================
109 142  
110   -def process_file(data):
111   -
  143 +# from [MS-DOC], section 2.8.25 (PlcFld):
  144 +# A field consists of two parts: field instructions and, optionally, a result. All fields MUST begin with
  145 +# Unicode character 0x0013 with sprmCFSpec applied with a value of 1. This is the field begin
  146 +# character. All fields MUST end with a Unicode character 0x0015 with sprmCFSpec applied with a value
  147 +# of 1. This is the field end character. If the field has a result, then there MUST be a Unicode character
  148 +# 0x0014 with sprmCFSpec applied with a value of 1 somewhere between the field begin character and
  149 +# the field end character. This is the field separator. The field result is the content between the field
  150 +# separator and the field end character. The field instructions are the content between the field begin
  151 +# character and the field separator, if one is present, or between the field begin character and the field
  152 +# end character if no separator is present. The field begin character, field end character, and field
  153 +# separator are collectively referred to as field characters.
  154 +
  155 +
def process_ole_field(data):
    """ return field instructions if they start with DDE, else empty string

    The check ignores leading whitespace and letter case, so both "DDE" and
    "DDEAUTO" instructions are recognized. The instructions are returned
    unmodified (including leading whitespace).

    expects unicode input, returns unicode output (empty if not dde) """
    instructions = data.lstrip().lower()
    is_dde = instructions.startswith(u'dde')
    return data if is_dde else u''
  167 +
  168 +
# field characters as described in [MS-DOC], section 2.8.25 (PlcFld)
OLE_FIELD_START = 0x13      # field begin character
OLE_FIELD_SEP = 0x14        # field separator: instructions before, result after
OLE_FIELD_END = 0x15        # field end character
OLE_FIELD_MAX_SIZE = 1000   # max field size to analyze, rest is ignored
  173 +
  174 +
def process_ole_stream(stream):
    """ find dde links in a single ole stream

    Scans the stream byte by byte for fields, i.e. sequences delimited by the
    bytes OLE_FIELD_START and OLE_FIELD_END. The bytes between the start and
    the optional OLE_FIELD_SEP (or the end, if there is no separator) are
    collected as field instructions and handed to process_ole_field.

    `stream` only needs a read(size) method that returns bytes. The original
    author notes that ole file streams are subclasses of io.BytesIO, so they
    are buffered and reading char-wise is not that bad performance-wise.

    Returns a unicode string: the results of process_ole_field for all
    completed fields, concatenated without separator (empty if no field
    started with DDE).
    """

    have_start = False          # True while inside a field
    have_sep = False            # True after the separator of current field
    field_contents = None       # instructions of current field (unicode)
    result_parts = []
    max_size_exceeded = False   # True if current field was truncated
    idx = -1                    # byte offset, only used by debug output
    while True:
        idx += 1
        char = stream.read(1)    # loop over every single byte
        if len(char) == 0:
            # end of stream; an unterminated field is silently dropped
            break
        else:
            char = ord(char)

        if char == OLE_FIELD_START:
            # a start byte always begins a new field, even if the previous
            # one was not finished; previous contents are discarded
            #print('DEBUG: have start at {}'.format(idx))
            #if have_start:
            #    print("DEBUG: dismissing previous contents of length {}"
            #          .format(len(field_contents)))
            have_start = True
            have_sep = False
            max_size_exceeded = False
            field_contents = u''
            continue
        elif not have_start:
            # outside of any field: ignore this byte
            continue

        # now we are after start char but not at end yet
        if char == OLE_FIELD_SEP:
            #print('DEBUG: have sep at {}'.format(idx))
            have_sep = True
        elif char == OLE_FIELD_END:
            #print('DEBUG: have end at {}'.format(idx))

            # have complete field now, process it
            result_parts.append(process_ole_field(field_contents))

            # re-set variables for next field
            have_start = False
            have_sep = False
            field_contents = None
        elif not have_sep:
            # still in the instructions part (bytes after the separator
            # belong to the field result and are not collected)

            # check that array does not get too long by accident
            if max_size_exceeded:
                pass
            elif len(field_contents) > OLE_FIELD_MAX_SIZE:
                #print('DEBUG: exceeded max size')
                max_size_exceeded = True

            # appending a raw byte to a unicode string here. Not clean but
            # all we do later is check for the ascii-sequence 'DDE'...
            elif char < 128:
                field_contents += unichr(char)
                #print('DEBUG: at idx {:4d}: add byte {} ({})'
                #      .format(idx, unichr(char), char))
            else:
                # non-ascii byte: replaced by a placeholder
                field_contents += u'?'
                #print('DEBUG: at idx {:4d}: add byte ? ({})'
                #      .format(idx, char))
    #print('\nstream len = {}'.format(idx))

    # copy behaviour of process_xml: Just concatenate unicode strings
    return u''.join(result_parts)
  244 +
  245 +
def process_ole_storage(ole):
    """ find dde links in all streams of an open ole file

    :param ole: an open olefile.OleFileIO object
    :returns: list of unicode strings, one for each stream for which
              process_ole_stream returned a non-empty result

    Exceptions raised while opening or parsing a stream are not caught here;
    the stream is closed in any case.
    """
    results = []
    # listdir() returns the full path of every entry of the file, including
    # entries nested in sub-storages. Therefore there is no need to recurse
    # into storages: their streams show up in this list by themselves.
    for entry in ole.listdir(streams=True, storages=True):
        if ole.get_type(entry) != olefile.STGTY_STREAM:
            # storage (handled through its streams) or unexpected entry type
            #print('Skipping non-stream entry {0}'.format(entry))
            continue

        #print('Checking stream {0}'.format(entry))
        stream = ole.openstream(entry)
        try:
            links = process_ole_stream(stream)
        finally:
            stream.close()
        if links:
            results.append(links)
    return results
  275 +
  276 +
def process_ole(filepath):
    """
    find dde links in ole file

    :param filepath: name of the ole file to open
    :returns: unicode string with the results of process_ole_storage joined
              by newlines; empty if no dde links were found. dde-links will
              still begin with the dde[auto] key word (possibly after some
              whitespace)

    The ole file is closed again before returning, also in case of an error.
    """
    #print('Looks like ole')
    ole = olefile.OleFileIO(filepath, path_encoding=None)
    try:
        text_parts = process_ole_storage(ole)
    finally:
        # release the underlying file handle
        ole.close()
    return u'\n'.join(text_parts)
  289 +
  290 +
def process_openxml(filepath):
    """ find dde links in an openxml (zip-based) file

    :param filepath: name of the zip file to open
    :returns: unicode string with all fields that process_xml returned for
              the archive members listed in LOCATIONS, joined by newlines
              (empty if there are none)
    :raises: whatever zipfile.ZipFile raises if filepath is not a valid zip
             archive (e.g. zipfile.BadZipfile)
    """
    all_fields = []
    # use the function argument here; the command line args are local to main
    z = zipfile.ZipFile(filepath)
    try:
        for member in z.namelist():
            if member in LOCATIONS:
                data = z.read(member)
                #print ('DDE Links in %s:' % member)
                all_fields.extend(process_xml(data))
    finally:
        # close archive even if reading or parsing a member fails
        z.close()
    return u'\n'.join(all_fields)
  305 +
  306 +def process_xml(data):
112 307 # parse the XML data:
113 308 root = ET.fromstring(data)
114 309 fields = []
... ... @@ -173,32 +368,63 @@ def unquote(field):
173 368 return ddestr
174 369  
175 370  
def process_file(filepath):
    """ dispatch to process_ole or process_openxml, depending on file type

    Files that olefile recognizes as ole files are handled by process_ole,
    everything else is passed to process_openxml. The result of the chosen
    function is returned unchanged.
    """
    handler = process_ole if olefile.isOleFile(filepath) else process_openxml
    return handler(filepath)
  377 +
  378 +
176 379 #=== MAIN =================================================================
177 380  
178   -def main():
179   - # print banner with version
180   - print ('msodde %s - http://decalage.info/python/oletools' % __version__)
181   - print ('THIS IS WORK IN PROGRESS - Check updates regularly!')
182   - print ('Please report any issue at https://github.com/decalage2/oletools/issues')
183   - print ('')
  381 +def main(cmd_line_args=None):
  382 + """ Main function, called if this file is called as a script
  383 +
  384 + Optional argument: command line arguments to be forwarded to ArgumentParser
  385 + in process_args. Per default (cmd_line_args=None), sys.argv is used. Option
  386 + mainly added for unit-testing
  387 + """
  388 + args = process_args(cmd_line_args)
184 389  
185   - args = process_args()
186   - print('Opening file: %s' % args.filepath)
187 390 if args.nounquote :
188 391 global NO_QUOTES
189 392 NO_QUOTES = True
190   - z = zipfile.ZipFile(args.filepath)
191   - for filepath in z.namelist():
192   - if filepath in LOCATIONS:
193   - data = z.read(filepath)
194   - fields = process_file(data)
195   - if len(fields) > 0:
196   - print ('DDE Links in %s:'%filepath)
197   - for f in fields:
198   - print(f)
199   - z.close()
200   -
  393 +
  394 + if args.json:
  395 + jout = []
  396 + jout.append(BANNER_JSON)
  397 + else:
  398 + # print banner with version
  399 + print(BANNER)
  400 +
  401 + if not args.json:
  402 + print('Opening file: %s' % args.filepath)
  403 +
  404 + text = ''
  405 + return_code = 1
  406 + try:
  407 + text = process_file(args.filepath)
  408 + return_code = 0
  409 + except Exception as exc:
  410 + if args.json:
  411 + jout.append(dict(type='error', error=type(exc).__name__,
  412 + message=str(exc))) # strange: str(exc) is enclosed in ""
  413 + else:
  414 + raise
  415 +
  416 + if args.json:
  417 + for line in text.splitlines():
  418 + jout.append(dict(type='dde-link', link=line.strip()))
  419 + json.dump(jout, sys.stdout, check_circular=False, indent=4)
  420 + print() # add a newline after closing "]"
  421 + return return_code # required if we catch an exception in json-mode
  422 + else:
  423 + print ('DDE Links:')
  424 + print(text)
  425 +
  426 + return return_code
201 427  
202 428  
203 429 if __name__ == '__main__':
204   - main()
  430 + sys.exit(main())
... ...
tests/howto_add_unittests.txt 0 → 100644
  1 +Howto: Add unittests
  2 +--------------------
  3 +
  4 +For helping python's unittest to discover your tests, do the
  5 +following:
  6 +
  7 +* create a subdirectory within oletools/tests/
  8 + - The directory name must be a valid python package name,
  9 + so it must not contain characters such as '-'
  10 + - e.g. oletools/tests/my_feature
  11 +
  12 +* Create a __init__.py inside that directory
  13 + - can be empty but must be there
  14 +
  15 +* Copy the unittest_template.py into your test directory
  16 +
  17 +* Rename your copy of the template to fit its purpose
  18 + - file name must start with 'test' and end with '.py'
  19 + - e.g. oletools/tests/my_feature/test_bla.py
  20 +
  21 +* Create python code inside that directory
  22 + - class names must start with Test and classes must be subclasses
  23 + of unittest.TestCase
  24 + - test functions inside your test cases must start with test_
  25 + - see unittest_template.py for examples
  26 +
  27 +* If your unit test requires test files, put them into a subdir
  28 + of oletools/tests/test-data with some name that clarifies what
  29 + tests it belongs to
  30 + - e.g. oletools/tests/test-data/my_feature/example.doc
  31 + - Do not add files with actual evil malware macros! Only harmless
  32 + test data!
  33 +
  34 +* Test that unittests work by running from the oletools base dir:
  35 + python -m unittest discover -v
  36 +
  37 +* Re-test with python2 and python3 (if possible)
... ...
tests/msodde_doc/__init__.py 0 → 100644
tests/msodde_doc/test_basic.py 0 → 100644
  1 +""" Test some basic behaviour of msodde.py
  2 +
  3 +Ensure that
  4 +- doc and docx are read without error
  5 +- garbage returns error return status
  6 +- dde-links are found where appropriate
  7 +"""
  8 +
  9 +from __future__ import print_function
  10 +
  11 +import unittest
  12 +from oletools import msodde
  13 +import shlex
  14 +from os.path import join, dirname, normpath
  15 +import sys
  16 +
  17 +# python 2/3 version conflict:
  18 +if sys.version_info.major <= 2:
  19 + from StringIO import StringIO
  20 + #from io import BytesIO as StringIO - try if print() gives UnicodeError
  21 +else:
  22 + from io import StringIO
  23 +
  24 +
# base directory for test input; built relative to the location of this file,
# so it does not depend on the current working directory
BASE_DIR = normpath(join(dirname(__file__), '..', 'test-data'))
  27 +
  28 +
class TestReturnCode(unittest.TestCase):
    """ check exit status of msodde.main for valid and invalid input """

    def test_valid_doc(self):
        """ check that a valid doc file leads to 0 exit status """
        self.do_test_validity(
            [join(BASE_DIR, 'msodde-doc', 'test_document.doc')])

    def test_valid_docx(self):
        """ check that a valid docx file leads to 0 exit status """
        self.do_test_validity(
            [join(BASE_DIR, 'msodde-doc', 'test_document.docx')])

    def test_invalid_none(self):
        """ check that no file argument leads to non-zero exit status """
        self.do_test_validity([], True)

    def test_invalid_empty(self):
        """ check that empty file argument leads to non-zero exit status """
        self.do_test_validity([join(BASE_DIR, 'basic', 'empty')], True)

    def test_invalid_text(self):
        """ check that text file argument leads to non-zero exit status """
        self.do_test_validity([join(BASE_DIR, 'basic', 'text')], True)

    def do_test_validity(self, args, expect_error=False):
        """ helper for the test_* functions: run msodde.main, check status

        :param args: command line arguments for msodde.main. Preferably a
                     list; a string is still accepted and split with shlex
                     (which mangles backslashes and spaces in file names, so
                     do not use that for paths)
        :param expect_error: True if msodde.main is expected to raise an
                             exception or to give a non-zero exit status
        """
        if not isinstance(args, (list, tuple)):
            args = shlex.split(args)
        return_code = -1
        have_exception = False
        try:
            return_code = msodde.main(args)
        except Exception:
            have_exception = True
        except SystemExit as se:     # sys.exit() was called
            return_code = se.code
            if se.code is None:      # sys.exit() without argument is success
                return_code = 0

        self.assertEqual(expect_error, have_exception or (return_code != 0))
  67 +
  68 +
class OutputCapture(object):
    """ context manager that captures stdout

    Everything written to sys.stdout inside the with-block is collected in
    self.output. On exit, sys.stdout is set back to whatever it was when the
    block was entered (which is not necessarily sys.__stdout__, e.g. if a
    test runner or an outer OutputCapture replaced it).

    Iterating over the object yields the captured lines without trailing
    whitespace.
    """

    def __init__(self):
        # py2: StringIO.StringIO, py3: io.StringIO (see imports of this file)
        self.output = StringIO()
        self._orig_stdout = None     # set in __enter__

    def __enter__(self):
        self._orig_stdout = sys.stdout
        sys.stdout = self.output
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        sys.stdout = self._orig_stdout    # re-set to original

        if exc_type:   # there has been an error
            print('Got error during output capture!')
            print('Print captured output and re-raise:')
            for line in self.output.getvalue().splitlines():
                print(line.rstrip())  # print output before re-raising
        # returning None (false) lets python re-raise the exception

    def __iter__(self):
        for line in self.output.getvalue().splitlines():
            yield line.rstrip()   # remove newline at end of line
  91 +
  92 +
class TestDdeInDoc(unittest.TestCase):
    """ check that msodde.main prints dde links for doc files on stdout """

    def _last_output_line(self, filename):
        """ run msodde.main on a file in msodde-doc, return last output line

        Fails the test (instead of raising a NameError or IndexError) if
        msodde.main did not print anything at all.
        """
        with OutputCapture() as capturer:
            msodde.main([join(BASE_DIR, 'msodde-doc', filename)])

        lines = list(capturer)
        self.assertTrue(lines, 'msodde.main did not print anything')
        return lines[-1]

    def test_with_dde(self):
        """ check that dde links appear on stdout """
        last_line = self._last_output_line('dde-test.doc')
        self.assertNotEqual(len(last_line.strip()), 0)

    def test_no_dde(self):
        """ check that no dde links appear on stdout """
        last_line = self._last_output_line('test_document.doc')
        self.assertEqual(last_line.strip(), '')
  116 +
  117 +
  118 +if __name__ == '__main__':
  119 + unittest.main()
... ...
tests/test-data/basic/empty 0 → 100644
tests/test-data/basic/text 0 → 100644
  1 +bla
... ...
tests/test-data/msodde-doc/dde-test.doc 0 → 100644
No preview for this file type
tests/test-data/msodde-doc/test_document.doc 0 → 100644
No preview for this file type
tests/test-data/msodde-doc/test_document.docx 0 → 100644
No preview for this file type
tests/unittest_template.py 0 → 100644
  1 +""" Test my new feature
  2 +
  3 +Some more info if you want
  4 +
  5 +Should work with python2 and python3!
  6 +"""
  7 +
  8 +import unittest
  9 +
  10 +# if you need data from oletools/test-data/DIR/, uncomment these lines:
  11 +#from os.path import join, dirname, normpath
  12 +#Directory with test data, independent of current working directory
  13 +#DATA_DIR = normpath(join(dirname(__file__), '..', 'test-data', 'DIR'))
  14 +
  15 +
class TestMyFeature(unittest.TestCase):
    """ Template test case: replace with tests for your new feature """

    def test_this(self):
        """ verify one aspect of the feature """
        pass  # your code here

    def test_that(self):
        """ verify another aspect of the feature """
        pass  # your code here

    def helper_function(self, filename):
        """ shared code for several test functions, avoids copy-and-paste

        The name does not start with test_, so unittest does not run this by
        itself; call it from your test functions. """
        pass  # your code here
        # e.g.: msodde.main([join(DATA_DIR, filename)])
  33 +
  34 +
  35 +# just in case somebody calls this file as a script
  36 +if __name__ == '__main__':
  37 + unittest.main()
... ...