diff --git a/oletools/msodde.py b/oletools/msodde.py index caa53ce..03d379a 100644 --- a/oletools/msodde.py +++ b/oletools/msodde.py @@ -6,7 +6,7 @@ msodde is a script to parse MS Office documents (e.g. Word, Excel), to detect and extract DDE links. Supported formats: -- Word 2007+ (.docx, .dotx, .docm, .dotm) +- Word 97-2003 (.doc, .dot), Word 2007+ (.docx, .dotx, .docm, .dotm) Author: Philippe Lagadec - http://www.decalage.info License: BSD, see source code or documentation @@ -47,8 +47,9 @@ from __future__ import print_function # 2017-10-18 v0.52 PL: - first version # 2017-10-20 PL: - fixed issue #202 (handling empty xml tags) # 2017-10-25 CH: - add json output +# 2017-10-25 CH: - parse doc -__version__ = '0.52dev2' +__version__ = '0.52dev3' #------------------------------------------------------------------------------ # TODO: detect beginning/end of fields, to separate each field @@ -114,9 +115,154 @@ def process_args(): return args +# from [MS-DOC], section 2.8.25 (PlcFld): +# A field consists of two parts: field instructions and, optionally, a result. All fields MUST begin with +# Unicode character 0x0013 with sprmCFSpec applied with a value of 1. This is the field begin +# character. All fields MUST end with a Unicode character 0x0015 with sprmCFSpec applied with a value +# of 1. This is the field end character. If the field has a result, then there MUST be a Unicode character +# 0x0014 with sprmCFSpec applied with a value of 1 somewhere between the field begin character and +# the field end character. This is the field separator. The field result is the content between the field +# separator and the field end character. The field instructions are the content between the field begin +# character and the field separator, if one is present, or between the field begin character and the field +# end character if no separator is present. The field begin character, field end character, and field +# separator are collectively referred to as field characters. -def process_file(filepath): +def process_ole_field(data): + """ check if field instructions start with DDE + + expects unicode input, returns unicode output (empty if not dde) """ + #print('processing field \'{0}\''.format(data)) + + if data.lstrip().lower().startswith(u'dde'): + #print('--> is DDE!') + return data + else: + return u'' + + +OLE_FIELD_START = 0x13 +OLE_FIELD_SEP = 0x14 +OLE_FIELD_END = 0x15 +OLE_FIELD_MAX_SIZE = 1000 # max field size to analyze, rest is ignored + + +def process_ole_stream(stream): + """ find dde links in single ole stream + + since ole file stream are subclasses of io.BytesIO, they are buffered, so + reading char-wise is not that bad performanc-wise """ + + have_start = False + have_sep = False + field_contents = None + result_parts = [] + max_size_exceeded = False + idx = -1 + while True: + idx += 1 + char = stream.read(1) # loop over every single byte + if len(char) == 0: + break + else: + char = ord(char) + + if char == OLE_FIELD_START: + #print('DEBUG: have start at {}'.format(idx)) + #if have_start: + # print("DEBUG: dismissing previous contents of length {}" + # .format(len(field_contents))) + have_start = True + have_sep = False + max_size_exceeded = False + field_contents = u'' + continue + elif not have_start: + continue + + # now we are after start char but not at end yet + if char == OLE_FIELD_SEP: + #print('DEBUG: have sep at {}'.format(idx)) + have_sep = True + elif char == OLE_FIELD_END: + #print('DEBUG: have end at {}'.format(idx)) + + # have complete field now, process it + result_parts.append(process_ole_field(field_contents)) + + # re-set variables for next field + have_start = False + have_sep = False + field_contents = None + elif not have_sep: + # check that array does not get too long by accident + if max_size_exceeded: + pass + elif len(field_contents) > OLE_FIELD_MAX_SIZE: + #print('DEBUG: exceeded max size') + max_size_exceeded = True + + # appending a raw byte to a unicode string here. Not clean but + # all we do later is check for the ascii-sequence 'DDE' later... + elif char < 128: + field_contents += unichr(char) + #print('DEBUG: at idx {:4d}: add byte {} ({})' + # .format(idx, unichr(char), char)) + else: + field_contents += u'?' + #print('DEBUG: at idx {:4d}: add byte ? ({})' + # .format(idx, char)) + #print('\nstream len = {}'.format(idx)) + + # copy behaviour of process_xml: Just concatenate unicode strings + return u''.join(result_parts) + + +def process_ole_storage(ole): + """ process a "directory" inside an ole stream """ + results = [] + for st in ole.listdir(streams=True, storages=True): + st_type = ole.get_type(st) + if st_type == olefile.STGTY_STREAM: # a stream + stream = None + links = '' + try: + stream = ole.openstream(st) + #print('Checking stream {0}'.format(st)) + links = process_ole_stream(stream) + except: + raise + finally: + if stream: + stream.close() + if links: + results.append(links) + elif st_type == olefile.STGTY_STORAGE: # a storage + #print('Checking storage {0}'.format(st)) + links = process_ole_storage(st) + if links: + results.extend(links) + else: + #print('Warning: unexpected type {0} for entry {1}. Ignore it' + # .format(st_type, st)) + continue + return results + + +def process_ole(filepath): + """ find dde links in ole file + + like process_xml, returns a concatenated unicode string of dde links or + empty if none were found. dde-links will still being with the dde[auto] key + word (possibly after some whitespace) + """ + #print('Looks like ole') + ole = olefile.OleFileIO(filepath, path_encoding=None) + text_parts = process_ole_storage(ole) + return u'\n'.join(text_parts) + + +def process_xml(filepath): z = zipfile.ZipFile(filepath) data = z.read('word/document.xml') z.close() @@ -134,11 +280,18 @@ def process_file(filepath): # concatenate the attribute of the field, if present: if elem.attrib is not None: text += elem.attrib[TAG_W_INSTRATTR] - return text +def process_file(filepath): + """ decides to either call process_xml or process_ole """ + if olefile.isOleFile(filepath): + return process_ole(filepath) + else: + return process_xml(filepath) + + #=== MAIN ================================================================= def main():