Commit aca2c7b9328921be0c44e8b9bebdbaa6bd158809

Authored by Christian Herdtweck
1 parent c49a5078

Add parser for .doc files (Word 2003 and earlier)

Works with Python 2.7, still have to check with other versions
Showing 1 changed file with 157 additions and 4 deletions
oletools/msodde.py
... ... @@ -6,7 +6,7 @@ msodde is a script to parse MS Office documents
6 6 (e.g. Word, Excel), to detect and extract DDE links.
7 7  
8 8 Supported formats:
9   -- Word 2007+ (.docx, .dotx, .docm, .dotm)
  9 +- Word 97-2003 (.doc, .dot), Word 2007+ (.docx, .dotx, .docm, .dotm)
10 10  
11 11 Author: Philippe Lagadec - http://www.decalage.info
12 12 License: BSD, see source code or documentation
... ... @@ -47,8 +47,9 @@ from __future__ import print_function
47 47 # 2017-10-18 v0.52 PL: - first version
48 48 # 2017-10-20 PL: - fixed issue #202 (handling empty xml tags)
49 49 # 2017-10-25 CH: - add json output
  50 +# 2017-10-25 CH: - parse doc
50 51  
51   -__version__ = '0.52dev2'
  52 +__version__ = '0.52dev3'
52 53  
53 54 #------------------------------------------------------------------------------
54 55 # TODO: detect beginning/end of fields, to separate each field
... ... @@ -114,9 +115,154 @@ def process_args():
114 115  
115 116 return args
116 117  
  118 +# from [MS-DOC], section 2.8.25 (PlcFld):
  119 +# A field consists of two parts: field instructions and, optionally, a result. All fields MUST begin with
  120 +# Unicode character 0x0013 with sprmCFSpec applied with a value of 1. This is the field begin
  121 +# character. All fields MUST end with a Unicode character 0x0015 with sprmCFSpec applied with a value
  122 +# of 1. This is the field end character. If the field has a result, then there MUST be a Unicode character
  123 +# 0x0014 with sprmCFSpec applied with a value of 1 somewhere between the field begin character and
  124 +# the field end character. This is the field separator. The field result is the content between the field
  125 +# separator and the field end character. The field instructions are the content between the field begin
  126 +# character and the field separator, if one is present, or between the field begin character and the field
  127 +# end character if no separator is present. The field begin character, field end character, and field
  128 +# separator are collectively referred to as field characters.
117 129  
118 130  
119   -def process_file(filepath):
def process_ole_field(data):
    """ check if field instructions start with DDE

    :param data: unicode text of one field's instructions
    :returns: `data` unchanged if it is a DDE field, empty unicode otherwise

    Comparison is case-insensitive and ignores leading whitespace, so it
    matches e.g. u'  DDEAUTO ...' as well as u'dde ...'.
    """
    if data.lstrip().lower().startswith(u'dde'):
        return data
    return u''


# field delimiter characters, see [MS-DOC] section 2.8.25 quoted above
OLE_FIELD_START = 0x13   # field begin character
OLE_FIELD_SEP = 0x14     # separates field instructions from field result
OLE_FIELD_END = 0x15     # field end character
OLE_FIELD_MAX_SIZE = 1000  # max field size to analyze, rest is ignored

# py2/py3 compatibility: unichr only exists in Python 2; Python 3's chr
# already returns a unicode character
try:
    _unichr = unichr
except NameError:
    _unichr = chr


def process_ole_stream(stream):
    """ find dde links in single ole stream

    Scans the stream byte-wise for the field begin/separator/end characters
    and collects the field instruction text (between begin and separator).
    Returns the concatenation of all DDE field instructions found, as a
    unicode string (empty if none).

    since ole file streams are subclasses of io.BytesIO, they are buffered,
    so reading char-wise is not that bad performance-wise """

    have_start = False
    have_sep = False
    field_contents = None
    result_parts = []
    max_size_exceeded = False
    while True:
        char = stream.read(1)     # loop over every single byte
        if len(char) == 0:
            break                 # end of stream
        char = ord(char)

        if char == OLE_FIELD_START:
            # a new field begins; contents of an unterminated previous
            # field (if any) are dismissed
            have_start = True
            have_sep = False
            max_size_exceeded = False
            field_contents = u''
            continue
        elif not have_start:
            continue              # outside any field, skip byte

        # now we are after start char but not at end yet
        if char == OLE_FIELD_SEP:
            have_sep = True
        elif char == OLE_FIELD_END:
            # have complete field now, process it
            result_parts.append(process_ole_field(field_contents))

            # re-set variables for next field
            have_start = False
            have_sep = False
            field_contents = None
        elif not have_sep:
            # still inside the field instructions; make sure the buffer
            # does not grow without bound on malformed input
            if max_size_exceeded:
                pass
            elif len(field_contents) > OLE_FIELD_MAX_SIZE:
                max_size_exceeded = True

            # appending a raw byte to a unicode string here. Not clean but
            # all we do later is check for the ascii-sequence 'DDE' ...
            elif char < 128:
                field_contents += _unichr(char)
            else:
                field_contents += u'?'

    # copy behaviour of process_xml: Just concatenate unicode strings
    return u''.join(result_parts)
  219 +
  220 +
def process_ole_storage(ole):
    """ find dde links in all streams of an ole file

    :param ole: an olefile.OleFileIO instance
    :returns: list of unicode strings, one per stream that contained links

    Note: ole.listdir(streams=True, storages=True) already walks
    sub-storages recursively, so every stream of the file is visited
    exactly once and no explicit recursion into storages is required.
    (The previous code recursed with process_ole_storage(st), but st is
    the entry's path list -- not an OleFileIO -- which raised
    AttributeError on any file containing a storage, e.g. ObjectPool.)
    """
    results = []
    for entry in ole.listdir(streams=True, storages=True):
        entry_type = ole.get_type(entry)
        if entry_type == olefile.STGTY_STREAM:      # a stream
            stream = None
            links = ''
            try:
                stream = ole.openstream(entry)
                links = process_ole_stream(stream)
            finally:
                # always close the stream, even if parsing raised
                if stream:
                    stream.close()
            if links:
                results.append(links)
        elif entry_type == olefile.STGTY_STORAGE:   # a storage
            # streams inside this storage are already part of listdir's
            # output; nothing more to do for the storage entry itself
            continue
        else:
            # unexpected entry type; ignore it
            continue
    return results
  250 +
  251 +
def process_ole(filepath):
    """ find dde links in ole file (Word 97-2003 .doc/.dot)

    like process_xml, returns a concatenated unicode string of dde links or
    empty if none were found. dde-links will still begin with the dde[auto]
    key word (possibly after some whitespace)

    :param filepath: path of the file to parse
    """
    ole = olefile.OleFileIO(filepath, path_encoding=None)
    try:
        text_parts = process_ole_storage(ole)
    finally:
        # previously the OleFileIO (and its underlying file handle) was
        # never closed -- a resource leak when parsing many files
        ole.close()
    return u'\n'.join(text_parts)
  263 +
  264 +
  265 +def process_xml(filepath):
120 266 z = zipfile.ZipFile(filepath)
121 267 data = z.read('word/document.xml')
122 268 z.close()
... ... @@ -134,11 +280,18 @@ def process_file(filepath):
134 280 # concatenate the attribute of the field, if present:
135 281 if elem.attrib is not None:
136 282 text += elem.attrib[TAG_W_INSTRATTR]
137   -
138 283  
139 284 return text
140 285  
141 286  
def process_file(filepath):
    """ dispatch to process_ole or process_xml depending on file format

    :param filepath: path of the file to parse
    :returns: unicode string of dde links found (empty if none)
    """
    if not olefile.isOleFile(filepath):
        # not an ole container --> assume zip-based 2007+ format
        return process_xml(filepath)
    return process_ole(filepath)
  293 +
  294 +
142 295 #=== MAIN =================================================================
143 296  
144 297 def main():
... ...