Commit aca2c7b9328921be0c44e8b9bebdbaa6bd158809
1 parent
c49a5078
Add parser for .doc files (Word 2003 and earlier)
Works with Python 2.7, still have to check with other versions
Showing
1 changed file
with
157 additions
and
4 deletions
oletools/msodde.py
| @@ -6,7 +6,7 @@ msodde is a script to parse MS Office documents | @@ -6,7 +6,7 @@ msodde is a script to parse MS Office documents | ||
| 6 | (e.g. Word, Excel), to detect and extract DDE links. | 6 | (e.g. Word, Excel), to detect and extract DDE links. |
| 7 | 7 | ||
| 8 | Supported formats: | 8 | Supported formats: |
| 9 | -- Word 2007+ (.docx, .dotx, .docm, .dotm) | 9 | +- Word 97-2003 (.doc, .dot), Word 2007+ (.docx, .dotx, .docm, .dotm) |
| 10 | 10 | ||
| 11 | Author: Philippe Lagadec - http://www.decalage.info | 11 | Author: Philippe Lagadec - http://www.decalage.info |
| 12 | License: BSD, see source code or documentation | 12 | License: BSD, see source code or documentation |
| @@ -47,8 +47,9 @@ from __future__ import print_function | @@ -47,8 +47,9 @@ from __future__ import print_function | ||
| 47 | # 2017-10-18 v0.52 PL: - first version | 47 | # 2017-10-18 v0.52 PL: - first version |
| 48 | # 2017-10-20 PL: - fixed issue #202 (handling empty xml tags) | 48 | # 2017-10-20 PL: - fixed issue #202 (handling empty xml tags) |
| 49 | # 2017-10-25 CH: - add json output | 49 | # 2017-10-25 CH: - add json output |
| 50 | +# 2017-10-25 CH: - parse doc | ||
| 50 | 51 | ||
| 51 | -__version__ = '0.52dev2' | 52 | +__version__ = '0.52dev3' |
| 52 | 53 | ||
| 53 | #------------------------------------------------------------------------------ | 54 | #------------------------------------------------------------------------------ |
| 54 | # TODO: detect beginning/end of fields, to separate each field | 55 | # TODO: detect beginning/end of fields, to separate each field |
| @@ -114,9 +115,154 @@ def process_args(): | @@ -114,9 +115,154 @@ def process_args(): | ||
| 114 | 115 | ||
| 115 | return args | 116 | return args |
| 116 | 117 | ||
| 118 | +# from [MS-DOC], section 2.8.25 (PlcFld): | ||
| 119 | +# A field consists of two parts: field instructions and, optionally, a result. All fields MUST begin with | ||
| 120 | +# Unicode character 0x0013 with sprmCFSpec applied with a value of 1. This is the field begin | ||
| 121 | +# character. All fields MUST end with a Unicode character 0x0015 with sprmCFSpec applied with a value | ||
| 122 | +# of 1. This is the field end character. If the field has a result, then there MUST be a Unicode character | ||
| 123 | +# 0x0014 with sprmCFSpec applied with a value of 1 somewhere between the field begin character and | ||
| 124 | +# the field end character. This is the field separator. The field result is the content between the field | ||
| 125 | +# separator and the field end character. The field instructions are the content between the field begin | ||
| 126 | +# character and the field separator, if one is present, or between the field begin character and the field | ||
| 127 | +# end character if no separator is present. The field begin character, field end character, and field | ||
| 128 | +# separator are collectively referred to as field characters. | ||
| 117 | 129 | ||
| 118 | 130 | ||
| 119 | -def process_file(filepath): | 131 | +def process_ole_field(data): |
| 132 | + """ check if field instructions start with DDE | ||
| 133 | + | ||
| 134 | + expects unicode input, returns unicode output (empty if not dde) """ | ||
| 135 | + #print('processing field \'{0}\''.format(data)) | ||
| 136 | + | ||
| 137 | + if data.lstrip().lower().startswith(u'dde'): | ||
| 138 | + #print('--> is DDE!') | ||
| 139 | + return data | ||
| 140 | + else: | ||
| 141 | + return u'' | ||
| 142 | + | ||
| 143 | + | ||
# field characters per [MS-DOC] section 2.8.25 (PlcFld), see comment above
OLE_FIELD_START = 0x13  # field begin character (0x0013 with sprmCFSpec=1)
OLE_FIELD_SEP = 0x14    # field separator between instructions and result
OLE_FIELD_END = 0x15    # field end character
OLE_FIELD_MAX_SIZE = 1000  # max field size to analyze, rest is ignored
| 148 | + | ||
| 149 | + | ||
def process_ole_stream(stream):
    """ find dde links in single ole stream

    Scans the stream byte-wise for the field begin / separator / end
    characters described in [MS-DOC] 2.8.25 and collects the field
    instruction text (the part between begin and separator, or begin and
    end if there is no separator).  Each complete field is handed to
    process_ole_field; the dde links found are concatenated.

    since ole file streams are subclasses of io.BytesIO, they are buffered,
    so reading char-wise is not that bad performance-wise

    :param stream: file-like object opened in binary mode
    :returns: unicode string of concatenated dde links (empty if none)
    """
    have_start = False
    have_sep = False
    field_contents = None
    result_parts = []
    max_size_exceeded = False
    while True:
        char = stream.read(1)  # loop over every single byte
        if len(char) == 0:
            break              # end of stream
        char = ord(char)

        if char == OLE_FIELD_START:
            # a new field begins; contents of an unfinished field (if any)
            # are dismissed
            have_start = True
            have_sep = False
            max_size_exceeded = False
            field_contents = u''
            continue
        elif not have_start:
            continue           # byte outside any field, skip it

        # now we are after start char but not at end yet
        if char == OLE_FIELD_SEP:
            have_sep = True
        elif char == OLE_FIELD_END:
            # have complete field now, process it
            result_parts.append(process_ole_field(field_contents))

            # re-set variables for next field
            have_start = False
            have_sep = False
            field_contents = None
        elif not have_sep:
            # only field instructions (before the separator) matter;
            # guard against accumulating excessive data by accident
            if max_size_exceeded:
                pass
            elif len(field_contents) > OLE_FIELD_MAX_SIZE:
                max_size_exceeded = True

            # appending a raw ascii char to a unicode string here. Not
            # clean but all we do later is check for the sequence 'DDE'.
            # chr() works here on both py2 and py3 since char < 128
            # (the py2-only unichr() broke py3)
            elif char < 128:
                field_contents += chr(char)
            else:
                field_contents += u'?'  # placeholder for non-ascii byte

    # copy behaviour of process_xml: Just concatenate unicode strings
    return u''.join(result_parts)
| 219 | + | ||
| 220 | + | ||
def process_ole_storage(ole):
    """ find dde links in all streams of an open ole file

    :param ole: an olefile.OleFileIO instance
    :returns: list of unicode strings, one entry per stream that
              contained dde links

    note: OleFileIO.listdir already recurses into sub-storages, so every
    stream of the file shows up in the listing; storage entries themselves
    carry no data and are skipped.
    """
    results = []
    for st in ole.listdir(streams=True, storages=True):
        st_type = ole.get_type(st)
        if st_type == olefile.STGTY_STREAM:   # a stream
            stream = None
            links = ''
            try:
                stream = ole.openstream(st)
                links = process_ole_stream(stream)
            finally:
                if stream:
                    stream.close()
            if links:
                results.append(links)
        elif st_type == olefile.STGTY_STORAGE:  # a storage
            # fixed: previous code recursed via process_ole_storage(st),
            # but st is a list of path strings, not an OleFileIO, which
            # raised AttributeError on any file with a sub-storage.
            # listdir is recursive anyway, so the streams inside this
            # storage are already handled by the branch above.
            continue
        else:
            # unexpected entry type; nothing to scan, ignore it
            continue
    return results
| 250 | + | ||
| 251 | + | ||
def process_ole(filepath):
    """ find dde links in ole file

    like process_xml, returns a concatenated unicode string of dde links or
    empty if none were found. dde-links will still begin with the dde[auto]
    keyword (possibly after some whitespace)

    :param filepath: path of the ole file on disk
    """
    ole = olefile.OleFileIO(filepath, path_encoding=None)
    try:
        text_parts = process_ole_storage(ole)
    finally:
        # fixed: the file handle was previously never closed (leak)
        ole.close()
    return u'\n'.join(text_parts)
| 263 | + | ||
| 264 | + | ||
| 265 | +def process_xml(filepath): | ||
| 120 | z = zipfile.ZipFile(filepath) | 266 | z = zipfile.ZipFile(filepath) |
| 121 | data = z.read('word/document.xml') | 267 | data = z.read('word/document.xml') |
| 122 | z.close() | 268 | z.close() |
| @@ -134,11 +280,18 @@ def process_file(filepath): | @@ -134,11 +280,18 @@ def process_file(filepath): | ||
| 134 | # concatenate the attribute of the field, if present: | 280 | # concatenate the attribute of the field, if present: |
| 135 | if elem.attrib is not None: | 281 | if elem.attrib is not None: |
| 136 | text += elem.attrib[TAG_W_INSTRATTR] | 282 | text += elem.attrib[TAG_W_INSTRATTR] |
| 137 | - | ||
| 138 | 283 | ||
| 139 | return text | 284 | return text |
| 140 | 285 | ||
| 141 | 286 | ||
def process_file(filepath):
    """ decides to either call process_xml or process_ole """
    handler = process_ole if olefile.isOleFile(filepath) else process_xml
    return handler(filepath)
| 293 | + | ||
| 294 | + | ||
| 142 | #=== MAIN ================================================================= | 295 | #=== MAIN ================================================================= |
| 143 | 296 | ||
| 144 | def main(): | 297 | def main(): |