Commit f47d572ff4c24c0b1c39e578aafccc182b1dec1b
Merge branch 'staaldraad-ddedev'
Showing
1 changed file
with
155 additions
and
19 deletions
oletools/msodde.py
| ... | ... | @@ -46,14 +46,16 @@ from __future__ import print_function |
| 46 | 46 | # CHANGELOG: |
| 47 | 47 | # 2017-10-18 v0.52 PL: - first version |
| 48 | 48 | # 2017-10-20 PL: - fixed issue #202 (handling empty xml tags) |
| 49 | +# 2017-10-23 ES: - add check for fldSimple codes | |
| 50 | +# 2017-10-24 ES: - group tags and track begin/end tags to keep DDE strings together | |
| 49 | 51 | # 2017-10-25 CH: - add json output |
| 50 | 52 | # 2017-10-25 CH: - parse doc |
| 53 | +# PL: - added logging | |
| 51 | 54 | |
| 52 | -__version__ = '0.52dev3' | |
| 55 | +__version__ = '0.52dev4' | |
| 53 | 56 | |
| 54 | 57 | #------------------------------------------------------------------------------ |
| 55 | -# TODO: detect beginning/end of fields, to separate each field | |
| 56 | -# TODO: test if DDE links can also appear in headers, footers and other places | |
| 58 | +# TODO: field codes can be in headers/footers/comments - parse these | |
| 57 | 59 | # TODO: add xlsx support |
| 58 | 60 | |
| 59 | 61 | #------------------------------------------------------------------------------ |
| ... | ... | @@ -74,6 +76,7 @@ import zipfile |
| 74 | 76 | import os |
| 75 | 77 | import sys |
| 76 | 78 | import json |
| 79 | +import logging | |
| 77 | 80 | |
| 78 | 81 | from oletools.thirdparty import olefile |
| 79 | 82 | |
| ... | ... | @@ -86,11 +89,16 @@ if sys.version_info[0] >= 3: |
| 86 | 89 | |
| 87 | 90 | |
# XML namespace of WordprocessingML; qualified names below use the
# '{namespace}localname' form
NS_WORD = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
# when True, unquote() returns field text unchanged (see --nounquote option)
NO_QUOTES = False
# XML tags: 'w:instrText', 'w:fldSimple', 'w:fldChar', 'w:p' and 'w:r'
TAG_W_INSTRTEXT = '{' + NS_WORD + '}instrText'
TAG_W_FLDSIMPLE = '{' + NS_WORD + '}fldSimple'
TAG_W_FLDCHAR = '{' + NS_WORD + '}fldChar'
TAG_W_P = '{' + NS_WORD + '}p'
TAG_W_R = '{' + NS_WORD + '}r'
# XML attributes: 'w:instr' and 'w:fldCharType'
ATTR_W_INSTR = '{' + NS_WORD + '}instr'
ATTR_W_FLDCHARTYPE = '{' + NS_WORD + '}fldCharType'
# members of the zip archive that are scanned for field codes
LOCATIONS = [
    'word/document.xml',
    'word/endnotes.xml',
    'word/footnotes.xml',
    'word/header1.xml',
    'word/footer1.xml',
    'word/header2.xml',
    'word/footer2.xml',
    'word/comments.xml',
]
| 94 | 102 | |
| 95 | 103 | # banner to be printed at program start |
| 96 | 104 | BANNER = """msodde %s - http://decalage.info/python/oletools |
| ... | ... | @@ -104,6 +112,57 @@ BANNER_JSON = dict(type='meta', version=__version__, name='msodde', |
| 104 | 112 | 'Please report any issue at ' |
| 105 | 113 | 'https://github.com/decalage2/oletools/issues') |
| 106 | 114 | |
| 115 | +# === LOGGING ================================================================= | |
| 116 | + | |
# log level name used when -l/--loglevel is not given
DEFAULT_LOG_LEVEL = "warning"
# level names accepted by -l/--loglevel, mapped to the logging constants
LOG_LEVELS = dict(
    debug=logging.DEBUG,
    info=logging.INFO,
    warning=logging.WARNING,
    error=logging.ERROR,
    critical=logging.CRITICAL,
)
| 125 | + | |
class NullHandler(logging.Handler):
    """
    Logging handler that discards every record it receives.

    Attached to the module logger so that nothing is printed when the main
    application has not configured logging.
    logging.NullHandler only exists since Python 2.7, hence this copy for 2.6:
    see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library
    """
    def emit(self, record):
        """Drop the record without producing any output."""
        return None
| 135 | + | |
def get_logger(name, level=logging.CRITICAL+1):
    """
    Return the logger called `name`, set to `level`.

    The root logger is never touched, so that logs of other modules do not
    show up on the screen. A NullHandler is attached only the first time a
    given name is requested: adding it again to an existing logger would
    create duplicate handlers (and doubled messages).
    The default level, CRITICAL+1, disables all logging.

    :param name: name of the logger
    :param level: level to set on the logger (applied on every call)
    :return: logging.Logger object
    """
    # this check must come before getLogger(), which registers the name:
    is_new = name not in logging.Logger.manager.loggerDict
    logger = logging.getLogger(name)
    if is_new:
        # only a NullHandler here, it is up to the application to configure
        # its own logging:
        logger.addHandler(NullHandler())
    logger.setLevel(level)
    return logger
| 161 | + | |
# a global logger object used for debugging; created with get_logger's default
# level (CRITICAL+1), so it stays silent until its level is changed:
log = get_logger('msodde')
| 164 | + | |
| 165 | + | |
| 107 | 166 | # === ARGUMENT PARSING ======================================================= |
| 108 | 167 | |
| 109 | 168 | class ArgParserWithBanner(argparse.ArgumentParser): |
| ... | ... | @@ -127,6 +186,9 @@ def process_args(cmd_line_args=None): |
| 127 | 186 | type=existing_file, metavar='FILE') |
| 128 | 187 | parser.add_argument("--json", '-j', action='store_true', |
| 129 | 188 | help="Output in json format") |
| 189 | + parser.add_argument("--nounquote", help="don't unquote values",action='store_true') | |
| 190 | + parser.add_argument('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, | |
| 191 | + help="logging level debug/info/warning/error/critical (default=warning)") | |
| 130 | 192 | |
| 131 | 193 | return parser.parse_args(cmd_line_args) |
| 132 | 194 | |
| ... | ... | @@ -268,46 +330,107 @@ def process_ole_storage(ole): |
| 268 | 330 | |
| 269 | 331 | |
def process_ole(filepath):
    """
    find dde links in ole file

    like process_openxml, returns a concatenated unicode string of dde links
    or empty if none were found. dde-links will still begin with the dde[auto]
    key word (possibly after some whitespace)

    :param filepath: path of the OLE file, passed to olefile.OleFileIO
    :return: unicode string, the parts returned by process_ole_storage joined
        with newlines
    """
    log.debug('process_ole')
    ole = olefile.OleFileIO(filepath, path_encoding=None)
    try:
        # join before closing, in case the parts are produced lazily
        return u'\n'.join(process_ole_storage(ole))
    finally:
        # release the underlying file even if parsing fails
        ole.close()
| 281 | 345 | |
| 282 | 346 | |
| 283 | -def process_xml(filepath): | |
def process_openxml(filepath):
    """
    find dde links (field codes) in an OpenXML file, i.e. a zip archive

    Each archive member whose name is listed in LOCATIONS is read and parsed
    with process_xml; other members are ignored.

    :param filepath: path of the file, passed to zipfile.ZipFile
    :return: unicode string with all the fields found, joined with newlines
        (empty string if there are none)
    """
    log.debug('process_openxml')
    all_fields = []
    z = zipfile.ZipFile(filepath)
    # try/finally rather than "with": ZipFile is a context manager only
    # since Python 2.7
    try:
        for member in z.namelist():
            if member in LOCATIONS:
                all_fields.extend(process_xml(z.read(member)))
    finally:
        # close the archive even if a member cannot be read or parsed
        z.close()
    return u'\n'.join(all_fields)
| 362 | + | |
def _field_elements(node):
    """
    return the elements to inspect for one direct child of a w:p paragraph:
    all the w:fldChar and w:instrText children (in document order) if the
    node is a w:r run, else the node itself
    """
    if node.tag != TAG_W_R:
        return [node]
    return [child for child in node
            if child.tag in (TAG_W_FLDCHAR, TAG_W_INSTRTEXT)]


def process_xml(data):
    """
    extract field codes from the XML data of one part of a document

    The text of w:instrText elements is grouped using the surrounding
    w:fldChar begin/end markers, so that a field code split into several
    chunks (possibly across paragraphs) is returned as one string. Each chunk
    goes through unquote() first. The w:instr attribute of every w:fldSimple
    element is then appended.

    NOTE: only direct children of w:p elements, and children of w:r runs
    directly under a w:p, are examined.

    :param data: XML content, as accepted by ET.fromstring
    :return: list of strings, one per field found
    """
    # parse the XML data:
    root = ET.fromstring(data)
    fields = []
    ddetext = u''
    # nesting depth of the fields currently open (fields can be nested):
    level = 0

    for para in root.iter(TAG_W_P):
        for node in para:
            # the candidates are computed afresh for each child, so a run
            # without any field element is never mistaken for the previous one
            for elem in _field_elements(node):
                # fldChar can be in a w:r element or floating alone in the w:p
                fldchar_type = elem.attrib.get(ATTR_W_FLDCHARTYPE)
                if fldchar_type == "begin":
                    level += 1
                elif fldchar_type == "end":
                    level -= 1
                    if level <= 0:
                        # outermost field closed; level may be -1 when an
                        # "end" has no matching "begin" (edge case)
                        fields.append(ddetext)
                        ddetext = u''
                        level = 0

                # concatenate the text of the field, if present:
                if elem.tag == TAG_W_INSTRTEXT and elem.text is not None:
                    # expand field code if QUOTED
                    ddetext += unquote(elem.text)

    # a field without its "end" marker must not hide its text:
    if ddetext:
        fields.append(ddetext)

    for elem in root.iter(TAG_W_FLDSIMPLE):
        # the attribute may be missing, do not fail on such elements:
        instr = elem.attrib.get(ATTR_W_INSTR)
        if instr is not None:
            fields.append(instr)

    return fields
| 412 | + | |
def unquote(field):
    """
    expand a QUOTE field code made of character codes

    If the first word of the field is QUOTE (any case), each following word
    that is a decimal number is replaced by the corresponding character, e.g.
    'QUOTE 68 68 69' gives 'DDE'. Words which are not valid character codes
    are kept as they are. Words are concatenated without separator.
    Any other text is returned unchanged, as is everything when the global
    NO_QUOTES is set.

    :param field: text of a field code (or a chunk of it)
    :return: expanded string, or field itself if there is nothing to expand
    """
    if NO_QUOTES:
        return field
    # split into components, on any whitespace:
    parts = field.split()
    # QUOTE must be the field keyword: text merely containing "QUOTE" (such as
    # a file name) must not lose its first word and its spaces
    if not parts or parts[0].upper() != "QUOTE":
        return field
    chars = []
    for p in parts[1:]:
        try:
            ch = chr(int(p))
        except (ValueError, OverflowError):
            # not a number, or a number outside of the range accepted by chr
            ch = p
        chars.append(ch)
    return "".join(chars)
| 303 | 426 | |
| 304 | 427 | |
def process_file(filepath):
    """
    dispatch to process_ole if olefile recognizes the file as OLE,
    else to process_openxml; returns the result of the function called
    """
    handler = process_ole if olefile.isOleFile(filepath) else process_openxml
    return handler(filepath)
| 311 | 434 | |
| 312 | 435 | |
| 313 | 436 | #=== MAIN ================================================================= |
| ... | ... | @@ -321,6 +444,19 @@ def main(cmd_line_args=None): |
| 321 | 444 | """ |
| 322 | 445 | args = process_args(cmd_line_args) |
| 323 | 446 | |
| 447 | + # Setup logging to the console: | |
| 448 | + # here we use stdout instead of stderr by default, so that the output | |
| 449 | + # can be redirected properly. | |
| 450 | + logging.basicConfig(level=LOG_LEVELS[args.loglevel], stream=sys.stdout, | |
| 451 | + format='%(levelname)-8s %(message)s') | |
| 452 | + # enable logging in the modules: | |
| 453 | + log.setLevel(logging.NOTSET) | |
| 454 | + | |
| 455 | + | |
| 456 | + if args.nounquote : | |
| 457 | + global NO_QUOTES | |
| 458 | + NO_QUOTES = True | |
| 459 | + | |
| 324 | 460 | if args.json: |
| 325 | 461 | jout = [] |
| 326 | 462 | jout.append(BANNER_JSON) | ... | ... |