Commit f47d572ff4c24c0b1c39e578aafccc182b1dec1b

Authored by decalage2
2 parents d4b8e77b 9a4e8d90

Merge branch 'staaldraad-ddedev'

Showing 1 changed file with 155 additions and 19 deletions
oletools/msodde.py
... ... @@ -46,14 +46,16 @@ from __future__ import print_function
46 46 # CHANGELOG:
47 47 # 2017-10-18 v0.52 PL: - first version
48 48 # 2017-10-20 PL: - fixed issue #202 (handling empty xml tags)
  49 +# 2017-10-23 ES: - add check for fldSimple codes
  50 +# 2017-10-24 ES: - group tags and track begin/end tags to keep DDE strings together
49 51 # 2017-10-25 CH: - add json output
50 52 # 2017-10-25 CH: - parse doc
  53 +# PL: - added logging
51 54  
52   -__version__ = '0.52dev3'
  55 +__version__ = '0.52dev4'
53 56  
54 57 #------------------------------------------------------------------------------
55   -# TODO: detect beginning/end of fields, to separate each field
56   -# TODO: test if DDE links can also appear in headers, footers and other places
  58 +# TODO: field codes can be in headers/footers/comments - parse these
57 59 # TODO: add xlsx support
58 60  
59 61 #------------------------------------------------------------------------------
... ... @@ -74,6 +76,7 @@ import zipfile
74 76 import os
75 77 import sys
76 78 import json
  79 +import logging
77 80  
78 81 from oletools.thirdparty import olefile
79 82  
... ... @@ -86,11 +89,16 @@ if sys.version_info[0] >= 3:
86 89  
87 90  
88 91 NS_WORD = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
89   -
  92 +NO_QUOTES = False
90 93 # XML tag for 'w:instrText'
91 94 TAG_W_INSTRTEXT = '{%s}instrText' % NS_WORD
92 95 TAG_W_FLDSIMPLE = '{%s}fldSimple' % NS_WORD
93   -TAG_W_INSTRATTR= '{%s}instr' % NS_WORD
  96 +TAG_W_FLDCHAR = '{%s}fldChar' % NS_WORD
  97 +TAG_W_P = "{%s}p" % NS_WORD
  98 +TAG_W_R = "{%s}r" % NS_WORD
  99 +ATTR_W_INSTR = '{%s}instr' % NS_WORD
  100 +ATTR_W_FLDCHARTYPE = '{%s}fldCharType' % NS_WORD
  101 +LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/header1.xml','word/footer1.xml','word/header2.xml','word/footer2.xml','word/comments.xml']
94 102  
95 103 # banner to be printed at program start
96 104 BANNER = """msodde %s - http://decalage.info/python/oletools
... ... @@ -104,6 +112,57 @@ BANNER_JSON = dict(type='meta', version=__version__, name='msodde',
104 112 'Please report any issue at '
105 113 'https://github.com/decalage2/oletools/issues')
106 114  
  115 +# === LOGGING =================================================================
  116 +
# Name of the log level used when none is given on the command line
# (it is the default of the --loglevel option).
DEFAULT_LOG_LEVEL = "warning"

# Translation of the level names accepted by --loglevel into the numeric
# levels of the logging module.
LOG_LEVELS = dict(
    debug=logging.DEBUG,
    info=logging.INFO,
    warning=logging.WARNING,
    error=logging.ERROR,
    critical=logging.CRITICAL,
)
  125 +
class NullHandler(logging.Handler):
    """
    Logging handler that discards every record it receives.

    It is attached to this module's logger so that no message is printed
    when the main application has not configured logging itself.
    logging.NullHandler only exists from Python 2.7 onwards, so this local
    equivalent is needed for 2.6, see
    https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library
    """

    def emit(self, record):
        # deliberately a no-op: the record is dropped
        return None
  135 +
def get_logger(name, level=logging.CRITICAL+1):
    """
    Return the logger called `name`, set to `level`.

    The root logger is never touched, so that messages of other modules do
    not show up on screen.
    A logger that is seen for the first time gets a NullHandler; it is up to
    the application to configure real handlers. A logger that already exists
    is reused as is (adding a second handler would double every message) and
    only its level is updated.
    The default level is CRITICAL+1, which disables all logging.

    NOTE: a less intrusive but more "hackish" alternative to looking into the
    logging manager would be to call getLogger and test whether its effective
    level is still the default one.
    """
    # must be checked BEFORE getLogger, which registers the name:
    seen_before = name in logging.Logger.manager.loggerDict
    logger = logging.getLogger(name)
    if not seen_before:
        logger.addHandler(NullHandler())
    logger.setLevel(level)
    return logger
  161 +
  162 +# a global logger object used for debugging:
  163 +log = get_logger('msodde')
  164 +
  165 +
107 166 # === ARGUMENT PARSING =======================================================
108 167  
109 168 class ArgParserWithBanner(argparse.ArgumentParser):
... ... @@ -127,6 +186,9 @@ def process_args(cmd_line_args=None):
127 186 type=existing_file, metavar='FILE')
128 187 parser.add_argument("--json", '-j', action='store_true',
129 188 help="Output in json format")
  189 + parser.add_argument("--nounquote", help="don't unquote values",action='store_true')
  190 + parser.add_argument('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
  191 + help="logging level debug/info/warning/error/critical (default=warning)")
130 192  
131 193 return parser.parse_args(cmd_line_args)
132 194  
... ... @@ -268,46 +330,107 @@ def process_ole_storage(ole):
268 330  
269 331  
def process_ole(filepath):
    """
    find dde links in ole file

    like process_openxml, returns a concatenated unicode string of dde links
    or empty if none were found. dde-links will still begin with the
    dde[auto] key word (possibly after some whitespace)

    :param filepath: path of the OLE file, handed to olefile.OleFileIO
    :returns: the text parts produced by process_ole_storage, joined by
              newlines
    """
    log.debug('process_ole')
    ole = olefile.OleFileIO(filepath, path_encoding=None)
    try:
        # join while the file is still open, in case the parts are produced
        # lazily from the storage
        return u'\n'.join(process_ole_storage(ole))
    finally:
        # always release the file handle, even if parsing fails
        ole.close()
281 345  
282 346  
def process_openxml(filepath):
    """
    find dde links in an OpenXML file (zip archive)

    Each archive member whose name is listed in LOCATIONS is parsed with
    process_xml; the fields found in all of them are collected in archive
    order.

    :param filepath: path of the file, handed to zipfile.ZipFile
    :returns: unicode string with all fields found, separated by newlines,
              or an empty string if none were found
    """
    log.debug('process_openxml')
    all_fields = []
    z = zipfile.ZipFile(filepath)
    # try/finally instead of "with": ZipFile only became a context manager
    # in Python 2.7, and this module still carries a 2.6 workaround.
    try:
        for member in z.namelist():
            if member in LOCATIONS:
                all_fields.extend(process_xml(z.read(member)))
    finally:
        # close the archive even if a part cannot be read or parsed
        z.close()
    return u'\n'.join(all_fields)
  362 +
def _iter_field_elems(paragraph):
    """
    Yield, in document order, the elements of a w:p that may carry field data.

    For a w:r child these are all of its w:fldChar and w:instrText children;
    any other direct child of the paragraph is yielded as is (a fldChar can
    also float alone in the w:p).
    """
    for e in paragraph:
        if e.tag == TAG_W_R:
            for child in e:
                if child.tag == TAG_W_FLDCHAR or child.tag == TAG_W_INSTRTEXT:
                    yield child
        else:
            yield e


def process_xml(data):
    """
    Extract the field codes from one XML part of a Word document.

    Complex fields are delimited by w:fldChar elements of type "begin" and
    "end"; the w:instrText chunks found in between are concatenated (after
    unquote) so that each field is returned as one string. The nesting level
    is tracked across paragraphs, because a field can be spread across
    several w:p tags; a field is only emitted when its outermost "end" is
    reached.
    Simple fields (w:fldSimple) are appended afterwards, using their w:instr
    attribute.

    :param data: XML content, as accepted by ET.fromstring
    :returns: list of field code strings (may contain empty strings for
              fields without instruction text)
    """
    # parse the XML data:
    root = ET.fromstring(data)
    fields = []
    ddetext = u''
    level = 0

    # find all the tags 'w:p' and look at each candidate element in turn.
    # Every element is looked at exactly once: runs without any fldChar or
    # instrText contribute nothing.
    for paragraph in root.iter(TAG_W_P):
        for elem in _iter_field_elems(paragraph):
            # check if FLDCHARTYPE and whether "begin" or "end" tag
            fldchartype = elem.attrib.get(ATTR_W_FLDCHARTYPE)
            if fldchartype == "begin":
                level += 1
            elif fldchartype == "end":
                level -= 1
                # level < 0 is the edge-case of an "end" without "begin":
                # flush what was collected and reset
                if level <= 0:
                    fields.append(ddetext)
                    ddetext = u''
                    level = 0

            # concatenate the text of the field, if present
            # (each is a chunk of a DDE link, expanded if QUOTEd):
            if elem.tag == TAG_W_INSTRTEXT and elem.text is not None:
                ddetext += unquote(elem.text)

    # do not silently drop the text of a field that was never closed:
    if ddetext:
        fields.append(ddetext)

    for elem in root.iter(TAG_W_FLDSIMPLE):
        # the instr attribute may be absent: skip the element in that case
        instr = elem.attrib.get(ATTR_W_INSTR)
        if instr is not None:
            fields.append(instr)

    return fields
  412 +
def unquote(field):
    """
    Expand a QUOTE field code into the text it stands for.

    A field of the form "QUOTE 68 68 69" is turned into "DDE": every
    space-separated argument that is a valid character code is replaced by
    that character, any other argument is kept as is, and the results are
    concatenated.
    The field is returned unchanged if unquoting is disabled (NO_QUOTES) or
    if its first word is not QUOTE; a field that merely contains the word
    somewhere else is not a QUOTE field and must not be altered.

    :param field: text of a field code (or of a chunk of it)
    :returns: the expanded text, or `field` itself if nothing was expanded
    """
    if "QUOTE" not in field or NO_QUOTES:
        return field
    # split into components
    parts = field.strip().split(" ")
    if parts[0] != "QUOTE":
        return field
    chars = []
    for p in parts[1:]:
        try:
            chars.append(chr(int(p)))
        except (ValueError, OverflowError):
            # not a number, or not a valid character code (chr raises
            # OverflowError for very large integers): keep the text
            chars.append(p)
    return "".join(chars)
303 426  
304 427  
def process_file(filepath):
    """
    Pick the parser matching the file type and return its result:
    process_ole if olefile.isOleFile recognises the file, process_openxml
    otherwise.
    """
    handler = process_ole if olefile.isOleFile(filepath) else process_openxml
    return handler(filepath)
311 434  
312 435  
313 436 #=== MAIN =================================================================
... ... @@ -321,6 +444,19 @@ def main(cmd_line_args=None):
321 444 """
322 445 args = process_args(cmd_line_args)
323 446  
  447 + # Setup logging to the console:
  448 + # here we use stdout instead of stderr by default, so that the output
  449 + # can be redirected properly.
  450 + logging.basicConfig(level=LOG_LEVELS[args.loglevel], stream=sys.stdout,
  451 + format='%(levelname)-8s %(message)s')
  452 + # enable logging in the modules:
  453 + log.setLevel(logging.NOTSET)
  454 +
  455 +
  456 + if args.nounquote :
  457 + global NO_QUOTES
  458 + NO_QUOTES = True
  459 +
324 460 if args.json:
325 461 jout = []
326 462 jout.append(BANNER_JSON)
... ...