Commit f47d572ff4c24c0b1c39e578aafccc182b1dec1b

Authored by decalage2
2 parents d4b8e77b 9a4e8d90

Merge branch 'staaldraad-ddedev'

Showing 1 changed file with 155 additions and 19 deletions
oletools/msodde.py
@@ -46,14 +46,16 @@ from __future__ import print_function @@ -46,14 +46,16 @@ from __future__ import print_function
46 # CHANGELOG: 46 # CHANGELOG:
47 # 2017-10-18 v0.52 PL: - first version 47 # 2017-10-18 v0.52 PL: - first version
48 # 2017-10-20 PL: - fixed issue #202 (handling empty xml tags) 48 # 2017-10-20 PL: - fixed issue #202 (handling empty xml tags)
  49 +# 2017-10-23 ES: - add check for fldSimple codes
  50 +# 2017-10-24 ES: - group tags and track begin/end tags to keep DDE strings together
49 # 2017-10-25 CH: - add json output 51 # 2017-10-25 CH: - add json output
50 # 2017-10-25 CH: - parse doc 52 # 2017-10-25 CH: - parse doc
  53 +# PL: - added logging
51 54
52 -__version__ = '0.52dev3' 55 +__version__ = '0.52dev4'
53 56
54 #------------------------------------------------------------------------------ 57 #------------------------------------------------------------------------------
55 -# TODO: detect beginning/end of fields, to separate each field  
56 -# TODO: test if DDE links can also appear in headers, footers and other places 58 +# TODO: field codes can be in headers/footers/comments - parse these
57 # TODO: add xlsx support 59 # TODO: add xlsx support
58 60
59 #------------------------------------------------------------------------------ 61 #------------------------------------------------------------------------------
@@ -74,6 +76,7 @@ import zipfile @@ -74,6 +76,7 @@ import zipfile
74 import os 76 import os
75 import sys 77 import sys
76 import json 78 import json
  79 +import logging
77 80
78 from oletools.thirdparty import olefile 81 from oletools.thirdparty import olefile
79 82
@@ -86,11 +89,16 @@ if sys.version_info[0] >= 3: @@ -86,11 +89,16 @@ if sys.version_info[0] >= 3:
86 89
87 90
88 NS_WORD = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main' 91 NS_WORD = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
89 - 92 +NO_QUOTES = False
90 # XML tag for 'w:instrText' 93 # XML tag for 'w:instrText'
91 TAG_W_INSTRTEXT = '{%s}instrText' % NS_WORD 94 TAG_W_INSTRTEXT = '{%s}instrText' % NS_WORD
92 TAG_W_FLDSIMPLE = '{%s}fldSimple' % NS_WORD 95 TAG_W_FLDSIMPLE = '{%s}fldSimple' % NS_WORD
93 -TAG_W_INSTRATTR= '{%s}instr' % NS_WORD 96 +TAG_W_FLDCHAR = '{%s}fldChar' % NS_WORD
  97 +TAG_W_P = "{%s}p" % NS_WORD
  98 +TAG_W_R = "{%s}r" % NS_WORD
  99 +ATTR_W_INSTR = '{%s}instr' % NS_WORD
  100 +ATTR_W_FLDCHARTYPE = '{%s}fldCharType' % NS_WORD
  101 +LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/header1.xml','word/footer1.xml','word/header2.xml','word/footer2.xml','word/comments.xml']
94 102
95 # banner to be printed at program start 103 # banner to be printed at program start
96 BANNER = """msodde %s - http://decalage.info/python/oletools 104 BANNER = """msodde %s - http://decalage.info/python/oletools
@@ -104,6 +112,57 @@ BANNER_JSON = dict(type='meta', version=__version__, name='msodde', @@ -104,6 +112,57 @@ BANNER_JSON = dict(type='meta', version=__version__, name='msodde',
104 'Please report any issue at ' 112 'Please report any issue at '
105 'https://github.com/decalage2/oletools/issues') 113 'https://github.com/decalage2/oletools/issues')
106 114
  115 +# === LOGGING =================================================================
  116 +
  117 +DEFAULT_LOG_LEVEL = "warning" # Default log level
  118 +LOG_LEVELS = {
  119 + 'debug': logging.DEBUG,
  120 + 'info': logging.INFO,
  121 + 'warning': logging.WARNING,
  122 + 'error': logging.ERROR,
  123 + 'critical': logging.CRITICAL
  124 +}
  125 +
  126 +class NullHandler(logging.Handler):
  127 + """
  128 + Log Handler without output, to avoid printing messages if logging is not
  129 + configured by the main application.
  130 + Python 2.7 has logging.NullHandler, but this is necessary for 2.6:
  131 + see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library
  132 + """
  133 + def emit(self, record):
  134 + pass
  135 +
  136 +def get_logger(name, level=logging.CRITICAL+1):
  137 + """
  138 + Create a suitable logger object for this module.
  139 + The goal is not to change settings of the root logger, to avoid getting
  140 + other modules' logs on the screen.
  141 + If a logger exists with same name, reuse it. (Else it would have duplicate
  142 + handlers and messages would be doubled.)
  143 + The level is set to CRITICAL+1 by default, to avoid any logging.
  144 + """
  145 + # First, test if there is already a logger with the same name, else it
  146 + # will generate duplicate messages (due to duplicate handlers):
  147 + if name in logging.Logger.manager.loggerDict:
  148 + #NOTE: another less intrusive but more "hackish" solution would be to
  149 + # use getLogger then test if its effective level is not default.
  150 + logger = logging.getLogger(name)
  151 + # make sure level is OK:
  152 + logger.setLevel(level)
  153 + return logger
  154 + # get a new logger:
  155 + logger = logging.getLogger(name)
  156 + # only add a NullHandler for this logger, it is up to the application
  157 + # to configure its own logging:
  158 + logger.addHandler(NullHandler())
  159 + logger.setLevel(level)
  160 + return logger
  161 +
  162 +# a global logger object used for debugging:
  163 +log = get_logger('msodde')
  164 +
  165 +
107 # === ARGUMENT PARSING ======================================================= 166 # === ARGUMENT PARSING =======================================================
108 167
109 class ArgParserWithBanner(argparse.ArgumentParser): 168 class ArgParserWithBanner(argparse.ArgumentParser):
@@ -127,6 +186,9 @@ def process_args(cmd_line_args=None): @@ -127,6 +186,9 @@ def process_args(cmd_line_args=None):
127 type=existing_file, metavar='FILE') 186 type=existing_file, metavar='FILE')
128 parser.add_argument("--json", '-j', action='store_true', 187 parser.add_argument("--json", '-j', action='store_true',
129 help="Output in json format") 188 help="Output in json format")
  189 + parser.add_argument("--nounquote", help="don't unquote values",action='store_true')
  190 + parser.add_argument('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
  191 + help="logging level debug/info/warning/error/critical (default=warning)")
130 192
131 return parser.parse_args(cmd_line_args) 193 return parser.parse_args(cmd_line_args)
132 194
@@ -268,46 +330,107 @@ def process_ole_storage(ole): @@ -268,46 +330,107 @@ def process_ole_storage(ole):
268 330
269 331
270 def process_ole(filepath): 332 def process_ole(filepath):
271 - """ find dde links in ole file 333 + """
  334 + find dde links in ole file
272 335
273 like process_xml, returns a concatenated unicode string of dde links or 336 like process_xml, returns a concatenated unicode string of dde links or
274 empty if none were found. dde-links will still begin with the dde[auto] key 337 empty if none were found. dde-links will still begin with the dde[auto] key
275 word (possibly after some whitespace) 338 word (possibly after some whitespace)
276 """ 339 """
  340 + log.debug('process_ole')
277 #print('Looks like ole') 341 #print('Looks like ole')
278 ole = olefile.OleFileIO(filepath, path_encoding=None) 342 ole = olefile.OleFileIO(filepath, path_encoding=None)
279 text_parts = process_ole_storage(ole) 343 text_parts = process_ole_storage(ole)
280 return u'\n'.join(text_parts) 344 return u'\n'.join(text_parts)
281 345
282 346
283 -def process_xml(filepath): 347 +def process_openxml(filepath):
  348 + log.debug('process_openxml')
  349 + all_fields = []
284 z = zipfile.ZipFile(filepath) 350 z = zipfile.ZipFile(filepath)
285 - data = z.read('word/document.xml') 351 + for filepath in z.namelist():
  352 + if filepath in LOCATIONS:
  353 + data = z.read(filepath)
  354 + fields = process_xml(data)
  355 + if len(fields) > 0:
  356 + #print ('DDE Links in %s:'%filepath)
  357 + #for f in fields:
  358 + # print(f)
  359 + all_fields.extend(fields)
286 z.close() 360 z.close()
  361 + return u'\n'.join(all_fields)
  362 +
  363 +def process_xml(data):
287 # parse the XML data: 364 # parse the XML data:
288 root = ET.fromstring(data) 365 root = ET.fromstring(data)
289 - text = u''  
290 - # find all the tags 'w:instrText': 366 + fields = []
  367 + ddetext = u''
  368 + level = 0
  369 + # find all the tags 'w:p':
  370 + # parse each for begin and end tags, to group DDE strings
  371 + # fldChar can be in either a w:r element, floating alone in the w:p or spread across w:p tags
  372 + # escape DDE if quoted etc
291 # (each is a chunk of a DDE link) 373 # (each is a chunk of a DDE link)
292 - for elem in root.iter(TAG_W_INSTRTEXT):  
293 - # concatenate the text of the field, if present:  
294 - if elem.text is not None:  
295 - text += elem.text 374 +
  375 + for subs in root.iter(TAG_W_P):
  376 + elem = None
  377 + for e in subs:
  378 + #check if this is a w:r and, if so, parse its child elements to pull out the first FLDCHAR or INSTRTEXT
  379 + if e.tag == TAG_W_R:
  380 + for child in e:
  381 + if child.tag == TAG_W_FLDCHAR or child.tag == TAG_W_INSTRTEXT:
  382 + elem = child
  383 + break
  384 + else:
  385 + elem = e
  386 + #this should be an error condition
  387 + if elem is None:
  388 + continue
  389 +
  390 + #check if FLDCHARTYPE and whether "begin" or "end" tag
  391 + if elem.attrib.get(ATTR_W_FLDCHARTYPE) is not None:
  392 + if elem.attrib[ATTR_W_FLDCHARTYPE] == "begin":
  393 + level += 1
  394 + if elem.attrib[ATTR_W_FLDCHARTYPE] == "end":
  395 + level -= 1
  396 + if level == 0 or level == -1 : # edge-case where level becomes -1
  397 + fields.append(ddetext)
  398 + ddetext = u''
  399 + level = 0 # reset edge-case
  400 +
  401 + # concatenate the text of the field, if present:
  402 + if elem.tag == TAG_W_INSTRTEXT and elem.text is not None:
  403 + #expand field code if QUOTED
  404 + ddetext += unquote(elem.text)
296 405
297 for elem in root.iter(TAG_W_FLDSIMPLE): 406 for elem in root.iter(TAG_W_FLDSIMPLE):
298 # concatenate the attribute of the field, if present: 407 # concatenate the attribute of the field, if present:
299 if elem.attrib is not None: 408 if elem.attrib is not None:
300 - text += elem.attrib[TAG_W_INSTRATTR]  
301 -  
302 - return text 409 + fields.append(elem.attrib[ATTR_W_INSTR])
  410 +
  411 + return fields
  412 +
  413 +def unquote(field):
  414 + if "QUOTE" not in field or NO_QUOTES:
  415 + return field
  416 + #split into components
  417 + parts = field.strip().split(" ")
  418 + ddestr = ""
  419 + for p in parts[1:]:
  420 + try:
  421 + ch = chr(int(p))
  422 + except ValueError:
  423 + ch = p
  424 + ddestr += ch
  425 + return ddestr
303 426
304 427
305 def process_file(filepath): 428 def process_file(filepath):
306 - """ decides to either call process_xml or process_ole """ 429 + """ decides to either call process_openxml or process_ole """
307 if olefile.isOleFile(filepath): 430 if olefile.isOleFile(filepath):
308 return process_ole(filepath) 431 return process_ole(filepath)
309 else: 432 else:
310 - return process_xml(filepath) 433 + return process_openxml(filepath)
311 434
312 435
313 #=== MAIN ================================================================= 436 #=== MAIN =================================================================
@@ -321,6 +444,19 @@ def main(cmd_line_args=None): @@ -321,6 +444,19 @@ def main(cmd_line_args=None):
321 """ 444 """
322 args = process_args(cmd_line_args) 445 args = process_args(cmd_line_args)
323 446
  447 + # Setup logging to the console:
  448 + # here we use stdout instead of stderr by default, so that the output
  449 + # can be redirected properly.
  450 + logging.basicConfig(level=LOG_LEVELS[args.loglevel], stream=sys.stdout,
  451 + format='%(levelname)-8s %(message)s')
  452 + # enable logging in the modules:
  453 + log.setLevel(logging.NOTSET)
  454 +
  455 +
  456 + if args.nounquote :
  457 + global NO_QUOTES
  458 + NO_QUOTES = True
  459 +
324 if args.json: 460 if args.json:
325 jout = [] 461 jout = []
326 jout.append(BANNER_JSON) 462 jout.append(BANNER_JSON)