Commit 2142e7a635947a528525091822b00e24ed20ad2d

Authored by Christian Herdtweck
Committed by Philippe Lagadec
1 parent 399a1154

msodde: Create a blacklist of harmless commands

This is based on the OpenXML specification (ISO 29500-1:2016 / ECMA-376)
Showing 1 changed file with 161 additions and 0 deletions
oletools/msodde.py
... ... @@ -77,6 +77,7 @@ import os
77 77 import sys
78 78 import json
79 79 import logging
  80 +import re
80 81  
81 82 # little hack to allow absolute imports even if oletools is not installed
82 83 # Copied from olevba.py
... ... @@ -107,6 +108,79 @@ ATTR_W_INSTR = '{%s}instr' % NS_WORD
107 108 ATTR_W_FLDCHARTYPE = '{%s}fldCharType' % NS_WORD
108 109 LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/header1.xml','word/footer1.xml','word/header2.xml','word/footer2.xml','word/comments.xml']
109 110  
  111 +# list of acceptable, harmless field instructions for blacklist field mode
  112 +# cf. http://officeopenxml.com/WPfieldInstructions.php or the official
  113 +# standard ISO-29500-1:2016 / ECMA-376 paragraphs 17.16.4, 17.16.5, 17.16.23
  114 +# https://www.iso.org/standard/71691.html (neither mentions DDE[AUTO]).
  115 +# Format: (command, n_required_args, n_optional_args, switches_with_args,
  116 +# switches_without_args, format_switches); each switch is one character
  117 +FIELD_BLACKLIST = (
  118 + # date and time:
  119 + ('CREATEDATE', 0, 0, '', 'hs', 'datetime'),
  120 + ('DATE', 0, 0, '', 'hls', 'datetime'),
  121 + ('EDITTIME', 0, 0, '', '', 'numeric'),
  122 + ('PRINTDATE', 0, 0, '', 'hs', 'datetime'),
  123 + ('SAVEDATE', 0, 0, '', 'hs', 'datetime'),
  124 + ('TIME', 0, 0, '', '', 'datetime'),
  125 + # exclude document automation (we hate the "auto" in "automation")
  126 + # (COMPARE, DOCVARIABLE, GOTOBUTTON, IF, MACROBUTTON, PRINT)
  127 + # document information
  128 + ('AUTHOR', 0, 1, '', '', 'string'),
  129 + ('COMMENTS', 0, 1, '', '', 'string'),
  130 + ('DOCPROPERTY', 1, 0, '', '', 'string/numeric/datetime'),
  131 + ('FILENAME', 0, 0, '', 'p', 'string'),
  132 + ('FILESIZE', 0, 0, '', 'km', 'numeric'),
  133 + ('KEYWORDS', 0, 1, '', '', 'string'),
  134 + ('LASTSAVEDBY', 0, 0, '', '', 'string'),
  135 + ('NUMCHARS', 0, 0, '', '', 'numeric'),
  136 + ('NUMPAGES', 0, 0, '', '', 'numeric'),
  137 + ('NUMWORDS', 0, 0, '', '', 'numeric'),
  138 + ('SUBJECT', 0, 1, '', '', 'string'),
  139 + ('TEMPLATE', 0, 0, '', 'p', 'string'),
  140 + ('TITLE', 0, 1, '', '', 'string'),
  141 + # equations and formulas
  142 + # exclude '=' formulae because they have different syntax
  143 + ('ADVANCE', 0, 0, 'dlruxy', '', ''),
  144 + ('SYMBOL', 1, 0, 'fs', 'ahju', ''),
  145 + # form fields
  146 + ('FORMCHECKBOX', 0, 0, '', '', ''),
  147 + ('FORMDROPDOWN', 0, 0, '', '', ''),
  148 + ('FORMTEXT', 0, 0, '', '', ''),
  149 + # index and tables
  150 + ('INDEX', 0, 0, 'bcdefghklpsz', 'ry', ''),
  151 + # exclude RD since that imports data from other files
  152 + ('TA', 0, 0, 'clrs', 'bi', ''),
  153 + ('TC', 1, 0, 'fl', 'n', ''),
  154 + ('TOA', 0, 0, 'bcdegls', 'fhp', ''),
  155 + ('TOC', 0, 0, 'abcdflnopst', 'huwxz', ''),
  156 + ('XE', 1, 0, 'frty', 'bi', ''),
  157 + # links and references
  158 + # exclude AUTOTEXT and AUTOTEXTLIST since we do not like stuff with 'AUTO'
  159 + ('BIBLIOGRAPHY', 0, 0, 'lfm', '', ''),
  160 + ('CITATION', 1, 0, 'lfspvm', 'nty', ''),
  161 + # exclude HYPERLINK since we are allergic to URLs
  162 + # exclude INCLUDEPICTURE and INCLUDETEXT (other file or maybe even URL?)
  163 + # exclude LINK and REF (could reference other files)
  164 + ('NOTEREF', 1, 0, '', 'fhp', ''),
  165 + ('PAGEREF', 1, 0, '', 'hp', ''),
  166 + ('QUOTE', 1, 0, '', '', 'datetime'),
  167 + ('STYLEREF', 1, 0, '', 'lnprtw', ''),
  168 + # exclude all Mail Merge commands since they import data from other files
  169 + # (ADDRESSBLOCK, ASK, COMPARE, DATABASE, FILLIN, GREETINGLINE, IF,
  170 + # MERGEFIELD, MERGEREC, MERGESEQ, NEXT, NEXTIF, SET, SKIPIF)
  171 + # Numbering
  172 + ('LISTNUM', 0, 1, 'ls', '', ''),
  173 + ('PAGE', 0, 0, '', '', 'numeric'),
  174 + ('REVNUM', 0, 0, '', '', ''),
  175 + ('SECTION', 0, 0, '', '', 'numeric'),
  176 + ('SECTIONPAGES', 0, 0, '', '', 'numeric'),
  177 + ('SEQ', 1, 1, 'rs', 'chn', 'numeric'),
  178 + # user information
  179 + ('USERADDRESS', 0, 1, '', '', 'string'),
  180 + ('USERINITIALS', 0, 1, '', '', 'string'),
  181 + ('USERNAME', 0, 1, '', '', 'string'),
  182 +)
  183 +
110 184 # banner to be printed at program start
111 185 BANNER = """msodde %s - http://decalage.info/python/oletools
112 186 THIS IS WORK IN PROGRESS - Check updates regularly!
... ... @@ -475,6 +549,93 @@ def unquote(field):
475 549 ddestr += ch
476 550 return ddestr
477 551  
# "static variables" for field_is_blacklisted:
# a "word" is either a double-quoted string (may be empty, may contain
# whitespace) or a run of non-whitespace characters
FIELD_WORD_REGEX = re.compile(r'"[^"]*"|\S+')
# lower-cased command names in the same order as FIELD_BLACKLIST, so an index
# into this tuple is also a valid index into FIELD_BLACKLIST
FIELD_BLACKLIST_CMDS = tuple(field[0].lower() for field in FIELD_BLACKLIST)
# a switch is a backslash followed by exactly one word char, '#', '*' or '@'
FIELD_SWITCH_REGEX = re.compile(r'^\\[\w#*@]$')


def field_is_blacklisted(contents):
    """ Check if given field contents matches any in FIELD_BLACKLIST

    A complete parser of field contents would be really complicated, so this
    function has to make a trade-off. There may be valid constructs that this
    simple parser cannot comprehend. Most arguments are not tested for validity
    since that would make this test much more complicated. However, if this
    parser accepts some field contents, then office is very likely to not
    complain about it, either.

    The contents are split into words. The first word must be a command from
    FIELD_BLACKLIST (compared case-insensitively). It is followed by the
    command's positional arguments (everything up to the first word starting
    with a backslash) and then by switches, each optionally followed by one
    argument.

    :param contents: text of a field instruction (string)
    :returns: True if contents has the form of one of the commands listed in
              FIELD_BLACKLIST; False if the command is not listed or if its
              arguments / switches do not fit the listed specification
    """

    # split contents into "words", (e.g. 'bla' or '\s' or '"a b c"' or '""')
    words = FIELD_WORD_REGEX.findall(contents)
    if not words:
        return False

    # check if first word is one of the commands on our blacklist
    try:
        index = FIELD_BLACKLIST_CMDS.index(words[0].lower())
    except ValueError:    # first word is no blacklisted command
        return False
    log.debug('trying to match "{0}" to blacklist command {1}'
              .format(contents, FIELD_BLACKLIST[index]))
    _, nargs_required, nargs_optional, sw_with_arg, sw_solo, sw_format \
        = FIELD_BLACKLIST[index]

    # check number of args: count the words before the first switch
    nargs = 0
    for word in words[1:]:
        if word[0] == '\\':  # note: words can never be empty, but can be '""'
            break
        nargs += 1
    if nargs < nargs_required:
        log.debug('too few args: found {0}, but need at least {1} in "{2}"'
                  .format(nargs, nargs_required, contents))
        return False
    elif nargs > nargs_required + nargs_optional:
        log.debug('too many args: found {0}, but need at most {1}+{2} in "{3}"'
                  .format(nargs, nargs_required, nargs_optional, contents))
        return False

    # check switches
    expect_arg = False    # True if the next word is the last switch's argument
    arg_choices = []      # allowed values for that argument; empty = anything
    for word in words[1+nargs:]:
        if expect_arg:            # this is an argument for the last switch
            if arg_choices and (word not in arg_choices):
                log.debug('Found invalid switch argument "{0}" in "{1}"'
                          .format(word, contents))
                return False
            expect_arg = False
            arg_choices = []   # in general, do not enforce choices
            continue           # "no further questions, your honor"
        elif not FIELD_SWITCH_REGEX.match(word):
            log.debug('expected switch, found "{0}" in "{1}"'
                      .format(word, contents))
            return False
        # we want a switch and we got a valid one
        switch = word[1]      # the single character after the backslash

        if switch in sw_solo:
            pass
        elif switch in sw_with_arg:
            expect_arg = True     # next word is interpreted as arg, not switch
        elif switch == '#' and 'numeric' in sw_format:
            expect_arg = True     # next word is numeric format
        elif switch == '@' and 'datetime' in sw_format:
            expect_arg = True     # next word is date/time format
        elif switch == '*':
            expect_arg = True     # next word is format argument
            arg_choices += ['CHARFORMAT', 'MERGEFORMAT']    # always allowed
            if 'string' in sw_format:
                arg_choices += ['Caps', 'FirstCap', 'Lower', 'Upper']
            if 'numeric' in sw_format:
                arg_choices = []  # too many choices to list them here
        else:
            log.debug('unexpected switch {0} in "{1}"'.format(switch, contents))
            return False

    # NOTE(review): if the last word is a switch that expects an argument,
    # the missing argument is not detected here and the contents are accepted

    # if nothing went wrong so far, the contents seems to match the blacklist
    return True
  638 +
478 639  
479 640 def process_file(filepath):
480 641 """ decides to either call process_openxml or process_ole """
... ...