Commit 2142e7a635947a528525091822b00e24ed20ad2d
Committed by
Philippe Lagadec
1 parent
399a1154
msodde: Create a blacklist of harmless commands
This is based on the OpenXML specification (ISO 29500-1:2016 / ECMA-376)
Showing
1 changed file
with
161 additions
and
0 deletions
oletools/msodde.py
| ... | ... | @@ -77,6 +77,7 @@ import os |
| 77 | 77 | import sys |
| 78 | 78 | import json |
| 79 | 79 | import logging |
| 80 | +import re | |
| 80 | 81 | |
| 81 | 82 | # little hack to allow absolute imports even if oletools is not installed |
| 82 | 83 | # Copied from olevba.py |
| ... | ... | @@ -107,6 +108,79 @@ ATTR_W_INSTR = '{%s}instr' % NS_WORD |
| 107 | 108 | ATTR_W_FLDCHARTYPE = '{%s}fldCharType' % NS_WORD |
| 108 | 109 | LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/header1.xml','word/footer1.xml','word/header2.xml','word/footer2.xml','word/comments.xml'] |
| 109 | 110 | |
# Field instructions that are considered acceptable and harmless; used by the
# blacklist field mode.
# c.f. http://officeopenxml.com/WPfieldInstructions.php or the official
# standard ISO-29500-1:2016 / ECMA-376 paragraphs 17.16.4, 17.16.5, 17.16.23
# https://www.iso.org/standard/71691.html (neither mentions DDE[AUTO]).
#
# Each entry is a 6-tuple:
#   (command, n_required_args, n_optional_args,
#    switches_with_args, switches_without_args, format_switches)
# The switch columns are strings in which every character is one switch
# letter; format_switches names the kinds of general formatting the command
# accepts ('string', 'numeric', 'datetime', possibly combined with '/').
FIELD_BLACKLIST = (
    # --- date and time ---
    ('CREATEDATE',   0, 0, '',             'hs',     'datetime'),
    ('DATE',         0, 0, '',             'hls',    'datetime'),
    ('EDITTIME',     0, 0, '',             '',       'numeric'),
    ('PRINTDATE',    0, 0, '',             'hs',     'datetime'),
    ('SAVEDATE',     0, 0, '',             'hs',     'datetime'),
    ('TIME',         0, 0, '',             '',       'datetime'),
    # --- document automation: excluded on purpose ---
    # (we hate the "auto" in "automation"):
    # COMPARE, DOCVARIABLE, GOTOBUTTON, IF, MACROBUTTON, PRINT
    # --- document information ---
    ('AUTHOR',       0, 1, '',             '',       'string'),
    ('COMMENTS',     0, 1, '',             '',       'string'),
    ('DOCPROPERTY',  1, 0, '',             '',       'string/numeric/datetime'),
    ('FILENAME',     0, 0, '',             'p',      'string'),
    ('FILESIZE',     0, 0, '',             'km',     'numeric'),
    ('KEYWORDS',     0, 1, '',             '',       'string'),
    ('LASTSAVEDBY',  0, 0, '',             '',       'string'),
    ('NUMCHARS',     0, 0, '',             '',       'numeric'),
    ('NUMPAGES',     0, 0, '',             '',       'numeric'),
    ('NUMWORDS',     0, 0, '',             '',       'numeric'),
    ('SUBJECT',      0, 1, '',             '',       'string'),
    ('TEMPLATE',     0, 0, '',             'p',      'string'),
    ('TITLE',        0, 1, '',             '',       'string'),
    # --- equations and formulas ---
    # '=' formulae are excluded because they have a different syntax
    ('ADVANCE',      0, 0, 'dlruxy',       '',       ''),
    ('SYMBOL',       1, 0, 'fs',           'ahju',   ''),
    # --- form fields ---
    ('FORMCHECKBOX', 0, 0, '',             '',       ''),
    ('FORMDROPDOWN', 0, 0, '',             '',       ''),
    ('FORMTEXT',     0, 0, '',             '',       ''),
    # --- index and tables ---
    ('INDEX',        0, 0, 'bcdefghklpsz', 'ry',     ''),
    # RD is excluded since it imports data from other files
    ('TA',           0, 0, 'clrs',         'bi',     ''),
    ('TC',           1, 0, 'fl',           'n',      ''),
    ('TOA',          0, 0, 'bcdegls',      'fhp',    ''),
    ('TOC',          0, 0, 'abcdflnopst',  'huwxz',  ''),
    ('XE',           1, 0, 'frty',         'bi',     ''),
    # --- links and references ---
    # AUTOTEXT and AUTOTEXTLIST are excluded since we do not like stuff
    # with 'AUTO'
    ('BIBLIOGRAPHY', 0, 0, 'lfm',          '',       ''),
    ('CITATION',     1, 0, 'lfspvm',       'nty',    ''),
    # HYPERLINK is excluded since we are allergic to URLs;
    # INCLUDEPICTURE and INCLUDETEXT are excluded (other file or maybe even
    # URL?); LINK and REF are excluded (could reference other files)
    ('NOTEREF',      1, 0, '',             'fhp',    ''),
    ('PAGEREF',      1, 0, '',             'hp',     ''),
    ('QUOTE',        1, 0, '',             '',       'datetime'),
    ('STYLEREF',     1, 0, '',             'lnprtw', ''),
    # --- mail merge: excluded entirely, these import data from other files ---
    # (ADDRESSBLOCK, ASK, COMPARE, DATABASE, FILLIN, GREETINGLINE, IF,
    #  MERGEFIELD, MERGEREC, MERGESEQ, NEXT, NEXTIF, SET, SKIPIF)
    # --- numbering ---
    ('LISTNUM',      0, 1, 'ls',           '',       ''),
    ('PAGE',         0, 0, '',             '',       'numeric'),
    ('REVNUM',       0, 0, '',             '',       ''),
    ('SECTION',      0, 0, '',             '',       'numeric'),
    ('SECTIONPAGES', 0, 0, '',             '',       'numeric'),
    ('SEQ',          1, 1, 'rs',           'chn',    'numeric'),
    # --- user information ---
    ('USERADDRESS',  0, 1, '',             '',       'string'),
    ('USERINITIALS', 0, 1, '',             '',       'string'),
    ('USERNAME',     0, 1, '',             '',       'string'),
)
| 183 | + | |
| 110 | 184 | # banner to be printed at program start |
| 111 | 185 | BANNER = """msodde %s - http://decalage.info/python/oletools |
| 112 | 186 | THIS IS WORK IN PROGRESS - Check updates regularly! |
| ... | ... | @@ -475,6 +549,93 @@ def unquote(field): |
| 475 | 549 | ddestr += ch |
| 476 | 550 | return ddestr |
| 477 | 551 | |
# Module-level helpers for field_is_blacklisted, built once at import time.
# A "word" is either a double-quoted string (possibly empty) or a run of
# non-whitespace characters.
FIELD_WORD_REGEX = re.compile(r'"[^"]*"|\S+')
# A switch is a backslash followed by exactly one word character or one of
# the characters '#', '*' and '@'.
FIELD_SWITCH_REGEX = re.compile(r'^\\[\w#*@]$')
# Lower-cased command names, in the same order as FIELD_BLACKLIST, so that an
# index into this tuple is also an index into FIELD_BLACKLIST.
FIELD_BLACKLIST_CMDS = tuple(spec[0].lower() for spec in FIELD_BLACKLIST)
| 556 | + | |
def field_is_blacklisted(contents):
    """Check whether the given field contents match an entry in FIELD_BLACKLIST.

    FIELD_BLACKLIST lists field commands that are considered harmless, so a
    result of True means: the field starts with one of those commands and its
    arguments and switches look acceptable for that command.

    A complete parser of field contents would be really complicated, so this
    function has to make a trade-off. There may be valid constructs that this
    simple parser cannot comprehend. Most arguments are not tested for validity
    since that would make this test much more complicated. However, if this
    parser accepts some field contents, then office is very likely to not
    complain about it, either.

    :param contents: text of the field instruction, e.g. 'AUTHOR "Jane Doe"'
    :returns: True if the contents match a FIELD_BLACKLIST entry; False if the
              contents are empty, the command is not listed, the number of
              arguments is out of range or a switch / switch argument is not
              accepted for the command
    """

    # split contents into "words", (e.g. 'bla' or '\s' or '"a b c"' or '""')
    words = FIELD_WORD_REGEX.findall(contents)
    if not words:
        return False

    # check if first word is one of the commands on our blacklist
    # (the comparison is case-insensitive)
    try:
        index = FIELD_BLACKLIST_CMDS.index(words[0].lower())
    except ValueError:    # first word is no blacklisted command
        return False
    # placeholders must be {0} and {1}: using {0} twice would log the
    # contents a second time instead of the matched blacklist entry
    log.debug('trying to match "{0}" to blacklist command {1}'
              .format(contents, FIELD_BLACKLIST[index]))
    _, nargs_required, nargs_optional, sw_with_arg, sw_solo, sw_format \
        = FIELD_BLACKLIST[index]

    # check number of args: every word before the first switch (a word
    # starting with a backslash) counts as a positional argument
    nargs = 0
    for word in words[1:]:
        if word[0] == '\\':  # note: words can never be empty, but can be '""'
            break
        nargs += 1
    if nargs < nargs_required:
        log.debug('too few args: found {0}, but need at least {1} in "{2}"'
                  .format(nargs, nargs_required, contents))
        return False
    elif nargs > nargs_required + nargs_optional:
        log.debug('too many args: found {0}, but need at most {1}+{2} in "{3}"'
                  .format(nargs, nargs_required, nargs_optional, contents))
        return False

    # check switches: everything after the positional args must be either a
    # switch or the single argument of the switch directly before it
    expect_arg = False
    arg_choices = []       # empty list means "accept any argument"
    for word in words[1+nargs:]:
        if expect_arg:            # this is an argument for the last switch
            if arg_choices and (word not in arg_choices):
                log.debug('Found invalid switch argument "{0}" in "{1}"'
                          .format(word, contents))
                return False
            expect_arg = False
            arg_choices = []   # in general, do not enforce choices
            continue           # "no further questions, your honor"
        elif not FIELD_SWITCH_REGEX.match(word):
            log.debug('expected switch, found "{0}" in "{1}"'
                      .format(word, contents))
            return False
        # we want a switch and we got a valid one
        switch = word[1]

        if switch in sw_solo:
            pass
        elif switch in sw_with_arg:
            expect_arg = True     # next word is interpreted as arg, not switch
        elif switch == '#' and 'numeric' in sw_format:
            expect_arg = True     # next word is numeric format
        elif switch == '@' and 'datetime' in sw_format:
            expect_arg = True     # next word is date/time format
        elif switch == '*':
            expect_arg = True     # next word is format argument
            arg_choices += ['CHARFORMAT', 'MERGEFORMAT']  # always allowed
            if 'string' in sw_format:
                arg_choices += ['Caps', 'FirstCap', 'Lower', 'Upper']
            if 'numeric' in sw_format:
                arg_choices = []  # too many choices to list them here
        else:
            log.debug('unexpected switch {0} in "{1}"'.format(switch, contents))
            return False

    # if nothing went wrong so far, the contents seem to match the blacklist
    return True
| 638 | + | |
| 478 | 639 | |
| 479 | 640 | def process_file(filepath): |
| 480 | 641 | """ decides to either call process_openxml or process_ole """ | ... | ... |