Commit 6028d9ab1106a188c91e9aff496df9e745726d43
1 parent
56ed93a1
msodde: clean-up code following pep8 and pylint (1)
- disable pylint-whitespace-check from FIELD_BLACKLIST - shortened almost all lines to max 79 chars (except pylint: disable-*) - moved imports further up - re-wrapped a few lines - added missing doc strings - added/removed whitespace - removed old commented debug-log/print statements
Showing
1 changed file
with
136 additions
and
114 deletions
oletools/msodde.py
| @@ -17,30 +17,31 @@ msodde is part of the python-oletools package: | @@ -17,30 +17,31 @@ msodde is part of the python-oletools package: | ||
| 17 | http://www.decalage.info/python/oletools | 17 | http://www.decalage.info/python/oletools |
| 18 | """ | 18 | """ |
| 19 | 19 | ||
| 20 | -# === LICENSE ================================================================== | 20 | +# === LICENSE ================================================================= |
| 21 | 21 | ||
| 22 | # msodde is copyright (c) 2017 Philippe Lagadec (http://www.decalage.info) | 22 | # msodde is copyright (c) 2017 Philippe Lagadec (http://www.decalage.info) |
| 23 | # All rights reserved. | 23 | # All rights reserved. |
| 24 | # | 24 | # |
| 25 | -# Redistribution and use in source and binary forms, with or without modification, | ||
| 26 | -# are permitted provided that the following conditions are met: | 25 | +# Redistribution and use in source and binary forms, with or without |
| 26 | +# modification, are permitted provided that the following conditions are met: | ||
| 27 | # | 27 | # |
| 28 | -# * Redistributions of source code must retain the above copyright notice, this | ||
| 29 | -# list of conditions and the following disclaimer. | 28 | +# * Redistributions of source code must retain the above copyright notice, |
| 29 | +# this list of conditions and the following disclaimer. | ||
| 30 | # * Redistributions in binary form must reproduce the above copyright notice, | 30 | # * Redistributions in binary form must reproduce the above copyright notice, |
| 31 | # this list of conditions and the following disclaimer in the documentation | 31 | # this list of conditions and the following disclaimer in the documentation |
| 32 | # and/or other materials provided with the distribution. | 32 | # and/or other materials provided with the distribution. |
| 33 | # | 33 | # |
| 34 | -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | ||
| 35 | -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||
| 36 | -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||
| 37 | -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | ||
| 38 | -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 39 | -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||
| 40 | -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | ||
| 41 | -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||
| 42 | -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
| 43 | -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 34 | +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 35 | +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 36 | +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 37 | +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE | ||
| 38 | +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | ||
| 39 | +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | ||
| 40 | +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | ||
| 41 | +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | ||
| 42 | +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 43 | +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | ||
| 44 | +# POSSIBILITY OF SUCH DAMAGE. | ||
| 44 | 45 | ||
| 45 | from __future__ import print_function | 46 | from __future__ import print_function |
| 46 | 47 | ||
| @@ -49,7 +50,8 @@ from __future__ import print_function | @@ -49,7 +50,8 @@ from __future__ import print_function | ||
| 49 | # 2017-10-18 v0.52 PL: - first version | 50 | # 2017-10-18 v0.52 PL: - first version |
| 50 | # 2017-10-20 PL: - fixed issue #202 (handling empty xml tags) | 51 | # 2017-10-20 PL: - fixed issue #202 (handling empty xml tags) |
| 51 | # 2017-10-23 ES: - add check for fldSimple codes | 52 | # 2017-10-23 ES: - add check for fldSimple codes |
| 52 | -# 2017-10-24 ES: - group tags and track begin/end tags to keep DDE strings together | 53 | +# 2017-10-24 ES: - group tags and track begin/end tags to keep DDE |
| 54 | +# strings together | ||
| 53 | # 2017-10-25 CH: - add json output | 55 | # 2017-10-25 CH: - add json output |
| 54 | # 2017-10-25 CH: - parse doc | 56 | # 2017-10-25 CH: - parse doc |
| 55 | # PL: - added logging | 57 | # PL: - added logging |
| @@ -62,7 +64,7 @@ from __future__ import print_function | @@ -62,7 +64,7 @@ from __future__ import print_function | ||
| 62 | 64 | ||
| 63 | __version__ = '0.52dev9' | 65 | __version__ = '0.52dev9' |
| 64 | 66 | ||
| 65 | -#------------------------------------------------------------------------------ | 67 | +# ----------------------------------------------------------------------------- |
| 66 | # TODO: field codes can be in headers/footers/comments - parse these | 68 | # TODO: field codes can be in headers/footers/comments - parse these |
| 67 | # TODO: generalize behaviour for xlsx: find all external links (maybe rename | 69 | # TODO: generalize behaviour for xlsx: find all external links (maybe rename |
| 68 | # command line flag for "blacklist" to "find all suspicious" or so) | 70 | # command line flag for "blacklist" to "find all suspicious" or so) |
| @@ -71,7 +73,7 @@ __version__ = '0.52dev9' | @@ -71,7 +73,7 @@ __version__ = '0.52dev9' | ||
| 71 | # DDE-Links | 73 | # DDE-Links |
| 72 | # TODO: avoid reading complete rtf file data into memory | 74 | # TODO: avoid reading complete rtf file data into memory |
| 73 | 75 | ||
| 74 | -#------------------------------------------------------------------------------ | 76 | +# ----------------------------------------------------------------------------- |
| 75 | # REFERENCES: | 77 | # REFERENCES: |
| 76 | 78 | ||
| 77 | 79 | ||
| @@ -123,7 +125,9 @@ TAG_W_P = "{%s}p" % NS_WORD | @@ -123,7 +125,9 @@ TAG_W_P = "{%s}p" % NS_WORD | ||
| 123 | TAG_W_R = "{%s}r" % NS_WORD | 125 | TAG_W_R = "{%s}r" % NS_WORD |
| 124 | ATTR_W_INSTR = '{%s}instr' % NS_WORD | 126 | ATTR_W_INSTR = '{%s}instr' % NS_WORD |
| 125 | ATTR_W_FLDCHARTYPE = '{%s}fldCharType' % NS_WORD | 127 | ATTR_W_FLDCHARTYPE = '{%s}fldCharType' % NS_WORD |
| 126 | -LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/header1.xml','word/footer1.xml','word/header2.xml','word/footer2.xml','word/comments.xml'] | 128 | +LOCATIONS = ('word/document.xml', 'word/endnotes.xml', 'word/footnotes.xml', |
| 129 | + 'word/header1.xml', 'word/footer1.xml', 'word/header2.xml', | ||
| 130 | + 'word/footer2.xml', 'word/comments.xml') | ||
| 127 | 131 | ||
| 128 | # list of acceptable, harmless field instructions for blacklist field mode | 132 | # list of acceptable, harmless field instructions for blacklist field mode |
| 129 | # c.f. http://officeopenxml.com/WPfieldInstructions.php or the official | 133 | # c.f. http://officeopenxml.com/WPfieldInstructions.php or the official |
| @@ -133,73 +137,74 @@ LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/ | @@ -133,73 +137,74 @@ LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/ | ||
| 133 | # switches_with_args, switches_without_args, format_switches) | 137 | # switches_with_args, switches_without_args, format_switches) |
| 134 | FIELD_BLACKLIST = ( | 138 | FIELD_BLACKLIST = ( |
| 135 | # date and time: | 139 | # date and time: |
| 136 | - ('CREATEDATE', 0, 0, '', 'hs', 'datetime'), | ||
| 137 | - ('DATE', 0, 0, '', 'hls', 'datetime'), | ||
| 138 | - ('EDITTIME', 0, 0, '', '', 'numeric'), | ||
| 139 | - ('PRINTDATE', 0, 0, '', 'hs', 'datetime'), | ||
| 140 | - ('SAVEDATE', 0, 0, '', 'hs', 'datetime'), | ||
| 141 | - ('TIME', 0, 0, '', '', 'datetime'), | 140 | + ('CREATEDATE', 0, 0, '', 'hs', 'datetime'), # pylint: disable=bad-whitespace |
| 141 | + ('DATE', 0, 0, '', 'hls', 'datetime'), # pylint: disable=bad-whitespace | ||
| 142 | + ('EDITTIME', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | ||
| 143 | + ('PRINTDATE', 0, 0, '', 'hs', 'datetime'), # pylint: disable=bad-whitespace | ||
| 144 | + ('SAVEDATE', 0, 0, '', 'hs', 'datetime'), # pylint: disable=bad-whitespace | ||
| 145 | + ('TIME', 0, 0, '', '', 'datetime'), # pylint: disable=bad-whitespace | ||
| 142 | # exclude document automation (we hate the "auto" in "automation") | 146 | # exclude document automation (we hate the "auto" in "automation") |
| 143 | # (COMPARE, DOCVARIABLE, GOTOBUTTON, IF, MACROBUTTON, PRINT) | 147 | # (COMPARE, DOCVARIABLE, GOTOBUTTON, IF, MACROBUTTON, PRINT) |
| 144 | # document information | 148 | # document information |
| 145 | - ('AUTHOR', 0, 1, '', '', 'string'), | ||
| 146 | - ('COMMENTS', 0, 1, '', '', 'string'), | ||
| 147 | - ('DOCPROPERTY', 1, 0, '', '', 'string/numeric/datetime'), | ||
| 148 | - ('FILENAME', 0, 0, '', 'p', 'string'), | ||
| 149 | - ('FILESIZE', 0, 0, '', 'km', 'numeric'), | ||
| 150 | - ('KEYWORDS', 0, 1, '', '', 'string'), | ||
| 151 | - ('LASTSAVEDBY', 0, 0, '', '', 'string'), | ||
| 152 | - ('NUMCHARS', 0, 0, '', '', 'numeric'), | ||
| 153 | - ('NUMPAGES', 0, 0, '', '', 'numeric'), | ||
| 154 | - ('NUMWORDS', 0, 0, '', '', 'numeric'), | ||
| 155 | - ('SUBJECT', 0, 1, '', '', 'string'), | ||
| 156 | - ('TEMPLATE', 0, 0, '', 'p', 'string'), | ||
| 157 | - ('TITLE', 0, 1, '', '', 'string'), | 149 | + ('AUTHOR', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace |
| 150 | + ('COMMENTS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | ||
| 151 | + ('DOCPROPERTY', 1, 0, '', '', 'string/numeric/datetime'), # pylint: disable=bad-whitespace | ||
| 152 | + ('FILENAME', 0, 0, '', 'p', 'string'), # pylint: disable=bad-whitespace | ||
| 153 | + ('FILESIZE', 0, 0, '', 'km', 'numeric'), # pylint: disable=bad-whitespace | ||
| 154 | + ('KEYWORDS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | ||
| 155 | + ('LASTSAVEDBY', 0, 0, '', '', 'string'), # pylint: disable=bad-whitespace | ||
| 156 | + ('NUMCHARS', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | ||
| 157 | + ('NUMPAGES', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | ||
| 158 | + ('NUMWORDS', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | ||
| 159 | + ('SUBJECT', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | ||
| 160 | + ('TEMPLATE', 0, 0, '', 'p', 'string'), # pylint: disable=bad-whitespace | ||
| 161 | + ('TITLE', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | ||
| 158 | # equations and formulas | 162 | # equations and formulas |
| 159 | - # exlude '=' formulae because they have different syntax | ||
| 160 | - ('ADVANCE', 0, 0, 'dlruxy', '', ''), | ||
| 161 | - ('SYMBOL', 1, 0, 'fs', 'ahju', ''), | 163 | + # exlude '=' formulae because they have different syntax (and can be bad) |
| 164 | + ('ADVANCE', 0, 0, 'dlruxy', '', ''), # pylint: disable=bad-whitespace | ||
| 165 | + ('SYMBOL', 1, 0, 'fs', 'ahju', ''), # pylint: disable=bad-whitespace | ||
| 162 | # form fields | 166 | # form fields |
| 163 | - ('FORMCHECKBOX', 0, 0, '', '', ''), | ||
| 164 | - ('FORMDROPDOWN', 0, 0, '', '', ''), | ||
| 165 | - ('FORMTEXT', 0, 0, '', '', ''), | 167 | + ('FORMCHECKBOX', 0, 0, '', '', ''), # pylint: disable=bad-whitespace |
| 168 | + ('FORMDROPDOWN', 0, 0, '', '', ''), # pylint: disable=bad-whitespace | ||
| 169 | + ('FORMTEXT', 0, 0, '', '', ''), # pylint: disable=bad-whitespace | ||
| 166 | # index and tables | 170 | # index and tables |
| 167 | - ('INDEX', 0, 0, 'bcdefghklpsz', 'ry', ''), | 171 | + ('INDEX', 0, 0, 'bcdefghklpsz', 'ry', ''), # pylint: disable=bad-whitespace |
| 168 | # exlude RD since that imports data from other files | 172 | # exlude RD since that imports data from other files |
| 169 | - ('TA', 0, 0, 'clrs', 'bi', ''), | ||
| 170 | - ('TC', 1, 0, 'fl', 'n', ''), | ||
| 171 | - ('TOA', 0, 0, 'bcdegls', 'fhp', ''), | ||
| 172 | - ('TOC', 0, 0, 'abcdflnopst', 'huwxz', ''), | ||
| 173 | - ('XE', 1, 0, 'frty', 'bi', ''), | 173 | + ('TA', 0, 0, 'clrs', 'bi', ''), # pylint: disable=bad-whitespace |
| 174 | + ('TC', 1, 0, 'fl', 'n', ''), # pylint: disable=bad-whitespace | ||
| 175 | + ('TOA', 0, 0, 'bcdegls', 'fhp', ''), # pylint: disable=bad-whitespace | ||
| 176 | + ('TOC', 0, 0, 'abcdflnopst', 'huwxz', ''), # pylint: disable=bad-whitespace | ||
| 177 | + ('XE', 1, 0, 'frty', 'bi', ''), # pylint: disable=bad-whitespace | ||
| 174 | # links and references | 178 | # links and references |
| 175 | # exclude AUTOTEXT and AUTOTEXTLIST since we do not like stuff with 'AUTO' | 179 | # exclude AUTOTEXT and AUTOTEXTLIST since we do not like stuff with 'AUTO' |
| 176 | - ('BIBLIOGRAPHY', 0, 0, 'lfm', '', ''), | ||
| 177 | - ('CITATION', 1, 0, 'lfspvm', 'nty', ''), | 180 | + ('BIBLIOGRAPHY', 0, 0, 'lfm', '', ''), # pylint: disable=bad-whitespace |
| 181 | + ('CITATION', 1, 0, 'lfspvm', 'nty', ''), # pylint: disable=bad-whitespace | ||
| 178 | # exclude HYPERLINK since we are allergic to URLs | 182 | # exclude HYPERLINK since we are allergic to URLs |
| 179 | # exclude INCLUDEPICTURE and INCLUDETEXT (other file or maybe even URL?) | 183 | # exclude INCLUDEPICTURE and INCLUDETEXT (other file or maybe even URL?) |
| 180 | # exclude LINK and REF (could reference other files) | 184 | # exclude LINK and REF (could reference other files) |
| 181 | - ('NOTEREF', 1, 0, '', 'fhp', ''), | ||
| 182 | - ('PAGEREF', 1, 0, '', 'hp', ''), | ||
| 183 | - ('QUOTE', 1, 0, '', '', 'datetime'), | ||
| 184 | - ('STYLEREF', 1, 0, '', 'lnprtw', ''), | 185 | + ('NOTEREF', 1, 0, '', 'fhp', ''), # pylint: disable=bad-whitespace |
| 186 | + ('PAGEREF', 1, 0, '', 'hp', ''), # pylint: disable=bad-whitespace | ||
| 187 | + ('QUOTE', 1, 0, '', '', 'datetime'), # pylint: disable=bad-whitespace | ||
| 188 | + ('STYLEREF', 1, 0, '', 'lnprtw', ''), # pylint: disable=bad-whitespace | ||
| 185 | # exclude all Mail Merge commands since they import data from other files | 189 | # exclude all Mail Merge commands since they import data from other files |
| 186 | # (ADDRESSBLOCK, ASK, COMPARE, DATABASE, FILLIN, GREETINGLINE, IF, | 190 | # (ADDRESSBLOCK, ASK, COMPARE, DATABASE, FILLIN, GREETINGLINE, IF, |
| 187 | # MERGEFIELD, MERGEREC, MERGESEQ, NEXT, NEXTIF, SET, SKIPIF) | 191 | # MERGEFIELD, MERGEREC, MERGESEQ, NEXT, NEXTIF, SET, SKIPIF) |
| 188 | # Numbering | 192 | # Numbering |
| 189 | - ('LISTNUM', 0, 1, 'ls', '', ''), | ||
| 190 | - ('PAGE', 0, 0, '', '', 'numeric'), | ||
| 191 | - ('REVNUM', 0, 0, '', '', ''), | ||
| 192 | - ('SECTION', 0, 0, '', '', 'numeric'), | ||
| 193 | - ('SECTIONPAGES', 0, 0, '', '', 'numeric'), | ||
| 194 | - ('SEQ', 1, 1, 'rs', 'chn', 'numeric'), | ||
| 195 | - # user information | ||
| 196 | - ('USERADDRESS', 0, 1, '', '', 'string'), | ||
| 197 | - ('USERINITIALS', 0, 1, '', '', 'string'), | ||
| 198 | - ('USERNAME', 0, 1, '', '', 'string'), | 193 | + ('LISTNUM', 0, 1, 'ls', '', ''), # pylint: disable=bad-whitespace |
| 194 | + ('PAGE', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | ||
| 195 | + ('REVNUM', 0, 0, '', '', ''), # pylint: disable=bad-whitespace | ||
| 196 | + ('SECTION', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | ||
| 197 | + ('SECTIONPAGES', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | ||
| 198 | + ('SEQ', 1, 1, 'rs', 'chn', 'numeric'), # pylint: disable=bad-whitespace | ||
| 199 | + # user information # pylint: disable=bad-whitespace | ||
| 200 | + ('USERADDRESS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | ||
| 201 | + ('USERINITIALS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | ||
| 202 | + ('USERNAME', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | ||
| 199 | ) | 203 | ) |
| 200 | 204 | ||
| 201 | FIELD_DDE_REGEX = re.compile(r'^\s*dde(auto)?\s+', re.I) | 205 | FIELD_DDE_REGEX = re.compile(r'^\s*dde(auto)?\s+', re.I) |
| 202 | 206 | ||
| 207 | +# filter modes | ||
| 203 | FIELD_FILTER_DDE = 'only dde' | 208 | FIELD_FILTER_DDE = 'only dde' |
| 204 | FIELD_FILTER_BLACKLIST = 'exclude blacklisted' | 209 | FIELD_FILTER_BLACKLIST = 'exclude blacklisted' |
| 205 | FIELD_FILTER_ALL = 'keep all' | 210 | FIELD_FILTER_ALL = 'keep all' |
| @@ -229,6 +234,7 @@ LOG_LEVELS = { | @@ -229,6 +234,7 @@ LOG_LEVELS = { | ||
| 229 | 'critical': logging.CRITICAL | 234 | 'critical': logging.CRITICAL |
| 230 | } | 235 | } |
| 231 | 236 | ||
| 237 | + | ||
| 232 | class NullHandler(logging.Handler): | 238 | class NullHandler(logging.Handler): |
| 233 | """ | 239 | """ |
| 234 | Log Handler without output, to avoid printing messages if logging is not | 240 | Log Handler without output, to avoid printing messages if logging is not |
| @@ -239,6 +245,7 @@ class NullHandler(logging.Handler): | @@ -239,6 +245,7 @@ class NullHandler(logging.Handler): | ||
| 239 | def emit(self, record): | 245 | def emit(self, record): |
| 240 | pass | 246 | pass |
| 241 | 247 | ||
| 248 | + | ||
| 242 | def get_logger(name, level=logging.CRITICAL+1): | 249 | def get_logger(name, level=logging.CRITICAL+1): |
| 243 | """ | 250 | """ |
| 244 | Create a suitable logger object for this module. | 251 | Create a suitable logger object for this module. |
| @@ -251,7 +258,7 @@ def get_logger(name, level=logging.CRITICAL+1): | @@ -251,7 +258,7 @@ def get_logger(name, level=logging.CRITICAL+1): | ||
| 251 | # First, test if there is already a logger with the same name, else it | 258 | # First, test if there is already a logger with the same name, else it |
| 252 | # will generate duplicate messages (due to duplicate handlers): | 259 | # will generate duplicate messages (due to duplicate handlers): |
| 253 | if name in logging.Logger.manager.loggerDict: | 260 | if name in logging.Logger.manager.loggerDict: |
| 254 | - #NOTE: another less intrusive but more "hackish" solution would be to | 261 | + # NOTE: another less intrusive but more "hackish" solution would be to |
| 255 | # use getLogger then test if its effective level is not default. | 262 | # use getLogger then test if its effective level is not default. |
| 256 | logger = logging.getLogger(name) | 263 | logger = logging.getLogger(name) |
| 257 | # make sure level is OK: | 264 | # make sure level is OK: |
| @@ -338,28 +345,34 @@ def existing_file(filename): | @@ -338,28 +345,34 @@ def existing_file(filename): | ||
| 338 | 345 | ||
| 339 | def process_args(cmd_line_args=None): | 346 | def process_args(cmd_line_args=None): |
| 340 | """ parse command line arguments (given ones or per default sys.argv) """ | 347 | """ parse command line arguments (given ones or per default sys.argv) """ |
| 341 | - parser = ArgParserWithBanner(description='A python tool to detect and extract DDE links in MS Office files') | 348 | + parser = ArgParserWithBanner(description='A python tool to detect and ' |
| 349 | + 'extract DDE links in MS Office files') | ||
| 342 | parser.add_argument("filepath", help="path of the file to be analyzed", | 350 | parser.add_argument("filepath", help="path of the file to be analyzed", |
| 343 | type=existing_file, metavar='FILE') | 351 | type=existing_file, metavar='FILE') |
| 344 | parser.add_argument('-j', "--json", action='store_true', | 352 | parser.add_argument('-j', "--json", action='store_true', |
| 345 | help="Output in json format. Do not use with -ldebug") | 353 | help="Output in json format. Do not use with -ldebug") |
| 346 | - parser.add_argument("--nounquote", help="don't unquote values",action='store_true') | ||
| 347 | - parser.add_argument('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, | ||
| 348 | - help="logging level debug/info/warning/error/critical (default=%(default)s)") | 354 | + parser.add_argument("--nounquote", help="don't unquote values", |
| 355 | + action='store_true') | ||
| 356 | + parser.add_argument('-l', '--loglevel', dest="loglevel", action="store", | ||
| 357 | + default=DEFAULT_LOG_LEVEL, | ||
| 358 | + help="logging level debug/info/warning/error/critical " | ||
| 359 | + "(default=%(default)s)") | ||
| 349 | filter_group = parser.add_argument_group( | 360 | filter_group = parser.add_argument_group( |
| 350 | - title='Filter which OpenXML field commands are returned', | ||
| 351 | - description='Only applies to OpenXML (e.g. docx) and rtf, not to OLE ' | ||
| 352 | - '(e.g. .doc). These options are mutually exclusive, last ' | ||
| 353 | - 'option found on command line overwrites earlier ones.') | 361 | + title='Filter which OpenXML field commands are returned', |
| 362 | + description='Only applies to OpenXML (e.g. docx) and rtf, not to OLE ' | ||
| 363 | + '(e.g. .doc). These options are mutually exclusive, last ' | ||
| 364 | + 'option found on command line overwrites earlier ones.') | ||
| 354 | filter_group.add_argument('-d', '--dde-only', action='store_const', | 365 | filter_group.add_argument('-d', '--dde-only', action='store_const', |
| 355 | dest='field_filter_mode', const=FIELD_FILTER_DDE, | 366 | dest='field_filter_mode', const=FIELD_FILTER_DDE, |
| 356 | help='Return only DDE and DDEAUTO fields') | 367 | help='Return only DDE and DDEAUTO fields') |
| 357 | filter_group.add_argument('-f', '--filter', action='store_const', | 368 | filter_group.add_argument('-f', '--filter', action='store_const', |
| 358 | - dest='field_filter_mode', const=FIELD_FILTER_BLACKLIST, | ||
| 359 | - help='Return all fields except harmless ones like PAGE') | 369 | + dest='field_filter_mode', |
| 370 | + const=FIELD_FILTER_BLACKLIST, | ||
| 371 | + help='Return all fields except harmless ones') | ||
| 360 | filter_group.add_argument('-a', '--all-fields', action='store_const', | 372 | filter_group.add_argument('-a', '--all-fields', action='store_const', |
| 361 | dest='field_filter_mode', const=FIELD_FILTER_ALL, | 373 | dest='field_filter_mode', const=FIELD_FILTER_ALL, |
| 362 | - help='Return all fields, irrespective of their contents') | 374 | + help='Return all fields, irrespective of their ' |
| 375 | + 'contents') | ||
| 363 | parser.set_defaults(field_filter_mode=FIELD_FILTER_DEFAULT) | 376 | parser.set_defaults(field_filter_mode=FIELD_FILTER_DEFAULT) |
| 364 | 377 | ||
| 365 | return parser.parse_args(cmd_line_args) | 378 | return parser.parse_args(cmd_line_args) |
| @@ -368,16 +381,19 @@ def process_args(cmd_line_args=None): | @@ -368,16 +381,19 @@ def process_args(cmd_line_args=None): | ||
| 368 | # === FUNCTIONS ============================================================== | 381 | # === FUNCTIONS ============================================================== |
| 369 | 382 | ||
| 370 | # from [MS-DOC], section 2.8.25 (PlcFld): | 383 | # from [MS-DOC], section 2.8.25 (PlcFld): |
| 371 | -# A field consists of two parts: field instructions and, optionally, a result. All fields MUST begin with | ||
| 372 | -# Unicode character 0x0013 with sprmCFSpec applied with a value of 1. This is the field begin | ||
| 373 | -# character. All fields MUST end with a Unicode character 0x0015 with sprmCFSpec applied with a value | ||
| 374 | -# of 1. This is the field end character. If the field has a result, then there MUST be a Unicode character | ||
| 375 | -# 0x0014 with sprmCFSpec applied with a value of 1 somewhere between the field begin character and | ||
| 376 | -# the field end character. This is the field separator. The field result is the content between the field | ||
| 377 | -# separator and the field end character. The field instructions are the content between the field begin | ||
| 378 | -# character and the field separator, if one is present, or between the field begin character and the field | ||
| 379 | -# end character if no separator is present. The field begin character, field end character, and field | ||
| 380 | -# separator are collectively referred to as field characters. | 384 | +# A field consists of two parts: field instructions and, optionally, a result. |
| 385 | +# All fields MUST begin with Unicode character 0x0013 with sprmCFSpec applied | ||
| 386 | +# with a value of 1. This is the field begin character. All fields MUST end | ||
| 387 | +# with a Unicode character 0x0015 with sprmCFSpec applied with a value of 1. | ||
| 388 | +# This is the field end character. If the field has a result, then there MUST | ||
| 389 | +# be a Unicode character 0x0014 with sprmCFSpec applied with a value of 1 | ||
| 390 | +# somewhere between the field begin character and the field end character. This | ||
| 391 | +# is the field separator. The field result is the content between the field | ||
| 392 | +# separator and the field end character. The field instructions are the content | ||
| 393 | +# between the field begin character and the field separator, if one is present, | ||
| 394 | +# or between the field begin character and the field end character if no | ||
| 395 | +# separator is present. The field begin character, field end character, and | ||
| 396 | +# field separator are collectively referred to as field characters. | ||
| 381 | 397 | ||
| 382 | 398 | ||
| 383 | def process_doc_field(data): | 399 | def process_doc_field(data): |
| @@ -387,7 +403,6 @@ def process_doc_field(data): | @@ -387,7 +403,6 @@ def process_doc_field(data): | ||
| 387 | log.debug('processing field \'{0}\''.format(data)) | 403 | log.debug('processing field \'{0}\''.format(data)) |
| 388 | 404 | ||
| 389 | if data.lstrip().lower().startswith(u'dde'): | 405 | if data.lstrip().lower().startswith(u'dde'): |
| 390 | - #log.debug('--> is DDE!') | ||
| 391 | return data | 406 | return data |
| 392 | elif data.lstrip().lower().startswith(u'\x00d\x00d\x00e\x00'): | 407 | elif data.lstrip().lower().startswith(u'\x00d\x00d\x00e\x00'): |
| 393 | return data | 408 | return data |
| @@ -512,7 +527,6 @@ def process_doc(filepath): | @@ -512,7 +527,6 @@ def process_doc(filepath): | ||
| 512 | return u'\n'.join(links) | 527 | return u'\n'.join(links) |
| 513 | 528 | ||
| 514 | 529 | ||
| 515 | - | ||
| 516 | def process_xls(filepath): | 530 | def process_xls(filepath): |
| 517 | """ find dde links in excel ole file """ | 531 | """ find dde links in excel ole file """ |
| 518 | 532 | ||
| @@ -531,6 +545,7 @@ def process_xls(filepath): | @@ -531,6 +545,7 @@ def process_xls(filepath): | ||
| 531 | 545 | ||
| 532 | 546 | ||
| 533 | def process_docx(filepath, field_filter_mode=None): | 547 | def process_docx(filepath, field_filter_mode=None): |
| 548 | + """ find dde-links (and other fields) in Word 2007+ files """ | ||
| 534 | log.debug('process_docx') | 549 | log.debug('process_docx') |
| 535 | all_fields = [] | 550 | all_fields = [] |
| 536 | with zipfile.ZipFile(filepath) as z: | 551 | with zipfile.ZipFile(filepath) as z: |
| @@ -539,9 +554,6 @@ def process_docx(filepath, field_filter_mode=None): | @@ -539,9 +554,6 @@ def process_docx(filepath, field_filter_mode=None): | ||
| 539 | data = z.read(filepath) | 554 | data = z.read(filepath) |
| 540 | fields = process_xml(data) | 555 | fields = process_xml(data) |
| 541 | if len(fields) > 0: | 556 | if len(fields) > 0: |
| 542 | - #print ('DDE Links in %s:'%filepath) | ||
| 543 | - #for f in fields: | ||
| 544 | - # print(f) | ||
| 545 | all_fields.extend(fields) | 557 | all_fields.extend(fields) |
| 546 | 558 | ||
| 547 | # apply field command filter | 559 | # apply field command filter |
| @@ -560,8 +572,10 @@ def process_docx(filepath, field_filter_mode=None): | @@ -560,8 +572,10 @@ def process_docx(filepath, field_filter_mode=None): | ||
| 560 | .format(field_filter_mode)) | 572 | .format(field_filter_mode)) |
| 561 | 573 | ||
| 562 | return u'\n'.join(clean_fields) | 574 | return u'\n'.join(clean_fields) |
| 563 | - | 575 | + |
| 576 | + | ||
| 564 | def process_xml(data): | 577 | def process_xml(data): |
| 578 | + """ Find dde-links and other fields in office XML data """ | ||
| 565 | # parse the XML data: | 579 | # parse the XML data: |
| 566 | root = ET.fromstring(data) | 580 | root = ET.fromstring(data) |
| 567 | fields = [] | 581 | fields = [] |
| @@ -569,17 +583,18 @@ def process_xml(data): | @@ -569,17 +583,18 @@ def process_xml(data): | ||
| 569 | level = 0 | 583 | level = 0 |
| 570 | # find all the tags 'w:p': | 584 | # find all the tags 'w:p': |
| 571 | # parse each for begin and end tags, to group DDE strings | 585 | # parse each for begin and end tags, to group DDE strings |
| 572 | - # fldChar can be in either a w:r element, floating alone in the w:p or spread accross w:p tags | 586 | + # fldChar can be in either a w:r element, floating alone in the w:p |
| 587 | + # or spread accross w:p tags | ||
| 573 | # escape DDE if quoted etc | 588 | # escape DDE if quoted etc |
| 574 | # (each is a chunk of a DDE link) | 589 | # (each is a chunk of a DDE link) |
| 575 | 590 | ||
| 576 | for subs in root.iter(TAG_W_P): | 591 | for subs in root.iter(TAG_W_P): |
| 577 | elem = None | 592 | elem = None |
| 578 | for e in subs: | 593 | for e in subs: |
| 579 | - #check if w:r and if it is parse children elements to pull out the first FLDCHAR or INSTRTEXT | ||
| 580 | if e.tag == TAG_W_R: | 594 | if e.tag == TAG_W_R: |
| 581 | for child in e: | 595 | for child in e: |
| 582 | if child.tag == TAG_W_FLDCHAR or child.tag == TAG_W_INSTRTEXT: | 596 | if child.tag == TAG_W_FLDCHAR or child.tag == TAG_W_INSTRTEXT: |
| 597 | + # check if w:r; parse children to pull out first FLDCHAR/INSTRTEXT | ||
| 583 | elem = child | 598 | elem = child |
| 584 | break | 599 | break |
| 585 | else: | 600 | else: |
| @@ -587,21 +602,21 @@ def process_xml(data): | @@ -587,21 +602,21 @@ def process_xml(data): | ||
| 587 | #this should be an error condition | 602 | #this should be an error condition |
| 588 | if elem is None: | 603 | if elem is None: |
| 589 | continue | 604 | continue |
| 590 | - | ||
| 591 | - #check if FLDCHARTYPE and whether "begin" or "end" tag | 605 | + |
| 606 | + # check if FLDCHARTYPE and whether "begin" or "end" tag | ||
| 592 | if elem.attrib.get(ATTR_W_FLDCHARTYPE) is not None: | 607 | if elem.attrib.get(ATTR_W_FLDCHARTYPE) is not None: |
| 593 | if elem.attrib[ATTR_W_FLDCHARTYPE] == "begin": | 608 | if elem.attrib[ATTR_W_FLDCHARTYPE] == "begin": |
| 594 | - level += 1 | 609 | + level += 1 |
| 595 | if elem.attrib[ATTR_W_FLDCHARTYPE] == "end": | 610 | if elem.attrib[ATTR_W_FLDCHARTYPE] == "end": |
| 596 | level -= 1 | 611 | level -= 1 |
| 597 | - if level == 0 or level == -1 : # edge-case where level becomes -1 | 612 | + if level == 0 or level == -1: # edge-case; level becomes -1 |
| 598 | fields.append(ddetext) | 613 | fields.append(ddetext) |
| 599 | ddetext = u'' | 614 | ddetext = u'' |
| 600 | - level = 0 # reset edge-case | ||
| 601 | - | 615 | + level = 0 # reset edge-case |
| 616 | + | ||
| 602 | # concatenate the text of the field, if present: | 617 | # concatenate the text of the field, if present: |
| 603 | if elem.tag == TAG_W_INSTRTEXT and elem.text is not None: | 618 | if elem.tag == TAG_W_INSTRTEXT and elem.text is not None: |
| 604 | - #expand field code if QUOTED | 619 | + # expand field code if QUOTED |
| 605 | ddetext += unquote(elem.text) | 620 | ddetext += unquote(elem.text) |
| 606 | 621 | ||
| 607 | for elem in root.iter(TAG_W_FLDSIMPLE): | 622 | for elem in root.iter(TAG_W_FLDSIMPLE): |
| @@ -611,10 +626,11 @@ def process_xml(data): | @@ -611,10 +626,11 @@ def process_xml(data): | ||
| 611 | 626 | ||
| 612 | return fields | 627 | return fields |
| 613 | 628 | ||
| 614 | -def unquote(field): | 629 | + |
| 630 | +def unquote(field): | ||
| 615 | if "QUOTE" not in field or NO_QUOTES: | 631 | if "QUOTE" not in field or NO_QUOTES: |
| 616 | return field | 632 | return field |
| 617 | - #split into components | 633 | + # split into components |
| 618 | parts = field.strip().split(" ") | 634 | parts = field.strip().split(" ") |
| 619 | ddestr = "" | 635 | ddestr = "" |
| 620 | for p in parts[1:]: | 636 | for p in parts[1:]: |
| @@ -625,11 +641,13 @@ def unquote(field): | @@ -625,11 +641,13 @@ def unquote(field): | ||
| 625 | ddestr += ch | 641 | ddestr += ch |
| 626 | return ddestr | 642 | return ddestr |
| 627 | 643 | ||
| 644 | + | ||
| 628 | # "static variables" for field_is_blacklisted: | 645 | # "static variables" for field_is_blacklisted: |
| 629 | FIELD_WORD_REGEX = re.compile(r'"[^"]*"|\S+') | 646 | FIELD_WORD_REGEX = re.compile(r'"[^"]*"|\S+') |
| 630 | FIELD_BLACKLIST_CMDS = tuple(field[0].lower() for field in FIELD_BLACKLIST) | 647 | FIELD_BLACKLIST_CMDS = tuple(field[0].lower() for field in FIELD_BLACKLIST) |
| 631 | FIELD_SWITCH_REGEX = re.compile(r'^\\[\w#*@]$') | 648 | FIELD_SWITCH_REGEX = re.compile(r'^\\[\w#*@]$') |
| 632 | 649 | ||
| 650 | + | ||
| 633 | def field_is_blacklisted(contents): | 651 | def field_is_blacklisted(contents): |
| 634 | """ Check if given field contents matches any in FIELD_BLACKLIST | 652 | """ Check if given field contents matches any in FIELD_BLACKLIST |
| 635 | 653 | ||
| @@ -651,7 +669,7 @@ def field_is_blacklisted(contents): | @@ -651,7 +669,7 @@ def field_is_blacklisted(contents): | ||
| 651 | index = FIELD_BLACKLIST_CMDS.index(words[0].lower()) | 669 | index = FIELD_BLACKLIST_CMDS.index(words[0].lower()) |
| 652 | except ValueError: # first word is no blacklisted command | 670 | except ValueError: # first word is no blacklisted command |
| 653 | return False | 671 | return False |
| 654 | - log.debug('trying to match "{0}" to blacklist command {0}' | 672 | + log.debug('trying to match "{0}" to blacklist command {1}' |
| 655 | .format(contents, FIELD_BLACKLIST[index])) | 673 | .format(contents, FIELD_BLACKLIST[index])) |
| 656 | _, nargs_required, nargs_optional, sw_with_arg, sw_solo, sw_format \ | 674 | _, nargs_required, nargs_optional, sw_with_arg, sw_solo, sw_format \ |
| 657 | = FIELD_BLACKLIST[index] | 675 | = FIELD_BLACKLIST[index] |
| @@ -706,14 +724,15 @@ def field_is_blacklisted(contents): | @@ -706,14 +724,15 @@ def field_is_blacklisted(contents): | ||
| 706 | if 'numeric' in sw_format: | 724 | if 'numeric' in sw_format: |
| 707 | arg_choices = [] # too many choices to list them here | 725 | arg_choices = [] # too many choices to list them here |
| 708 | else: | 726 | else: |
| 709 | - log.debug('unexpected switch {0} in "{1}"'.format(switch, contents)) | 727 | + log.debug('unexpected switch {0} in "{1}"' |
| 728 | + .format(switch, contents)) | ||
| 710 | return False | 729 | return False |
| 711 | 730 | ||
| 712 | # if nothing went wrong sofar, the contents seems to match the blacklist | 731 | # if nothing went wrong sofar, the contents seems to match the blacklist |
| 713 | return True | 732 | return True |
| 714 | 733 | ||
| 715 | 734 | ||
| 716 | -def process_xlsx(filepath, filed_filter_mode=None): | 735 | +def process_xlsx(filepath): |
| 717 | """ process an OOXML excel file (e.g. .xlsx or .xlsb or .xlsm) """ | 736 | """ process an OOXML excel file (e.g. .xlsx or .xlsb or .xlsm) """ |
| 718 | dde_links = [] | 737 | dde_links = [] |
| 719 | parser = ooxml.XmlParser(filepath) | 738 | parser = ooxml.XmlParser(filepath) |
| @@ -733,7 +752,8 @@ def process_xlsx(filepath, filed_filter_mode=None): | @@ -733,7 +752,8 @@ def process_xlsx(filepath, filed_filter_mode=None): | ||
| 733 | try: | 752 | try: |
| 734 | logging.info('Parsing non-xml subfile {0} with content type {1}' | 753 | logging.info('Parsing non-xml subfile {0} with content type {1}' |
| 735 | .format(subfile, content_type)) | 754 | .format(subfile, content_type)) |
| 736 | - for record in xls_parser.parse_xlsb_part(handle, content_type, subfile): | 755 | + for record in xls_parser.parse_xlsb_part(handle, content_type, |
| 756 | + subfile): | ||
| 737 | logging.debug('{0}: {1}'.format(subfile, record)) | 757 | logging.debug('{0}: {1}'.format(subfile, record)) |
| 738 | if isinstance(record, xls_parser.XlsbBeginSupBook) and \ | 758 | if isinstance(record, xls_parser.XlsbBeginSupBook) and \ |
| 739 | record.link_type == \ | 759 | record.link_type == \ |
| @@ -791,8 +811,10 @@ class RtfFieldParser(rtfobj.RtfParser): | @@ -791,8 +811,10 @@ class RtfFieldParser(rtfobj.RtfParser): | ||
| 791 | 811 | ||
| 792 | RTF_START = b'\x7b\x5c\x72\x74' # == b'{\rt' but does not mess up auto-indent | 812 | RTF_START = b'\x7b\x5c\x72\x74' # == b'{\rt' but does not mess up auto-indent |
| 793 | 813 | ||
| 814 | + | ||
| 794 | def process_rtf(file_handle, field_filter_mode=None): | 815 | def process_rtf(file_handle, field_filter_mode=None): |
| 795 | log.debug('process_rtf') | 816 | log.debug('process_rtf') |
| 817 | + """ find dde links or other fields in rtf file """ | ||
| 796 | all_fields = [] | 818 | all_fields = [] |
| 797 | data = RTF_START + file_handle.read() # read complete file into memory! | 819 | data = RTF_START + file_handle.read() # read complete file into memory! |
| 798 | file_handle.close() | 820 | file_handle.close() |
| @@ -828,7 +850,7 @@ def process_file(filepath, field_filter_mode=None): | @@ -828,7 +850,7 @@ def process_file(filepath, field_filter_mode=None): | ||
| 828 | return process_doc(filepath) | 850 | return process_doc(filepath) |
| 829 | 851 | ||
| 830 | with open(filepath, 'rb') as file_handle: | 852 | with open(filepath, 'rb') as file_handle: |
| 831 | - if file_handle.read(4) == RTF_START: | 853 | + if file_handle.read(4) == RTF_START: |
| 832 | # This is a RTF file | 854 | # This is a RTF file |
| 833 | return process_rtf(file_handle, field_filter_mode) | 855 | return process_rtf(file_handle, field_filter_mode) |
| 834 | 856 | ||
| @@ -846,7 +868,7 @@ def process_file(filepath, field_filter_mode=None): | @@ -846,7 +868,7 @@ def process_file(filepath, field_filter_mode=None): | ||
| 846 | return process_docx(filepath, field_filter_mode) | 868 | return process_docx(filepath, field_filter_mode) |
| 847 | 869 | ||
| 848 | 870 | ||
| 849 | -#=== MAIN ================================================================= | 871 | +# === MAIN ================================================================= |
| 850 | 872 | ||
| 851 | def main(cmd_line_args=None): | 873 | def main(cmd_line_args=None): |
| 852 | """ Main function, called if this file is called as a script | 874 | """ Main function, called if this file is called as a script |
| @@ -868,10 +890,10 @@ def main(cmd_line_args=None): | @@ -868,10 +890,10 @@ def main(cmd_line_args=None): | ||
| 868 | if args.json and args.loglevel.lower() == 'debug': | 890 | if args.json and args.loglevel.lower() == 'debug': |
| 869 | log.warning('Debug log output will not be json-compatible!') | 891 | log.warning('Debug log output will not be json-compatible!') |
| 870 | 892 | ||
| 871 | - if args.nounquote : | 893 | + if args.nounquote: |
| 872 | global NO_QUOTES | 894 | global NO_QUOTES |
| 873 | NO_QUOTES = True | 895 | NO_QUOTES = True |
| 874 | - | 896 | + |
| 875 | if args.json: | 897 | if args.json: |
| 876 | jout = [] | 898 | jout = [] |
| 877 | jout.append(BANNER_JSON) | 899 | jout.append(BANNER_JSON) |
| @@ -890,7 +912,7 @@ def main(cmd_line_args=None): | @@ -890,7 +912,7 @@ def main(cmd_line_args=None): | ||
| 890 | except Exception as exc: | 912 | except Exception as exc: |
| 891 | if args.json: | 913 | if args.json: |
| 892 | jout.append(dict(type='error', error=type(exc).__name__, | 914 | jout.append(dict(type='error', error=type(exc).__name__, |
| 893 | - message=str(exc))) # strange: str(exc) is enclosed in "" | 915 | + message=str(exc))) |
| 894 | else: | 916 | else: |
| 895 | raise # re-raise last known exception, keeping trace intact | 917 | raise # re-raise last known exception, keeping trace intact |
| 896 | 918 |