Commit 6028d9ab1106a188c91e9aff496df9e745726d43
1 parent
56ed93a1
msodde: clean-up code following pep8 and pylint (1)
- disable pylint-whitespace-check from FIELD_BLACKLIST - shortened almost all lines to max 79 chars (except pylint: disable-*) - moved imports further up - re-wrap a few lines - add missing docstrings - add/remove whitespace - remove old commented debug-log/print statements
Showing
1 changed file
with
136 additions
and
114 deletions
oletools/msodde.py
| ... | ... | @@ -17,30 +17,31 @@ msodde is part of the python-oletools package: |
| 17 | 17 | http://www.decalage.info/python/oletools |
| 18 | 18 | """ |
| 19 | 19 | |
| 20 | -# === LICENSE ================================================================== | |
| 20 | +# === LICENSE ================================================================= | |
| 21 | 21 | |
| 22 | 22 | # msodde is copyright (c) 2017 Philippe Lagadec (http://www.decalage.info) |
| 23 | 23 | # All rights reserved. |
| 24 | 24 | # |
| 25 | -# Redistribution and use in source and binary forms, with or without modification, | |
| 26 | -# are permitted provided that the following conditions are met: | |
| 25 | +# Redistribution and use in source and binary forms, with or without | |
| 26 | +# modification, are permitted provided that the following conditions are met: | |
| 27 | 27 | # |
| 28 | -# * Redistributions of source code must retain the above copyright notice, this | |
| 29 | -# list of conditions and the following disclaimer. | |
| 28 | +# * Redistributions of source code must retain the above copyright notice, | |
| 29 | +# this list of conditions and the following disclaimer. | |
| 30 | 30 | # * Redistributions in binary form must reproduce the above copyright notice, |
| 31 | 31 | # this list of conditions and the following disclaimer in the documentation |
| 32 | 32 | # and/or other materials provided with the distribution. |
| 33 | 33 | # |
| 34 | -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
| 35 | -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| 36 | -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| 37 | -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | |
| 38 | -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 39 | -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
| 40 | -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
| 41 | -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
| 42 | -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| 43 | -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 34 | +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
| 35 | +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 36 | +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 37 | +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE | |
| 38 | +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
| 39 | +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
| 40 | +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
| 41 | +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
| 42 | +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
| 43 | +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
| 44 | +# POSSIBILITY OF SUCH DAMAGE. | |
| 44 | 45 | |
| 45 | 46 | from __future__ import print_function |
| 46 | 47 | |
| ... | ... | @@ -49,7 +50,8 @@ from __future__ import print_function |
| 49 | 50 | # 2017-10-18 v0.52 PL: - first version |
| 50 | 51 | # 2017-10-20 PL: - fixed issue #202 (handling empty xml tags) |
| 51 | 52 | # 2017-10-23 ES: - add check for fldSimple codes |
| 52 | -# 2017-10-24 ES: - group tags and track begin/end tags to keep DDE strings together | |
| 53 | +# 2017-10-24 ES: - group tags and track begin/end tags to keep DDE | |
| 54 | +# strings together | |
| 53 | 55 | # 2017-10-25 CH: - add json output |
| 54 | 56 | # 2017-10-25 CH: - parse doc |
| 55 | 57 | # PL: - added logging |
| ... | ... | @@ -62,7 +64,7 @@ from __future__ import print_function |
| 62 | 64 | |
| 63 | 65 | __version__ = '0.52dev9' |
| 64 | 66 | |
| 65 | -#------------------------------------------------------------------------------ | |
| 67 | +# ----------------------------------------------------------------------------- | |
| 66 | 68 | # TODO: field codes can be in headers/footers/comments - parse these |
| 67 | 69 | # TODO: generalize behaviour for xlsx: find all external links (maybe rename |
| 68 | 70 | # command line flag for "blacklist" to "find all suspicious" or so) |
| ... | ... | @@ -71,7 +73,7 @@ __version__ = '0.52dev9' |
| 71 | 73 | # DDE-Links |
| 72 | 74 | # TODO: avoid reading complete rtf file data into memory |
| 73 | 75 | |
| 74 | -#------------------------------------------------------------------------------ | |
| 76 | +# ----------------------------------------------------------------------------- | |
| 75 | 77 | # REFERENCES: |
| 76 | 78 | |
| 77 | 79 | |
| ... | ... | @@ -123,7 +125,9 @@ TAG_W_P = "{%s}p" % NS_WORD |
| 123 | 125 | TAG_W_R = "{%s}r" % NS_WORD |
| 124 | 126 | ATTR_W_INSTR = '{%s}instr' % NS_WORD |
| 125 | 127 | ATTR_W_FLDCHARTYPE = '{%s}fldCharType' % NS_WORD |
| 126 | -LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/header1.xml','word/footer1.xml','word/header2.xml','word/footer2.xml','word/comments.xml'] | |
| 128 | +LOCATIONS = ('word/document.xml', 'word/endnotes.xml', 'word/footnotes.xml', | |
| 129 | + 'word/header1.xml', 'word/footer1.xml', 'word/header2.xml', | |
| 130 | + 'word/footer2.xml', 'word/comments.xml') | |
| 127 | 131 | |
| 128 | 132 | # list of acceptable, harmless field instructions for blacklist field mode |
| 129 | 133 | # c.f. http://officeopenxml.com/WPfieldInstructions.php or the official |
| ... | ... | @@ -133,73 +137,74 @@ LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/ |
| 133 | 137 | # switches_with_args, switches_without_args, format_switches) |
| 134 | 138 | FIELD_BLACKLIST = ( |
| 135 | 139 | # date and time: |
| 136 | - ('CREATEDATE', 0, 0, '', 'hs', 'datetime'), | |
| 137 | - ('DATE', 0, 0, '', 'hls', 'datetime'), | |
| 138 | - ('EDITTIME', 0, 0, '', '', 'numeric'), | |
| 139 | - ('PRINTDATE', 0, 0, '', 'hs', 'datetime'), | |
| 140 | - ('SAVEDATE', 0, 0, '', 'hs', 'datetime'), | |
| 141 | - ('TIME', 0, 0, '', '', 'datetime'), | |
| 140 | + ('CREATEDATE', 0, 0, '', 'hs', 'datetime'), # pylint: disable=bad-whitespace | |
| 141 | + ('DATE', 0, 0, '', 'hls', 'datetime'), # pylint: disable=bad-whitespace | |
| 142 | + ('EDITTIME', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | |
| 143 | + ('PRINTDATE', 0, 0, '', 'hs', 'datetime'), # pylint: disable=bad-whitespace | |
| 144 | + ('SAVEDATE', 0, 0, '', 'hs', 'datetime'), # pylint: disable=bad-whitespace | |
| 145 | + ('TIME', 0, 0, '', '', 'datetime'), # pylint: disable=bad-whitespace | |
| 142 | 146 | # exclude document automation (we hate the "auto" in "automation") |
| 143 | 147 | # (COMPARE, DOCVARIABLE, GOTOBUTTON, IF, MACROBUTTON, PRINT) |
| 144 | 148 | # document information |
| 145 | - ('AUTHOR', 0, 1, '', '', 'string'), | |
| 146 | - ('COMMENTS', 0, 1, '', '', 'string'), | |
| 147 | - ('DOCPROPERTY', 1, 0, '', '', 'string/numeric/datetime'), | |
| 148 | - ('FILENAME', 0, 0, '', 'p', 'string'), | |
| 149 | - ('FILESIZE', 0, 0, '', 'km', 'numeric'), | |
| 150 | - ('KEYWORDS', 0, 1, '', '', 'string'), | |
| 151 | - ('LASTSAVEDBY', 0, 0, '', '', 'string'), | |
| 152 | - ('NUMCHARS', 0, 0, '', '', 'numeric'), | |
| 153 | - ('NUMPAGES', 0, 0, '', '', 'numeric'), | |
| 154 | - ('NUMWORDS', 0, 0, '', '', 'numeric'), | |
| 155 | - ('SUBJECT', 0, 1, '', '', 'string'), | |
| 156 | - ('TEMPLATE', 0, 0, '', 'p', 'string'), | |
| 157 | - ('TITLE', 0, 1, '', '', 'string'), | |
| 149 | + ('AUTHOR', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | |
| 150 | + ('COMMENTS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | |
| 151 | + ('DOCPROPERTY', 1, 0, '', '', 'string/numeric/datetime'), # pylint: disable=bad-whitespace | |
| 152 | + ('FILENAME', 0, 0, '', 'p', 'string'), # pylint: disable=bad-whitespace | |
| 153 | + ('FILESIZE', 0, 0, '', 'km', 'numeric'), # pylint: disable=bad-whitespace | |
| 154 | + ('KEYWORDS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | |
| 155 | + ('LASTSAVEDBY', 0, 0, '', '', 'string'), # pylint: disable=bad-whitespace | |
| 156 | + ('NUMCHARS', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | |
| 157 | + ('NUMPAGES', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | |
| 158 | + ('NUMWORDS', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | |
| 159 | + ('SUBJECT', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | |
| 160 | + ('TEMPLATE', 0, 0, '', 'p', 'string'), # pylint: disable=bad-whitespace | |
| 161 | + ('TITLE', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | |
| 158 | 162 | # equations and formulas |
| 159 | - # exlude '=' formulae because they have different syntax | |
| 160 | - ('ADVANCE', 0, 0, 'dlruxy', '', ''), | |
| 161 | - ('SYMBOL', 1, 0, 'fs', 'ahju', ''), | |
| 163 | + # exlude '=' formulae because they have different syntax (and can be bad) | |
| 164 | + ('ADVANCE', 0, 0, 'dlruxy', '', ''), # pylint: disable=bad-whitespace | |
| 165 | + ('SYMBOL', 1, 0, 'fs', 'ahju', ''), # pylint: disable=bad-whitespace | |
| 162 | 166 | # form fields |
| 163 | - ('FORMCHECKBOX', 0, 0, '', '', ''), | |
| 164 | - ('FORMDROPDOWN', 0, 0, '', '', ''), | |
| 165 | - ('FORMTEXT', 0, 0, '', '', ''), | |
| 167 | + ('FORMCHECKBOX', 0, 0, '', '', ''), # pylint: disable=bad-whitespace | |
| 168 | + ('FORMDROPDOWN', 0, 0, '', '', ''), # pylint: disable=bad-whitespace | |
| 169 | + ('FORMTEXT', 0, 0, '', '', ''), # pylint: disable=bad-whitespace | |
| 166 | 170 | # index and tables |
| 167 | - ('INDEX', 0, 0, 'bcdefghklpsz', 'ry', ''), | |
| 171 | + ('INDEX', 0, 0, 'bcdefghklpsz', 'ry', ''), # pylint: disable=bad-whitespace | |
| 168 | 172 | # exlude RD since that imports data from other files |
| 169 | - ('TA', 0, 0, 'clrs', 'bi', ''), | |
| 170 | - ('TC', 1, 0, 'fl', 'n', ''), | |
| 171 | - ('TOA', 0, 0, 'bcdegls', 'fhp', ''), | |
| 172 | - ('TOC', 0, 0, 'abcdflnopst', 'huwxz', ''), | |
| 173 | - ('XE', 1, 0, 'frty', 'bi', ''), | |
| 173 | + ('TA', 0, 0, 'clrs', 'bi', ''), # pylint: disable=bad-whitespace | |
| 174 | + ('TC', 1, 0, 'fl', 'n', ''), # pylint: disable=bad-whitespace | |
| 175 | + ('TOA', 0, 0, 'bcdegls', 'fhp', ''), # pylint: disable=bad-whitespace | |
| 176 | + ('TOC', 0, 0, 'abcdflnopst', 'huwxz', ''), # pylint: disable=bad-whitespace | |
| 177 | + ('XE', 1, 0, 'frty', 'bi', ''), # pylint: disable=bad-whitespace | |
| 174 | 178 | # links and references |
| 175 | 179 | # exclude AUTOTEXT and AUTOTEXTLIST since we do not like stuff with 'AUTO' |
| 176 | - ('BIBLIOGRAPHY', 0, 0, 'lfm', '', ''), | |
| 177 | - ('CITATION', 1, 0, 'lfspvm', 'nty', ''), | |
| 180 | + ('BIBLIOGRAPHY', 0, 0, 'lfm', '', ''), # pylint: disable=bad-whitespace | |
| 181 | + ('CITATION', 1, 0, 'lfspvm', 'nty', ''), # pylint: disable=bad-whitespace | |
| 178 | 182 | # exclude HYPERLINK since we are allergic to URLs |
| 179 | 183 | # exclude INCLUDEPICTURE and INCLUDETEXT (other file or maybe even URL?) |
| 180 | 184 | # exclude LINK and REF (could reference other files) |
| 181 | - ('NOTEREF', 1, 0, '', 'fhp', ''), | |
| 182 | - ('PAGEREF', 1, 0, '', 'hp', ''), | |
| 183 | - ('QUOTE', 1, 0, '', '', 'datetime'), | |
| 184 | - ('STYLEREF', 1, 0, '', 'lnprtw', ''), | |
| 185 | + ('NOTEREF', 1, 0, '', 'fhp', ''), # pylint: disable=bad-whitespace | |
| 186 | + ('PAGEREF', 1, 0, '', 'hp', ''), # pylint: disable=bad-whitespace | |
| 187 | + ('QUOTE', 1, 0, '', '', 'datetime'), # pylint: disable=bad-whitespace | |
| 188 | + ('STYLEREF', 1, 0, '', 'lnprtw', ''), # pylint: disable=bad-whitespace | |
| 185 | 189 | # exclude all Mail Merge commands since they import data from other files |
| 186 | 190 | # (ADDRESSBLOCK, ASK, COMPARE, DATABASE, FILLIN, GREETINGLINE, IF, |
| 187 | 191 | # MERGEFIELD, MERGEREC, MERGESEQ, NEXT, NEXTIF, SET, SKIPIF) |
| 188 | 192 | # Numbering |
| 189 | - ('LISTNUM', 0, 1, 'ls', '', ''), | |
| 190 | - ('PAGE', 0, 0, '', '', 'numeric'), | |
| 191 | - ('REVNUM', 0, 0, '', '', ''), | |
| 192 | - ('SECTION', 0, 0, '', '', 'numeric'), | |
| 193 | - ('SECTIONPAGES', 0, 0, '', '', 'numeric'), | |
| 194 | - ('SEQ', 1, 1, 'rs', 'chn', 'numeric'), | |
| 195 | - # user information | |
| 196 | - ('USERADDRESS', 0, 1, '', '', 'string'), | |
| 197 | - ('USERINITIALS', 0, 1, '', '', 'string'), | |
| 198 | - ('USERNAME', 0, 1, '', '', 'string'), | |
| 193 | + ('LISTNUM', 0, 1, 'ls', '', ''), # pylint: disable=bad-whitespace | |
| 194 | + ('PAGE', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | |
| 195 | + ('REVNUM', 0, 0, '', '', ''), # pylint: disable=bad-whitespace | |
| 196 | + ('SECTION', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | |
| 197 | + ('SECTIONPAGES', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | |
| 198 | + ('SEQ', 1, 1, 'rs', 'chn', 'numeric'), # pylint: disable=bad-whitespace | |
| 199 | + # user information # pylint: disable=bad-whitespace | |
| 200 | + ('USERADDRESS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | |
| 201 | + ('USERINITIALS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | |
| 202 | + ('USERNAME', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | |
| 199 | 203 | ) |
| 200 | 204 | |
| 201 | 205 | FIELD_DDE_REGEX = re.compile(r'^\s*dde(auto)?\s+', re.I) |
| 202 | 206 | |
| 207 | +# filter modes | |
| 203 | 208 | FIELD_FILTER_DDE = 'only dde' |
| 204 | 209 | FIELD_FILTER_BLACKLIST = 'exclude blacklisted' |
| 205 | 210 | FIELD_FILTER_ALL = 'keep all' |
| ... | ... | @@ -229,6 +234,7 @@ LOG_LEVELS = { |
| 229 | 234 | 'critical': logging.CRITICAL |
| 230 | 235 | } |
| 231 | 236 | |
| 237 | + | |
| 232 | 238 | class NullHandler(logging.Handler): |
| 233 | 239 | """ |
| 234 | 240 | Log Handler without output, to avoid printing messages if logging is not |
| ... | ... | @@ -239,6 +245,7 @@ class NullHandler(logging.Handler): |
| 239 | 245 | def emit(self, record): |
| 240 | 246 | pass |
| 241 | 247 | |
| 248 | + | |
| 242 | 249 | def get_logger(name, level=logging.CRITICAL+1): |
| 243 | 250 | """ |
| 244 | 251 | Create a suitable logger object for this module. |
| ... | ... | @@ -251,7 +258,7 @@ def get_logger(name, level=logging.CRITICAL+1): |
| 251 | 258 | # First, test if there is already a logger with the same name, else it |
| 252 | 259 | # will generate duplicate messages (due to duplicate handlers): |
| 253 | 260 | if name in logging.Logger.manager.loggerDict: |
| 254 | - #NOTE: another less intrusive but more "hackish" solution would be to | |
| 261 | + # NOTE: another less intrusive but more "hackish" solution would be to | |
| 255 | 262 | # use getLogger then test if its effective level is not default. |
| 256 | 263 | logger = logging.getLogger(name) |
| 257 | 264 | # make sure level is OK: |
| ... | ... | @@ -338,28 +345,34 @@ def existing_file(filename): |
| 338 | 345 | |
| 339 | 346 | def process_args(cmd_line_args=None): |
| 340 | 347 | """ parse command line arguments (given ones or per default sys.argv) """ |
| 341 | - parser = ArgParserWithBanner(description='A python tool to detect and extract DDE links in MS Office files') | |
| 348 | + parser = ArgParserWithBanner(description='A python tool to detect and ' | |
| 349 | + 'extract DDE links in MS Office files') | |
| 342 | 350 | parser.add_argument("filepath", help="path of the file to be analyzed", |
| 343 | 351 | type=existing_file, metavar='FILE') |
| 344 | 352 | parser.add_argument('-j', "--json", action='store_true', |
| 345 | 353 | help="Output in json format. Do not use with -ldebug") |
| 346 | - parser.add_argument("--nounquote", help="don't unquote values",action='store_true') | |
| 347 | - parser.add_argument('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, | |
| 348 | - help="logging level debug/info/warning/error/critical (default=%(default)s)") | |
| 354 | + parser.add_argument("--nounquote", help="don't unquote values", | |
| 355 | + action='store_true') | |
| 356 | + parser.add_argument('-l', '--loglevel', dest="loglevel", action="store", | |
| 357 | + default=DEFAULT_LOG_LEVEL, | |
| 358 | + help="logging level debug/info/warning/error/critical " | |
| 359 | + "(default=%(default)s)") | |
| 349 | 360 | filter_group = parser.add_argument_group( |
| 350 | - title='Filter which OpenXML field commands are returned', | |
| 351 | - description='Only applies to OpenXML (e.g. docx) and rtf, not to OLE ' | |
| 352 | - '(e.g. .doc). These options are mutually exclusive, last ' | |
| 353 | - 'option found on command line overwrites earlier ones.') | |
| 361 | + title='Filter which OpenXML field commands are returned', | |
| 362 | + description='Only applies to OpenXML (e.g. docx) and rtf, not to OLE ' | |
| 363 | + '(e.g. .doc). These options are mutually exclusive, last ' | |
| 364 | + 'option found on command line overwrites earlier ones.') | |
| 354 | 365 | filter_group.add_argument('-d', '--dde-only', action='store_const', |
| 355 | 366 | dest='field_filter_mode', const=FIELD_FILTER_DDE, |
| 356 | 367 | help='Return only DDE and DDEAUTO fields') |
| 357 | 368 | filter_group.add_argument('-f', '--filter', action='store_const', |
| 358 | - dest='field_filter_mode', const=FIELD_FILTER_BLACKLIST, | |
| 359 | - help='Return all fields except harmless ones like PAGE') | |
| 369 | + dest='field_filter_mode', | |
| 370 | + const=FIELD_FILTER_BLACKLIST, | |
| 371 | + help='Return all fields except harmless ones') | |
| 360 | 372 | filter_group.add_argument('-a', '--all-fields', action='store_const', |
| 361 | 373 | dest='field_filter_mode', const=FIELD_FILTER_ALL, |
| 362 | - help='Return all fields, irrespective of their contents') | |
| 374 | + help='Return all fields, irrespective of their ' | |
| 375 | + 'contents') | |
| 363 | 376 | parser.set_defaults(field_filter_mode=FIELD_FILTER_DEFAULT) |
| 364 | 377 | |
| 365 | 378 | return parser.parse_args(cmd_line_args) |
| ... | ... | @@ -368,16 +381,19 @@ def process_args(cmd_line_args=None): |
| 368 | 381 | # === FUNCTIONS ============================================================== |
| 369 | 382 | |
| 370 | 383 | # from [MS-DOC], section 2.8.25 (PlcFld): |
| 371 | -# A field consists of two parts: field instructions and, optionally, a result. All fields MUST begin with | |
| 372 | -# Unicode character 0x0013 with sprmCFSpec applied with a value of 1. This is the field begin | |
| 373 | -# character. All fields MUST end with a Unicode character 0x0015 with sprmCFSpec applied with a value | |
| 374 | -# of 1. This is the field end character. If the field has a result, then there MUST be a Unicode character | |
| 375 | -# 0x0014 with sprmCFSpec applied with a value of 1 somewhere between the field begin character and | |
| 376 | -# the field end character. This is the field separator. The field result is the content between the field | |
| 377 | -# separator and the field end character. The field instructions are the content between the field begin | |
| 378 | -# character and the field separator, if one is present, or between the field begin character and the field | |
| 379 | -# end character if no separator is present. The field begin character, field end character, and field | |
| 380 | -# separator are collectively referred to as field characters. | |
| 384 | +# A field consists of two parts: field instructions and, optionally, a result. | |
| 385 | +# All fields MUST begin with Unicode character 0x0013 with sprmCFSpec applied | |
| 386 | +# with a value of 1. This is the field begin character. All fields MUST end | |
| 387 | +# with a Unicode character 0x0015 with sprmCFSpec applied with a value of 1. | |
| 388 | +# This is the field end character. If the field has a result, then there MUST | |
| 389 | +# be a Unicode character 0x0014 with sprmCFSpec applied with a value of 1 | |
| 390 | +# somewhere between the field begin character and the field end character. This | |
| 391 | +# is the field separator. The field result is the content between the field | |
| 392 | +# separator and the field end character. The field instructions are the content | |
| 393 | +# between the field begin character and the field separator, if one is present, | |
| 394 | +# or between the field begin character and the field end character if no | |
| 395 | +# separator is present. The field begin character, field end character, and | |
| 396 | +# field separator are collectively referred to as field characters. | |
| 381 | 397 | |
| 382 | 398 | |
| 383 | 399 | def process_doc_field(data): |
| ... | ... | @@ -387,7 +403,6 @@ def process_doc_field(data): |
| 387 | 403 | log.debug('processing field \'{0}\''.format(data)) |
| 388 | 404 | |
| 389 | 405 | if data.lstrip().lower().startswith(u'dde'): |
| 390 | - #log.debug('--> is DDE!') | |
| 391 | 406 | return data |
| 392 | 407 | elif data.lstrip().lower().startswith(u'\x00d\x00d\x00e\x00'): |
| 393 | 408 | return data |
| ... | ... | @@ -512,7 +527,6 @@ def process_doc(filepath): |
| 512 | 527 | return u'\n'.join(links) |
| 513 | 528 | |
| 514 | 529 | |
| 515 | - | |
| 516 | 530 | def process_xls(filepath): |
| 517 | 531 | """ find dde links in excel ole file """ |
| 518 | 532 | |
| ... | ... | @@ -531,6 +545,7 @@ def process_xls(filepath): |
| 531 | 545 | |
| 532 | 546 | |
| 533 | 547 | def process_docx(filepath, field_filter_mode=None): |
| 548 | + """ find dde-links (and other fields) in Word 2007+ files """ | |
| 534 | 549 | log.debug('process_docx') |
| 535 | 550 | all_fields = [] |
| 536 | 551 | with zipfile.ZipFile(filepath) as z: |
| ... | ... | @@ -539,9 +554,6 @@ def process_docx(filepath, field_filter_mode=None): |
| 539 | 554 | data = z.read(filepath) |
| 540 | 555 | fields = process_xml(data) |
| 541 | 556 | if len(fields) > 0: |
| 542 | - #print ('DDE Links in %s:'%filepath) | |
| 543 | - #for f in fields: | |
| 544 | - # print(f) | |
| 545 | 557 | all_fields.extend(fields) |
| 546 | 558 | |
| 547 | 559 | # apply field command filter |
| ... | ... | @@ -560,8 +572,10 @@ def process_docx(filepath, field_filter_mode=None): |
| 560 | 572 | .format(field_filter_mode)) |
| 561 | 573 | |
| 562 | 574 | return u'\n'.join(clean_fields) |
| 563 | - | |
| 575 | + | |
| 576 | + | |
| 564 | 577 | def process_xml(data): |
| 578 | + """ Find dde-links and other fields in office XML data """ | |
| 565 | 579 | # parse the XML data: |
| 566 | 580 | root = ET.fromstring(data) |
| 567 | 581 | fields = [] |
| ... | ... | @@ -569,17 +583,18 @@ def process_xml(data): |
| 569 | 583 | level = 0 |
| 570 | 584 | # find all the tags 'w:p': |
| 571 | 585 | # parse each for begin and end tags, to group DDE strings |
| 572 | - # fldChar can be in either a w:r element, floating alone in the w:p or spread accross w:p tags | |
| 586 | + # fldChar can be in either a w:r element, floating alone in the w:p | |
| 587 | + # or spread accross w:p tags | |
| 573 | 588 | # escape DDE if quoted etc |
| 574 | 589 | # (each is a chunk of a DDE link) |
| 575 | 590 | |
| 576 | 591 | for subs in root.iter(TAG_W_P): |
| 577 | 592 | elem = None |
| 578 | 593 | for e in subs: |
| 579 | - #check if w:r and if it is parse children elements to pull out the first FLDCHAR or INSTRTEXT | |
| 580 | 594 | if e.tag == TAG_W_R: |
| 581 | 595 | for child in e: |
| 582 | 596 | if child.tag == TAG_W_FLDCHAR or child.tag == TAG_W_INSTRTEXT: |
| 597 | + # check if w:r; parse children to pull out first FLDCHAR/INSTRTEXT | |
| 583 | 598 | elem = child |
| 584 | 599 | break |
| 585 | 600 | else: |
| ... | ... | @@ -587,21 +602,21 @@ def process_xml(data): |
| 587 | 602 | #this should be an error condition |
| 588 | 603 | if elem is None: |
| 589 | 604 | continue |
| 590 | - | |
| 591 | - #check if FLDCHARTYPE and whether "begin" or "end" tag | |
| 605 | + | |
| 606 | + # check if FLDCHARTYPE and whether "begin" or "end" tag | |
| 592 | 607 | if elem.attrib.get(ATTR_W_FLDCHARTYPE) is not None: |
| 593 | 608 | if elem.attrib[ATTR_W_FLDCHARTYPE] == "begin": |
| 594 | - level += 1 | |
| 609 | + level += 1 | |
| 595 | 610 | if elem.attrib[ATTR_W_FLDCHARTYPE] == "end": |
| 596 | 611 | level -= 1 |
| 597 | - if level == 0 or level == -1 : # edge-case where level becomes -1 | |
| 612 | + if level == 0 or level == -1: # edge-case; level becomes -1 | |
| 598 | 613 | fields.append(ddetext) |
| 599 | 614 | ddetext = u'' |
| 600 | - level = 0 # reset edge-case | |
| 601 | - | |
| 615 | + level = 0 # reset edge-case | |
| 616 | + | |
| 602 | 617 | # concatenate the text of the field, if present: |
| 603 | 618 | if elem.tag == TAG_W_INSTRTEXT and elem.text is not None: |
| 604 | - #expand field code if QUOTED | |
| 619 | + # expand field code if QUOTED | |
| 605 | 620 | ddetext += unquote(elem.text) |
| 606 | 621 | |
| 607 | 622 | for elem in root.iter(TAG_W_FLDSIMPLE): |
| ... | ... | @@ -611,10 +626,11 @@ def process_xml(data): |
| 611 | 626 | |
| 612 | 627 | return fields |
| 613 | 628 | |
| 614 | -def unquote(field): | |
| 629 | + | |
| 630 | +def unquote(field): | |
| 615 | 631 | if "QUOTE" not in field or NO_QUOTES: |
| 616 | 632 | return field |
| 617 | - #split into components | |
| 633 | + # split into components | |
| 618 | 634 | parts = field.strip().split(" ") |
| 619 | 635 | ddestr = "" |
| 620 | 636 | for p in parts[1:]: |
| ... | ... | @@ -625,11 +641,13 @@ def unquote(field): |
| 625 | 641 | ddestr += ch |
| 626 | 642 | return ddestr |
| 627 | 643 | |
| 644 | + | |
| 628 | 645 | # "static variables" for field_is_blacklisted: |
| 629 | 646 | FIELD_WORD_REGEX = re.compile(r'"[^"]*"|\S+') |
| 630 | 647 | FIELD_BLACKLIST_CMDS = tuple(field[0].lower() for field in FIELD_BLACKLIST) |
| 631 | 648 | FIELD_SWITCH_REGEX = re.compile(r'^\\[\w#*@]$') |
| 632 | 649 | |
| 650 | + | |
| 633 | 651 | def field_is_blacklisted(contents): |
| 634 | 652 | """ Check if given field contents matches any in FIELD_BLACKLIST |
| 635 | 653 | |
| ... | ... | @@ -651,7 +669,7 @@ def field_is_blacklisted(contents): |
| 651 | 669 | index = FIELD_BLACKLIST_CMDS.index(words[0].lower()) |
| 652 | 670 | except ValueError: # first word is no blacklisted command |
| 653 | 671 | return False |
| 654 | - log.debug('trying to match "{0}" to blacklist command {0}' | |
| 672 | + log.debug('trying to match "{0}" to blacklist command {1}' | |
| 655 | 673 | .format(contents, FIELD_BLACKLIST[index])) |
| 656 | 674 | _, nargs_required, nargs_optional, sw_with_arg, sw_solo, sw_format \ |
| 657 | 675 | = FIELD_BLACKLIST[index] |
| ... | ... | @@ -706,14 +724,15 @@ def field_is_blacklisted(contents): |
| 706 | 724 | if 'numeric' in sw_format: |
| 707 | 725 | arg_choices = [] # too many choices to list them here |
| 708 | 726 | else: |
| 709 | - log.debug('unexpected switch {0} in "{1}"'.format(switch, contents)) | |
| 727 | + log.debug('unexpected switch {0} in "{1}"' | |
| 728 | + .format(switch, contents)) | |
| 710 | 729 | return False |
| 711 | 730 | |
| 712 | 731 | # if nothing went wrong sofar, the contents seems to match the blacklist |
| 713 | 732 | return True |
| 714 | 733 | |
| 715 | 734 | |
| 716 | -def process_xlsx(filepath, filed_filter_mode=None): | |
| 735 | +def process_xlsx(filepath): | |
| 717 | 736 | """ process an OOXML excel file (e.g. .xlsx or .xlsb or .xlsm) """ |
| 718 | 737 | dde_links = [] |
| 719 | 738 | parser = ooxml.XmlParser(filepath) |
| ... | ... | @@ -733,7 +752,8 @@ def process_xlsx(filepath, filed_filter_mode=None): |
| 733 | 752 | try: |
| 734 | 753 | logging.info('Parsing non-xml subfile {0} with content type {1}' |
| 735 | 754 | .format(subfile, content_type)) |
| 736 | - for record in xls_parser.parse_xlsb_part(handle, content_type, subfile): | |
| 755 | + for record in xls_parser.parse_xlsb_part(handle, content_type, | |
| 756 | + subfile): | |
| 737 | 757 | logging.debug('{0}: {1}'.format(subfile, record)) |
| 738 | 758 | if isinstance(record, xls_parser.XlsbBeginSupBook) and \ |
| 739 | 759 | record.link_type == \ |
| ... | ... | @@ -791,8 +811,10 @@ class RtfFieldParser(rtfobj.RtfParser): |
| 791 | 811 | |
| 792 | 812 | RTF_START = b'\x7b\x5c\x72\x74' # == b'{\rt' but does not mess up auto-indent |
| 793 | 813 | |
| 814 | + | |
| 794 | 815 | def process_rtf(file_handle, field_filter_mode=None): |
| 795 | 816 | log.debug('process_rtf') |
| 817 | + """ find dde links or other fields in rtf file """ | |
| 796 | 818 | all_fields = [] |
| 797 | 819 | data = RTF_START + file_handle.read() # read complete file into memory! |
| 798 | 820 | file_handle.close() |
| ... | ... | @@ -828,7 +850,7 @@ def process_file(filepath, field_filter_mode=None): |
| 828 | 850 | return process_doc(filepath) |
| 829 | 851 | |
| 830 | 852 | with open(filepath, 'rb') as file_handle: |
| 831 | - if file_handle.read(4) == RTF_START: | |
| 853 | + if file_handle.read(4) == RTF_START: | |
| 832 | 854 | # This is a RTF file |
| 833 | 855 | return process_rtf(file_handle, field_filter_mode) |
| 834 | 856 | |
| ... | ... | @@ -846,7 +868,7 @@ def process_file(filepath, field_filter_mode=None): |
| 846 | 868 | return process_docx(filepath, field_filter_mode) |
| 847 | 869 | |
| 848 | 870 | |
| 849 | -#=== MAIN ================================================================= | |
| 871 | +# === MAIN ================================================================= | |
| 850 | 872 | |
| 851 | 873 | def main(cmd_line_args=None): |
| 852 | 874 | """ Main function, called if this file is called as a script |
| ... | ... | @@ -868,10 +890,10 @@ def main(cmd_line_args=None): |
| 868 | 890 | if args.json and args.loglevel.lower() == 'debug': |
| 869 | 891 | log.warning('Debug log output will not be json-compatible!') |
| 870 | 892 | |
| 871 | - if args.nounquote : | |
| 893 | + if args.nounquote: | |
| 872 | 894 | global NO_QUOTES |
| 873 | 895 | NO_QUOTES = True |
| 874 | - | |
| 896 | + | |
| 875 | 897 | if args.json: |
| 876 | 898 | jout = [] |
| 877 | 899 | jout.append(BANNER_JSON) |
| ... | ... | @@ -890,7 +912,7 @@ def main(cmd_line_args=None): |
| 890 | 912 | except Exception as exc: |
| 891 | 913 | if args.json: |
| 892 | 914 | jout.append(dict(type='error', error=type(exc).__name__, |
| 893 | - message=str(exc))) # strange: str(exc) is enclosed in "" | |
| 915 | + message=str(exc))) | |
| 894 | 916 | else: |
| 895 | 917 | raise # re-raise last known exception, keeping trace intact |
| 896 | 918 | ... | ... |