Commit 6028d9ab1106a188c91e9aff496df9e745726d43

Authored by Christian Herdtweck
1 parent 56ed93a1

msodde: clean-up code following pep8 and pylint (1)

- disable pylint-whitespace-check from FIELD_BLACKLIST
- shortened almost all lines to max 79 chars (except pylint: disable-*)
- moved imports further up
- re-wrap a few lines
- add missing doc strings
- add/remove whitespace
- remove old commented debug-log/print statements
Showing 1 changed file with 136 additions and 114 deletions
oletools/msodde.py
... ... @@ -17,30 +17,31 @@ msodde is part of the python-oletools package:
17 17 http://www.decalage.info/python/oletools
18 18 """
19 19  
20   -# === LICENSE ==================================================================
  20 +# === LICENSE =================================================================
21 21  
22 22 # msodde is copyright (c) 2017 Philippe Lagadec (http://www.decalage.info)
23 23 # All rights reserved.
24 24 #
25   -# Redistribution and use in source and binary forms, with or without modification,
26   -# are permitted provided that the following conditions are met:
  25 +# Redistribution and use in source and binary forms, with or without
  26 +# modification, are permitted provided that the following conditions are met:
27 27 #
28   -# * Redistributions of source code must retain the above copyright notice, this
29   -# list of conditions and the following disclaimer.
  28 +# * Redistributions of source code must retain the above copyright notice,
  29 +# this list of conditions and the following disclaimer.
30 30 # * Redistributions in binary form must reproduce the above copyright notice,
31 31 # this list of conditions and the following disclaimer in the documentation
32 32 # and/or other materials provided with the distribution.
33 33 #
34   -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
35   -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
36   -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
37   -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
38   -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39   -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
40   -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
41   -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
42   -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
43   -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  34 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  35 +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  36 +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  37 +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  38 +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  39 +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  40 +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  41 +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  42 +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  43 +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  44 +# POSSIBILITY OF SUCH DAMAGE.
44 45  
45 46 from __future__ import print_function
46 47  
... ... @@ -49,7 +50,8 @@ from __future__ import print_function
49 50 # 2017-10-18 v0.52 PL: - first version
50 51 # 2017-10-20 PL: - fixed issue #202 (handling empty xml tags)
51 52 # 2017-10-23 ES: - add check for fldSimple codes
52   -# 2017-10-24 ES: - group tags and track begin/end tags to keep DDE strings together
  53 +# 2017-10-24 ES: - group tags and track begin/end tags to keep DDE
  54 +# strings together
53 55 # 2017-10-25 CH: - add json output
54 56 # 2017-10-25 CH: - parse doc
55 57 # PL: - added logging
... ... @@ -62,7 +64,7 @@ from __future__ import print_function
62 64  
63 65 __version__ = '0.52dev9'
64 66  
65   -#------------------------------------------------------------------------------
  67 +# -----------------------------------------------------------------------------
66 68 # TODO: field codes can be in headers/footers/comments - parse these
67 69 # TODO: generalize behaviour for xlsx: find all external links (maybe rename
68 70 # command line flag for "blacklist" to "find all suspicious" or so)
... ... @@ -71,7 +73,7 @@ __version__ = '0.52dev9'
71 73 # DDE-Links
72 74 # TODO: avoid reading complete rtf file data into memory
73 75  
74   -#------------------------------------------------------------------------------
  76 +# -----------------------------------------------------------------------------
75 77 # REFERENCES:
76 78  
77 79  
... ... @@ -123,7 +125,9 @@ TAG_W_P = "{%s}p" % NS_WORD
123 125 TAG_W_R = "{%s}r" % NS_WORD
124 126 ATTR_W_INSTR = '{%s}instr' % NS_WORD
125 127 ATTR_W_FLDCHARTYPE = '{%s}fldCharType' % NS_WORD
126   -LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/header1.xml','word/footer1.xml','word/header2.xml','word/footer2.xml','word/comments.xml']
  128 +LOCATIONS = ('word/document.xml', 'word/endnotes.xml', 'word/footnotes.xml',
  129 + 'word/header1.xml', 'word/footer1.xml', 'word/header2.xml',
  130 + 'word/footer2.xml', 'word/comments.xml')
127 131  
128 132 # list of acceptable, harmless field instructions for blacklist field mode
129 133 # c.f. http://officeopenxml.com/WPfieldInstructions.php or the official
... ... @@ -133,73 +137,74 @@ LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/
133 137 # switches_with_args, switches_without_args, format_switches)
134 138 FIELD_BLACKLIST = (
135 139 # date and time:
136   - ('CREATEDATE', 0, 0, '', 'hs', 'datetime'),
137   - ('DATE', 0, 0, '', 'hls', 'datetime'),
138   - ('EDITTIME', 0, 0, '', '', 'numeric'),
139   - ('PRINTDATE', 0, 0, '', 'hs', 'datetime'),
140   - ('SAVEDATE', 0, 0, '', 'hs', 'datetime'),
141   - ('TIME', 0, 0, '', '', 'datetime'),
  140 + ('CREATEDATE', 0, 0, '', 'hs', 'datetime'), # pylint: disable=bad-whitespace
  141 + ('DATE', 0, 0, '', 'hls', 'datetime'), # pylint: disable=bad-whitespace
  142 + ('EDITTIME', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace
  143 + ('PRINTDATE', 0, 0, '', 'hs', 'datetime'), # pylint: disable=bad-whitespace
  144 + ('SAVEDATE', 0, 0, '', 'hs', 'datetime'), # pylint: disable=bad-whitespace
  145 + ('TIME', 0, 0, '', '', 'datetime'), # pylint: disable=bad-whitespace
142 146 # exclude document automation (we hate the "auto" in "automation")
143 147 # (COMPARE, DOCVARIABLE, GOTOBUTTON, IF, MACROBUTTON, PRINT)
144 148 # document information
145   - ('AUTHOR', 0, 1, '', '', 'string'),
146   - ('COMMENTS', 0, 1, '', '', 'string'),
147   - ('DOCPROPERTY', 1, 0, '', '', 'string/numeric/datetime'),
148   - ('FILENAME', 0, 0, '', 'p', 'string'),
149   - ('FILESIZE', 0, 0, '', 'km', 'numeric'),
150   - ('KEYWORDS', 0, 1, '', '', 'string'),
151   - ('LASTSAVEDBY', 0, 0, '', '', 'string'),
152   - ('NUMCHARS', 0, 0, '', '', 'numeric'),
153   - ('NUMPAGES', 0, 0, '', '', 'numeric'),
154   - ('NUMWORDS', 0, 0, '', '', 'numeric'),
155   - ('SUBJECT', 0, 1, '', '', 'string'),
156   - ('TEMPLATE', 0, 0, '', 'p', 'string'),
157   - ('TITLE', 0, 1, '', '', 'string'),
  149 + ('AUTHOR', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace
  150 + ('COMMENTS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace
  151 + ('DOCPROPERTY', 1, 0, '', '', 'string/numeric/datetime'), # pylint: disable=bad-whitespace
  152 + ('FILENAME', 0, 0, '', 'p', 'string'), # pylint: disable=bad-whitespace
  153 + ('FILESIZE', 0, 0, '', 'km', 'numeric'), # pylint: disable=bad-whitespace
  154 + ('KEYWORDS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace
  155 + ('LASTSAVEDBY', 0, 0, '', '', 'string'), # pylint: disable=bad-whitespace
  156 + ('NUMCHARS', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace
  157 + ('NUMPAGES', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace
  158 + ('NUMWORDS', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace
  159 + ('SUBJECT', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace
  160 + ('TEMPLATE', 0, 0, '', 'p', 'string'), # pylint: disable=bad-whitespace
  161 + ('TITLE', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace
158 162 # equations and formulas
159   - # exlude '=' formulae because they have different syntax
160   - ('ADVANCE', 0, 0, 'dlruxy', '', ''),
161   - ('SYMBOL', 1, 0, 'fs', 'ahju', ''),
  163 +    # exclude '=' formulae because they have different syntax (and can be bad)
  164 + ('ADVANCE', 0, 0, 'dlruxy', '', ''), # pylint: disable=bad-whitespace
  165 + ('SYMBOL', 1, 0, 'fs', 'ahju', ''), # pylint: disable=bad-whitespace
162 166 # form fields
163   - ('FORMCHECKBOX', 0, 0, '', '', ''),
164   - ('FORMDROPDOWN', 0, 0, '', '', ''),
165   - ('FORMTEXT', 0, 0, '', '', ''),
  167 + ('FORMCHECKBOX', 0, 0, '', '', ''), # pylint: disable=bad-whitespace
  168 + ('FORMDROPDOWN', 0, 0, '', '', ''), # pylint: disable=bad-whitespace
  169 + ('FORMTEXT', 0, 0, '', '', ''), # pylint: disable=bad-whitespace
166 170 # index and tables
167   - ('INDEX', 0, 0, 'bcdefghklpsz', 'ry', ''),
  171 + ('INDEX', 0, 0, 'bcdefghklpsz', 'ry', ''), # pylint: disable=bad-whitespace
168 172 # exlude RD since that imports data from other files
169   - ('TA', 0, 0, 'clrs', 'bi', ''),
170   - ('TC', 1, 0, 'fl', 'n', ''),
171   - ('TOA', 0, 0, 'bcdegls', 'fhp', ''),
172   - ('TOC', 0, 0, 'abcdflnopst', 'huwxz', ''),
173   - ('XE', 1, 0, 'frty', 'bi', ''),
  173 + ('TA', 0, 0, 'clrs', 'bi', ''), # pylint: disable=bad-whitespace
  174 + ('TC', 1, 0, 'fl', 'n', ''), # pylint: disable=bad-whitespace
  175 + ('TOA', 0, 0, 'bcdegls', 'fhp', ''), # pylint: disable=bad-whitespace
  176 + ('TOC', 0, 0, 'abcdflnopst', 'huwxz', ''), # pylint: disable=bad-whitespace
  177 + ('XE', 1, 0, 'frty', 'bi', ''), # pylint: disable=bad-whitespace
174 178 # links and references
175 179 # exclude AUTOTEXT and AUTOTEXTLIST since we do not like stuff with 'AUTO'
176   - ('BIBLIOGRAPHY', 0, 0, 'lfm', '', ''),
177   - ('CITATION', 1, 0, 'lfspvm', 'nty', ''),
  180 + ('BIBLIOGRAPHY', 0, 0, 'lfm', '', ''), # pylint: disable=bad-whitespace
  181 + ('CITATION', 1, 0, 'lfspvm', 'nty', ''), # pylint: disable=bad-whitespace
178 182 # exclude HYPERLINK since we are allergic to URLs
179 183 # exclude INCLUDEPICTURE and INCLUDETEXT (other file or maybe even URL?)
180 184 # exclude LINK and REF (could reference other files)
181   - ('NOTEREF', 1, 0, '', 'fhp', ''),
182   - ('PAGEREF', 1, 0, '', 'hp', ''),
183   - ('QUOTE', 1, 0, '', '', 'datetime'),
184   - ('STYLEREF', 1, 0, '', 'lnprtw', ''),
  185 + ('NOTEREF', 1, 0, '', 'fhp', ''), # pylint: disable=bad-whitespace
  186 + ('PAGEREF', 1, 0, '', 'hp', ''), # pylint: disable=bad-whitespace
  187 + ('QUOTE', 1, 0, '', '', 'datetime'), # pylint: disable=bad-whitespace
  188 + ('STYLEREF', 1, 0, '', 'lnprtw', ''), # pylint: disable=bad-whitespace
185 189 # exclude all Mail Merge commands since they import data from other files
186 190 # (ADDRESSBLOCK, ASK, COMPARE, DATABASE, FILLIN, GREETINGLINE, IF,
187 191 # MERGEFIELD, MERGEREC, MERGESEQ, NEXT, NEXTIF, SET, SKIPIF)
188 192 # Numbering
189   - ('LISTNUM', 0, 1, 'ls', '', ''),
190   - ('PAGE', 0, 0, '', '', 'numeric'),
191   - ('REVNUM', 0, 0, '', '', ''),
192   - ('SECTION', 0, 0, '', '', 'numeric'),
193   - ('SECTIONPAGES', 0, 0, '', '', 'numeric'),
194   - ('SEQ', 1, 1, 'rs', 'chn', 'numeric'),
195   - # user information
196   - ('USERADDRESS', 0, 1, '', '', 'string'),
197   - ('USERINITIALS', 0, 1, '', '', 'string'),
198   - ('USERNAME', 0, 1, '', '', 'string'),
  193 + ('LISTNUM', 0, 1, 'ls', '', ''), # pylint: disable=bad-whitespace
  194 + ('PAGE', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace
  195 + ('REVNUM', 0, 0, '', '', ''), # pylint: disable=bad-whitespace
  196 + ('SECTION', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace
  197 + ('SECTIONPAGES', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace
  198 + ('SEQ', 1, 1, 'rs', 'chn', 'numeric'), # pylint: disable=bad-whitespace
  199 +    # user information
  200 + ('USERADDRESS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace
  201 + ('USERINITIALS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace
  202 + ('USERNAME', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace
199 203 )
200 204  
201 205 FIELD_DDE_REGEX = re.compile(r'^\s*dde(auto)?\s+', re.I)
202 206  
  207 +# filter modes
203 208 FIELD_FILTER_DDE = 'only dde'
204 209 FIELD_FILTER_BLACKLIST = 'exclude blacklisted'
205 210 FIELD_FILTER_ALL = 'keep all'
... ... @@ -229,6 +234,7 @@ LOG_LEVELS = {
229 234 'critical': logging.CRITICAL
230 235 }
231 236  
  237 +
232 238 class NullHandler(logging.Handler):
233 239 """
234 240 Log Handler without output, to avoid printing messages if logging is not
... ... @@ -239,6 +245,7 @@ class NullHandler(logging.Handler):
239 245 def emit(self, record):
240 246 pass
241 247  
  248 +
242 249 def get_logger(name, level=logging.CRITICAL+1):
243 250 """
244 251 Create a suitable logger object for this module.
... ... @@ -251,7 +258,7 @@ def get_logger(name, level=logging.CRITICAL+1):
251 258 # First, test if there is already a logger with the same name, else it
252 259 # will generate duplicate messages (due to duplicate handlers):
253 260 if name in logging.Logger.manager.loggerDict:
254   - #NOTE: another less intrusive but more "hackish" solution would be to
  261 + # NOTE: another less intrusive but more "hackish" solution would be to
255 262 # use getLogger then test if its effective level is not default.
256 263 logger = logging.getLogger(name)
257 264 # make sure level is OK:
... ... @@ -338,28 +345,34 @@ def existing_file(filename):
338 345  
339 346 def process_args(cmd_line_args=None):
340 347 """ parse command line arguments (given ones or per default sys.argv) """
341   - parser = ArgParserWithBanner(description='A python tool to detect and extract DDE links in MS Office files')
  348 + parser = ArgParserWithBanner(description='A python tool to detect and '
  349 + 'extract DDE links in MS Office files')
342 350 parser.add_argument("filepath", help="path of the file to be analyzed",
343 351 type=existing_file, metavar='FILE')
344 352 parser.add_argument('-j', "--json", action='store_true',
345 353 help="Output in json format. Do not use with -ldebug")
346   - parser.add_argument("--nounquote", help="don't unquote values",action='store_true')
347   - parser.add_argument('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
348   - help="logging level debug/info/warning/error/critical (default=%(default)s)")
  354 + parser.add_argument("--nounquote", help="don't unquote values",
  355 + action='store_true')
  356 + parser.add_argument('-l', '--loglevel', dest="loglevel", action="store",
  357 + default=DEFAULT_LOG_LEVEL,
  358 + help="logging level debug/info/warning/error/critical "
  359 + "(default=%(default)s)")
349 360 filter_group = parser.add_argument_group(
350   - title='Filter which OpenXML field commands are returned',
351   - description='Only applies to OpenXML (e.g. docx) and rtf, not to OLE '
352   - '(e.g. .doc). These options are mutually exclusive, last '
353   - 'option found on command line overwrites earlier ones.')
  361 + title='Filter which OpenXML field commands are returned',
  362 + description='Only applies to OpenXML (e.g. docx) and rtf, not to OLE '
  363 + '(e.g. .doc). These options are mutually exclusive, last '
  364 + 'option found on command line overwrites earlier ones.')
354 365 filter_group.add_argument('-d', '--dde-only', action='store_const',
355 366 dest='field_filter_mode', const=FIELD_FILTER_DDE,
356 367 help='Return only DDE and DDEAUTO fields')
357 368 filter_group.add_argument('-f', '--filter', action='store_const',
358   - dest='field_filter_mode', const=FIELD_FILTER_BLACKLIST,
359   - help='Return all fields except harmless ones like PAGE')
  369 + dest='field_filter_mode',
  370 + const=FIELD_FILTER_BLACKLIST,
  371 + help='Return all fields except harmless ones')
360 372 filter_group.add_argument('-a', '--all-fields', action='store_const',
361 373 dest='field_filter_mode', const=FIELD_FILTER_ALL,
362   - help='Return all fields, irrespective of their contents')
  374 + help='Return all fields, irrespective of their '
  375 + 'contents')
363 376 parser.set_defaults(field_filter_mode=FIELD_FILTER_DEFAULT)
364 377  
365 378 return parser.parse_args(cmd_line_args)
... ... @@ -368,16 +381,19 @@ def process_args(cmd_line_args=None):
368 381 # === FUNCTIONS ==============================================================
369 382  
370 383 # from [MS-DOC], section 2.8.25 (PlcFld):
371   -# A field consists of two parts: field instructions and, optionally, a result. All fields MUST begin with
372   -# Unicode character 0x0013 with sprmCFSpec applied with a value of 1. This is the field begin
373   -# character. All fields MUST end with a Unicode character 0x0015 with sprmCFSpec applied with a value
374   -# of 1. This is the field end character. If the field has a result, then there MUST be a Unicode character
375   -# 0x0014 with sprmCFSpec applied with a value of 1 somewhere between the field begin character and
376   -# the field end character. This is the field separator. The field result is the content between the field
377   -# separator and the field end character. The field instructions are the content between the field begin
378   -# character and the field separator, if one is present, or between the field begin character and the field
379   -# end character if no separator is present. The field begin character, field end character, and field
380   -# separator are collectively referred to as field characters.
  384 +# A field consists of two parts: field instructions and, optionally, a result.
  385 +# All fields MUST begin with Unicode character 0x0013 with sprmCFSpec applied
  386 +# with a value of 1. This is the field begin character. All fields MUST end
  387 +# with a Unicode character 0x0015 with sprmCFSpec applied with a value of 1.
  388 +# This is the field end character. If the field has a result, then there MUST
  389 +# be a Unicode character 0x0014 with sprmCFSpec applied with a value of 1
  390 +# somewhere between the field begin character and the field end character. This
  391 +# is the field separator. The field result is the content between the field
  392 +# separator and the field end character. The field instructions are the content
  393 +# between the field begin character and the field separator, if one is present,
  394 +# or between the field begin character and the field end character if no
  395 +# separator is present. The field begin character, field end character, and
  396 +# field separator are collectively referred to as field characters.
381 397  
382 398  
383 399 def process_doc_field(data):
... ... @@ -387,7 +403,6 @@ def process_doc_field(data):
387 403 log.debug('processing field \'{0}\''.format(data))
388 404  
389 405 if data.lstrip().lower().startswith(u'dde'):
390   - #log.debug('--> is DDE!')
391 406 return data
392 407 elif data.lstrip().lower().startswith(u'\x00d\x00d\x00e\x00'):
393 408 return data
... ... @@ -512,7 +527,6 @@ def process_doc(filepath):
512 527 return u'\n'.join(links)
513 528  
514 529  
515   -
516 530 def process_xls(filepath):
517 531 """ find dde links in excel ole file """
518 532  
... ... @@ -531,6 +545,7 @@ def process_xls(filepath):
531 545  
532 546  
533 547 def process_docx(filepath, field_filter_mode=None):
  548 + """ find dde-links (and other fields) in Word 2007+ files """
534 549 log.debug('process_docx')
535 550 all_fields = []
536 551 with zipfile.ZipFile(filepath) as z:
... ... @@ -539,9 +554,6 @@ def process_docx(filepath, field_filter_mode=None):
539 554 data = z.read(filepath)
540 555 fields = process_xml(data)
541 556 if len(fields) > 0:
542   - #print ('DDE Links in %s:'%filepath)
543   - #for f in fields:
544   - # print(f)
545 557 all_fields.extend(fields)
546 558  
547 559 # apply field command filter
... ... @@ -560,8 +572,10 @@ def process_docx(filepath, field_filter_mode=None):
560 572 .format(field_filter_mode))
561 573  
562 574 return u'\n'.join(clean_fields)
563   -
  575 +
  576 +
564 577 def process_xml(data):
  578 + """ Find dde-links and other fields in office XML data """
565 579 # parse the XML data:
566 580 root = ET.fromstring(data)
567 581 fields = []
... ... @@ -569,17 +583,18 @@ def process_xml(data):
569 583 level = 0
570 584 # find all the tags 'w:p':
571 585 # parse each for begin and end tags, to group DDE strings
572   - # fldChar can be in either a w:r element, floating alone in the w:p or spread accross w:p tags
  586 + # fldChar can be in either a w:r element, floating alone in the w:p
  587 + # or spread across w:p tags
573 588 # escape DDE if quoted etc
574 589 # (each is a chunk of a DDE link)
575 590  
576 591 for subs in root.iter(TAG_W_P):
577 592 elem = None
578 593 for e in subs:
579   - #check if w:r and if it is parse children elements to pull out the first FLDCHAR or INSTRTEXT
580 594 if e.tag == TAG_W_R:
581 595 for child in e:
582 596 if child.tag == TAG_W_FLDCHAR or child.tag == TAG_W_INSTRTEXT:
  597 + # check if w:r; parse children to pull out first FLDCHAR/INSTRTEXT
583 598 elem = child
584 599 break
585 600 else:
... ... @@ -587,21 +602,21 @@ def process_xml(data):
587 602 #this should be an error condition
588 603 if elem is None:
589 604 continue
590   -
591   - #check if FLDCHARTYPE and whether "begin" or "end" tag
  605 +
  606 + # check if FLDCHARTYPE and whether "begin" or "end" tag
592 607 if elem.attrib.get(ATTR_W_FLDCHARTYPE) is not None:
593 608 if elem.attrib[ATTR_W_FLDCHARTYPE] == "begin":
594   - level += 1
  609 + level += 1
595 610 if elem.attrib[ATTR_W_FLDCHARTYPE] == "end":
596 611 level -= 1
597   - if level == 0 or level == -1 : # edge-case where level becomes -1
  612 + if level == 0 or level == -1: # edge-case; level becomes -1
598 613 fields.append(ddetext)
599 614 ddetext = u''
600   - level = 0 # reset edge-case
601   -
  615 + level = 0 # reset edge-case
  616 +
602 617 # concatenate the text of the field, if present:
603 618 if elem.tag == TAG_W_INSTRTEXT and elem.text is not None:
604   - #expand field code if QUOTED
  619 + # expand field code if QUOTED
605 620 ddetext += unquote(elem.text)
606 621  
607 622 for elem in root.iter(TAG_W_FLDSIMPLE):
... ... @@ -611,10 +626,11 @@ def process_xml(data):
611 626  
612 627 return fields
613 628  
614   -def unquote(field):
  629 +
  630 +def unquote(field):
615 631 if "QUOTE" not in field or NO_QUOTES:
616 632 return field
617   - #split into components
  633 + # split into components
618 634 parts = field.strip().split(" ")
619 635 ddestr = ""
620 636 for p in parts[1:]:
... ... @@ -625,11 +641,13 @@ def unquote(field):
625 641 ddestr += ch
626 642 return ddestr
627 643  
  644 +
628 645 # "static variables" for field_is_blacklisted:
629 646 FIELD_WORD_REGEX = re.compile(r'"[^"]*"|\S+')
630 647 FIELD_BLACKLIST_CMDS = tuple(field[0].lower() for field in FIELD_BLACKLIST)
631 648 FIELD_SWITCH_REGEX = re.compile(r'^\\[\w#*@]$')
632 649  
  650 +
633 651 def field_is_blacklisted(contents):
634 652 """ Check if given field contents matches any in FIELD_BLACKLIST
635 653  
... ... @@ -651,7 +669,7 @@ def field_is_blacklisted(contents):
651 669 index = FIELD_BLACKLIST_CMDS.index(words[0].lower())
652 670 except ValueError: # first word is no blacklisted command
653 671 return False
654   - log.debug('trying to match "{0}" to blacklist command {0}'
  672 + log.debug('trying to match "{0}" to blacklist command {1}'
655 673 .format(contents, FIELD_BLACKLIST[index]))
656 674 _, nargs_required, nargs_optional, sw_with_arg, sw_solo, sw_format \
657 675 = FIELD_BLACKLIST[index]
... ... @@ -706,14 +724,15 @@ def field_is_blacklisted(contents):
706 724 if 'numeric' in sw_format:
707 725 arg_choices = [] # too many choices to list them here
708 726 else:
709   - log.debug('unexpected switch {0} in "{1}"'.format(switch, contents))
  727 + log.debug('unexpected switch {0} in "{1}"'
  728 + .format(switch, contents))
710 729 return False
711 730  
712 731 # if nothing went wrong sofar, the contents seems to match the blacklist
713 732 return True
714 733  
715 734  
716   -def process_xlsx(filepath, filed_filter_mode=None):
  735 +def process_xlsx(filepath):
717 736 """ process an OOXML excel file (e.g. .xlsx or .xlsb or .xlsm) """
718 737 dde_links = []
719 738 parser = ooxml.XmlParser(filepath)
... ... @@ -733,7 +752,8 @@ def process_xlsx(filepath, filed_filter_mode=None):
733 752 try:
734 753 logging.info('Parsing non-xml subfile {0} with content type {1}'
735 754 .format(subfile, content_type))
736   - for record in xls_parser.parse_xlsb_part(handle, content_type, subfile):
  755 + for record in xls_parser.parse_xlsb_part(handle, content_type,
  756 + subfile):
737 757 logging.debug('{0}: {1}'.format(subfile, record))
738 758 if isinstance(record, xls_parser.XlsbBeginSupBook) and \
739 759 record.link_type == \
... ... @@ -791,8 +811,10 @@ class RtfFieldParser(rtfobj.RtfParser):
791 811  
792 812 RTF_START = b'\x7b\x5c\x72\x74' # == b'{\rt' but does not mess up auto-indent
793 813  
  814 +
794 815 def process_rtf(file_handle, field_filter_mode=None):
795 816 log.debug('process_rtf')
  817 + """ find dde links or other fields in rtf file """
796 818 all_fields = []
797 819 data = RTF_START + file_handle.read() # read complete file into memory!
798 820 file_handle.close()
... ... @@ -828,7 +850,7 @@ def process_file(filepath, field_filter_mode=None):
828 850 return process_doc(filepath)
829 851  
830 852 with open(filepath, 'rb') as file_handle:
831   - if file_handle.read(4) == RTF_START:
  853 + if file_handle.read(4) == RTF_START:
832 854 # This is a RTF file
833 855 return process_rtf(file_handle, field_filter_mode)
834 856  
... ... @@ -846,7 +868,7 @@ def process_file(filepath, field_filter_mode=None):
846 868 return process_docx(filepath, field_filter_mode)
847 869  
848 870  
849   -#=== MAIN =================================================================
  871 +# === MAIN =================================================================
850 872  
851 873 def main(cmd_line_args=None):
852 874 """ Main function, called if this file is called as a script
... ... @@ -868,10 +890,10 @@ def main(cmd_line_args=None):
868 890 if args.json and args.loglevel.lower() == 'debug':
869 891 log.warning('Debug log output will not be json-compatible!')
870 892  
871   - if args.nounquote :
  893 + if args.nounquote:
872 894 global NO_QUOTES
873 895 NO_QUOTES = True
874   -
  896 +
875 897 if args.json:
876 898 jout = []
877 899 jout.append(BANNER_JSON)
... ... @@ -890,7 +912,7 @@ def main(cmd_line_args=None):
890 912 except Exception as exc:
891 913 if args.json:
892 914 jout.append(dict(type='error', error=type(exc).__name__,
893   - message=str(exc))) # strange: str(exc) is enclosed in ""
  915 + message=str(exc)))
894 916 else:
895 917 raise # re-raise last known exception, keeping trace intact
896 918  
... ...