From 6028d9ab1106a188c91e9aff496df9e745726d43 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 4 Jan 2018 16:06:22 +0100 Subject: [PATCH] msodde: clean-up code following pep8 and pylint (1) --- oletools/msodde.py | 250 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------------------------------------------------------------------------------ 1 file changed, 136 insertions(+), 114 deletions(-) diff --git a/oletools/msodde.py b/oletools/msodde.py index 2451efa..33980cd 100644 --- a/oletools/msodde.py +++ b/oletools/msodde.py @@ -17,30 +17,31 @@ msodde is part of the python-oletools package: http://www.decalage.info/python/oletools """ -# === LICENSE ================================================================== +# === LICENSE ================================================================= # msodde is copyright (c) 2017 Philippe Lagadec (http://www.decalage.info) # All rights reserved. # -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: # -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. from __future__ import print_function @@ -49,7 +50,8 @@ from __future__ import print_function # 2017-10-18 v0.52 PL: - first version # 2017-10-20 PL: - fixed issue #202 (handling empty xml tags) # 2017-10-23 ES: - add check for fldSimple codes -# 2017-10-24 ES: - group tags and track begin/end tags to keep DDE strings together +# 2017-10-24 ES: - group tags and track begin/end tags to keep DDE +# strings together # 2017-10-25 CH: - add json output # 2017-10-25 CH: - parse doc # PL: - added logging @@ -62,7 +64,7 @@ from __future__ import print_function __version__ = '0.52dev9' -#------------------------------------------------------------------------------ +# ----------------------------------------------------------------------------- # TODO: field codes can be in headers/footers/comments - parse these # TODO: generalize behaviour for xlsx: find all external links (maybe rename # command line flag for "blacklist" to "find all suspicious" or so) @@ -71,7 +73,7 @@ __version__ = '0.52dev9' # DDE-Links # TODO: avoid reading complete rtf file data into memory -#------------------------------------------------------------------------------ +# ----------------------------------------------------------------------------- # REFERENCES: @@ -123,7 +125,9 @@ TAG_W_P = "{%s}p" % NS_WORD TAG_W_R = "{%s}r" % NS_WORD ATTR_W_INSTR = '{%s}instr' % NS_WORD ATTR_W_FLDCHARTYPE = '{%s}fldCharType' % NS_WORD -LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/header1.xml','word/footer1.xml','word/header2.xml','word/footer2.xml','word/comments.xml'] +LOCATIONS = ('word/document.xml', 'word/endnotes.xml', 'word/footnotes.xml', + 'word/header1.xml', 'word/footer1.xml', 'word/header2.xml', + 'word/footer2.xml', 'word/comments.xml') # list of acceptable, harmless field instructions for blacklist field mode # c.f. http://officeopenxml.com/WPfieldInstructions.php or the official @@ -133,73 +137,74 @@ LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/ # switches_with_args, switches_without_args, format_switches) FIELD_BLACKLIST = ( # date and time: - ('CREATEDATE', 0, 0, '', 'hs', 'datetime'), - ('DATE', 0, 0, '', 'hls', 'datetime'), - ('EDITTIME', 0, 0, '', '', 'numeric'), - ('PRINTDATE', 0, 0, '', 'hs', 'datetime'), - ('SAVEDATE', 0, 0, '', 'hs', 'datetime'), - ('TIME', 0, 0, '', '', 'datetime'), + ('CREATEDATE', 0, 0, '', 'hs', 'datetime'), # pylint: disable=bad-whitespace + ('DATE', 0, 0, '', 'hls', 'datetime'), # pylint: disable=bad-whitespace + ('EDITTIME', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace + ('PRINTDATE', 0, 0, '', 'hs', 'datetime'), # pylint: disable=bad-whitespace + ('SAVEDATE', 0, 0, '', 'hs', 'datetime'), # pylint: disable=bad-whitespace + ('TIME', 0, 0, '', '', 'datetime'), # pylint: disable=bad-whitespace # exclude document automation (we hate the "auto" in "automation") # (COMPARE, DOCVARIABLE, GOTOBUTTON, IF, MACROBUTTON, PRINT) # document information - ('AUTHOR', 0, 1, '', '', 'string'), - ('COMMENTS', 0, 1, '', '', 'string'), - ('DOCPROPERTY', 1, 0, '', '', 'string/numeric/datetime'), - ('FILENAME', 0, 0, '', 'p', 'string'), - ('FILESIZE', 0, 0, '', 'km', 'numeric'), - ('KEYWORDS', 0, 1, '', '', 'string'), - ('LASTSAVEDBY', 0, 0, '', '', 'string'), - ('NUMCHARS', 0, 0, '', '', 'numeric'), - ('NUMPAGES', 0, 0, '', '', 'numeric'), - ('NUMWORDS', 0, 0, '', '', 'numeric'), - ('SUBJECT', 0, 1, '', '', 'string'), - ('TEMPLATE', 0, 0, '', 'p', 'string'), - ('TITLE', 0, 1, '', '', 'string'), + ('AUTHOR', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace + ('COMMENTS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace + ('DOCPROPERTY', 1, 0, '', '', 'string/numeric/datetime'), # pylint: disable=bad-whitespace + ('FILENAME', 0, 0, '', 'p', 'string'), # pylint: disable=bad-whitespace + ('FILESIZE', 0, 0, '', 'km', 'numeric'), # pylint: disable=bad-whitespace + ('KEYWORDS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace + ('LASTSAVEDBY', 0, 0, '', '', 'string'), # pylint: disable=bad-whitespace + ('NUMCHARS', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace + ('NUMPAGES', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace + ('NUMWORDS', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace + ('SUBJECT', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace + ('TEMPLATE', 0, 0, '', 'p', 'string'), # pylint: disable=bad-whitespace + ('TITLE', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace # equations and formulas - # exlude '=' formulae because they have different syntax - ('ADVANCE', 0, 0, 'dlruxy', '', ''), - ('SYMBOL', 1, 0, 'fs', 'ahju', ''), + # exlude '=' formulae because they have different syntax (and can be bad) + ('ADVANCE', 0, 0, 'dlruxy', '', ''), # pylint: disable=bad-whitespace + ('SYMBOL', 1, 0, 'fs', 'ahju', ''), # pylint: disable=bad-whitespace # form fields - ('FORMCHECKBOX', 0, 0, '', '', ''), - ('FORMDROPDOWN', 0, 0, '', '', ''), - ('FORMTEXT', 0, 0, '', '', ''), + ('FORMCHECKBOX', 0, 0, '', '', ''), # pylint: disable=bad-whitespace + ('FORMDROPDOWN', 0, 0, '', '', ''), # pylint: disable=bad-whitespace + ('FORMTEXT', 0, 0, '', '', ''), # pylint: disable=bad-whitespace # index and tables - ('INDEX', 0, 0, 'bcdefghklpsz', 'ry', ''), + ('INDEX', 0, 0, 'bcdefghklpsz', 'ry', ''), # pylint: disable=bad-whitespace # exlude RD since that imports data from other files - ('TA', 0, 0, 'clrs', 'bi', ''), - ('TC', 1, 0, 'fl', 'n', ''), - ('TOA', 0, 0, 'bcdegls', 'fhp', ''), - ('TOC', 0, 0, 'abcdflnopst', 'huwxz', ''), - ('XE', 1, 0, 'frty', 'bi', ''), + ('TA', 0, 0, 'clrs', 'bi', ''), # pylint: disable=bad-whitespace + ('TC', 1, 0, 'fl', 'n', ''), # pylint: disable=bad-whitespace + ('TOA', 0, 0, 'bcdegls', 'fhp', ''), # pylint: disable=bad-whitespace + ('TOC', 0, 0, 'abcdflnopst', 'huwxz', ''), # pylint: disable=bad-whitespace + ('XE', 1, 0, 'frty', 'bi', ''), # pylint: disable=bad-whitespace # links and references # exclude AUTOTEXT and AUTOTEXTLIST since we do not like stuff with 'AUTO' - ('BIBLIOGRAPHY', 0, 0, 'lfm', '', ''), - ('CITATION', 1, 0, 'lfspvm', 'nty', ''), + ('BIBLIOGRAPHY', 0, 0, 'lfm', '', ''), # pylint: disable=bad-whitespace + ('CITATION', 1, 0, 'lfspvm', 'nty', ''), # pylint: disable=bad-whitespace # exclude HYPERLINK since we are allergic to URLs # exclude INCLUDEPICTURE and INCLUDETEXT (other file or maybe even URL?) # exclude LINK and REF (could reference other files) - ('NOTEREF', 1, 0, '', 'fhp', ''), - ('PAGEREF', 1, 0, '', 'hp', ''), - ('QUOTE', 1, 0, '', '', 'datetime'), - ('STYLEREF', 1, 0, '', 'lnprtw', ''), + ('NOTEREF', 1, 0, '', 'fhp', ''), # pylint: disable=bad-whitespace + ('PAGEREF', 1, 0, '', 'hp', ''), # pylint: disable=bad-whitespace + ('QUOTE', 1, 0, '', '', 'datetime'), # pylint: disable=bad-whitespace + ('STYLEREF', 1, 0, '', 'lnprtw', ''), # pylint: disable=bad-whitespace # exclude all Mail Merge commands since they import data from other files # (ADDRESSBLOCK, ASK, COMPARE, DATABASE, FILLIN, GREETINGLINE, IF, # MERGEFIELD, MERGEREC, MERGESEQ, NEXT, NEXTIF, SET, SKIPIF) # Numbering - ('LISTNUM', 0, 1, 'ls', '', ''), - ('PAGE', 0, 0, '', '', 'numeric'), - ('REVNUM', 0, 0, '', '', ''), - ('SECTION', 0, 0, '', '', 'numeric'), - ('SECTIONPAGES', 0, 0, '', '', 'numeric'), - ('SEQ', 1, 1, 'rs', 'chn', 'numeric'), - # user information - ('USERADDRESS', 0, 1, '', '', 'string'), - ('USERINITIALS', 0, 1, '', '', 'string'), - ('USERNAME', 0, 1, '', '', 'string'), + ('LISTNUM', 0, 1, 'ls', '', ''), # pylint: disable=bad-whitespace + ('PAGE', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace + ('REVNUM', 0, 0, '', '', ''), # pylint: disable=bad-whitespace + ('SECTION', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace + ('SECTIONPAGES', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace + ('SEQ', 1, 1, 'rs', 'chn', 'numeric'), # pylint: disable=bad-whitespace + # user information # pylint: disable=bad-whitespace + ('USERADDRESS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace + ('USERINITIALS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace + ('USERNAME', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace ) FIELD_DDE_REGEX = re.compile(r'^\s*dde(auto)?\s+', re.I) +# filter modes FIELD_FILTER_DDE = 'only dde' FIELD_FILTER_BLACKLIST = 'exclude blacklisted' FIELD_FILTER_ALL = 'keep all' @@ -229,6 +234,7 @@ LOG_LEVELS = { 'critical': logging.CRITICAL } + class NullHandler(logging.Handler): """ Log Handler without output, to avoid printing messages if logging is not @@ -239,6 +245,7 @@ class NullHandler(logging.Handler): def emit(self, record): pass + def get_logger(name, level=logging.CRITICAL+1): """ Create a suitable logger object for this module. @@ -251,7 +258,7 @@ def get_logger(name, level=logging.CRITICAL+1): # First, test if there is already a logger with the same name, else it # will generate duplicate messages (due to duplicate handlers): if name in logging.Logger.manager.loggerDict: - #NOTE: another less intrusive but more "hackish" solution would be to + # NOTE: another less intrusive but more "hackish" solution would be to # use getLogger then test if its effective level is not default. logger = logging.getLogger(name) # make sure level is OK: @@ -338,28 +345,34 @@ def existing_file(filename): def process_args(cmd_line_args=None): """ parse command line arguments (given ones or per default sys.argv) """ - parser = ArgParserWithBanner(description='A python tool to detect and extract DDE links in MS Office files') + parser = ArgParserWithBanner(description='A python tool to detect and ' + 'extract DDE links in MS Office files') parser.add_argument("filepath", help="path of the file to be analyzed", type=existing_file, metavar='FILE') parser.add_argument('-j', "--json", action='store_true', help="Output in json format. Do not use with -ldebug") - parser.add_argument("--nounquote", help="don't unquote values",action='store_true') - parser.add_argument('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, - help="logging level debug/info/warning/error/critical (default=%(default)s)") + parser.add_argument("--nounquote", help="don't unquote values", + action='store_true') + parser.add_argument('-l', '--loglevel', dest="loglevel", action="store", + default=DEFAULT_LOG_LEVEL, + help="logging level debug/info/warning/error/critical " + "(default=%(default)s)") filter_group = parser.add_argument_group( - title='Filter which OpenXML field commands are returned', - description='Only applies to OpenXML (e.g. docx) and rtf, not to OLE ' - '(e.g. .doc). These options are mutually exclusive, last ' - 'option found on command line overwrites earlier ones.') + title='Filter which OpenXML field commands are returned', + description='Only applies to OpenXML (e.g. docx) and rtf, not to OLE ' + '(e.g. .doc). These options are mutually exclusive, last ' + 'option found on command line overwrites earlier ones.') filter_group.add_argument('-d', '--dde-only', action='store_const', dest='field_filter_mode', const=FIELD_FILTER_DDE, help='Return only DDE and DDEAUTO fields') filter_group.add_argument('-f', '--filter', action='store_const', - dest='field_filter_mode', const=FIELD_FILTER_BLACKLIST, - help='Return all fields except harmless ones like PAGE') + dest='field_filter_mode', + const=FIELD_FILTER_BLACKLIST, + help='Return all fields except harmless ones') filter_group.add_argument('-a', '--all-fields', action='store_const', dest='field_filter_mode', const=FIELD_FILTER_ALL, - help='Return all fields, irrespective of their contents') + help='Return all fields, irrespective of their ' + 'contents') parser.set_defaults(field_filter_mode=FIELD_FILTER_DEFAULT) return parser.parse_args(cmd_line_args) @@ -368,16 +381,19 @@ def process_args(cmd_line_args=None): # === FUNCTIONS ============================================================== # from [MS-DOC], section 2.8.25 (PlcFld): -# A field consists of two parts: field instructions and, optionally, a result. All fields MUST begin with -# Unicode character 0x0013 with sprmCFSpec applied with a value of 1. This is the field begin -# character. All fields MUST end with a Unicode character 0x0015 with sprmCFSpec applied with a value -# of 1. This is the field end character. If the field has a result, then there MUST be a Unicode character -# 0x0014 with sprmCFSpec applied with a value of 1 somewhere between the field begin character and -# the field end character. This is the field separator. The field result is the content between the field -# separator and the field end character. The field instructions are the content between the field begin -# character and the field separator, if one is present, or between the field begin character and the field -# end character if no separator is present. The field begin character, field end character, and field -# separator are collectively referred to as field characters. +# A field consists of two parts: field instructions and, optionally, a result. +# All fields MUST begin with Unicode character 0x0013 with sprmCFSpec applied +# with a value of 1. This is the field begin character. All fields MUST end +# with a Unicode character 0x0015 with sprmCFSpec applied with a value of 1. +# This is the field end character. If the field has a result, then there MUST +# be a Unicode character 0x0014 with sprmCFSpec applied with a value of 1 +# somewhere between the field begin character and the field end character. This +# is the field separator. The field result is the content between the field +# separator and the field end character. The field instructions are the content +# between the field begin character and the field separator, if one is present, +# or between the field begin character and the field end character if no +# separator is present. The field begin character, field end character, and +# field separator are collectively referred to as field characters. def process_doc_field(data): @@ -387,7 +403,6 @@ def process_doc_field(data): log.debug('processing field \'{0}\''.format(data)) if data.lstrip().lower().startswith(u'dde'): - #log.debug('--> is DDE!') return data elif data.lstrip().lower().startswith(u'\x00d\x00d\x00e\x00'): return data @@ -512,7 +527,6 @@ def process_doc(filepath): return u'\n'.join(links) - def process_xls(filepath): """ find dde links in excel ole file """ @@ -531,6 +545,7 @@ def process_xls(filepath): def process_docx(filepath, field_filter_mode=None): + """ find dde-links (and other fields) in Word 2007+ files """ log.debug('process_docx') all_fields = [] with zipfile.ZipFile(filepath) as z: @@ -539,9 +554,6 @@ def process_docx(filepath, field_filter_mode=None): data = z.read(filepath) fields = process_xml(data) if len(fields) > 0: - #print ('DDE Links in %s:'%filepath) - #for f in fields: - # print(f) all_fields.extend(fields) # apply field command filter @@ -560,8 +572,10 @@ def process_docx(filepath, field_filter_mode=None): .format(field_filter_mode)) return u'\n'.join(clean_fields) - + + def process_xml(data): + """ Find dde-links and other fields in office XML data """ # parse the XML data: root = ET.fromstring(data) fields = [] @@ -569,17 +583,18 @@ def process_xml(data): level = 0 # find all the tags 'w:p': # parse each for begin and end tags, to group DDE strings - # fldChar can be in either a w:r element, floating alone in the w:p or spread accross w:p tags + # fldChar can be in either a w:r element, floating alone in the w:p + # or spread accross w:p tags # escape DDE if quoted etc # (each is a chunk of a DDE link) for subs in root.iter(TAG_W_P): elem = None for e in subs: - #check if w:r and if it is parse children elements to pull out the first FLDCHAR or INSTRTEXT if e.tag == TAG_W_R: for child in e: if child.tag == TAG_W_FLDCHAR or child.tag == TAG_W_INSTRTEXT: + # check if w:r; parse children to pull out first FLDCHAR/INSTRTEXT elem = child break else: @@ -587,21 +602,21 @@ def process_xml(data): #this should be an error condition if elem is None: continue - - #check if FLDCHARTYPE and whether "begin" or "end" tag + + # check if FLDCHARTYPE and whether "begin" or "end" tag if elem.attrib.get(ATTR_W_FLDCHARTYPE) is not None: if elem.attrib[ATTR_W_FLDCHARTYPE] == "begin": - level += 1 + level += 1 if elem.attrib[ATTR_W_FLDCHARTYPE] == "end": level -= 1 - if level == 0 or level == -1 : # edge-case where level becomes -1 + if level == 0 or level == -1: # edge-case; level becomes -1 fields.append(ddetext) ddetext = u'' - level = 0 # reset edge-case - + level = 0 # reset edge-case + # concatenate the text of the field, if present: if elem.tag == TAG_W_INSTRTEXT and elem.text is not None: - #expand field code if QUOTED + # expand field code if QUOTED ddetext += unquote(elem.text) for elem in root.iter(TAG_W_FLDSIMPLE): @@ -611,10 +626,11 @@ def process_xml(data): return fields -def unquote(field): + +def unquote(field): if "QUOTE" not in field or NO_QUOTES: return field - #split into components + # split into components parts = field.strip().split(" ") ddestr = "" for p in parts[1:]: @@ -625,11 +641,13 @@ def unquote(field): ddestr += ch return ddestr + # "static variables" for field_is_blacklisted: FIELD_WORD_REGEX = re.compile(r'"[^"]*"|\S+') FIELD_BLACKLIST_CMDS = tuple(field[0].lower() for field in FIELD_BLACKLIST) FIELD_SWITCH_REGEX = re.compile(r'^\\[\w#*@]$') + def field_is_blacklisted(contents): """ Check if given field contents matches any in FIELD_BLACKLIST @@ -651,7 +669,7 @@ def field_is_blacklisted(contents): index = FIELD_BLACKLIST_CMDS.index(words[0].lower()) except ValueError: # first word is no blacklisted command return False - log.debug('trying to match "{0}" to blacklist command {0}' + log.debug('trying to match "{0}" to blacklist command {1}' .format(contents, FIELD_BLACKLIST[index])) _, nargs_required, nargs_optional, sw_with_arg, sw_solo, sw_format \ = FIELD_BLACKLIST[index] @@ -706,14 +724,15 @@ def field_is_blacklisted(contents): if 'numeric' in sw_format: arg_choices = [] # too many choices to list them here else: - log.debug('unexpected switch {0} in "{1}"'.format(switch, contents)) + log.debug('unexpected switch {0} in "{1}"' + .format(switch, contents)) return False # if nothing went wrong sofar, the contents seems to match the blacklist return True -def process_xlsx(filepath, filed_filter_mode=None): +def process_xlsx(filepath): """ process an OOXML excel file (e.g. .xlsx or .xlsb or .xlsm) """ dde_links = [] parser = ooxml.XmlParser(filepath) @@ -733,7 +752,8 @@ def process_xlsx(filepath, filed_filter_mode=None): try: logging.info('Parsing non-xml subfile {0} with content type {1}' .format(subfile, content_type)) - for record in xls_parser.parse_xlsb_part(handle, content_type, subfile): + for record in xls_parser.parse_xlsb_part(handle, content_type, + subfile): logging.debug('{0}: {1}'.format(subfile, record)) if isinstance(record, xls_parser.XlsbBeginSupBook) and \ record.link_type == \ @@ -791,8 +811,10 @@ class RtfFieldParser(rtfobj.RtfParser): RTF_START = b'\x7b\x5c\x72\x74' # == b'{\rt' but does not mess up auto-indent + def process_rtf(file_handle, field_filter_mode=None): log.debug('process_rtf') + """ find dde links or other fields in rtf file """ all_fields = [] data = RTF_START + file_handle.read() # read complete file into memory! file_handle.close() @@ -828,7 +850,7 @@ def process_file(filepath, field_filter_mode=None): return process_doc(filepath) with open(filepath, 'rb') as file_handle: - if file_handle.read(4) == RTF_START: + if file_handle.read(4) == RTF_START: # This is a RTF file return process_rtf(file_handle, field_filter_mode) @@ -846,7 +868,7 @@ def process_file(filepath, field_filter_mode=None): return process_docx(filepath, field_filter_mode) -#=== MAIN ================================================================= +# === MAIN ================================================================= def main(cmd_line_args=None): """ Main function, called if this file is called as a script @@ -868,10 +890,10 @@ def main(cmd_line_args=None): if args.json and args.loglevel.lower() == 'debug': log.warning('Debug log output will not be json-compatible!') - if args.nounquote : + if args.nounquote: global NO_QUOTES NO_QUOTES = True - + if args.json: jout = [] jout.append(BANNER_JSON) @@ -890,7 +912,7 @@ def main(cmd_line_args=None): except Exception as exc: if args.json: jout.append(dict(type='error', error=type(exc).__name__, - message=str(exc))) # strange: str(exc) is enclosed in "" + message=str(exc))) else: raise # re-raise last known exception, keeping trace intact -- libgit2 0.21.4