Commit 2f7a1ef1b347a1124d01ef6559d939be5ccf50cd
Committed by
GitHub
Merge pull request #365 from christian-intra2net/encoding-for-non-unicode-environments
Encoding for non unicode environments
Showing
9 changed files
with
484 additions
and
66 deletions
oletools/common/io_encoding.py
0 → 100644
| 1 | +#!/usr/bin/env python3 | ||
| 2 | + | ||
| 3 | +""" | ||
| 4 | +Tool to help with input/output encoding | ||
| 5 | + | ||
| 6 | +Helpers to run smoothly in unicode-unfriendly environments like output redirect | ||
| 7 | +or unusual language settings. | ||
| 8 | + | ||
| 9 | +In such settings, output to console falls back to ASCII-only. Also open() | ||
| 10 | +suddenly fails to interpret non-ASCII characters. | ||
| 11 | + | ||
| 12 | +Therefore, scripts should call :py:meth:`ensure_stdout_handles_unicode` at | ||
| 13 | +startup and open text files with :py:meth:`uopen` instead of :py:meth:`open`. | ||
| 14 | + | ||
| 15 | +Part of the python-oletools package: | ||
| 16 | +http://www.decalage.info/python/oletools | ||
| 17 | +""" | ||
| 18 | + | ||
| 19 | +# === LICENSE ================================================================= | ||
| 20 | + | ||
| 21 | +# msodde is copyright (c) 2017-2018 Philippe Lagadec (http://www.decalage.info) | ||
| 22 | +# All rights reserved. | ||
| 23 | +# | ||
| 24 | +# Redistribution and use in source and binary forms, with or without | ||
| 25 | +# modification, are permitted provided that the following conditions are met: | ||
| 26 | +# | ||
| 27 | +# * Redistributions of source code must retain the above copyright notice, | ||
| 28 | +# this list of conditions and the following disclaimer. | ||
| 29 | +# * Redistributions in binary form must reproduce the above copyright notice, | ||
| 30 | +# this list of conditions and the following disclaimer in the documentation | ||
| 31 | +# and/or other materials provided with the distribution. | ||
| 32 | +# | ||
| 33 | +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
| 34 | +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 35 | +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 36 | +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE | ||
| 37 | +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | ||
| 38 | +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | ||
| 39 | +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | ||
| 40 | +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | ||
| 41 | +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 42 | +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | ||
| 43 | +# POSSIBILITY OF SUCH DAMAGE. | ||
| 44 | + | ||
| 45 | +# ----------------------------------------------------------------------------- | ||
| 46 | +# CHANGELOG: | ||
| 47 | +# 2018-11-04 v0.54 CH: - first version: ensure_stdout_handles_unicode, uopen | ||
| 48 | + | ||
| 49 | +# -- IMPORTS ------------------------------------------------------------------ | ||
| 50 | +from __future__ import print_function | ||
| 51 | +import sys | ||
| 52 | +import codecs | ||
| 53 | +import os | ||
| 54 | +from locale import getpreferredencoding | ||
| 55 | + | ||
| 56 | +PY3 = sys.version_info.major >= 3 | ||
| 57 | + | ||
| 58 | +if PY3: | ||
| 59 | + from builtins import open as builtin_open | ||
| 60 | +else: | ||
| 61 | + from __builtin__ import open as builtin_open | ||
| 62 | + | ||
| 63 | +# -- CONSTANTS ---------------------------------------------------------------- | ||
| 64 | +#: encoding to use for redirection if no good encoding can be found | ||
| 65 | +FALLBACK_ENCODING_REDIRECT = 'utf8' | ||
| 66 | + | ||
| 67 | +#: encoding for reading text from files if preferred encoding is non-unicode | ||
| 68 | +FALLBACK_ENCODING_OPEN = 'utf8' | ||
| 69 | + | ||
| 70 | +#: print (pure-ascii) debug output to stdout | ||
| 71 | +DEBUG = False | ||
| 72 | + | ||
| 73 | +# the encoding specified in system environment | ||
| 74 | +try: | ||
| 75 | + PREFERRED_ENCODING = getpreferredencoding() | ||
| 76 | +except Exception as exc: | ||
| 77 | + if DEBUG: | ||
| 78 | + print('Exception getting preferred encoding: {}'.format(exc)) | ||
| 79 | + PREFERRED_ENCODING = None | ||
| 80 | + | ||
| 81 | + | ||
| 82 | +# -- HELPERS ------------------------------------------------------------------ | ||
| 83 | + | ||
| 84 | + | ||
| 85 | +def ensure_stdout_handles_unicode(): | ||
| 86 | + """ | ||
| 87 | + Ensure that print()ing unicode does not lead to errors. | ||
| 88 | + | ||
| 89 | + When print()ing unicode, python relies on the environment (e.g. in linux on | ||
| 90 | + the setting of the LANG environment variable) to tell it how to encode | ||
| 91 | + unicode. That works nicely for modern-day shells where encoding is usually | ||
| 92 | + UTF-8. But as soon as LANG is unset or just "C", or output is redirected or | ||
| 93 | + piped, the encoding falls back to 'ASCII', which cannot handle unicode | ||
| 94 | + characters. | ||
| 95 | + | ||
| 96 | + Based on solutions suggested on stackoverflow (c.f. | ||
| 97 | + https://stackoverflow.com/q/27347772/4405656 ), wrap stdout in an encoder | ||
| 98 | + that solves that problem. | ||
| 99 | + | ||
| 100 | + Unfortunately, stderr cannot be handled the same way ( see e.g. https:// | ||
| 101 | + pythonhosted.org/kitchen/unicode-frustrations.html#frustration-5-exceptions | ||
| 102 | + ), so we still have to hope there is only ascii in error messages | ||
| 103 | + """ | ||
| 104 | + # do not re-wrap | ||
| 105 | + if isinstance(sys.stdout, codecs.StreamWriter): | ||
| 106 | + if DEBUG: | ||
| 107 | + print('sys.stdout wrapped already') | ||
| 108 | + return | ||
| 109 | + | ||
| 110 | + # get output stream object | ||
| 111 | + if PY3: | ||
| 112 | + output_stream = sys.stdout.buffer | ||
| 113 | + else: | ||
| 114 | + output_stream = sys.stdout | ||
| 115 | + | ||
| 116 | + # determine encoding of sys.stdout | ||
| 117 | + try: | ||
| 118 | + encoding = sys.stdout.encoding | ||
| 119 | + except AttributeError: # variable "encoding" might not exist | ||
| 120 | + encoding = None | ||
| 121 | + if DEBUG: | ||
| 122 | + print('sys.stdout encoding is {}'.format(encoding)) | ||
| 123 | + | ||
| 124 | + if isinstance(encoding, str) and encoding.lower().startswith('utf'): | ||
| 125 | + if DEBUG: | ||
| 126 | + print('encoding is acceptable') | ||
| 127 | + return # everything alright, we are working in a good environment | ||
| 128 | + elif os.isatty(output_stream.fileno()): # e.g. C locale | ||
| 129 | + # Do not output UTF8 since that might be mis-interpreted. | ||
| 130 | + # Just replace chars that cannot be handled | ||
| 131 | + print('Encoding for stdout is only {}, will replace other chars to ' | ||
| 132 | + 'avoid unicode error'.format(encoding), file=sys.stderr) | ||
| 133 | + sys.stdout = codecs.getwriter(encoding)(output_stream, errors='replace') | ||
| 134 | + else: # e.g. redirection, pipe in python2 | ||
| 135 | + new_encoding = PREFERRED_ENCODING | ||
| 136 | + if DEBUG: | ||
| 137 | + print('not a tty, try preferred encoding {}'.format(new_encoding)) | ||
| 138 | + if not isinstance(new_encoding, str) \ | ||
| 139 | + or not new_encoding.lower().startswith('utf'): | ||
| 140 | + new_encoding = FALLBACK_ENCODING_REDIRECT | ||
| 141 | + if DEBUG: | ||
| 142 | + print('preferred encoding also unacceptable, fall back to {}' | ||
| 143 | + .format(new_encoding)) | ||
| 144 | + print('Encoding for stdout is only {}, will auto-encode text with {} ' | ||
| 145 | + 'before output'.format(encoding, new_encoding), file=sys.stderr) | ||
| 146 | + sys.stdout = codecs.getwriter(new_encoding)(output_stream) | ||
| 147 | + | ||
| 148 | + | ||
| 149 | +def uopen(filename, mode='r', *args, **kwargs): | ||
| 150 | + """ | ||
| 151 | + Replacement for builtin open() that reads unicode even in ASCII environment | ||
| 152 | + | ||
| 153 | + In order to read unicode from text, python uses locale.getpreferredencoding | ||
| 154 | + to translate bytes to str. If the environment only provides ASCII encoding, | ||
| 155 | + this will fail since most office files contain unicode. | ||
| 156 | + | ||
| 157 | + Therefore, guess a good encoding here if necessary and open file with that. | ||
| 158 | + | ||
| 159 | + :returns: same type as the builtin :py:func:`open` | ||
| 160 | + """ | ||
| 161 | + # do not interfere if not necessary: | ||
| 162 | + if 'b' in mode: | ||
| 163 | + if DEBUG: | ||
| 164 | + print('Opening binary file, do not interfere') | ||
| 165 | + return builtin_open(filename, mode, *args, **kwargs) | ||
| 166 | + if 'encoding' in kwargs: | ||
| 167 | + if DEBUG: | ||
| 168 | + print('Opening file with encoding {!r}, do not interfere' | ||
| 169 | + .format(kwargs['encoding'])) | ||
| 170 | + return builtin_open(filename, mode, *args, **kwargs) | ||
| 171 | + if len(args) > 3: # NOTE(review): "encoding" is open()'s 4th arg, i.e. args[1] here (args[3] is "newline") | ||
| 172 | + if DEBUG: | ||
| 173 | + print('Opening file with encoding {!r}, do not interfere' | ||
| 174 | + .format(args[3])) | ||
| 175 | + return builtin_open(filename, mode, *args, **kwargs) | ||
| 176 | + | ||
| 177 | + # determine preferred encoding | ||
| 178 | + encoding = PREFERRED_ENCODING | ||
| 179 | + if DEBUG: | ||
| 180 | + print('preferred encoding is {}'.format(encoding)) | ||
| 181 | + | ||
| 182 | + if isinstance(encoding, str) and encoding.lower().startswith('utf'): | ||
| 183 | + if DEBUG: | ||
| 184 | + print('encoding is acceptable, open {} regularly'.format(filename)) | ||
| 185 | + return builtin_open(filename, mode, *args, **kwargs) | ||
| 186 | + | ||
| 187 | + # so we want to read text from a file but can probably only deal with ASCII | ||
| 188 | + # --> use fallback | ||
| 189 | + if DEBUG: | ||
| 190 | + print('Opening {} with fallback encoding {}' | ||
| 191 | + .format(filename, FALLBACK_ENCODING_OPEN)) | ||
| 192 | + if PY3: | ||
| 193 | + return builtin_open(filename, mode, *args, | ||
| 194 | + encoding=FALLBACK_ENCODING_OPEN, **kwargs) | ||
| 195 | + else: | ||
| 196 | + handle = builtin_open(filename, mode, *args, **kwargs) | ||
| 197 | + return codecs.EncodedFile(handle, FALLBACK_ENCODING_OPEN) |
oletools/common/log_helper/log_helper.py
| @@ -44,6 +44,7 @@ General logging helpers | @@ -44,6 +44,7 @@ General logging helpers | ||
| 44 | from ._json_formatter import JsonFormatter | 44 | from ._json_formatter import JsonFormatter |
| 45 | from ._logger_adapter import OletoolsLoggerAdapter | 45 | from ._logger_adapter import OletoolsLoggerAdapter |
| 46 | from . import _root_logger_wrapper | 46 | from . import _root_logger_wrapper |
| 47 | +from ..io_encoding import ensure_stdout_handles_unicode | ||
| 47 | import logging | 48 | import logging |
| 48 | import sys | 49 | import sys |
| 49 | 50 | ||
| @@ -92,6 +93,9 @@ class LogHelper: | @@ -92,6 +93,9 @@ class LogHelper: | ||
| 92 | if self._is_enabled: | 93 | if self._is_enabled: |
| 93 | raise ValueError('re-enabling logging. Not sure whether that is ok...') | 94 | raise ValueError('re-enabling logging. Not sure whether that is ok...') |
| 94 | 95 | ||
| 96 | + if stream in (None, sys.stdout): | ||
| 97 | + ensure_stdout_handles_unicode() | ||
| 98 | + | ||
| 95 | log_level = LOG_LEVELS[level] | 99 | log_level = LOG_LEVELS[level] |
| 96 | logging.basicConfig(level=log_level, format=log_format, stream=stream) | 100 | logging.basicConfig(level=log_level, format=log_format, stream=stream) |
| 97 | self._is_enabled = True | 101 | self._is_enabled = True |
oletools/msodde.py
| @@ -74,6 +74,7 @@ from oletools import xls_parser | @@ -74,6 +74,7 @@ from oletools import xls_parser | ||
| 74 | from oletools import rtfobj | 74 | from oletools import rtfobj |
| 75 | from oletools.ppt_record_parser import is_ppt | 75 | from oletools.ppt_record_parser import is_ppt |
| 76 | from oletools import crypto | 76 | from oletools import crypto |
| 77 | +from oletools.common.io_encoding import ensure_stdout_handles_unicode | ||
| 77 | from oletools.common.log_helper import log_helper | 78 | from oletools.common.log_helper import log_helper |
| 78 | 79 | ||
| 79 | # ----------------------------------------------------------------------------- | 80 | # ----------------------------------------------------------------------------- |
| @@ -236,57 +237,6 @@ DEFAULT_LOG_LEVEL = "warning" # Default log level | @@ -236,57 +237,6 @@ DEFAULT_LOG_LEVEL = "warning" # Default log level | ||
| 236 | logger = log_helper.get_or_create_silent_logger('msodde') | 237 | logger = log_helper.get_or_create_silent_logger('msodde') |
| 237 | 238 | ||
| 238 | 239 | ||
| 239 | -# === UNICODE IN PY2 ========================================================= | ||
| 240 | - | ||
| 241 | -def ensure_stdout_handles_unicode(): | ||
| 242 | - """ Ensure stdout can handle unicode by wrapping it if necessary | ||
| 243 | - | ||
| 244 | - Required e.g. if output of this script is piped or redirected in a linux | ||
| 245 | - shell, since then sys.stdout.encoding is ascii and cannot handle | ||
| 246 | - print(unicode). In that case we need to find some compatible encoding and | ||
| 247 | - wrap sys.stdout into a encoder following (many thanks!) | ||
| 248 | - https://stackoverflow.com/a/1819009 or https://stackoverflow.com/a/20447935 | ||
| 249 | - | ||
| 250 | - Can be undone by setting sys.stdout = sys.__stdout__ | ||
| 251 | - """ | ||
| 252 | - import codecs | ||
| 253 | - import locale | ||
| 254 | - | ||
| 255 | - # do not re-wrap | ||
| 256 | - if isinstance(sys.stdout, codecs.StreamWriter): | ||
| 257 | - return | ||
| 258 | - | ||
| 259 | - # try to find encoding for sys.stdout | ||
| 260 | - encoding = None | ||
| 261 | - try: | ||
| 262 | - encoding = sys.stdout.encoding | ||
| 263 | - except AttributeError: # variable "encoding" might not exist | ||
| 264 | - pass | ||
| 265 | - | ||
| 266 | - if encoding not in (None, '', 'ascii'): | ||
| 267 | - return # no need to wrap | ||
| 268 | - | ||
| 269 | - # try to find an encoding that can handle unicode | ||
| 270 | - try: | ||
| 271 | - encoding = locale.getpreferredencoding() | ||
| 272 | - except Exception: | ||
| 273 | - pass | ||
| 274 | - | ||
| 275 | - # fallback if still no encoding available | ||
| 276 | - if encoding in (None, '', 'ascii'): | ||
| 277 | - encoding = 'utf8' | ||
| 278 | - | ||
| 279 | - # logging is probably not initialized yet, but just in case | ||
| 280 | - logger.debug('wrapping sys.stdout with encoder using {0}'.format(encoding)) | ||
| 281 | - | ||
| 282 | - wrapper = codecs.getwriter(encoding) | ||
| 283 | - sys.stdout = wrapper(sys.stdout) | ||
| 284 | - | ||
| 285 | - | ||
| 286 | -if sys.version_info.major < 3: | ||
| 287 | - ensure_stdout_handles_unicode() # e.g. for print(text) in main() | ||
| 288 | - | ||
| 289 | - | ||
| 290 | # === ARGUMENT PARSING ======================================================= | 240 | # === ARGUMENT PARSING ======================================================= |
| 291 | 241 | ||
| 292 | class ArgParserWithBanner(argparse.ArgumentParser): | 242 | class ArgParserWithBanner(argparse.ArgumentParser): |
| @@ -820,10 +770,15 @@ def process_csv(filepath): | @@ -820,10 +770,15 @@ def process_csv(filepath): | ||
| 820 | chars the same way that excel does. Tested to some extent in unittests. | 770 | chars the same way that excel does. Tested to some extent in unittests. |
| 821 | 771 | ||
| 822 | This can only find DDE-links, no other "suspicious" constructs (yet). | 772 | This can only find DDE-links, no other "suspicious" constructs (yet). |
| 823 | - """ | ||
| 824 | 773 | ||
| 774 | + Cannot deal with unicode files yet (that needs more than just using uopen()). | ||
| 775 | + """ | ||
| 825 | results = [] | 776 | results = [] |
| 826 | - with open(filepath, 'r') as file_handle: | 777 | + if sys.version_info.major <= 2: |
| 778 | + open_arg = dict(mode='rb') | ||
| 779 | + else: | ||
| 780 | + open_arg = dict(newline='') | ||
| 781 | + with open(filepath, **open_arg) as file_handle: | ||
| 827 | results, dialect = process_csv_dialect(file_handle, CSV_DELIMITERS) | 782 | results, dialect = process_csv_dialect(file_handle, CSV_DELIMITERS) |
| 828 | is_small = file_handle.tell() < CSV_SMALL_THRESH | 783 | is_small = file_handle.tell() < CSV_SMALL_THRESH |
| 829 | 784 | ||
| @@ -854,7 +809,6 @@ def process_csv(filepath): | @@ -854,7 +809,6 @@ def process_csv(filepath): | ||
| 854 | 809 | ||
| 855 | def process_csv_dialect(file_handle, delimiters): | 810 | def process_csv_dialect(file_handle, delimiters): |
| 856 | """ helper for process_csv: process with a specific csv dialect """ | 811 | """ helper for process_csv: process with a specific csv dialect """ |
| 857 | - | ||
| 858 | # determine dialect = delimiter chars, quote chars, ... | 812 | # determine dialect = delimiter chars, quote chars, ... |
| 859 | dialect = csv.Sniffer().sniff(file_handle.read(CSV_SMALL_THRESH), | 813 | dialect = csv.Sniffer().sniff(file_handle.read(CSV_SMALL_THRESH), |
| 860 | delimiters=delimiters) | 814 | delimiters=delimiters) |
oletools/olemeta.py
| @@ -79,6 +79,7 @@ if not _parent_dir in sys.path: | @@ -79,6 +79,7 @@ if not _parent_dir in sys.path: | ||
| 79 | import olefile | 79 | import olefile |
| 80 | from oletools.thirdparty import xglob | 80 | from oletools.thirdparty import xglob |
| 81 | from oletools.thirdparty.tablestream import tablestream | 81 | from oletools.thirdparty.tablestream import tablestream |
| 82 | +from oletools.common.io_encoding import ensure_stdout_handles_unicode | ||
| 82 | 83 | ||
| 83 | 84 | ||
| 84 | #=== MAIN ================================================================= | 85 | #=== MAIN ================================================================= |
| @@ -88,13 +89,12 @@ def process_ole(ole): | @@ -88,13 +89,12 @@ def process_ole(ole): | ||
| 88 | meta = ole.get_metadata() | 89 | meta = ole.get_metadata() |
| 89 | 90 | ||
| 90 | # console output with UTF8 encoding: | 91 | # console output with UTF8 encoding: |
| 91 | - # It looks like we do not need the UTF8 codec anymore, both for Python 2 and 3 | ||
| 92 | - console_utf8 = sys.stdout #codecs.getwriter('utf8')(sys.stdout) | 92 | + ensure_stdout_handles_unicode() |
| 93 | 93 | ||
| 94 | # TODO: move similar code to a function | 94 | # TODO: move similar code to a function |
| 95 | 95 | ||
| 96 | print('Properties from the SummaryInformation stream:') | 96 | print('Properties from the SummaryInformation stream:') |
| 97 | - t = tablestream.TableStream([21, 30], header_row=['Property', 'Value'], outfile=console_utf8) | 97 | + t = tablestream.TableStream([21, 30], header_row=['Property', 'Value']) |
| 98 | for prop in meta.SUMMARY_ATTRIBS: | 98 | for prop in meta.SUMMARY_ATTRIBS: |
| 99 | value = getattr(meta, prop) | 99 | value = getattr(meta, prop) |
| 100 | if value is not None: | 100 | if value is not None: |
| @@ -111,7 +111,7 @@ def process_ole(ole): | @@ -111,7 +111,7 @@ def process_ole(ole): | ||
| 111 | print('') | 111 | print('') |
| 112 | 112 | ||
| 113 | print('Properties from the DocumentSummaryInformation stream:') | 113 | print('Properties from the DocumentSummaryInformation stream:') |
| 114 | - t = tablestream.TableStream([21, 30], header_row=['Property', 'Value'], outfile=console_utf8) | 114 | + t = tablestream.TableStream([21, 30], header_row=['Property', 'Value']) |
| 115 | for prop in meta.DOCSUM_ATTRIBS: | 115 | for prop in meta.DOCSUM_ATTRIBS: |
| 116 | value = getattr(meta, prop) | 116 | value = getattr(meta, prop) |
| 117 | if value is not None: | 117 | if value is not None: |
oletools/oleobj.py
| @@ -73,6 +73,7 @@ except ImportError: | @@ -73,6 +73,7 @@ except ImportError: | ||
| 73 | from oletools.ppt_record_parser import (is_ppt, PptFile, | 73 | from oletools.ppt_record_parser import (is_ppt, PptFile, |
| 74 | PptRecordExOleVbaActiveXAtom) | 74 | PptRecordExOleVbaActiveXAtom) |
| 75 | from oletools.ooxml import XmlParser | 75 | from oletools.ooxml import XmlParser |
| 76 | +from oletools.common.io_encoding import ensure_stdout_handles_unicode | ||
| 76 | 77 | ||
| 77 | # ----------------------------------------------------------------------------- | 78 | # ----------------------------------------------------------------------------- |
| 78 | # CHANGELOG: | 79 | # CHANGELOG: |
| @@ -848,6 +849,7 @@ def main(cmd_line_args=None): | @@ -848,6 +849,7 @@ def main(cmd_line_args=None): | ||
| 848 | provide other arguments. | 849 | provide other arguments. |
| 849 | """ | 850 | """ |
| 850 | # print banner with version | 851 | # print banner with version |
| 852 | + ensure_stdout_handles_unicode() | ||
| 851 | print('oleobj %s - http://decalage.info/oletools' % __version__) | 853 | print('oleobj %s - http://decalage.info/oletools' % __version__) |
| 852 | print('THIS IS WORK IN PROGRESS - Check updates regularly!') | 854 | print('THIS IS WORK IN PROGRESS - Check updates regularly!') |
| 853 | print('Please report any issue at ' | 855 | print('Please report any issue at ' |
oletools/olevba.py
| @@ -318,6 +318,7 @@ from oletools import ppt_parser | @@ -318,6 +318,7 @@ from oletools import ppt_parser | ||
| 318 | from oletools import oleform | 318 | from oletools import oleform |
| 319 | from oletools import rtfobj | 319 | from oletools import rtfobj |
| 320 | from oletools import crypto | 320 | from oletools import crypto |
| 321 | +from oletools.common.io_encoding import ensure_stdout_handles_unicode | ||
| 321 | from oletools.common import codepages | 322 | from oletools.common import codepages |
| 322 | 323 | ||
| 323 | # monkeypatch email to fix issue #32: | 324 | # monkeypatch email to fix issue #32: |
| @@ -4122,7 +4123,6 @@ def main(cmd_line_args=None): | @@ -4122,7 +4123,6 @@ def main(cmd_line_args=None): | ||
| 4122 | in process_args. Per default (cmd_line_args=None), sys.argv is used. Option | 4123 | in process_args. Per default (cmd_line_args=None), sys.argv is used. Option |
| 4123 | mainly added for unit-testing | 4124 | mainly added for unit-testing |
| 4124 | """ | 4125 | """ |
| 4125 | - | ||
| 4126 | options, args = parse_args(cmd_line_args) | 4126 | options, args = parse_args(cmd_line_args) |
| 4127 | 4127 | ||
| 4128 | # provide info about tool and its version | 4128 | # provide info about tool and its version |
oletools/ooxml.py
| @@ -13,14 +13,47 @@ TODO: check what is duplicate here with oleid, maybe merge some day? | @@ -13,14 +13,47 @@ TODO: check what is duplicate here with oleid, maybe merge some day? | ||
| 13 | TODO: "xml2003" == "flatopc"? | 13 | TODO: "xml2003" == "flatopc"? |
| 14 | 14 | ||
| 15 | .. codeauthor:: Intra2net AG <info@intra2net> | 15 | .. codeauthor:: Intra2net AG <info@intra2net> |
| 16 | +License: BSD, see source code or documentation | ||
| 17 | + | ||
| 18 | +ooxml is part of the python-oletools package: | ||
| 19 | +http://www.decalage.info/python/oletools | ||
| 16 | """ | 20 | """ |
| 17 | 21 | ||
| 22 | +# === LICENSE ================================================================= | ||
| 23 | + | ||
| 24 | +# msodde is copyright (c) 2017-2019 Philippe Lagadec (http://www.decalage.info) | ||
| 25 | +# All rights reserved. | ||
| 26 | +# | ||
| 27 | +# Redistribution and use in source and binary forms, with or without | ||
| 28 | +# modification, are permitted provided that the following conditions are met: | ||
| 29 | +# | ||
| 30 | +# * Redistributions of source code must retain the above copyright notice, | ||
| 31 | +# this list of conditions and the following disclaimer. | ||
| 32 | +# * Redistributions in binary form must reproduce the above copyright notice, | ||
| 33 | +# this list of conditions and the following disclaimer in the documentation | ||
| 34 | +# and/or other materials provided with the distribution. | ||
| 35 | +# | ||
| 36 | +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
| 37 | +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| 38 | +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
| 39 | +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE | ||
| 40 | +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | ||
| 41 | +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | ||
| 42 | +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | ||
| 43 | +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | ||
| 44 | +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
| 45 | +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | ||
| 46 | +# POSSIBILITY OF SUCH DAMAGE. | ||
| 47 | + | ||
| 48 | +# -- IMPORTS ------------------------------------------------------------------ | ||
| 49 | + | ||
| 18 | import sys | 50 | import sys |
| 51 | +from oletools.common.log_helper import log_helper | ||
| 52 | +from oletools.common.io_encoding import uopen | ||
| 19 | from zipfile import ZipFile, BadZipfile, is_zipfile | 53 | from zipfile import ZipFile, BadZipfile, is_zipfile |
| 20 | from os.path import splitext | 54 | from os.path import splitext |
| 21 | import io | 55 | import io |
| 22 | import re | 56 | import re |
| 23 | -from oletools.common.log_helper import log_helper | ||
| 24 | 57 | ||
| 25 | # import lxml or ElementTree for XML parsing: | 58 | # import lxml or ElementTree for XML parsing: |
| 26 | try: | 59 | try: |
| @@ -29,6 +62,18 @@ try: | @@ -29,6 +62,18 @@ try: | ||
| 29 | except ImportError: | 62 | except ImportError: |
| 30 | import xml.etree.cElementTree as ET | 63 | import xml.etree.cElementTree as ET |
| 31 | 64 | ||
| 65 | +# ----------------------------------------------------------------------------- | ||
| 66 | +# CHANGELOG: | ||
| 67 | +# 2018-12-06 CH: - ensure stdout can handle unicode | ||
| 68 | + | ||
| 69 | +__version__ = '0.54.2' | ||
| 70 | + | ||
| 71 | + | ||
| 72 | +############################################################################### | ||
| 73 | +# CONSTANTS | ||
| 74 | +############################################################################### | ||
| 75 | + | ||
| 76 | + | ||
| 32 | logger = log_helper.get_or_create_silent_logger('ooxml') | 77 | logger = log_helper.get_or_create_silent_logger('ooxml') |
| 33 | 78 | ||
| 34 | #: subfiles that have to be part of every ooxml file | 79 | #: subfiles that have to be part of every ooxml file |
| @@ -127,7 +172,7 @@ def get_type(filename): | @@ -127,7 +172,7 @@ def get_type(filename): | ||
| 127 | parser = XmlParser(filename) | 172 | parser = XmlParser(filename) |
| 128 | if parser.is_single_xml(): | 173 | if parser.is_single_xml(): |
| 129 | match = None | 174 | match = None |
| 130 | - with open(filename, 'r') as handle: | 175 | + with uopen(filename, 'r') as handle: |
| 131 | match = re.search(OFFICE_XML_PROGID_REGEX, handle.read(1024)) | 176 | match = re.search(OFFICE_XML_PROGID_REGEX, handle.read(1024)) |
| 132 | if not match: | 177 | if not match: |
| 133 | return DOCTYPE_NONE | 178 | return DOCTYPE_NONE |
| @@ -416,7 +461,7 @@ class XmlParser(object): | @@ -416,7 +461,7 @@ class XmlParser(object): | ||
| 416 | 461 | ||
| 417 | # find prog id in xml prolog | 462 | # find prog id in xml prolog |
| 418 | match = None | 463 | match = None |
| 419 | - with open(self.filename, 'r') as handle: | 464 | + with uopen(self.filename, 'r') as handle: |
| 420 | match = re.search(OFFICE_XML_PROGID_REGEX, handle.read(1024)) | 465 | match = re.search(OFFICE_XML_PROGID_REGEX, handle.read(1024)) |
| 421 | if match: | 466 | if match: |
| 422 | self._is_single_xml = True | 467 | self._is_single_xml = True |
| @@ -424,11 +469,18 @@ class XmlParser(object): | @@ -424,11 +469,18 @@ class XmlParser(object): | ||
| 424 | raise BadOOXML(self.filename, 'is no zip and has no prog_id') | 469 | raise BadOOXML(self.filename, 'is no zip and has no prog_id') |
| 425 | 470 | ||
| 426 | def iter_files(self, args=None): | 471 | def iter_files(self, args=None): |
| 427 | - """ Find files in zip or just give single xml file """ | 472 | + """ |
| 473 | + Find files in zip or just give single xml file | ||
| 474 | + | ||
| 475 | + yields pairs (subfile-name, file-handle) where file-handle is an open | ||
| 476 | + file-like object. (Do not care too much about encoding here, the xml | ||
| 477 | + parser reads the encoding from the first lines in the file.) | ||
| 478 | + """ | ||
| 428 | if self.is_single_xml(): | 479 | if self.is_single_xml(): |
| 429 | if args: | 480 | if args: |
| 430 | raise BadOOXML(self.filename, 'xml has no subfiles') | 481 | raise BadOOXML(self.filename, 'xml has no subfiles') |
| 431 | - with open(self.filename, 'r') as handle: | 482 | + # do not use uopen, xml parser determines encoding on its own |
| 483 | + with open(self.filename, 'rb') as handle: | ||
| 432 | yield None, handle # the subfile=None is needed in iter_xml | 484 | yield None, handle # the subfile=None is needed in iter_xml |
| 433 | self.did_iter_all = True | 485 | self.did_iter_all = True |
| 434 | else: | 486 | else: |
| @@ -638,9 +690,10 @@ class XmlParser(object): | @@ -638,9 +690,10 @@ class XmlParser(object): | ||
| 638 | 690 | ||
| 639 | 691 | ||
| 640 | def test(): | 692 | def test(): |
| 641 | - """ Main function, called when running file as script | 693 | + """ |
| 694 | + Test xml parsing; called when running this file as a script. | ||
| 642 | 695 | ||
| 643 | - see module doc for more info | 696 | + Prints every element found in input file (to be given as command line arg). |
| 644 | """ | 697 | """ |
| 645 | log_helper.enable_logging(False, 'debug') | 698 | log_helper.enable_logging(False, 'debug') |
| 646 | if len(sys.argv) != 2: | 699 | if len(sys.argv) != 2: |
tests/common/test_encoding_handler.py
0 → 100644
| 1 | +"""Test common.ensure_stdout_handles_unicode""" | ||
| 2 | + | ||
| 3 | +from __future__ import print_function | ||
| 4 | + | ||
| 5 | +import unittest | ||
| 6 | +import sys | ||
| 7 | +from subprocess import check_call, CalledProcessError | ||
| 8 | +from tempfile import mkstemp | ||
| 9 | +import os | ||
| 10 | +from os.path import isfile | ||
| 11 | +from contextlib import contextmanager | ||
| 12 | + | ||
| 13 | +FILE_TEXT = u'The unicode check mark is \u2713.\n' | ||
| 14 | + | ||
| 15 | +@contextmanager | ||
| 16 | +def temp_file(just_name=True): | ||
| 17 | + """Context manager that creates temp file and deletes it in the end""" | ||
| 18 | + tmp_descriptor = None | ||
| 19 | + tmp_name = None | ||
| 20 | + tmp_handle = None | ||
| 21 | + try: | ||
| 22 | + tmp_descriptor, tmp_name = mkstemp() | ||
| 23 | + | ||
| 24 | + # we create our own file handle since we want to be able to close the | ||
| 25 | + # file and open it again for reading. | ||
| 26 | + # We keep the os-level descriptor open so file name is still reserved | ||
| 27 | + # for us | ||
| 28 | + if just_name: | ||
| 29 | + yield tmp_name | ||
| 30 | + else: | ||
| 31 | + tmp_handle = open(tmp_name, 'wb') | ||
| 32 | + yield tmp_handle, tmp_name | ||
| 33 | + except Exception: | ||
| 34 | + raise | ||
| 35 | + finally: | ||
| 36 | + if tmp_descriptor is not None: | ||
| 37 | + os.close(tmp_descriptor) | ||
| 38 | + if tmp_handle is not None: | ||
| 39 | + tmp_handle.close() | ||
| 40 | + if tmp_name is not None and isfile(tmp_name): | ||
| 41 | + os.unlink(tmp_name) | ||
| 42 | + | ||
| 43 | + | ||
| 44 | +class TestEncodingHandler(unittest.TestCase): | ||
| 45 | + """Tests replacing stdout encoding in various scenarios""" | ||
| 46 | + | ||
| 47 | + def test_print(self): | ||
| 48 | + """Test that regular unicode output does not raise an error""" | ||
| 49 | + check_call('{python} {this_file} print'.format(python=sys.executable, | ||
| 50 | + this_file=__file__), | ||
| 51 | + shell=True) | ||
| 52 | + | ||
| 53 | + def test_print_redirect(self): | ||
| 54 | + """ | ||
| 55 | + Test redirection of unicode output to files does not raise error | ||
| 56 | + | ||
| 57 | + TODO: test this on non-linux OSs | ||
| 58 | + """ | ||
| 59 | + with temp_file() as tmp_file: | ||
| 60 | + check_call('{python} {this_file} print > {tmp_file}' | ||
| 61 | + .format(python=sys.executable, this_file=__file__, | ||
| 62 | + tmp_file=tmp_file), | ||
| 63 | + shell=True) | ||
| 64 | + | ||
| 65 | + @unittest.skipIf(not sys.platform.startswith('linux'), | ||
| 66 | + 'Only tested on linux sofar') | ||
| 67 | + def test_print_no_lang(self): | ||
| 68 | + """ | ||
| 69 | + Test redirection of unicode output to files does not raise error | ||
| 70 | + | ||
| 71 | + TODO: Adapt this for other OSs; for win create batch script | ||
| 72 | + """ | ||
| 73 | + check_call('LANG=C {python} {this_file} print' | ||
| 74 | + .format(python=sys.executable, this_file=__file__), | ||
| 75 | + shell=True) | ||
| 76 | + | ||
| 77 | + def test_uopen(self): | ||
| 78 | + """Test that uopen in a nice environment is ok""" | ||
| 79 | + with temp_file(False) as (tmp_handle, tmp_file): | ||
| 80 | + tmp_handle.write(FILE_TEXT.encode('utf8')) | ||
| 81 | + tmp_handle.close() | ||
| 82 | + | ||
| 83 | + try: | ||
| 84 | + check_call('{python} {this_file} read {tmp_file}' | ||
| 85 | + .format(python=sys.executable, this_file=__file__, | ||
| 86 | + tmp_file=tmp_file), | ||
| 87 | + shell=True) | ||
| 88 | + except CalledProcessError as cpe: | ||
| 89 | + self.fail(cpe.output) | ||
| 90 | + | ||
| 91 | + def test_uopen_redirect(self): | ||
| 92 | + """ | ||
| 93 | + Test redirection of unicode output to files does not raise error | ||
| 94 | + | ||
| 95 | + TODO: test this on non-linux OSs | ||
| 96 | + """ | ||
| 97 | + with temp_file(False) as (tmp_handle, tmp_file): | ||
| 98 | + tmp_handle.write(FILE_TEXT.encode('utf8')) | ||
| 99 | + tmp_handle.close() | ||
| 100 | + | ||
| 101 | + with temp_file() as redirect_file: | ||
| 102 | + try: | ||
| 103 | + check_call( | ||
| 104 | + '{python} {this_file} read {tmp_file} >{redirect_file}' | ||
| 105 | + .format(python=sys.executable, this_file=__file__, | ||
| 106 | + tmp_file=tmp_file, redirect_file=redirect_file), | ||
| 107 | + shell=True) | ||
| 108 | + except CalledProcessError as cpe: | ||
| 109 | + self.fail(cpe.output) | ||
| 110 | + | ||
| 111 | + @unittest.skipIf(not sys.platform.startswith('linux'), | ||
| 112 | + 'Only tested on linux sofar') | ||
| 113 | + def test_uopen_no_lang(self): | ||
| 114 | + """ | ||
| 115 | + Test that uopen in a C-LANG environment is ok | ||
| 116 | + | ||
| 117 | + TODO: Adapt this for other OSs; for win create batch script | ||
| 118 | + """ | ||
| 119 | + with temp_file(False) as (tmp_handle, tmp_file): | ||
| 120 | + tmp_handle.write(FILE_TEXT.encode('utf8')) | ||
| 121 | + tmp_handle.close() | ||
| 122 | + | ||
| 123 | + try: | ||
| 124 | + check_call('LANG=C {python} {this_file} read {tmp_file}' | ||
| 125 | + .format(python=sys.executable, this_file=__file__, | ||
| 126 | + tmp_file=tmp_file), | ||
| 127 | + shell=True) | ||
| 128 | + except CalledProcessError as cpe: | ||
| 129 | + self.fail(cpe.output) | ||
| 130 | + | ||
| 131 | + | ||
def run_read(filename):
    """
    Script part of the test_uopen* tests: read a file through uopen, verify

    Checks a few properties of the handle returned by uopen, both inside and
    after the `with` block, and compares the text read with `FILE_TEXT`.

    :param str filename: name of the file to read
    :returns: 0 if all checks passed
    :raises ValueError: if any of the checks fails
    """
    from oletools.common.io_encoding import uopen

    # python2 hands out the encoded byte string, python3 real unicode
    if sys.version_info[0] < 3:
        expected = FILE_TEXT.encode('utf8')
    else:
        expected = FILE_TEXT

    with uopen(filename, 'rt') as handle:
        # state of the handle right after opening
        if handle.closed:
            raise ValueError('handle is closed!')
        if handle.name != filename:
            raise ValueError('Wrong filename {}'.format(handle.name))
        if handle.isatty():
            raise ValueError('Reader is a tty!')
        if handle.tell() != 0:
            raise ValueError('Reader.tell is not 0 at beginning')

        contents = handle.read()

    # state of the handle after the context has been left
    if not handle.closed:
        raise ValueError('Reader is not closed outside context')
    if handle.name != filename:
        raise ValueError('Wrong filename {} after context'
                         .format(handle.name))
    if handle.isatty():
        raise ValueError('Reader has become a tty!')

    # finally the contents themselves
    if contents != expected:
        raise ValueError('Wrong contents: {!r} != {!r}'
                         .format(contents, expected))
    return 0
| 167 | + | ||
| 168 | + | ||
def run_print():
    """
    Script part of the test_print* tests: print and log a unicode string

    Calls ensure_stdout_handles_unicode first, then sends a text containing
    the unicode check mark to stdout, once through `print` and once through
    a logger created by log_helper.

    :returns: 0
    """
    from oletools.common.io_encoding import ensure_stdout_handles_unicode
    from oletools.common.log_helper import log_helper

    check_text = u'Check: \u2713'      # ends with the check mark

    ensure_stdout_handles_unicode()
    print(check_text)

    # same text again, this time via logging with stdout as stream
    test_logger = log_helper.get_or_create_silent_logger(
        'test_encoding_handler')
    log_helper.enable_logging(False, 'debug', stream=sys.stdout)
    test_logger.info(check_text)
    return 0
| 181 | + | ||
| 182 | + | ||
# tests call this file as script:
#   <this file>              --> run the unit tests
#   <this file> print        --> run_print()
#   <this file> read FILE    --> run_read(FILE)
# Wrong usage is reported on stderr and results in exit code 2.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit(unittest.main())

    # hack required to import common from parent dir, not system-wide one
    # (usually unittest seems to do that for us)
    from os.path import abspath, dirname
    ole_base = dirname(dirname(dirname(abspath(__file__))))
    sys.path.insert(0, ole_base)

    usage_error = None
    if sys.argv[1] == 'print':
        if len(sys.argv) > 2:
            usage_error = 'Expect no arg for "print"'
        else:
            sys.exit(run_print())
    elif sys.argv[1] == 'read':
        if len(sys.argv) != 3:
            usage_error = 'Expect single arg for "read"'
        else:
            sys.exit(run_read(sys.argv[2]))
    else:
        usage_error = 'Unexpected argument: {}'.format(sys.argv[1])

    # only reached on wrong usage; successful branches have exited above
    print(usage_error, file=sys.stderr)
    sys.exit(2)
| 204 | + else: | ||
| 205 | + print('Unexpected argument: {}'.format(sys.argv[1]), file=sys.stderr) | ||
| 206 | + sys.exit(2) |
tests/oleobj/test_basic.py
| @@ -10,6 +10,7 @@ from glob import glob | @@ -10,6 +10,7 @@ from glob import glob | ||
| 10 | # Directory with test data, independent of current working directory | 10 | # Directory with test data, independent of current working directory |
| 11 | from tests.test_utils import DATA_BASE_DIR, call_and_capture | 11 | from tests.test_utils import DATA_BASE_DIR, call_and_capture |
| 12 | from oletools import oleobj | 12 | from oletools import oleobj |
| 13 | +from oletools.common.io_encoding import ensure_stdout_handles_unicode | ||
| 13 | 14 | ||
| 14 | 15 | ||
| 15 | #: provide some more info to find errors | 16 | #: provide some more info to find errors |
| @@ -61,6 +62,7 @@ def calc_md5(filename): | @@ -61,6 +62,7 @@ def calc_md5(filename): | ||
| 61 | 62 | ||
| 62 | def preread_file(args): | 63 | def preread_file(args): |
| 63 | """helper for TestOleObj.test_non_streamed: preread + call process_file""" | 64 | """helper for TestOleObj.test_non_streamed: preread + call process_file""" |
| 65 | + ensure_stdout_handles_unicode() # usually, main() call this | ||
| 64 | ignore_arg, output_dir, filename = args | 66 | ignore_arg, output_dir, filename = args |
| 65 | if ignore_arg != '-d': | 67 | if ignore_arg != '-d': |
| 66 | raise ValueError('ignore_arg not as expected!') | 68 | raise ValueError('ignore_arg not as expected!') |