Commit 2f7a1ef1b347a1124d01ef6559d939be5ccf50cd
Committed by
GitHub
Merge pull request #365 from christian-intra2net/encoding-for-non-unicode-environments
Encoding for non unicode environments
Showing
9 changed files
with
484 additions
and
66 deletions
oletools/common/io_encoding.py
0 → 100644
| 1 | +#!/usr/bin/env python3 | |
| 2 | + | |
| 3 | +""" | |
| 4 | +Tool to help with input/output encoding | |
| 5 | + | |
| 6 | +Helpers to run smoothly in unicode-unfriendly environments like output redirect | |
| 7 | +or unusual language settings. | |
| 8 | + | |
| 9 | +In such settings, output to console falls back to ASCII-only. Also open() | |
| 10 | +suddenly fails to interpret non-ASCII characters. | |
| 11 | + | |
| 12 | +Therefore, scripts can call :py:func:`ensure_stdout_handles_unicode` at | |
| 13 | +startup and use :py:func:`uopen` instead of :py:func:`open` for text files. | |
| 14 | + | |
| 15 | +Part of the python-oletools package: | |
| 16 | +http://www.decalage.info/python/oletools | |
| 17 | +""" | |
| 18 | + | |
| 19 | +# === LICENSE ================================================================= | |
| 20 | + | |
| 21 | +# msodde is copyright (c) 2017-2018 Philippe Lagadec (http://www.decalage.info) | |
| 22 | +# All rights reserved. | |
| 23 | +# | |
| 24 | +# Redistribution and use in source and binary forms, with or without | |
| 25 | +# modification, are permitted provided that the following conditions are met: | |
| 26 | +# | |
| 27 | +# * Redistributions of source code must retain the above copyright notice, | |
| 28 | +# this list of conditions and the following disclaimer. | |
| 29 | +# * Redistributions in binary form must reproduce the above copyright notice, | |
| 30 | +# this list of conditions and the following disclaimer in the documentation | |
| 31 | +# and/or other materials provided with the distribution. | |
| 32 | +# | |
| 33 | +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
| 34 | +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 35 | +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 36 | +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE | |
| 37 | +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
| 38 | +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
| 39 | +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
| 40 | +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
| 41 | +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
| 42 | +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
| 43 | +# POSSIBILITY OF SUCH DAMAGE. | |
| 44 | + | |
| 45 | +# ----------------------------------------------------------------------------- | |
| 46 | +# CHANGELOG: | |
| 47 | +# 2018-11-04 v0.54 CH: - first version: ensure_stdout_handles_unicode, uopen | |
| 48 | + | |
| 49 | +# -- IMPORTS ------------------------------------------------------------------ | |
| 50 | +from __future__ import print_function | |
| 51 | +import sys | |
| 52 | +import codecs | |
| 53 | +import os | |
| 54 | +from locale import getpreferredencoding | |
| 55 | + | |
| 56 | +PY3 = sys.version_info.major >= 3 | |
| 57 | + | |
| 58 | +if PY3: | |
| 59 | + from builtins import open as builtin_open | |
| 60 | +else: | |
| 61 | + from __builtin__ import open as builtin_open | |
| 62 | + | |
| 63 | +# -- CONSTANTS ---------------------------------------------------------------- | |
| 64 | +#: encoding to use for redirection if no good encoding can be found | |
| 65 | +FALLBACK_ENCODING_REDIRECT = 'utf8' | |
| 66 | + | |
| 67 | +#: encoding for reading text from files if preferred encoding is non-unicode | |
| 68 | +FALLBACK_ENCODING_OPEN = 'utf8' | |
| 69 | + | |
| 70 | +#: print (pure-ascii) debug output to stdout | |
| 71 | +DEBUG = False | |
| 72 | + | |
| 73 | +# the encoding specified in system environment | |
| 74 | +try: | |
| 75 | + PREFERRED_ENCODING = getpreferredencoding() | |
| 76 | +except Exception as exc: | |
| 77 | + if DEBUG: | |
| 78 | + print('Exception getting preferred encoding: {}'.format(exc)) | |
| 79 | + PREFERRED_ENCODING = None | |
| 80 | + | |
| 81 | + | |
| 82 | +# -- HELPERS ------------------------------------------------------------------ | |
| 83 | + | |
| 84 | + | |
def ensure_stdout_handles_unicode():
    """
    Ensure that print()ing unicode does not lead to errors.

    When print()ing unicode, python relies on the environment (e.g. in linux on
    the setting of the LANG environment variable) to tell it how to encode
    unicode. That works nicely for modern-day shells where encoding is usually
    UTF-8. But as soon as LANG is unset or just "C", or output is redirected or
    piped, the encoding falls back to 'ASCII', which cannot handle unicode
    characters.

    Based on solutions suggested on stackoverflow (c.f.
    https://stackoverflow.com/q/27347772/4405656 ), wrap stdout in an encoder
    that solves that problem.

    What happens, in this order:

    * sys.stdout is a :py:class:`codecs.StreamWriter` already: nothing.
    * sys.stdout reports an encoding starting with "utf": nothing.
    * python3 and sys.stdout has no byte ``buffer`` (e.g. it was replaced by
      an :py:class:`io.StringIO` to capture output): nothing, since there is
      no byte stream that could be wrapped.
    * the underlying stream is a tty: keep its encoding (or ascii if none is
      known) but replace characters that cannot be encoded.
    * otherwise (redirect, pipe, stream without usable ``fileno()``): encode
      with the preferred encoding if that is a UTF variant, else with
      :py:data:`FALLBACK_ENCODING_REDIRECT`.

    In the last two cases a note is printed to stderr and sys.stdout is
    replaced by the wrapping writer.

    Unfortunately, stderr cannot be handled the same way ( see e.g. https://
    pythonhosted.org/kitchen/unicode-frustrations.html#frustration-5-exceptions
    ), so we still have to hope there is only ascii in error messages
    """
    # do not re-wrap
    if isinstance(sys.stdout, codecs.StreamWriter):
        if DEBUG:
            print('sys.stdout wrapped already')
        return

    # determine encoding of sys.stdout (attribute might not exist)
    encoding = getattr(sys.stdout, 'encoding', None)
    if DEBUG:
        print('sys.stdout encoding is {}'.format(encoding))

    if isinstance(encoding, str) and encoding.lower().startswith('utf'):
        if DEBUG:
            print('encoding is acceptable')
        return    # everything alright, we are working in a good environment

    # get output stream object; only needed from here on, so a replaced
    # stdout with a good encoding never has to provide a buffer
    if PY3:
        output_stream = getattr(sys.stdout, 'buffer', None)
        if output_stream is None:
            # text-only replacement of stdout, there are no bytes to encode
            if DEBUG:
                print('sys.stdout has no byte buffer, leave it unchanged')
            return
    else:
        output_stream = sys.stdout

    # in-memory streams have no fileno() or raise io.UnsupportedOperation
    # (a ValueError/IOError); those are certainly not terminals
    try:
        is_tty = os.isatty(output_stream.fileno())
    except (AttributeError, ValueError, IOError, OSError):
        is_tty = False

    if is_tty:    # e.g. C locale
        # Do not output UTF8 since that might be mis-interpreted.
        # Just replace chars that cannot be handled
        if isinstance(encoding, str) and encoding:
            tty_encoding = encoding
        else:
            tty_encoding = 'ascii'    # codecs.getwriter(None) would raise
        print('Encoding for stdout is only {}, will replace other chars to '
              'avoid unicode error'.format(tty_encoding), file=sys.stderr)
        sys.stdout = codecs.getwriter(tty_encoding)(output_stream,
                                                    errors='replace')
    else:    # e.g. redirection, pipe in python2
        new_encoding = PREFERRED_ENCODING
        if DEBUG:
            print('not a tty, try preferred encoding {}'.format(new_encoding))
        if not isinstance(new_encoding, str) \
                or not new_encoding.lower().startswith('utf'):
            new_encoding = FALLBACK_ENCODING_REDIRECT
            if DEBUG:
                print('preferred encoding also unacceptable, fall back to {}'
                      .format(new_encoding))
        print('Encoding for stdout is only {}, will auto-encode text with {} '
              'before output'.format(encoding, new_encoding), file=sys.stderr)
        sys.stdout = codecs.getwriter(new_encoding)(output_stream)
| 147 | + | |
| 148 | + | |
def uopen(filename, mode='r', *args, **kwargs):
    """
    Replacement for builtin open() that reads unicode even in ASCII environment

    In order to read unicode from text, python uses locale.getpreferredencoding
    to translate bytes to str. If the environment only provides ASCII encoding,
    this will fail since most office files contain unicode.

    Therefore, guess a good encoding here if necessary and open file with that.

    The call is forwarded unchanged to the builtin :py:func:`open` if

    * `mode` contains ``'b'`` (binary file), or
    * the caller specified an encoding, either as keyword argument or
      positionally (4th argument of :py:func:`open`), or
    * the preferred encoding of the environment is a UTF variant.

    Otherwise the file is opened with :py:data:`FALLBACK_ENCODING_OPEN`.

    :param filename: forwarded to builtin :py:func:`open`
    :param str mode: forwarded to builtin :py:func:`open`
    :param args: further positional args of :py:func:`open` (buffering,
                 encoding, ...)
    :param kwargs: further keyword args of :py:func:`open`
    :returns: same type as the builtin :py:func:`open`
    """
    # do not interfere if not necessary:
    if 'b' in mode:
        if DEBUG:
            print('Opening binary file, do not interfere')
        return builtin_open(filename, mode, *args, **kwargs)
    if 'encoding' in kwargs:
        if DEBUG:
            print('Opening file with encoding {!r}, do not interfere'
                  .format(kwargs['encoding']))
        return builtin_open(filename, mode, *args, **kwargs)
    # "encoding" is the 4th arg of open(); filename and mode are named
    # parameters here, so args == (buffering, encoding, ...)
    if len(args) > 1:
        if DEBUG:
            print('Opening file with encoding {!r}, do not interfere'
                  .format(args[1]))
        return builtin_open(filename, mode, *args, **kwargs)

    # determine preferred encoding
    encoding = PREFERRED_ENCODING
    if DEBUG:
        print('preferred encoding is {}'.format(encoding))

    if isinstance(encoding, str) and encoding.lower().startswith('utf'):
        if DEBUG:
            print('encoding is acceptable, open {} regularly'.format(filename))
        return builtin_open(filename, mode, *args, **kwargs)

    # so we want to read text from a file but can probably only deal with ASCII
    # --> use fallback
    if DEBUG:
        print('Opening {} with fallback encoding {}'
              .format(filename, FALLBACK_ENCODING_OPEN))
    if PY3:
        return builtin_open(filename, mode, *args,
                            encoding=FALLBACK_ENCODING_OPEN, **kwargs)
    else:
        # python2 builtin open() knows no encoding; wrap the handle instead
        handle = builtin_open(filename, mode, *args, **kwargs)
        return codecs.EncodedFile(handle, FALLBACK_ENCODING_OPEN)
oletools/common/log_helper/log_helper.py
| ... | ... | @@ -44,6 +44,7 @@ General logging helpers |
| 44 | 44 | from ._json_formatter import JsonFormatter |
| 45 | 45 | from ._logger_adapter import OletoolsLoggerAdapter |
| 46 | 46 | from . import _root_logger_wrapper |
| 47 | +from ..io_encoding import ensure_stdout_handles_unicode | |
| 47 | 48 | import logging |
| 48 | 49 | import sys |
| 49 | 50 | |
| ... | ... | @@ -92,6 +93,9 @@ class LogHelper: |
| 92 | 93 | if self._is_enabled: |
| 93 | 94 | raise ValueError('re-enabling logging. Not sure whether that is ok...') |
| 94 | 95 | |
| 96 | + if stream in (None, sys.stdout): | |
| 97 | + ensure_stdout_handles_unicode() | |
| 98 | + | |
| 95 | 99 | log_level = LOG_LEVELS[level] |
| 96 | 100 | logging.basicConfig(level=log_level, format=log_format, stream=stream) |
| 97 | 101 | self._is_enabled = True | ... | ... |
oletools/msodde.py
| ... | ... | @@ -74,6 +74,7 @@ from oletools import xls_parser |
| 74 | 74 | from oletools import rtfobj |
| 75 | 75 | from oletools.ppt_record_parser import is_ppt |
| 76 | 76 | from oletools import crypto |
| 77 | +from oletools.common.io_encoding import ensure_stdout_handles_unicode | |
| 77 | 78 | from oletools.common.log_helper import log_helper |
| 78 | 79 | |
| 79 | 80 | # ----------------------------------------------------------------------------- |
| ... | ... | @@ -236,57 +237,6 @@ DEFAULT_LOG_LEVEL = "warning" # Default log level |
| 236 | 237 | logger = log_helper.get_or_create_silent_logger('msodde') |
| 237 | 238 | |
| 238 | 239 | |
| 239 | -# === UNICODE IN PY2 ========================================================= | |
| 240 | - | |
| 241 | -def ensure_stdout_handles_unicode(): | |
| 242 | - """ Ensure stdout can handle unicode by wrapping it if necessary | |
| 243 | - | |
| 244 | - Required e.g. if output of this script is piped or redirected in a linux | |
| 245 | - shell, since then sys.stdout.encoding is ascii and cannot handle | |
| 246 | - print(unicode). In that case we need to find some compatible encoding and | |
| 247 | - wrap sys.stdout into a encoder following (many thanks!) | |
| 248 | - https://stackoverflow.com/a/1819009 or https://stackoverflow.com/a/20447935 | |
| 249 | - | |
| 250 | - Can be undone by setting sys.stdout = sys.__stdout__ | |
| 251 | - """ | |
| 252 | - import codecs | |
| 253 | - import locale | |
| 254 | - | |
| 255 | - # do not re-wrap | |
| 256 | - if isinstance(sys.stdout, codecs.StreamWriter): | |
| 257 | - return | |
| 258 | - | |
| 259 | - # try to find encoding for sys.stdout | |
| 260 | - encoding = None | |
| 261 | - try: | |
| 262 | - encoding = sys.stdout.encoding | |
| 263 | - except AttributeError: # variable "encoding" might not exist | |
| 264 | - pass | |
| 265 | - | |
| 266 | - if encoding not in (None, '', 'ascii'): | |
| 267 | - return # no need to wrap | |
| 268 | - | |
| 269 | - # try to find an encoding that can handle unicode | |
| 270 | - try: | |
| 271 | - encoding = locale.getpreferredencoding() | |
| 272 | - except Exception: | |
| 273 | - pass | |
| 274 | - | |
| 275 | - # fallback if still no encoding available | |
| 276 | - if encoding in (None, '', 'ascii'): | |
| 277 | - encoding = 'utf8' | |
| 278 | - | |
| 279 | - # logging is probably not initialized yet, but just in case | |
| 280 | - logger.debug('wrapping sys.stdout with encoder using {0}'.format(encoding)) | |
| 281 | - | |
| 282 | - wrapper = codecs.getwriter(encoding) | |
| 283 | - sys.stdout = wrapper(sys.stdout) | |
| 284 | - | |
| 285 | - | |
| 286 | -if sys.version_info.major < 3: | |
| 287 | - ensure_stdout_handles_unicode() # e.g. for print(text) in main() | |
| 288 | - | |
| 289 | - | |
| 290 | 240 | # === ARGUMENT PARSING ======================================================= |
| 291 | 241 | |
| 292 | 242 | class ArgParserWithBanner(argparse.ArgumentParser): |
| ... | ... | @@ -820,10 +770,15 @@ def process_csv(filepath): |
| 820 | 770 | chars the same way that excel does. Tested to some extent in unittests. |
| 821 | 771 | |
| 822 | 772 | This can only find DDE-links, no other "suspicious" constructs (yet). |
| 823 | - """ | |
| 824 | 773 | |
| 774 | + Cannot deal with unicode files yet (need more than just use uopen()). | |
| 775 | + """ | |
| 825 | 776 | results = [] |
| 826 | - with open(filepath, 'r') as file_handle: | |
| 777 | + if sys.version_info.major <= 2: | |
| 778 | + open_arg = dict(mode='rb') | |
| 779 | + else: | |
| 780 | + open_arg = dict(newline='') | |
| 781 | + with open(filepath, **open_arg) as file_handle: | |
| 827 | 782 | results, dialect = process_csv_dialect(file_handle, CSV_DELIMITERS) |
| 828 | 783 | is_small = file_handle.tell() < CSV_SMALL_THRESH |
| 829 | 784 | |
| ... | ... | @@ -854,7 +809,6 @@ def process_csv(filepath): |
| 854 | 809 | |
| 855 | 810 | def process_csv_dialect(file_handle, delimiters): |
| 856 | 811 | """ helper for process_csv: process with a specific csv dialect """ |
| 857 | - | |
| 858 | 812 | # determine dialect = delimiter chars, quote chars, ... |
| 859 | 813 | dialect = csv.Sniffer().sniff(file_handle.read(CSV_SMALL_THRESH), |
| 860 | 814 | delimiters=delimiters) | ... | ... |
oletools/olemeta.py
| ... | ... | @@ -79,6 +79,7 @@ if not _parent_dir in sys.path: |
| 79 | 79 | import olefile |
| 80 | 80 | from oletools.thirdparty import xglob |
| 81 | 81 | from oletools.thirdparty.tablestream import tablestream |
| 82 | +from oletools.common.io_encoding import ensure_stdout_handles_unicode | |
| 82 | 83 | |
| 83 | 84 | |
| 84 | 85 | #=== MAIN ================================================================= |
| ... | ... | @@ -88,13 +89,12 @@ def process_ole(ole): |
| 88 | 89 | meta = ole.get_metadata() |
| 89 | 90 | |
| 90 | 91 | # console output with UTF8 encoding: |
| 91 | - # It looks like we do not need the UTF8 codec anymore, both for Python 2 and 3 | |
| 92 | - console_utf8 = sys.stdout #codecs.getwriter('utf8')(sys.stdout) | |
| 92 | + ensure_stdout_handles_unicode() | |
| 93 | 93 | |
| 94 | 94 | # TODO: move similar code to a function |
| 95 | 95 | |
| 96 | 96 | print('Properties from the SummaryInformation stream:') |
| 97 | - t = tablestream.TableStream([21, 30], header_row=['Property', 'Value'], outfile=console_utf8) | |
| 97 | + t = tablestream.TableStream([21, 30], header_row=['Property', 'Value']) | |
| 98 | 98 | for prop in meta.SUMMARY_ATTRIBS: |
| 99 | 99 | value = getattr(meta, prop) |
| 100 | 100 | if value is not None: |
| ... | ... | @@ -111,7 +111,7 @@ def process_ole(ole): |
| 111 | 111 | print('') |
| 112 | 112 | |
| 113 | 113 | print('Properties from the DocumentSummaryInformation stream:') |
| 114 | - t = tablestream.TableStream([21, 30], header_row=['Property', 'Value'], outfile=console_utf8) | |
| 114 | + t = tablestream.TableStream([21, 30], header_row=['Property', 'Value']) | |
| 115 | 115 | for prop in meta.DOCSUM_ATTRIBS: |
| 116 | 116 | value = getattr(meta, prop) |
| 117 | 117 | if value is not None: | ... | ... |
oletools/oleobj.py
| ... | ... | @@ -73,6 +73,7 @@ except ImportError: |
| 73 | 73 | from oletools.ppt_record_parser import (is_ppt, PptFile, |
| 74 | 74 | PptRecordExOleVbaActiveXAtom) |
| 75 | 75 | from oletools.ooxml import XmlParser |
| 76 | +from oletools.common.io_encoding import ensure_stdout_handles_unicode | |
| 76 | 77 | |
| 77 | 78 | # ----------------------------------------------------------------------------- |
| 78 | 79 | # CHANGELOG: |
| ... | ... | @@ -848,6 +849,7 @@ def main(cmd_line_args=None): |
| 848 | 849 | provide other arguments. |
| 849 | 850 | """ |
| 850 | 851 | # print banner with version |
| 852 | + ensure_stdout_handles_unicode() | |
| 851 | 853 | print('oleobj %s - http://decalage.info/oletools' % __version__) |
| 852 | 854 | print('THIS IS WORK IN PROGRESS - Check updates regularly!') |
| 853 | 855 | print('Please report any issue at ' | ... | ... |
oletools/olevba.py
| ... | ... | @@ -318,6 +318,7 @@ from oletools import ppt_parser |
| 318 | 318 | from oletools import oleform |
| 319 | 319 | from oletools import rtfobj |
| 320 | 320 | from oletools import crypto |
| 321 | +from oletools.common.io_encoding import ensure_stdout_handles_unicode | |
| 321 | 322 | from oletools.common import codepages |
| 322 | 323 | |
| 323 | 324 | # monkeypatch email to fix issue #32: |
| ... | ... | @@ -4122,7 +4123,6 @@ def main(cmd_line_args=None): |
| 4122 | 4123 | in process_args. Per default (cmd_line_args=None), sys.argv is used. Option |
| 4123 | 4124 | mainly added for unit-testing |
| 4124 | 4125 | """ |
| 4125 | - | |
| 4126 | 4126 | options, args = parse_args(cmd_line_args) |
| 4127 | 4127 | |
| 4128 | 4128 | # provide info about tool and its version | ... | ... |
oletools/ooxml.py
| ... | ... | @@ -13,14 +13,47 @@ TODO: check what is duplicate here with oleid, maybe merge some day? |
| 13 | 13 | TODO: "xml2003" == "flatopc"? |
| 14 | 14 | |
| 15 | 15 | .. codeauthor:: Intra2net AG <info@intra2net> |
| 16 | +License: BSD, see source code or documentation | |
| 17 | + | |
| 18 | +ooxml is part of the python-oletools package: | |
| 19 | +http://www.decalage.info/python/oletools | |
| 16 | 20 | """ |
| 17 | 21 | |
| 22 | +# === LICENSE ================================================================= | |
| 23 | + | |
| 24 | +# msodde is copyright (c) 2017-2019 Philippe Lagadec (http://www.decalage.info) | |
| 25 | +# All rights reserved. | |
| 26 | +# | |
| 27 | +# Redistribution and use in source and binary forms, with or without | |
| 28 | +# modification, are permitted provided that the following conditions are met: | |
| 29 | +# | |
| 30 | +# * Redistributions of source code must retain the above copyright notice, | |
| 31 | +# this list of conditions and the following disclaimer. | |
| 32 | +# * Redistributions in binary form must reproduce the above copyright notice, | |
| 33 | +# this list of conditions and the following disclaimer in the documentation | |
| 34 | +# and/or other materials provided with the distribution. | |
| 35 | +# | |
| 36 | +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
| 37 | +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 38 | +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 39 | +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE | |
| 40 | +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
| 41 | +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
| 42 | +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
| 43 | +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
| 44 | +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
| 45 | +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
| 46 | +# POSSIBILITY OF SUCH DAMAGE. | |
| 47 | + | |
| 48 | +# -- IMPORTS ------------------------------------------------------------------ | |
| 49 | + | |
| 18 | 50 | import sys |
| 51 | +from oletools.common.log_helper import log_helper | |
| 52 | +from oletools.common.io_encoding import uopen | |
| 19 | 53 | from zipfile import ZipFile, BadZipfile, is_zipfile |
| 20 | 54 | from os.path import splitext |
| 21 | 55 | import io |
| 22 | 56 | import re |
| 23 | -from oletools.common.log_helper import log_helper | |
| 24 | 57 | |
| 25 | 58 | # import lxml or ElementTree for XML parsing: |
| 26 | 59 | try: |
| ... | ... | @@ -29,6 +62,18 @@ try: |
| 29 | 62 | except ImportError: |
| 30 | 63 | import xml.etree.cElementTree as ET |
| 31 | 64 | |
| 65 | +# ----------------------------------------------------------------------------- | |
| 66 | +# CHANGELOG: | |
| 67 | +# 2018-12-06 CH: - ensure stdout can handle unicode | |
| 68 | + | |
| 69 | +__version__ = '0.54.2' | |
| 70 | + | |
| 71 | + | |
| 72 | +############################################################################### | |
| 73 | +# CONSTANTS | |
| 74 | +############################################################################### | |
| 75 | + | |
| 76 | + | |
| 32 | 77 | logger = log_helper.get_or_create_silent_logger('ooxml') |
| 33 | 78 | |
| 34 | 79 | #: subfiles that have to be part of every ooxml file |
| ... | ... | @@ -127,7 +172,7 @@ def get_type(filename): |
| 127 | 172 | parser = XmlParser(filename) |
| 128 | 173 | if parser.is_single_xml(): |
| 129 | 174 | match = None |
| 130 | - with open(filename, 'r') as handle: | |
| 175 | + with uopen(filename, 'r') as handle: | |
| 131 | 176 | match = re.search(OFFICE_XML_PROGID_REGEX, handle.read(1024)) |
| 132 | 177 | if not match: |
| 133 | 178 | return DOCTYPE_NONE |
| ... | ... | @@ -416,7 +461,7 @@ class XmlParser(object): |
| 416 | 461 | |
| 417 | 462 | # find prog id in xml prolog |
| 418 | 463 | match = None |
| 419 | - with open(self.filename, 'r') as handle: | |
| 464 | + with uopen(self.filename, 'r') as handle: | |
| 420 | 465 | match = re.search(OFFICE_XML_PROGID_REGEX, handle.read(1024)) |
| 421 | 466 | if match: |
| 422 | 467 | self._is_single_xml = True |
| ... | ... | @@ -424,11 +469,18 @@ class XmlParser(object): |
| 424 | 469 | raise BadOOXML(self.filename, 'is no zip and has no prog_id') |
| 425 | 470 | |
| 426 | 471 | def iter_files(self, args=None): |
| 427 | - """ Find files in zip or just give single xml file """ | |
| 472 | + """ | |
| 473 | + Find files in zip or just give single xml file | |
| 474 | + | |
| 475 | + yields pairs (subfile-name, file-handle) where file-handle is an open | |
| 476 | + file-like object. (Do not care too much about encoding here, the xml | |
| 477 | + parser reads the encoding from the first lines in the file.) | |
| 478 | + """ | |
| 428 | 479 | if self.is_single_xml(): |
| 429 | 480 | if args: |
| 430 | 481 | raise BadOOXML(self.filename, 'xml has no subfiles') |
| 431 | - with open(self.filename, 'r') as handle: | |
| 482 | + # do not use uopen, xml parser determines encoding on its own | |
| 483 | + with open(self.filename, 'rb') as handle: | |
| 432 | 484 | yield None, handle # the subfile=None is needed in iter_xml |
| 433 | 485 | self.did_iter_all = True |
| 434 | 486 | else: |
| ... | ... | @@ -638,9 +690,10 @@ class XmlParser(object): |
| 638 | 690 | |
| 639 | 691 | |
| 640 | 692 | def test(): |
| 641 | - """ Main function, called when running file as script | |
| 693 | + """ | |
| 694 | + Test xml parsing; called when running this file as a script. | |
| 642 | 695 | |
| 643 | - see module doc for more info | |
| 696 | + Prints every element found in input file (to be given as command line arg). | |
| 644 | 697 | """ |
| 645 | 698 | log_helper.enable_logging(False, 'debug') |
| 646 | 699 | if len(sys.argv) != 2: | ... | ... |
tests/common/test_encoding_handler.py
0 → 100644
| 1 | +"""Test common.ensure_stdout_handles_unicode""" | |
| 2 | + | |
| 3 | +from __future__ import print_function | |
| 4 | + | |
| 5 | +import unittest | |
| 6 | +import sys | |
| 7 | +from subprocess import check_call, CalledProcessError | |
| 8 | +from tempfile import mkstemp | |
| 9 | +import os | |
| 10 | +from os.path import isfile | |
| 11 | +from contextlib import contextmanager | |
| 12 | + | |
| 13 | +FILE_TEXT = u'The unicode check mark is \u2713.\n' | |
| 14 | + | |
@contextmanager
def temp_file(just_name=True):
    """
    Context manager that creates temp file and deletes it in the end.

    The file is created using :py:func:`tempfile.mkstemp`; its os-level
    descriptor is kept open until the context is left so the file name stays
    reserved for us.

    :param bool just_name: if True (default), yield only the file name;
                           otherwise also open the file for writing bytes and
                           yield the pair `(handle, name)`.
    """
    tmp_descriptor = None
    tmp_name = None
    tmp_handle = None
    try:
        tmp_descriptor, tmp_name = mkstemp()

        # we create our own file handle since we want to be able to close the
        # file and open it again for reading.
        if just_name:
            yield tmp_name
        else:
            tmp_handle = open(tmp_name, 'wb')
            yield tmp_handle, tmp_name
    finally:
        # exceptions from the with-body propagate on their own, no need to
        # catch and re-raise them; just make sure the file is removed even if
        # closing fails
        try:
            if tmp_descriptor is not None:
                os.close(tmp_descriptor)
            if tmp_handle is not None:
                tmp_handle.close()    # no-op if caller closed it already
        finally:
            if tmp_name is not None and isfile(tmp_name):
                os.unlink(tmp_name)
| 42 | + | |
| 43 | + | |
class TestEncodingHandler(unittest.TestCase):
    """
    Tests replacing stdout encoding in various scenarios

    Every test runs this very file as a script in a child process (see the
    `__main__` section at the end of the file) and fails if the child exits
    with a non-zero code.
    """

    def _run_as_script(self, script_args, lang=None, stdout=None):
        """
        Run this file as script with given args, fail test on non-zero exit.

        The command is given to subprocess as a list (no shell), so paths
        containing spaces or shell meta characters work.

        :param script_args: list of command line args for the script
        :param lang: if not None, value for env variable LANG of the child
        :param stdout: if not None, open binary file to redirect stdout to
        """
        command = [sys.executable, __file__] + list(script_args)
        env = None                     # None = inherit our environment
        if lang is not None:
            env = dict(os.environ, LANG=lang)
        try:
            check_call(command, env=env, stdout=stdout)
        except CalledProcessError as cpe:
            # check_call does not capture output, so cpe.output is always
            # None; report command and exit code instead
            self.fail('{!r} exited with code {}'
                      .format(command, cpe.returncode))

    def test_print(self):
        """Test regular unicode output not raise error"""
        self._run_as_script(['print'])

    def test_print_redirect(self):
        """
        Test redirection of unicode output to files does not raise error

        TODO: test this on non-linux OSs
        """
        with temp_file() as tmp_file:
            with open(tmp_file, 'wb') as redirect:
                self._run_as_script(['print'], stdout=redirect)

    @unittest.skipIf(not sys.platform.startswith('linux'),
                     'Only tested on linux sofar')
    def test_print_no_lang(self):
        """
        Test that unicode output with LANG=C does not raise error

        TODO: Adapt this for other OSs; for win create batch script
        """
        self._run_as_script(['print'], lang='C')

    def test_uopen(self):
        """Test that uopen in a nice environment is ok"""
        with temp_file(False) as (tmp_handle, tmp_file):
            tmp_handle.write(FILE_TEXT.encode('utf8'))
            tmp_handle.close()

            self._run_as_script(['read', tmp_file])

    def test_uopen_redirect(self):
        """
        Test that uopen works if stdout is redirected to a file

        TODO: test this on non-linux OSs
        """
        with temp_file(False) as (tmp_handle, tmp_file):
            tmp_handle.write(FILE_TEXT.encode('utf8'))
            tmp_handle.close()

            with temp_file() as redirect_file:
                with open(redirect_file, 'wb') as redirect:
                    self._run_as_script(['read', tmp_file], stdout=redirect)

    @unittest.skipIf(not sys.platform.startswith('linux'),
                     'Only tested on linux sofar')
    def test_uopen_no_lang(self):
        """
        Test that uopen in a C-LANG environment is ok

        TODO: Adapt this for other OSs; for win create batch script
        """
        with temp_file(False) as (tmp_handle, tmp_file):
            tmp_handle.write(FILE_TEXT.encode('utf8'))
            tmp_handle.close()

            self._run_as_script(['read', tmp_file], lang='C')
| 130 | + | |
| 131 | + | |
def run_read(filename):
    """
    Script entry point used by the test_uopen* tests: read text and compare.

    Opens `filename` through uopen, checks a few properties of the returned
    handle inside and after the `with` block and compares the text read with
    FILE_TEXT (its utf8-encoded form in python2).

    :returns: 0 if all is well
    :raises ValueError: if any of the checks fails
    """
    from oletools.common.io_encoding import uopen

    with uopen(filename, 'rt') as reader:
        # handle must be usable, named like the file and at position 0
        if reader.closed:
            raise ValueError('handle is closed!')
        if not reader.name == filename:
            raise ValueError('Wrong filename {}'.format(reader.name))
        if reader.isatty():
            raise ValueError('Reader is a tty!')
        if not reader.tell() == 0:
            raise ValueError('Reader.tell is not 0 at beginning')

        text = reader.read()

    # leaving the context must have closed the handle, nothing else changed
    if not reader.closed:
        raise ValueError('Reader is not closed outside context')
    if not reader.name == filename:
        raise ValueError('Wrong filename {} after context'.format(reader.name))
    if reader.isatty():
        raise ValueError('Reader has become a tty!')

    # python3 should give real unicode, python2 an encoded byte string
    is_py3 = sys.version_info.major > 2
    expect = FILE_TEXT if is_py3 else FILE_TEXT.encode('utf8')
    if not text == expect:
        raise ValueError('Wrong contents: {!r} != {!r}'
                         .format(text, expect))
    return 0
| 167 | + | |
| 168 | + | |
def run_print():
    """
    Script entry point used by the test_print* tests: print and log unicode.

    Prepares stdout using ensure_stdout_handles_unicode, prints a text with a
    unicode check mark and then logs the same text to stdout.

    :returns: 0
    """
    from oletools.common.io_encoding import ensure_stdout_handles_unicode
    from oletools.common.log_helper import log_helper

    check_text = u'Check: \u2713'    # contains the unicode check mark

    # plain print
    ensure_stdout_handles_unicode()
    print(check_text)

    # same text through the logging machinery
    logger = log_helper.get_or_create_silent_logger('test_encoding_handler')
    log_helper.enable_logging(False, 'debug', stream=sys.stdout)
    logger.info(check_text)
    return 0
| 181 | + | |
| 182 | + | |
# The unit tests above call this file as a script: without argument run the
# unit tests, with "print" or "read <file>" run the corresponding helper.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit(unittest.main())

    # hack required to import common from parent dir, not system-wide one
    # (usually unittest seems to do that for us)
    from os.path import abspath, dirname, join
    ole_base = abspath(__file__)
    for _ in range(3):                 # go up 3 levels: file -> oletools base
        ole_base = dirname(ole_base)
    sys.path.insert(0, ole_base)

    n_args = len(sys.argv)
    if sys.argv[1] == 'print':
        if n_args > 2:
            print('Expect no arg for "print"', file=sys.stderr)
            sys.exit(2)
        sys.exit(run_print())

    if sys.argv[1] == 'read':
        if n_args != 3:
            print('Expect single arg for "read"', file=sys.stderr)
            sys.exit(2)
        sys.exit(run_read(sys.argv[2]))

    # neither "print" nor "read"
    print('Unexpected argument: {}'.format(sys.argv[1]), file=sys.stderr)
    sys.exit(2)
tests/oleobj/test_basic.py
| ... | ... | @@ -10,6 +10,7 @@ from glob import glob |
| 10 | 10 | # Directory with test data, independent of current working directory |
| 11 | 11 | from tests.test_utils import DATA_BASE_DIR, call_and_capture |
| 12 | 12 | from oletools import oleobj |
| 13 | +from oletools.common.io_encoding import ensure_stdout_handles_unicode | |
| 13 | 14 | |
| 14 | 15 | |
| 15 | 16 | #: provide some more info to find errors |
| ... | ... | @@ -61,6 +62,7 @@ def calc_md5(filename): |
| 61 | 62 | |
| 62 | 63 | def preread_file(args): |
| 63 | 64 | """helper for TestOleObj.test_non_streamed: preread + call process_file""" |
| 65 | + ensure_stdout_handles_unicode() # usually, main() call this | |
| 64 | 66 | ignore_arg, output_dir, filename = args |
| 65 | 67 | if ignore_arg != '-d': |
| 66 | 68 | raise ValueError('ignore_arg not as expected!') | ... | ... |