Commit 2f7a1ef1b347a1124d01ef6559d939be5ccf50cd

Authored by Philippe Lagadec
Committed by GitHub
2 parents afeead80 0b3af2da

Merge pull request #365 from christian-intra2net/encoding-for-non-unicode-environments

Encoding for non unicode environments
oletools/common/io_encoding.py 0 → 100644
  1 +#!/usr/bin/env python3
  2 +
  3 +"""
  4 +Tool to help with input/output encoding
  5 +
  6 +Helpers to run smoothly in unicode-unfriendly environments like output redirect
  7 +or unusual language settings.
  8 +
  9 +In such settings, output to console falls back to ASCII-only. Also open()
  10 +suddenly fails to interpret non-ASCII characters.
  11 +
  12 +Therefore, scripts should call :py:meth:`ensure_stdout_handles_unicode` at
  13 +startup and use :py:meth:`uopen` in place of :py:meth:`open` for text files.
  14 +
  15 +Part of the python-oletools package:
  16 +http://www.decalage.info/python/oletools
  17 +"""
  18 +
  19 +# === LICENSE =================================================================
  20 +
  21 +# msodde is copyright (c) 2017-2018 Philippe Lagadec (http://www.decalage.info)
  22 +# All rights reserved.
  23 +#
  24 +# Redistribution and use in source and binary forms, with or without
  25 +# modification, are permitted provided that the following conditions are met:
  26 +#
  27 +# * Redistributions of source code must retain the above copyright notice,
  28 +# this list of conditions and the following disclaimer.
  29 +# * Redistributions in binary form must reproduce the above copyright notice,
  30 +# this list of conditions and the following disclaimer in the documentation
  31 +# and/or other materials provided with the distribution.
  32 +#
  33 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  34 +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  35 +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  36 +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  37 +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  38 +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  39 +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  40 +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  41 +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  42 +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  43 +# POSSIBILITY OF SUCH DAMAGE.
  44 +
  45 +# -----------------------------------------------------------------------------
  46 +# CHANGELOG:
  47 +# 2018-11-04 v0.54 CH: - first version: ensure_stdout_handles_unicode, uopen
  48 +
  49 +# -- IMPORTS ------------------------------------------------------------------
  50 +from __future__ import print_function
  51 +import sys
  52 +import codecs
  53 +import os
  54 +from locale import getpreferredencoding
  55 +
# True when running under python 3 (or newer)
PY3 = sys.version_info[0] >= 3

# make the real builtin open() available under a version-independent name
if not PY3:
    from __builtin__ import open as builtin_open
else:
    from builtins import open as builtin_open

# -- CONSTANTS ----------------------------------------------------------------
#: encoding to use for redirection if no good encoding can be found
FALLBACK_ENCODING_REDIRECT = 'utf8'

#: encoding for reading text from files if preferred encoding is non-unicode
FALLBACK_ENCODING_OPEN = 'utf8'

#: print (pure-ascii) debug output to stdout
DEBUG = False

#: the encoding specified in system environment; None if it cannot be
#: determined
try:
    PREFERRED_ENCODING = getpreferredencoding()
except Exception as error:
    PREFERRED_ENCODING = None
    if DEBUG:
        print('Exception getting preferred encoding: {}'.format(error))
  80 +
  81 +
  82 +# -- HELPERS =-----------------------------------------------------------------
  83 +
  84 +
def ensure_stdout_handles_unicode():
    """
    Ensure that print()ing unicode does not lead to errors.

    When print()ing unicode, python relies on the environment (e.g. in linux on
    the setting of the LANG environment variable) to tell it how to encode
    unicode. That works nicely for modern-day shells where encoding is usually
    UTF-8. But as soon as LANG is unset or just "C", or output is redirected or
    piped, the encoding falls back to 'ASCII', which cannot handle unicode
    characters.

    Based on solutions suggested on stackoverflow (c.f.
    https://stackoverflow.com/q/27347772/4405656 ), wrap stdout in an encoder
    that solves that problem.

    Behaviour in detail:

    * sys.stdout is already a :py:class:`codecs.StreamWriter` or reports an
      encoding starting with "utf": nothing is changed.
    * python3 and sys.stdout has no binary ``buffer`` attribute (e.g. an
      in-memory text stream installed by a test runner): nothing is changed
      since there is no byte stream an encoder could write to.
    * stdout is a tty with a non-utf encoding: sys.stdout is replaced by a
      writer for that encoding which replaces chars it cannot encode.
    * otherwise (redirect, pipe, stream without file descriptor): sys.stdout
      is replaced by a writer using the preferred encoding if that is a utf
      encoding, else :py:data:`FALLBACK_ENCODING_REDIRECT`.

    In the last two cases a one-line notice is printed to stderr.

    Unfortunately, stderr cannot be handled the same way ( see e.g. https://
    pythonhosted.org/kitchen/unicode-frustrations.html#frustration-5-exceptions
    ), so we still have to hope there is only ascii in error messages

    :returns: None; may replace :py:data:`sys.stdout` as a side effect
    """
    # do not re-wrap
    if isinstance(sys.stdout, codecs.StreamWriter):
        if DEBUG:
            print('sys.stdout wrapped already')
        return

    # determine encoding of sys.stdout (attribute "encoding" might not exist)
    encoding = getattr(sys.stdout, 'encoding', None)
    if DEBUG:
        print('sys.stdout encoding is {}'.format(encoding))

    if isinstance(encoding, str) and encoding.lower().startswith('utf'):
        if DEBUG:
            print('encoding is acceptable')
        return   # everything alright, we are working in a good environment

    # get output stream object the encoder will write to. Only look for it
    # now that we know we have to wrap: replaced stdouts may not have one
    if PY3:
        output_stream = getattr(sys.stdout, 'buffer', None)
        if output_stream is None:
            if DEBUG:
                print('sys.stdout has no binary buffer, cannot wrap it')
            return
    else:
        output_stream = sys.stdout

    # streams that are not backed by a real file have no usable fileno();
    # those are certainly no terminals
    try:
        is_tty = os.isatty(output_stream.fileno())
    except (AttributeError, ValueError, EnvironmentError):
        is_tty = False

    # best effort: write out text still buffered in the old stdout so it
    # cannot appear after text written through the new wrapper
    try:
        sys.stdout.flush()
    except (AttributeError, ValueError, EnvironmentError):
        pass

    if is_tty:    # e.g. C locale
        # Do not output UTF8 since that might be mis-interpreted.
        # Just replace chars that cannot be handled
        print('Encoding for stdout is only {}, will replace other chars to '
              'avoid unicode error'.format(encoding), file=sys.stderr)
        # codecs.getwriter() needs a codec name; without one assume ascii
        if isinstance(encoding, str) and encoding:
            tty_encoding = encoding
        else:
            tty_encoding = 'ascii'
        sys.stdout = codecs.getwriter(tty_encoding)(output_stream,
                                                    errors='replace')
    else:    # e.g. redirection, pipe in python2
        new_encoding = PREFERRED_ENCODING
        if DEBUG:
            print('not a tty, try preferred encoding {}'.format(new_encoding))
        if not isinstance(new_encoding, str) \
                or not new_encoding.lower().startswith('utf'):
            new_encoding = FALLBACK_ENCODING_REDIRECT
            if DEBUG:
                print('preferred encoding also unacceptable, fall back to {}'
                      .format(new_encoding))
        print('Encoding for stdout is only {}, will auto-encode text with {} '
              'before output'.format(encoding, new_encoding), file=sys.stderr)
        sys.stdout = codecs.getwriter(new_encoding)(output_stream)
  147 +
  148 +
def uopen(filename, mode='r', *args, **kwargs):
    """
    Replacement for builtin open() that reads unicode even in ASCII environment

    In order to read unicode from text, python uses locale.getpreferredencoding
    to translate bytes to str. If the environment only provides ASCII encoding,
    this will fail since most office files contain unicode.

    Therefore, guess a good encoding here if necessary and open file with that.

    The call is passed to the builtin :py:func:`open` unchanged if the file is
    opened in binary mode, if the caller specified an encoding (as keyword or
    as positional argument) or if the preferred encoding of the environment
    is a utf encoding. Otherwise the file is opened with
    :py:data:`FALLBACK_ENCODING_OPEN`.

    :param filename: file to open, forwarded to builtin :py:func:`open`
    :param str mode: file mode, forwarded to builtin :py:func:`open`
    :param args: further positional args for :py:func:`open`; since filename
                 and mode are named here, these start with `buffering`,
                 followed by `encoding`
    :param kwargs: further keyword args for :py:func:`open`
    :returns: same type as the builtin :py:func:`open`; except for the
              fallback case in python2 where the result of
              :py:func:`codecs.EncodedFile` wrapping the opened file is
              returned
    """
    # do not interfere if not necessary:
    if 'b' in mode:
        if DEBUG:
            print('Opening binary file, do not interfere')
        return builtin_open(filename, mode, *args, **kwargs)
    if 'encoding' in kwargs:
        if DEBUG:
            print('Opening file with encoding {!r}, do not interfere'
                  .format(kwargs['encoding']))
        return builtin_open(filename, mode, *args, **kwargs)
    # "encoding" is the 4th arg of open(), but filename and mode are not part
    # of args, so it is found at index 1 (index 0 is "buffering")
    if len(args) > 1:
        if DEBUG:
            print('Opening file with encoding {!r}, do not interfere'
                  .format(args[1]))
        return builtin_open(filename, mode, *args, **kwargs)

    # determine preferred encoding
    encoding = PREFERRED_ENCODING
    if DEBUG:
        print('preferred encoding is {}'.format(encoding))

    if isinstance(encoding, str) and encoding.lower().startswith('utf'):
        if DEBUG:
            print('encoding is acceptable, open {} regularly'.format(filename))
        return builtin_open(filename, mode, *args, **kwargs)

    # so we want to read text from a file but can probably only deal with ASCII
    # --> use fallback
    if DEBUG:
        print('Opening {} with fallback encoding {}'
              .format(filename, FALLBACK_ENCODING_OPEN))
    if PY3:
        return builtin_open(filename, mode, *args,
                            encoding=FALLBACK_ENCODING_OPEN, **kwargs)
    else:
        # builtin open() of python2 has no encoding parameter, so wrap the
        # opened file instead
        handle = builtin_open(filename, mode, *args, **kwargs)
        return codecs.EncodedFile(handle, FALLBACK_ENCODING_OPEN)
oletools/common/log_helper/log_helper.py
@@ -44,6 +44,7 @@ General logging helpers @@ -44,6 +44,7 @@ General logging helpers
44 from ._json_formatter import JsonFormatter 44 from ._json_formatter import JsonFormatter
45 from ._logger_adapter import OletoolsLoggerAdapter 45 from ._logger_adapter import OletoolsLoggerAdapter
46 from . import _root_logger_wrapper 46 from . import _root_logger_wrapper
  47 +from ..io_encoding import ensure_stdout_handles_unicode
47 import logging 48 import logging
48 import sys 49 import sys
49 50
@@ -92,6 +93,9 @@ class LogHelper: @@ -92,6 +93,9 @@ class LogHelper:
92 if self._is_enabled: 93 if self._is_enabled:
93 raise ValueError('re-enabling logging. Not sure whether that is ok...') 94 raise ValueError('re-enabling logging. Not sure whether that is ok...')
94 95
  96 + if stream in (None, sys.stdout):
  97 + ensure_stdout_handles_unicode()
  98 +
95 log_level = LOG_LEVELS[level] 99 log_level = LOG_LEVELS[level]
96 logging.basicConfig(level=log_level, format=log_format, stream=stream) 100 logging.basicConfig(level=log_level, format=log_format, stream=stream)
97 self._is_enabled = True 101 self._is_enabled = True
oletools/msodde.py
@@ -74,6 +74,7 @@ from oletools import xls_parser @@ -74,6 +74,7 @@ from oletools import xls_parser
74 from oletools import rtfobj 74 from oletools import rtfobj
75 from oletools.ppt_record_parser import is_ppt 75 from oletools.ppt_record_parser import is_ppt
76 from oletools import crypto 76 from oletools import crypto
  77 +from oletools.common.io_encoding import ensure_stdout_handles_unicode
77 from oletools.common.log_helper import log_helper 78 from oletools.common.log_helper import log_helper
78 79
79 # ----------------------------------------------------------------------------- 80 # -----------------------------------------------------------------------------
@@ -236,57 +237,6 @@ DEFAULT_LOG_LEVEL = "warning" # Default log level @@ -236,57 +237,6 @@ DEFAULT_LOG_LEVEL = "warning" # Default log level
236 logger = log_helper.get_or_create_silent_logger('msodde') 237 logger = log_helper.get_or_create_silent_logger('msodde')
237 238
238 239
239 -# === UNICODE IN PY2 =========================================================  
240 -  
241 -def ensure_stdout_handles_unicode():  
242 - """ Ensure stdout can handle unicode by wrapping it if necessary  
243 -  
244 - Required e.g. if output of this script is piped or redirected in a linux  
245 - shell, since then sys.stdout.encoding is ascii and cannot handle  
246 - print(unicode). In that case we need to find some compatible encoding and  
247 - wrap sys.stdout into a encoder following (many thanks!)  
248 - https://stackoverflow.com/a/1819009 or https://stackoverflow.com/a/20447935  
249 -  
250 - Can be undone by setting sys.stdout = sys.__stdout__  
251 - """  
252 - import codecs  
253 - import locale  
254 -  
255 - # do not re-wrap  
256 - if isinstance(sys.stdout, codecs.StreamWriter):  
257 - return  
258 -  
259 - # try to find encoding for sys.stdout  
260 - encoding = None  
261 - try:  
262 - encoding = sys.stdout.encoding  
263 - except AttributeError: # variable "encoding" might not exist  
264 - pass  
265 -  
266 - if encoding not in (None, '', 'ascii'):  
267 - return # no need to wrap  
268 -  
269 - # try to find an encoding that can handle unicode  
270 - try:  
271 - encoding = locale.getpreferredencoding()  
272 - except Exception:  
273 - pass  
274 -  
275 - # fallback if still no encoding available  
276 - if encoding in (None, '', 'ascii'):  
277 - encoding = 'utf8'  
278 -  
279 - # logging is probably not initialized yet, but just in case  
280 - logger.debug('wrapping sys.stdout with encoder using {0}'.format(encoding))  
281 -  
282 - wrapper = codecs.getwriter(encoding)  
283 - sys.stdout = wrapper(sys.stdout)  
284 -  
285 -  
286 -if sys.version_info.major < 3:  
287 - ensure_stdout_handles_unicode() # e.g. for print(text) in main()  
288 -  
289 -  
290 # === ARGUMENT PARSING ======================================================= 240 # === ARGUMENT PARSING =======================================================
291 241
292 class ArgParserWithBanner(argparse.ArgumentParser): 242 class ArgParserWithBanner(argparse.ArgumentParser):
@@ -820,10 +770,15 @@ def process_csv(filepath): @@ -820,10 +770,15 @@ def process_csv(filepath):
820 chars the same way that excel does. Tested to some extend in unittests. 770 chars the same way that excel does. Tested to some extend in unittests.
821 771
822 This can only find DDE-links, no other "suspicious" constructs (yet). 772 This can only find DDE-links, no other "suspicious" constructs (yet).
823 - """  
824 773
  774 + Cannot deal with unicode files yet (need more than just use uopen()).
  775 + """
825 results = [] 776 results = []
826 - with open(filepath, 'r') as file_handle: 777 + if sys.version_info.major <= 2:
  778 + open_arg = dict(mode='rb')
  779 + else:
  780 + open_arg = dict(newline='')
  781 + with open(filepath, **open_arg) as file_handle:
827 results, dialect = process_csv_dialect(file_handle, CSV_DELIMITERS) 782 results, dialect = process_csv_dialect(file_handle, CSV_DELIMITERS)
828 is_small = file_handle.tell() < CSV_SMALL_THRESH 783 is_small = file_handle.tell() < CSV_SMALL_THRESH
829 784
@@ -854,7 +809,6 @@ def process_csv(filepath): @@ -854,7 +809,6 @@ def process_csv(filepath):
854 809
855 def process_csv_dialect(file_handle, delimiters): 810 def process_csv_dialect(file_handle, delimiters):
856 """ helper for process_csv: process with a specific csv dialect """ 811 """ helper for process_csv: process with a specific csv dialect """
857 -  
858 # determine dialect = delimiter chars, quote chars, ... 812 # determine dialect = delimiter chars, quote chars, ...
859 dialect = csv.Sniffer().sniff(file_handle.read(CSV_SMALL_THRESH), 813 dialect = csv.Sniffer().sniff(file_handle.read(CSV_SMALL_THRESH),
860 delimiters=delimiters) 814 delimiters=delimiters)
oletools/olemeta.py
@@ -79,6 +79,7 @@ if not _parent_dir in sys.path: @@ -79,6 +79,7 @@ if not _parent_dir in sys.path:
79 import olefile 79 import olefile
80 from oletools.thirdparty import xglob 80 from oletools.thirdparty import xglob
81 from oletools.thirdparty.tablestream import tablestream 81 from oletools.thirdparty.tablestream import tablestream
  82 +from oletools.common.io_encoding import ensure_stdout_handles_unicode
82 83
83 84
84 #=== MAIN ================================================================= 85 #=== MAIN =================================================================
@@ -88,13 +89,12 @@ def process_ole(ole): @@ -88,13 +89,12 @@ def process_ole(ole):
88 meta = ole.get_metadata() 89 meta = ole.get_metadata()
89 90
90 # console output with UTF8 encoding: 91 # console output with UTF8 encoding:
91 - # It looks like we do not need the UTF8 codec anymore, both for Python 2 and 3  
92 - console_utf8 = sys.stdout #codecs.getwriter('utf8')(sys.stdout) 92 + ensure_stdout_handles_unicode()
93 93
94 # TODO: move similar code to a function 94 # TODO: move similar code to a function
95 95
96 print('Properties from the SummaryInformation stream:') 96 print('Properties from the SummaryInformation stream:')
97 - t = tablestream.TableStream([21, 30], header_row=['Property', 'Value'], outfile=console_utf8) 97 + t = tablestream.TableStream([21, 30], header_row=['Property', 'Value'])
98 for prop in meta.SUMMARY_ATTRIBS: 98 for prop in meta.SUMMARY_ATTRIBS:
99 value = getattr(meta, prop) 99 value = getattr(meta, prop)
100 if value is not None: 100 if value is not None:
@@ -111,7 +111,7 @@ def process_ole(ole): @@ -111,7 +111,7 @@ def process_ole(ole):
111 print('') 111 print('')
112 112
113 print('Properties from the DocumentSummaryInformation stream:') 113 print('Properties from the DocumentSummaryInformation stream:')
114 - t = tablestream.TableStream([21, 30], header_row=['Property', 'Value'], outfile=console_utf8) 114 + t = tablestream.TableStream([21, 30], header_row=['Property', 'Value'])
115 for prop in meta.DOCSUM_ATTRIBS: 115 for prop in meta.DOCSUM_ATTRIBS:
116 value = getattr(meta, prop) 116 value = getattr(meta, prop)
117 if value is not None: 117 if value is not None:
oletools/oleobj.py
@@ -73,6 +73,7 @@ except ImportError: @@ -73,6 +73,7 @@ except ImportError:
73 from oletools.ppt_record_parser import (is_ppt, PptFile, 73 from oletools.ppt_record_parser import (is_ppt, PptFile,
74 PptRecordExOleVbaActiveXAtom) 74 PptRecordExOleVbaActiveXAtom)
75 from oletools.ooxml import XmlParser 75 from oletools.ooxml import XmlParser
  76 +from oletools.common.io_encoding import ensure_stdout_handles_unicode
76 77
77 # ----------------------------------------------------------------------------- 78 # -----------------------------------------------------------------------------
78 # CHANGELOG: 79 # CHANGELOG:
@@ -848,6 +849,7 @@ def main(cmd_line_args=None): @@ -848,6 +849,7 @@ def main(cmd_line_args=None):
848 provide other arguments. 849 provide other arguments.
849 """ 850 """
850 # print banner with version 851 # print banner with version
  852 + ensure_stdout_handles_unicode()
851 print('oleobj %s - http://decalage.info/oletools' % __version__) 853 print('oleobj %s - http://decalage.info/oletools' % __version__)
852 print('THIS IS WORK IN PROGRESS - Check updates regularly!') 854 print('THIS IS WORK IN PROGRESS - Check updates regularly!')
853 print('Please report any issue at ' 855 print('Please report any issue at '
oletools/olevba.py
@@ -318,6 +318,7 @@ from oletools import ppt_parser @@ -318,6 +318,7 @@ from oletools import ppt_parser
318 from oletools import oleform 318 from oletools import oleform
319 from oletools import rtfobj 319 from oletools import rtfobj
320 from oletools import crypto 320 from oletools import crypto
  321 +from oletools.common.io_encoding import ensure_stdout_handles_unicode
321 from oletools.common import codepages 322 from oletools.common import codepages
322 323
323 # monkeypatch email to fix issue #32: 324 # monkeypatch email to fix issue #32:
@@ -4122,7 +4123,6 @@ def main(cmd_line_args=None): @@ -4122,7 +4123,6 @@ def main(cmd_line_args=None):
4122 in process_args. Per default (cmd_line_args=None), sys.argv is used. Option 4123 in process_args. Per default (cmd_line_args=None), sys.argv is used. Option
4123 mainly added for unit-testing 4124 mainly added for unit-testing
4124 """ 4125 """
4125 -  
4126 options, args = parse_args(cmd_line_args) 4126 options, args = parse_args(cmd_line_args)
4127 4127
4128 # provide info about tool and its version 4128 # provide info about tool and its version
oletools/ooxml.py
@@ -13,14 +13,47 @@ TODO: check what is duplicate here with oleid, maybe merge some day? @@ -13,14 +13,47 @@ TODO: check what is duplicate here with oleid, maybe merge some day?
13 TODO: "xml2003" == "flatopc"? 13 TODO: "xml2003" == "flatopc"?
14 14
15 .. codeauthor:: Intra2net AG <info@intra2net> 15 .. codeauthor:: Intra2net AG <info@intra2net>
  16 +License: BSD, see source code or documentation
  17 +
  18 +ooxml is part of the python-oletools package:
  19 +http://www.decalage.info/python/oletools
16 """ 20 """
17 21
  22 +# === LICENSE =================================================================
  23 +
  24 +# msodde is copyright (c) 2017-2019 Philippe Lagadec (http://www.decalage.info)
  25 +# All rights reserved.
  26 +#
  27 +# Redistribution and use in source and binary forms, with or without
  28 +# modification, are permitted provided that the following conditions are met:
  29 +#
  30 +# * Redistributions of source code must retain the above copyright notice,
  31 +# this list of conditions and the following disclaimer.
  32 +# * Redistributions in binary form must reproduce the above copyright notice,
  33 +# this list of conditions and the following disclaimer in the documentation
  34 +# and/or other materials provided with the distribution.
  35 +#
  36 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  37 +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  38 +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  39 +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  40 +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  41 +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  42 +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  43 +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  44 +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  45 +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  46 +# POSSIBILITY OF SUCH DAMAGE.
  47 +
  48 +# -- IMPORTS ------------------------------------------------------------------
  49 +
18 import sys 50 import sys
  51 +from oletools.common.log_helper import log_helper
  52 +from oletools.common.io_encoding import uopen
19 from zipfile import ZipFile, BadZipfile, is_zipfile 53 from zipfile import ZipFile, BadZipfile, is_zipfile
20 from os.path import splitext 54 from os.path import splitext
21 import io 55 import io
22 import re 56 import re
23 -from oletools.common.log_helper import log_helper  
24 57
25 # import lxml or ElementTree for XML parsing: 58 # import lxml or ElementTree for XML parsing:
26 try: 59 try:
@@ -29,6 +62,18 @@ try: @@ -29,6 +62,18 @@ try:
29 except ImportError: 62 except ImportError:
30 import xml.etree.cElementTree as ET 63 import xml.etree.cElementTree as ET
31 64
  65 +# -----------------------------------------------------------------------------
  66 +# CHANGELOG:
  67 +# 2018-12-06 CH: - ensure stdout can handle unicode
  68 +
  69 +__version__ = '0.54.2'
  70 +
  71 +
  72 +###############################################################################
  73 +# CONSTANTS
  74 +###############################################################################
  75 +
  76 +
32 logger = log_helper.get_or_create_silent_logger('ooxml') 77 logger = log_helper.get_or_create_silent_logger('ooxml')
33 78
34 #: subfiles that have to be part of every ooxml file 79 #: subfiles that have to be part of every ooxml file
@@ -127,7 +172,7 @@ def get_type(filename): @@ -127,7 +172,7 @@ def get_type(filename):
127 parser = XmlParser(filename) 172 parser = XmlParser(filename)
128 if parser.is_single_xml(): 173 if parser.is_single_xml():
129 match = None 174 match = None
130 - with open(filename, 'r') as handle: 175 + with uopen(filename, 'r') as handle:
131 match = re.search(OFFICE_XML_PROGID_REGEX, handle.read(1024)) 176 match = re.search(OFFICE_XML_PROGID_REGEX, handle.read(1024))
132 if not match: 177 if not match:
133 return DOCTYPE_NONE 178 return DOCTYPE_NONE
@@ -416,7 +461,7 @@ class XmlParser(object): @@ -416,7 +461,7 @@ class XmlParser(object):
416 461
417 # find prog id in xml prolog 462 # find prog id in xml prolog
418 match = None 463 match = None
419 - with open(self.filename, 'r') as handle: 464 + with uopen(self.filename, 'r') as handle:
420 match = re.search(OFFICE_XML_PROGID_REGEX, handle.read(1024)) 465 match = re.search(OFFICE_XML_PROGID_REGEX, handle.read(1024))
421 if match: 466 if match:
422 self._is_single_xml = True 467 self._is_single_xml = True
@@ -424,11 +469,18 @@ class XmlParser(object): @@ -424,11 +469,18 @@ class XmlParser(object):
424 raise BadOOXML(self.filename, 'is no zip and has no prog_id') 469 raise BadOOXML(self.filename, 'is no zip and has no prog_id')
425 470
426 def iter_files(self, args=None): 471 def iter_files(self, args=None):
427 - """ Find files in zip or just give single xml file """ 472 + """
  473 + Find files in zip or just give single xml file
  474 +
  475 + yields pairs (subfile-name, file-handle) where file-handle is an open
  476 + file-like object. (Do not care too much about encoding here, the xml
  477 + parser reads the encoding from the first lines in the file.)
  478 + """
428 if self.is_single_xml(): 479 if self.is_single_xml():
429 if args: 480 if args:
430 raise BadOOXML(self.filename, 'xml has no subfiles') 481 raise BadOOXML(self.filename, 'xml has no subfiles')
431 - with open(self.filename, 'r') as handle: 482 + # do not use uopen, xml parser determines encoding on its own
  483 + with open(self.filename, 'rb') as handle:
432 yield None, handle # the subfile=None is needed in iter_xml 484 yield None, handle # the subfile=None is needed in iter_xml
433 self.did_iter_all = True 485 self.did_iter_all = True
434 else: 486 else:
@@ -638,9 +690,10 @@ class XmlParser(object): @@ -638,9 +690,10 @@ class XmlParser(object):
638 690
639 691
640 def test(): 692 def test():
641 - """ Main function, called when running file as script 693 + """
  694 + Test xml parsing; called when running this file as a script.
642 695
643 - see module doc for more info 696 + Prints every element found in input file (to be given as command line arg).
644 """ 697 """
645 log_helper.enable_logging(False, 'debug') 698 log_helper.enable_logging(False, 'debug')
646 if len(sys.argv) != 2: 699 if len(sys.argv) != 2:
tests/common/test_encoding_handler.py 0 → 100644
  1 +"""Test common.ensure_stdout_handles_unicode"""
  2 +
  3 +from __future__ import print_function
  4 +
  5 +import unittest
  6 +import sys
  7 +from subprocess import check_call, CalledProcessError
  8 +from tempfile import mkstemp
  9 +import os
  10 +from os.path import isfile
  11 +from contextlib import contextmanager
  12 +
  13 +FILE_TEXT = u'The unicode check mark is \u2713.\n'
  14 +
@contextmanager
def temp_file(just_name=True):
    """
    Context manager that creates temp file and deletes it in the end

    :param bool just_name: if True (default), yield only the name of the temp
                           file; otherwise also open the file for binary
                           writing and yield `(file_handle, file_name)`
    :returns: see `just_name`

    The file handle (if any) is closed and the file is deleted when leaving
    the context, no matter whether an exception occurred inside.
    """
    tmp_descriptor, tmp_name = mkstemp()
    tmp_handle = None
    try:
        # we create our own file handle since we want to be able to close the
        # file and open it again for reading.
        # We keep the os-level descriptor open so file name is still reserved
        # for us
        if just_name:
            yield tmp_name
        else:
            tmp_handle = open(tmp_name, 'wb')
            yield tmp_handle, tmp_name
    finally:
        # nested try-finally: every clean-up step is attempted even if the
        # one before failed, so the temp file does not stay behind
        try:
            if tmp_handle is not None:
                tmp_handle.close()
        finally:
            try:
                os.close(tmp_descriptor)
            finally:
                if isfile(tmp_name):
                    os.unlink(tmp_name)
  42 +
  43 +
class TestEncodingHandler(unittest.TestCase):
    """
    Tests replacing stdout encoding in various scenarios

    Every test runs this very file as a script in a subprocess (see
    `run_print` and `run_read`) and only checks that the subprocess exits
    with code 0.
    """

    def _run_script(self, script_args, env=None, stdout=None):
        """
        Run this file as script in a subprocess, fail test if that fails.

        The command is given to subprocess as a list (no shell involved), so
        paths containing spaces or other special chars need no quoting.

        :param script_args: list of args for the script, e.g. `['print']`
        :param env: environment for subprocess; None to inherit ours
        :param stdout: open file to redirect stdout of subprocess to; None to
                       inherit our stdout
        """
        command = [sys.executable, __file__] + list(script_args)
        try:
            check_call(command, env=env, stdout=stdout)
        except CalledProcessError as cpe:
            # check_call does not capture output, so report what we do know
            self.fail('{!r} failed with exit code {}'
                      .format(cpe.cmd, cpe.returncode))

    @staticmethod
    def _env_lang_c():
        """Return copy of our environment with LANG set to "C"."""
        env = dict(os.environ)
        env['LANG'] = 'C'
        return env

    @staticmethod
    def _fill_and_close(tmp_handle):
        """Write the (utf8-encoded) test text to given handle and close it."""
        tmp_handle.write(FILE_TEXT.encode('utf8'))
        tmp_handle.close()

    def test_print(self):
        """Test regular unicode output not raise error"""
        self._run_script(['print'])

    def test_print_redirect(self):
        """
        Test redirection of unicode output to files does not raise error

        TODO: test this on non-linux OSs
        """
        with temp_file() as tmp_file:
            with open(tmp_file, 'wb') as redirect_handle:
                self._run_script(['print'], stdout=redirect_handle)

    @unittest.skipIf(not sys.platform.startswith('linux'),
                     'Only tested on linux sofar')
    def test_print_no_lang(self):
        """
        Test that unicode output in a C-LANG environment does not raise error

        TODO: Adapt this for other OSs
        """
        self._run_script(['print'], env=self._env_lang_c())

    def test_uopen(self):
        """Test that uopen in a nice environment is ok"""
        with temp_file(False) as (tmp_handle, tmp_file):
            self._fill_and_close(tmp_handle)
            self._run_script(['read', tmp_file])

    def test_uopen_redirect(self):
        """
        Test that uopen is ok if output is redirected to a file

        TODO: test this on non-linux OSs
        """
        with temp_file(False) as (tmp_handle, tmp_file):
            self._fill_and_close(tmp_handle)

            with temp_file() as redirect_file:
                with open(redirect_file, 'wb') as redirect_handle:
                    self._run_script(['read', tmp_file],
                                     stdout=redirect_handle)

    @unittest.skipIf(not sys.platform.startswith('linux'),
                     'Only tested on linux sofar')
    def test_uopen_no_lang(self):
        """
        Test that uopen in a C-LANG environment is ok

        TODO: Adapt this for other OSs
        """
        with temp_file(False) as (tmp_handle, tmp_file):
            self._fill_and_close(tmp_handle)
            self._run_script(['read', tmp_file], env=self._env_lang_c())
  130 +
  131 +
def run_read(filename):
    """
    Script part of the test_uopen* tests: read text through uopen and check it

    Opens `filename` with :py:func:`uopen`, checks a few attributes of the
    returned handle inside and outside of the context and compares the text
    read with `FILE_TEXT`.

    :param str filename: name of file to read
    :returns: 0 if all checks passed
    :raises ValueError: if any of the checks fails
    """
    from oletools.common.io_encoding import uopen

    with uopen(filename, 'rt') as reader:
        # handle must look like a freshly opened regular file
        if reader.closed:
            raise ValueError('handle is closed!')
        if reader.name != filename:
            raise ValueError('Wrong filename {}'.format(reader.name))
        if reader.isatty():
            raise ValueError('Reader is a tty!')
        if reader.tell() != 0:
            raise ValueError('Reader.tell is not 0 at beginning')

        contents = reader.read()

    # leaving the context must have closed the handle but changed nothing else
    if not reader.closed:
        raise ValueError('Reader is not closed outside context')
    if reader.name != filename:
        raise ValueError('Wrong filename {} after context'.format(reader.name))
    if reader.isatty():
        raise ValueError('Reader has become a tty!')

    # python2 gives the encoded byte string, python3 real unicode
    is_py2 = sys.version_info.major <= 2
    expected = FILE_TEXT.encode('utf8') if is_py2 else FILE_TEXT
    if contents != expected:
        raise ValueError('Wrong contents: {!r} != {!r}'
                         .format(contents, expected))
    return 0
  167 +
  168 +
def run_print():
    """
    Script part of the test_print* tests: print and log a unicode check mark

    :returns: 0 (an error shows up as an exception)
    """
    from oletools.common.io_encoding import ensure_stdout_handles_unicode
    from oletools.common.log_helper import log_helper

    # plain print of a non-ascii char
    ensure_stdout_handles_unicode()
    print(u'Check: \u2713')

    # same text through the logging machinery, also to stdout
    test_logger = log_helper.get_or_create_silent_logger(
        'test_encoding_handler')
    log_helper.enable_logging(False, 'debug', stream=sys.stdout)
    test_logger.info(u'Check: \u2713')
    return 0
  181 +
  182 +
# The tests above call this file as a script with a command ("print" or
# "read <file>"); without any argument, the unit tests themselves are run
if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit(unittest.main())

    # hack required to import common from parent dir, not system-wide one
    # (usually unittest seems to do that for us)
    from os.path import abspath, dirname, join
    repo_root = dirname(dirname(dirname(abspath(__file__))))
    sys.path.insert(0, repo_root)

    command = sys.argv[1]
    command_args = sys.argv[2:]

    if command == 'print':
        if command_args:
            print('Expect no arg for "print"', file=sys.stderr)
            sys.exit(2)
        sys.exit(run_print())

    if command == 'read':
        if len(command_args) != 1:
            print('Expect single arg for "read"', file=sys.stderr)
            sys.exit(2)
        sys.exit(run_read(command_args[0]))

    print('Unexpected argument: {}'.format(command), file=sys.stderr)
    sys.exit(2)
tests/oleobj/test_basic.py
@@ -10,6 +10,7 @@ from glob import glob @@ -10,6 +10,7 @@ from glob import glob
10 # Directory with test data, independent of current working directory 10 # Directory with test data, independent of current working directory
11 from tests.test_utils import DATA_BASE_DIR, call_and_capture 11 from tests.test_utils import DATA_BASE_DIR, call_and_capture
12 from oletools import oleobj 12 from oletools import oleobj
  13 +from oletools.common.io_encoding import ensure_stdout_handles_unicode
13 14
14 15
15 #: provide some more info to find errors 16 #: provide some more info to find errors
@@ -61,6 +62,7 @@ def calc_md5(filename): @@ -61,6 +62,7 @@ def calc_md5(filename):
61 62
62 def preread_file(args): 63 def preread_file(args):
63 """helper for TestOleObj.test_non_streamed: preread + call process_file""" 64 """helper for TestOleObj.test_non_streamed: preread + call process_file"""
  65 + ensure_stdout_handles_unicode() # usually, main() call this
64 ignore_arg, output_dir, filename = args 66 ignore_arg, output_dir, filename = args
65 if ignore_arg != '-d': 67 if ignore_arg != '-d':
66 raise ValueError('ignore_arg not as expected!') 68 raise ValueError('ignore_arg not as expected!')