Commit 2f7a1ef1b347a1124d01ef6559d939be5ccf50cd

Authored by Philippe Lagadec
Committed by GitHub
2 parents afeead80 0b3af2da

Merge pull request #365 from christian-intra2net/encoding-for-non-unicode-environments

Encoding for non unicode environments
oletools/common/io_encoding.py 0 → 100644
  1 +#!/usr/bin/env python3
  2 +
  3 +"""
  4 +Tool to help with input/output encoding
  5 +
  6 +Helpers to run smoothly in unicode-unfriendly environments like output redirect
  7 +or unusual language settings.
  8 +
  9 +In such settings, output to console falls back to ASCII-only. Also open()
  10 +suddenly fails to interpret non-ASCII characters.
  11 +
  12 +Therefore, scripts can run :py:meth:`ensure_stdout_handles_unicode` at startup
  13 +and use :py:meth:`uopen` in place of :py:meth:`open` when opening text files.
  14 +
  15 +Part of the python-oletools package:
  16 +http://www.decalage.info/python/oletools
  17 +"""
  18 +
  19 +# === LICENSE =================================================================
  20 +
  21 +# msodde is copyright (c) 2017-2018 Philippe Lagadec (http://www.decalage.info)
  22 +# All rights reserved.
  23 +#
  24 +# Redistribution and use in source and binary forms, with or without
  25 +# modification, are permitted provided that the following conditions are met:
  26 +#
  27 +# * Redistributions of source code must retain the above copyright notice,
  28 +# this list of conditions and the following disclaimer.
  29 +# * Redistributions in binary form must reproduce the above copyright notice,
  30 +# this list of conditions and the following disclaimer in the documentation
  31 +# and/or other materials provided with the distribution.
  32 +#
  33 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  34 +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  35 +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  36 +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  37 +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  38 +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  39 +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  40 +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  41 +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  42 +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  43 +# POSSIBILITY OF SUCH DAMAGE.
  44 +
  45 +# -----------------------------------------------------------------------------
  46 +# CHANGELOG:
  47 +# 2018-11-04 v0.54 CH: - first version: ensure_stdout_handles_unicode, uopen
  48 +
  49 +# -- IMPORTS ------------------------------------------------------------------
  50 +from __future__ import print_function
  51 +import sys
  52 +import codecs
  53 +import os
  54 +from locale import getpreferredencoding
  55 +
  56 +PY3 = sys.version_info.major >= 3
  57 +
  58 +if PY3:
  59 + from builtins import open as builtin_open
  60 +else:
  61 + from __builtin__ import open as builtin_open
  62 +
  63 +# -- CONSTANTS ----------------------------------------------------------------
  64 +#: encoding to use for redirection if no good encoding can be found
  65 +FALLBACK_ENCODING_REDIRECT = 'utf8'
  66 +
  67 +#: encoding for reading text from files if preferred encoding is non-unicode
  68 +FALLBACK_ENCODING_OPEN = 'utf8'
  69 +
  70 +#: print (pure-ascii) debug output to stdout
  71 +DEBUG = False
  72 +
  73 +# the encoding specified in system environment
  74 +try:
  75 + PREFERRED_ENCODING = getpreferredencoding()
  76 +except Exception as exc:
  77 + if DEBUG:
  78 + print('Exception getting preferred encoding: {}'.format(exc))
  79 + PREFERRED_ENCODING = None
  80 +
  81 +
  82 +# -- HELPERS ------------------------------------------------------------------
  83 +
  84 +
def ensure_stdout_handles_unicode():
    """
    Ensure that print()ing unicode does not lead to errors.

    When print()ing unicode, python relies on the environment (e.g. in linux on
    the setting of the LANG environment variable) to tell it how to encode
    unicode. That works nicely for modern-day shells where encoding is usually
    UTF-8. But as soon as LANG is unset or just "C", or output is redirected or
    piped, the encoding falls back to 'ASCII', which cannot handle unicode
    characters.

    Based on solutions suggested on stackoverflow (c.f.
    https://stackoverflow.com/q/27347772/4405656 ), wrap stdout in an encoder
    that solves that problem.

    Unfortunately, stderr cannot be handled the same way ( see e.g. https://
    pythonhosted.org/kitchen/unicode-frustrations.html#frustration-5-exceptions
    ), so we still have to hope there is only ascii in error messages

    Behaviour in short:

    * stdout already wrapped, or its encoding starts with "utf": do nothing
    * stdout is a terminal with a limited encoding: keep that encoding but
      replace characters it cannot represent
    * stdout is redirected/piped: encode with the preferred encoding if that
      is a unicode one, otherwise with :py:data:`FALLBACK_ENCODING_REDIRECT`

    If `sys.stdout` was replaced by an object that offers no byte stream to
    write to (python3 only, e.g. an `io.StringIO`), it is left untouched.

    :returns: None; may replace `sys.stdout` as a side effect
    """
    # do not re-wrap
    if isinstance(sys.stdout, codecs.StreamWriter):
        if DEBUG:
            print('sys.stdout wrapped already')
        return

    # determine encoding of sys.stdout; attribute "encoding" might not exist
    encoding = getattr(sys.stdout, 'encoding', None)
    if DEBUG:
        print('sys.stdout encoding is {}'.format(encoding))

    if isinstance(encoding, str) and encoding.lower().startswith('utf'):
        if DEBUG:
            print('encoding is acceptable')
        return    # everything alright, we are working in a good environment

    # get the stream the encoder will write bytes to. Only look for it now
    # that we know we have to wrap: replaced stdouts often have no "buffer"
    if PY3:
        output_stream = getattr(sys.stdout, 'buffer', None)
        if output_stream is None:
            if DEBUG:
                print('sys.stdout has no byte buffer, cannot wrap it')
            return
    else:
        output_stream = sys.stdout

    # pseudo streams may lack fileno() or refuse to provide a descriptor
    # (io.UnsupportedOperation is both a ValueError and an IOError/OSError)
    try:
        is_tty = os.isatty(output_stream.fileno())
    except (AttributeError, ValueError, EnvironmentError):
        is_tty = False

    if is_tty:    # e.g. C locale
        # Do not output UTF8 since that might be mis-interpreted.
        # Just replace chars that cannot be handled
        print('Encoding for stdout is only {}, will replace other chars to '
              'avoid unicode error'.format(encoding), file=sys.stderr)
        try:
            writer_class = codecs.getwriter(encoding)
        except (TypeError, LookupError):
            # encoding is None or unknown to python: ascii is the safe choice
            writer_class = codecs.getwriter('ascii')
        sys.stdout = writer_class(output_stream, errors='replace')
    else:    # e.g. redirection, pipe in python2
        new_encoding = PREFERRED_ENCODING
        if DEBUG:
            print('not a tty, try preferred encoding {}'.format(new_encoding))
        if not isinstance(new_encoding, str) \
                or not new_encoding.lower().startswith('utf'):
            new_encoding = FALLBACK_ENCODING_REDIRECT
            if DEBUG:
                print('preferred encoding also unacceptable, fall back to {}'
                      .format(new_encoding))
        print('Encoding for stdout is only {}, will auto-encode text with {} '
              'before output'.format(encoding, new_encoding), file=sys.stderr)
        sys.stdout = codecs.getwriter(new_encoding)(output_stream)
  147 +
  148 +
def uopen(filename, mode='r', *args, **kwargs):
    """
    Replacement for builtin open() that reads unicode even in ASCII environment

    In order to read unicode from text, python uses locale.getpreferredencoding
    to translate bytes to str. If the environment only provides ASCII encoding,
    this will fail since most office files contain unicode.

    Therefore, guess a good encoding here if necessary and open file with that.

    The call is handed to the builtin :py:func:`open` unchanged if the file is
    opened in binary mode, if the caller specified an encoding (as keyword or
    as positional argument), or if the preferred encoding is a unicode one.

    :param filename: name of file to open, as for builtin :py:func:`open`
    :param str mode: file mode, as for builtin :py:func:`open`
    :param args: further positional args for :py:func:`open`, i.e.
                 `(buffering, encoding, ...)`
    :param kwargs: further keyword args for :py:func:`open`
    :returns: same type as the builtin :py:func:`open`
    """
    # do not interfere if not necessary:
    if 'b' in mode:
        if DEBUG:
            print('Opening binary file, do not interfere')
        return builtin_open(filename, mode, *args, **kwargs)
    if 'encoding' in kwargs:
        if DEBUG:
            print('Opening file with encoding {!r}, do not interfere'
                  .format(kwargs['encoding']))
        return builtin_open(filename, mode, *args, **kwargs)
    # "encoding" is the 4th arg of open(), but filename and mode are named
    # parameters here, so args starts with buffering and encoding is args[1].
    # Must not add an encoding keyword in that case: that would be a TypeError
    if len(args) > 1:
        if DEBUG:
            print('Opening file with encoding {!r}, do not interfere'
                  .format(args[1]))
        return builtin_open(filename, mode, *args, **kwargs)

    # determine preferred encoding
    encoding = PREFERRED_ENCODING
    if DEBUG:
        print('preferred encoding is {}'.format(encoding))

    if isinstance(encoding, str) and encoding.lower().startswith('utf'):
        if DEBUG:
            print('encoding is acceptable, open {} regularly'.format(filename))
        return builtin_open(filename, mode, *args, **kwargs)

    # so we want to read text from a file but can probably only deal with ASCII
    # --> use fallback
    if DEBUG:
        print('Opening {} with fallback encoding {}'
              .format(filename, FALLBACK_ENCODING_OPEN))
    if PY3:
        return builtin_open(filename, mode, *args,
                            encoding=FALLBACK_ENCODING_OPEN, **kwargs)

    # python2: builtin open knows no encodings, so wrap the handle
    handle = builtin_open(filename, mode, *args, **kwargs)
    return codecs.EncodedFile(handle, FALLBACK_ENCODING_OPEN)
... ...
oletools/common/log_helper/log_helper.py
... ... @@ -44,6 +44,7 @@ General logging helpers
44 44 from ._json_formatter import JsonFormatter
45 45 from ._logger_adapter import OletoolsLoggerAdapter
46 46 from . import _root_logger_wrapper
  47 +from ..io_encoding import ensure_stdout_handles_unicode
47 48 import logging
48 49 import sys
49 50  
... ... @@ -92,6 +93,9 @@ class LogHelper:
92 93 if self._is_enabled:
93 94 raise ValueError('re-enabling logging. Not sure whether that is ok...')
94 95  
  96 + if stream in (None, sys.stdout):
  97 + ensure_stdout_handles_unicode()
  98 +
95 99 log_level = LOG_LEVELS[level]
96 100 logging.basicConfig(level=log_level, format=log_format, stream=stream)
97 101 self._is_enabled = True
... ...
oletools/msodde.py
... ... @@ -74,6 +74,7 @@ from oletools import xls_parser
74 74 from oletools import rtfobj
75 75 from oletools.ppt_record_parser import is_ppt
76 76 from oletools import crypto
  77 +from oletools.common.io_encoding import ensure_stdout_handles_unicode
77 78 from oletools.common.log_helper import log_helper
78 79  
79 80 # -----------------------------------------------------------------------------
... ... @@ -236,57 +237,6 @@ DEFAULT_LOG_LEVEL = "warning" # Default log level
236 237 logger = log_helper.get_or_create_silent_logger('msodde')
237 238  
238 239  
239   -# === UNICODE IN PY2 =========================================================
240   -
241   -def ensure_stdout_handles_unicode():
242   - """ Ensure stdout can handle unicode by wrapping it if necessary
243   -
244   - Required e.g. if output of this script is piped or redirected in a linux
245   - shell, since then sys.stdout.encoding is ascii and cannot handle
246   - print(unicode). In that case we need to find some compatible encoding and
247   - wrap sys.stdout into a encoder following (many thanks!)
248   - https://stackoverflow.com/a/1819009 or https://stackoverflow.com/a/20447935
249   -
250   - Can be undone by setting sys.stdout = sys.__stdout__
251   - """
252   - import codecs
253   - import locale
254   -
255   - # do not re-wrap
256   - if isinstance(sys.stdout, codecs.StreamWriter):
257   - return
258   -
259   - # try to find encoding for sys.stdout
260   - encoding = None
261   - try:
262   - encoding = sys.stdout.encoding
263   - except AttributeError: # variable "encoding" might not exist
264   - pass
265   -
266   - if encoding not in (None, '', 'ascii'):
267   - return # no need to wrap
268   -
269   - # try to find an encoding that can handle unicode
270   - try:
271   - encoding = locale.getpreferredencoding()
272   - except Exception:
273   - pass
274   -
275   - # fallback if still no encoding available
276   - if encoding in (None, '', 'ascii'):
277   - encoding = 'utf8'
278   -
279   - # logging is probably not initialized yet, but just in case
280   - logger.debug('wrapping sys.stdout with encoder using {0}'.format(encoding))
281   -
282   - wrapper = codecs.getwriter(encoding)
283   - sys.stdout = wrapper(sys.stdout)
284   -
285   -
286   -if sys.version_info.major < 3:
287   - ensure_stdout_handles_unicode() # e.g. for print(text) in main()
288   -
289   -
290 240 # === ARGUMENT PARSING =======================================================
291 241  
292 242 class ArgParserWithBanner(argparse.ArgumentParser):
... ... @@ -820,10 +770,15 @@ def process_csv(filepath):
820 770 chars the same way that excel does. Tested to some extent in unittests.
821 771  
822 772 This can only find DDE-links, no other "suspicious" constructs (yet).
823   - """
824 773  
  774 + Cannot deal with unicode files yet (needs more than just using uopen()).
  775 + """
825 776 results = []
826   - with open(filepath, 'r') as file_handle:
  777 + if sys.version_info.major <= 2:
  778 + open_arg = dict(mode='rb')
  779 + else:
  780 + open_arg = dict(newline='')
  781 + with open(filepath, **open_arg) as file_handle:
827 782 results, dialect = process_csv_dialect(file_handle, CSV_DELIMITERS)
828 783 is_small = file_handle.tell() < CSV_SMALL_THRESH
829 784  
... ... @@ -854,7 +809,6 @@ def process_csv(filepath):
854 809  
855 810 def process_csv_dialect(file_handle, delimiters):
856 811 """ helper for process_csv: process with a specific csv dialect """
857   -
858 812 # determine dialect = delimiter chars, quote chars, ...
859 813 dialect = csv.Sniffer().sniff(file_handle.read(CSV_SMALL_THRESH),
860 814 delimiters=delimiters)
... ...
oletools/olemeta.py
... ... @@ -79,6 +79,7 @@ if not _parent_dir in sys.path:
79 79 import olefile
80 80 from oletools.thirdparty import xglob
81 81 from oletools.thirdparty.tablestream import tablestream
  82 +from oletools.common.io_encoding import ensure_stdout_handles_unicode
82 83  
83 84  
84 85 #=== MAIN =================================================================
... ... @@ -88,13 +89,12 @@ def process_ole(ole):
88 89 meta = ole.get_metadata()
89 90  
90 91 # console output with UTF8 encoding:
91   - # It looks like we do not need the UTF8 codec anymore, both for Python 2 and 3
92   - console_utf8 = sys.stdout #codecs.getwriter('utf8')(sys.stdout)
  92 + ensure_stdout_handles_unicode()
93 93  
94 94 # TODO: move similar code to a function
95 95  
96 96 print('Properties from the SummaryInformation stream:')
97   - t = tablestream.TableStream([21, 30], header_row=['Property', 'Value'], outfile=console_utf8)
  97 + t = tablestream.TableStream([21, 30], header_row=['Property', 'Value'])
98 98 for prop in meta.SUMMARY_ATTRIBS:
99 99 value = getattr(meta, prop)
100 100 if value is not None:
... ... @@ -111,7 +111,7 @@ def process_ole(ole):
111 111 print('')
112 112  
113 113 print('Properties from the DocumentSummaryInformation stream:')
114   - t = tablestream.TableStream([21, 30], header_row=['Property', 'Value'], outfile=console_utf8)
  114 + t = tablestream.TableStream([21, 30], header_row=['Property', 'Value'])
115 115 for prop in meta.DOCSUM_ATTRIBS:
116 116 value = getattr(meta, prop)
117 117 if value is not None:
... ...
oletools/oleobj.py
... ... @@ -73,6 +73,7 @@ except ImportError:
73 73 from oletools.ppt_record_parser import (is_ppt, PptFile,
74 74 PptRecordExOleVbaActiveXAtom)
75 75 from oletools.ooxml import XmlParser
  76 +from oletools.common.io_encoding import ensure_stdout_handles_unicode
76 77  
77 78 # -----------------------------------------------------------------------------
78 79 # CHANGELOG:
... ... @@ -848,6 +849,7 @@ def main(cmd_line_args=None):
848 849 provide other arguments.
849 850 """
850 851 # print banner with version
  852 + ensure_stdout_handles_unicode()
851 853 print('oleobj %s - http://decalage.info/oletools' % __version__)
852 854 print('THIS IS WORK IN PROGRESS - Check updates regularly!')
853 855 print('Please report any issue at '
... ...
oletools/olevba.py
... ... @@ -318,6 +318,7 @@ from oletools import ppt_parser
318 318 from oletools import oleform
319 319 from oletools import rtfobj
320 320 from oletools import crypto
  321 +from oletools.common.io_encoding import ensure_stdout_handles_unicode
321 322 from oletools.common import codepages
322 323  
323 324 # monkeypatch email to fix issue #32:
... ... @@ -4122,7 +4123,6 @@ def main(cmd_line_args=None):
4122 4123 in process_args. Per default (cmd_line_args=None), sys.argv is used. Option
4123 4124 mainly added for unit-testing
4124 4125 """
4125   -
4126 4126 options, args = parse_args(cmd_line_args)
4127 4127  
4128 4128 # provide info about tool and its version
... ...
oletools/ooxml.py
... ... @@ -13,14 +13,47 @@ TODO: check what is duplicate here with oleid, maybe merge some day?
13 13 TODO: "xml2003" == "flatopc"?
14 14  
15 15 .. codeauthor:: Intra2net AG <info@intra2net>
  16 +License: BSD, see source code or documentation
  17 +
  18 +ooxml is part of the python-oletools package:
  19 +http://www.decalage.info/python/oletools
16 20 """
17 21  
  22 +# === LICENSE =================================================================
  23 +
  24 +# msodde is copyright (c) 2017-2019 Philippe Lagadec (http://www.decalage.info)
  25 +# All rights reserved.
  26 +#
  27 +# Redistribution and use in source and binary forms, with or without
  28 +# modification, are permitted provided that the following conditions are met:
  29 +#
  30 +# * Redistributions of source code must retain the above copyright notice,
  31 +# this list of conditions and the following disclaimer.
  32 +# * Redistributions in binary form must reproduce the above copyright notice,
  33 +# this list of conditions and the following disclaimer in the documentation
  34 +# and/or other materials provided with the distribution.
  35 +#
  36 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  37 +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  38 +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  39 +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  40 +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  41 +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  42 +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  43 +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  44 +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  45 +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  46 +# POSSIBILITY OF SUCH DAMAGE.
  47 +
  48 +# -- IMPORTS ------------------------------------------------------------------
  49 +
18 50 import sys
  51 +from oletools.common.log_helper import log_helper
  52 +from oletools.common.io_encoding import uopen
19 53 from zipfile import ZipFile, BadZipfile, is_zipfile
20 54 from os.path import splitext
21 55 import io
22 56 import re
23   -from oletools.common.log_helper import log_helper
24 57  
25 58 # import lxml or ElementTree for XML parsing:
26 59 try:
... ... @@ -29,6 +62,18 @@ try:
29 62 except ImportError:
30 63 import xml.etree.cElementTree as ET
31 64  
  65 +# -----------------------------------------------------------------------------
  66 +# CHANGELOG:
  67 +# 2018-12-06 CH: - ensure stdout can handle unicode
  68 +
  69 +__version__ = '0.54.2'
  70 +
  71 +
  72 +###############################################################################
  73 +# CONSTANTS
  74 +###############################################################################
  75 +
  76 +
32 77 logger = log_helper.get_or_create_silent_logger('ooxml')
33 78  
34 79 #: subfiles that have to be part of every ooxml file
... ... @@ -127,7 +172,7 @@ def get_type(filename):
127 172 parser = XmlParser(filename)
128 173 if parser.is_single_xml():
129 174 match = None
130   - with open(filename, 'r') as handle:
  175 + with uopen(filename, 'r') as handle:
131 176 match = re.search(OFFICE_XML_PROGID_REGEX, handle.read(1024))
132 177 if not match:
133 178 return DOCTYPE_NONE
... ... @@ -416,7 +461,7 @@ class XmlParser(object):
416 461  
417 462 # find prog id in xml prolog
418 463 match = None
419   - with open(self.filename, 'r') as handle:
  464 + with uopen(self.filename, 'r') as handle:
420 465 match = re.search(OFFICE_XML_PROGID_REGEX, handle.read(1024))
421 466 if match:
422 467 self._is_single_xml = True
... ... @@ -424,11 +469,18 @@ class XmlParser(object):
424 469 raise BadOOXML(self.filename, 'is no zip and has no prog_id')
425 470  
426 471 def iter_files(self, args=None):
427   - """ Find files in zip or just give single xml file """
  472 + """
  473 + Find files in zip or just give single xml file
  474 +
  475 + yields pairs (subfile-name, file-handle) where file-handle is an open
  476 + file-like object. (Do not care too much about encoding here, the xml
  477 + parser reads the encoding from the first lines in the file.)
  478 + """
428 479 if self.is_single_xml():
429 480 if args:
430 481 raise BadOOXML(self.filename, 'xml has no subfiles')
431   - with open(self.filename, 'r') as handle:
  482 + # do not use uopen, xml parser determines encoding on its own
  483 + with open(self.filename, 'rb') as handle:
432 484 yield None, handle # the subfile=None is needed in iter_xml
433 485 self.did_iter_all = True
434 486 else:
... ... @@ -638,9 +690,10 @@ class XmlParser(object):
638 690  
639 691  
640 692 def test():
641   - """ Main function, called when running file as script
  693 + """
  694 + Test xml parsing; called when running this file as a script.
642 695  
643   - see module doc for more info
  696 + Prints every element found in input file (to be given as command line arg).
644 697 """
645 698 log_helper.enable_logging(False, 'debug')
646 699 if len(sys.argv) != 2:
... ...
tests/common/test_encoding_handler.py 0 → 100644
  1 +"""Test common.ensure_stdout_handles_unicode"""
  2 +
  3 +from __future__ import print_function
  4 +
  5 +import unittest
  6 +import sys
  7 +from subprocess import check_call, CalledProcessError
  8 +from tempfile import mkstemp
  9 +import os
  10 +from os.path import isfile
  11 +from contextlib import contextmanager
  12 +
  13 +FILE_TEXT = u'The unicode check mark is \u2713.\n'
  14 +
@contextmanager
def temp_file(just_name=True):
    """
    Context manager that creates temp file and deletes it in the end

    :param bool just_name: if True (default), yield only the name of the temp
                           file; otherwise also open the file for binary
                           writing and yield `(handle, name)`
    :returns: file name or pair (file handle, file name), see `just_name`

    The file is removed when the context is left, also in case of an error.
    """
    tmp_descriptor, tmp_name = mkstemp()
    tmp_handle = None
    try:
        # we create our own file handle since we want to be able to close the
        # file and open it again for reading.
        # We keep the os-level descriptor open so file name is still reserved
        # for us
        if just_name:
            yield tmp_name
        else:
            tmp_handle = open(tmp_name, 'wb')
            yield tmp_handle, tmp_name
    finally:
        # nested so that a failure to close the handle does not leak the
        # descriptor or leave the file behind
        try:
            if tmp_handle is not None:
                tmp_handle.close()    # no-op if user closed it already
        finally:
            os.close(tmp_descriptor)
            if isfile(tmp_name):
                os.unlink(tmp_name)
  42 +
  43 +
class TestEncodingHandler(unittest.TestCase):
    """
    Tests replacing stdout encoding in various scenarios

    Every test runs this very file as a script in a sub-process (see functions
    `run_print` and `run_read`), optionally with redirected stdout or with
    LANG=C in the environment, and fails if the sub-process does.
    """

    def _run_script(self, script_args, c_locale=False, redirect_to=None):
        """
        Run this file as script in a sub-process, fail test if that fails

        The command is given to the sub-process as a list, without a shell, so
        paths containing spaces or special characters are no problem.

        :param script_args: list of command line args for the script
        :param bool c_locale: run with LANG=C in the environment
        :param str redirect_to: name of file to redirect stdout to, or None
        """
        command = [sys.executable, __file__] + list(script_args)
        env = dict(os.environ, LANG='C') if c_locale else None
        try:
            if redirect_to is None:
                check_call(command, env=env)
            else:
                with open(redirect_to, 'wb') as redirect_handle:
                    check_call(command, env=env, stdout=redirect_handle)
        except CalledProcessError as cpe:
            # check_call does not capture output, so cpe.output is None;
            # str(cpe) at least names the command and its exit status
            self.fail(str(cpe))

    def test_print(self):
        """Test that regular unicode output does not raise an error"""
        self._run_script(['print'])

    def test_print_redirect(self):
        """
        Test redirection of unicode output to files does not raise error

        TODO: test this on non-linux OSs
        """
        with temp_file() as tmp_file:
            self._run_script(['print'], redirect_to=tmp_file)

    @unittest.skipIf(not sys.platform.startswith('linux'),
                     'Only tested on linux sofar')
    def test_print_no_lang(self):
        """
        Test that unicode output in a C-LANG environment does not raise error

        TODO: Adapt this for other OSs
        """
        self._run_script(['print'], c_locale=True)

    def test_uopen(self):
        """Test that uopen in a nice environment is ok"""
        with temp_file(False) as (tmp_handle, tmp_file):
            tmp_handle.write(FILE_TEXT.encode('utf8'))
            tmp_handle.close()

            self._run_script(['read', tmp_file])

    def test_uopen_redirect(self):
        """
        Test that uopen is ok when stdout is redirected to a file

        TODO: test this on non-linux OSs
        """
        with temp_file(False) as (tmp_handle, tmp_file):
            tmp_handle.write(FILE_TEXT.encode('utf8'))
            tmp_handle.close()

            with temp_file() as redirect_file:
                self._run_script(['read', tmp_file],
                                 redirect_to=redirect_file)

    @unittest.skipIf(not sys.platform.startswith('linux'),
                     'Only tested on linux sofar')
    def test_uopen_no_lang(self):
        """
        Test that uopen in a C-LANG environment is ok

        TODO: Adapt this for other OSs
        """
        with temp_file(False) as (tmp_handle, tmp_file):
            tmp_handle.write(FILE_TEXT.encode('utf8'))
            tmp_handle.close()

            self._run_script(['read', tmp_file], c_locale=True)
  130 +
  131 +
def run_read(filename):
    """
    This is called from test_uopen* tests as script. Reads text, compares

    Opens given file using `uopen`, performs a few sanity checks on the
    returned handle and compares the text read to :py:data:`FILE_TEXT`.

    :param str filename: name of file that contains utf8-encoded FILE_TEXT
    :returns: 0 (to be used as script return code)
    :raises: ValueError if any of the checks fails
    """
    from oletools.common.io_encoding import uopen
    # open file
    with uopen(filename, 'rt') as reader:
        # a few tests
        if reader.closed:
            raise ValueError('handle is closed!')
        if reader.name != filename:
            raise ValueError('Wrong filename {}'.format(reader.name))
        if reader.isatty():
            raise ValueError('Reader is a tty!')
        if reader.tell() != 0:
            raise ValueError('Reader.tell is not 0 at beginning')

        # read text
        text = reader.read()

    # a few more tests
    if not reader.closed:
        raise ValueError('Reader is not closed outside context')
    if reader.name != filename:
        raise ValueError('Wrong filename {} after context'.format(reader.name))
    # do not call reader.isatty() here: the reader is closed by now, so that
    # call itself raises "ValueError: I/O operation on closed file"

    # compare text
    if sys.version_info.major <= 2:      # in python2 get encoded byte string
        expect = FILE_TEXT.encode('utf8')
    else:                                # python3: should get real unicode
        expect = FILE_TEXT
    if text != expect:
        raise ValueError('Wrong contents: {!r} != {!r}'
                         .format(text, expect))
    return 0
  167 +
  168 +
def run_print():
    """
    This is called from test_print* tests as script. Prints & logs unicode

    :returns: 0 (to be used as script return code)
    """
    from oletools.common.log_helper import log_helper
    from oletools.common import io_encoding

    check_mark_text = u'Check: \u2713'

    # plain printing of a unicode check mark
    io_encoding.ensure_stdout_handles_unicode()
    print(check_mark_text)

    # the same text through the logging machinery, also sent to stdout
    test_logger = \
        log_helper.get_or_create_silent_logger('test_encoding_handler')
    log_helper.enable_logging(False, 'debug', stream=sys.stdout)
    test_logger.info(check_mark_text)
    return 0
  181 +
  182 +
  183 +# tests call this file as script
if __name__ == '__main__':
    # without arguments: behave like a regular unittest module
    if len(sys.argv) < 2:
        sys.exit(unittest.main())

    # hack required to import common from parent dir, not system-wide one
    # (usually unittest seems to do that for us)
    from os.path import abspath, dirname
    this_dir = dirname(abspath(__file__))
    sys.path.insert(0, dirname(dirname(this_dir)))

    # with arguments: we were started by one of the tests as a sub-process
    sub_command = sys.argv[1]
    n_extra_args = len(sys.argv) - 2
    if sub_command == 'print':
        if n_extra_args > 0:
            print('Expect no arg for "print"', file=sys.stderr)
            sys.exit(2)
        sys.exit(run_print())
    if sub_command == 'read':
        if n_extra_args != 1:
            print('Expect single arg for "read"', file=sys.stderr)
            sys.exit(2)
        sys.exit(run_read(sys.argv[2]))

    print('Unexpected argument: {}'.format(sub_command), file=sys.stderr)
    sys.exit(2)
... ...
tests/oleobj/test_basic.py
... ... @@ -10,6 +10,7 @@ from glob import glob
10 10 # Directory with test data, independent of current working directory
11 11 from tests.test_utils import DATA_BASE_DIR, call_and_capture
12 12 from oletools import oleobj
  13 +from oletools.common.io_encoding import ensure_stdout_handles_unicode
13 14  
14 15  
15 16 #: provide some more info to find errors
... ... @@ -61,6 +62,7 @@ def calc_md5(filename):
61 62  
62 63 def preread_file(args):
63 64 """helper for TestOleObj.test_non_streamed: preread + call process_file"""
  65 + ensure_stdout_handles_unicode() # usually, main() calls this
64 66 ignore_arg, output_dir, filename = args
65 67 if ignore_arg != '-d':
66 68 raise ValueError('ignore_arg not as expected!')
... ...