Commit 56e35a601981b18cda5b0c3920b440c6369c513d
1 parent
cb436a2f
common: Handle unicode output in non-unicode environments
When print()ing unicode, Python relies on locale.getpreferredencoding to determine how to represent unicode text. This fails in several cases, e.g. when redirecting output, piping output into other programs, or when the shell environment has no locale defined (e.g. on Linux with LANG=C). In all these cases, print()ing non-ASCII characters raises unicode exceptions. Prevent these errors by encoding output in case of redirection and by replacing unhandled characters in unicode-unfriendly shells. This tries to solve issue #361.
Showing
1 changed file
with
84 additions
and
0 deletions
oletools/common/__init__.py
| 1 | +"""Commonly used small stuff""" | |
| 2 | + | |
| 3 | +from __future__ import print_function | |
| 4 | +import sys | |
| 5 | +import codecs | |
| 6 | +import os | |
| 7 | +from locale import getpreferredencoding | |
| 8 | + | |
| 9 | +PY3 = sys.version_info.major >= 3 | |
| 10 | + | |
| 11 | +#: encoding to use for redirection if no good encoding can be found | |
| 12 | +FALLBACK_ENCODING_REDIRECT = 'utf8' | |
| 13 | +#: print (pure-ascii) debug output to stdout | |
| 14 | +DEBUG = False | |
| 15 | + | |
| 16 | +# the encoding specified in system environment | |
| 17 | +try: | |
| 18 | + PREFERRED_ENCODING = getpreferredencoding(False) | |
| 19 | +except Exception as exc: | |
| 20 | + if DEBUG: | |
| 21 | + print('Exception getting preferred encoding: {}'.format(exc)) | |
| 22 | + PREFERRED_ENCODING = None | |
| 23 | + | |
| 24 | + | |
| 25 | +def ensure_stdout_handles_unicode(): | |
| 26 | + """ | |
| 27 | + Ensure that print()ing unicode does not lead to errors. | |
| 28 | + | |
| 29 | + When print()ing unicode, python relies on the environment (e.g. in linux on | |
| 30 | + the setting of the LANG environment variable) to tell it how to encode | |
| 31 | + unicode. That works nicely for modern-day shells where encoding is usually | |
| 32 | + UTF-8. But as soon as LANG is unset or just "C", or output is redirected or | |
| 33 | + piped, the encoding falls back to 'ASCII', which cannot handle unicode | |
| 34 | + characters. | |
| 35 | + | |
| 36 | + Based on solutions suggested on stackoverflow (c.f. | |
| 37 | + https://stackoverflow.com/q/27347772/4405656 ), wrap stdout in an encoder | |
| 38 | + that solves that problem. | |
| 39 | + | |
| 40 | + Unfortunately, stderr cannot be handled the same way ( see e.g. https:// | |
| 41 | + pythonhosted.org/kitchen/unicode-frustrations.html#frustration-5-exceptions | |
| 42 | + ), so we still have to hope there is only ascii in error messages | |
| 43 | + """ | |
| 44 | + # do not re-wrap | |
| 45 | + if isinstance(sys.stdout, codecs.StreamWriter): | |
| 46 | + if DEBUG: | |
| 47 | + print('sys.stdout wrapped already') | |
| 48 | + return | |
| 49 | + | |
| 50 | + # get output stream object | |
| 51 | + if PY3: | |
| 52 | + output_stream = sys.stdout.buffer | |
| 53 | + else: | |
| 54 | + output_stream = sys.stdout | |
| 55 | + | |
| 56 | + # determine encoding of sys.stdout | |
| 57 | + try: | |
| 58 | + encoding = sys.stdout.encoding | |
| 59 | + except AttributeError: # variable "encoding" might not exist | |
| 60 | + encoding = None | |
| 61 | + if DEBUG: | |
| 62 | + print('sys.stdout encoding is {}'.format(encoding)) | |
| 63 | + | |
| 64 | + if isinstance(encoding, str) and encoding.lower().startswith('utf'): | |
| 65 | + if DEBUG: | |
| 66 | + print('encoding is acceptable') | |
| 67 | + return # everything alright, we are working in a good environment | |
| 68 | + elif os.isatty(output_stream.fileno()): # e.g. C locale | |
| 69 | + # Do not output UTF8 since that might be mis-interpreted. | |
| 70 | + # Just replace chars that cannot be handled | |
| 71 | + if DEBUG: | |
| 72 | + print('sys.stdout is a tty, just replace errors') | |
| 73 | + sys.stdout = codecs.getwriter(encoding)(output_stream, errors='replace') | |
| 74 | + else: # e.g. redirection, pipe in python2 | |
| 75 | + new_encoding = PREFERRED_ENCODING | |
| 76 | + if DEBUG: | |
| 77 | + print('not a tty, try preferred encoding {}'.format(new_encoding)) | |
| 78 | + if not isinstance(new_encoding, str) \ | |
| 79 | + or not new_encoding.lower().startswith('utf'): | |
| 80 | + new_encoding = FALLBACK_ENCODING_REDIRECT | |
| 81 | + if DEBUG: | |
| 82 | + print('preferred encoding also unacceptable, fall back to {}' | |
| 83 | + .format(new_encoding)) | |
| 84 | + sys.stdout = codecs.getwriter(new_encoding)(output_stream) | ... | ... |