Commit 5bccb6aa02d34e64584ef1d80823dc68c9b1fad6
1 parent
0798cd1b
common: handle open() of text files without unicode-environment
Opening text files with open() also depends on locale.getpreferredencoding(), which returns "ascii" (or a similar non-unicode encoding) if, e.g., LANG=C is set or if output is redirected in Python 2. Provide a function uopen() that ensures text files are always opened in such a way that unicode text can be read properly.
Showing
1 changed file
with
61 additions
and
0 deletions
oletools/common/__init__.py
| @@ -8,8 +8,18 @@ from locale import getpreferredencoding | @@ -8,8 +8,18 @@ from locale import getpreferredencoding | ||
| 8 | 8 | ||
| 9 | PY3 = sys.version_info.major >= 3 | 9 | PY3 = sys.version_info.major >= 3 |
| 10 | 10 | ||
| 11 | +if PY3: | ||
| 12 | + from builtins import open as builtin_open | ||
| 13 | +else: | ||
| 14 | + from __builtin__ import open as builtin_open | ||
| 15 | + | ||
| 16 | + | ||
| 11 | #: encoding to use for redirection if no good encoding can be found | 17 | #: encoding to use for redirection if no good encoding can be found |
| 12 | FALLBACK_ENCODING_REDIRECT = 'utf8' | 18 | FALLBACK_ENCODING_REDIRECT = 'utf8' |
| 19 | + | ||
| 20 | +#: encoding to use for reading text from files if preferred encoding is non-unicode | ||
| 21 | +FALLBACK_ENCODING_OPEN = 'utf8' | ||
| 22 | + | ||
| 13 | #: print (pure-ascii) debug output to stdout | 23 | #: print (pure-ascii) debug output to stdout |
| 14 | DEBUG = False | 24 | DEBUG = False |
| 15 | 25 | ||
| @@ -82,3 +92,54 @@ def ensure_stdout_handles_unicode(): | @@ -82,3 +92,54 @@ def ensure_stdout_handles_unicode(): | ||
| 82 | print('preferred encoding also unacceptable, fall back to {}' | 92 | print('preferred encoding also unacceptable, fall back to {}' |
| 83 | .format(new_encoding)) | 93 | .format(new_encoding)) |
| 84 | sys.stdout = codecs.getwriter(new_encoding)(output_stream) | 94 | sys.stdout = codecs.getwriter(new_encoding)(output_stream) |
| 95 | + | ||
| 96 | + | ||
def uopen(filename, mode='r', *args, **kwargs):
    """
    Replacement for builtin open() that reads unicode even in ASCII environment

    In order to read unicode from text, python uses locale.getpreferredencoding
    to translate bytes to str. If the environment only provides ASCII encoding,
    this will fail since most office files contain unicode.

    Therefore, guess a good encoding here if necessary and open file with that.

    The call is forwarded unchanged to the builtin :py:func:`open` if the file
    is opened in binary mode, if the caller specified an encoding (as keyword
    or as positional argument) or if the preferred encoding already is a
    "utf" encoding. Otherwise the file is opened with
    `FALLBACK_ENCODING_OPEN`.

    :param filename: file to open, forwarded as-is to the builtin open
    :param str mode: file mode, forwarded as-is; defaults to 'r' like the
                     builtin open
    :param args: further positional arguments for the builtin open, i.e.
                 (buffering, encoding, ...) in python3
    :param kwargs: further keyword arguments for the builtin open
    :returns: result of the builtin :py:func:`open`; only in the python2
              fallback case that file object is wrapped in a
              :py:func:`codecs.EncodedFile`
    """
    # do not interfere if not necessary:
    if 'b' in mode:
        if DEBUG:
            print('Opening binary file, do not interfere')
        return builtin_open(filename, mode, *args, **kwargs)
    if 'encoding' in kwargs:
        if DEBUG:
            print('Opening file with encoding {!r}, do not interfere'
                  .format(kwargs['encoding']))
        return builtin_open(filename, mode, *args, **kwargs)
    # filename and mode are named parameters, so args starts with "buffering"
    # and "encoding" (4th parameter of the builtin open) is the 2nd entry.
    # Adding an encoding keyword below would clash with a positional one.
    if len(args) > 1:
        if DEBUG:
            print('Opening file with encoding {!r}, do not interfere'
                  .format(args[1]))
        return builtin_open(filename, mode, *args, **kwargs)

    # determine preferred encoding
    encoding = PREFERRED_ENCODING
    if DEBUG:
        print('preferred encoding is {}'.format(encoding))

    if isinstance(encoding, str) and encoding.lower().startswith('utf'):
        if DEBUG:
            print('encoding is acceptable, open {} regularly'.format(filename))
        return builtin_open(filename, mode, *args, **kwargs)

    # so we want to read text from a file but can probably only deal with ASCII
    # --> use fallback
    if DEBUG:
        print('Opening {} with fallback encoding {}'
              .format(filename, FALLBACK_ENCODING_OPEN))
    if PY3:
        return builtin_open(filename, mode, *args,
                            encoding=FALLBACK_ENCODING_OPEN, **kwargs)
    else:
        # the python2 builtin open has no encoding parameter, so wrap the
        # plain file object instead
        handle = builtin_open(filename, mode, *args, **kwargs)
        return codecs.EncodedFile(handle, FALLBACK_ENCODING_OPEN)