Commit 5bccb6aa02d34e64584ef1d80823dc68c9b1fad6
1 parent
0798cd1b
common: handle open() of text files without unicode-environment
The encoding that open() uses for text files depends on locale.getpreferredencoding(), which returns "ascii" (or something similar) if, e.g., LANG=C is set or if output is redirected in python2. Provide a function uopen() that ensures text files are always opened such that unicode text can be read properly.
Showing
1 changed file
with
61 additions
and
0 deletions
oletools/common/__init__.py
| ... | ... | @@ -8,8 +8,18 @@ from locale import getpreferredencoding |
| 8 | 8 | |
| 9 | 9 | PY3 = sys.version_info.major >= 3 |
| 10 | 10 | |
| 11 | +if PY3: | |
| 12 | + from builtins import open as builtin_open | |
| 13 | +else: | |
| 14 | + from __builtin__ import open as builtin_open | |
| 15 | + | |
| 16 | + | |
| 11 | 17 | #: encoding to use for redirection if no good encoding can be found |
| 12 | 18 | FALLBACK_ENCODING_REDIRECT = 'utf8' |
| 19 | + | |
| 20 | +#: encoding to use for reading text from files if preferred encoding is non-unicode | |
| 21 | +FALLBACK_ENCODING_OPEN = 'utf8' | |
| 22 | + | |
| 13 | 23 | #: print (pure-ascii) debug output to stdout |
| 14 | 24 | DEBUG = False |
| 15 | 25 | |
| ... | ... | @@ -82,3 +92,54 @@ def ensure_stdout_handles_unicode(): |
| 82 | 92 | print('preferred encoding also unacceptable, fall back to {}' |
| 83 | 93 | .format(new_encoding)) |
| 84 | 94 | sys.stdout = codecs.getwriter(new_encoding)(output_stream) |
| 95 | + | |
| 96 | + | |
def uopen(filename, mode='r', *args, **kwargs):
    """
    Replacement for builtin open() that reads unicode even in ASCII environment

    In order to read unicode from text, python uses locale.getpreferredencoding
    to translate bytes to str. If the environment only provides ASCII encoding,
    this will fail since most office files contain unicode.

    Therefore, guess a good encoding here if necessary and open file with that.

    The call is forwarded unchanged to the builtin open if the file is opened
    in binary mode, if the caller specified an encoding (by keyword or by
    position) or if the preferred encoding already is some "utf*" encoding.

    :param filename: file to open, forwarded unchanged to the builtin open
    :param str mode: mode as for the builtin open, defaults to ``'r'``
    :param args: further positional args for the builtin open, i.e.
                 ``buffering``, ``encoding``, ... (in this order)
    :param kwargs: further keyword args for the builtin open
    :returns: whatever the builtin :py:func:`open` returns; only in python2
              with fallback encoding: the result of
              :py:func:`codecs.EncodedFile` wrapping that file
    """
    # do not interfere if not necessary:
    if 'b' in mode:
        if DEBUG:
            print('Opening binary file, do not interfere')
        return builtin_open(filename, mode, *args, **kwargs)
    if 'encoding' in kwargs:
        if DEBUG:
            print('Opening file with encoding {!r}, do not interfere'
                  .format(kwargs['encoding']))
        return builtin_open(filename, mode, *args, **kwargs)
    # "encoding" is the 4th arg of open(), but filename and mode are named
    # parameters here, so within *args it is the 2nd entry (after buffering)
    if len(args) > 1:
        if DEBUG:
            print('Opening file with encoding {!r}, do not interfere'
                  .format(args[1]))
        return builtin_open(filename, mode, *args, **kwargs)

    # determine preferred encoding
    encoding = PREFERRED_ENCODING
    if DEBUG:
        print('preferred encoding is {}'.format(encoding))

    if isinstance(encoding, str) and encoding.lower().startswith('utf'):
        if DEBUG:
            print('encoding is acceptable, open {} regularly'.format(filename))
        return builtin_open(filename, mode, *args, **kwargs)

    # so we want to read text from a file but can probably only deal with ASCII
    # --> use fallback
    if DEBUG:
        print('Opening {} with fallback encoding {}'
              .format(filename, FALLBACK_ENCODING_OPEN))
    if PY3:
        # safe to add the keyword: neither args nor kwargs contain an encoding
        return builtin_open(filename, mode, *args,
                            encoding=FALLBACK_ENCODING_OPEN, **kwargs)

    # python2: builtin open has no encoding parameter, so wrap the file.
    # NOTE(review): with only one encoding given, codecs.EncodedFile transcodes
    # between byte strings and does not return unicode objects -- confirm that
    # this is what callers expect.
    handle = builtin_open(filename, mode, *args, **kwargs)
    return codecs.EncodedFile(handle, FALLBACK_ENCODING_OPEN)
| 145 | + return codecs.EncodedFile(handle, FALLBACK_ENCODING_OPEN) | ... | ... |