Commit 5bccb6aa02d34e64584ef1d80823dc68c9b1fad6

Authored by Christian Herdtweck
1 parent 0798cd1b

common: handle open() of text files without unicode-environment

The encoding that open() uses for text files depends on
locale.getpreferredencoding(), which returns "ascii" (or a similar non-unicode
encoding) if, e.g., LANG=C is set or if output is redirected in python2.

Provide a function uopen() that ensures text-files are always opened such
that unicode text can be read properly.
Showing 1 changed file with 61 additions and 0 deletions
oletools/common/__init__.py
... ... @@ -8,8 +8,18 @@ from locale import getpreferredencoding
8 8  
9 9 PY3 = sys.version_info.major >= 3
10 10  
  11 +if PY3:
  12 + from builtins import open as builtin_open
  13 +else:
  14 + from __builtin__ import open as builtin_open
  15 +
  16 +
11 17 #: encoding to use for redirection if no good encoding can be found
12 18 FALLBACK_ENCODING_REDIRECT = 'utf8'
  19 +
  20 +#: encoding to use for reading text from files if preferred encoding is non-unicode
  21 +FALLBACK_ENCODING_OPEN = 'utf8'
  22 +
13 23 #: print (pure-ascii) debug output to stdout
14 24 DEBUG = False
15 25  
... ... @@ -82,3 +92,54 @@ def ensure_stdout_handles_unicode():
82 92 print('preferred encoding also unacceptable, fall back to {}'
83 93 .format(new_encoding))
84 94 sys.stdout = codecs.getwriter(new_encoding)(output_stream)
  95 +
  96 +
def uopen(filename, mode='r', *args, **kwargs):
    """
    Replacement for builtin open() that reads unicode even in ASCII environment

    In order to read unicode from text, python uses locale.getpreferredencoding
    to translate bytes to str. If the environment only provides ASCII encoding,
    this will fail since most office files contain unicode.

    Therefore, guess a good encoding here if necessary and open file with that.

    The call is handed to the builtin open() unchanged if the file is opened
    in binary mode, if the caller specified an encoding (by keyword or by
    position) or if the preferred encoding already is a "utf"-encoding.

    :param filename: file to open, forwarded to the builtin open()
    :param str mode: open mode, forwarded to the builtin open(); defaults to
                     'r' like the builtin
    :param args: further positional arguments for the builtin open(), i.e.
                 `buffering` first and (python3 only) `encoding` second
    :param kwargs: further keyword arguments for the builtin open()
    :returns: same type as the builtin :py:func:`open`, except for the
              fallback case in python2 where the file handle is wrapped by
              :py:func:`codecs.EncodedFile`
    """
    # do not interfere if not necessary:
    if 'b' in mode:
        if DEBUG:
            print('Opening binary file, do not interfere')
        return builtin_open(filename, mode, *args, **kwargs)
    if 'encoding' in kwargs:
        if DEBUG:
            print('Opening file with encoding {!r}, do not interfere'
                  .format(kwargs['encoding']))
        return builtin_open(filename, mode, *args, **kwargs)

    # "encoding" is the 4th parameter of open(), but filename and mode are
    # named parameters here, so args starts with "buffering" and a positional
    # encoding is args[1]
    if len(args) > 1:
        if DEBUG:
            print('Opening file with encoding {!r}, do not interfere'
                  .format(args[1]))
        return builtin_open(filename, mode, *args, **kwargs)

    # determine preferred encoding
    encoding = PREFERRED_ENCODING
    if DEBUG:
        print('preferred encoding is {}'.format(encoding))

    if isinstance(encoding, str) and encoding.lower().startswith('utf'):
        if DEBUG:
            print('encoding is acceptable, open {} regularly'.format(filename))
        return builtin_open(filename, mode, *args, **kwargs)

    # so we want to read text from a file but can probably only deal with ASCII
    # --> use fallback
    if DEBUG:
        print('Opening {} with fallback encoding {}'
              .format(filename, FALLBACK_ENCODING_OPEN))
    if PY3:
        # args holds at most "buffering" at this point, so the encoding
        # keyword cannot collide with a positional encoding
        return builtin_open(filename, mode, *args,
                            encoding=FALLBACK_ENCODING_OPEN, **kwargs)
    else:
        # python2's builtin open() has no encoding parameter, so wrap handle
        # NOTE(review): with only data_encoding given, EncodedFile uses the
        # same encoding for file and data, so reads presumably still return
        # byte strings -- confirm that callers decode them
        handle = builtin_open(filename, mode, *args, **kwargs)
        return codecs.EncodedFile(handle, FALLBACK_ENCODING_OPEN)
... ...