Commit 471b141f5d3159924f76759594b4d00eddae8880

Authored by Christian Herdtweck
1 parent 670d7075

oleobj: encode filenames/paths to unicode

This makes compatibility with py3 easier, but requires us to guess an
encoding. It should work fine for European-generated files, but could
produce strange results for Asian files.
Showing 1 changed file with 29 additions and 10 deletions
oletools/oleobj.py
... ... @@ -262,29 +262,46 @@ def read_length_prefixed_string(data, index):
262 262 return (ansi_string, index)
263 263  
264 264  
265   -def read_zero_terminated_string(data, index):
def guess_encoding(data):
    """
    Guess the encoding of a byte string and decode it to unicode.

    Since this is used to decode path names from ole objects, latin1 is
    preferred over the utf* codecs whenever ascii is not enough.

    :param data: byte string (bytes or bytearray) to decode
    :return: the decoded (unicode) string
    """
    # NOTE: latin1 assigns a character to every possible byte value, so a
    # strict latin1 decode cannot fail. In practice the utf* candidates and
    # the fallback below are therefore only a safety net; the order is kept
    # as-is so that results stay identical for existing callers.
    for encoding in 'ascii', 'latin1', 'utf8', 'utf-16-le', 'utf16':
        try:
            result = data.decode(encoding, errors='strict')
        except UnicodeError:
            continue
        log.debug(u'decoded using %s: "%s"', encoding, result)
        return result
    # use the module logger (same as the debug message above), not the
    # root logger
    log.warning('failed to guess encoding for string, falling back to '
                'ascii with replace')
    return data.decode('ascii', errors='replace')
268 281  
269   - Guessing that max length is 256 bytes.
  282 +
def read_zero_terminated_string(data, index):
    """
    Read a zero-terminated string from data and decode it to unicode.

    The terminating zero byte is searched for within at most STR_MAX_LEN
    bytes; the bytes in front of it are decoded using guess_encoding.

    :param data: bytes string or stream containing an ansi string
    :param index: index at which the string should start or None if data is
                  stream
    :return: tuple (unicode, index) containing the read string (unicode),
             and the index to start reading from next time. For streams the
             returned index is None, as given.
    :raises ValueError: if no terminating zero byte is found within
                        STR_MAX_LEN bytes or before the stream ends
    """
    if index is None:
        result = bytearray()
        for _ in xrange(STR_MAX_LEN):
            byte = data.read(1)
            if not byte:
                # end of stream: ord() on the empty result would raise an
                # unrelated TypeError, so report the missing terminator
                raise ValueError('stream ended before string-terminating '
                                 'zero-byte!')
            char = ord(byte)    # need ord() for py3
            if char == 0:
                return guess_encoding(result), index
            result.append(char)
        raise ValueError('found no string-terminating zero-byte!')
    else:       # data is byte array, can just search
        # index() raises ValueError itself if there is no zero byte in range
        end_idx = data.index(b'\x00', index, index+STR_MAX_LEN)
        # decode and return with index after the 0-byte
        return guess_encoding(data[index:end_idx]), end_idx+1
288 305  
289 306  
290 307 # === CLASSES =================================================================
... ... @@ -294,6 +311,8 @@ class OleNativeStream(object):
294 311 """
295 312 OLE object contained into an OLENativeStream structure.
296 313 (see MS-OLEDS 2.3.6 OLENativeStream)
  314 +
  315 + Filename and paths are decoded to unicode.
297 316 """
298 317 # constants for the type attribute:
299 318 # see MS-OLEDS 2.2.4 ObjectHeader
... ... @@ -446,7 +465,7 @@ def sanitize_filename(filename, replacement='_', max_length=200):
446 465 """compute basename of filename. Replaces all non-whitelisted characters.
447 466 The returned filename is always a basename of the file."""
448 467 basepath = os.path.basename(filename).strip()
449   - sane_fname = re.sub(r'[^\w\.\- ]', replacement, basepath)
  468 + sane_fname = re.sub(u'[^\\w\\.\\- ]', replacement, basepath)
450 469  
451 470 while ".." in sane_fname:
452 471 sane_fname = sane_fname.replace('..', '.')
... ...