Commit 471b141f5d3159924f76759594b4d00eddae8880

Authored by Christian Herdtweck
1 parent 670d7075

oleobj: encode filenames/paths to unicode

This makes compatibility with py3 easier, but requires us to guess an
encoding. Should work fine for European-generated files, but could produce
strange results for Asian files.
Showing 1 changed file with 29 additions and 10 deletions
oletools/oleobj.py
@@ -262,29 +262,46 @@ def read_length_prefixed_string(data, index): @@ -262,29 +262,46 @@ def read_length_prefixed_string(data, index):
262 return (ansi_string, index) 262 return (ansi_string, index)
263 263
264 264
265 -def read_zero_terminated_string(data, index): 265 +def guess_encoding(data):
  266 + """ guess encoding of byte string to create unicode
  267 +
  268 + Since this is used to decode path names from ole objects, prefer latin1
  269 + over utf* codecs if ascii is not enough
266 """ 270 """
267 - Read a zero-terminated ANSI string from data 271 + for encoding in 'ascii', 'latin1', 'utf8', 'utf-16-le', 'utf16':
  272 + try:
  273 + result = data.decode(encoding, errors='strict')
  274 + log.debug(u'encoded using {0}: "{1}"'.format(encoding, result))
  275 + return result
  276 + except UnicodeError:
  277 + pass
  278 + logging.warning('failed to guess encoding for string, falling back to '
  279 + 'ascii with replace')
  280 + return data.decode('ascii', errors='replace')
268 281
269 - Guessing that max length is 256 bytes. 282 +
  283 +def read_zero_terminated_string(data, index):
  284 + """
  285 + Read a zero-terminated string from data
270 286
271 :param data: bytes string or stream containing an ansi string 287 :param data: bytes string or stream containing an ansi string
272 :param index: index at which the string should start or None if data is 288 :param index: index at which the string should start or None if data is
273 stream 289 stream
274 - :return: tuple (string, index) containing the read string (bytes string), 290 + :return: tuple (unicode, index) containing the read string (unicode),
275 and the index to start reading from next time. 291 and the index to start reading from next time.
276 """ 292 """
277 if index is None: 293 if index is None:
278 - result = [] 294 + result = bytearray()
279 for _ in xrange(STR_MAX_LEN): 295 for _ in xrange(STR_MAX_LEN):
280 - char = data.read(1)  
281 - if char == b'\x00':  
282 - return b''.join(result), index 296 + char = ord(data.read(1)) # need ord() for py3
  297 + if char == 0:
  298 + return guess_encoding(result), index
283 result.append(char) 299 result.append(char)
284 raise ValueError('found no string-terminating zero-byte!') 300 raise ValueError('found no string-terminating zero-byte!')
285 else: # data is byte array, can just search 301 else: # data is byte array, can just search
286 end_idx = data.index(b'\x00', index, index+STR_MAX_LEN) 302 end_idx = data.index(b'\x00', index, index+STR_MAX_LEN)
287 - return data[index:end_idx], end_idx+1 # return index after the 0-byte 303 + # encode and return with index after the 0-byte
  304 + return guess_encoding(data[index:end_idx]), end_idx+1
288 305
289 306
290 # === CLASSES ================================================================= 307 # === CLASSES =================================================================
@@ -294,6 +311,8 @@ class OleNativeStream(object): @@ -294,6 +311,8 @@ class OleNativeStream(object):
294 """ 311 """
295 OLE object contained into an OLENativeStream structure. 312 OLE object contained into an OLENativeStream structure.
296 (see MS-OLEDS 2.3.6 OLENativeStream) 313 (see MS-OLEDS 2.3.6 OLENativeStream)
  314 +
  315 + Filename and paths are decoded to unicode.
297 """ 316 """
298 # constants for the type attribute: 317 # constants for the type attribute:
299 # see MS-OLEDS 2.2.4 ObjectHeader 318 # see MS-OLEDS 2.2.4 ObjectHeader
@@ -446,7 +465,7 @@ def sanitize_filename(filename, replacement='_', max_length=200): @@ -446,7 +465,7 @@ def sanitize_filename(filename, replacement='_', max_length=200):
446 """compute basename of filename. Replaces all non-whitelisted characters. 465 """compute basename of filename. Replaces all non-whitelisted characters.
447 The returned filename is always a basename of the file.""" 466 The returned filename is always a basename of the file."""
448 basepath = os.path.basename(filename).strip() 467 basepath = os.path.basename(filename).strip()
449 - sane_fname = re.sub(r'[^\w\.\- ]', replacement, basepath) 468 + sane_fname = re.sub(u'[^\\w\\.\\- ]', replacement, basepath)
450 469
451 while ".." in sane_fname: 470 while ".." in sane_fname:
452 sane_fname = sane_fname.replace('..', '.') 471 sane_fname = sane_fname.replace('..', '.')