Commit 471b141f5d3159924f76759594b4d00eddae8880
1 parent
670d7075
oleobj: encode filenames/paths to unicode
This makes compatibility with py3 easier, but requires us to guess an encoding. It should work fine for European-generated files, but could produce strange results for Asian files.
Showing
1 changed file
with
29 additions
and
10 deletions
oletools/oleobj.py
| @@ -262,29 +262,46 @@ def read_length_prefixed_string(data, index): | @@ -262,29 +262,46 @@ def read_length_prefixed_string(data, index): | ||
| 262 | return (ansi_string, index) | 262 | return (ansi_string, index) |
| 263 | 263 | ||
| 264 | 264 | ||
| 265 | -def read_zero_terminated_string(data, index): | 265 | +def guess_encoding(data): |
| 266 | + """ guess encoding of byte string to create unicode | ||
| 267 | + | ||
| 268 | + Since this is used to decode path names from ole objects, prefer latin1 | ||
| 269 | + over utf* codecs if ascii is not enough | ||
| 266 | """ | 270 | """ |
| 267 | - Read a zero-terminated ANSI string from data | 271 | + for encoding in 'ascii', 'latin1', 'utf8', 'utf-16-le', 'utf16': |
| 272 | + try: | ||
| 273 | + result = data.decode(encoding, errors='strict') | ||
| 274 | + log.debug(u'encoded using {0}: "{1}"'.format(encoding, result)) | ||
| 275 | + return result | ||
| 276 | + except UnicodeError: | ||
| 277 | + pass | ||
| 278 | + logging.warning('failed to guess encoding for string, falling back to ' | ||
| 279 | + 'ascii with replace') | ||
| 280 | + return data.decode('ascii', errors='replace') | ||
| 268 | 281 | ||
| 269 | - Guessing that max length is 256 bytes. | 282 | + |
| 283 | +def read_zero_terminated_string(data, index): | ||
| 284 | + """ | ||
| 285 | + Read a zero-terminated string from data | ||
| 270 | 286 | ||
| 271 | :param data: bytes string or stream containing an ansi string | 287 | :param data: bytes string or stream containing an ansi string |
| 272 | :param index: index at which the string should start or None if data is | 288 | :param index: index at which the string should start or None if data is |
| 273 | stream | 289 | stream |
| 274 | - :return: tuple (string, index) containing the read string (bytes string), | 290 | + :return: tuple (unicode, index) containing the read string (unicode), |
| 275 | and the index to start reading from next time. | 291 | and the index to start reading from next time. |
| 276 | """ | 292 | """ |
| 277 | if index is None: | 293 | if index is None: |
| 278 | - result = [] | 294 | + result = bytearray() |
| 279 | for _ in xrange(STR_MAX_LEN): | 295 | for _ in xrange(STR_MAX_LEN): |
| 280 | - char = data.read(1) | ||
| 281 | - if char == b'\x00': | ||
| 282 | - return b''.join(result), index | 296 | + char = ord(data.read(1)) # need ord() for py3 |
| 297 | + if char == 0: | ||
| 298 | + return guess_encoding(result), index | ||
| 283 | result.append(char) | 299 | result.append(char) |
| 284 | raise ValueError('found no string-terminating zero-byte!') | 300 | raise ValueError('found no string-terminating zero-byte!') |
| 285 | else: # data is byte array, can just search | 301 | else: # data is byte array, can just search |
| 286 | end_idx = data.index(b'\x00', index, index+STR_MAX_LEN) | 302 | end_idx = data.index(b'\x00', index, index+STR_MAX_LEN) |
| 287 | - return data[index:end_idx], end_idx+1 # return index after the 0-byte | 303 | + # encode and return with index after the 0-byte |
| 304 | + return guess_encoding(data[index:end_idx]), end_idx+1 | ||
| 288 | 305 | ||
| 289 | 306 | ||
| 290 | # === CLASSES ================================================================= | 307 | # === CLASSES ================================================================= |
| @@ -294,6 +311,8 @@ class OleNativeStream(object): | @@ -294,6 +311,8 @@ class OleNativeStream(object): | ||
| 294 | """ | 311 | """ |
| 295 | OLE object contained into an OLENativeStream structure. | 312 | OLE object contained into an OLENativeStream structure. |
| 296 | (see MS-OLEDS 2.3.6 OLENativeStream) | 313 | (see MS-OLEDS 2.3.6 OLENativeStream) |
| 314 | + | ||
| 315 | + Filename and paths are decoded to unicode. | ||
| 297 | """ | 316 | """ |
| 298 | # constants for the type attribute: | 317 | # constants for the type attribute: |
| 299 | # see MS-OLEDS 2.2.4 ObjectHeader | 318 | # see MS-OLEDS 2.2.4 ObjectHeader |
| @@ -446,7 +465,7 @@ def sanitize_filename(filename, replacement='_', max_length=200): | @@ -446,7 +465,7 @@ def sanitize_filename(filename, replacement='_', max_length=200): | ||
| 446 | """compute basename of filename. Replaces all non-whitelisted characters. | 465 | """compute basename of filename. Replaces all non-whitelisted characters. |
| 447 | The returned filename is always a basename of the file.""" | 466 | The returned filename is always a basename of the file.""" |
| 448 | basepath = os.path.basename(filename).strip() | 467 | basepath = os.path.basename(filename).strip() |
| 449 | - sane_fname = re.sub(r'[^\w\.\- ]', replacement, basepath) | 468 | + sane_fname = re.sub(u'[^\\w\\.\\- ]', replacement, basepath) |
| 450 | 469 | ||
| 451 | while ".." in sane_fname: | 470 | while ".." in sane_fname: |
| 452 | sane_fname = sane_fname.replace('..', '.') | 471 | sane_fname = sane_fname.replace('..', '.') |