Commit 471b141f5d3159924f76759594b4d00eddae8880
1 parent
670d7075
oleobj: encode filenames/paths to unicode
This makes compatibility with py3 easier, but requires us to guess an encoding. It should work fine for European-generated files, but could produce strange results for Asian files.
Showing 1 changed file with 29 additions and 10 deletions
oletools/oleobj.py
| ... | ... | @@ -262,29 +262,46 @@ def read_length_prefixed_string(data, index): |
| 262 | 262 | return (ansi_string, index) |
| 263 | 263 | |
| 264 | 264 | |
| 265 | -def read_zero_terminated_string(data, index): | |
| 265 | +def guess_encoding(data): | |
| 266 | + """ guess encoding of byte string to create unicode | |
| 267 | + | |
| 268 | + Since this is used to decode path names from ole objects, prefer latin1 | |
| 269 | + over utf* codecs if ascii is not enough | |
| 266 | 270 | """ |
| 267 | - Read a zero-terminated ANSI string from data | |
| 271 | + for encoding in 'ascii', 'latin1', 'utf8', 'utf-16-le', 'utf16': | |
| 272 | + try: | |
| 273 | + result = data.decode(encoding, errors='strict') | |
| 274 | + log.debug(u'encoded using {0}: "{1}"'.format(encoding, result)) | |
| 275 | + return result | |
| 276 | + except UnicodeError: | |
| 277 | + pass | |
| 278 | + logging.warning('failed to guess encoding for string, falling back to ' | |
| 279 | + 'ascii with replace') | |
| 280 | + return data.decode('ascii', errors='replace') | |
| 268 | 281 | |
| 269 | - Guessing that max length is 256 bytes. | |
| 282 | + | |
| 283 | +def read_zero_terminated_string(data, index): | |
| 284 | + """ | |
| 285 | + Read a zero-terminated string from data | |
| 270 | 286 | |
| 271 | 287 | :param data: bytes string or stream containing an ansi string |
| 272 | 288 | :param index: index at which the string should start or None if data is |
| 273 | 289 | stream |
| 274 | - :return: tuple (string, index) containing the read string (bytes string), | |
| 290 | + :return: tuple (unicode, index) containing the read string (unicode), | |
| 275 | 291 | and the index to start reading from next time. |
| 276 | 292 | """ |
| 277 | 293 | if index is None: |
| 278 | - result = [] | |
| 294 | + result = bytearray() | |
| 279 | 295 | for _ in xrange(STR_MAX_LEN): |
| 280 | - char = data.read(1) | |
| 281 | - if char == b'\x00': | |
| 282 | - return b''.join(result), index | |
| 296 | + char = ord(data.read(1)) # need ord() for py3 | |
| 297 | + if char == 0: | |
| 298 | + return guess_encoding(result), index | |
| 283 | 299 | result.append(char) |
| 284 | 300 | raise ValueError('found no string-terminating zero-byte!') |
| 285 | 301 | else: # data is byte array, can just search |
| 286 | 302 | end_idx = data.index(b'\x00', index, index+STR_MAX_LEN) |
| 287 | - return data[index:end_idx], end_idx+1 # return index after the 0-byte | |
| 303 | + # encode and return with index after the 0-byte | |
| 304 | + return guess_encoding(data[index:end_idx]), end_idx+1 | |
| 288 | 305 | |
| 289 | 306 | |
| 290 | 307 | # === CLASSES ================================================================= |
| ... | ... | @@ -294,6 +311,8 @@ class OleNativeStream(object): |
| 294 | 311 | """ |
| 295 | 312 | OLE object contained into an OLENativeStream structure. |
| 296 | 313 | (see MS-OLEDS 2.3.6 OLENativeStream) |
| 314 | + | |
| 315 | + Filename and paths are decoded to unicode. | |
| 297 | 316 | """ |
| 298 | 317 | # constants for the type attribute: |
| 299 | 318 | # see MS-OLEDS 2.2.4 ObjectHeader |
| ... | ... | @@ -446,7 +465,7 @@ def sanitize_filename(filename, replacement='_', max_length=200): |
| 446 | 465 | """compute basename of filename. Replaces all non-whitelisted characters. |
| 447 | 466 | The returned filename is always a basename of the file.""" |
| 448 | 467 | basepath = os.path.basename(filename).strip() |
| 449 | - sane_fname = re.sub(r'[^\w\.\- ]', replacement, basepath) | |
| 468 | + sane_fname = re.sub(u'[^\\w\\.\\- ]', replacement, basepath) | |
| 450 | 469 | |
| 451 | 470 | while ".." in sane_fname: |
| 452 | 471 | sane_fname = sane_fname.replace('..', '.') | ... | ... |