Commit 7a9cb92267c7fbe1339f687afdeffb0ff44a2903
1 parent
1d05bbab
improved olefile to specify the encoding for path names, changed default to UTF-8
on python 2.x to support non-Latin1 code pages
Showing 2 changed files with 53 additions and 36 deletions
oletools/olevba.py
| ... | ... | @@ -774,6 +774,7 @@ def _extract_vba (ole, vba_root, project_path, dir_path): |
| 774 | 774 | # case-insensitive search in the code_modules dict to find the file extension: |
| 775 | 775 | filext = code_modules.get(MODULENAME_ModuleName.lower(), 'bin') |
| 776 | 776 | filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext) |
| 777 | + #TODO: also yield the codepage so that callers can decode it properly | |
| 777 | 778 | yield (code_path, filename, code_data) |
| 778 | 779 | # print '-'*79 |
| 779 | 780 | # print filename |
| ... | ... | @@ -972,7 +973,8 @@ class VBA_Parser(object): |
| 972 | 973 | if olefile.isOleFile(_file): |
| 973 | 974 | # This looks like an OLE file |
| 974 | 975 | logging.info('Parsing OLE file %s' % self.filename) |
| 975 | - self.ole_file = olefile.OleFileIO(_file) | |
| 976 | + # Open and parse the OLE file, using unicode for path names: | |
| 977 | + self.ole_file = olefile.OleFileIO(_file, path_encoding=None) | |
| 976 | 978 | self.type = TYPE_OLE |
| 977 | 979 | #TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet |
| 978 | 980 | elif zipfile.is_zipfile(_file): | ... | ... |
oletools/thirdparty/olefile/olefile.py
| 1 | 1 | #!/usr/bin/env python |
| 2 | 2 | |
| 3 | -# olefile (formerly OleFileIO_PL) version 0.41 2014-11-25 | |
| 3 | +# olefile (formerly OleFileIO_PL) version 0.42 2015-01-24 | |
| 4 | 4 | # |
| 5 | 5 | # Module to read/write Microsoft OLE2 files (also called Structured Storage or |
| 6 | 6 | # Microsoft Compound Document File Format), such as Microsoft Office 97-2003 |
| ... | ... | @@ -9,7 +9,7 @@ |
| 9 | 9 | # |
| 10 | 10 | # Project website: http://www.decalage.info/olefile |
| 11 | 11 | # |
| 12 | -# olefile is copyright (c) 2005-2014 Philippe Lagadec (http://www.decalage.info) | |
| 12 | +# olefile is copyright (c) 2005-2015 Philippe Lagadec (http://www.decalage.info) | |
| 13 | 13 | # |
| 14 | 14 | # olefile is based on the OleFileIO module from the PIL library v1.1.6 |
| 15 | 15 | # See: http://www.pythonware.com/products/pil/index.htm |
| ... | ... | @@ -29,12 +29,12 @@ from __future__ import print_function # This version of olefile requires Pytho |
| 29 | 29 | |
| 30 | 30 | |
| 31 | 31 | __author__ = "Philippe Lagadec" |
| 32 | -__date__ = "2014-11-25" | |
| 33 | -__version__ = '0.41' | |
| 32 | +__date__ = "2015-01-24" | |
| 33 | +__version__ = '0.42' | |
| 34 | 34 | |
| 35 | 35 | #--- LICENSE ------------------------------------------------------------------ |
| 36 | 36 | |
| 37 | -# olefile (formerly OleFileIO_PL) is copyright (c) 2005-2014 Philippe Lagadec | |
| 37 | +# olefile (formerly OleFileIO_PL) is copyright (c) 2005-2015 Philippe Lagadec | |
| 38 | 38 | # (http://www.decalage.info) |
| 39 | 39 | # |
| 40 | 40 | # All rights reserved. |
| ... | ... | @@ -177,6 +177,9 @@ __version__ = '0.41' |
| 177 | 177 | # 2014-11-13 v0.41 PL: - improved isOleFile and OleFileIO.open to support OLE |
| 178 | 178 | # data in a string buffer and file-like objects. |
| 179 | 179 | # 2014-11-21 PL: - updated comments according to Pillow's commits |
| 180 | +# 2015-01-24 v0.42 PL: - changed the default path name encoding from Latin-1 | |
| 181 | +# to UTF-8 on Python 2.x (Unicode on Python 3.x) | |
| 182 | +# - added path_encoding option to override the default | |
| 180 | 183 | |
| 181 | 184 | #----------------------------------------------------------------------------- |
| 182 | 185 | # TODO (for version 1.0): |
| ... | ... | @@ -314,6 +317,14 @@ except NameError: |
| 314 | 317 | # if False (default PIL behaviour), all filenames are converted to Latin-1. |
| 315 | 318 | KEEP_UNICODE_NAMES = True |
| 316 | 319 | |
| 320 | +if sys.version_info[0] < 3: | |
| 321 | + # On Python 2.x, the default encoding for path names is UTF-8: | |
| 322 | + DEFAULT_PATH_ENCODING = 'utf-8' | |
| 323 | +else: | |
| 324 | + # On Python 3.x, the default encoding for path names is Unicode (None): | |
| 325 | + DEFAULT_PATH_ENCODING = None | |
| 326 | + | |
| 327 | + | |
| 317 | 328 | #=== DEBUGGING =============================================================== |
| 318 | 329 | |
| 319 | 330 | #TODO: replace this by proper logging |
| ... | ... | @@ -498,32 +509,6 @@ def _clsid(clsid): |
| 498 | 509 | |
| 499 | 510 | |
| 500 | 511 | |
| 501 | -# UNICODE support: | |
| 502 | -# (necessary to handle storages/streams names which use Unicode) | |
| 503 | - | |
| 504 | -def _unicode(s, errors='replace'): | |
| 505 | - """ | |
| 506 | - Map unicode string to Latin 1. (Python with Unicode support) | |
| 507 | - | |
| 508 | - :param s: UTF-16LE unicode string to convert to Latin-1 | |
| 509 | - :param errors: 'replace', 'ignore' or 'strict'. | |
| 510 | - """ | |
| 511 | - #TODO: test if it OleFileIO works with Unicode strings, instead of | |
| 512 | - # converting to Latin-1. | |
| 513 | - try: | |
| 514 | - # First the string is converted to plain Unicode: | |
| 515 | - # (assuming it is encoded as UTF-16 little-endian) | |
| 516 | - u = s.decode('UTF-16LE', errors) | |
| 517 | - if bytes is not str or KEEP_UNICODE_NAMES: | |
| 518 | - return u | |
| 519 | - else: | |
| 520 | - # Second the unicode string is converted to Latin-1 | |
| 521 | - return u.encode('latin_1', errors) | |
| 522 | - except: | |
| 523 | - # there was an error during Unicode to Latin-1 conversion: | |
| 524 | - raise IOError('incorrect Unicode name') | |
| 525 | - | |
| 526 | - | |
| 527 | 512 | def filetime2datetime(filetime): |
| 528 | 513 | """ |
| 529 | 514 | convert FILETIME (64 bits int) to Python datetime.datetime |
| ... | ... | @@ -910,8 +895,11 @@ class _OleDirectoryEntry: |
| 910 | 895 | namelength = 64 |
| 911 | 896 | # only characters without ending null char are kept: |
| 912 | 897 | name = name[:(namelength-2)] |
| 913 | - # name is converted from unicode to Latin-1: | |
| 914 | - self.name = _unicode(name) | |
| 898 | + #TODO: check if the name is actually followed by a null unicode character ([MS-CFB] 2.6.1) | |
| 899 | + #TODO: check if the name does not contain forbidden characters: | |
| 900 | + # [MS-CFB] 2.6.1: "The following characters are illegal and MUST NOT be part of the name: '/', '\', ':', '!'." | |
| 901 | + # name is converted from UTF-16LE to the path encoding specified in the OleFileIO: | |
| 902 | + self.name = olefile._decode_utf16_str(name) | |
| 915 | 903 | |
| 916 | 904 | debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name))) |
| 917 | 905 | debug(' - type: %d' % self.entry_type) |
| ... | ... | @@ -1112,7 +1100,7 @@ class OleFileIO: |
| 1112 | 1100 | """ |
| 1113 | 1101 | |
| 1114 | 1102 | def __init__(self, filename=None, raise_defects=DEFECT_FATAL, |
| 1115 | - write_mode=False, debug=False): | |
| 1103 | + write_mode=False, debug=False, path_encoding=DEFAULT_PATH_ENCODING): | |
| 1116 | 1104 | """ |
| 1117 | 1105 | Constructor for the OleFileIO class. |
| 1118 | 1106 | |
| ... | ... | @@ -1133,6 +1121,11 @@ class OleFileIO: |
| 1133 | 1121 | of read-only by default. |
| 1134 | 1122 | |
| 1135 | 1123 | :param debug: bool, set debug mode |
| 1124 | + | |
| 1125 | + :param path_encoding: None or str, name of the codec to use for path | |
| 1126 | + names (streams and storages), or None for Unicode. | |
| 1127 | + Unicode by default on Python 3+, UTF-8 on Python 2.x. | |
| 1128 | + (new in olefile 0.42, was hardcoded to Latin-1 until olefile v0.41) | |
| 1136 | 1129 | """ |
| 1137 | 1130 | set_debug_mode(debug) |
| 1138 | 1131 | # minimal level for defects to be raised as exceptions: |
| ... | ... | @@ -1141,6 +1134,7 @@ class OleFileIO: |
| 1141 | 1134 | # tuples of (exception type, message) |
| 1142 | 1135 | self.parsing_issues = [] |
| 1143 | 1136 | self.write_mode = write_mode |
| 1137 | + self.path_encoding = path_encoding | |
| 1144 | 1138 | self._filesize = None |
| 1145 | 1139 | self.fp = None |
| 1146 | 1140 | if filename: |
| ... | ... | @@ -1171,6 +1165,25 @@ class OleFileIO: |
| 1171 | 1165 | self.parsing_issues.append((exception_type, message)) |
| 1172 | 1166 | |
| 1173 | 1167 | |
| 1168 | + def _decode_utf16_str(self, utf16_str, errors='replace'): | |
| 1169 | + """ | |
| 1170 | + Decode a string encoded in UTF-16 LE format, as found in the OLE | |
| 1171 | + directory or in property streams. Return a string encoded | |
| 1172 | + according to the path_encoding specified for the OleFileIO object. | |
| 1173 | + | |
| 1174 | + :param utf16_str: bytes string encoded in UTF-16 LE format | |
| 1175 | + :param errors: str, see python documentation for str.decode() | |
| 1176 | + :return: str, encoded according to path_encoding | |
| 1177 | + """ | |
| 1178 | + unicode_str = utf16_str.decode('UTF-16LE', errors) | |
| 1179 | + if self.path_encoding: | |
| 1180 | + # an encoding has been specified for path names: | |
| 1181 | + return unicode_str.encode(self.path_encoding, errors) | |
| 1182 | + else: | |
| 1183 | + # path_encoding=None, return the Unicode string as-is: | |
| 1184 | + return unicode_str | |
| 1185 | + | |
| 1186 | + | |
| 1174 | 1187 | def open(self, filename, write_mode=False): |
| 1175 | 1188 | """ |
| 1176 | 1189 | Open an OLE2 file in read-only or read/write mode. |
| ... | ... | @@ -1813,6 +1826,7 @@ class OleFileIO: |
| 1813 | 1826 | """ |
| 1814 | 1827 | prefix = prefix + [node.name] |
| 1815 | 1828 | for entry in node.kids: |
| 1829 | + #TODO: fix bug here, check entry type, a storage can have no kids | |
| 1816 | 1830 | if entry.kids: |
| 1817 | 1831 | # this is a storage |
| 1818 | 1832 | if storages: |
| ... | ... | @@ -2060,6 +2074,7 @@ class OleFileIO: |
| 2060 | 2074 | |
| 2061 | 2075 | :returns: a dictionary of values indexed by id (integer) |
| 2062 | 2076 | """ |
| 2077 | + #REFERENCE: [MS-OLEPS] https://msdn.microsoft.com/en-us/library/dd942421.aspx | |
| 2063 | 2078 | # make sure no_conversion is a list, just to simplify code below: |
| 2064 | 2079 | if no_conversion == None: |
| 2065 | 2080 | no_conversion = [] |
| ... | ... | @@ -2140,7 +2155,7 @@ class OleFileIO: |
| 2140 | 2155 | # "the string should NOT contain embedded or additional trailing |
| 2141 | 2156 | # null characters." |
| 2142 | 2157 | count = i32(s, offset+4) |
| 2143 | - value = _unicode(s[offset+8:offset+8+count*2]) | |
| 2158 | + value = self._decode_utf16_str(s[offset+8:offset+8+count*2]) | |
| 2144 | 2159 | elif type == VT_FILETIME: |
| 2145 | 2160 | value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32) |
| 2146 | 2161 | # FILETIME is a 64-bit int: "number of 100ns periods | ... | ... |