Commit 7a9cb92267c7fbe1339f687afdeffb0ff44a2903
1 parent
1d05bbab
improved olefile to specify the encoding for path names, changed default to UTF-8 on python 2.x to support non-Latin1 code pages
Showing 2 changed files with 53 additions and 36 deletions
oletools/olevba.py
| @@ -774,6 +774,7 @@ def _extract_vba (ole, vba_root, project_path, dir_path): | @@ -774,6 +774,7 @@ def _extract_vba (ole, vba_root, project_path, dir_path): | ||
| 774 | # case-insensitive search in the code_modules dict to find the file extension: | 774 | # case-insensitive search in the code_modules dict to find the file extension: |
| 775 | filext = code_modules.get(MODULENAME_ModuleName.lower(), 'bin') | 775 | filext = code_modules.get(MODULENAME_ModuleName.lower(), 'bin') |
| 776 | filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext) | 776 | filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext) |
| 777 | + #TODO: also yield the codepage so that callers can decode it properly | ||
| 777 | yield (code_path, filename, code_data) | 778 | yield (code_path, filename, code_data) |
| 778 | # print '-'*79 | 779 | # print '-'*79 |
| 779 | # print filename | 780 | # print filename |
| @@ -972,7 +973,8 @@ class VBA_Parser(object): | @@ -972,7 +973,8 @@ class VBA_Parser(object): | ||
| 972 | if olefile.isOleFile(_file): | 973 | if olefile.isOleFile(_file): |
| 973 | # This looks like an OLE file | 974 | # This looks like an OLE file |
| 974 | logging.info('Parsing OLE file %s' % self.filename) | 975 | logging.info('Parsing OLE file %s' % self.filename) |
| 975 | - self.ole_file = olefile.OleFileIO(_file) | 976 | + # Open and parse the OLE file, using unicode for path names: |
| 977 | + self.ole_file = olefile.OleFileIO(_file, path_encoding=None) | ||
| 976 | self.type = TYPE_OLE | 978 | self.type = TYPE_OLE |
| 977 | #TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet | 979 | #TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet |
| 978 | elif zipfile.is_zipfile(_file): | 980 | elif zipfile.is_zipfile(_file): |
oletools/thirdparty/olefile/olefile.py
| 1 | #!/usr/bin/env python | 1 | #!/usr/bin/env python |
| 2 | 2 | ||
| 3 | -# olefile (formerly OleFileIO_PL) version 0.41 2014-11-25 | 3 | +# olefile (formerly OleFileIO_PL) version 0.42 2015-01-24 |
| 4 | # | 4 | # |
| 5 | # Module to read/write Microsoft OLE2 files (also called Structured Storage or | 5 | # Module to read/write Microsoft OLE2 files (also called Structured Storage or |
| 6 | # Microsoft Compound Document File Format), such as Microsoft Office 97-2003 | 6 | # Microsoft Compound Document File Format), such as Microsoft Office 97-2003 |
| @@ -9,7 +9,7 @@ | @@ -9,7 +9,7 @@ | ||
| 9 | # | 9 | # |
| 10 | # Project website: http://www.decalage.info/olefile | 10 | # Project website: http://www.decalage.info/olefile |
| 11 | # | 11 | # |
| 12 | -# olefile is copyright (c) 2005-2014 Philippe Lagadec (http://www.decalage.info) | 12 | +# olefile is copyright (c) 2005-2015 Philippe Lagadec (http://www.decalage.info) |
| 13 | # | 13 | # |
| 14 | # olefile is based on the OleFileIO module from the PIL library v1.1.6 | 14 | # olefile is based on the OleFileIO module from the PIL library v1.1.6 |
| 15 | # See: http://www.pythonware.com/products/pil/index.htm | 15 | # See: http://www.pythonware.com/products/pil/index.htm |
| @@ -29,12 +29,12 @@ from __future__ import print_function # This version of olefile requires Pytho | @@ -29,12 +29,12 @@ from __future__ import print_function # This version of olefile requires Pytho | ||
| 29 | 29 | ||
| 30 | 30 | ||
| 31 | __author__ = "Philippe Lagadec" | 31 | __author__ = "Philippe Lagadec" |
| 32 | -__date__ = "2014-11-25" | ||
| 33 | -__version__ = '0.41' | 32 | +__date__ = "2015-01-24" |
| 33 | +__version__ = '0.42' | ||
| 34 | 34 | ||
| 35 | #--- LICENSE ------------------------------------------------------------------ | 35 | #--- LICENSE ------------------------------------------------------------------ |
| 36 | 36 | ||
| 37 | -# olefile (formerly OleFileIO_PL) is copyright (c) 2005-2014 Philippe Lagadec | 37 | +# olefile (formerly OleFileIO_PL) is copyright (c) 2005-2015 Philippe Lagadec |
| 38 | # (http://www.decalage.info) | 38 | # (http://www.decalage.info) |
| 39 | # | 39 | # |
| 40 | # All rights reserved. | 40 | # All rights reserved. |
| @@ -177,6 +177,9 @@ __version__ = '0.41' | @@ -177,6 +177,9 @@ __version__ = '0.41' | ||
| 177 | # 2014-11-13 v0.41 PL: - improved isOleFile and OleFileIO.open to support OLE | 177 | # 2014-11-13 v0.41 PL: - improved isOleFile and OleFileIO.open to support OLE |
| 178 | # data in a string buffer and file-like objects. | 178 | # data in a string buffer and file-like objects. |
| 179 | # 2014-11-21 PL: - updated comments according to Pillow's commits | 179 | # 2014-11-21 PL: - updated comments according to Pillow's commits |
| 180 | +# 2015-01-24 v0.42 PL: - changed the default path name encoding from Latin-1 | ||
| 181 | +# to UTF-8 on Python 2.x (Unicode on Python 3.x) | ||
| 182 | +# - added path_encoding option to override the default | ||
| 180 | 183 | ||
| 181 | #----------------------------------------------------------------------------- | 184 | #----------------------------------------------------------------------------- |
| 182 | # TODO (for version 1.0): | 185 | # TODO (for version 1.0): |
| @@ -314,6 +317,14 @@ except NameError: | @@ -314,6 +317,14 @@ except NameError: | ||
| 314 | # if False (default PIL behaviour), all filenames are converted to Latin-1. | 317 | # if False (default PIL behaviour), all filenames are converted to Latin-1. |
| 315 | KEEP_UNICODE_NAMES = True | 318 | KEEP_UNICODE_NAMES = True |
| 316 | 319 | ||
| 320 | +if sys.version_info[0] < 3: | ||
| 321 | + # On Python 2.x, the default encoding for path names is UTF-8: | ||
| 322 | + DEFAULT_PATH_ENCODING = 'utf-8' | ||
| 323 | +else: | ||
| 324 | + # On Python 3.x, the default encoding for path names is Unicode (None): | ||
| 325 | + DEFAULT_PATH_ENCODING = None | ||
| 326 | + | ||
| 327 | + | ||
| 317 | #=== DEBUGGING =============================================================== | 328 | #=== DEBUGGING =============================================================== |
| 318 | 329 | ||
| 319 | #TODO: replace this by proper logging | 330 | #TODO: replace this by proper logging |
| @@ -498,32 +509,6 @@ def _clsid(clsid): | @@ -498,32 +509,6 @@ def _clsid(clsid): | ||
| 498 | 509 | ||
| 499 | 510 | ||
| 500 | 511 | ||
| 501 | -# UNICODE support: | ||
| 502 | -# (necessary to handle storages/streams names which use Unicode) | ||
| 503 | - | ||
| 504 | -def _unicode(s, errors='replace'): | ||
| 505 | - """ | ||
| 506 | - Map unicode string to Latin 1. (Python with Unicode support) | ||
| 507 | - | ||
| 508 | - :param s: UTF-16LE unicode string to convert to Latin-1 | ||
| 509 | - :param errors: 'replace', 'ignore' or 'strict'. | ||
| 510 | - """ | ||
| 511 | - #TODO: test if it OleFileIO works with Unicode strings, instead of | ||
| 512 | - # converting to Latin-1. | ||
| 513 | - try: | ||
| 514 | - # First the string is converted to plain Unicode: | ||
| 515 | - # (assuming it is encoded as UTF-16 little-endian) | ||
| 516 | - u = s.decode('UTF-16LE', errors) | ||
| 517 | - if bytes is not str or KEEP_UNICODE_NAMES: | ||
| 518 | - return u | ||
| 519 | - else: | ||
| 520 | - # Second the unicode string is converted to Latin-1 | ||
| 521 | - return u.encode('latin_1', errors) | ||
| 522 | - except: | ||
| 523 | - # there was an error during Unicode to Latin-1 conversion: | ||
| 524 | - raise IOError('incorrect Unicode name') | ||
| 525 | - | ||
| 526 | - | ||
| 527 | def filetime2datetime(filetime): | 512 | def filetime2datetime(filetime): |
| 528 | """ | 513 | """ |
| 529 | convert FILETIME (64 bits int) to Python datetime.datetime | 514 | convert FILETIME (64 bits int) to Python datetime.datetime |
| @@ -910,8 +895,11 @@ class _OleDirectoryEntry: | @@ -910,8 +895,11 @@ class _OleDirectoryEntry: | ||
| 910 | namelength = 64 | 895 | namelength = 64 |
| 911 | # only characters without ending null char are kept: | 896 | # only characters without ending null char are kept: |
| 912 | name = name[:(namelength-2)] | 897 | name = name[:(namelength-2)] |
| 913 | - # name is converted from unicode to Latin-1: | ||
| 914 | - self.name = _unicode(name) | 898 | + #TODO: check if the name is actually followed by a null unicode character ([MS-CFB] 2.6.1) |
| 899 | + #TODO: check if the name does not contain forbidden characters: | ||
| 900 | + # [MS-CFB] 2.6.1: "The following characters are illegal and MUST NOT be part of the name: '/', '\', ':', '!'." | ||
| 901 | + # name is converted from UTF-16LE to the path encoding specified in the OleFileIO: | ||
| 902 | + self.name = olefile._decode_utf16_str(name) | ||
| 915 | 903 | ||
| 916 | debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name))) | 904 | debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name))) |
| 917 | debug(' - type: %d' % self.entry_type) | 905 | debug(' - type: %d' % self.entry_type) |
| @@ -1112,7 +1100,7 @@ class OleFileIO: | @@ -1112,7 +1100,7 @@ class OleFileIO: | ||
| 1112 | """ | 1100 | """ |
| 1113 | 1101 | ||
| 1114 | def __init__(self, filename=None, raise_defects=DEFECT_FATAL, | 1102 | def __init__(self, filename=None, raise_defects=DEFECT_FATAL, |
| 1115 | - write_mode=False, debug=False): | 1103 | + write_mode=False, debug=False, path_encoding=DEFAULT_PATH_ENCODING): |
| 1116 | """ | 1104 | """ |
| 1117 | Constructor for the OleFileIO class. | 1105 | Constructor for the OleFileIO class. |
| 1118 | 1106 | ||
| @@ -1133,6 +1121,11 @@ class OleFileIO: | @@ -1133,6 +1121,11 @@ class OleFileIO: | ||
| 1133 | of read-only by default. | 1121 | of read-only by default. |
| 1134 | 1122 | ||
| 1135 | :param debug: bool, set debug mode | 1123 | :param debug: bool, set debug mode |
| 1124 | + | ||
| 1125 | + :param path_encoding: None or str, name of the codec to use for path | ||
| 1126 | + names (streams and storages), or None for Unicode. | ||
| 1127 | + Unicode by default on Python 3+, UTF-8 on Python 2.x. | ||
| 1128 | + (new in olefile 0.42, was hardcoded to Latin-1 until olefile v0.41) | ||
| 1136 | """ | 1129 | """ |
| 1137 | set_debug_mode(debug) | 1130 | set_debug_mode(debug) |
| 1138 | # minimal level for defects to be raised as exceptions: | 1131 | # minimal level for defects to be raised as exceptions: |
| @@ -1141,6 +1134,7 @@ class OleFileIO: | @@ -1141,6 +1134,7 @@ class OleFileIO: | ||
| 1141 | # tuples of (exception type, message) | 1134 | # tuples of (exception type, message) |
| 1142 | self.parsing_issues = [] | 1135 | self.parsing_issues = [] |
| 1143 | self.write_mode = write_mode | 1136 | self.write_mode = write_mode |
| 1137 | + self.path_encoding = path_encoding | ||
| 1144 | self._filesize = None | 1138 | self._filesize = None |
| 1145 | self.fp = None | 1139 | self.fp = None |
| 1146 | if filename: | 1140 | if filename: |
| @@ -1171,6 +1165,25 @@ class OleFileIO: | @@ -1171,6 +1165,25 @@ class OleFileIO: | ||
| 1171 | self.parsing_issues.append((exception_type, message)) | 1165 | self.parsing_issues.append((exception_type, message)) |
| 1172 | 1166 | ||
| 1173 | 1167 | ||
| 1168 | + def _decode_utf16_str(self, utf16_str, errors='replace'): | ||
| 1169 | + """ | ||
| 1170 | + Decode a string encoded in UTF-16 LE format, as found in the OLE | ||
| 1171 | + directory or in property streams. Return a string encoded | ||
| 1172 | + according to the path_encoding specified for the OleFileIO object. | ||
| 1173 | + | ||
| 1174 | + :param utf16_str: bytes string encoded in UTF-16 LE format | ||
| 1175 | + :param errors: str, see python documentation for str.decode() | ||
| 1176 | + :return: str, encoded according to path_encoding | ||
| 1177 | + """ | ||
| 1178 | + unicode_str = utf16_str.decode('UTF-16LE', errors) | ||
| 1179 | + if self.path_encoding: | ||
| 1180 | + # an encoding has been specified for path names: | ||
| 1181 | + return unicode_str.encode(self.path_encoding, errors) | ||
| 1182 | + else: | ||
| 1183 | + # path_encoding=None, return the Unicode string as-is: | ||
| 1184 | + return unicode_str | ||
| 1185 | + | ||
| 1186 | + | ||
| 1174 | def open(self, filename, write_mode=False): | 1187 | def open(self, filename, write_mode=False): |
| 1175 | """ | 1188 | """ |
| 1176 | Open an OLE2 file in read-only or read/write mode. | 1189 | Open an OLE2 file in read-only or read/write mode. |
| @@ -1813,6 +1826,7 @@ class OleFileIO: | @@ -1813,6 +1826,7 @@ class OleFileIO: | ||
| 1813 | """ | 1826 | """ |
| 1814 | prefix = prefix + [node.name] | 1827 | prefix = prefix + [node.name] |
| 1815 | for entry in node.kids: | 1828 | for entry in node.kids: |
| 1829 | + #TODO: fix bug here, check entry type, a storage can have no kids | ||
| 1816 | if entry.kids: | 1830 | if entry.kids: |
| 1817 | # this is a storage | 1831 | # this is a storage |
| 1818 | if storages: | 1832 | if storages: |
| @@ -2060,6 +2074,7 @@ class OleFileIO: | @@ -2060,6 +2074,7 @@ class OleFileIO: | ||
| 2060 | 2074 | ||
| 2061 | :returns: a dictionary of values indexed by id (integer) | 2075 | :returns: a dictionary of values indexed by id (integer) |
| 2062 | """ | 2076 | """ |
| 2077 | + #REFERENCE: [MS-OLEPS] https://msdn.microsoft.com/en-us/library/dd942421.aspx | ||
| 2063 | # make sure no_conversion is a list, just to simplify code below: | 2078 | # make sure no_conversion is a list, just to simplify code below: |
| 2064 | if no_conversion == None: | 2079 | if no_conversion == None: |
| 2065 | no_conversion = [] | 2080 | no_conversion = [] |
| @@ -2140,7 +2155,7 @@ class OleFileIO: | @@ -2140,7 +2155,7 @@ class OleFileIO: | ||
| 2140 | # "the string should NOT contain embedded or additional trailing | 2155 | # "the string should NOT contain embedded or additional trailing |
| 2141 | # null characters." | 2156 | # null characters." |
| 2142 | count = i32(s, offset+4) | 2157 | count = i32(s, offset+4) |
| 2143 | - value = _unicode(s[offset+8:offset+8+count*2]) | 2158 | + value = self._decode_utf16_str(s[offset+8:offset+8+count*2]) |
| 2144 | elif type == VT_FILETIME: | 2159 | elif type == VT_FILETIME: |
| 2145 | value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32) | 2160 | value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32) |
| 2146 | # FILETIME is a 64-bit int: "number of 100ns periods | 2161 | # FILETIME is a 64-bit int: "number of 100ns periods |