Commit 7a9cb92267c7fbe1339f687afdeffb0ff44a2903

Authored by Philippe Lagadec
1 parent 1d05bbab

improved olefile to specify the encoding for path names, changed default to UTF-8 on Python 2.x to support non-Latin1 code pages
oletools/olevba.py
@@ -774,6 +774,7 @@ def _extract_vba (ole, vba_root, project_path, dir_path): @@ -774,6 +774,7 @@ def _extract_vba (ole, vba_root, project_path, dir_path):
774 # case-insensitive search in the code_modules dict to find the file extension: 774 # case-insensitive search in the code_modules dict to find the file extension:
775 filext = code_modules.get(MODULENAME_ModuleName.lower(), 'bin') 775 filext = code_modules.get(MODULENAME_ModuleName.lower(), 'bin')
776 filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext) 776 filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext)
  777 + #TODO: also yield the codepage so that callers can decode it properly
777 yield (code_path, filename, code_data) 778 yield (code_path, filename, code_data)
778 # print '-'*79 779 # print '-'*79
779 # print filename 780 # print filename
@@ -972,7 +973,8 @@ class VBA_Parser(object): @@ -972,7 +973,8 @@ class VBA_Parser(object):
972 if olefile.isOleFile(_file): 973 if olefile.isOleFile(_file):
973 # This looks like an OLE file 974 # This looks like an OLE file
974 logging.info('Parsing OLE file %s' % self.filename) 975 logging.info('Parsing OLE file %s' % self.filename)
975 - self.ole_file = olefile.OleFileIO(_file) 976 + # Open and parse the OLE file, using unicode for path names:
  977 + self.ole_file = olefile.OleFileIO(_file, path_encoding=None)
976 self.type = TYPE_OLE 978 self.type = TYPE_OLE
977 #TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet 979 #TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet
978 elif zipfile.is_zipfile(_file): 980 elif zipfile.is_zipfile(_file):
oletools/thirdparty/olefile/olefile.py
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 -# olefile (formerly OleFileIO_PL) version 0.41 2014-11-25 3 +# olefile (formerly OleFileIO_PL) version 0.42 2015-01-24
4 # 4 #
5 # Module to read/write Microsoft OLE2 files (also called Structured Storage or 5 # Module to read/write Microsoft OLE2 files (also called Structured Storage or
6 # Microsoft Compound Document File Format), such as Microsoft Office 97-2003 6 # Microsoft Compound Document File Format), such as Microsoft Office 97-2003
@@ -9,7 +9,7 @@ @@ -9,7 +9,7 @@
9 # 9 #
10 # Project website: http://www.decalage.info/olefile 10 # Project website: http://www.decalage.info/olefile
11 # 11 #
12 -# olefile is copyright (c) 2005-2014 Philippe Lagadec (http://www.decalage.info) 12 +# olefile is copyright (c) 2005-2015 Philippe Lagadec (http://www.decalage.info)
13 # 13 #
14 # olefile is based on the OleFileIO module from the PIL library v1.1.6 14 # olefile is based on the OleFileIO module from the PIL library v1.1.6
15 # See: http://www.pythonware.com/products/pil/index.htm 15 # See: http://www.pythonware.com/products/pil/index.htm
@@ -29,12 +29,12 @@ from __future__ import print_function # This version of olefile requires Pytho @@ -29,12 +29,12 @@ from __future__ import print_function # This version of olefile requires Pytho
29 29
30 30
31 __author__ = "Philippe Lagadec" 31 __author__ = "Philippe Lagadec"
32 -__date__ = "2014-11-25"  
33 -__version__ = '0.41' 32 +__date__ = "2015-01-24"
  33 +__version__ = '0.42'
34 34
35 #--- LICENSE ------------------------------------------------------------------ 35 #--- LICENSE ------------------------------------------------------------------
36 36
37 -# olefile (formerly OleFileIO_PL) is copyright (c) 2005-2014 Philippe Lagadec 37 +# olefile (formerly OleFileIO_PL) is copyright (c) 2005-2015 Philippe Lagadec
38 # (http://www.decalage.info) 38 # (http://www.decalage.info)
39 # 39 #
40 # All rights reserved. 40 # All rights reserved.
@@ -177,6 +177,9 @@ __version__ = '0.41' @@ -177,6 +177,9 @@ __version__ = '0.41'
177 # 2014-11-13 v0.41 PL: - improved isOleFile and OleFileIO.open to support OLE 177 # 2014-11-13 v0.41 PL: - improved isOleFile and OleFileIO.open to support OLE
178 # data in a string buffer and file-like objects. 178 # data in a string buffer and file-like objects.
179 # 2014-11-21 PL: - updated comments according to Pillow's commits 179 # 2014-11-21 PL: - updated comments according to Pillow's commits
  180 +# 2015-01-24 v0.42 PL: - changed the default path name encoding from Latin-1
  181 +# to UTF-8 on Python 2.x (Unicode on Python 3.x)
  182 +# - added path_encoding option to override the default
180 183
181 #----------------------------------------------------------------------------- 184 #-----------------------------------------------------------------------------
182 # TODO (for version 1.0): 185 # TODO (for version 1.0):
@@ -314,6 +317,14 @@ except NameError: @@ -314,6 +317,14 @@ except NameError:
314 # if False (default PIL behaviour), all filenames are converted to Latin-1. 317 # if False (default PIL behaviour), all filenames are converted to Latin-1.
315 KEEP_UNICODE_NAMES = True 318 KEEP_UNICODE_NAMES = True
316 319
  320 +if sys.version_info[0] < 3:
  321 + # On Python 2.x, the default encoding for path names is UTF-8:
  322 + DEFAULT_PATH_ENCODING = 'utf-8'
  323 +else:
  324 + # On Python 3.x, the default encoding for path names is Unicode (None):
  325 + DEFAULT_PATH_ENCODING = None
  326 +
  327 +
317 #=== DEBUGGING =============================================================== 328 #=== DEBUGGING ===============================================================
318 329
319 #TODO: replace this by proper logging 330 #TODO: replace this by proper logging
@@ -498,32 +509,6 @@ def _clsid(clsid): @@ -498,32 +509,6 @@ def _clsid(clsid):
498 509
499 510
500 511
501 -# UNICODE support:  
502 -# (necessary to handle storages/streams names which use Unicode)  
503 -  
504 -def _unicode(s, errors='replace'):  
505 - """  
506 - Map unicode string to Latin 1. (Python with Unicode support)  
507 -  
508 - :param s: UTF-16LE unicode string to convert to Latin-1  
509 - :param errors: 'replace', 'ignore' or 'strict'.  
510 - """  
511 - #TODO: test if it OleFileIO works with Unicode strings, instead of  
512 - # converting to Latin-1.  
513 - try:  
514 - # First the string is converted to plain Unicode:  
515 - # (assuming it is encoded as UTF-16 little-endian)  
516 - u = s.decode('UTF-16LE', errors)  
517 - if bytes is not str or KEEP_UNICODE_NAMES:  
518 - return u  
519 - else:  
520 - # Second the unicode string is converted to Latin-1  
521 - return u.encode('latin_1', errors)  
522 - except:  
523 - # there was an error during Unicode to Latin-1 conversion:  
524 - raise IOError('incorrect Unicode name')  
525 -  
526 -  
527 def filetime2datetime(filetime): 512 def filetime2datetime(filetime):
528 """ 513 """
529 convert FILETIME (64 bits int) to Python datetime.datetime 514 convert FILETIME (64 bits int) to Python datetime.datetime
@@ -910,8 +895,11 @@ class _OleDirectoryEntry: @@ -910,8 +895,11 @@ class _OleDirectoryEntry:
910 namelength = 64 895 namelength = 64
911 # only characters without ending null char are kept: 896 # only characters without ending null char are kept:
912 name = name[:(namelength-2)] 897 name = name[:(namelength-2)]
913 - # name is converted from unicode to Latin-1:  
914 - self.name = _unicode(name) 898 + #TODO: check if the name is actually followed by a null unicode character ([MS-CFB] 2.6.1)
  899 + #TODO: check if the name does not contain forbidden characters:
  900 + # [MS-CFB] 2.6.1: "The following characters are illegal and MUST NOT be part of the name: '/', '\', ':', '!'."
  901 + # name is converted from UTF-16LE to the path encoding specified in the OleFileIO:
  902 + self.name = olefile._decode_utf16_str(name)
915 903
916 debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name))) 904 debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name)))
917 debug(' - type: %d' % self.entry_type) 905 debug(' - type: %d' % self.entry_type)
@@ -1112,7 +1100,7 @@ class OleFileIO: @@ -1112,7 +1100,7 @@ class OleFileIO:
1112 """ 1100 """
1113 1101
1114 def __init__(self, filename=None, raise_defects=DEFECT_FATAL, 1102 def __init__(self, filename=None, raise_defects=DEFECT_FATAL,
1115 - write_mode=False, debug=False): 1103 + write_mode=False, debug=False, path_encoding=DEFAULT_PATH_ENCODING):
1116 """ 1104 """
1117 Constructor for the OleFileIO class. 1105 Constructor for the OleFileIO class.
1118 1106
@@ -1133,6 +1121,11 @@ class OleFileIO: @@ -1133,6 +1121,11 @@ class OleFileIO:
1133 of read-only by default. 1121 of read-only by default.
1134 1122
1135 :param debug: bool, set debug mode 1123 :param debug: bool, set debug mode
  1124 +
  1125 + :param path_encoding: None or str, name of the codec to use for path
  1126 + names (streams and storages), or None for Unicode.
  1127 + Unicode by default on Python 3+, UTF-8 on Python 2.x.
  1128 + (new in olefile 0.42, was hardcoded to Latin-1 until olefile v0.41)
1136 """ 1129 """
1137 set_debug_mode(debug) 1130 set_debug_mode(debug)
1138 # minimal level for defects to be raised as exceptions: 1131 # minimal level for defects to be raised as exceptions:
@@ -1141,6 +1134,7 @@ class OleFileIO: @@ -1141,6 +1134,7 @@ class OleFileIO:
1141 # tuples of (exception type, message) 1134 # tuples of (exception type, message)
1142 self.parsing_issues = [] 1135 self.parsing_issues = []
1143 self.write_mode = write_mode 1136 self.write_mode = write_mode
  1137 + self.path_encoding = path_encoding
1144 self._filesize = None 1138 self._filesize = None
1145 self.fp = None 1139 self.fp = None
1146 if filename: 1140 if filename:
@@ -1171,6 +1165,25 @@ class OleFileIO: @@ -1171,6 +1165,25 @@ class OleFileIO:
1171 self.parsing_issues.append((exception_type, message)) 1165 self.parsing_issues.append((exception_type, message))
1172 1166
1173 1167
  1168 + def _decode_utf16_str(self, utf16_str, errors='replace'):
  1169 + """
  1170 + Decode a string encoded in UTF-16 LE format, as found in the OLE
  1171 + directory or in property streams. Return a string encoded
  1172 + according to the path_encoding specified for the OleFileIO object.
  1173 +
  1174 + :param utf16_str: bytes string encoded in UTF-16 LE format
  1175 + :param errors: str, see python documentation for str.decode()
  1176 + :return: str, encoded according to path_encoding
  1177 + """
  1178 + unicode_str = utf16_str.decode('UTF-16LE', errors)
  1179 + if self.path_encoding:
  1180 + # an encoding has been specified for path names:
  1181 + return unicode_str.encode(self.path_encoding, errors)
  1182 + else:
  1183 + # path_encoding=None, return the Unicode string as-is:
  1184 + return unicode_str
  1185 +
  1186 +
1174 def open(self, filename, write_mode=False): 1187 def open(self, filename, write_mode=False):
1175 """ 1188 """
1176 Open an OLE2 file in read-only or read/write mode. 1189 Open an OLE2 file in read-only or read/write mode.
@@ -1813,6 +1826,7 @@ class OleFileIO: @@ -1813,6 +1826,7 @@ class OleFileIO:
1813 """ 1826 """
1814 prefix = prefix + [node.name] 1827 prefix = prefix + [node.name]
1815 for entry in node.kids: 1828 for entry in node.kids:
  1829 + #TODO: fix bug here, check entry type, a storage can have no kids
1816 if entry.kids: 1830 if entry.kids:
1817 # this is a storage 1831 # this is a storage
1818 if storages: 1832 if storages:
@@ -2060,6 +2074,7 @@ class OleFileIO: @@ -2060,6 +2074,7 @@ class OleFileIO:
2060 2074
2061 :returns: a dictionary of values indexed by id (integer) 2075 :returns: a dictionary of values indexed by id (integer)
2062 """ 2076 """
  2077 + #REFERENCE: [MS-OLEPS] https://msdn.microsoft.com/en-us/library/dd942421.aspx
2063 # make sure no_conversion is a list, just to simplify code below: 2078 # make sure no_conversion is a list, just to simplify code below:
2064 if no_conversion == None: 2079 if no_conversion == None:
2065 no_conversion = [] 2080 no_conversion = []
@@ -2140,7 +2155,7 @@ class OleFileIO: @@ -2140,7 +2155,7 @@ class OleFileIO:
2140 # "the string should NOT contain embedded or additional trailing 2155 # "the string should NOT contain embedded or additional trailing
2141 # null characters." 2156 # null characters."
2142 count = i32(s, offset+4) 2157 count = i32(s, offset+4)
2143 - value = _unicode(s[offset+8:offset+8+count*2]) 2158 + value = self._decode_utf16_str(s[offset+8:offset+8+count*2])
2144 elif type == VT_FILETIME: 2159 elif type == VT_FILETIME:
2145 value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32) 2160 value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32)
2146 # FILETIME is a 64-bit int: "number of 100ns periods 2161 # FILETIME is a 64-bit int: "number of 100ns periods