Commit 7a9cb92267c7fbe1339f687afdeffb0ff44a2903

Authored by Philippe Lagadec
1 parent 1d05bbab

improved olefile to specify the encoding for path names, changed default to UTF-8 on python 2.x to support non-Latin1 code pages
oletools/olevba.py
... ... @@ -774,6 +774,7 @@ def _extract_vba (ole, vba_root, project_path, dir_path):
774 774 # case-insensitive search in the code_modules dict to find the file extension:
775 775 filext = code_modules.get(MODULENAME_ModuleName.lower(), 'bin')
776 776 filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext)
  777 + #TODO: also yield the codepage so that callers can decode it properly
777 778 yield (code_path, filename, code_data)
778 779 # print '-'*79
779 780 # print filename
... ... @@ -972,7 +973,8 @@ class VBA_Parser(object):
972 973 if olefile.isOleFile(_file):
973 974 # This looks like an OLE file
974 975 logging.info('Parsing OLE file %s' % self.filename)
975   - self.ole_file = olefile.OleFileIO(_file)
  976 + # Open and parse the OLE file, using unicode for path names:
  977 + self.ole_file = olefile.OleFileIO(_file, path_encoding=None)
976 978 self.type = TYPE_OLE
977 979 #TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet
978 980 elif zipfile.is_zipfile(_file):
... ...
oletools/thirdparty/olefile/olefile.py
1 1 #!/usr/bin/env python
2 2  
3   -# olefile (formerly OleFileIO_PL) version 0.41 2014-11-25
  3 +# olefile (formerly OleFileIO_PL) version 0.42 2015-01-24
4 4 #
5 5 # Module to read/write Microsoft OLE2 files (also called Structured Storage or
6 6 # Microsoft Compound Document File Format), such as Microsoft Office 97-2003
... ... @@ -9,7 +9,7 @@
9 9 #
10 10 # Project website: http://www.decalage.info/olefile
11 11 #
12   -# olefile is copyright (c) 2005-2014 Philippe Lagadec (http://www.decalage.info)
  12 +# olefile is copyright (c) 2005-2015 Philippe Lagadec (http://www.decalage.info)
13 13 #
14 14 # olefile is based on the OleFileIO module from the PIL library v1.1.6
15 15 # See: http://www.pythonware.com/products/pil/index.htm
... ... @@ -29,12 +29,12 @@ from __future__ import print_function # This version of olefile requires Pytho
29 29  
30 30  
31 31 __author__ = "Philippe Lagadec"
32   -__date__ = "2014-11-25"
33   -__version__ = '0.41'
  32 +__date__ = "2015-01-24"
  33 +__version__ = '0.42'
34 34  
35 35 #--- LICENSE ------------------------------------------------------------------
36 36  
37   -# olefile (formerly OleFileIO_PL) is copyright (c) 2005-2014 Philippe Lagadec
  37 +# olefile (formerly OleFileIO_PL) is copyright (c) 2005-2015 Philippe Lagadec
38 38 # (http://www.decalage.info)
39 39 #
40 40 # All rights reserved.
... ... @@ -177,6 +177,9 @@ __version__ = '0.41'
177 177 # 2014-11-13 v0.41 PL: - improved isOleFile and OleFileIO.open to support OLE
178 178 # data in a string buffer and file-like objects.
179 179 # 2014-11-21 PL: - updated comments according to Pillow's commits
  180 +# 2015-01-24 v0.42 PL: - changed the default path name encoding from Latin-1
  181 +# to UTF-8 on Python 2.x (Unicode on Python 3.x)
  182 +# - added path_encoding option to override the default
180 183  
181 184 #-----------------------------------------------------------------------------
182 185 # TODO (for version 1.0):
... ... @@ -314,6 +317,14 @@ except NameError:
314 317 # if False (default PIL behaviour), all filenames are converted to Latin-1.
315 318 KEEP_UNICODE_NAMES = True
316 319  
  320 +if sys.version_info[0] < 3:
  321 + # On Python 2.x, the default encoding for path names is UTF-8:
  322 + DEFAULT_PATH_ENCODING = 'utf-8'
  323 +else:
  324 + # On Python 3.x, the default encoding for path names is Unicode (None):
  325 + DEFAULT_PATH_ENCODING = None
  326 +
  327 +
317 328 #=== DEBUGGING ===============================================================
318 329  
319 330 #TODO: replace this by proper logging
... ... @@ -498,32 +509,6 @@ def _clsid(clsid):
498 509  
499 510  
500 511  
501   -# UNICODE support:
502   -# (necessary to handle storages/streams names which use Unicode)
503   -
504   -def _unicode(s, errors='replace'):
505   - """
506   - Map unicode string to Latin 1. (Python with Unicode support)
507   -
508   - :param s: UTF-16LE unicode string to convert to Latin-1
509   - :param errors: 'replace', 'ignore' or 'strict'.
510   - """
511   - #TODO: test if it OleFileIO works with Unicode strings, instead of
512   - # converting to Latin-1.
513   - try:
514   - # First the string is converted to plain Unicode:
515   - # (assuming it is encoded as UTF-16 little-endian)
516   - u = s.decode('UTF-16LE', errors)
517   - if bytes is not str or KEEP_UNICODE_NAMES:
518   - return u
519   - else:
520   - # Second the unicode string is converted to Latin-1
521   - return u.encode('latin_1', errors)
522   - except:
523   - # there was an error during Unicode to Latin-1 conversion:
524   - raise IOError('incorrect Unicode name')
525   -
526   -
527 512 def filetime2datetime(filetime):
528 513 """
529 514 convert FILETIME (64 bits int) to Python datetime.datetime
... ... @@ -910,8 +895,11 @@ class _OleDirectoryEntry:
910 895 namelength = 64
911 896 # only characters without ending null char are kept:
912 897 name = name[:(namelength-2)]
913   - # name is converted from unicode to Latin-1:
914   - self.name = _unicode(name)
  898 + #TODO: check if the name is actually followed by a null unicode character ([MS-CFB] 2.6.1)
  899 + #TODO: check if the name does not contain forbidden characters:
  900 + # [MS-CFB] 2.6.1: "The following characters are illegal and MUST NOT be part of the name: '/', '\', ':', '!'."
  901 + # name is converted from UTF-16LE to the path encoding specified in the OleFileIO:
  902 + self.name = olefile._decode_utf16_str(name)
915 903  
916 904 debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name)))
917 905 debug(' - type: %d' % self.entry_type)
... ... @@ -1112,7 +1100,7 @@ class OleFileIO:
1112 1100 """
1113 1101  
1114 1102 def __init__(self, filename=None, raise_defects=DEFECT_FATAL,
1115   - write_mode=False, debug=False):
  1103 + write_mode=False, debug=False, path_encoding=DEFAULT_PATH_ENCODING):
1116 1104 """
1117 1105 Constructor for the OleFileIO class.
1118 1106  
... ... @@ -1133,6 +1121,11 @@ class OleFileIO:
1133 1121 of read-only by default.
1134 1122  
1135 1123 :param debug: bool, set debug mode
  1124 +
  1125 + :param path_encoding: None or str, name of the codec to use for path
  1126 + names (streams and storages), or None for Unicode.
  1127 + Unicode by default on Python 3+, UTF-8 on Python 2.x.
  1128 + (new in olefile 0.42, was hardcoded to Latin-1 until olefile v0.41)
1136 1129 """
1137 1130 set_debug_mode(debug)
1138 1131 # minimal level for defects to be raised as exceptions:
... ... @@ -1141,6 +1134,7 @@ class OleFileIO:
1141 1134 # tuples of (exception type, message)
1142 1135 self.parsing_issues = []
1143 1136 self.write_mode = write_mode
  1137 + self.path_encoding = path_encoding
1144 1138 self._filesize = None
1145 1139 self.fp = None
1146 1140 if filename:
... ... @@ -1171,6 +1165,25 @@ class OleFileIO:
1171 1165 self.parsing_issues.append((exception_type, message))
1172 1166  
1173 1167  
  1168 + def _decode_utf16_str(self, utf16_str, errors='replace'):
  1169 + """
  1170 + Decode a string encoded in UTF-16 LE format, as found in the OLE
  1171 + directory or in property streams. Return a string encoded
  1172 + according to the path_encoding specified for the OleFileIO object.
  1173 +
  1174 + :param utf16_str: bytes string encoded in UTF-16 LE format
  1175 + :param errors: str, see python documentation for str.decode()
  1176 + :return: str, encoded according to path_encoding
  1177 + """
  1178 + unicode_str = utf16_str.decode('UTF-16LE', errors)
  1179 + if self.path_encoding:
  1180 + # an encoding has been specified for path names:
  1181 + return unicode_str.encode(self.path_encoding, errors)
  1182 + else:
  1183 + # path_encoding=None, return the Unicode string as-is:
  1184 + return unicode_str
  1185 +
  1186 +
1174 1187 def open(self, filename, write_mode=False):
1175 1188 """
1176 1189 Open an OLE2 file in read-only or read/write mode.
... ... @@ -1813,6 +1826,7 @@ class OleFileIO:
1813 1826 """
1814 1827 prefix = prefix + [node.name]
1815 1828 for entry in node.kids:
  1829 + #TODO: fix bug here, check entry type, a storage can have no kids
1816 1830 if entry.kids:
1817 1831 # this is a storage
1818 1832 if storages:
... ... @@ -2060,6 +2074,7 @@ class OleFileIO:
2060 2074  
2061 2075 :returns: a dictionary of values indexed by id (integer)
2062 2076 """
  2077 + #REFERENCE: [MS-OLEPS] https://msdn.microsoft.com/en-us/library/dd942421.aspx
2063 2078 # make sure no_conversion is a list, just to simplify code below:
2064 2079 if no_conversion == None:
2065 2080 no_conversion = []
... ... @@ -2140,7 +2155,7 @@ class OleFileIO:
2140 2155 # "the string should NOT contain embedded or additional trailing
2141 2156 # null characters."
2142 2157 count = i32(s, offset+4)
2143   - value = _unicode(s[offset+8:offset+8+count*2])
  2158 + value = self._decode_utf16_str(s[offset+8:offset+8+count*2])
2144 2159 elif type == VT_FILETIME:
2145 2160 value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32)
2146 2161 # FILETIME is a 64-bit int: "number of 100ns periods
... ...