Commit 649f2e0f2e91fe55a93df7ed8a7ca5bd97ff5c2f
Merged in christian_intra2net/oletools_json/ppt-parser (pull request #10)
Ppt parser
Showing 6 changed files with 3231 additions and 2939 deletions
oletools/README.html
| ... | ... | @@ -14,7 +14,8 @@ |
| 14 | 14 | <p>Note: python-oletools is not related to OLETools published by BeCubed Software.</p> |
| 15 | 15 | <h2 id="news">News</h2> |
| 16 | 16 | <ul> |
| 17 | -<li><strong>2016-04-12 v0.45</strong>: improved <a href="https://bitbucket.org/decalage/oletools/wiki/rtfobj">rtfobj</a> to handle several <a href="http://www.decalage.info/rtf_tricks">anti-analysis tricks</a>, improved <a href="https://bitbucket.org/decalage/oletools/wiki/olevba">olevba</a> to export results in JSON format.</li> | |
| 17 | +<li><strong>2016-04-19 v0.46</strong>: <a href="https://bitbucket.org/decalage/oletools/wiki/olevba">olevba</a> does not deobfuscate VBA expressions by default (much faster), new option --deobf to enable it. Fixed color display bug on Windows for several tools.</li> | |
| 18 | +<li>2016-04-12 v0.45: improved <a href="https://bitbucket.org/decalage/oletools/wiki/rtfobj">rtfobj</a> to handle several <a href="http://www.decalage.info/rtf_tricks">anti-analysis tricks</a>, improved <a href="https://bitbucket.org/decalage/oletools/wiki/olevba">olevba</a> to export results in JSON format.</li> | |
| 18 | 19 | <li>2016-03-11 v0.44: improved <a href="https://bitbucket.org/decalage/oletools/wiki/olevba">olevba</a> to extract and analyse strings from VBA Forms.</li> |
| 19 | 20 | <li>2016-03-04 v0.43: added new tool MacroRaptor (mraptor) to detect malicious macros, bugfix and slight improvements in <a href="https://bitbucket.org/decalage/oletools/wiki/olevba">olevba</a>.</li> |
| 20 | 21 | <li>2016-02-07 v0.42: added two new tools oledir and olemap, better handling of malformed files and several bugfixes in <a href="https://bitbucket.org/decalage/oletools/wiki/olevba">olevba</a>, improved display for <a href="https://bitbucket.org/decalage/oletools/wiki/olemeta">olemeta</a>.</li> | ... | ... |
oletools/README.rst
| ... | ... | @@ -26,7 +26,12 @@ Software. |
| 26 | 26 | News |
| 27 | 27 | ---- |
| 28 | 28 | |
| 29 | -- **2016-04-12 v0.45**: improved | |
| 29 | +- **2016-04-19 v0.46**: | |
| 30 | + `olevba <https://bitbucket.org/decalage/oletools/wiki/olevba>`__ does | |
| 31 | + not deobfuscate VBA expressions by default (much faster), new option | |
| 32 | + --deobf to enable it. Fixed color display bug on Windows for several | |
| 33 | + tools. | |
| 34 | +- 2016-04-12 v0.45: improved | |
| 30 | 35 | `rtfobj <https://bitbucket.org/decalage/oletools/wiki/rtfobj>`__ to |
| 31 | 36 | handle several `anti-analysis |
| 32 | 37 | tricks <http://www.decalage.info/rtf_tricks>`__, improved | ... | ... |
oletools/olevba.py
| ... | ... | @@ -77,7 +77,7 @@ https://github.com/unixfreak0037/officeparser |
| 77 | 77 | # CHANGELOG: |
| 78 | 78 | # 2014-08-05 v0.01 PL: - first version based on officeparser code |
| 79 | 79 | # 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser |
| 80 | -# 2014-08-15 PL: - fixed incorrect value check in PROJECTHELPFILEPATH Record | |
| 80 | +# 2014-08-15 PL: - fixed incorrect value check in projecthelpfilepath Record | |
| 81 | 81 | # 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats |
| 82 | 82 | # and to find the VBA project root anywhere in the file |
| 83 | 83 | # 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL |
| ... | ... | @@ -169,8 +169,12 @@ https://github.com/unixfreak0037/officeparser |
| 169 | 169 | # 2016-03-16 CH: - added option --no-deobfuscate (temporary) |
| 170 | 170 | # 2016-04-19 v0.46 PL: - new option --deobf instead of --no-deobfuscate |
| 171 | 171 | # - updated suspicious keywords |
| 172 | +# 2016-05-04 v0.47 PL: - look for VBA code in any stream including orphans | |
| 173 | +# 2016-04-28 CH: - return an exit code depending on the results | |
| 174 | +# - improved error and exception handling | |
| 175 | +# - improved JSON output | |
| 172 | 176 | |
| 173 | -__version__ = '0.46' | |
| 177 | +__version__ = '0.47' | |
| 174 | 178 | |
| 175 | 179 | #------------------------------------------------------------------------------ |
| 176 | 180 | # TODO: |
| ... | ... | @@ -211,10 +215,8 @@ import math |
| 211 | 215 | import zipfile |
| 212 | 216 | import re |
| 213 | 217 | import optparse |
| 214 | -import os.path | |
| 215 | 218 | import binascii |
| 216 | 219 | import base64 |
| 217 | -import traceback | |
| 218 | 220 | import zlib |
| 219 | 221 | import email # for MHTML parsing |
| 220 | 222 | import string # for printable |
| ... | ... | @@ -291,8 +293,51 @@ def get_logger(name, level=logging.CRITICAL+1): |
| 291 | 293 | log = get_logger('olevba') |
| 292 | 294 | |
| 293 | 295 | |
| 296 | +#=== EXCEPTIONS ============================================================== | |
| 297 | + | |
| 298 | +class FileOpenError(Exception): | |
| 299 | + """ raised by VBA_Parser constructor if all open_... attempts failed | |
| 300 | + | |
| 301 | + probably means the file type is not supported | |
| 302 | + """ | |
| 303 | + | |
| 304 | + def __init__(self, filename): | |
| 305 | + super(FileOpenError, self).__init__( | |
| 306 | + 'Failed to open file %s ... probably not supported' % filename) | |
| 307 | + self.filename = filename | |
| 308 | + | |
| 309 | + | |
| 310 | +class ProcessingError(Exception): | |
| 311 | + """ raised by VBA_Parser.process_file* functions """ | |
| 312 | + | |
| 313 | + def __init__(self, filename, orig_exception): | |
| 314 | + super(ProcessingError, self).__init__( | |
| 315 | + 'Error processing file %s (%s)' % (filename, orig_exception)) | |
| 316 | + self.filename = filename | |
| 317 | + self.orig_exception = orig_exception | |
| 318 | + | |
| 319 | + | |
| 320 | +class MsoExtractionError(RuntimeError): | |
| 321 | + """ raised by mso_file_extract if parsing MSO/ActiveMIME data failed """ | |
| 322 | + | |
| 323 | + def __init__(self, msg): | |
| 324 | + super(MsoExtractionError, self).__init__(msg) | |
| 325 | + self.msg = msg | |
| 326 | + | |
| 327 | + | |
| 294 | 328 | #--- CONSTANTS ---------------------------------------------------------------- |
| 295 | 329 | |
| 330 | +# return codes | |
| 331 | +RETURN_OK = 0 | |
| 332 | +RETURN_WARNINGS = 1 # (reserved, not used yet) | |
| 333 | +RETURN_WRONG_ARGS = 2 # (fixed, built into optparse) | |
| 334 | +RETURN_FILE_NOT_FOUND = 3 | |
| 335 | +RETURN_XGLOB_ERR = 4 | |
| 336 | +RETURN_OPEN_ERROR = 5 | |
| 337 | +RETURN_PARSE_ERROR = 6 | |
| 338 | +RETURN_SEVERAL_ERRS = 7 | |
| 339 | +RETURN_UNEXPECTED = 8 | |
| 340 | + | |
| 296 | 341 | # URL and message to report issues: |
| 297 | 342 | URL_OLEVBA_ISSUES = 'https://bitbucket.org/decalage/oletools/issues' |
| 298 | 343 | MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES |
| ... | ... | @@ -376,7 +421,7 @@ SUSPICIOUS_KEYWORDS = { |
| 376 | 421 | #ADODB.Stream sample: http://pastebin.com/Z4TMyuq6 |
| 377 | 422 | 'May run an executable file or a system command': |
| 378 | 423 | ('Shell', 'vbNormal', 'vbNormalFocus', 'vbHide', 'vbMinimizedFocus', 'vbMaximizedFocus', 'vbNormalNoFocus', |
| 379 | - 'vbMinimizedNoFocus', 'WScript.Shell', 'Run'), | |
| 424 | + 'vbMinimizedNoFocus', 'WScript.Shell', 'Run', 'ShellExecute'), | |
| 380 | 425 | #Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx |
| 381 | 426 | #WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6 |
| 382 | 427 | 'May run PowerShell commands': |
| ... | ... | @@ -848,36 +893,37 @@ def mso_file_extract(data): |
| 848 | 893 | :param data: bytes string, MSO/ActiveMime file content |
| 849 | 894 | :return: bytes string, extracted data (uncompressed) |
| 850 | 895 | |
| 851 | - raise a RuntimeError if the data cannot be extracted | |
| 896 | + raise a MsoExtractionError if the data cannot be extracted | |
| 852 | 897 | """ |
| 853 | 898 | # check the magic: |
| 854 | 899 | assert is_mso_file(data) |
| 900 | + | |
| 901 | + # In all the samples seen so far, Word always uses an offset of 0x32, | |
| 902 | + # and Excel 0x22A. But we read the offset from the header to be more | |
| 903 | + # generic. | |
| 904 | + offsets = [0x32, 0x22A] | |
| 905 | + | |
| 855 | 906 | # First, attempt to get the compressed data offset from the header |
| 856 | 907 | # According to my tests, it should be an unsigned 16 bits integer, |
| 857 | 908 | # at offset 0x1E (little endian) + add 46: |
| 858 | 909 | try: |
| 859 | 910 | offset = struct.unpack_from('<H', data, offset=0x1E)[0] + 46 |
| 860 | 911 | log.debug('Parsing MSO file: data offset = 0x%X' % offset) |
| 861 | - except KeyboardInterrupt: | |
| 862 | - # do not ignore exceptions when the user presses Ctrl+C/Pause: | |
| 863 | - raise | |
| 864 | - except: | |
| 865 | - log.exception('Unable to parse MSO/ActiveMime file header') | |
| 866 | - raise RuntimeError('Unable to parse MSO/ActiveMime file header') | |
| 867 | - # In all the samples seen so far, Word always uses an offset of 0x32, | |
| 868 | - # and Excel 0x22A. But we read the offset from the header to be more | |
| 869 | - # generic. | |
| 870 | - # Let's try that offset, then 0x32 and 0x22A, just in case: | |
| 871 | - for start in (offset, 0x32, 0x22A): | |
| 912 | + offsets.insert(0, offset) # insert at beginning of offsets | |
| 913 | + except struct.error as exc: | |
| 914 | + log.info('Unable to parse MSO/ActiveMime file header (%s)' % exc) | |
| 915 | + log.debug('Trace:', exc_info=True) | |
| 916 | + raise MsoExtractionError('Unable to parse MSO/ActiveMime file header') | |
| 917 | + # now try offsets | |
| 918 | + for start in offsets: | |
| 872 | 919 | try: |
| 873 | 920 | log.debug('Attempting zlib decompression from MSO file offset 0x%X' % start) |
| 874 | 921 | extracted_data = zlib.decompress(data[start:]) |
| 875 | 922 | return extracted_data |
| 876 | - except KeyboardInterrupt: | |
| 877 | - # do not ignore exceptions when the user presses Ctrl+C/Pause: | |
| 878 | - raise | |
| 879 | - except: | |
| 880 | - log.exception('zlib decompression failed') | |
| 923 | + except zlib.error as exc: | |
| 924 | + log.info('zlib decompression failed for offset %s (%s)' | |
| 925 | + % (start, exc)) | |
| 926 | + log.debug('Trace:', exc_info=True) | |
| 881 | 927 | # None of the guessed offsets worked, let's try brute-forcing by looking |
| 882 | 928 | # for potential zlib-compressed blocks starting with 0x78: |
| 883 | 929 | log.debug('Looking for potential zlib-compressed blocks in MSO file') |
| ... | ... | @@ -887,12 +933,10 @@ def mso_file_extract(data): |
| 887 | 933 | log.debug('Attempting zlib decompression from MSO file offset 0x%X' % start) |
| 888 | 934 | extracted_data = zlib.decompress(data[start:]) |
| 889 | 935 | return extracted_data |
| 890 | - except KeyboardInterrupt: | |
| 891 | - # do not ignore exceptions when the user presses Ctrl+C/Pause: | |
| 892 | - raise | |
| 893 | - except: | |
| 894 | - log.exception('zlib decompression failed') | |
| 895 | - raise RuntimeError('Unable to decompress data from a MSO/ActiveMime file') | |
| 936 | + except zlib.error as exc: | |
| 937 | + log.info('zlib decompression failed (%s)' % exc) | |
| 938 | + log.debug('Trace:', exc_info=True) | |
| 939 | + raise MsoExtractionError('Unable to decompress data from a MSO/ActiveMime file') | |
| 896 | 940 | |
| 897 | 941 | |
| 898 | 942 | #--- FUNCTIONS ---------------------------------------------------------------- |
| ... | ... | @@ -913,29 +957,6 @@ def is_printable(s): |
| 913 | 957 | return set(s).issubset(_PRINTABLE_SET) |
| 914 | 958 | |
| 915 | 959 | |
| 916 | -def print_json(j): | |
| 917 | - """ | |
| 918 | - Print a dictionary, a list or any other object to stdout | |
| 919 | - :param j: object to be printed | |
| 920 | - :return: | |
| 921 | - """ | |
| 922 | - if isinstance(j, dict): | |
| 923 | - for key, val in j.items(): | |
| 924 | - print_json(key) | |
| 925 | - print_json(val) | |
| 926 | - elif isinstance(j, list): | |
| 927 | - for elem in j: | |
| 928 | - print_json(elem) | |
| 929 | - else: | |
| 930 | - try: | |
| 931 | - if len(j) > 20: | |
| 932 | - print type(j), repr(j[:20]), '...(len {0})'.format(len(j)) | |
| 933 | - else: | |
| 934 | - print type(j), repr(j) | |
| 935 | - except TypeError: | |
| 936 | - print type(j), repr(j) | |
| 937 | - | |
| 938 | - | |
| 939 | 960 | def copytoken_help(decompressed_current, decompressed_chunk_start): |
| 940 | 961 | """ |
| 941 | 962 | compute bit masks to decode a CopyToken according to MS-OVBA 2.4.1.3.19.1 CopyToken Help |
| ... | ... | @@ -1059,7 +1080,7 @@ def decompress_stream(compressed_container): |
| 1059 | 1080 | copy_token = \ |
| 1060 | 1081 | struct.unpack("<H", compressed_container[compressed_current:compressed_current + 2])[0] |
| 1061 | 1082 | #TODO: check this |
| 1062 | - length_mask, offset_mask, bit_count, maximum_length = copytoken_help( | |
| 1083 | + length_mask, offset_mask, bit_count, _ = copytoken_help( | |
| 1063 | 1084 | len(decompressed_container), decompressed_chunk_start) |
| 1064 | 1085 | length = (copy_token & length_mask) + 3 |
| 1065 | 1086 | temp1 = copy_token & offset_mask |
| ... | ... | @@ -1138,122 +1159,130 @@ def _extract_vba(ole, vba_root, project_path, dir_path): |
| 1138 | 1159 | dir_stream = cStringIO.StringIO(decompress_stream(dir_compressed)) |
| 1139 | 1160 | |
| 1140 | 1161 | # PROJECTSYSKIND Record |
| 1141 | - PROJECTSYSKIND_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1142 | - check_value('PROJECTSYSKIND_Id', 0x0001, PROJECTSYSKIND_Id) | |
| 1143 | - PROJECTSYSKIND_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1144 | - check_value('PROJECTSYSKIND_Size', 0x0004, PROJECTSYSKIND_Size) | |
| 1145 | - PROJECTSYSKIND_SysKind = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1146 | - if PROJECTSYSKIND_SysKind == 0x00: | |
| 1162 | + projectsyskind_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1163 | + check_value('PROJECTSYSKIND_Id', 0x0001, projectsyskind_id) | |
| 1164 | + projectsyskind_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1165 | + check_value('PROJECTSYSKIND_Size', 0x0004, projectsyskind_size) | |
| 1166 | + projectsyskind_syskind = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1167 | + if projectsyskind_syskind == 0x00: | |
| 1147 | 1168 | log.debug("16-bit Windows") |
| 1148 | - elif PROJECTSYSKIND_SysKind == 0x01: | |
| 1169 | + elif projectsyskind_syskind == 0x01: | |
| 1149 | 1170 | log.debug("32-bit Windows") |
| 1150 | - elif PROJECTSYSKIND_SysKind == 0x02: | |
| 1171 | + elif projectsyskind_syskind == 0x02: | |
| 1151 | 1172 | log.debug("Macintosh") |
| 1152 | - elif PROJECTSYSKIND_SysKind == 0x03: | |
| 1173 | + elif projectsyskind_syskind == 0x03: | |
| 1153 | 1174 | log.debug("64-bit Windows") |
| 1154 | 1175 | else: |
| 1155 | - log.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(PROJECTSYSKIND_SysKind)) | |
| 1176 | + log.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(projectsyskind_syskind)) | |
| 1156 | 1177 | |
| 1157 | 1178 | # PROJECTLCID Record |
| 1158 | - PROJECTLCID_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1159 | - check_value('PROJECTLCID_Id', 0x0002, PROJECTLCID_Id) | |
| 1160 | - PROJECTLCID_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1161 | - check_value('PROJECTLCID_Size', 0x0004, PROJECTLCID_Size) | |
| 1162 | - PROJECTLCID_Lcid = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1163 | - check_value('PROJECTLCID_Lcid', 0x409, PROJECTLCID_Lcid) | |
| 1179 | + projectlcid_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1180 | + check_value('PROJECTLCID_Id', 0x0002, projectlcid_id) | |
| 1181 | + projectlcid_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1182 | + check_value('PROJECTLCID_Size', 0x0004, projectlcid_size) | |
| 1183 | + projectlcid_lcid = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1184 | + check_value('PROJECTLCID_Lcid', 0x409, projectlcid_lcid) | |
| 1164 | 1185 | |
| 1165 | 1186 | # PROJECTLCIDINVOKE Record |
| 1166 | - PROJECTLCIDINVOKE_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1167 | - check_value('PROJECTLCIDINVOKE_Id', 0x0014, PROJECTLCIDINVOKE_Id) | |
| 1168 | - PROJECTLCIDINVOKE_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1169 | - check_value('PROJECTLCIDINVOKE_Size', 0x0004, PROJECTLCIDINVOKE_Size) | |
| 1170 | - PROJECTLCIDINVOKE_LcidInvoke = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1171 | - check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, PROJECTLCIDINVOKE_LcidInvoke) | |
| 1187 | + projectlcidinvoke_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1188 | + check_value('PROJECTLCIDINVOKE_Id', 0x0014, projectlcidinvoke_id) | |
| 1189 | + projectlcidinvoke_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1190 | + check_value('PROJECTLCIDINVOKE_Size', 0x0004, projectlcidinvoke_size) | |
| 1191 | + projectlcidinvoke_lcidinvoke = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1192 | + check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, projectlcidinvoke_lcidinvoke) | |
| 1172 | 1193 | |
| 1173 | 1194 | # PROJECTCODEPAGE Record |
| 1174 | - PROJECTCODEPAGE_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1175 | - check_value('PROJECTCODEPAGE_Id', 0x0003, PROJECTCODEPAGE_Id) | |
| 1176 | - PROJECTCODEPAGE_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1177 | - check_value('PROJECTCODEPAGE_Size', 0x0002, PROJECTCODEPAGE_Size) | |
| 1178 | - PROJECTCODEPAGE_CodePage = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1195 | + projectcodepage_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1196 | + check_value('PROJECTCODEPAGE_Id', 0x0003, projectcodepage_id) | |
| 1197 | + projectcodepage_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1198 | + check_value('PROJECTCODEPAGE_Size', 0x0002, projectcodepage_size) | |
| 1199 | + projectcodepage_codepage = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1179 | 1200 | |
| 1180 | 1201 | # PROJECTNAME Record |
| 1181 | - PROJECTNAME_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1182 | - check_value('PROJECTNAME_Id', 0x0004, PROJECTNAME_Id) | |
| 1183 | - PROJECTNAME_SizeOfProjectName = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1184 | - if PROJECTNAME_SizeOfProjectName < 1 or PROJECTNAME_SizeOfProjectName > 128: | |
| 1185 | - log.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(PROJECTNAME_SizeOfProjectName)) | |
| 1186 | - PROJECTNAME_ProjectName = dir_stream.read(PROJECTNAME_SizeOfProjectName) | |
| 1202 | + projectname_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1203 | + check_value('PROJECTNAME_Id', 0x0004, projectname_id) | |
| 1204 | + projectname_sizeof_projectname = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1205 | + if projectname_sizeof_projectname < 1 or projectname_sizeof_projectname > 128: | |
| 1206 | + log.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(projectname_sizeof_projectname)) | |
| 1207 | + projectname_projectname = dir_stream.read(projectname_sizeof_projectname) | |
| 1208 | + unused = projectname_projectname | |
| 1187 | 1209 | |
| 1188 | 1210 | # PROJECTDOCSTRING Record |
| 1189 | - PROJECTDOCSTRING_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1190 | - check_value('PROJECTDOCSTRING_Id', 0x0005, PROJECTDOCSTRING_Id) | |
| 1191 | - PROJECTDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1192 | - if PROJECTNAME_SizeOfProjectName > 2000: | |
| 1211 | + projectdocstring_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1212 | + check_value('PROJECTDOCSTRING_Id', 0x0005, projectdocstring_id) | |
| 1213 | + projectdocstring_sizeof_docstring = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1214 | + if projectdocstring_sizeof_docstring > 2000: | |
| 1193 | 1215 | log.error( |
| 1194 | - "PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(PROJECTDOCSTRING_SizeOfDocString)) | |
| 1195 | - PROJECTDOCSTRING_DocString = dir_stream.read(PROJECTDOCSTRING_SizeOfDocString) | |
| 1196 | - PROJECTDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1197 | - check_value('PROJECTDOCSTRING_Reserved', 0x0040, PROJECTDOCSTRING_Reserved) | |
| 1198 | - PROJECTDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1199 | - if PROJECTDOCSTRING_SizeOfDocStringUnicode % 2 != 0: | |
| 1216 | + "PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(projectdocstring_sizeof_docstring)) | |
| 1217 | + projectdocstring_docstring = dir_stream.read(projectdocstring_sizeof_docstring) | |
| 1218 | + projectdocstring_reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1219 | + check_value('PROJECTDOCSTRING_Reserved', 0x0040, projectdocstring_reserved) | |
| 1220 | + projectdocstring_sizeof_docstring_unicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1221 | + if projectdocstring_sizeof_docstring_unicode % 2 != 0: | |
| 1200 | 1222 | log.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even") |
| 1201 | - PROJECTDOCSTRING_DocStringUnicode = dir_stream.read(PROJECTDOCSTRING_SizeOfDocStringUnicode) | |
| 1223 | + projectdocstring_docstring_unicode = dir_stream.read(projectdocstring_sizeof_docstring_unicode) | |
| 1224 | + unused = projectdocstring_docstring | |
| 1225 | + unused = projectdocstring_docstring_unicode | |
| 1202 | 1226 | |
| 1203 | 1227 | # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7 |
| 1204 | - PROJECTHELPFILEPATH_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1205 | - check_value('PROJECTHELPFILEPATH_Id', 0x0006, PROJECTHELPFILEPATH_Id) | |
| 1206 | - PROJECTHELPFILEPATH_SizeOfHelpFile1 = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1207 | - if PROJECTHELPFILEPATH_SizeOfHelpFile1 > 260: | |
| 1228 | + projecthelpfilepath_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1229 | + check_value('PROJECTHELPFILEPATH_Id', 0x0006, projecthelpfilepath_id) | |
| 1230 | + projecthelpfilepath_sizeof_helpfile1 = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1231 | + if projecthelpfilepath_sizeof_helpfile1 > 260: | |
| 1208 | 1232 | log.error( |
| 1209 | - "PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(PROJECTHELPFILEPATH_SizeOfHelpFile1)) | |
| 1210 | - PROJECTHELPFILEPATH_HelpFile1 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile1) | |
| 1211 | - PROJECTHELPFILEPATH_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1212 | - check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, PROJECTHELPFILEPATH_Reserved) | |
| 1213 | - PROJECTHELPFILEPATH_SizeOfHelpFile2 = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1214 | - if PROJECTHELPFILEPATH_SizeOfHelpFile2 != PROJECTHELPFILEPATH_SizeOfHelpFile1: | |
| 1233 | + "PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(projecthelpfilepath_sizeof_helpfile1)) | |
| 1234 | + projecthelpfilepath_helpfile1 = dir_stream.read(projecthelpfilepath_sizeof_helpfile1) | |
| 1235 | + projecthelpfilepath_reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1236 | + check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, projecthelpfilepath_reserved) | |
| 1237 | + projecthelpfilepath_sizeof_helpfile2 = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1238 | + if projecthelpfilepath_sizeof_helpfile2 != projecthelpfilepath_sizeof_helpfile1: | |
| 1215 | 1239 | log.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2") |
| 1216 | - PROJECTHELPFILEPATH_HelpFile2 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile2) | |
| 1217 | - if PROJECTHELPFILEPATH_HelpFile2 != PROJECTHELPFILEPATH_HelpFile1: | |
| 1240 | + projecthelpfilepath_helpfile2 = dir_stream.read(projecthelpfilepath_sizeof_helpfile2) | |
| 1241 | + if projecthelpfilepath_helpfile2 != projecthelpfilepath_helpfile1: | |
| 1218 | 1242 | log.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2") |
| 1219 | 1243 | |
| 1220 | 1244 | # PROJECTHELPCONTEXT Record |
| 1221 | - PROJECTHELPCONTEXT_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1222 | - check_value('PROJECTHELPCONTEXT_Id', 0x0007, PROJECTHELPCONTEXT_Id) | |
| 1223 | - PROJECTHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1224 | - check_value('PROJECTHELPCONTEXT_Size', 0x0004, PROJECTHELPCONTEXT_Size) | |
| 1225 | - PROJECTHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1245 | + projecthelpcontext_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1246 | + check_value('PROJECTHELPCONTEXT_Id', 0x0007, projecthelpcontext_id) | |
| 1247 | + projecthelpcontext_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1248 | + check_value('PROJECTHELPCONTEXT_Size', 0x0004, projecthelpcontext_size) | |
| 1249 | + projecthelpcontext_helpcontext = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1250 | + unused = projecthelpcontext_helpcontext | |
| 1226 | 1251 | |
| 1227 | 1252 | # PROJECTLIBFLAGS Record |
| 1228 | - PROJECTLIBFLAGS_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1229 | - check_value('PROJECTLIBFLAGS_Id', 0x0008, PROJECTLIBFLAGS_Id) | |
| 1230 | - PROJECTLIBFLAGS_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1231 | - check_value('PROJECTLIBFLAGS_Size', 0x0004, PROJECTLIBFLAGS_Size) | |
| 1232 | - PROJECTLIBFLAGS_ProjectLibFlags = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1233 | - check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, PROJECTLIBFLAGS_ProjectLibFlags) | |
| 1253 | + projectlibflags_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1254 | + check_value('PROJECTLIBFLAGS_Id', 0x0008, projectlibflags_id) | |
| 1255 | + projectlibflags_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1256 | + check_value('PROJECTLIBFLAGS_Size', 0x0004, projectlibflags_size) | |
| 1257 | + projectlibflags_projectlibflags = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1258 | + check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, projectlibflags_projectlibflags) | |
| 1234 | 1259 | |
| 1235 | 1260 | # PROJECTVERSION Record |
| 1236 | - PROJECTVERSION_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1237 | - check_value('PROJECTVERSION_Id', 0x0009, PROJECTVERSION_Id) | |
| 1238 | - PROJECTVERSION_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1239 | - check_value('PROJECTVERSION_Reserved', 0x0004, PROJECTVERSION_Reserved) | |
| 1240 | - PROJECTVERSION_VersionMajor = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1241 | - PROJECTVERSION_VersionMinor = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1261 | + projectversion_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1262 | + check_value('PROJECTVERSION_Id', 0x0009, projectversion_id) | |
| 1263 | + projectversion_reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1264 | + check_value('PROJECTVERSION_Reserved', 0x0004, projectversion_reserved) | |
| 1265 | + projectversion_versionmajor = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1266 | + projectversion_versionminor = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1267 | + unused = projectversion_versionmajor | |
| 1268 | + unused = projectversion_versionminor | |
| 1242 | 1269 | |
| 1243 | 1270 | # PROJECTCONSTANTS Record |
| 1244 | - PROJECTCONSTANTS_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1245 | - check_value('PROJECTCONSTANTS_Id', 0x000C, PROJECTCONSTANTS_Id) | |
| 1246 | - PROJECTCONSTANTS_SizeOfConstants = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1247 | - if PROJECTCONSTANTS_SizeOfConstants > 1015: | |
| 1271 | + projectconstants_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1272 | + check_value('PROJECTCONSTANTS_Id', 0x000C, projectconstants_id) | |
| 1273 | + projectconstants_sizeof_constants = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1274 | + if projectconstants_sizeof_constants > 1015: | |
| 1248 | 1275 | log.error( |
| 1249 | - "PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(PROJECTCONSTANTS_SizeOfConstants)) | |
| 1250 | - PROJECTCONSTANTS_Constants = dir_stream.read(PROJECTCONSTANTS_SizeOfConstants) | |
| 1251 | - PROJECTCONSTANTS_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1252 | - check_value('PROJECTCONSTANTS_Reserved', 0x003C, PROJECTCONSTANTS_Reserved) | |
| 1253 | - PROJECTCONSTANTS_SizeOfConstantsUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1254 | - if PROJECTCONSTANTS_SizeOfConstantsUnicode % 2 != 0: | |
| 1276 | + "PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(projectconstants_sizeof_constants)) | |
| 1277 | + projectconstants_constants = dir_stream.read(projectconstants_sizeof_constants) | |
| 1278 | + projectconstants_reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1279 | + check_value('PROJECTCONSTANTS_Reserved', 0x003C, projectconstants_reserved) | |
| 1280 | + projectconstants_sizeof_constants_unicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1281 | + if projectconstants_sizeof_constants_unicode % 2 != 0: | |
| 1255 | 1282 | log.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even") |
| 1256 | - PROJECTCONSTANTS_ConstantsUnicode = dir_stream.read(PROJECTCONSTANTS_SizeOfConstantsUnicode) | |
| 1283 | + projectconstants_constants_unicode = dir_stream.read(projectconstants_sizeof_constants_unicode) | |
| 1284 | + unused = projectconstants_constants | |
| 1285 | + unused = projectconstants_constants_unicode | |
| 1257 | 1286 | |
| 1258 | 1287 | # array of REFERENCE records |
| 1259 | 1288 | check = None |
| ... | ... | @@ -1265,194 +1294,230 @@ def _extract_vba(ole, vba_root, project_path, dir_path): |
| 1265 | 1294 | |
| 1266 | 1295 | if check == 0x0016: |
| 1267 | 1296 | # REFERENCENAME |
| 1268 | - REFERENCE_Id = check | |
| 1269 | - REFERENCE_SizeOfName = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1270 | - REFERENCE_Name = dir_stream.read(REFERENCE_SizeOfName) | |
| 1271 | - REFERENCE_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1272 | - check_value('REFERENCE_Reserved', 0x003E, REFERENCE_Reserved) | |
| 1273 | - REFERENCE_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1274 | - REFERENCE_NameUnicode = dir_stream.read(REFERENCE_SizeOfNameUnicode) | |
| 1297 | + reference_id = check | |
| 1298 | + reference_sizeof_name = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1299 | + reference_name = dir_stream.read(reference_sizeof_name) | |
| 1300 | + reference_reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1301 | + check_value('REFERENCE_Reserved', 0x003E, reference_reserved) | |
| 1302 | + reference_sizeof_name_unicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1303 | + reference_name_unicode = dir_stream.read(reference_sizeof_name_unicode) | |
| 1304 | + unused = reference_id | |
| 1305 | + unused = reference_name | |
| 1306 | + unused = reference_name_unicode | |
| 1275 | 1307 | continue |
| 1276 | 1308 | |
| 1277 | 1309 | if check == 0x0033: |
| 1278 | 1310 | # REFERENCEORIGINAL (followed by REFERENCECONTROL) |
| 1279 | - REFERENCEORIGINAL_Id = check | |
| 1280 | - REFERENCEORIGINAL_SizeOfLibidOriginal = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1281 | - REFERENCEORIGINAL_LibidOriginal = dir_stream.read(REFERENCEORIGINAL_SizeOfLibidOriginal) | |
| 1311 | + referenceoriginal_id = check | |
| 1312 | + referenceoriginal_sizeof_libidoriginal = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1313 | + referenceoriginal_libidoriginal = dir_stream.read(referenceoriginal_sizeof_libidoriginal) | |
| 1314 | + unused = referenceoriginal_id | |
| 1315 | + unused = referenceoriginal_libidoriginal | |
| 1282 | 1316 | continue |
| 1283 | 1317 | |
| 1284 | 1318 | if check == 0x002F: |
| 1285 | 1319 | # REFERENCECONTROL |
| 1286 | - REFERENCECONTROL_Id = check | |
| 1287 | - REFERENCECONTROL_SizeTwiddled = struct.unpack("<L", dir_stream.read(4))[0] # ignore | |
| 1288 | - REFERENCECONTROL_SizeOfLibidTwiddled = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1289 | - REFERENCECONTROL_LibidTwiddled = dir_stream.read(REFERENCECONTROL_SizeOfLibidTwiddled) | |
| 1290 | - REFERENCECONTROL_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0] # ignore | |
| 1291 | - check_value('REFERENCECONTROL_Reserved1', 0x0000, REFERENCECONTROL_Reserved1) | |
| 1292 | - REFERENCECONTROL_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0] # ignore | |
| 1293 | - check_value('REFERENCECONTROL_Reserved2', 0x0000, REFERENCECONTROL_Reserved2) | |
| 1320 | + referencecontrol_id = check | |
| 1321 | + referencecontrol_sizetwiddled = struct.unpack("<L", dir_stream.read(4))[0] # ignore | |
| 1322 | + referencecontrol_sizeof_libidtwiddled = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1323 | + referencecontrol_libidtwiddled = dir_stream.read(referencecontrol_sizeof_libidtwiddled) | |
| 1324 | + referencecontrol_reserved1 = struct.unpack("<L", dir_stream.read(4))[0] # ignore | |
| 1325 | + check_value('REFERENCECONTROL_Reserved1', 0x0000, referencecontrol_reserved1) | |
| 1326 | + referencecontrol_reserved2 = struct.unpack("<H", dir_stream.read(2))[0] # ignore | |
| 1327 | + check_value('REFERENCECONTROL_Reserved2', 0x0000, referencecontrol_reserved2) | |
| 1328 | + unused = referencecontrol_id | |
| 1329 | + unused = referencecontrol_sizetwiddled | |
| 1330 | + unused = referencecontrol_libidtwiddled | |
| 1294 | 1331 | # optional field |
| 1295 | 1332 | check2 = struct.unpack("<H", dir_stream.read(2))[0] |
| 1296 | 1333 | if check2 == 0x0016: |
| 1297 | - REFERENCECONTROL_NameRecordExtended_Id = check | |
| 1298 | - REFERENCECONTROL_NameRecordExtended_SizeofName = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1299 | - REFERENCECONTROL_NameRecordExtended_Name = dir_stream.read( | |
| 1300 | - REFERENCECONTROL_NameRecordExtended_SizeofName) | |
| 1301 | - REFERENCECONTROL_NameRecordExtended_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1334 | + referencecontrol_namerecordextended_id = check | |
| 1335 | + referencecontrol_namerecordextended_sizeof_name = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1336 | + referencecontrol_namerecordextended_name = dir_stream.read( | |
| 1337 | + referencecontrol_namerecordextended_sizeof_name) | |
| 1338 | + referencecontrol_namerecordextended_reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1302 | 1339 | check_value('REFERENCECONTROL_NameRecordExtended_Reserved', 0x003E, |
| 1303 | - REFERENCECONTROL_NameRecordExtended_Reserved) | |
| 1304 | - REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1305 | - REFERENCECONTROL_NameRecordExtended_NameUnicode = dir_stream.read( | |
| 1306 | - REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode) | |
| 1307 | - REFERENCECONTROL_Reserved3 = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1340 | + referencecontrol_namerecordextended_reserved) | |
| 1341 | + referencecontrol_namerecordextended_sizeof_name_unicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1342 | + referencecontrol_namerecordextended_name_unicode = dir_stream.read( | |
| 1343 | + referencecontrol_namerecordextended_sizeof_name_unicode) | |
| 1344 | + referencecontrol_reserved3 = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1345 | + unused = referencecontrol_namerecordextended_id | |
| 1346 | + unused = referencecontrol_namerecordextended_name | |
| 1347 | + unused = referencecontrol_namerecordextended_name_unicode | |
| 1308 | 1348 | else: |
| 1309 | - REFERENCECONTROL_Reserved3 = check2 | |
| 1310 | - | |
| 1311 | - check_value('REFERENCECONTROL_Reserved3', 0x0030, REFERENCECONTROL_Reserved3) | |
| 1312 | - REFERENCECONTROL_SizeExtended = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1313 | - REFERENCECONTROL_SizeOfLibidExtended = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1314 | - REFERENCECONTROL_LibidExtended = dir_stream.read(REFERENCECONTROL_SizeOfLibidExtended) | |
| 1315 | - REFERENCECONTROL_Reserved4 = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1316 | - REFERENCECONTROL_Reserved5 = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1317 | - REFERENCECONTROL_OriginalTypeLib = dir_stream.read(16) | |
| 1318 | - REFERENCECONTROL_Cookie = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1349 | + referencecontrol_reserved3 = check2 | |
| 1350 | + | |
| 1351 | + check_value('REFERENCECONTROL_Reserved3', 0x0030, referencecontrol_reserved3) | |
| 1352 | + referencecontrol_sizeextended = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1353 | + referencecontrol_sizeof_libidextended = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1354 | + referencecontrol_libidextended = dir_stream.read(referencecontrol_sizeof_libidextended) | |
| 1355 | + referencecontrol_reserved4 = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1356 | + referencecontrol_reserved5 = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1357 | + referencecontrol_originaltypelib = dir_stream.read(16) | |
| 1358 | + referencecontrol_cookie = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1359 | + unused = referencecontrol_sizeextended | |
| 1360 | + unused = referencecontrol_libidextended | |
| 1361 | + unused = referencecontrol_reserved4 | |
| 1362 | + unused = referencecontrol_reserved5 | |
| 1363 | + unused = referencecontrol_originaltypelib | |
| 1364 | + unused = referencecontrol_cookie | |
| 1319 | 1365 | continue |
| 1320 | 1366 | |
| 1321 | 1367 | if check == 0x000D: |
| 1322 | 1368 | # REFERENCEREGISTERED |
| 1323 | - REFERENCEREGISTERED_Id = check | |
| 1324 | - REFERENCEREGISTERED_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1325 | - REFERENCEREGISTERED_SizeOfLibid = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1326 | - REFERENCEREGISTERED_Libid = dir_stream.read(REFERENCEREGISTERED_SizeOfLibid) | |
| 1327 | - REFERENCEREGISTERED_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1328 | - check_value('REFERENCEREGISTERED_Reserved1', 0x0000, REFERENCEREGISTERED_Reserved1) | |
| 1329 | - REFERENCEREGISTERED_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1330 | - check_value('REFERENCEREGISTERED_Reserved2', 0x0000, REFERENCEREGISTERED_Reserved2) | |
| 1369 | + referenceregistered_id = check | |
| 1370 | + referenceregistered_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1371 | + referenceregistered_sizeof_libid = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1372 | + referenceregistered_libid = dir_stream.read(referenceregistered_sizeof_libid) | |
| 1373 | + referenceregistered_reserved1 = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1374 | + check_value('REFERENCEREGISTERED_Reserved1', 0x0000, referenceregistered_reserved1) | |
| 1375 | + referenceregistered_reserved2 = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1376 | + check_value('REFERENCEREGISTERED_Reserved2', 0x0000, referenceregistered_reserved2) | |
| 1377 | + unused = referenceregistered_id | |
| 1378 | + unused = referenceregistered_size | |
| 1379 | + unused = referenceregistered_libid | |
| 1331 | 1380 | continue |
| 1332 | 1381 | |
| 1333 | 1382 | if check == 0x000E: |
| 1334 | 1383 | # REFERENCEPROJECT |
| 1335 | - REFERENCEPROJECT_Id = check | |
| 1336 | - REFERENCEPROJECT_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1337 | - REFERENCEPROJECT_SizeOfLibidAbsolute = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1338 | - REFERENCEPROJECT_LibidAbsolute = dir_stream.read(REFERENCEPROJECT_SizeOfLibidAbsolute) | |
| 1339 | - REFERENCEPROJECT_SizeOfLibidRelative = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1340 | - REFERENCEPROJECT_LibidRelative = dir_stream.read(REFERENCEPROJECT_SizeOfLibidRelative) | |
| 1341 | - REFERENCEPROJECT_MajorVersion = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1342 | - REFERENCEPROJECT_MinorVersion = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1384 | + referenceproject_id = check | |
| 1385 | + referenceproject_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1386 | + referenceproject_sizeof_libidabsolute = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1387 | + referenceproject_libidabsolute = dir_stream.read(referenceproject_sizeof_libidabsolute) | |
| 1388 | + referenceproject_sizeof_libidrelative = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1389 | + referenceproject_libidrelative = dir_stream.read(referenceproject_sizeof_libidrelative) | |
| 1390 | + referenceproject_majorversion = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1391 | + referenceproject_minorversion = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1392 | + unused = referenceproject_id | |
| 1393 | + unused = referenceproject_size | |
| 1394 | + unused = referenceproject_libidabsolute | |
| 1395 | + unused = referenceproject_libidrelative | |
| 1396 | + unused = referenceproject_majorversion | |
| 1397 | + unused = referenceproject_minorversion | |
| 1343 | 1398 | continue |
| 1344 | 1399 | |
| 1345 | 1400 | log.error('invalid or unknown check Id {0:04X}'.format(check)) |
| 1346 | 1401 | sys.exit(0) |
| 1347 | 1402 | |
| 1348 | - PROJECTMODULES_Id = check #struct.unpack("<H", dir_stream.read(2))[0] | |
| 1349 | - check_value('PROJECTMODULES_Id', 0x000F, PROJECTMODULES_Id) | |
| 1350 | - PROJECTMODULES_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1351 | - check_value('PROJECTMODULES_Size', 0x0002, PROJECTMODULES_Size) | |
| 1352 | - PROJECTMODULES_Count = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1353 | - PROJECTMODULES_ProjectCookieRecord_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1354 | - check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, PROJECTMODULES_ProjectCookieRecord_Id) | |
| 1355 | - PROJECTMODULES_ProjectCookieRecord_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1356 | - check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, PROJECTMODULES_ProjectCookieRecord_Size) | |
| 1357 | - PROJECTMODULES_ProjectCookieRecord_Cookie = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1358 | - | |
| 1359 | - log.debug("parsing {0} modules".format(PROJECTMODULES_Count)) | |
| 1360 | - for x in xrange(0, PROJECTMODULES_Count): | |
| 1361 | - MODULENAME_Id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1362 | - check_value('MODULENAME_Id', 0x0019, MODULENAME_Id) | |
| 1363 | - MODULENAME_SizeOfModuleName = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1364 | - MODULENAME_ModuleName = dir_stream.read(MODULENAME_SizeOfModuleName) | |
| 1403 | + projectmodules_id = check #struct.unpack("<H", dir_stream.read(2))[0] | |
| 1404 | + check_value('PROJECTMODULES_Id', 0x000F, projectmodules_id) | |
| 1405 | + projectmodules_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1406 | + check_value('PROJECTMODULES_Size', 0x0002, projectmodules_size) | |
| 1407 | + projectmodules_count = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1408 | + projectmodules_projectcookierecord_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1409 | + check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, projectmodules_projectcookierecord_id) | |
| 1410 | + projectmodules_projectcookierecord_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1411 | + check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, projectmodules_projectcookierecord_size) | |
| 1412 | + projectmodules_projectcookierecord_cookie = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1413 | + unused = projectmodules_projectcookierecord_cookie | |
| 1414 | + | |
| 1415 | + log.debug("parsing {0} modules".format(projectmodules_count)) | |
| 1416 | + for _ in xrange(0, projectmodules_count): | |
| 1417 | + modulename_id = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1418 | + check_value('MODULENAME_Id', 0x0019, modulename_id) | |
| 1419 | + modulename_sizeof_modulename = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1420 | + modulename_modulename = dir_stream.read(modulename_sizeof_modulename) | |
| 1365 | 1421 | # account for optional sections |
| 1366 | 1422 | section_id = struct.unpack("<H", dir_stream.read(2))[0] |
| 1367 | 1423 | if section_id == 0x0047: |
| 1368 | - MODULENAMEUNICODE_Id = section_id | |
| 1369 | - MODULENAMEUNICODE_SizeOfModuleNameUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1370 | - MODULENAMEUNICODE_ModuleNameUnicode = dir_stream.read(MODULENAMEUNICODE_SizeOfModuleNameUnicode) | |
| 1424 | + modulename_unicode_id = section_id | |
| 1425 | + modulename_unicode_sizeof_modulename_unicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1426 | + modulename_unicode_modulename_unicode = dir_stream.read(modulename_unicode_sizeof_modulename_unicode) | |
| 1427 | + unused = modulename_unicode_id | |
| 1428 | + unused = modulename_unicode_modulename_unicode | |
| 1371 | 1429 | section_id = struct.unpack("<H", dir_stream.read(2))[0] |
| 1372 | 1430 | if section_id == 0x001A: |
| 1373 | - MODULESTREAMNAME_id = section_id | |
| 1374 | - MODULESTREAMNAME_SizeOfStreamName = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1375 | - MODULESTREAMNAME_StreamName = dir_stream.read(MODULESTREAMNAME_SizeOfStreamName) | |
| 1376 | - MODULESTREAMNAME_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1377 | - check_value('MODULESTREAMNAME_Reserved', 0x0032, MODULESTREAMNAME_Reserved) | |
| 1378 | - MODULESTREAMNAME_SizeOfStreamNameUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1379 | - MODULESTREAMNAME_StreamNameUnicode = dir_stream.read(MODULESTREAMNAME_SizeOfStreamNameUnicode) | |
| 1431 | + modulestreamname_id = section_id | |
| 1432 | + modulestreamname_sizeof_streamname = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1433 | + modulestreamname_streamname = dir_stream.read(modulestreamname_sizeof_streamname) | |
| 1434 | + modulestreamname_reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1435 | + check_value('MODULESTREAMNAME_Reserved', 0x0032, modulestreamname_reserved) | |
| 1436 | + modulestreamname_sizeof_streamname_unicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1437 | + modulestreamname_streamname_unicode = dir_stream.read(modulestreamname_sizeof_streamname_unicode) | |
| 1438 | + unused = modulestreamname_id | |
| 1380 | 1439 | section_id = struct.unpack("<H", dir_stream.read(2))[0] |
| 1381 | 1440 | if section_id == 0x001C: |
| 1382 | - MODULEDOCSTRING_Id = section_id | |
| 1383 | - check_value('MODULEDOCSTRING_Id', 0x001C, MODULEDOCSTRING_Id) | |
| 1384 | - MODULEDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1385 | - MODULEDOCSTRING_DocString = dir_stream.read(MODULEDOCSTRING_SizeOfDocString) | |
| 1386 | - MODULEDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1387 | - check_value('MODULEDOCSTRING_Reserved', 0x0048, MODULEDOCSTRING_Reserved) | |
| 1388 | - MODULEDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1389 | - MODULEDOCSTRING_DocStringUnicode = dir_stream.read(MODULEDOCSTRING_SizeOfDocStringUnicode) | |
| 1441 | + moduledocstring_id = section_id | |
| 1442 | + check_value('MODULEDOCSTRING_Id', 0x001C, moduledocstring_id) | |
| 1443 | + moduledocstring_sizeof_docstring = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1444 | + moduledocstring_docstring = dir_stream.read(moduledocstring_sizeof_docstring) | |
| 1445 | + moduledocstring_reserved = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1446 | + check_value('MODULEDOCSTRING_Reserved', 0x0048, moduledocstring_reserved) | |
| 1447 | + moduledocstring_sizeof_docstring_unicode = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1448 | + moduledocstring_docstring_unicode = dir_stream.read(moduledocstring_sizeof_docstring_unicode) | |
| 1449 | + unused = moduledocstring_docstring | |
| 1450 | + unused = moduledocstring_docstring_unicode | |
| 1390 | 1451 | section_id = struct.unpack("<H", dir_stream.read(2))[0] |
| 1391 | 1452 | if section_id == 0x0031: |
| 1392 | - MODULEOFFSET_Id = section_id | |
| 1393 | - check_value('MODULEOFFSET_Id', 0x0031, MODULEOFFSET_Id) | |
| 1394 | - MODULEOFFSET_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1395 | - check_value('MODULEOFFSET_Size', 0x0004, MODULEOFFSET_Size) | |
| 1396 | - MODULEOFFSET_TextOffset = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1453 | + moduleoffset_id = section_id | |
| 1454 | + check_value('MODULEOFFSET_Id', 0x0031, moduleoffset_id) | |
| 1455 | + moduleoffset_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1456 | + check_value('MODULEOFFSET_Size', 0x0004, moduleoffset_size) | |
| 1457 | + moduleoffset_textoffset = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1397 | 1458 | section_id = struct.unpack("<H", dir_stream.read(2))[0] |
| 1398 | 1459 | if section_id == 0x001E: |
| 1399 | - MODULEHELPCONTEXT_Id = section_id | |
| 1400 | - check_value('MODULEHELPCONTEXT_Id', 0x001E, MODULEHELPCONTEXT_Id) | |
| 1401 | - MODULEHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1402 | - check_value('MODULEHELPCONTEXT_Size', 0x0004, MODULEHELPCONTEXT_Size) | |
| 1403 | - MODULEHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1460 | + modulehelpcontext_id = section_id | |
| 1461 | + check_value('MODULEHELPCONTEXT_Id', 0x001E, modulehelpcontext_id) | |
| 1462 | + modulehelpcontext_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1463 | + check_value('MODULEHELPCONTEXT_Size', 0x0004, modulehelpcontext_size) | |
| 1464 | + modulehelpcontext_helpcontext = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1465 | + unused = modulehelpcontext_helpcontext | |
| 1404 | 1466 | section_id = struct.unpack("<H", dir_stream.read(2))[0] |
| 1405 | 1467 | if section_id == 0x002C: |
| 1406 | - MODULECOOKIE_Id = section_id | |
| 1407 | - check_value('MODULECOOKIE_Id', 0x002C, MODULECOOKIE_Id) | |
| 1408 | - MODULECOOKIE_Size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1409 | - check_value('MODULECOOKIE_Size', 0x0002, MODULECOOKIE_Size) | |
| 1410 | - MODULECOOKIE_Cookie = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1468 | + modulecookie_id = section_id | |
| 1469 | + check_value('MODULECOOKIE_Id', 0x002C, modulecookie_id) | |
| 1470 | + modulecookie_size = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1471 | + check_value('MODULECOOKIE_Size', 0x0002, modulecookie_size) | |
| 1472 | + modulecookie_cookie = struct.unpack("<H", dir_stream.read(2))[0] | |
| 1473 | + unused = modulecookie_cookie | |
| 1411 | 1474 | section_id = struct.unpack("<H", dir_stream.read(2))[0] |
| 1412 | 1475 | if section_id == 0x0021 or section_id == 0x0022: |
| 1413 | - MODULETYPE_Id = section_id | |
| 1414 | - MODULETYPE_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1476 | + moduletype_id = section_id | |
| 1477 | + moduletype_reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1478 | + unused = moduletype_id | |
| 1479 | + unused = moduletype_reserved | |
| 1415 | 1480 | section_id = struct.unpack("<H", dir_stream.read(2))[0] |
| 1416 | 1481 | if section_id == 0x0025: |
| 1417 | - MODULEREADONLY_Id = section_id | |
| 1418 | - check_value('MODULEREADONLY_Id', 0x0025, MODULEREADONLY_Id) | |
| 1419 | - MODULEREADONLY_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1420 | - check_value('MODULEREADONLY_Reserved', 0x0000, MODULEREADONLY_Reserved) | |
| 1482 | + modulereadonly_id = section_id | |
| 1483 | + check_value('MODULEREADONLY_Id', 0x0025, modulereadonly_id) | |
| 1484 | + modulereadonly_reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1485 | + check_value('MODULEREADONLY_Reserved', 0x0000, modulereadonly_reserved) | |
| 1421 | 1486 | section_id = struct.unpack("<H", dir_stream.read(2))[0] |
| 1422 | 1487 | if section_id == 0x0028: |
| 1423 | - MODULEPRIVATE_Id = section_id | |
| 1424 | - check_value('MODULEPRIVATE_Id', 0x0028, MODULEPRIVATE_Id) | |
| 1425 | - MODULEPRIVATE_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1426 | - check_value('MODULEPRIVATE_Reserved', 0x0000, MODULEPRIVATE_Reserved) | |
| 1488 | + moduleprivate_id = section_id | |
| 1489 | + check_value('MODULEPRIVATE_Id', 0x0028, moduleprivate_id) | |
| 1490 | + moduleprivate_reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1491 | + check_value('MODULEPRIVATE_Reserved', 0x0000, moduleprivate_reserved) | |
| 1427 | 1492 | section_id = struct.unpack("<H", dir_stream.read(2))[0] |
| 1428 | 1493 | if section_id == 0x002B: # TERMINATOR |
| 1429 | - MODULE_Reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1430 | - check_value('MODULE_Reserved', 0x0000, MODULE_Reserved) | |
| 1494 | + module_reserved = struct.unpack("<L", dir_stream.read(4))[0] | |
| 1495 | + check_value('MODULE_Reserved', 0x0000, module_reserved) | |
| 1431 | 1496 | section_id = None |
| 1432 | 1497 | if section_id != None: |
| 1433 | 1498 | log.warning('unknown or invalid module section id {0:04X}'.format(section_id)) |
| 1434 | 1499 | |
| 1435 | - log.debug('Project CodePage = %d' % PROJECTCODEPAGE_CodePage) | |
| 1436 | - vba_codec = 'cp%d' % PROJECTCODEPAGE_CodePage | |
| 1437 | - log.debug("ModuleName = {0}".format(MODULENAME_ModuleName)) | |
| 1438 | - log.debug("StreamName = {0}".format(repr(MODULESTREAMNAME_StreamName))) | |
| 1439 | - streamname_unicode = MODULESTREAMNAME_StreamName.decode(vba_codec) | |
| 1500 | + log.debug('Project CodePage = %d' % projectcodepage_codepage) | |
| 1501 | + vba_codec = 'cp%d' % projectcodepage_codepage | |
| 1502 | + log.debug("ModuleName = {0}".format(modulename_modulename)) | |
| 1503 | + log.debug("StreamName = {0}".format(repr(modulestreamname_streamname))) | |
| 1504 | + streamname_unicode = modulestreamname_streamname.decode(vba_codec) | |
| 1440 | 1505 | log.debug("StreamName.decode('%s') = %s" % (vba_codec, repr(streamname_unicode))) |
| 1441 | - log.debug("StreamNameUnicode = {0}".format(repr(MODULESTREAMNAME_StreamNameUnicode))) | |
| 1442 | - log.debug("TextOffset = {0}".format(MODULEOFFSET_TextOffset)) | |
| 1506 | + log.debug("StreamNameUnicode = {0}".format(repr(modulestreamname_streamname_unicode))) | |
| 1507 | + log.debug("TextOffset = {0}".format(moduleoffset_textoffset)) | |
| 1443 | 1508 | |
| 1444 | 1509 | code_path = vba_root + u'VBA/' + streamname_unicode |
| 1445 | 1510 | #TODO: test if stream exists |
| 1446 | 1511 | log.debug('opening VBA code stream %s' % repr(code_path)) |
| 1447 | 1512 | code_data = ole.openstream(code_path).read() |
| 1448 | 1513 | log.debug("length of code_data = {0}".format(len(code_data))) |
| 1449 | - log.debug("offset of code_data = {0}".format(MODULEOFFSET_TextOffset)) | |
| 1450 | - code_data = code_data[MODULEOFFSET_TextOffset:] | |
| 1514 | + log.debug("offset of code_data = {0}".format(moduleoffset_textoffset)) | |
| 1515 | + code_data = code_data[moduleoffset_textoffset:] | |
| 1451 | 1516 | if len(code_data) > 0: |
| 1452 | 1517 | code_data = decompress_stream(code_data) |
| 1453 | 1518 | # case-insensitive search in the code_modules dict to find the file extension: |
| 1454 | - filext = code_modules.get(MODULENAME_ModuleName.lower(), 'bin') | |
| 1455 | - filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext) | |
| 1519 | + filext = code_modules.get(modulename_modulename.lower(), 'bin') | |
| 1520 | + filename = '{0}.{1}'.format(modulename_modulename, filext) | |
| 1456 | 1521 | #TODO: also yield the codepage so that callers can decode it properly |
| 1457 | 1522 | yield (code_path, filename, code_data) |
| 1458 | 1523 | # print '-'*79 |
| ... | ... | @@ -1462,7 +1527,8 @@ def _extract_vba(ole, vba_root, project_path, dir_path): |
| 1462 | 1527 | # print '' |
| 1463 | 1528 | log.debug('extracted file {0}'.format(filename)) |
| 1464 | 1529 | else: |
| 1465 | - log.warning("module stream {0} has code data length 0".format(MODULESTREAMNAME_StreamName)) | |
| 1530 | + log.warning("module stream {0} has code data length 0".format(modulestreamname_streamname)) | |
| 1531 | + _ = unused | |
| 1466 | 1532 | return |
| 1467 | 1533 | |
| 1468 | 1534 | |
| ... | ... | @@ -1618,12 +1684,9 @@ def detect_base64_strings(vba_code): |
| 1618 | 1684 | decoded = base64.b64decode(value) |
| 1619 | 1685 | results.append((value, decoded)) |
| 1620 | 1686 | found.add(value) |
| 1621 | - except KeyboardInterrupt: | |
| 1622 | - # do not ignore exceptions when the user presses Ctrl+C/Pause: | |
| 1623 | - raise | |
| 1624 | - except: | |
| 1687 | + except (TypeError, ValueError) as exc: | |
| 1688 | + log.debug('Failed to base64-decode (%s)' % exc) | |
| 1625 | 1689 | # if an exception occurs, it is likely not a base64-encoded string |
| 1626 | - pass | |
| 1627 | 1690 | return results |
| 1628 | 1691 | |
| 1629 | 1692 | |
| ... | ... | @@ -1648,12 +1711,9 @@ def detect_dridex_strings(vba_code): |
| 1648 | 1711 | decoded = DridexUrlDecode(value) |
| 1649 | 1712 | results.append((value, decoded)) |
| 1650 | 1713 | found.add(value) |
| 1651 | - except KeyboardInterrupt: | |
| 1652 | - # do not ignore exceptions when the user presses Ctrl+C/Pause: | |
| 1653 | - raise | |
| 1654 | - except: | |
| 1714 | + except Exception as exc: | |
| 1715 | + log.debug('Failed to Dridex-decode (%s)' % exc) | |
| 1655 | 1716 | # if an exception occurs, it is likely not a dridex-encoded string |
| 1656 | - pass | |
| 1657 | 1717 | return results |
| 1658 | 1718 | |
| 1659 | 1719 | |
| ... | ... | @@ -1703,16 +1763,17 @@ def json2ascii(json_obj, encoding='utf8', errors='replace'): |
| 1703 | 1763 | elif isinstance(json_obj, (bool, int, float)): |
| 1704 | 1764 | pass |
| 1705 | 1765 | elif isinstance(json_obj, str): |
| 1766 | + # de-code and re-encode | |
| 1706 | 1767 | dencoded = json_obj.decode(encoding, errors).encode(encoding, errors) |
| 1707 | - if dencoded != str: | |
| 1708 | - logging.info('json2ascii: replaced: {0} (len {1})' | |
| 1709 | - .format(json_obj, len(json_obj))) | |
| 1710 | - logging.info('json2ascii: with: {0} (len {1})' | |
| 1711 | - .format(dencoded, len(dencoded))) | |
| 1768 | + if dencoded != json_obj: | |
| 1769 | + log.info('json2ascii: replaced: {0} (len {1})' | |
| 1770 | + .format(json_obj, len(json_obj))) | |
| 1771 | + log.info('json2ascii: with: {0} (len {1})' | |
| 1772 | + .format(dencoded, len(dencoded))) | |
| 1712 | 1773 | return dencoded |
| 1713 | 1774 | elif isinstance(json_obj, unicode): |
| 1714 | - logging.info('json2ascii: replaced: {0}' | |
| 1715 | - .format(json_obj.encode(encoding, errors))) | |
| 1775 | + log.info('json2ascii: replaced: {0}' | |
| 1776 | + .format(json_obj.encode(encoding, errors))) | |
| 1716 | 1777 | # cannot put original into logger |
| 1717 | 1778 | # print 'original: ' json_obj |
| 1718 | 1779 | return json_obj.encode(encoding, errors) |
| ... | ... | @@ -1723,11 +1784,50 @@ def json2ascii(json_obj, encoding='utf8', errors='replace'): |
| 1723 | 1784 | for item in json_obj: |
| 1724 | 1785 | item = json2ascii(item) |
| 1725 | 1786 | else: |
| 1726 | - logging.debug('unexpected type in json2ascii: {0} -- leave as is' | |
| 1727 | - .format(type(json_obj))) | |
| 1787 | + log.debug('unexpected type in json2ascii: {0} -- leave as is' | |
| 1788 | + .format(type(json_obj))) | |
| 1728 | 1789 | return json_obj |
| 1729 | 1790 | |
| 1730 | 1791 | |
| 1792 | +_have_printed_json_start = False | |
| 1793 | + | |
| 1794 | +def print_json(json_dict=None, _json_is_last=False, **json_parts): | |
| 1795 | + """ line-wise print of json.dumps(json2ascii(..)) with options and indent+1 | |
| 1796 | + | |
| 1797 | + can use in two ways: | |
| 1798 | + (1) print_json(some_dict) | |
| 1799 | + (2) print_json(key1=value1, key2=value2, ...) | |
| 1800 | + | |
| 1801 | + :param bool _json_is_last: set to True only for very last entry to complete | |
| 1802 | + the top-level json-list | |
| 1803 | + """ | |
| 1804 | + global _have_printed_json_start | |
| 1805 | + | |
| 1806 | + if json_dict and json_parts: | |
| 1807 | + raise ValueError('Invalid json argument: want either single dict or ' | |
| 1808 | + 'key=value parts but got both)') | |
| 1809 | + elif (json_dict is not None) and (not isinstance(json_dict, dict)): | |
| 1810 | + raise ValueError('Invalid json argument: want either single dict or ' | |
| 1811 | + 'key=value parts but got {} instead of dict)' | |
| 1812 | + .format(type(json_dict))) | |
| 1813 | + if json_parts: | |
| 1814 | + json_dict = json_parts | |
| 1815 | + | |
| 1816 | + if not _have_printed_json_start: | |
| 1817 | + print '[' | |
| 1818 | + _have_printed_json_start = True | |
| 1819 | + | |
| 1820 | + lines = json.dumps(json2ascii(json_dict), check_circular=False, | |
| 1821 | + indent=4, ensure_ascii=False).splitlines() | |
| 1822 | + for line in lines[:-1]: | |
| 1823 | + print ' {}'.format(line) | |
| 1824 | + if _json_is_last: | |
| 1825 | + print ' {}'.format(lines[-1]) # print last line without comma | |
| 1826 | + print ']' | |
| 1827 | + else: | |
| 1828 | + print ' {},'.format(lines[-1]) # print last line with comma | |
| 1829 | + | |
| 1830 | + | |
| 1731 | 1831 | class VBA_Scanner(object): |
| 1732 | 1832 | """ |
| 1733 | 1833 | Class to scan the source code of a VBA module to find obfuscated strings, |
| ... | ... | @@ -1927,6 +2027,8 @@ class VBA_Parser(object): |
| 1927 | 2027 | |
| 1928 | 2028 | :param container: str, path and filename of container if the file is within |
| 1929 | 2029 | a zip archive, None otherwise. |
| 2030 | + | |
| 2031 | + raises a FileOpenError if all attempts to interpret the data header failed | 
| 1930 | 2032 | """ |
| 1931 | 2033 | #TODO: filename should only be a string, data should be used for the file-like object |
| 1932 | 2034 | #TODO: filename should be mandatory, optional data is a string or file-like object |
| ... | ... | @@ -2006,8 +2108,8 @@ class VBA_Parser(object): |
| 2006 | 2108 | if self.type is None: |
| 2007 | 2109 | # At this stage, could not match a known format: |
| 2008 | 2110 | msg = '%s is not a supported file type, cannot extract VBA Macros.' % self.filename |
| 2009 | - log.error(msg) | |
| 2010 | - raise TypeError(msg) | |
| 2111 | + log.info(msg) | |
| 2112 | + raise FileOpenError(msg) | |
| 2011 | 2113 | |
| 2012 | 2114 | def open_ole(self, _file): |
| 2013 | 2115 | """ |
| ... | ... | @@ -2021,13 +2123,10 @@ class VBA_Parser(object): |
| 2021 | 2123 | self.ole_file = olefile.OleFileIO(_file, path_encoding=None) |
| 2022 | 2124 | # set type only if parsing succeeds |
| 2023 | 2125 | self.type = TYPE_OLE |
| 2024 | - except KeyboardInterrupt: | |
| 2025 | - # do not ignore exceptions when the user presses Ctrl+C/Pause: | |
| 2026 | - raise | |
| 2027 | - except: | |
| 2126 | + except (IOError, TypeError, ValueError) as exc: | |
| 2028 | 2127 | # TODO: handle OLE parsing exceptions |
| 2029 | - log.exception('Failed OLE parsing for file %r' % self.filename) | |
| 2030 | - pass | |
| 2128 | + log.info('Failed OLE parsing for file %r (%s)' % (self.filename, exc)) | |
| 2129 | + log.debug('Trace:', exc_info=True) | |
| 2031 | 2130 | |
| 2032 | 2131 | |
| 2033 | 2132 | def open_openxml(self, _file): |
| ... | ... | @@ -2053,22 +2152,17 @@ class VBA_Parser(object): |
| 2053 | 2152 | ole_data = z.open(subfile).read() |
| 2054 | 2153 | try: |
| 2055 | 2154 | self.ole_subfiles.append(VBA_Parser(filename=subfile, data=ole_data)) |
| 2056 | - except KeyboardInterrupt: | |
| 2057 | - # do not ignore exceptions when the user presses Ctrl+C/Pause: | |
| 2058 | - raise | |
| 2059 | - except: | |
| 2060 | - log.debug('%s is not a valid OLE file' % subfile) | |
| 2155 | + except FileOpenError as exc: | |
| 2156 | + log.info('%s is not a valid OLE file (%s)' % (subfile, exc)) | |
| 2061 | 2157 | continue |
| 2062 | 2158 | z.close() |
| 2063 | 2159 | # set type only if parsing succeeds |
| 2064 | 2160 | self.type = TYPE_OpenXML |
| 2065 | - except KeyboardInterrupt: | |
| 2066 | - # do not ignore exceptions when the user presses Ctrl+C/Pause: | |
| 2067 | - raise | |
| 2068 | - except: | |
| 2161 | + except (RuntimeError, zipfile.BadZipfile, zipfile.LargeZipFile, IOError) as exc: | |
| 2069 | 2162 | # TODO: handle parsing exceptions |
| 2070 | - log.exception('Failed Zip/OpenXML parsing for file %r' % self.filename) | |
| 2071 | - pass | |
| 2163 | + log.info('Failed Zip/OpenXML parsing for file %r (%s)' | |
| 2164 | + % (self.filename, exc)) | |
| 2165 | + log.debug('Trace:', exc_info=True) | |
| 2072 | 2166 | |
| 2073 | 2167 | def open_word2003xml(self, data): |
| 2074 | 2168 | """ |
| ... | ... | @@ -2092,25 +2186,25 @@ class VBA_Parser(object): |
| 2092 | 2186 | if is_mso_file(mso_data): |
| 2093 | 2187 | # decompress the zlib data stored in the MSO file, which is the OLE container: |
| 2094 | 2188 | # TODO: handle different offsets => separate function |
| 2095 | - ole_data = mso_file_extract(mso_data) | |
| 2096 | 2189 | try: |
| 2190 | + ole_data = mso_file_extract(mso_data) | |
| 2097 | 2191 | self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data)) |
| 2098 | - except KeyboardInterrupt: | |
| 2099 | - # do not ignore exceptions when the user presses Ctrl+C/Pause: | |
| 2100 | - raise | |
| 2101 | - except: | |
| 2102 | - log.error('%s does not contain a valid OLE file' % fname) | |
| 2192 | + except MsoExtractionError: | |
| 2193 | + log.info('Failed decompressing an MSO container in %r - %s' | |
| 2194 | + % (fname, MSG_OLEVBA_ISSUES)) | |
| 2195 | + log.debug('Trace:', exc_info=True) | |
| 2196 | + except FileOpenError as exc: | |
| 2197 | + log.debug('%s is not a valid OLE sub file (%s)' % (fname, exc)) | |
| 2103 | 2198 | else: |
| 2104 | - log.error('%s is not a valid MSO file' % fname) | |
| 2199 | + log.info('%s is not a valid MSO file' % fname) | |
| 2105 | 2200 | # set type only if parsing succeeds |
| 2106 | 2201 | self.type = TYPE_Word2003_XML |
| 2107 | - except KeyboardInterrupt: | |
| 2108 | - # do not ignore exceptions when the user presses Ctrl+C/Pause: | |
| 2109 | - raise | |
| 2110 | - except: | |
| 2202 | + except Exception as exc: | |
| 2111 | 2203 | # TODO: differentiate exceptions for each parsing stage |
| 2112 | - log.exception('Failed XML parsing for file %r' % self.filename) | |
| 2113 | - pass | |
| 2204 | + # (but ET is different libs, no good exception description in API) | |
| 2205 | + # found: XMLSyntaxError | |
| 2206 | + log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc)) | |
| 2207 | + log.debug('Trace:', exc_info=True) | |
| 2114 | 2208 | |
| 2115 | 2209 | def open_mht(self, data): |
| 2116 | 2210 | """ |
| ... | ... | @@ -2153,40 +2247,30 @@ class VBA_Parser(object): |
| 2153 | 2247 | log.debug('Found ActiveMime header, decompressing MSO container') |
| 2154 | 2248 | try: |
| 2155 | 2249 | ole_data = mso_file_extract(part_data) |
| 2156 | - try: | |
| 2157 | - # TODO: check if it is actually an OLE file | |
| 2158 | - # TODO: get the MSO filename from content_location? | |
| 2159 | - self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data)) | |
| 2160 | - except KeyboardInterrupt: | |
| 2161 | - # do not ignore exceptions when the user presses Ctrl+C/Pause: | |
| 2162 | - raise | |
| 2163 | - except: | |
| 2164 | - log.debug('%s does not contain a valid OLE file' % fname) | |
| 2165 | - except KeyboardInterrupt: | |
| 2166 | - # do not ignore exceptions when the user presses Ctrl+C/Pause: | |
| 2167 | - raise | |
| 2168 | - except: | |
| 2169 | - log.exception('Failed decompressing an MSO container in %r - %s' | |
| 2250 | + | |
| 2251 | + # TODO: check if it is actually an OLE file | |
| 2252 | + # TODO: get the MSO filename from content_location? | |
| 2253 | + self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data)) | |
| 2254 | + except MsoExtractionError: | |
| 2255 | + log.info('Failed decompressing an MSO container in %r - %s' | |
| 2170 | 2256 | % (fname, MSG_OLEVBA_ISSUES)) |
| 2257 | + log.debug('Trace:', exc_info=True) | |
| 2171 | 2258 | # TODO: bug here - need to split in smaller functions/classes? |
| 2259 | + except FileOpenError as exc: | |
| 2260 | + log.debug('%s does not contain a valid OLE file (%s)' | |
| 2261 | + % (fname, exc)) | |
| 2172 | 2262 | else: |
| 2263 | + log.debug('type(part_data) = %s' % type(part_data)) | |
| 2173 | 2264 | try: |
| 2174 | - log.debug('type(part_data) = %s' % type(part_data)) | |
| 2175 | 2265 | log.debug('part_data[0:20] = %r' % part_data[0:20]) |
| 2176 | - except KeyboardInterrupt: | |
| 2177 | - # do not ignore exceptions when the user presses Ctrl+C/Pause: | |
| 2178 | - raise | |
| 2179 | - except: | |
| 2180 | - pass | |
| 2266 | + except TypeError as err: | |
| 2267 | + log.debug('part_data has no __getitem__') | |
| 2181 | 2268 | # set type only if parsing succeeds |
| 2182 | 2269 | self.type = TYPE_MHTML |
| 2183 | - except KeyboardInterrupt: | |
| 2184 | - # do not ignore exceptions when the user presses Ctrl+C/Pause: | |
| 2185 | - raise | |
| 2186 | - except: | |
| 2187 | - log.exception('Failed MIME parsing for file %r - %s' | |
| 2188 | - % (self.filename, MSG_OLEVBA_ISSUES)) | |
| 2189 | - pass | |
| 2270 | + except Exception: | |
| 2271 | + log.info('Failed MIME parsing for file %r - %s' | |
| 2272 | + % (self.filename, MSG_OLEVBA_ISSUES)) | |
| 2273 | + log.debug('Trace:', exc_info=True) | |
| 2190 | 2274 | |
| 2191 | 2275 | def open_ppt(self): |
| 2192 | 2276 | """ try to interpret self.ole_file as PowerPoint 97-2003 using PptParser |
| ... | ... | @@ -2225,19 +2309,11 @@ class VBA_Parser(object): |
| 2225 | 2309 | :return: nothing |
| 2226 | 2310 | """ |
| 2227 | 2311 | log.info('Opening text file %s' % self.filename) |
| 2228 | - try: | |
| 2229 | - # directly store the source code: | |
| 2230 | - self.vba_code_all_modules = data | |
| 2231 | - self.contains_macros = True | |
| 2232 | - # set type only if parsing succeeds | |
| 2233 | - self.type = TYPE_TEXT | |
| 2234 | - except KeyboardInterrupt: | |
| 2235 | - # do not ignore exceptions when the user presses Ctrl+C/Pause: | |
| 2236 | - raise | |
| 2237 | - except: | |
| 2238 | - log.exception('Failed text parsing for file %r - %s' | |
| 2239 | - % (self.filename, MSG_OLEVBA_ISSUES)) | |
| 2240 | - pass | |
| 2312 | + # directly store the source code: | |
| 2313 | + self.vba_code_all_modules = data | |
| 2314 | + self.contains_macros = True | |
| 2315 | + # set type only if parsing succeeds | |
| 2316 | + self.type = TYPE_TEXT | |
| 2241 | 2317 | |
| 2242 | 2318 | |
| 2243 | 2319 | def find_vba_projects(self): |
| ... | ... | @@ -2294,6 +2370,15 @@ class VBA_Parser(object): |
| 2294 | 2370 | # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream |
| 2295 | 2371 | # - all names are case-insensitive |
| 2296 | 2372 | |
| 2373 | + def check_vba_stream(ole, vba_root, stream_path): | |
| 2374 | + full_path = vba_root + stream_path | |
| 2375 | + if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM: | |
| 2376 | + log.debug('Found %s stream: %s' % (stream_path, full_path)) | |
| 2377 | + return full_path | |
| 2378 | + else: | |
| 2379 | + log.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path) | |
| 2380 | + return False | |
| 2381 | + | |
| 2297 | 2382 | # start with an empty list: |
| 2298 | 2383 | self.vba_projects = [] |
| 2299 | 2384 | # Look for any storage containing those storage/streams: |
| ... | ... | @@ -2310,15 +2395,6 @@ class VBA_Parser(object): |
| 2310 | 2395 | vba_root += '/' |
| 2311 | 2396 | log.debug('Checking vba_root="%s"' % vba_root) |
| 2312 | 2397 | |
| 2313 | - def check_vba_stream(ole, vba_root, stream_path): | |
| 2314 | - full_path = vba_root + stream_path | |
| 2315 | - if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM: | |
| 2316 | - log.debug('Found %s stream: %s' % (stream_path, full_path)) | |
| 2317 | - return full_path | |
| 2318 | - else: | |
| 2319 | - log.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path) | |
| 2320 | - return False | |
| 2321 | - | |
| 2322 | 2398 | # Check if the VBA root storage also contains a PROJECT stream: |
| 2323 | 2399 | project_path = check_vba_stream(ole, vba_root, 'PROJECT') |
| 2324 | 2400 | if not project_path: continue |
| ... | ... | @@ -2369,6 +2445,29 @@ class VBA_Parser(object): |
| 2369 | 2445 | self.contains_macros = False |
| 2370 | 2446 | else: |
| 2371 | 2447 | self.contains_macros = True |
| 2448 | + # Also look for VBA code in any stream including orphans | |
| 2449 | + # (happens in some malformed files) | |
| 2450 | + ole = self.ole_file | |
| 2451 | + for sid in xrange(len(ole.direntries)): | |
| 2452 | + # check if id is already done above: | |
| 2453 | + log.debug('Checking DirEntry #%d' % sid) | |
| 2454 | + d = ole.direntries[sid] | |
| 2455 | + if d is None: | |
| 2456 | + # this direntry is not part of the tree: either unused or an orphan | |
| 2457 | + d = ole._load_direntry(sid) | |
| 2458 | + log.debug('This DirEntry is an orphan or unused') | |
| 2459 | + if d.entry_type == olefile.STGTY_STREAM: | |
| 2460 | + # read data | |
| 2461 | + log.debug('Reading data from stream %r - size: %d bytes' % (d.name, d.size)) | |
| 2462 | + try: | |
| 2463 | + data = ole._open(d.isectStart, d.size).read() | |
| 2464 | + log.debug('Read %d bytes' % len(data)) | |
| 2465 | + log.debug(repr(data)) | |
| 2466 | + if 'Attribut' in data: | |
| 2467 | + log.debug('Found VBA compressed code') | |
| 2468 | + self.contains_macros = True | |
| 2469 | + except: | |
| 2470 | + log.exception('Error when reading OLE Stream %r' % d.name) | |
| 2372 | 2471 | return self.contains_macros |
| 2373 | 2472 | |
| 2374 | 2473 | def extract_macros(self): |
| ... | ... | @@ -2381,6 +2480,7 @@ class VBA_Parser(object): |
| 2381 | 2480 | within the zip archive, e.g. word/vbaProject.bin. |
| 2382 | 2481 | If the file is PPT, result is as for OpenXML but filename is useless |
| 2383 | 2482 | """ |
| 2483 | + log.debug('extract_macros:') | |
| 2384 | 2484 | if self.ole_file is None: |
| 2385 | 2485 | # This may be either an OpenXML/PPT or a text file: |
| 2386 | 2486 | if self.type == TYPE_TEXT: |
| ... | ... | @@ -2394,11 +2494,41 @@ class VBA_Parser(object): |
| 2394 | 2494 | else: |
| 2395 | 2495 | # This is an OLE file: |
| 2396 | 2496 | self.find_vba_projects() |
| 2497 | + # set of stream ids | |
| 2498 | + vba_stream_ids = set() | |
| 2397 | 2499 | for vba_root, project_path, dir_path in self.vba_projects: |
| 2398 | 2500 | # extract all VBA macros from that VBA root storage: |
| 2399 | 2501 | for stream_path, vba_filename, vba_code in _extract_vba(self.ole_file, vba_root, project_path, |
| 2400 | 2502 | dir_path): |
| 2503 | + # store direntry ids in a set: | |
| 2504 | + vba_stream_ids.add(self.ole_file._find(stream_path)) | |
| 2401 | 2505 | yield (self.filename, stream_path, vba_filename, vba_code) |
| 2506 | + # Also look for VBA code in any stream including orphans | |
| 2507 | + # (happens in some malformed files) | |
| 2508 | + ole = self.ole_file | |
| 2509 | + for sid in xrange(len(ole.direntries)): | |
| 2510 | + # check if id is already done above: | |
| 2511 | + log.debug('Checking DirEntry #%d' % sid) | |
| 2512 | + if sid in vba_stream_ids: | |
| 2513 | + log.debug('Already extracted') | |
| 2514 | + continue | |
| 2515 | + d = ole.direntries[sid] | |
| 2516 | + if d is None: | |
| 2517 | + # this direntry is not part of the tree: either unused or an orphan | |
| 2518 | + d = ole._load_direntry(sid) | |
| 2519 | + log.debug('This DirEntry is an orphan or unused') | |
| 2520 | + if d.entry_type == olefile.STGTY_STREAM: | |
| 2521 | + # read data | |
| 2522 | + log.debug('Reading data from stream %r' % d.name) | |
| 2523 | + data = ole._open(d.isectStart, d.size).read() | |
| 2524 | + for match in re.finditer(r'\x00Attribut[^e]', data, flags=re.IGNORECASE): | |
| 2525 | + start = match.start() - 3 | |
| 2526 | + log.debug('Found VBA compressed code at index %X' % start) | |
| 2527 | + compressed_code = data[start:] | |
| 2528 | + vba_code = decompress_stream(compressed_code) | |
| 2529 | + yield (self.filename, d.name, d.name, vba_code) | |
| 2530 | + | |
| 2531 | + | |
| 2402 | 2532 | |
| 2403 | 2533 | |
| 2404 | 2534 | def extract_all_macros(self): |
| ... | ... | @@ -2429,10 +2559,10 @@ class VBA_Parser(object): |
| 2429 | 2559 | # variable to merge source code from all modules: |
| 2430 | 2560 | if self.vba_code_all_modules is None: |
| 2431 | 2561 | self.vba_code_all_modules = '' |
| 2432 | - for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros(): | |
| 2562 | + for (_, _, _, vba_code) in self.extract_all_macros(): | |
| 2433 | 2563 | #TODO: filter code? (each module) |
| 2434 | 2564 | self.vba_code_all_modules += vba_code + '\n' |
| 2435 | - for (subfilename, form_path, form_string) in self.extract_form_strings(): | |
| 2565 | + for (_, _, form_string) in self.extract_form_strings(): | |
| 2436 | 2566 | self.vba_code_all_modules += form_string + '\n' |
| 2437 | 2567 | # Analyze the whole code at once: |
| 2438 | 2568 | scanner = VBA_Scanner(self.vba_code_all_modules) |
| ... | ... | @@ -2592,8 +2722,7 @@ class VBA_Parser_CLI(VBA_Parser): |
| 2592 | 2722 | def __init__(self, filename, data=None, container=None): |
| 2593 | 2723 | """ |
| 2594 | 2724 | Constructor for VBA_Parser_CLI. |
| 2595 | - Calls __init__ from VBA_Parser, but handles the TypeError exception | |
| 2596 | - when the file type is not supported. | |
| 2725 | + Calls __init__ from VBA_Parser | |
| 2597 | 2726 | |
| 2598 | 2727 | :param filename: filename or path of file to parse, or file-like object |
| 2599 | 2728 | |
| ... | ... | @@ -2604,11 +2733,7 @@ class VBA_Parser_CLI(VBA_Parser): |
| 2604 | 2733 | :param container: str, path and filename of container if the file is within |
| 2605 | 2734 | a zip archive, None otherwise. |
| 2606 | 2735 | """ |
| 2607 | - try: | |
| 2608 | - VBA_Parser.__init__(self, filename, data=data, container=container) | |
| 2609 | - except TypeError: | |
| 2610 | - # in that case, self.type=None | |
| 2611 | - pass | |
| 2736 | + super(VBA_Parser_CLI, self).__init__(filename, data=data, container=container) | |
| 2612 | 2737 | |
| 2613 | 2738 | |
| 2614 | 2739 | def print_analysis(self, show_decoded_strings=False, deobfuscate=False): |
| ... | ... | @@ -2658,7 +2783,7 @@ class VBA_Parser_CLI(VBA_Parser): |
| 2658 | 2783 | for kw_type, keyword, description in self.analyze_macros(show_decoded_strings)] |
| 2659 | 2784 | |
| 2660 | 2785 | def process_file(self, show_decoded_strings=False, |
| 2661 | - display_code=True, global_analysis=True, hide_attributes=True, | |
| 2786 | + display_code=True, hide_attributes=True, | |
| 2662 | 2787 | vba_code_only=False, show_deobfuscated_code=False, |
| 2663 | 2788 | deobfuscate=False): |
| 2664 | 2789 | """ |
| ... | ... | @@ -2704,19 +2829,12 @@ class VBA_Parser_CLI(VBA_Parser): |
| 2704 | 2829 | print '(empty macro)' |
| 2705 | 2830 | else: |
| 2706 | 2831 | print vba_code_filtered |
| 2707 | - if not global_analysis and not vba_code_only: | |
| 2708 | - #TODO: remove this option | |
| 2709 | - raise NotImplementedError | |
| 2710 | - print '- ' * 39 | |
| 2711 | - print 'ANALYSIS:' | |
| 2712 | - # analyse each module's code, filtered to avoid false positives: | |
| 2713 | - self.print_analysis(show_decoded_strings, deobfuscate) | |
| 2714 | 2832 | for (subfilename, stream_path, form_string) in self.extract_form_strings(): |
| 2715 | 2833 | print '-' * 79 |
| 2716 | 2834 | print 'VBA FORM STRING IN %r - OLE stream: %r' % (subfilename, stream_path) |
| 2717 | 2835 | print '- ' * 39 |
| 2718 | 2836 | print form_string |
| 2719 | - if global_analysis and not vba_code_only: | |
| 2837 | + if not vba_code_only: | |
| 2720 | 2838 | # analyse the code from all modules at once: |
| 2721 | 2839 | self.print_analysis(show_decoded_strings, deobfuscate) |
| 2722 | 2840 | if show_deobfuscated_code: |
| ... | ... | @@ -2724,20 +2842,16 @@ class VBA_Parser_CLI(VBA_Parser): |
| 2724 | 2842 | print self.reveal() |
| 2725 | 2843 | else: |
| 2726 | 2844 | print 'No VBA macros found.' |
| 2727 | - except KeyboardInterrupt: | |
| 2728 | - # do not ignore exceptions when the user presses Ctrl+C/Pause: | |
| 2729 | - raise | |
| 2730 | - except: #TypeError: | |
| 2731 | - #raise | |
| 2732 | - #TODO: print more info if debug mode | |
| 2733 | - #print sys.exc_value | |
| 2734 | - # display the exception with full stack trace for debugging, but do not stop: | |
| 2735 | - traceback.print_exc() | |
| 2845 | + except Exception as exc: | |
| 2846 | + # display the exception with full stack trace for debugging | |
| 2847 | + log.info('Error processing file %s (%s)' % (self.filename, exc)) | |
| 2848 | + log.debug('Traceback:', exc_info=True) | |
| 2849 | + raise ProcessingError(self.filename, exc) | |
| 2736 | 2850 | print '' |
| 2737 | 2851 | |
| 2738 | 2852 | |
| 2739 | 2853 | def process_file_json(self, show_decoded_strings=False, |
| 2740 | - display_code=True, global_analysis=True, hide_attributes=True, | |
| 2854 | + display_code=True, hide_attributes=True, | |
| 2741 | 2855 | vba_code_only=False, show_deobfuscated_code=False): |
| 2742 | 2856 | """ |
| 2743 | 2857 | Process a single file |
| ... | ... | @@ -2786,27 +2900,19 @@ class VBA_Parser_CLI(VBA_Parser): |
| 2786 | 2900 | curr_macro['ole_stream'] = stream_path |
| 2787 | 2901 | if display_code: |
| 2788 | 2902 | curr_macro['code'] = vba_code_filtered.strip() |
| 2789 | - if not global_analysis and not vba_code_only: | |
| 2790 | - # analyse each module's code, filtered to avoid false positives: | |
| 2791 | - #TODO: remove this option | |
| 2792 | - curr_macro['analysis'] = self.print_analysis_json(show_decoded_strings) | |
| 2793 | 2903 | macros.append(curr_macro) |
| 2794 | - if global_analysis and not vba_code_only: | |
| 2904 | + if not vba_code_only: | |
| 2795 | 2905 | # analyse the code from all modules at once: |
| 2796 | 2906 | result['analysis'] = self.print_analysis_json(show_decoded_strings) |
| 2797 | 2907 | if show_deobfuscated_code: |
| 2798 | 2908 | result['code_deobfuscated'] = self.reveal() |
| 2799 | 2909 | result['macros'] = macros |
| 2800 | 2910 | result['json_conversion_successful'] = True |
| 2801 | - except KeyboardInterrupt: | |
| 2802 | - # do not ignore exceptions when the user presses Ctrl+C/Pause: | |
| 2803 | - raise | |
| 2804 | - except: #TypeError: | |
| 2805 | - #raise | |
| 2806 | - #TODO: print more info if debug mode | |
| 2807 | - #print sys.exc_value | |
| 2808 | - # display the exception with full stack trace for debugging, but do not stop: | |
| 2809 | - traceback.print_exc() | |
| 2911 | + except Exception as exc: | |
| 2912 | + # display the exception with full stack trace for debugging | |
| 2913 | + log.info('Error processing file %s (%s)' % (self.filename, exc)) | |
| 2914 | + log.debug('Traceback:', exc_info=True) | |
| 2915 | + raise ProcessingError(self.filename, exc) | |
| 2810 | 2916 | |
| 2811 | 2917 | return result |
| 2812 | 2918 | |
| ... | ... | @@ -2816,57 +2922,46 @@ class VBA_Parser_CLI(VBA_Parser): |
| 2816 | 2922 | Process a file in triage mode, showing only summary results on one line. |
| 2817 | 2923 | """ |
| 2818 | 2924 | #TODO: replace print by writing to a provided output file (sys.stdout by default) |
| 2819 | - message = '' | |
| 2820 | 2925 | try: |
| 2821 | - if self.type is not None: | |
| 2822 | - #TODO: handle olefile errors, when an OLE file is malformed | |
| 2823 | - if self.detect_vba_macros(): | |
| 2824 | - # print a waiting message only if the output is not redirected to a file: | |
| 2825 | - if sys.stdout.isatty(): | |
| 2826 | - print 'Analysis...\r', | |
| 2827 | - sys.stdout.flush() | |
| 2828 | - self.analyze_macros(show_decoded_strings=show_decoded_strings, | |
| 2829 | - deobfuscate=deobfuscate) | |
| 2830 | - flags = TYPE2TAG[self.type] | |
| 2831 | - macros = autoexec = suspicious = iocs = hexstrings = base64obf = dridex = vba_obf = '-' | |
| 2832 | - if self.contains_macros: macros = 'M' | |
| 2833 | - if self.nb_autoexec: autoexec = 'A' | |
| 2834 | - if self.nb_suspicious: suspicious = 'S' | |
| 2835 | - if self.nb_iocs: iocs = 'I' | |
| 2836 | - if self.nb_hexstrings: hexstrings = 'H' | |
| 2837 | - if self.nb_base64strings: base64obf = 'B' | |
| 2838 | - if self.nb_dridexstrings: dridex = 'D' | |
| 2839 | - if self.nb_vbastrings: vba_obf = 'V' | |
| 2840 | - flags += '%s%s%s%s%s%s%s%s' % (macros, autoexec, suspicious, iocs, hexstrings, | |
| 2841 | - base64obf, dridex, vba_obf) | |
| 2842 | - # old table display: | |
| 2843 | - # macros = autoexec = suspicious = iocs = hexstrings = 'no' | |
| 2844 | - # if nb_macros: macros = 'YES:%d' % nb_macros | |
| 2845 | - # if nb_autoexec: autoexec = 'YES:%d' % nb_autoexec | |
| 2846 | - # if nb_suspicious: suspicious = 'YES:%d' % nb_suspicious | |
| 2847 | - # if nb_iocs: iocs = 'YES:%d' % nb_iocs | |
| 2848 | - # if nb_hexstrings: hexstrings = 'YES:%d' % nb_hexstrings | |
| 2849 | - # # 2nd line = info | |
| 2850 | - # print '%-8s %-7s %-7s %-7s %-7s %-7s' % (self.type, macros, autoexec, suspicious, iocs, hexstrings) | |
| 2851 | - else: | |
| 2852 | - # self.type==None | |
| 2853 | - # file type not OLE nor OpenXML | |
| 2854 | - flags = '?' | |
| 2855 | - message = 'File format not supported' | |
| 2856 | - except KeyboardInterrupt: | |
| 2857 | - # do not ignore exceptions when the user presses Ctrl+C/Pause: | |
| 2858 | - raise | |
| 2859 | - except: | |
| 2860 | - # another error occurred | |
| 2861 | - #raise | |
| 2862 | - #TODO: print more info if debug mode | |
| 2863 | - #TODO: distinguish real errors from incorrect file types | |
| 2864 | - flags = '!ERROR' | |
| 2865 | - message = sys.exc_value | |
| 2866 | - line = '%-12s %s' % (flags, self.filename) | |
| 2867 | - if message: | |
| 2868 | - line += ' - %s' % message | |
| 2869 | - print line | |
| 2926 | + #TODO: handle olefile errors, when an OLE file is malformed | |
| 2927 | + if self.detect_vba_macros(): | |
| 2928 | + # print a waiting message only if the output is not redirected to a file: | |
| 2929 | + if sys.stdout.isatty(): | |
| 2930 | + print 'Analysis...\r', | |
| 2931 | + sys.stdout.flush() | |
| 2932 | + self.analyze_macros(show_decoded_strings=show_decoded_strings, | |
| 2933 | + deobfuscate=deobfuscate) | |
| 2934 | + flags = TYPE2TAG[self.type] | |
| 2935 | + macros = autoexec = suspicious = iocs = hexstrings = base64obf = dridex = vba_obf = '-' | |
| 2936 | + if self.contains_macros: macros = 'M' | |
| 2937 | + if self.nb_autoexec: autoexec = 'A' | |
| 2938 | + if self.nb_suspicious: suspicious = 'S' | |
| 2939 | + if self.nb_iocs: iocs = 'I' | |
| 2940 | + if self.nb_hexstrings: hexstrings = 'H' | |
| 2941 | + if self.nb_base64strings: base64obf = 'B' | |
| 2942 | + if self.nb_dridexstrings: dridex = 'D' | |
| 2943 | + if self.nb_vbastrings: vba_obf = 'V' | |
| 2944 | + flags += '%s%s%s%s%s%s%s%s' % (macros, autoexec, suspicious, iocs, hexstrings, | |
| 2945 | + base64obf, dridex, vba_obf) | |
| 2946 | + | |
| 2947 | + line = '%-12s %s' % (flags, self.filename) | |
| 2948 | + print line | |
| 2949 | + | |
| 2950 | + # old table display: | |
| 2951 | + # macros = autoexec = suspicious = iocs = hexstrings = 'no' | |
| 2952 | + # if nb_macros: macros = 'YES:%d' % nb_macros | |
| 2953 | + # if nb_autoexec: autoexec = 'YES:%d' % nb_autoexec | |
| 2954 | + # if nb_suspicious: suspicious = 'YES:%d' % nb_suspicious | |
| 2955 | + # if nb_iocs: iocs = 'YES:%d' % nb_iocs | |
| 2956 | + # if nb_hexstrings: hexstrings = 'YES:%d' % nb_hexstrings | |
| 2957 | + # # 2nd line = info | |
| 2958 | + # print '%-8s %-7s %-7s %-7s %-7s %-7s' % (self.type, macros, autoexec, suspicious, iocs, hexstrings) | |
| 2959 | + except Exception as exc: | |
| 2960 | + # display the exception with full stack trace for debugging only | |
| 2961 | + log.debug('Error processing file %s (%s)' % (self.filename, exc), | |
| 2962 | + exc_info=True) | |
| 2963 | + raise ProcessingError(self.filename, exc) | |
| 2964 | + | |
| 2870 | 2965 | |
| 2871 | 2966 | # t = prettytable.PrettyTable(('filename', 'type', 'macros', 'autoexec', 'suspicious', 'ioc', 'hexstrings'), |
| 2872 | 2967 | # header=False, border=False) |
| ... | ... | @@ -2888,7 +2983,6 @@ def main(): |
| 2888 | 2983 | """ |
| 2889 | 2984 | Main function, called when olevba is run from the command line |
| 2890 | 2985 | """ |
| 2891 | - global log | |
| 2892 | 2986 | DEFAULT_LOG_LEVEL = "warning" # Default log level |
| 2893 | 2987 | LOG_LEVELS = { |
| 2894 | 2988 | 'debug': logging.DEBUG, |
| ... | ... | @@ -2944,13 +3038,14 @@ def main(): |
| 2944 | 3038 | if len(args) == 0: |
| 2945 | 3039 | print __doc__ |
| 2946 | 3040 | parser.print_help() |
| 2947 | - sys.exit() | |
| 3041 | + sys.exit(RETURN_WRONG_ARGS) | |
| 2948 | 3042 | |
| 2949 | 3043 | # provide info about tool and its version |
| 2950 | 3044 | if options.output_mode == 'json': |
| 2951 | - json_results = [dict(script_name='olevba', version=__version__, | |
| 2952 | - url='http://decalage.info/python/oletools', | |
| 2953 | - type='MetaInformation'), ] | |
| 3045 | + # prints opening [ | |
| 3046 | + print_json(script_name='olevba', version=__version__, | |
| 3047 | + url='http://decalage.info/python/oletools', | |
| 3048 | + type='MetaInformation') | |
| 2954 | 3049 | else: |
| 2955 | 3050 | print 'olevba %s - http://decalage.info/python/oletools' % __version__ |
| 2956 | 3051 | |
| ... | ... | @@ -2976,65 +3071,120 @@ def main(): |
| 2976 | 3071 | count = 0 |
| 2977 | 3072 | container = filename = data = None |
| 2978 | 3073 | vba_parser = None |
| 2979 | - for container, filename, data in xglob.iter_files(args, recursive=options.recursive, | |
| 2980 | - zip_password=options.zip_password, zip_fname=options.zip_fname): | |
| 2981 | - # ignore directory names stored in zip files: | |
| 2982 | - if container and filename.endswith('/'): | |
| 2983 | - continue | |
| 2984 | - # Open the file | |
| 2985 | - vba_parser = VBA_Parser_CLI(filename, data=data, container=container) | |
| 2986 | - if options.output_mode == 'detailed': | |
| 2987 | - # fully detailed output | |
| 2988 | - vba_parser.process_file(show_decoded_strings=options.show_decoded_strings, | |
| 2989 | - display_code=options.display_code, global_analysis=True, #options.global_analysis, | |
| 2990 | - hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, | |
| 2991 | - show_deobfuscated_code=options.show_deobfuscated_code, | |
| 2992 | - deobfuscate=options.deobfuscate) | |
| 2993 | - elif options.output_mode in ('triage', 'unspecified'): | |
| 2994 | - # print container name when it changes: | |
| 2995 | - if container != previous_container: | |
| 2996 | - if container is not None: | |
| 2997 | - print '\nFiles in %s:' % container | |
| 2998 | - previous_container = container | |
| 2999 | - # summarized output for triage: | |
| 3000 | - vba_parser.process_file_triage(show_decoded_strings=options.show_decoded_strings, | |
| 3001 | - deobfuscate=options.deobfuscate) | |
| 3002 | - elif options.output_mode == 'json': | |
| 3003 | - json_results.append( | |
| 3004 | - vba_parser.process_file_json(show_decoded_strings=options.show_decoded_strings, | |
| 3005 | - display_code=options.display_code, global_analysis=True, #options.global_analysis, | |
| 3006 | - hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, | |
| 3007 | - show_deobfuscated_code=options.show_deobfuscated_code)) | |
| 3008 | - else: # (should be impossible) | |
| 3009 | - raise ValueError('unexpected output mode: "{0}"!'.format(options.output_mode)) | |
| 3010 | - count += 1 | |
| 3011 | - if options.output_mode == 'triage': | |
| 3012 | - print '\n(Flags: OpX=OpenXML, XML=Word2003XML, MHT=MHTML, TXT=Text, M=Macros, ' \ | |
| 3013 | - 'A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, ' \ | |
| 3014 | - 'B=Base64 strings, D=Dridex strings, V=VBA strings, ?=Unknown)\n' | |
| 3015 | - | |
| 3016 | - if count == 1 and options.output_mode == 'unspecified': | |
| 3017 | - # if options -t, -d and -j were not specified and it's a single file, print details: | |
| 3018 | - vba_parser.process_file(show_decoded_strings=options.show_decoded_strings, | |
| 3019 | - display_code=options.display_code, global_analysis=True, #options.global_analysis, | |
| 3020 | - hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, | |
| 3021 | - show_deobfuscated_code=options.show_deobfuscated_code, | |
| 3022 | - deobfuscate=options.deobfuscate) | |
| 3023 | - | |
| 3024 | - if options.output_mode == 'json': | |
| 3025 | - json_options = dict(check_circular=False, indent=4, ensure_ascii=False) | |
| 3026 | - | |
| 3027 | - # json.dump[s] cannot deal with unicode objects that are not properly | |
| 3028 | - # encoded --> encode in own function: | |
| 3029 | - json_results = json2ascii(json_results) | |
| 3030 | - #print_json(json_results) | |
| 3031 | - | |
| 3032 | - # if False: # options.outfile: # (option currently commented out) | |
| 3033 | - # with open(outfile, 'w') as write_handle: | |
| 3034 | - # json.dump(write_handle, **json_options) | |
| 3035 | - # else: | |
| 3036 | - print json.dumps(json_results, **json_options) | |
| 3074 | + return_code = RETURN_OK | |
| 3075 | + try: | |
| 3076 | + for container, filename, data in xglob.iter_files(args, recursive=options.recursive, | |
| 3077 | + zip_password=options.zip_password, zip_fname=options.zip_fname): | |
| 3078 | + # ignore directory names stored in zip files: | |
| 3079 | + if container and filename.endswith('/'): | |
| 3080 | + continue | |
| 3081 | + | |
| 3082 | + # handle errors from xglob | |
| 3083 | + if isinstance(data, Exception): | |
| 3084 | + if isinstance(data, PathNotFoundException): | |
| 3085 | + if options.output_mode in ('triage', 'unspecified'): | |
| 3086 | + print '%-12s %s - File not found' % ('?', filename) | |
| 3087 | + elif options.output_mode != 'json': | |
| 3088 | + log.error('Given path %r does not exist!' % filename) | |
| 3089 | + return_code = RETURN_FILE_NOT_FOUND if return_code == 0 \ | |
| 3090 | + else RETURN_SEVERAL_ERRS | |
| 3091 | + else: | |
| 3092 | + if options.output_mode in ('triage', 'unspecified'): | |
| 3093 | + print '%-12s %s - Failed to read from zip file %s' % ('?', filename, container) | |
| 3094 | + elif options.output_mode != 'json': | |
| 3095 | + log.error('Exception opening/reading %r from zip file %r: %s' | |
| 3096 | + % (filename, container, data)) | |
| 3097 | + return_code = RETURN_XGLOB_ERR if return_code == 0 \ | |
| 3098 | + else RETURN_SEVERAL_ERRS | |
| 3099 | + if options.output_mode == 'json': | |
| 3100 | + print_json(file=filename, type='error', | |
| 3101 | + error=type(data).__name__, message=str(data)) | |
| 3102 | + continue | |
| 3037 | 3103 | |
| 3104 | + try: | |
| 3105 | + # Open the file | |
| 3106 | + vba_parser = VBA_Parser_CLI(filename, data=data, container=container) | |
| 3107 | + | |
| 3108 | + if options.output_mode == 'detailed': | |
| 3109 | + # fully detailed output | |
| 3110 | + vba_parser.process_file(show_decoded_strings=options.show_decoded_strings, | |
| 3111 | + display_code=options.display_code, | |
| 3112 | + hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, | |
| 3113 | + show_deobfuscated_code=options.show_deobfuscated_code, | |
| 3114 | + deobfuscate=options.deobfuscate) | |
| 3115 | + elif options.output_mode in ('triage', 'unspecified'): | |
| 3116 | + # print container name when it changes: | |
| 3117 | + if container != previous_container: | |
| 3118 | + if container is not None: | |
| 3119 | + print '\nFiles in %s:' % container | |
| 3120 | + previous_container = container | |
| 3121 | + # summarized output for triage: | |
| 3122 | + vba_parser.process_file_triage(show_decoded_strings=options.show_decoded_strings, | |
| 3123 | + deobfuscate=options.deobfuscate) | |
| 3124 | + elif options.output_mode == 'json': | |
| 3125 | + print_json( | |
| 3126 | + vba_parser.process_file_json(show_decoded_strings=options.show_decoded_strings, | |
| 3127 | + display_code=options.display_code, | |
| 3128 | + hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, | |
| 3129 | + show_deobfuscated_code=options.show_deobfuscated_code)) | |
| 3130 | + else: # (should be impossible) | |
| 3131 | + raise ValueError('unexpected output mode: "{0}"!'.format(options.output_mode)) | |
| 3132 | + count += 1 | |
| 3133 | + | |
| 3134 | + except FileOpenError as exc: | |
| 3135 | + if options.output_mode in ('triage', 'unspecified'): | |
| 3136 | + print '%-12s %s - File format not supported' % ('?', filename) | |
| 3137 | + elif options.output_mode == 'json': | |
| 3138 | + print_json(file=filename, type='error', | |
| 3139 | + error=type(exc).__name__, message=str(exc)) | |
| 3140 | + else: | |
| 3141 | + log.exception('Failed to open %s -- probably not supported!' % filename) | |
| 3142 | + return_code = RETURN_OPEN_ERROR if return_code == 0 \ | |
| 3143 | + else RETURN_SEVERAL_ERRS | |
| 3144 | + except ProcessingError as exc: | |
| 3145 | + if options.output_mode in ('triage', 'unspecified'): | |
| 3146 | + print '%-12s %s - %s' % ('!ERROR', filename, exc.orig_exception) | |
| 3147 | + elif options.output_mode == 'json': | |
| 3148 | + print_json(file=filename, type='error', | |
| 3149 | + error=type(exc).__name__, | |
| 3150 | + message=str(exc.orig_exception)) | |
| 3151 | + else: | |
| 3152 | + log.exception('Error processing file %s (%s)!' | |
| 3153 | + % (filename, exc.orig_exception)) | |
| 3154 | + return_code = RETURN_PARSE_ERROR if return_code == 0 \ | |
| 3155 | + else RETURN_SEVERAL_ERRS | |
| 3156 | + finally: | |
| 3157 | + if vba_parser is not None: | |
| 3158 | + vba_parser.close() | |
| 3159 | + | |
| 3160 | + if options.output_mode == 'triage': | |
| 3161 | + print '\n(Flags: OpX=OpenXML, XML=Word2003XML, MHT=MHTML, TXT=Text, M=Macros, ' \ | |
| 3162 | + 'A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, ' \ | |
| 3163 | + 'B=Base64 strings, D=Dridex strings, V=VBA strings, ?=Unknown)\n' | |
| 3164 | + | |
| 3165 | + if count == 1 and options.output_mode == 'unspecified': | |
| 3166 | + # if options -t, -d and -j were not specified and it's a single file, print details: | |
| 3167 | + vba_parser.process_file(show_decoded_strings=options.show_decoded_strings, | |
| 3168 | + display_code=options.display_code, | |
| 3169 | + hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, | |
| 3170 | + show_deobfuscated_code=options.show_deobfuscated_code, | |
| 3171 | + deobfuscate=options.deobfuscate) | |
| 3172 | + | |
| 3173 | + if options.output_mode == 'json': | |
| 3174 | + # print last json entry (a last one without a comma) and closing ] | |
| 3175 | + print_json(type='MetaInformation', return_code=return_code, | |
| 3176 | + n_processed=count, _json_is_last=True) | |
| 3177 | + | |
| 3178 | + except Exception as exc: | |
| 3179 | + # some unexpected error, maybe some of the types caught in except clauses | |
| 3180 | + # above were not sufficient. This is very bad, so log complete trace at exception level | |
| 3181 | + # and do not care about output mode | |
| 3182 | + log.exception('Unhandled exception in main: %s' % exc, exc_info=True) | |
| 3183 | + return_code = RETURN_UNEXPECTED # even if there were others before -- this is more important | |
| 3184 | + | |
| 3185 | + # done. exit | |
| 3186 | + log.debug('will exit now with code %s' % return_code) | |
| 3187 | + sys.exit(return_code) | |
| 3038 | 3188 | |
| 3039 | 3189 | if __name__ == '__main__': |
| 3040 | 3190 | main() | ... | ... |
oletools/rtfobj.py
| ... | ... | @@ -48,8 +48,11 @@ http://www.decalage.info/python/oletools |
| 48 | 48 | # - extract files from OLE Package objects |
| 49 | 49 | # 2016-04-01 v0.04 PL: - fixed logging output to use stdout instead of stderr |
| 50 | 50 | # 2016-04-07 v0.45 PL: - improved parsing to handle some malware tricks |
| 51 | +# 2016-05-06 v0.47 TJ: - added option -d to set the output directory | |
| 52 | +# (contribution by Thomas Jarosch) | |
| 53 | +# TJ: - sanitize filenames to avoid special characters | |
| 51 | 54 | |
| 52 | -__version__ = '0.45' | |
| 55 | +__version__ = '0.47' | |
| 53 | 56 | |
| 54 | 57 | #------------------------------------------------------------------------------ |
| 55 | 58 | # TODO: |
| ... | ... | @@ -60,7 +63,7 @@ __version__ = '0.45' |
| 60 | 63 | |
| 61 | 64 | #=== IMPORTS ================================================================= |
| 62 | 65 | |
| 63 | -import re, sys, string, binascii, logging, optparse | |
| 66 | +import re, os, sys, string, binascii, logging, optparse | |
| 64 | 67 | |
| 65 | 68 | from thirdparty.xglob import xglob |
| 66 | 69 | from oleobj import OleObject, OleNativeStream |
| ... | ... | @@ -280,7 +283,42 @@ def rtf_iter_objects (data, min_size=32): |
| 280 | 283 | match = re_hexblock.search(data, pos=current) |
| 281 | 284 | |
| 282 | 285 | |
| 283 | -def process_file(container, filename, data): | |
| 286 | + | |
def sanitize_filename(filename, replacement='_', max_length=200):
    """Compute a safe basename from `filename`.

    Keeps only the basename (never a path), strips surrounding whitespace
    and replaces every character outside the whitelist (alphanumerics,
    underscore, '.', '-' and space) with `replacement`.  Runs of '..' are
    collapsed to a single '.' (directory-traversal / double-extension
    tricks) and runs of spaces to a single space.  If nothing usable
    remains, 'NONAME' is returned.  The result is truncated to
    `max_length` characters (set max_length to 0/None to disable).

    :param filename: file name or path to sanitize (str)
    :param replacement: character substituted for non-whitelisted chars
    :param max_length: maximum length of the returned name, or falsy to
        disable truncation
    :return: sanitized basename (str), never empty
    """
    basepath = os.path.basename(filename).strip()
    sane_fname = re.sub(r'[^\w\.\- ]', replacement, basepath)

    # collapse '..' sequences to avoid traversal-looking names
    while '..' in sane_fname:
        sane_fname = sane_fname.replace('..', '.')

    # collapse runs of spaces
    while '  ' in sane_fname:
        sane_fname = sane_fname.replace('  ', ' ')

    # BUGFIX: check the *sanitized* result, not the original argument --
    # otherwise a non-empty input that strips down to '' (e.g. '   ')
    # yields an empty basename instead of the NONAME fallback.
    if not len(sane_fname):
        sane_fname = 'NONAME'

    # limit filename length
    if max_length:
        sane_fname = sane_fname[:max_length]

    return sane_fname
| 307 | + | |
| 308 | + | |
| 309 | +def process_file(container, filename, data, output_dir=None): | |
| 310 | + if output_dir: | |
| 311 | + if not os.path.isdir(output_dir): | |
| 312 | + log.info('creating output directory %s' % output_dir) | |
| 313 | + os.mkdir(output_dir) | |
| 314 | + | |
| 315 | + fname_prefix = os.path.join(output_dir, | |
| 316 | + sanitize_filename(filename)) | |
| 317 | + else: | |
| 318 | + base_dir = os.path.dirname(filename) | |
| 319 | + sane_fname = sanitize_filename(filename) | |
| 320 | + fname_prefix = os.path.join(base_dir, sane_fname) | |
| 321 | + | |
| 284 | 322 | # TODO: option to extract objects to files (false by default) |
| 285 | 323 | if data is None: |
| 286 | 324 | data = open(filename, 'rb').read() |
| ... | ... | @@ -288,7 +326,7 @@ def process_file(container, filename, data): |
| 288 | 326 | print 'File: %r - %d bytes' % (filename, len(data)) |
| 289 | 327 | for index, orig_len, objdata in rtf_iter_objects(data): |
| 290 | 328 | print 'found object size %d at index %08X - end %08X' % (len(objdata), index, index+orig_len) |
| 291 | - fname = '%s_object_%08X.raw' % (filename, index) | |
| 329 | + fname = '%s_object_%08X.raw' % (fname_prefix, index) | |
| 292 | 330 | print 'saving object to file %s' % fname |
| 293 | 331 | open(fname, 'wb').write(objdata) |
| 294 | 332 | # TODO: check if all hex data is extracted properly |
| ... | ... | @@ -308,7 +346,8 @@ def process_file(container, filename, data): |
| 308 | 346 | ext = 'package' |
| 309 | 347 | else: |
| 310 | 348 | ext = 'bin' |
| 311 | - fname = '%s_object_%08X.%s' % (filename, index, ext) | |
| 349 | + | |
| 350 | + fname = '%s_object_%08X.%s' % (fname_prefix, index, ext) | |
| 312 | 351 | print 'saving to file %s' % fname |
| 313 | 352 | open(fname, 'wb').write(obj.data) |
| 314 | 353 | if obj.class_name.lower() == 'package': |
| ... | ... | @@ -318,9 +357,10 @@ def process_file(container, filename, data): |
| 318 | 357 | print 'Source path = %r' % opkg.src_path |
| 319 | 358 | print 'Temp path = %r' % opkg.temp_path |
| 320 | 359 | if opkg.filename: |
| 321 | - fname = '%s_%s' % (filename, opkg.filename) | |
| 360 | + fname = '%s_%s' % (fname_prefix, | |
| 361 | + sanitize_filename(opkg.filename)) | |
| 322 | 362 | else: |
| 323 | - fname = '%s_object_%08X.noname' % (filename, index) | |
| 363 | + fname = '%s_object_%08X.noname' % (fname_prefix, index) | |
| 324 | 364 | print 'saving to file %s' % fname |
| 325 | 365 | open(fname, 'wb').write(opkg.data) |
| 326 | 366 | except: |
| ... | ... | @@ -354,6 +394,8 @@ if __name__ == '__main__': |
| 354 | 394 | # help='export results to a CSV file') |
| 355 | 395 | parser.add_option("-r", action="store_true", dest="recursive", |
| 356 | 396 | help='find files recursively in subdirectories.') |
| 397 | + parser.add_option("-d", type="str", dest="output_dir", | |
| 398 | + help='use specified directory to output files.', default=None) | |
| 357 | 399 | parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None, |
| 358 | 400 | help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)') |
| 359 | 401 | parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*', |
| ... | ... | @@ -384,7 +426,7 @@ if __name__ == '__main__': |
| 384 | 426 | # ignore directory names stored in zip files: |
| 385 | 427 | if container and filename.endswith('/'): |
| 386 | 428 | continue |
| 387 | - process_file(container, filename, data) | |
| 429 | + process_file(container, filename, data, options.output_dir) | |
| 388 | 430 | |
| 389 | 431 | |
| 390 | 432 | ... | ... |
oletools/thirdparty/olefile/olefile.py
100644 โ 100755
| 1 | -#!/usr/bin/env python | |
| 2 | - | |
| 3 | -# olefile (formerly OleFileIO_PL) | |
| 4 | -# | |
| 5 | -# Module to read/write Microsoft OLE2 files (also called Structured Storage or | |
| 6 | -# Microsoft Compound Document File Format), such as Microsoft Office 97-2003 | |
| 7 | -# documents, Image Composer and FlashPix files, Outlook messages, ... | |
| 8 | -# This version is compatible with Python 2.6+ and 3.x | |
| 9 | -# | |
| 10 | -# Project website: http://www.decalage.info/olefile | |
| 11 | -# | |
| 12 | -# olefile is copyright (c) 2005-2016 Philippe Lagadec (http://www.decalage.info) | |
| 13 | -# | |
| 14 | -# olefile is based on the OleFileIO module from the PIL library v1.1.6 | |
| 15 | -# See: http://www.pythonware.com/products/pil/index.htm | |
| 16 | -# | |
| 17 | -# The Python Imaging Library (PIL) is | |
| 18 | -# Copyright (c) 1997-2005 by Secret Labs AB | |
| 19 | -# Copyright (c) 1995-2005 by Fredrik Lundh | |
| 20 | -# | |
| 21 | -# See source code and LICENSE.txt for information on usage and redistribution. | |
| 22 | - | |
| 23 | - | |
| 24 | -# Since OleFileIO_PL v0.30, only Python 2.6+ and 3.x is supported | |
| 25 | -# This import enables print() as a function rather than a keyword | |
| 26 | -# (main requirement to be compatible with Python 3.x) | |
| 27 | -# The comment on the line below should be printed on Python 2.5 or older: | |
| 28 | -from __future__ import print_function # This version of olefile requires Python 2.6+ or 3.x. | |
| 29 | - | |
| 30 | - | |
| 31 | -__author__ = "Philippe Lagadec" | |
| 32 | -__date__ = "2016-02-02" | |
| 33 | -__version__ = '0.44' | |
| 34 | - | |
| 35 | -#--- LICENSE ------------------------------------------------------------------ | |
| 36 | - | |
| 37 | -# olefile (formerly OleFileIO_PL) is copyright (c) 2005-2016 Philippe Lagadec | |
| 38 | -# (http://www.decalage.info) | |
| 39 | -# | |
| 40 | -# All rights reserved. | |
| 41 | -# | |
| 42 | -# Redistribution and use in source and binary forms, with or without modification, | |
| 43 | -# are permitted provided that the following conditions are met: | |
| 44 | -# | |
| 45 | -# * Redistributions of source code must retain the above copyright notice, this | |
| 46 | -# list of conditions and the following disclaimer. | |
| 47 | -# * Redistributions in binary form must reproduce the above copyright notice, | |
| 48 | -# this list of conditions and the following disclaimer in the documentation | |
| 49 | -# and/or other materials provided with the distribution. | |
| 50 | -# | |
| 51 | -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
| 52 | -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| 53 | -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| 54 | -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | |
| 55 | -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 56 | -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
| 57 | -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
| 58 | -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
| 59 | -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| 60 | -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 61 | - | |
| 62 | -# ---------- | |
| 63 | -# PIL License: | |
| 64 | -# | |
| 65 | -# olefile is based on source code from the OleFileIO module of the Python | |
| 66 | -# Imaging Library (PIL) published by Fredrik Lundh under the following license: | |
| 67 | - | |
| 68 | -# The Python Imaging Library (PIL) is | |
| 69 | -# Copyright (c) 1997-2005 by Secret Labs AB | |
| 70 | -# Copyright (c) 1995-2005 by Fredrik Lundh | |
| 71 | -# | |
| 72 | -# By obtaining, using, and/or copying this software and/or its associated | |
| 73 | -# documentation, you agree that you have read, understood, and will comply with | |
| 74 | -# the following terms and conditions: | |
| 75 | -# | |
| 76 | -# Permission to use, copy, modify, and distribute this software and its | |
| 77 | -# associated documentation for any purpose and without fee is hereby granted, | |
| 78 | -# provided that the above copyright notice appears in all copies, and that both | |
| 79 | -# that copyright notice and this permission notice appear in supporting | |
| 80 | -# documentation, and that the name of Secret Labs AB or the author(s) not be used | |
| 81 | -# in advertising or publicity pertaining to distribution of the software | |
| 82 | -# without specific, written prior permission. | |
| 83 | -# | |
| 84 | -# SECRET LABS AB AND THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS | |
| 85 | -# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. | |
| 86 | -# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHORS BE LIABLE FOR ANY SPECIAL, | |
| 87 | -# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM | |
| 88 | -# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR | |
| 89 | -# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR | |
| 90 | -# PERFORMANCE OF THIS SOFTWARE. | |
| 91 | - | |
| 92 | -#----------------------------------------------------------------------------- | |
| 93 | -# CHANGELOG: (only olefile/OleFileIO_PL changes compared to PIL 1.1.6) | |
| 94 | -# 2005-05-11 v0.10 PL: - a few fixes for Python 2.4 compatibility | |
| 95 | -# (all changes flagged with [PL]) | |
| 96 | -# 2006-02-22 v0.11 PL: - a few fixes for some Office 2003 documents which raise | |
| 97 | -# exceptions in _OleStream.__init__() | |
| 98 | -# 2006-06-09 v0.12 PL: - fixes for files above 6.8MB (DIFAT in loadfat) | |
| 99 | -# - added some constants | |
| 100 | -# - added header values checks | |
| 101 | -# - added some docstrings | |
| 102 | -# - getsect: bugfix in case sectors >512 bytes | |
| 103 | -# - getsect: added conformity checks | |
| 104 | -# - DEBUG_MODE constant to activate debug display | |
| 105 | -# 2007-09-04 v0.13 PL: - improved/translated (lots of) comments | |
| 106 | -# - updated license | |
| 107 | -# - converted tabs to 4 spaces | |
| 108 | -# 2007-11-19 v0.14 PL: - added OleFileIO._raise_defect() to adapt sensitivity | |
| 109 | -# - improved _unicode() to use Python 2.x unicode support | |
| 110 | -# - fixed bug in _OleDirectoryEntry | |
| 111 | -# 2007-11-25 v0.15 PL: - added safety checks to detect FAT loops | |
| 112 | -# - fixed _OleStream which didn't check stream size | |
| 113 | -# - added/improved many docstrings and comments | |
| 114 | -# - moved helper functions _unicode and _clsid out of | |
| 115 | -# OleFileIO class | |
| 116 | -# - improved OleFileIO._find() to add Unix path syntax | |
| 117 | -# - OleFileIO._find() is now case-insensitive | |
| 118 | -# - added get_type() and get_rootentry_name() | |
| 119 | -# - rewritten loaddirectory and _OleDirectoryEntry | |
| 120 | -# 2007-11-27 v0.16 PL: - added _OleDirectoryEntry.kids_dict | |
| 121 | -# - added detection of duplicate filenames in storages | |
| 122 | -# - added detection of duplicate references to streams | |
| 123 | -# - added get_size() and exists() to _OleDirectoryEntry | |
| 124 | -# - added isOleFile to check header before parsing | |
| 125 | -# - added __all__ list to control public keywords in pydoc | |
| 126 | -# 2007-12-04 v0.17 PL: - added _load_direntry to fix a bug in loaddirectory | |
| 127 | -# - improved _unicode(), added workarounds for Python <2.3 | |
| 128 | -# - added set_debug_mode and -d option to set debug mode | |
| 129 | -# - fixed bugs in OleFileIO.open and _OleDirectoryEntry | |
| 130 | -# - added safety check in main for large or binary | |
| 131 | -# properties | |
| 132 | -# - allow size>0 for storages for some implementations | |
| 133 | -# 2007-12-05 v0.18 PL: - fixed several bugs in handling of FAT, MiniFAT and | |
| 134 | -# streams | |
| 135 | -# - added option '-c' in main to check all streams | |
| 136 | -# 2009-12-10 v0.19 PL: - bugfix for 32 bit arrays on 64 bits platforms | |
| 137 | -# (thanks to Ben G. and Martijn for reporting the bug) | |
| 138 | -# 2009-12-11 v0.20 PL: - bugfix in OleFileIO.open when filename is not plain str | |
| 139 | -# 2010-01-22 v0.21 PL: - added support for big-endian CPUs such as PowerPC Macs | |
| 140 | -# 2012-02-16 v0.22 PL: - fixed bug in getproperties, patch by chuckleberryfinn | |
| 141 | -# (https://bitbucket.org/decalage/olefileio_pl/issue/7) | |
| 142 | -# - added close method to OleFileIO (fixed issue #2) | |
| 143 | -# 2012-07-25 v0.23 PL: - added support for file-like objects (patch by mete0r_kr) | |
| 144 | -# 2013-05-05 v0.24 PL: - getproperties: added conversion from filetime to python | |
| 145 | -# datetime | |
| 146 | -# - main: displays properties with date format | |
| 147 | -# - new class OleMetadata to parse standard properties | |
| 148 | -# - added get_metadata method | |
| 149 | -# 2013-05-07 v0.24 PL: - a few improvements in OleMetadata | |
| 150 | -# 2013-05-24 v0.25 PL: - getproperties: option to not convert some timestamps | |
| 151 | -# - OleMetaData: total_edit_time is now a number of seconds, | |
| 152 | -# not a timestamp | |
| 153 | -# - getproperties: added support for VT_BOOL, VT_INT, V_UINT | |
| 154 | -# - getproperties: filter out null chars from strings | |
| 155 | -# - getproperties: raise non-fatal defects instead of | |
| 156 | -# exceptions when properties cannot be parsed properly | |
| 157 | -# 2013-05-27 PL: - getproperties: improved exception handling | |
| 158 | -# - _raise_defect: added option to set exception type | |
| 159 | -# - all non-fatal issues are now recorded, and displayed | |
| 160 | -# when run as a script | |
| 161 | -# 2013-07-11 v0.26 PL: - added methods to get modification and creation times | |
| 162 | -# of a directory entry or a storage/stream | |
| 163 | -# - fixed parsing of direntry timestamps | |
| 164 | -# 2013-07-24 PL: - new options in listdir to list storages and/or streams | |
| 165 | -# 2014-02-04 v0.30 PL: - upgraded code to support Python 3.x by Martin Panter | |
| 166 | -# - several fixes for Python 2.6 (xrange, MAGIC) | |
| 167 | -# - reused i32 from Pillow's _binary | |
| 168 | -# 2014-07-18 v0.31 - preliminary support for 4K sectors | |
| 169 | -# 2014-07-27 v0.31 PL: - a few improvements in OleFileIO.open (header parsing) | |
| 170 | -# - Fixed loadfat for large files with 4K sectors (issue #3) | |
| 171 | -# 2014-07-30 v0.32 PL: - added write_sect to write sectors to disk | |
| 172 | -# - added write_mode option to OleFileIO.__init__ and open | |
| 173 | -# 2014-07-31 PL: - fixed padding in write_sect for Python 3, added checks | |
| 174 | -# - added write_stream to write a stream to disk | |
| 175 | -# 2014-09-26 v0.40 PL: - renamed OleFileIO_PL to olefile | |
| 176 | -# 2014-11-09 NE: - added support for Jython (Niko Ehrenfeuchter) | |
| 177 | -# 2014-11-13 v0.41 PL: - improved isOleFile and OleFileIO.open to support OLE | |
| 178 | -# data in a string buffer and file-like objects. | |
| 179 | -# 2014-11-21 PL: - updated comments according to Pillow's commits | |
| 180 | -# 2015-01-24 v0.42 PL: - changed the default path name encoding from Latin-1 | |
| 181 | -# to UTF-8 on Python 2.x (Unicode on Python 3.x) | |
| 182 | -# - added path_encoding option to override the default | |
| 183 | -# - fixed a bug in _list when a storage is empty | |
| 184 | -# 2015-04-17 v0.43 PL: - slight changes in _OleDirectoryEntry | |
| 185 | -# 2015-10-19 - fixed issue #26 in OleFileIO.getproperties | |
| 186 | -# (using id and type as local variable names) | |
| 187 | -# 2015-10-29 - replaced debug() with proper logging | |
| 188 | -# - use optparse to handle command line options | |
| 189 | -# - improved attribute names in OleFileIO class | |
| 190 | -# 2015-11-05 - fixed issue #27 by correcting the MiniFAT sector | |
| 191 | -# cutoff size if invalid. | |
| 192 | -# 2016-02-02 - logging is disabled by default | |
| 193 | - | |
| 194 | -#----------------------------------------------------------------------------- | |
| 195 | -# TODO (for version 1.0): | |
| 196 | -# + get rid of print statements, to simplify Python 2.x and 3.x support | |
| 197 | -# + add is_stream and is_storage | |
| 198 | -# + remove leading and trailing slashes where a path is used | |
| 199 | -# + add functions path_list2str and path_str2list | |
| 200 | -# + fix how all the methods handle unicode str and/or bytes as arguments | |
| 201 | -# + add path attrib to _OleDirEntry, set it once and for all in init or | |
| 202 | -# append_kids (then listdir/_list can be simplified) | |
| 203 | -# - TESTS with Linux, MacOSX, Python 1.5.2, various files, PIL, ... | |
| 204 | -# - add underscore to each private method, to avoid their display in | |
| 205 | -# pydoc/epydoc documentation - Remove it for classes to be documented | |
| 206 | -# - replace all raised exceptions with _raise_defect (at least in OleFileIO) | |
| 207 | -# - merge code from _OleStream and OleFileIO.getsect to read sectors | |
| 208 | -# (maybe add a class for FAT and MiniFAT ?) | |
| 209 | -# - add method to check all streams (follow sectors chains without storing all | |
| 210 | -# stream in memory, and report anomalies) | |
| 211 | -# - use _OleDirectoryEntry.kids_dict to improve _find and _list ? | |
| 212 | -# - fix Unicode names handling (find some way to stay compatible with Py1.5.2) | |
| 213 | -# => if possible avoid converting names to Latin-1 | |
| 214 | -# - review DIFAT code: fix handling of DIFSECT blocks in FAT (not stop) | |
| 215 | -# - rewrite OleFileIO.getproperties | |
| 216 | -# - improve docstrings to show more sample uses | |
| 217 | -# - see also original notes and FIXME below | |
| 218 | -# - remove all obsolete FIXMEs | |
| 219 | -# - OleMetadata: fix version attrib according to | |
| 220 | -# http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx | |
| 221 | - | |
| 222 | -# IDEAS: | |
| 223 | -# - in OleFileIO._open and _OleStream, use size=None instead of 0x7FFFFFFF for | |
| 224 | -# streams with unknown size | |
| 225 | -# - use arrays of int instead of long integers for FAT/MiniFAT, to improve | |
| 226 | -# performance and reduce memory usage ? (possible issue with values >2^31) | |
| 227 | -# - provide tests with unittest (may need write support to create samples) | |
| 228 | -# - move all debug code (and maybe dump methods) to a separate module, with | |
| 229 | -# a class which inherits OleFileIO ? | |
| 230 | -# - fix docstrings to follow epydoc format | |
| 231 | -# - add support for big endian byte order ? | |
| 232 | -# - create a simple OLE explorer with wxPython | |
| 233 | - | |
| 234 | -# FUTURE EVOLUTIONS to add write support: | |
| 235 | -# see issue #6 on Bitbucket: | |
| 236 | -# https://bitbucket.org/decalage/olefileio_pl/issue/6/improve-olefileio_pl-to-write-ole-files | |
| 237 | - | |
| 238 | -#----------------------------------------------------------------------------- | |
| 239 | -# NOTES from PIL 1.1.6: | |
| 240 | - | |
| 241 | -# History: | |
| 242 | -# 1997-01-20 fl Created | |
| 243 | -# 1997-01-22 fl Fixed 64-bit portability quirk | |
| 244 | -# 2003-09-09 fl Fixed typo in OleFileIO.loadfat (noted by Daniel Haertle) | |
| 245 | -# 2004-02-29 fl Changed long hex constants to signed integers | |
| 246 | -# | |
| 247 | -# Notes: | |
| 248 | -# FIXME: sort out sign problem (eliminate long hex constants) | |
| 249 | -# FIXME: change filename to use "a/b/c" instead of ["a", "b", "c"] | |
| 250 | -# FIXME: provide a glob mechanism function (using fnmatchcase) | |
| 251 | -# | |
| 252 | -# Literature: | |
| 253 | -# | |
| 254 | -# "FlashPix Format Specification, Appendix A", Kodak and Microsoft, | |
| 255 | -# September 1996. | |
| 256 | -# | |
| 257 | -# Quotes: | |
| 258 | -# | |
| 259 | -# "If this document and functionality of the Software conflict, | |
| 260 | -# the actual functionality of the Software represents the correct | |
| 261 | -# functionality" -- Microsoft, in the OLE format specification | |
| 262 | - | |
| 263 | -#------------------------------------------------------------------------------ | |
| 264 | - | |
| 265 | - | |
| 266 | -import io | |
| 267 | -import sys | |
| 268 | -import struct, array, os.path, datetime, logging | |
| 269 | - | |
| 270 | -#=== COMPATIBILITY WORKAROUNDS ================================================ | |
| 271 | - | |
| 272 | -#[PL] Define explicitly the public API to avoid private objects in pydoc: | |
| 273 | -#TODO: add more | |
| 274 | -# __all__ = ['OleFileIO', 'isOleFile', 'MAGIC'] | |
| 275 | - | |
| 276 | -# For Python 3.x, need to redefine long as int: | |
| 277 | -if str is not bytes: | |
| 278 | - long = int | |
| 279 | - | |
| 280 | -# Need to make sure we use xrange both on Python 2 and 3.x: | |
| 281 | -try: | |
| 282 | - # on Python 2 we need xrange: | |
| 283 | - iterrange = xrange | |
| 284 | -except: | |
| 285 | - # no xrange, for Python 3 it was renamed as range: | |
| 286 | - iterrange = range | |
| 287 | - | |
| 288 | -#[PL] workaround to fix an issue with array item size on 64 bits systems: | |
| 289 | -if array.array('L').itemsize == 4: | |
| 290 | - # on 32 bits platforms, long integers in an array are 32 bits: | |
| 291 | - UINT32 = 'L' | |
| 292 | -elif array.array('I').itemsize == 4: | |
| 293 | - # on 64 bits platforms, integers in an array are 32 bits: | |
| 294 | - UINT32 = 'I' | |
| 295 | -elif array.array('i').itemsize == 4: | |
| 296 | - # On 64 bit Jython, signed integers ('i') are the only way to store our 32 | |
| 297 | - # bit values in an array in a *somewhat* reasonable way, as the otherwise | |
| 298 | -# perfectly suited 'I' (unsigned int, 32 bits) results in a completely | |
| 299 | - # unusable behaviour. This is most likely caused by the fact that Java | |
| 300 | - # doesn't have unsigned values, and thus Jython's "array" implementation, | |
| 301 | - # which is based on "jarray", doesn't have them either. | |
| 302 | - # NOTE: to trick Jython into converting the values it would normally | |
| 303 | - # interpret as "signed" into "unsigned", a binary-and operation with | |
| 304 | - # 0xFFFFFFFF can be used. This way it is possible to use the same comparing | |
| 305 | - # operations on all platforms / implementations. The corresponding code | |
| 306 | - # lines are flagged with a 'JYTHON-WORKAROUND' tag below. | |
| 307 | - UINT32 = 'i' | |
| 308 | -else: | |
| 309 | - raise ValueError('Need to fix a bug with 32 bit arrays, please contact author...') | |
| 310 | - | |
| 311 | - | |
| 312 | -#[PL] These workarounds were inspired from the Path module | |
| 313 | -# (see http://www.jorendorff.com/articles/python/path/) | |
| 314 | -#TODO: test with old Python versions | |
| 315 | - | |
| 316 | -# Pre-2.3 workaround for basestring. | |
| 317 | -try: | |
| 318 | - basestring | |
| 319 | -except NameError: | |
| 320 | - try: | |
| 321 | - # is Unicode supported (Python >2.0 or >1.6 ?) | |
| 322 | - basestring = (str, unicode) | |
| 323 | - except NameError: | |
| 324 | - basestring = str | |
| 325 | - | |
| 326 | -#[PL] Experimental setting: if True, OLE filenames will be kept in Unicode | |
| 327 | -# if False (default PIL behaviour), all filenames are converted to Latin-1. | |
| 328 | -KEEP_UNICODE_NAMES = True | |
| 329 | - | |
| 330 | -if sys.version_info[0] < 3: | |
| 331 | - # On Python 2.x, the default encoding for path names is UTF-8: | |
| 332 | - DEFAULT_PATH_ENCODING = 'utf-8' | |
| 333 | -else: | |
| 334 | - # On Python 3.x, the default encoding for path names is Unicode (None): | |
| 335 | - DEFAULT_PATH_ENCODING = None | |
| 336 | - | |
| 337 | - | |
| 338 | -# === LOGGING ================================================================= | |
| 339 | - | |
class NullHandler(logging.Handler):
    """
    Logging handler that silently discards every record.
    Keeps the module quiet when the embedding application has not set up
    logging, without touching the root logger's configuration.
    logging.NullHandler only exists from Python 2.7 onwards; this class is
    needed for Python 2.6 compatibility, see
    https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library
    """
    def emit(self, record):
        # deliberately a no-op: swallow the record
        pass
| 349 | - | |
def get_logger(name, level=logging.CRITICAL+1):
    """
    Return a logger object suitable for use inside this library.
    The root logger is never touched, so the host application's own log
    configuration and output remain unaffected.
    If a logger with `name` already exists it is reused as-is (attaching
    a second handler would make every message appear twice).
    The default level is above CRITICAL, i.e. the logger stays silent
    unless the application explicitly lowers it.
    """
    already_registered = name in logging.Logger.manager.loggerDict
    #NOTE: a less intrusive but more "hackish" alternative would be to
    # call getLogger and test whether its effective level differs from
    # the default.
    logger = logging.getLogger(name)
    if not already_registered:
        # brand-new logger: give it only a do-nothing handler; setting up
        # real output is left entirely to the application
        logger.addHandler(NullHandler())
    # in both cases make sure the requested level is applied:
    logger.setLevel(level)
    return logger
| 375 | - | |
# a global logger object used for debugging:
log = get_logger('olefile')


#=== CONSTANTS ===============================================================

# magic bytes that should be at the beginning of every OLE file:
MAGIC = b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'

#[PL]: added constants for Sector IDs (from AAF specifications)
# Special sector index values stored in the FAT/DIFAT (all >= MAXREGSECT):
MAXREGSECT = 0xFFFFFFFA # (-6) maximum SECT
DIFSECT    = 0xFFFFFFFC # (-4) denotes a DIFAT sector in a FAT
FATSECT    = 0xFFFFFFFD # (-3) denotes a FAT sector in a FAT
ENDOFCHAIN = 0xFFFFFFFE # (-2) end of a virtual stream chain
FREESECT   = 0xFFFFFFFF # (-1) unallocated sector

#[PL]: added constants for Directory Entry IDs (from AAF specifications)
MAXREGSID  = 0xFFFFFFFA # (-6) maximum directory entry ID
NOSTREAM   = 0xFFFFFFFF # (-1) unallocated directory entry

#[PL] object types in storage (from AAF specifications)
STGTY_EMPTY     = 0 # empty directory entry (according to OpenOffice.org doc)
STGTY_STORAGE   = 1 # element is a storage object
STGTY_STREAM    = 2 # element is a stream object
STGTY_LOCKBYTES = 3 # element is an ILockBytes object
STGTY_PROPERTY  = 4 # element is an IPropertyStorage object
STGTY_ROOT      = 5 # element is a root storage


#
# --------------------------------------------------------------------
# property types
# (variant type codes used in SummaryInformation property streams)

VT_EMPTY=0; VT_NULL=1; VT_I2=2; VT_I4=3; VT_R4=4; VT_R8=5; VT_CY=6;
VT_DATE=7; VT_BSTR=8; VT_DISPATCH=9; VT_ERROR=10; VT_BOOL=11;
VT_VARIANT=12; VT_UNKNOWN=13; VT_DECIMAL=14; VT_I1=16; VT_UI1=17;
VT_UI2=18; VT_UI4=19; VT_I8=20; VT_UI8=21; VT_INT=22; VT_UINT=23;
VT_VOID=24; VT_HRESULT=25; VT_PTR=26; VT_SAFEARRAY=27; VT_CARRAY=28;
VT_USERDEFINED=29; VT_LPSTR=30; VT_LPWSTR=31; VT_FILETIME=64;
VT_BLOB=65; VT_STREAM=66; VT_STORAGE=67; VT_STREAMED_OBJECT=68;
VT_STORED_OBJECT=69; VT_BLOB_OBJECT=70; VT_CF=71; VT_CLSID=72;
VT_VECTOR=0x1000;

# map property id to name (for debugging purposes)
# built by scanning this module's namespace for the VT_* names above:

VT = {}
for keyword, var in list(vars().items()):
    if keyword[:3] == "VT_":
        VT[var] = keyword

#
# --------------------------------------------------------------------
# Some common document types (root.clsid fields)

WORD_CLSID = "00020900-0000-0000-C000-000000000046"
#TODO: check Excel, PPT, ...

#[PL]: Defect levels to classify parsing errors - see OleFileIO._raise_defect()
DEFECT_UNSURE =    10    # a case which looks weird, but not sure it's a defect
DEFECT_POTENTIAL = 20    # a potential defect
DEFECT_INCORRECT = 30    # an error according to specifications, but parsing
                         # can go on
DEFECT_FATAL =     40    # an error which cannot be ignored, parsing is
                         # impossible

# Minimal size of an empty OLE file, with 512-bytes sectors = 1536 bytes
# (this is used in isOleFile and OleFile.open)
MINIMAL_OLEFILE_SIZE = 1536

#[PL] add useful constants to __all__:
# for key in list(vars().keys()):
#     if key.startswith('STGTY_') or key.startswith('DEFECT_'):
#         __all__.append(key)
| 450 | - | |
| 451 | -#=== FUNCTIONS =============================================================== | |
| 452 | - | |
def isOleFile (filename):
    """
    Test if a file is an OLE container (according to the magic bytes in its header).

    :param filename: string-like or file-like object, OLE file to parse

        - if filename is a string smaller than 1536 bytes, it is the path
          of the file to open. (bytes or unicode string)
        - if filename is a string longer than 1535 bytes, it is parsed
          as the content of an OLE file in memory. (bytes type only)
        - if filename is a file-like object (with read and seek methods),
          it is parsed as-is.

    :returns: True if OLE, False otherwise.
    """
    # check if filename is a string-like or file-like object:
    if hasattr(filename, 'read'):
        # file-like object: use it directly
        header = filename.read(len(MAGIC))
        # just in case, seek back to start of file:
        filename.seek(0)
    elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE:
        # filename is a bytes string containing the OLE file to be parsed:
        header = filename[:len(MAGIC)]
    else:
        # string-like object: filename of file on disk.
        # Use a context manager so the file handle is closed deterministically
        # (the previous code left it open until garbage collection):
        with open(filename, 'rb') as f:
            header = f.read(len(MAGIC))
    return header == MAGIC
| 484 | - | |
| 485 | - | |
if bytes is str:
    # Python 2: indexing a byte string yields a 1-char str, so use ord()
    def i8(c):
        return ord(c)
else:
    # Python 3: indexing bytes already yields an int; a 1-byte slice
    # (or single byte passed as bytes) still needs [0]:
    def i8(c):
        if c.__class__ is int:
            return c
        return c[0]
| 494 | - | |
| 495 | - | |
| 496 | -#TODO: replace i16 and i32 with more readable struct.unpack equivalent? | |
| 497 | - | |
def i16(c, o = 0):
    """
    Converts a 2-bytes (16 bits) string to an integer.

    :param c: string containing bytes to convert
    :param o: offset of bytes to convert in string
    :returns: int, unsigned 16-bit value read in little-endian order
    """
    # struct.unpack is clearer and faster than manual i8 shifts, and works
    # on both Python 2 str and Python 3 bytes (resolves the TODO above).
    # '<H' = little-endian unsigned short.
    return struct.unpack('<H', c[o:o+2])[0]
| 506 | - | |
| 507 | - | |
def i32(c, o = 0):
    """
    Converts a 4-bytes (32 bits) string to an integer.

    :param c: string containing bytes to convert
    :param o: offset of bytes to convert in string
    :returns: int, unsigned 32-bit value read in little-endian order
    """
    # struct.unpack is clearer and faster than the manual i8 shifts copied
    # from Pillow's _binary (resolves the TODO above).
    # '<I' = little-endian unsigned int (always >= 0, same as the shifts).
    return struct.unpack('<I', c[o:o+4])[0]
| 519 | - | |
| 520 | - | |
def _clsid(clsid):
    """
    Converts a CLSID to a human-readable string.

    :param clsid: string of length 16.
    :returns: str in "XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX" form, or ""
        when the CLSID is all null bytes.
    """
    assert len(clsid) == 16
    # if clsid is only made of null bytes, return an empty string:
    # (PL: why not simply return the string with zeroes?)
    if not clsid.strip(b"\0"):
        return ""
    # first 8 bytes are little-endian fields, remaining 8 are raw bytes:
    fields = (i32(clsid, 0), i16(clsid, 4), i16(clsid, 6))
    fields = fields + tuple(map(i8, clsid[8:16]))
    fmt = "%08X-%04X-%04X-%02X%02X-" + "%02X" * 6
    return fmt % fields
| 535 | - | |
| 536 | - | |
| 537 | - | |
def filetime2datetime(filetime):
    """
    convert FILETIME (64 bits int) to Python datetime.datetime
    """
    # TODO: manage exception when microseconds is too large
    # A FILETIME counts 100ns ticks since 1601-01-01; integer-divide by 10
    # to get microseconds for the timedelta.
    # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/
    filetime_epoch = datetime.datetime(1601, 1, 1, 0, 0, 0)
    #log.debug('timedelta days=%d' % (filetime//(10*1000000*3600*24)))
    return filetime_epoch + datetime.timedelta(microseconds=filetime // 10)
| 547 | - | |
| 548 | - | |
| 549 | - | |
| 550 | -#=== CLASSES ================================================================== | |
| 551 | - | |
class OleMetadata:
    """
    class to parse and store metadata from standard properties of OLE files.

    Available attributes:
    codepage, title, subject, author, keywords, comments, template,
    last_saved_by, revision_number, total_edit_time, last_printed, create_time,
    last_saved_time, num_pages, num_words, num_chars, thumbnail,
    creating_application, security, codepage_doc, category, presentation_target,
    bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips,
    scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty,
    chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed,
    version, dig_sig, content_type, content_status, language, doc_version

    Note: an attribute is set to None when not present in the properties of the
    OLE file.

    References for SummaryInformation stream:
    - http://msdn.microsoft.com/en-us/library/dd942545.aspx
    - http://msdn.microsoft.com/en-us/library/dd925819%28v=office.12%29.aspx
    - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380376%28v=vs.85%29.aspx
    - http://msdn.microsoft.com/en-us/library/aa372045.aspx
    - http://sedna-soft.de/summary-information-stream/
    - http://poi.apache.org/apidocs/org/apache/poi/hpsf/SummaryInformation.html

    References for DocumentSummaryInformation stream:
    - http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx
    - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380374%28v=vs.85%29.aspx
    - http://poi.apache.org/apidocs/org/apache/poi/hpsf/DocumentSummaryInformation.html

    new in version 0.25
    """

    # attribute names for SummaryInformation stream properties:
    # (ordered by property id, starting at 1)
    SUMMARY_ATTRIBS = ['codepage', 'title', 'subject', 'author', 'keywords', 'comments',
        'template', 'last_saved_by', 'revision_number', 'total_edit_time',
        'last_printed', 'create_time', 'last_saved_time', 'num_pages',
        'num_words', 'num_chars', 'thumbnail', 'creating_application',
        'security']

    # attribute names for DocumentSummaryInformation stream properties:
    # (ordered by property id, starting at 1)
    DOCSUM_ATTRIBS = ['codepage_doc', 'category', 'presentation_target', 'bytes', 'lines', 'paragraphs',
        'slides', 'notes', 'hidden_slides', 'mm_clips',
        'scale_crop', 'heading_pairs', 'titles_of_parts', 'manager',
        'company', 'links_dirty', 'chars_with_spaces', 'unused', 'shared_doc',
        'link_base', 'hlinks', 'hlinks_changed', 'version', 'dig_sig',
        'content_type', 'content_status', 'language', 'doc_version']

    def __init__(self):
        """
        Constructor for OleMetadata.
        All known property attributes are initialized to None; they are
        filled in later by parse_properties().
        """
        for attrib in self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS:
            setattr(self, attrib, None)

    def parse_properties(self, olefile):
        """
        Parse standard properties of an OLE file, from the streams
        "\x05SummaryInformation" and "\x05DocumentSummaryInformation",
        if present.
        Properties are converted to strings, integers or python datetime objects.
        If a property is not present, its value is set to None.
        """
        # reset all attributes to None before (re)parsing:
        for attrib in self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS:
            setattr(self, attrib, None)
        if olefile.exists("\x05SummaryInformation"):
            # get properties from the stream:
            # (converting timestamps to python datetime, except total_edit_time,
            # which is property #10)
            props = olefile.getproperties("\x05SummaryInformation",
                convert_time=True, no_conversion=[10])
            # property ids for standard properties start at 0x01; the
            # attribute list is ordered accordingly:
            for index, attrib in enumerate(self.SUMMARY_ATTRIBS):
                setattr(self, attrib, props.get(index + 1, None))
        if olefile.exists("\x05DocumentSummaryInformation"):
            # get properties from the stream:
            props = olefile.getproperties("\x05DocumentSummaryInformation",
                convert_time=True)
            # same 1-based property id convention as above:
            for index, attrib in enumerate(self.DOCSUM_ATTRIBS):
                setattr(self, attrib, props.get(index + 1, None))

    def dump(self):
        """
        Dump all metadata, for debugging purposes.
        """
        print('Properties from SummaryInformation stream:')
        for prop in self.SUMMARY_ATTRIBS:
            print('- %s: %s' % (prop, repr(getattr(self, prop))))
        print('Properties from DocumentSummaryInformation stream:')
        for prop in self.DOCSUM_ATTRIBS:
            print('- %s: %s' % (prop, repr(getattr(self, prop))))
| 702 | - | |
| 703 | - | |
| 704 | -#--- _OleStream --------------------------------------------------------------- | |
| 705 | - | |
class _OleStream(io.BytesIO):
    """
    OLE2 Stream

    Returns a read-only file object which can be used to read
    the contents of a OLE stream (instance of the BytesIO class).
    To open a stream, use the openstream method in the OleFile class.

    This function can be used with either ordinary streams,
    or ministreams, depending on the offset, sectorsize, and
    fat table arguments.

    Attributes:

        - size: actual size of data stream, after it was opened.
    """
    #TODO: use _raise_defect instead of exceptions

    # FIXME: should store the list of sects obtained by following
    # the fat chain, and load new sectors on demand instead of
    # loading it all in one go.

    def __init__(self, fp, sect, size, offset, sectorsize, fat, filesize):
        """
        Constructor for _OleStream class.

        Reads the whole stream into memory by following the sector chain in
        the given FAT (or MiniFAT), with sanity checks against malformed
        files (FAT loops, out-of-range sector indexes, truncated sectors).
        Raises IOError on any structural inconsistency.

        :param fp: file object, the OLE container or the MiniFAT stream
        :param sect: sector index of first sector in the stream
        :param size: total size of the stream (0x7FFFFFFF if unknown)
        :param offset: offset in bytes for the first FAT or MiniFAT sector
        :param sectorsize: size of one sector
        :param fat: array/list of sector indexes (FAT or MiniFAT)
        :param filesize: size of OLE file (for debugging)
        :returns: a BytesIO instance containing the OLE stream
        """
        log.debug('_OleStream.__init__:')
        log.debug(' sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s'
            %(sect,sect,size,offset,sectorsize,len(fat), repr(fp)))
        #[PL] To detect malformed documents with FAT loops, we compute the
        # expected number of sectors in the stream:
        unknown_size = False
        if size==0x7FFFFFFF:
            # this is the case when called from OleFileIO._open(), and stream
            # size is not known in advance (for example when reading the
            # Directory stream). Then we can only guess maximum size:
            size = len(fat)*sectorsize
            # and we keep a record that size was unknown:
            unknown_size = True
            log.debug(' stream with UNKNOWN SIZE')
        nb_sectors = (size + (sectorsize-1)) // sectorsize
        log.debug('nb_sectors = %d' % nb_sectors)
        # This number should (at least) be less than the total number of
        # sectors in the given FAT:
        if nb_sectors > len(fat):
            raise IOError('malformed OLE document, stream too large')
        # optimization(?): data is first a list of strings, and join() is called
        # at the end to concatenate all in one string.
        # (this may not be really useful with recent Python versions)
        data = []
        # if size is zero, then first sector index should be ENDOFCHAIN:
        if size == 0 and sect != ENDOFCHAIN:
            log.debug('size == 0 and sect != ENDOFCHAIN:')
            raise IOError('incorrect OLE sector index for empty stream')
        #[PL] A fixed-length for loop is used instead of an undefined while
        # loop to avoid DoS attacks: a FAT loop can never make us read more
        # than nb_sectors sectors.
        for i in range(nb_sectors):
            # Sector index may be ENDOFCHAIN, but only if size was unknown
            if sect == ENDOFCHAIN:
                if unknown_size:
                    break
                else:
                    # else this means that the stream is smaller than declared:
                    log.debug('sect=ENDOFCHAIN before expected size')
                    raise IOError('incomplete OLE stream')
            # sector index should be within FAT:
            if sect<0 or sect>=len(fat):
                log.debug('sect=%d (%X) / len(fat)=%d' % (sect, sect, len(fat)))
                log.debug('i=%d / nb_sectors=%d' %(i, nb_sectors))
##                tmp_data = b"".join(data)
##                f = open('test_debug.bin', 'wb')
##                f.write(tmp_data)
##                f.close()
##                log.debug('data read so far: %d bytes' % len(tmp_data))
                raise IOError('incorrect OLE FAT, sector index out of range')
            #TODO: merge this code with OleFileIO.getsect() ?
            #TODO: check if this works with 4K sectors:
            try:
                fp.seek(offset + sectorsize * sect)
            except:
                log.debug('sect=%d, seek=%d, filesize=%d' %
                    (sect, offset+sectorsize*sect, filesize))
                raise IOError('OLE sector index out of range')
            sector_data = fp.read(sectorsize)
            # [PL] check if there was enough data:
            # Note: if sector is the last of the file, sometimes it is not a
            # complete sector (of 512 or 4K), so we may read less than
            # sectorsize.
            if len(sector_data)!=sectorsize and sect!=(len(fat)-1):
                log.debug('sect=%d / len(fat)=%d, seek=%d / filesize=%d, len read=%d' %
                    (sect, len(fat), offset+sectorsize*sect, filesize, len(sector_data)))
                log.debug('seek+len(read)=%d' % (offset+sectorsize*sect+len(sector_data)))
                raise IOError('incomplete OLE sector')
            data.append(sector_data)
            # jump to next sector in the FAT:
            try:
                # mask to 32 bits: works around Jython returning signed ints
                # from array lookups (JYTHON-WORKAROUND)
                sect = fat[sect] & 0xFFFFFFFF
            except IndexError:
                # [PL] if pointer is out of the FAT an exception is raised
                raise IOError('incorrect OLE FAT, sector index out of range')
        #[PL] Last sector should be a "end of chain" marker:
        if sect != ENDOFCHAIN:
            raise IOError('incorrect last sector index in OLE stream')
        data = b"".join(data)
        # Data is truncated to the actual stream size:
        if len(data) >= size:
            data = data[:size]
            # actual stream size is stored for future use:
            self.size = size
        elif unknown_size:
            # actual stream size was not known, now we know the size of read
            # data:
            self.size = len(data)
        else:
            # read data is less than expected:
            log.debug('len(data)=%d, size=%d' % (len(data), size))
            # TODO: provide details in exception message
            raise IOError('OLE stream size is less than declared')
        # when all data is read in memory, BytesIO constructor is called
        io.BytesIO.__init__(self, data)
        # Then the _OleStream object can be used as a read-only file object.
| 836 | - | |
| 837 | - | |
| 838 | -#--- _OleDirectoryEntry ------------------------------------------------------- | |
| 839 | - | |
| 840 | -class _OleDirectoryEntry: | |
| 841 | - | |
| 842 | - """ | |
| 843 | - OLE2 Directory Entry | |
| 844 | - """ | |
| 845 | - #[PL] parsing code moved from OleFileIO.loaddirectory | |
| 846 | - | |
| 847 | - # struct to parse directory entries: | |
| 848 | - # <: little-endian byte order, standard sizes | |
| 849 | - # (note: this should guarantee that Q returns a 64 bits int) | |
| 850 | - # 64s: string containing entry name in unicode UTF-16 (max 31 chars) + null char = 64 bytes | |
| 851 | - # H: uint16, number of bytes used in name buffer, including null = (len+1)*2 | |
| 852 | - # B: uint8, dir entry type (between 0 and 5) | |
| 853 | - # B: uint8, color: 0=black, 1=red | |
| 854 | - # I: uint32, index of left child node in the red-black tree, NOSTREAM if none | |
| 855 | - # I: uint32, index of right child node in the red-black tree, NOSTREAM if none | |
| 856 | - # I: uint32, index of child root node if it is a storage, else NOSTREAM | |
| 857 | - # 16s: CLSID, unique identifier (only used if it is a storage) | |
| 858 | - # I: uint32, user flags | |
| 859 | - # Q (was 8s): uint64, creation timestamp or zero | |
| 860 | - # Q (was 8s): uint64, modification timestamp or zero | |
| 861 | - # I: uint32, SID of first sector if stream or ministream, SID of 1st sector | |
| 862 | - # of stream containing ministreams if root entry, 0 otherwise | |
| 863 | - # I: uint32, total stream size in bytes if stream (low 32 bits), 0 otherwise | |
| 864 | - # I: uint32, total stream size in bytes if stream (high 32 bits), 0 otherwise | |
| 865 | - STRUCT_DIRENTRY = '<64sHBBIII16sIQQIII' | |
| 866 | - # size of a directory entry: 128 bytes | |
| 867 | - DIRENTRY_SIZE = 128 | |
| 868 | - assert struct.calcsize(STRUCT_DIRENTRY) == DIRENTRY_SIZE | |
| 869 | - | |
| 870 | - | |
    def __init__(self, entry, sid, olefile):
        """
        Constructor for an _OleDirectoryEntry object.
        Parses a 128-bytes entry from the OLE Directory stream, validates it
        (reporting inconsistencies through olefile._raise_defect), and
        registers the entry's first sector to detect duplicate streams.

        :param entry : string (must be 128 bytes long)
        :param sid : index of this directory entry in the OLE file directory
        :param olefile: OleFileIO containing this directory entry
        """
        self.sid = sid
        # ref to olefile is stored for future use
        self.olefile = olefile
        # kids is a list of children entries, if this entry is a storage:
        # (list of _OleDirectoryEntry objects)
        self.kids = []
        # kids_dict is a dictionary of children entries, indexed by their
        # name in lowercase: used to quickly find an entry, and to detect
        # duplicates
        self.kids_dict = {}
        # flag used to detect if the entry is referenced more than once in
        # directory:
        self.used = False
        # decode DirEntry (see STRUCT_DIRENTRY for the field layout)
        (
            self.name_raw,   # 64s: string containing entry name in unicode UTF-16 (max 31 chars) + null char = 64 bytes
            self.namelength, # H: uint16, number of bytes used in name buffer, including null = (len+1)*2
            self.entry_type,  # B: uint8, one of the STGTY_* constants
            self.color,       # B: uint8, red-black tree color (0=black, 1=red)
            self.sid_left,    # I: SID of left sibling in the tree, or NOSTREAM
            self.sid_right,   # I: SID of right sibling in the tree, or NOSTREAM
            self.sid_child,   # I: SID of child root if storage, else NOSTREAM
            clsid,            # 16s: raw CLSID bytes (converted below)
            self.dwUserFlags, # I: user flags
            self.createTime,  # Q: creation FILETIME or zero
            self.modifyTime,  # Q: modification FILETIME or zero
            self.isectStart,  # I: first sector of the stream/ministream
            self.sizeLow,     # I: stream size, low 32 bits
            self.sizeHigh     # I: stream size, high 32 bits (4K sectors only)
        ) = struct.unpack(_OleDirectoryEntry.STRUCT_DIRENTRY, entry)
        if self.entry_type not in [STGTY_ROOT, STGTY_STORAGE, STGTY_STREAM, STGTY_EMPTY]:
            olefile._raise_defect(DEFECT_INCORRECT, 'unhandled OLE storage type')
        # only first directory entry can (and should) be root:
        if self.entry_type == STGTY_ROOT and sid != 0:
            olefile._raise_defect(DEFECT_INCORRECT, 'duplicate OLE root entry')
        if sid == 0 and self.entry_type != STGTY_ROOT:
            olefile._raise_defect(DEFECT_INCORRECT, 'incorrect OLE root entry')
        #log.debug(struct.unpack(fmt_entry, entry[:len_entry]))
        # name should be at most 31 unicode characters + null character,
        # so 64 bytes in total (31*2 + 2):
        if self.namelength>64:
            olefile._raise_defect(DEFECT_INCORRECT, 'incorrect DirEntry name length >64 bytes')
            # if exception not raised, namelength is set to the maximum value:
            self.namelength = 64
        # only characters without ending null char are kept:
        self.name_utf16 = self.name_raw[:(self.namelength-2)]
        #TODO: check if the name is actually followed by a null unicode character ([MS-CFB] 2.6.1)
        #TODO: check if the name does not contain forbidden characters:
        # [MS-CFB] 2.6.1: "The following characters are illegal and MUST NOT be part of the name: '/', '\', ':', '!'."
        # name is converted from UTF-16LE to the path encoding specified in the OleFileIO:
        self.name = olefile._decode_utf16_str(self.name_utf16)

        log.debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name)))
        log.debug(' - type: %d' % self.entry_type)
        log.debug(' - sect: %Xh' % self.isectStart)
        log.debug(' - SID left: %d, right: %d, child: %d' % (self.sid_left,
            self.sid_right, self.sid_child))

        # sizeHigh is only used for 4K sectors, it should be zero for 512 bytes
        # sectors, BUT apparently some implementations set it as 0xFFFFFFFF, 1
        # or some other value so it cannot be raised as a defect in general:
        if olefile.sectorsize == 512:
            if self.sizeHigh != 0 and self.sizeHigh != 0xFFFFFFFF:
                log.debug('sectorsize=%d, sizeLow=%d, sizeHigh=%d (%X)' %
                    (olefile.sectorsize, self.sizeLow, self.sizeHigh, self.sizeHigh))
                olefile._raise_defect(DEFECT_UNSURE, 'incorrect OLE stream size')
            self.size = self.sizeLow
        else:
            # NOTE(review): 'long' is presumably aliased to int elsewhere in
            # this file for Python 3 compatibility - verify, else this raises
            # NameError on Python 3 with 4K-sector files.
            self.size = self.sizeLow + (long(self.sizeHigh)<<32)
        log.debug(' - size: %d (sizeLow=%d, sizeHigh=%d)' % (self.size, self.sizeLow, self.sizeHigh))

        self.clsid = _clsid(clsid)
        # a storage should have a null size, BUT some implementations such as
        # Word 8 for Mac seem to allow non-null values => Potential defect:
        if self.entry_type == STGTY_STORAGE and self.size != 0:
            olefile._raise_defect(DEFECT_POTENTIAL, 'OLE storage with size>0')
        # check if stream is not already referenced elsewhere:
        if self.entry_type in (STGTY_ROOT, STGTY_STREAM) and self.size>0:
            if self.size < olefile.minisectorcutoff \
            and self.entry_type==STGTY_STREAM: # only streams can be in MiniFAT
                # ministream object
                minifat = True
            else:
                minifat = False
            olefile._check_duplicate_stream(self.isectStart, minifat)
| 965 | - | |
| 966 | - | |
| 967 | - | |
| 968 | - def build_storage_tree(self): | |
| 969 | - """ | |
| 970 | - Read and build the red-black tree attached to this _OleDirectoryEntry | |
| 971 | - object, if it is a storage. | |
| 972 | - Note that this method builds a tree of all subentries, so it should | |
| 973 | - only be called for the root object once. | |
| 974 | - """ | |
| 975 | - log.debug('build_storage_tree: SID=%d - %s - sid_child=%d' | |
| 976 | - % (self.sid, repr(self.name), self.sid_child)) | |
| 977 | - if self.sid_child != NOSTREAM: | |
| 978 | - # if child SID is not NOSTREAM, then this entry is a storage. | |
| 979 | - # Let's walk through the tree of children to fill the kids list: | |
| 980 | - self.append_kids(self.sid_child) | |
| 981 | - | |
| 982 | - # Note from OpenOffice documentation: the safest way is to | |
| 983 | - # recreate the tree because some implementations may store broken | |
| 984 | - # red-black trees... | |
| 985 | - | |
| 986 | - # in the OLE file, entries are sorted on (length, name). | |
| 987 | - # for convenience, we sort them on name instead: | |
| 988 | - # (see rich comparison methods in this class) | |
| 989 | - self.kids.sort() | |
| 990 | - | |
| 991 | - | |
    def append_kids(self, child_sid):
        """
        Walk through red-black tree of children of this directory entry to add
        all of them to the kids list. (recursive method)

        :param child_sid : index of child directory entry to use, or None when called
            first time for the root. (only used during recursion)
        """
        #[PL] this method was added to use simple recursion instead of a complex
        # algorithm.
        # NOTE(review): recursion depth is proportional to the tree depth, so a
        # maliciously deep tree could exhaust the Python stack — TODO confirm
        # whether callers bound the number of directory entries.
        # if this is not a storage or a leaf of the tree, nothing to do:
        if child_sid == NOSTREAM:
            return
        # check if child SID is in the proper range:
        if child_sid<0 or child_sid>=len(self.olefile.direntries):
            self.olefile._raise_defect(DEFECT_FATAL, 'OLE DirEntry index out of range')
        # get child direntry:
        child = self.olefile._load_direntry(child_sid) #direntries[child_sid]
        log.debug('append_kids: child_sid=%d - %s - sid_left=%d, sid_right=%d, sid_child=%d'
            % (child.sid, repr(child.name), child.sid_left, child.sid_right, child.sid_child))
        # the directory entries are organized as a red-black tree.
        # (cf. Wikipedia for details)
        # First walk through left side of the tree:
        self.append_kids(child.sid_left)
        # Check if its name is not already used (case-insensitive):
        name_lower = child.name.lower()
        if name_lower in self.kids_dict:
            self.olefile._raise_defect(DEFECT_INCORRECT,
                "Duplicate filename in OLE storage")
        # Then the child_sid _OleDirectoryEntry object is appended to the
        # kids list and dictionary:
        self.kids.append(child)
        self.kids_dict[name_lower] = child
        # Check if kid was not already referenced in a storage:
        # (a sane file references each directory entry exactly once)
        if child.used:
            self.olefile._raise_defect(DEFECT_INCORRECT,
                'OLE Entry referenced more than once')
        child.used = True
        # Finally walk through right side of the tree:
        self.append_kids(child.sid_right)
        # Afterwards build kid's own tree if it's also a storage:
        child.build_storage_tree()
| 1035 | - | |
| 1036 | - def __eq__(self, other): | |
| 1037 | - "Compare entries by name" | |
| 1038 | - return self.name == other.name | |
| 1039 | - | |
| 1040 | - def __lt__(self, other): | |
| 1041 | - "Compare entries by name" | |
| 1042 | - return self.name < other.name | |
| 1043 | - | |
| 1044 | - def __ne__(self, other): | |
| 1045 | - return not self.__eq__(other) | |
| 1046 | - | |
| 1047 | - def __le__(self, other): | |
| 1048 | - return self.__eq__(other) or self.__lt__(other) | |
| 1049 | - | |
| 1050 | - # Reflected __lt__() and __le__() will be used for __gt__() and __ge__() | |
| 1051 | - | |
| 1052 | - #TODO: replace by the same function as MS implementation ? | |
| 1053 | - # (order by name length first, then case-insensitive order) | |
| 1054 | - | |
| 1055 | - | |
| 1056 | - def dump(self, tab = 0): | |
| 1057 | - "Dump this entry, and all its subentries (for debug purposes only)" | |
| 1058 | - TYPES = ["(invalid)", "(storage)", "(stream)", "(lockbytes)", | |
| 1059 | - "(property)", "(root)"] | |
| 1060 | - print(" "*tab + repr(self.name), TYPES[self.entry_type], end=' ') | |
| 1061 | - if self.entry_type in (STGTY_STREAM, STGTY_ROOT): | |
| 1062 | - print(self.size, "bytes", end=' ') | |
| 1063 | - print() | |
| 1064 | - if self.entry_type in (STGTY_STORAGE, STGTY_ROOT) and self.clsid: | |
| 1065 | - print(" "*tab + "{%s}" % self.clsid) | |
| 1066 | - | |
| 1067 | - for kid in self.kids: | |
| 1068 | - kid.dump(tab + 2) | |
| 1069 | - | |
| 1070 | - | |
| 1071 | - def getmtime(self): | |
| 1072 | - """ | |
| 1073 | - Return modification time of a directory entry. | |
| 1074 | - | |
| 1075 | - :returns: None if modification time is null, a python datetime object | |
| 1076 | - otherwise (UTC timezone) | |
| 1077 | - | |
| 1078 | - new in version 0.26 | |
| 1079 | - """ | |
| 1080 | - if self.modifyTime == 0: | |
| 1081 | - return None | |
| 1082 | - return filetime2datetime(self.modifyTime) | |
| 1083 | - | |
| 1084 | - | |
    def getctime(self):
        """
        Return creation time of a directory entry.

        :returns: None if creation time is null, a python datetime object
            otherwise (UTC timezone)

        new in version 0.26
        """
        if self.createTime == 0:
            return None
        return filetime2datetime(self.createTime)
| 1097 | - | |
| 1098 | - | |
| 1099 | -#--- OleFileIO ---------------------------------------------------------------- | |
| 1100 | - | |
| 1101 | -class OleFileIO: | |
| 1102 | - """ | |
| 1103 | - OLE container object | |
| 1104 | - | |
| 1105 | - This class encapsulates the interface to an OLE 2 structured | |
| 1106 | - storage file. Use the listdir and openstream methods to | |
| 1107 | - access the contents of this file. | |
| 1108 | - | |
| 1109 | - Object names are given as a list of strings, one for each subentry | |
| 1110 | - level. The root entry should be omitted. For example, the following | |
| 1111 | - code extracts all image streams from a Microsoft Image Composer file:: | |
| 1112 | - | |
| 1113 | - ole = OleFileIO("fan.mic") | |
| 1114 | - | |
| 1115 | - for entry in ole.listdir(): | |
| 1116 | - if entry[1:2] == "Image": | |
| 1117 | - fin = ole.openstream(entry) | |
| 1118 | - fout = open(entry[0:1], "wb") | |
| 1119 | - while True: | |
| 1120 | - s = fin.read(8192) | |
| 1121 | - if not s: | |
| 1122 | - break | |
| 1123 | - fout.write(s) | |
| 1124 | - | |
| 1125 | - You can use the viewer application provided with the Python Imaging | |
| 1126 | - Library to view the resulting files (which happens to be standard | |
| 1127 | - TIFF files). | |
| 1128 | - """ | |
| 1129 | - | |
| 1130 | - def __init__(self, filename=None, raise_defects=DEFECT_FATAL, | |
| 1131 | - write_mode=False, debug=False, path_encoding=DEFAULT_PATH_ENCODING): | |
| 1132 | - """ | |
| 1133 | - Constructor for the OleFileIO class. | |
| 1134 | - | |
| 1135 | - :param filename: file to open. | |
| 1136 | - | |
| 1137 | - - if filename is a string smaller than 1536 bytes, it is the path | |
| 1138 | - of the file to open. (bytes or unicode string) | |
| 1139 | - - if filename is a string longer than 1535 bytes, it is parsed | |
| 1140 | - as the content of an OLE file in memory. (bytes type only) | |
| 1141 | - - if filename is a file-like object (with read, seek and tell methods), | |
| 1142 | - it is parsed as-is. | |
| 1143 | - | |
| 1144 | - :param raise_defects: minimal level for defects to be raised as exceptions. | |
| 1145 | - (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a | |
| 1146 | - security-oriented application, see source code for details) | |
| 1147 | - | |
| 1148 | - :param write_mode: bool, if True the file is opened in read/write mode instead | |
| 1149 | - of read-only by default. | |
| 1150 | - | |
| 1151 | - :param debug: bool, set debug mode (deprecated, not used anymore) | |
| 1152 | - | |
| 1153 | - :param path_encoding: None or str, name of the codec to use for path | |
| 1154 | - names (streams and storages), or None for Unicode. | |
| 1155 | - Unicode by default on Python 3+, UTF-8 on Python 2.x. | |
| 1156 | - (new in olefile 0.42, was hardcoded to Latin-1 until olefile v0.41) | |
| 1157 | - """ | |
| 1158 | - # minimal level for defects to be raised as exceptions: | |
| 1159 | - self._raise_defects_level = raise_defects | |
| 1160 | - # list of defects/issues not raised as exceptions: | |
| 1161 | - # tuples of (exception type, message) | |
| 1162 | - self.parsing_issues = [] | |
| 1163 | - self.write_mode = write_mode | |
| 1164 | - self.path_encoding = path_encoding | |
| 1165 | - self._filesize = None | |
| 1166 | - self.fp = None | |
| 1167 | - if filename: | |
| 1168 | - self.open(filename, write_mode=write_mode) | |
| 1169 | - | |
| 1170 | - | |
| 1171 | - def _raise_defect(self, defect_level, message, exception_type=IOError): | |
| 1172 | - """ | |
| 1173 | - This method should be called for any defect found during file parsing. | |
| 1174 | - It may raise an IOError exception according to the minimal level chosen | |
| 1175 | - for the OleFileIO object. | |
| 1176 | - | |
| 1177 | - :param defect_level: defect level, possible values are: | |
| 1178 | - | |
| 1179 | - - DEFECT_UNSURE : a case which looks weird, but not sure it's a defect | |
| 1180 | - - DEFECT_POTENTIAL : a potential defect | |
| 1181 | - - DEFECT_INCORRECT : an error according to specifications, but parsing can go on | |
| 1182 | - - DEFECT_FATAL : an error which cannot be ignored, parsing is impossible | |
| 1183 | - | |
| 1184 | - :param message: string describing the defect, used with raised exception. | |
| 1185 | - :param exception_type: exception class to be raised, IOError by default | |
| 1186 | - """ | |
| 1187 | - # added by [PL] | |
| 1188 | - if defect_level >= self._raise_defects_level: | |
| 1189 | - log.error(message) | |
| 1190 | - raise exception_type(message) | |
| 1191 | - else: | |
| 1192 | - # just record the issue, no exception raised: | |
| 1193 | - self.parsing_issues.append((exception_type, message)) | |
| 1194 | - log.warning(message) | |
| 1195 | - | |
| 1196 | - | |
| 1197 | - def _decode_utf16_str(self, utf16_str, errors='replace'): | |
| 1198 | - """ | |
| 1199 | - Decode a string encoded in UTF-16 LE format, as found in the OLE | |
| 1200 | - directory or in property streams. Return a string encoded | |
| 1201 | - according to the path_encoding specified for the OleFileIO object. | |
| 1202 | - | |
| 1203 | - :param utf16_str: bytes string encoded in UTF-16 LE format | |
| 1204 | - :param errors: str, see python documentation for str.decode() | |
| 1205 | - :return: str, encoded according to path_encoding | |
| 1206 | - """ | |
| 1207 | - unicode_str = utf16_str.decode('UTF-16LE', errors) | |
| 1208 | - if self.path_encoding: | |
| 1209 | - # an encoding has been specified for path names: | |
| 1210 | - return unicode_str.encode(self.path_encoding, errors) | |
| 1211 | - else: | |
| 1212 | - # path_encoding=None, return the Unicode string as-is: | |
| 1213 | - return unicode_str | |
| 1214 | - | |
| 1215 | - | |
    def open(self, filename, write_mode=False):
        """
        Open an OLE2 file in read-only or read/write mode.
        Read and parse the header, FAT and directory.

        :param filename: string-like or file-like object, OLE file to parse

            - if filename is a string smaller than 1536 bytes, it is the path
              of the file to open. (bytes or unicode string)
            - if filename is a string longer than 1535 bytes, it is parsed
              as the content of an OLE file in memory. (bytes type only)
            - if filename is a file-like object (with read, seek and tell methods),
              it is parsed as-is.

        :param write_mode: bool, if True the file is opened in read/write mode instead
            of read-only by default. (ignored if filename is not a path)

        :raises IOError: (through _raise_defect) when the file is not a valid
            OLE2 container, depending on the chosen defect level.
        """
        self.write_mode = write_mode
        #[PL] check if filename is a string-like or file-like object:
        # (it is better to check for a read() method)
        if hasattr(filename, 'read'):
            #TODO: also check seek and tell methods?
            # file-like object: use it directly
            self.fp = filename
        elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE:
            # filename is a bytes string containing the OLE file to be parsed:
            # convert it to BytesIO
            self.fp = io.BytesIO(filename)
        else:
            # string-like object: filename of file on disk
            if self.write_mode:
                # open file in mode 'read with update, binary'
                # According to https://docs.python.org/2/library/functions.html#open
                # 'w' would truncate the file, 'a' may only append on some Unixes
                mode = 'r+b'
            else:
                # read-only mode by default
                mode = 'rb'
            self.fp = open(filename, mode)
        # obtain the filesize by using seek and tell, which should work on most
        # file-like objects:
        #TODO: do it above, using getsize with filename when possible?
        #TODO: fix code to fail with clear exception when filesize cannot be obtained
        filesize=0
        self.fp.seek(0, os.SEEK_END)
        try:
            filesize = self.fp.tell()
        finally:
            # always rewind, even if tell() fails:
            self.fp.seek(0)
        self._filesize = filesize
        log.debug('File size: %d' % self._filesize)

        # lists of streams in FAT and MiniFAT, to detect duplicate references
        # (list of indexes of first sectors of each stream)
        self._used_streams_fat = []
        self._used_streams_minifat = []

        # the OLE header is always 512 bytes, regardless of the sector size:
        header = self.fp.read(512)

        if len(header) != 512 or header[:8] != MAGIC:
            log.debug('Magic = %r instead of %r' % (header[:8], MAGIC))
            self._raise_defect(DEFECT_FATAL, "not an OLE2 structured storage file")

        # [PL] header structure according to AAF specifications:
        ##Header
        ##struct StructuredStorageHeader { // [offset from start (bytes), length (bytes)]
        ##BYTE _abSig[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1,
        ## // 0x1a, 0xe1} for current version
        ##CLSID _clsid; // [08H,16] reserved must be zero (WriteClassStg/
        ## // GetClassFile uses root directory class id)
        ##USHORT _uMinorVersion; // [18H,02] minor version of the format: 33 is
        ## // written by reference implementation
        ##USHORT _uDllVersion; // [1AH,02] major version of the dll/format: 3 for
        ## // 512-byte sectors, 4 for 4 KB sectors
        ##USHORT _uByteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering
        ##USHORT _uSectorShift; // [1EH,02] size of sectors in power-of-two;
        ## // typically 9 indicating 512-byte sectors
        ##USHORT _uMiniSectorShift; // [20H,02] size of mini-sectors in power-of-two;
        ## // typically 6 indicating 64-byte mini-sectors
        ##USHORT _usReserved; // [22H,02] reserved, must be zero
        ##ULONG _ulReserved1; // [24H,04] reserved, must be zero
        ##FSINDEX _csectDir; // [28H,04] must be zero for 512-byte sectors,
        ## // number of SECTs in directory chain for 4 KB
        ## // sectors
        ##FSINDEX _csectFat; // [2CH,04] number of SECTs in the FAT chain
        ##SECT _sectDirStart; // [30H,04] first SECT in the directory chain
        ##DFSIGNATURE _signature; // [34H,04] signature used for transactions; must
        ## // be zero. The reference implementation
        ## // does not support transactions
        ##ULONG _ulMiniSectorCutoff; // [38H,04] maximum size for a mini stream;
        ## // typically 4096 bytes
        ##SECT _sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain
        ##FSINDEX _csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain
        ##SECT _sectDifStart; // [44H,04] first SECT in the DIFAT chain
        ##FSINDEX _csectDif; // [48H,04] number of SECTs in the DIFAT chain
        ##SECT _sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors
        ##};

        # [PL] header decoding:
        # '<' indicates little-endian byte ordering for Intel (cf. struct module help)
        fmt_header = '<8s16sHHHHHHLLLLLLLLLL'
        header_size = struct.calcsize(fmt_header)
        log.debug( "fmt_header size = %d, +FAT = %d" % (header_size, header_size + 109*4) )
        header1 = header[:header_size]
        (
            self.header_signature,
            self.header_clsid,
            self.minor_version,
            self.dll_version,
            self.byte_order,
            self.sector_shift,
            self.mini_sector_shift,
            self.reserved1,
            self.reserved2,
            self.num_dir_sectors,
            self.num_fat_sectors,
            self.first_dir_sector,
            self.transaction_signature_number,
            self.mini_stream_cutoff_size,
            self.first_mini_fat_sector,
            self.num_mini_fat_sectors,
            self.first_difat_sector,
            self.num_difat_sectors
        ) = struct.unpack(fmt_header, header1)
        log.debug( struct.unpack(fmt_header, header1))

        if self.header_signature != MAGIC:
            # OLE signature should always be present
            self._raise_defect(DEFECT_FATAL, "incorrect OLE signature")
        if self.header_clsid != bytearray(16):
            # according to AAF specs, CLSID should always be zero
            self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header")
        log.debug( "Minor Version = %d" % self.minor_version )
        log.debug( "DLL Version = %d (expected: 3 or 4)" % self.dll_version )
        if self.dll_version not in [3, 4]:
            # version 3: usual format, 512 bytes per sector
            # version 4: large format, 4K per sector
            self._raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header")
        log.debug( "Byte Order = %X (expected: FFFE)" % self.byte_order )
        if self.byte_order != 0xFFFE:
            # For now only common little-endian documents are handled correctly
            self._raise_defect(DEFECT_FATAL, "incorrect ByteOrder in OLE header")
            # TODO: add big-endian support for documents created on Mac ?
            # But according to [MS-CFB] v20140502, ByteOrder MUST be 0xFFFE.
        self.sector_size = 2**self.sector_shift
        log.debug( "Sector Size = %d bytes (expected: 512 or 4096)" % self.sector_size )
        if self.sector_size not in [512, 4096]:
            self._raise_defect(DEFECT_INCORRECT, "incorrect sector_size in OLE header")
        if (self.dll_version==3 and self.sector_size!=512) \
        or (self.dll_version==4 and self.sector_size!=4096):
            self._raise_defect(DEFECT_INCORRECT, "sector_size does not match DllVersion in OLE header")
        self.mini_sector_size = 2**self.mini_sector_shift
        log.debug( "MiniFAT Sector Size = %d bytes (expected: 64)" % self.mini_sector_size )
        if self.mini_sector_size not in [64]:
            self._raise_defect(DEFECT_INCORRECT, "incorrect mini_sector_size in OLE header")
        if self.reserved1 != 0 or self.reserved2 != 0:
            self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null reserved bytes)")
        log.debug( "Number of directory sectors = %d" % self.num_dir_sectors )
        # Number of directory sectors (only allowed if DllVersion != 3)
        if self.sector_size==512 and self.num_dir_sectors!=0:
            self._raise_defect(DEFECT_INCORRECT, "incorrect number of directory sectors in OLE header")
        log.debug( "num_fat_sectors = %d" % self.num_fat_sectors )
        # num_fat_sectors = number of FAT sectors in the file
        log.debug( "first_dir_sector = %X" % self.first_dir_sector )
        # first_dir_sector = 1st sector containing the directory
        log.debug( "transaction_signature_number = %d" % self.transaction_signature_number )
        # Signature should be zero, BUT some implementations do not follow this
        # rule => only a potential defect:
        # (according to MS-CFB, may be != 0 for applications supporting file
        # transactions)
        if self.transaction_signature_number != 0:
            self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header (transaction_signature_number>0)")
        log.debug( "mini_stream_cutoff_size = 0x%X (expected: 0x1000)" % self.mini_stream_cutoff_size )
        # MS-CFB: This integer field MUST be set to 0x00001000. This field
        # specifies the maximum size of a user-defined data stream allocated
        # from the mini FAT and mini stream, and that cutoff is 4096 bytes.
        # Any user-defined data stream larger than or equal to this cutoff size
        # must be allocated as normal sectors from the FAT.
        if self.mini_stream_cutoff_size != 0x1000:
            self._raise_defect(DEFECT_INCORRECT, "incorrect mini_stream_cutoff_size in OLE header")
            # if no exception is raised, the cutoff size is fixed to 0x1000
            log.warning('Fixing the mini_stream_cutoff_size to 4096 (mandatory value) instead of %d' %
                self.mini_stream_cutoff_size)
            self.mini_stream_cutoff_size = 0x1000
        log.debug( "first_mini_fat_sector = %Xh" % self.first_mini_fat_sector )
        log.debug( "num_mini_fat_sectors = %d" % self.num_mini_fat_sectors )
        log.debug( "first_difat_sector = %Xh" % self.first_difat_sector )
        log.debug( "num_difat_sectors = %d" % self.num_difat_sectors )

        # calculate the number of sectors in the file
        # (-1 because header doesn't count)
        self.nb_sect = ( (filesize + self.sector_size-1) // self.sector_size) - 1
        log.debug( "Number of sectors in the file: %d" % self.nb_sect )
        #TODO: change this test, because an OLE file MAY contain other data
        # after the last sector.

        # file clsid (overwrites the raw bytes unpacked above with the
        # human-readable string form)
        self.header_clsid = _clsid(header[8:24])

        #TODO: remove redundant attributes, and fix the code which uses them?
        self.sectorsize = self.sector_size #1 << i16(header, 30)
        self.minisectorsize = self.mini_sector_size #1 << i16(header, 32)
        self.minisectorcutoff = self.mini_stream_cutoff_size # i32(header, 56)

        # check known streams for duplicate references (these are always in FAT,
        # never in MiniFAT):
        self._check_duplicate_stream(self.first_dir_sector)
        # check MiniFAT only if it is not empty:
        if self.num_mini_fat_sectors:
            self._check_duplicate_stream(self.first_mini_fat_sector)
        # check DIFAT only if it is not empty:
        if self.num_difat_sectors:
            self._check_duplicate_stream(self.first_difat_sector)

        # Load file allocation tables
        self.loadfat(header)
        # Load direcory. This sets both the direntries list (ordered by sid)
        # and the root (ordered by hierarchy) members.
        self.loaddirectory(self.first_dir_sector)#i32(header, 48))
        self.ministream = None
        self.minifatsect = self.first_mini_fat_sector #i32(header, 60)
| 1437 | - | |
| 1438 | - | |
| 1439 | - def close(self): | |
| 1440 | - """ | |
| 1441 | - close the OLE file, to release the file object | |
| 1442 | - """ | |
| 1443 | - self.fp.close() | |
| 1444 | - | |
| 1445 | - | |
| 1446 | - def _check_duplicate_stream(self, first_sect, minifat=False): | |
| 1447 | - """ | |
| 1448 | - Checks if a stream has not been already referenced elsewhere. | |
| 1449 | - This method should only be called once for each known stream, and only | |
| 1450 | - if stream size is not null. | |
| 1451 | - | |
| 1452 | - :param first_sect: int, index of first sector of the stream in FAT | |
| 1453 | - :param minifat: bool, if True, stream is located in the MiniFAT, else in the FAT | |
| 1454 | - """ | |
| 1455 | - if minifat: | |
| 1456 | - log.debug('_check_duplicate_stream: sect=%Xh in MiniFAT' % first_sect) | |
| 1457 | - used_streams = self._used_streams_minifat | |
| 1458 | - else: | |
| 1459 | - log.debug('_check_duplicate_stream: sect=%Xh in FAT' % first_sect) | |
| 1460 | - # some values can be safely ignored (not a real stream): | |
| 1461 | - if first_sect in (DIFSECT,FATSECT,ENDOFCHAIN,FREESECT): | |
| 1462 | - return | |
| 1463 | - used_streams = self._used_streams_fat | |
| 1464 | - #TODO: would it be more efficient using a dict or hash values, instead | |
| 1465 | - # of a list of long ? | |
| 1466 | - if first_sect in used_streams: | |
| 1467 | - self._raise_defect(DEFECT_INCORRECT, 'Stream referenced twice') | |
| 1468 | - else: | |
| 1469 | - used_streams.append(first_sect) | |
| 1470 | - | |
| 1471 | - | |
| 1472 | - def dumpfat(self, fat, firstindex=0): | |
| 1473 | - """ | |
| 1474 | - Display a part of FAT in human-readable form for debugging purposes | |
| 1475 | - """ | |
| 1476 | - # dictionary to convert special FAT values in human-readable strings | |
| 1477 | - VPL = 8 # values per line (8+1 * 8+1 = 81) | |
| 1478 | - fatnames = { | |
| 1479 | - FREESECT: "..free..", | |
| 1480 | - ENDOFCHAIN: "[ END. ]", | |
| 1481 | - FATSECT: "FATSECT ", | |
| 1482 | - DIFSECT: "DIFSECT " | |
| 1483 | - } | |
| 1484 | - nbsect = len(fat) | |
| 1485 | - nlines = (nbsect+VPL-1)//VPL | |
| 1486 | - print("index", end=" ") | |
| 1487 | - for i in range(VPL): | |
| 1488 | - print("%8X" % i, end=" ") | |
| 1489 | - print() | |
| 1490 | - for l in range(nlines): | |
| 1491 | - index = l*VPL | |
| 1492 | - print("%6X:" % (firstindex+index), end=" ") | |
| 1493 | - for i in range(index, index+VPL): | |
| 1494 | - if i>=nbsect: | |
| 1495 | - break | |
| 1496 | - sect = fat[i] | |
| 1497 | - aux = sect & 0xFFFFFFFF # JYTHON-WORKAROUND | |
| 1498 | - if aux in fatnames: | |
| 1499 | - name = fatnames[aux] | |
| 1500 | - else: | |
| 1501 | - if sect == i+1: | |
| 1502 | - name = " --->" | |
| 1503 | - else: | |
| 1504 | - name = "%8X" % sect | |
| 1505 | - print(name, end=" ") | |
| 1506 | - print() | |
| 1507 | - | |
| 1508 | - | |
| 1509 | - def dumpsect(self, sector, firstindex=0): | |
| 1510 | - """ | |
| 1511 | - Display a sector in a human-readable form, for debugging purposes | |
| 1512 | - """ | |
| 1513 | - VPL=8 # number of values per line (8+1 * 8+1 = 81) | |
| 1514 | - tab = array.array(UINT32, sector) | |
| 1515 | - if sys.byteorder == 'big': | |
| 1516 | - tab.byteswap() | |
| 1517 | - nbsect = len(tab) | |
| 1518 | - nlines = (nbsect+VPL-1)//VPL | |
| 1519 | - print("index", end=" ") | |
| 1520 | - for i in range(VPL): | |
| 1521 | - print("%8X" % i, end=" ") | |
| 1522 | - print() | |
| 1523 | - for l in range(nlines): | |
| 1524 | - index = l*VPL | |
| 1525 | - print("%6X:" % (firstindex+index), end=" ") | |
| 1526 | - for i in range(index, index+VPL): | |
| 1527 | - if i>=nbsect: | |
| 1528 | - break | |
| 1529 | - sect = tab[i] | |
| 1530 | - name = "%8X" % sect | |
| 1531 | - print(name, end=" ") | |
| 1532 | - print() | |
| 1533 | - | |
| 1534 | - def sect2array(self, sect): | |
| 1535 | - """ | |
| 1536 | - convert a sector to an array of 32 bits unsigned integers, | |
| 1537 | - swapping bytes on big endian CPUs such as PowerPC (old Macs) | |
| 1538 | - """ | |
| 1539 | - a = array.array(UINT32, sect) | |
| 1540 | - # if CPU is big endian, swap bytes: | |
| 1541 | - if sys.byteorder == 'big': | |
| 1542 | - a.byteswap() | |
| 1543 | - return a | |
| 1544 | - | |
| 1545 | - | |
    def loadfat_sect(self, sect):
        """
        Adds the indexes of the given sector to the FAT

        :param sect: string containing the first FAT sector, or array of long integers
        :returns: index of last FAT sector, or None if sect was empty.
        """
        # a FAT sector is an array of ulong integers.
        if isinstance(sect, array.array):
            # if sect is already an array it is directly used
            fat1 = sect
        else:
            # if it's a raw sector, it is parsed in an array
            fat1 = self.sect2array(sect)
            # Display the sector contents only if the logging level is debug:
            if log.isEnabledFor(logging.DEBUG):
                self.dumpsect(sect)
        # The FAT is a sector chain starting at the first index of itself.
        # initialize isect, just in case the loop body never runs:
        isect = None
        for isect in fat1:
            isect = isect & 0xFFFFFFFF  # JYTHON-WORKAROUND
            log.debug("isect = %X" % isect)
            if isect == ENDOFCHAIN or isect == FREESECT:
                # the end of the sector chain has been reached
                log.debug("found end of sector chain")
                break
            # read the FAT sector
            s = self.getsect(isect)
            # parse it as an array of 32 bits integers, and add it to the
            # global FAT array
            nextfat = self.sect2array(s)
            self.fat = self.fat + nextfat
        return isect
| 1580 | - | |
| 1581 | - | |
    def loadfat(self, header):
        """
        Load the FAT table.

        :param header: bytes, the 512-byte OLE header (the list of the first
            109 FAT sector indexes starts at offset 76)
        """
        # The 1st sector of the file contains sector numbers for the first 109
        # FAT sectors, right after the header which is 76 bytes long.
        # (always 109, whatever the sector size: 512 bytes = 76+4*109)
        # Additional sectors are described by DIF blocks

        sect = header[76:512]
        log.debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)//4) )
        #fat = []
        # [PL] FAT is an array of 32 bits unsigned ints, it's more effective
        # to use an array than a list in Python.
        # It's initialized as empty first:
        self.fat = array.array(UINT32)
        self.loadfat_sect(sect)
        #self.dumpfat(self.fat)
##      for i in range(0, len(sect), 4):
##          ix = i32(sect, i)
##          #[PL] if ix == -2 or ix == -1: # ix == 0xFFFFFFFE or ix == 0xFFFFFFFF:
##          if ix == 0xFFFFFFFE or ix == 0xFFFFFFFF:
##              break
##          s = self.getsect(ix)
##          #fat = fat + [i32(s, i) for i in range(0, len(s), 4)]
##          fat = fat + array.array(UINT32, s)
        if self.num_difat_sectors != 0:
            # [PL] There's a DIFAT because file is larger than 6.8MB
            # some checks just in case:
            if self.num_fat_sectors <= 109:
                # there must be at least 109 blocks in header and the rest in
                # DIFAT, so number of sectors must be >109.
                self._raise_defect(DEFECT_INCORRECT, 'incorrect DIFAT, not enough sectors')
            if self.first_difat_sector >= self.nb_sect:
                # initial DIFAT block index must be valid
                self._raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out of range')
            log.debug( "DIFAT analysis..." )
            # We compute the necessary number of DIFAT sectors :
            # Number of pointers per DIFAT sector = (sectorsize/4)-1
            # (-1 because the last pointer is the next DIFAT sector number)
            nb_difat_sectors = (self.sectorsize//4)-1
            # (if 512 bytes: each DIFAT sector = 127 pointers + 1 towards next DIFAT sector)
            nb_difat = (self.num_fat_sectors-109 + nb_difat_sectors-1)//nb_difat_sectors
            log.debug( "nb_difat = %d" % nb_difat )
            if self.num_difat_sectors != nb_difat:
                raise IOError('incorrect DIFAT')
            isect_difat = self.first_difat_sector
            for i in iterrange(nb_difat):
                log.debug( "DIFAT block %d, sector %X" % (i, isect_difat) )
                #TODO: check if corresponding FAT SID = DIFSECT
                sector_difat = self.getsect(isect_difat)
                difat = self.sect2array(sector_difat)
                # Display the sector contents only if the logging level is debug:
                if log.isEnabledFor(logging.DEBUG):
                    self.dumpsect(sector_difat)
                self.loadfat_sect(difat[:nb_difat_sectors])
                # last DIFAT pointer is next DIFAT sector:
                isect_difat = difat[nb_difat_sectors]
                log.debug( "next DIFAT sector: %X" % isect_difat )
            # checks:
            if isect_difat not in [ENDOFCHAIN, FREESECT]:
                # last DIFAT pointer value must be ENDOFCHAIN or FREESECT
                raise IOError('incorrect end of DIFAT')
##          if len(self.fat) != self.num_fat_sectors:
##              # FAT should contain num_fat_sectors blocks
##              print("FAT length: %d instead of %d" % (len(self.fat), self.num_fat_sectors))
##              raise IOError('incorrect DIFAT')
        # since FAT is read from fixed-size sectors, it may contain more values
        # than the actual number of sectors in the file.
        # Keep only the relevant sector indexes:
        if len(self.fat) > self.nb_sect:
            log.debug('len(fat)=%d, shrunk to nb_sect=%d' % (len(self.fat), self.nb_sect))
            self.fat = self.fat[:self.nb_sect]
        # Display the FAT contents only if the logging level is debug:
        if log.isEnabledFor(logging.DEBUG):
            log.debug('\nFAT:')
            self.dumpfat(self.fat)
| 1659 | - | |
| 1660 | - | |
| 1661 | - def loadminifat(self): | |
| 1662 | - """ | |
| 1663 | - Load the MiniFAT table. | |
| 1664 | - """ | |
| 1665 | - # MiniFAT is stored in a standard sub-stream, pointed to by a header | |
| 1666 | - # field. | |
| 1667 | - # NOTE: there are two sizes to take into account for this stream: | |
| 1668 | - # 1) Stream size is calculated according to the number of sectors | |
| 1669 | - # declared in the OLE header. This allocated stream may be more than | |
| 1670 | - # needed to store the actual sector indexes. | |
| 1671 | - # (self.num_mini_fat_sectors is the number of sectors of size self.sector_size) | |
| 1672 | - stream_size = self.num_mini_fat_sectors * self.sector_size | |
| 1673 | - # 2) Actually used size is calculated by dividing the MiniStream size | |
| 1674 | - # (given by root entry size) by the size of mini sectors, *4 for | |
| 1675 | - # 32 bits indexes: | |
| 1676 | - nb_minisectors = (self.root.size + self.mini_sector_size-1) // self.mini_sector_size | |
| 1677 | - used_size = nb_minisectors * 4 | |
| 1678 | - log.debug('loadminifat(): minifatsect=%d, nb FAT sectors=%d, used_size=%d, stream_size=%d, nb MiniSectors=%d' % | |
| 1679 | - (self.minifatsect, self.num_mini_fat_sectors, used_size, stream_size, nb_minisectors)) | |
| 1680 | - if used_size > stream_size: | |
| 1681 | - # This is not really a problem, but may indicate a wrong implementation: | |
| 1682 | - self._raise_defect(DEFECT_INCORRECT, 'OLE MiniStream is larger than MiniFAT') | |
| 1683 | - # In any case, first read stream_size: | |
| 1684 | - s = self._open(self.minifatsect, stream_size, force_FAT=True).read() | |
| 1685 | - #[PL] Old code replaced by an array: | |
| 1686 | - #self.minifat = [i32(s, i) for i in range(0, len(s), 4)] | |
| 1687 | - self.minifat = self.sect2array(s) | |
| 1688 | - # Then shrink the array to used size, to avoid indexes out of MiniStream: | |
| 1689 | - log.debug('MiniFAT shrunk from %d to %d sectors' % (len(self.minifat), nb_minisectors)) | |
| 1690 | - self.minifat = self.minifat[:nb_minisectors] | |
| 1691 | - log.debug('loadminifat(): len=%d' % len(self.minifat)) | |
| 1692 | - # Display the FAT contents only if the logging level is debug: | |
| 1693 | - if log.isEnabledFor(logging.DEBUG): | |
| 1694 | - log.debug('\nMiniFAT:') | |
| 1695 | - self.dumpfat(self.minifat) | |
| 1696 | - | |
| 1697 | - def getsect(self, sect): | |
| 1698 | - """ | |
| 1699 | - Read given sector from file on disk. | |
| 1700 | - | |
| 1701 | - :param sect: int, sector index | |
| 1702 | - :returns: a string containing the sector data. | |
| 1703 | - """ | |
| 1704 | - # From [MS-CFB]: A sector number can be converted into a byte offset | |
| 1705 | - # into the file by using the following formula: | |
| 1706 | - # (sector number + 1) x Sector Size. | |
| 1707 | - # This implies that sector #0 of the file begins at byte offset Sector | |
| 1708 | - # Size, not at 0. | |
| 1709 | - | |
| 1710 | - # [PL] the original code in PIL was wrong when sectors are 4KB instead of | |
| 1711 | - # 512 bytes: | |
| 1712 | - #self.fp.seek(512 + self.sectorsize * sect) | |
| 1713 | - #[PL]: added safety checks: | |
| 1714 | - #print("getsect(%X)" % sect) | |
| 1715 | - try: | |
| 1716 | - self.fp.seek(self.sectorsize * (sect+1)) | |
| 1717 | - except: | |
| 1718 | - log.debug('getsect(): sect=%X, seek=%d, filesize=%d' % | |
| 1719 | - (sect, self.sectorsize*(sect+1), self._filesize)) | |
| 1720 | - self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range') | |
| 1721 | - sector = self.fp.read(self.sectorsize) | |
| 1722 | - if len(sector) != self.sectorsize: | |
| 1723 | - log.debug('getsect(): sect=%X, read=%d, sectorsize=%d' % | |
| 1724 | - (sect, len(sector), self.sectorsize)) | |
| 1725 | - self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector') | |
| 1726 | - return sector | |
| 1727 | - | |
| 1728 | - | |
| 1729 | - def write_sect(self, sect, data, padding=b'\x00'): | |
| 1730 | - """ | |
| 1731 | - Write given sector to file on disk. | |
| 1732 | - | |
| 1733 | - :param sect: int, sector index | |
| 1734 | - :param data: bytes, sector data | |
| 1735 | - :param padding: single byte, padding character if data < sector size | |
| 1736 | - """ | |
| 1737 | - if not isinstance(data, bytes): | |
| 1738 | - raise TypeError("write_sect: data must be a bytes string") | |
| 1739 | - if not isinstance(padding, bytes) or len(padding)!=1: | |
| 1740 | - raise TypeError("write_sect: padding must be a bytes string of 1 char") | |
| 1741 | - #TODO: we could allow padding=None for no padding at all | |
| 1742 | - try: | |
| 1743 | - self.fp.seek(self.sectorsize * (sect+1)) | |
| 1744 | - except: | |
| 1745 | - log.debug('write_sect(): sect=%X, seek=%d, filesize=%d' % | |
| 1746 | - (sect, self.sectorsize*(sect+1), self._filesize)) | |
| 1747 | - self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range') | |
| 1748 | - if len(data) < self.sectorsize: | |
| 1749 | - # add padding | |
| 1750 | - data += padding * (self.sectorsize - len(data)) | |
| 1751 | - elif len(data) < self.sectorsize: | |
| 1752 | - raise ValueError("Data is larger than sector size") | |
| 1753 | - self.fp.write(data) | |
| 1754 | - | |
| 1755 | - | |
| 1756 | - def loaddirectory(self, sect): | |
| 1757 | - """ | |
| 1758 | - Load the directory. | |
| 1759 | - | |
| 1760 | - :param sect: sector index of directory stream. | |
| 1761 | - """ | |
| 1762 | - # The directory is stored in a standard | |
| 1763 | - # substream, independent of its size. | |
| 1764 | - | |
| 1765 | - # open directory stream as a read-only file: | |
| 1766 | - # (stream size is not known in advance) | |
| 1767 | - self.directory_fp = self._open(sect) | |
| 1768 | - | |
| 1769 | - #[PL] to detect malformed documents and avoid DoS attacks, the maximum | |
| 1770 | - # number of directory entries can be calculated: | |
| 1771 | - max_entries = self.directory_fp.size // 128 | |
| 1772 | - log.debug('loaddirectory: size=%d, max_entries=%d' % | |
| 1773 | - (self.directory_fp.size, max_entries)) | |
| 1774 | - | |
| 1775 | - # Create list of directory entries | |
| 1776 | - #self.direntries = [] | |
| 1777 | - # We start with a list of "None" object | |
| 1778 | - self.direntries = [None] * max_entries | |
| 1779 | -## for sid in iterrange(max_entries): | |
| 1780 | -## entry = fp.read(128) | |
| 1781 | -## if not entry: | |
| 1782 | -## break | |
| 1783 | -## self.direntries.append(_OleDirectoryEntry(entry, sid, self)) | |
| 1784 | - # load root entry: | |
| 1785 | - root_entry = self._load_direntry(0) | |
| 1786 | - # Root entry is the first entry: | |
| 1787 | - self.root = self.direntries[0] | |
| 1788 | - # TODO: read ALL directory entries (ignore bad entries?) | |
| 1789 | - # TODO: adapt build_storage_tree to avoid duplicate reads | |
| 1790 | - # for i in range(1, max_entries): | |
| 1791 | - # self._load_direntry(i) | |
| 1792 | - # read and build all storage trees, starting from the root: | |
| 1793 | - self.root.build_storage_tree() | |
| 1794 | - | |
| 1795 | - | |
| 1796 | - def _load_direntry (self, sid): | |
| 1797 | - """ | |
| 1798 | - Load a directory entry from the directory. | |
| 1799 | - This method should only be called once for each storage/stream when | |
| 1800 | - loading the directory. | |
| 1801 | - | |
| 1802 | - :param sid: index of storage/stream in the directory. | |
| 1803 | - :returns: a _OleDirectoryEntry object | |
| 1804 | - | |
| 1805 | - :exception IOError: if the entry has always been referenced. | |
| 1806 | - """ | |
| 1807 | - # check if SID is OK: | |
| 1808 | - if sid<0 or sid>=len(self.direntries): | |
| 1809 | - self._raise_defect(DEFECT_FATAL, "OLE directory index out of range") | |
| 1810 | - # check if entry was already referenced: | |
| 1811 | - if self.direntries[sid] is not None: | |
| 1812 | - self._raise_defect(DEFECT_INCORRECT, | |
| 1813 | - "double reference for OLE stream/storage") | |
| 1814 | - # if exception not raised, return the object | |
| 1815 | - return self.direntries[sid] | |
| 1816 | - self.directory_fp.seek(sid * 128) | |
| 1817 | - entry = self.directory_fp.read(128) | |
| 1818 | - self.direntries[sid] = _OleDirectoryEntry(entry, sid, self) | |
| 1819 | - return self.direntries[sid] | |
| 1820 | - | |
| 1821 | - | |
| 1822 | - def dumpdirectory(self): | |
| 1823 | - """ | |
| 1824 | - Dump directory (for debugging only) | |
| 1825 | - """ | |
| 1826 | - self.root.dump() | |
| 1827 | - | |
| 1828 | - | |
| 1829 | - def _open(self, start, size = 0x7FFFFFFF, force_FAT=False): | |
| 1830 | - """ | |
| 1831 | - Open a stream, either in FAT or MiniFAT according to its size. | |
| 1832 | - (openstream helper) | |
| 1833 | - | |
| 1834 | - :param start: index of first sector | |
| 1835 | - :param size: size of stream (or nothing if size is unknown) | |
| 1836 | - :param force_FAT: if False (default), stream will be opened in FAT or MiniFAT | |
| 1837 | - according to size. If True, it will always be opened in FAT. | |
| 1838 | - """ | |
| 1839 | - log.debug('OleFileIO.open(): sect=%Xh, size=%d, force_FAT=%s' % | |
| 1840 | - (start, size, str(force_FAT))) | |
| 1841 | - # stream size is compared to the mini_stream_cutoff_size threshold: | |
| 1842 | - if size < self.minisectorcutoff and not force_FAT: | |
| 1843 | - # ministream object | |
| 1844 | - if not self.ministream: | |
| 1845 | - # load MiniFAT if it wasn't already done: | |
| 1846 | - self.loadminifat() | |
| 1847 | - # The first sector index of the miniFAT stream is stored in the | |
| 1848 | - # root directory entry: | |
| 1849 | - size_ministream = self.root.size | |
| 1850 | - log.debug('Opening MiniStream: sect=%Xh, size=%d' % | |
| 1851 | - (self.root.isectStart, size_ministream)) | |
| 1852 | - self.ministream = self._open(self.root.isectStart, | |
| 1853 | - size_ministream, force_FAT=True) | |
| 1854 | - return _OleStream(fp=self.ministream, sect=start, size=size, | |
| 1855 | - offset=0, sectorsize=self.minisectorsize, | |
| 1856 | - fat=self.minifat, filesize=self.ministream.size) | |
| 1857 | - else: | |
| 1858 | - # standard stream | |
| 1859 | - return _OleStream(fp=self.fp, sect=start, size=size, | |
| 1860 | - offset=self.sectorsize, | |
| 1861 | - sectorsize=self.sectorsize, fat=self.fat, | |
| 1862 | - filesize=self._filesize) | |
| 1863 | - | |
| 1864 | - | |
| 1865 | - def _list(self, files, prefix, node, streams=True, storages=False): | |
| 1866 | - """ | |
| 1867 | - listdir helper | |
| 1868 | - | |
| 1869 | - :param files: list of files to fill in | |
| 1870 | - :param prefix: current location in storage tree (list of names) | |
| 1871 | - :param node: current node (_OleDirectoryEntry object) | |
| 1872 | - :param streams: bool, include streams if True (True by default) - new in v0.26 | |
| 1873 | - :param storages: bool, include storages if True (False by default) - new in v0.26 | |
| 1874 | - (note: the root storage is never included) | |
| 1875 | - """ | |
| 1876 | - prefix = prefix + [node.name] | |
| 1877 | - for entry in node.kids: | |
| 1878 | - if entry.entry_type == STGTY_STORAGE: | |
| 1879 | - # this is a storage | |
| 1880 | - if storages: | |
| 1881 | - # add it to the list | |
| 1882 | - files.append(prefix[1:] + [entry.name]) | |
| 1883 | - # check its kids | |
| 1884 | - self._list(files, prefix, entry, streams, storages) | |
| 1885 | - elif entry.entry_type == STGTY_STREAM: | |
| 1886 | - # this is a stream | |
| 1887 | - if streams: | |
| 1888 | - # add it to the list | |
| 1889 | - files.append(prefix[1:] + [entry.name]) | |
| 1890 | - else: | |
| 1891 | - self._raise_defect(DEFECT_INCORRECT, 'The directory tree contains an entry which is not a stream nor a storage.') | |
| 1892 | - | |
| 1893 | - | |
| 1894 | - def listdir(self, streams=True, storages=False): | |
| 1895 | - """ | |
| 1896 | - Return a list of streams and/or storages stored in this file | |
| 1897 | - | |
| 1898 | - :param streams: bool, include streams if True (True by default) - new in v0.26 | |
| 1899 | - :param storages: bool, include storages if True (False by default) - new in v0.26 | |
| 1900 | - (note: the root storage is never included) | |
| 1901 | - :returns: list of stream and/or storage paths | |
| 1902 | - """ | |
| 1903 | - files = [] | |
| 1904 | - self._list(files, [], self.root, streams, storages) | |
| 1905 | - return files | |
| 1906 | - | |
| 1907 | - | |
| 1908 | - def _find(self, filename): | |
| 1909 | - """ | |
| 1910 | - Returns directory entry of given filename. (openstream helper) | |
| 1911 | - Note: this method is case-insensitive. | |
| 1912 | - | |
| 1913 | - :param filename: path of stream in storage tree (except root entry), either: | |
| 1914 | - | |
| 1915 | - - a string using Unix path syntax, for example: | |
| 1916 | - 'storage_1/storage_1.2/stream' | |
| 1917 | - - or a list of storage filenames, path to the desired stream/storage. | |
| 1918 | - Example: ['storage_1', 'storage_1.2', 'stream'] | |
| 1919 | - | |
| 1920 | - :returns: sid of requested filename | |
| 1921 | - :exception IOError: if file not found | |
| 1922 | - """ | |
| 1923 | - | |
| 1924 | - # if filename is a string instead of a list, split it on slashes to | |
| 1925 | - # convert to a list: | |
| 1926 | - if isinstance(filename, basestring): | |
| 1927 | - filename = filename.split('/') | |
| 1928 | - # walk across storage tree, following given path: | |
| 1929 | - node = self.root | |
| 1930 | - for name in filename: | |
| 1931 | - for kid in node.kids: | |
| 1932 | - if kid.name.lower() == name.lower(): | |
| 1933 | - break | |
| 1934 | - else: | |
| 1935 | - raise IOError("file not found") | |
| 1936 | - node = kid | |
| 1937 | - return node.sid | |
| 1938 | - | |
| 1939 | - | |
| 1940 | - def openstream(self, filename): | |
| 1941 | - """ | |
| 1942 | - Open a stream as a read-only file object (BytesIO). | |
| 1943 | - Note: filename is case-insensitive. | |
| 1944 | - | |
| 1945 | - :param filename: path of stream in storage tree (except root entry), either: | |
| 1946 | - | |
| 1947 | - - a string using Unix path syntax, for example: | |
| 1948 | - 'storage_1/storage_1.2/stream' | |
| 1949 | - - or a list of storage filenames, path to the desired stream/storage. | |
| 1950 | - Example: ['storage_1', 'storage_1.2', 'stream'] | |
| 1951 | - | |
| 1952 | - :returns: file object (read-only) | |
| 1953 | - :exception IOError: if filename not found, or if this is not a stream. | |
| 1954 | - """ | |
| 1955 | - sid = self._find(filename) | |
| 1956 | - entry = self.direntries[sid] | |
| 1957 | - if entry.entry_type != STGTY_STREAM: | |
| 1958 | - raise IOError("this file is not a stream") | |
| 1959 | - return self._open(entry.isectStart, entry.size) | |
| 1960 | - | |
| 1961 | - | |
    def write_stream(self, stream_name, data):
        """
        Write a stream to disk. For now, it is only possible to replace an
        existing stream by data of the same size.

        :param stream_name: path of stream in storage tree (except root entry), either:

            - a string using Unix path syntax, for example:
              'storage_1/storage_1.2/stream'
            - or a list of storage filenames, path to the desired stream/storage.
              Example: ['storage_1', 'storage_1.2', 'stream']

        :param data: bytes, data to be written, must be the same size as the original
            stream.

        :raises TypeError: if data is not a bytes string
        :raises IOError: if the path does not resolve to a stream, or if the
            FAT chain is inconsistent
        :raises ValueError: if data size differs from the existing stream size
        :raises NotImplementedError: for small streams stored in the MiniFAT
        """
        # only raw bytes can be written to disk sectors:
        if not isinstance(data, bytes):
            raise TypeError("write_stream: data must be a bytes string")
        # locate the directory entry of the target stream:
        sid = self._find(stream_name)
        entry = self.direntries[sid]
        if entry.entry_type != STGTY_STREAM:
            raise IOError("this is not a stream")
        size = entry.size
        # only same-size replacement is supported, because the existing FAT
        # chain is reused as-is without reallocation:
        if size != len(data):
            raise ValueError("write_stream: data must be the same size as the existing stream")
        if size < self.minisectorcutoff:
            raise NotImplementedError("Writing a stream in MiniFAT is not implemented yet")
        # first sector of the stream's FAT chain:
        sect = entry.isectStart
        # number of sectors to write
        nb_sectors = (size + (self.sectorsize-1)) // self.sectorsize
        log.debug('nb_sectors = %d' % nb_sectors)
        # follow the existing FAT chain, overwriting one sector at a time:
        for i in range(nb_sectors):
##            try:
##                self.fp.seek(offset + self.sectorsize * sect)
##            except:
##                log.debug('sect=%d, seek=%d' %
##                    (sect, offset+self.sectorsize*sect))
##                raise IOError('OLE sector index out of range')
            # extract one sector from data, the last one being smaller:
            if i<(nb_sectors-1):
                data_sector = data [i*self.sectorsize : (i+1)*self.sectorsize]
                #TODO: comment this if it works
                # NOTE(review): assert is stripped under "python -O" — consider
                # a real check if this validation matters in production
                assert(len(data_sector)==self.sectorsize)
            else:
                data_sector = data [i*self.sectorsize:]
                #TODO: comment this if it works
                log.debug('write_stream: size=%d sectorsize=%d data_sector=%Xh size%%sectorsize=%d'
                    % (size, self.sectorsize, len(data_sector), size % self.sectorsize))
                assert(len(data_sector) % self.sectorsize==size % self.sectorsize)
            self.write_sect(sect, data_sector)
##            self.fp.write(data_sector)
            # jump to next sector in the FAT:
            try:
                sect = self.fat[sect]
            except IndexError:
                # [PL] if pointer is out of the FAT an exception is raised
                raise IOError('incorrect OLE FAT, sector index out of range')
        #[PL] Last sector should be a "end of chain" marker:
        if sect != ENDOFCHAIN:
            raise IOError('incorrect last sector index in OLE stream')
| 2021 | - | |
| 2022 | - | |
| 2023 | - def get_type(self, filename): | |
| 2024 | - """ | |
| 2025 | - Test if given filename exists as a stream or a storage in the OLE | |
| 2026 | - container, and return its type. | |
| 2027 | - | |
| 2028 | - :param filename: path of stream in storage tree. (see openstream for syntax) | |
| 2029 | - :returns: False if object does not exist, its entry type (>0) otherwise: | |
| 2030 | - | |
| 2031 | - - STGTY_STREAM: a stream | |
| 2032 | - - STGTY_STORAGE: a storage | |
| 2033 | - - STGTY_ROOT: the root entry | |
| 2034 | - """ | |
| 2035 | - try: | |
| 2036 | - sid = self._find(filename) | |
| 2037 | - entry = self.direntries[sid] | |
| 2038 | - return entry.entry_type | |
| 2039 | - except: | |
| 2040 | - return False | |
| 2041 | - | |
| 2042 | - | |
| 2043 | - def getmtime(self, filename): | |
| 2044 | - """ | |
| 2045 | - Return modification time of a stream/storage. | |
| 2046 | - | |
| 2047 | - :param filename: path of stream/storage in storage tree. (see openstream for | |
| 2048 | - syntax) | |
| 2049 | - :returns: None if modification time is null, a python datetime object | |
| 2050 | - otherwise (UTC timezone) | |
| 2051 | - | |
| 2052 | - new in version 0.26 | |
| 2053 | - """ | |
| 2054 | - sid = self._find(filename) | |
| 2055 | - entry = self.direntries[sid] | |
| 2056 | - return entry.getmtime() | |
| 2057 | - | |
| 2058 | - | |
| 2059 | - def getctime(self, filename): | |
| 2060 | - """ | |
| 2061 | - Return creation time of a stream/storage. | |
| 2062 | - | |
| 2063 | - :param filename: path of stream/storage in storage tree. (see openstream for | |
| 2064 | - syntax) | |
| 2065 | - :returns: None if creation time is null, a python datetime object | |
| 2066 | - otherwise (UTC timezone) | |
| 2067 | - | |
| 2068 | - new in version 0.26 | |
| 2069 | - """ | |
| 2070 | - sid = self._find(filename) | |
| 2071 | - entry = self.direntries[sid] | |
| 2072 | - return entry.getctime() | |
| 2073 | - | |
| 2074 | - | |
| 2075 | - def exists(self, filename): | |
| 2076 | - """ | |
| 2077 | - Test if given filename exists as a stream or a storage in the OLE | |
| 2078 | - container. | |
| 2079 | - Note: filename is case-insensitive. | |
| 2080 | - | |
| 2081 | - :param filename: path of stream in storage tree. (see openstream for syntax) | |
| 2082 | - :returns: True if object exist, else False. | |
| 2083 | - """ | |
| 2084 | - try: | |
| 2085 | - sid = self._find(filename) | |
| 2086 | - return True | |
| 2087 | - except: | |
| 2088 | - return False | |
| 2089 | - | |
| 2090 | - | |
| 2091 | - def get_size(self, filename): | |
| 2092 | - """ | |
| 2093 | - Return size of a stream in the OLE container, in bytes. | |
| 2094 | - | |
| 2095 | - :param filename: path of stream in storage tree (see openstream for syntax) | |
| 2096 | - :returns: size in bytes (long integer) | |
| 2097 | - :exception IOError: if file not found | |
| 2098 | - :exception TypeError: if this is not a stream. | |
| 2099 | - """ | |
| 2100 | - sid = self._find(filename) | |
| 2101 | - entry = self.direntries[sid] | |
| 2102 | - if entry.entry_type != STGTY_STREAM: | |
| 2103 | - #TODO: Should it return zero instead of raising an exception ? | |
| 2104 | - raise TypeError('object is not an OLE stream') | |
| 2105 | - return entry.size | |
| 2106 | - | |
| 2107 | - | |
| 2108 | - def get_rootentry_name(self): | |
| 2109 | - """ | |
| 2110 | - Return root entry name. Should usually be 'Root Entry' or 'R' in most | |
| 2111 | - implementations. | |
| 2112 | - """ | |
| 2113 | - return self.root.name | |
| 2114 | - | |
| 2115 | - | |
| 2116 | - def getproperties(self, filename, convert_time=False, no_conversion=None): | |
| 2117 | - """ | |
| 2118 | - Return properties described in substream. | |
| 2119 | - | |
| 2120 | - :param filename: path of stream in storage tree (see openstream for syntax) | |
| 2121 | - :param convert_time: bool, if True timestamps will be converted to Python datetime | |
| 2122 | - :param no_conversion: None or list of int, timestamps not to be converted | |
| 2123 | - (for example total editing time is not a real timestamp) | |
| 2124 | - | |
| 2125 | - :returns: a dictionary of values indexed by id (integer) | |
| 2126 | - """ | |
| 2127 | - #REFERENCE: [MS-OLEPS] https://msdn.microsoft.com/en-us/library/dd942421.aspx | |
| 2128 | - # make sure no_conversion is a list, just to simplify code below: | |
| 2129 | - if no_conversion == None: | |
| 2130 | - no_conversion = [] | |
| 2131 | - # stream path as a string to report exceptions: | |
| 2132 | - streampath = filename | |
| 2133 | - if not isinstance(streampath, str): | |
| 2134 | - streampath = '/'.join(streampath) | |
| 2135 | - | |
| 2136 | - fp = self.openstream(filename) | |
| 2137 | - | |
| 2138 | - data = {} | |
| 2139 | - | |
| 2140 | - try: | |
| 2141 | - # header | |
| 2142 | - s = fp.read(28) | |
| 2143 | - clsid = _clsid(s[8:24]) | |
| 2144 | - | |
| 2145 | - # format id | |
| 2146 | - s = fp.read(20) | |
| 2147 | - fmtid = _clsid(s[:16]) | |
| 2148 | - fp.seek(i32(s, 16)) | |
| 2149 | - | |
| 2150 | - # get section | |
| 2151 | - s = b"****" + fp.read(i32(fp.read(4))-4) | |
| 2152 | - # number of properties: | |
| 2153 | - num_props = i32(s, 4) | |
| 2154 | - except BaseException as exc: | |
| 2155 | - # catch exception while parsing property header, and only raise | |
| 2156 | - # a DEFECT_INCORRECT then return an empty dict, because this is not | |
| 2157 | - # a fatal error when parsing the whole file | |
| 2158 | - msg = 'Error while parsing properties header in stream %s: %s' % ( | |
| 2159 | - repr(streampath), exc) | |
| 2160 | - self._raise_defect(DEFECT_INCORRECT, msg, type(exc)) | |
| 2161 | - return data | |
| 2162 | - | |
| 2163 | - for i in range(num_props): | |
| 2164 | - property_id = 0 # just in case of an exception | |
| 2165 | - try: | |
| 2166 | - property_id = i32(s, 8+i*8) | |
| 2167 | - offset = i32(s, 12+i*8) | |
| 2168 | - property_type = i32(s, offset) | |
| 2169 | - | |
| 2170 | - log.debug('property id=%d: type=%d offset=%X' % (property_id, property_type, offset)) | |
| 2171 | - | |
| 2172 | - # test for common types first (should perhaps use | |
| 2173 | - # a dictionary instead?) | |
| 2174 | - | |
| 2175 | - if property_type == VT_I2: # 16-bit signed integer | |
| 2176 | - value = i16(s, offset+4) | |
| 2177 | - if value >= 32768: | |
| 2178 | - value = value - 65536 | |
| 2179 | - elif property_type == VT_UI2: # 2-byte unsigned integer | |
| 2180 | - value = i16(s, offset+4) | |
| 2181 | - elif property_type in (VT_I4, VT_INT, VT_ERROR): | |
| 2182 | - # VT_I4: 32-bit signed integer | |
| 2183 | - # VT_ERROR: HRESULT, similar to 32-bit signed integer, | |
| 2184 | - # see http://msdn.microsoft.com/en-us/library/cc230330.aspx | |
| 2185 | - value = i32(s, offset+4) | |
| 2186 | - elif property_type in (VT_UI4, VT_UINT): # 4-byte unsigned integer | |
| 2187 | - value = i32(s, offset+4) # FIXME | |
| 2188 | - elif property_type in (VT_BSTR, VT_LPSTR): | |
| 2189 | - # CodePageString, see http://msdn.microsoft.com/en-us/library/dd942354.aspx | |
| 2190 | - # size is a 32 bits integer, including the null terminator, and | |
| 2191 | - # possibly trailing or embedded null chars | |
| 2192 | - #TODO: if codepage is unicode, the string should be converted as such | |
| 2193 | - count = i32(s, offset+4) | |
| 2194 | - value = s[offset+8:offset+8+count-1] | |
| 2195 | - # remove all null chars: | |
| 2196 | - value = value.replace(b'\x00', b'') | |
| 2197 | - elif property_type == VT_BLOB: | |
| 2198 | - # binary large object (BLOB) | |
| 2199 | - # see http://msdn.microsoft.com/en-us/library/dd942282.aspx | |
| 2200 | - count = i32(s, offset+4) | |
| 2201 | - value = s[offset+8:offset+8+count] | |
| 2202 | - elif property_type == VT_LPWSTR: | |
| 2203 | - # UnicodeString | |
| 2204 | - # see http://msdn.microsoft.com/en-us/library/dd942313.aspx | |
| 2205 | - # "the string should NOT contain embedded or additional trailing | |
| 2206 | - # null characters." | |
| 2207 | - count = i32(s, offset+4) | |
| 2208 | - value = self._decode_utf16_str(s[offset+8:offset+8+count*2]) | |
| 2209 | - elif property_type == VT_FILETIME: | |
| 2210 | - value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32) | |
| 2211 | - # FILETIME is a 64-bit int: "number of 100ns periods | |
| 2212 | - # since Jan 1,1601". | |
| 2213 | - if convert_time and property_id not in no_conversion: | |
| 2214 | - log.debug('Converting property #%d to python datetime, value=%d=%fs' | |
| 2215 | - %(property_id, value, float(value)/10000000)) | |
| 2216 | - # convert FILETIME to Python datetime.datetime | |
| 2217 | - # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/ | |
| 2218 | - _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0) | |
| 2219 | - log.debug('timedelta days=%d' % (value//(10*1000000*3600*24))) | |
| 2220 | - value = _FILETIME_null_date + datetime.timedelta(microseconds=value//10) | |
| 2221 | - else: | |
| 2222 | - # legacy code kept for backward compatibility: returns a | |
| 2223 | - # number of seconds since Jan 1,1601 | |
| 2224 | - value = value // 10000000 # seconds | |
| 2225 | - elif property_type == VT_UI1: # 1-byte unsigned integer | |
| 2226 | - value = i8(s[offset+4]) | |
| 2227 | - elif property_type == VT_CLSID: | |
| 2228 | - value = _clsid(s[offset+4:offset+20]) | |
| 2229 | - elif property_type == VT_CF: | |
| 2230 | - # PropertyIdentifier or ClipboardData?? | |
| 2231 | - # see http://msdn.microsoft.com/en-us/library/dd941945.aspx | |
| 2232 | - count = i32(s, offset+4) | |
| 2233 | - value = s[offset+8:offset+8+count] | |
| 2234 | - elif property_type == VT_BOOL: | |
| 2235 | - # VARIANT_BOOL, 16 bits bool, 0x0000=Fals, 0xFFFF=True | |
| 2236 | - # see http://msdn.microsoft.com/en-us/library/cc237864.aspx | |
| 2237 | - value = bool(i16(s, offset+4)) | |
| 2238 | - else: | |
| 2239 | - value = None # everything else yields "None" | |
| 2240 | - log.debug('property id=%d: type=%d not implemented in parser yet' % (property_id, property_type)) | |
| 2241 | - | |
| 2242 | - # missing: VT_EMPTY, VT_NULL, VT_R4, VT_R8, VT_CY, VT_DATE, | |
| 2243 | - # VT_DECIMAL, VT_I1, VT_I8, VT_UI8, | |
| 2244 | - # see http://msdn.microsoft.com/en-us/library/dd942033.aspx | |
| 2245 | - | |
| 2246 | - # FIXME: add support for VT_VECTOR | |
| 2247 | - # VT_VECTOR is a 32 uint giving the number of items, followed by | |
| 2248 | - # the items in sequence. The VT_VECTOR value is combined with the | |
| 2249 | - # type of items, e.g. VT_VECTOR|VT_BSTR | |
| 2250 | - # see http://msdn.microsoft.com/en-us/library/dd942011.aspx | |
| 2251 | - | |
| 2252 | - #print("%08x" % property_id, repr(value), end=" ") | |
| 2253 | - #print("(%s)" % VT[i32(s, offset) & 0xFFF]) | |
| 2254 | - | |
| 2255 | - data[property_id] = value | |
| 2256 | - except BaseException as exc: | |
| 2257 | - # catch exception while parsing each property, and only raise | |
| 2258 | - # a DEFECT_INCORRECT, because parsing can go on | |
| 2259 | - msg = 'Error while parsing property id %d in stream %s: %s' % ( | |
| 2260 | - property_id, repr(streampath), exc) | |
| 2261 | - self._raise_defect(DEFECT_INCORRECT, msg, type(exc)) | |
| 2262 | - | |
| 2263 | - return data | |
| 2264 | - | |
| 2265 | - def get_metadata(self): | |
| 2266 | - """ | |
| 2267 | - Parse standard properties streams, return an OleMetadata object | |
| 2268 | - containing all the available metadata. | |
| 2269 | - (also stored in the metadata attribute of the OleFileIO object) | |
| 2270 | - | |
| 2271 | - new in version 0.25 | |
| 2272 | - """ | |
| 2273 | - self.metadata = OleMetadata() | |
| 2274 | - self.metadata.parse_properties(self) | |
| 2275 | - return self.metadata | |
| 2276 | - | |
#
# --------------------------------------------------------------------
# This script can be used to dump the directory of any OLE2 structured
# storage file.

if __name__ == "__main__":

    import sys, optparse

    # map command-line level names to logging module constants:
    DEFAULT_LOG_LEVEL = "warning" # Default log level
    LOG_LEVELS = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL
    }

    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("-c", action="store_true", dest="check_streams",
        help='check all streams (for debugging purposes)')
    parser.add_option("-d", action="store_true", dest="debug_mode",
        help='debug mode, shortcut for -l debug (displays a lot of debug information, for developers only)')
    parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
        help="logging level debug/info/warning/error/critical (default=%default)")

    (options, args) = parser.parse_args()

    print('olefile version %s %s - http://www.decalage.info/en/olefile\n' % (__version__, __date__))

    # Print help if no arguments are passed
    if len(args) == 0:
        print(__doc__)
        parser.print_help()
        sys.exit()

    # -d is a shortcut for -l debug:
    if options.debug_mode:
        options.loglevel = 'debug'

    # setup logging to the console
    logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s')

    # also set the same log level for the module's logger to enable it:
    log.setLevel(LOG_LEVELS[options.loglevel])

    for filename in args:
        try:
            ole = OleFileIO(filename)#, raise_defects=DEFECT_INCORRECT)
            print("-" * 68)
            print(filename)
            print("-" * 68)
            ole.dumpdirectory()
            for streamname in ole.listdir():
                # property streams start with \x05 by convention:
                if streamname[-1][0] == "\005":
                    print("%r: properties" % streamname)
                    try:
                        props = ole.getproperties(streamname, convert_time=True)
                        props = sorted(props.items())
                        for k, v in props:
                            #[PL]: avoid to display too large or binary values:
                            if isinstance(v, (basestring, bytes)):
                                if len(v) > 50:
                                    v = v[:50]
                            if isinstance(v, bytes):
                                # quick and dirty binary check:
                                for c in (1,2,3,4,5,6,7,11,12,14,15,16,17,18,19,20,
                                    21,22,23,24,25,26,27,28,29,30,31):
                                    if c in bytearray(v):
                                        v = '(binary data)'
                                        break
                            print(" ", k, v)
                    # catch Exception only, so that KeyboardInterrupt and
                    # SystemExit are not silently swallowed:
                    except Exception:
                        log.exception('Error while parsing property stream %r' % streamname)

            if options.check_streams:
                # Read all streams to check if there are errors:
                print('\nChecking streams...')
                for streamname in ole.listdir():
                    # print name using repr() to convert binary chars to \xNN:
                    print('-', repr('/'.join(streamname)),'-', end=' ')
                    st_type = ole.get_type(streamname)
                    if st_type == STGTY_STREAM:
                        print('size %d' % ole.get_size(streamname))
                        # just try to read stream in memory:
                        ole.openstream(streamname)
                    else:
                        print('NOT a stream : type=%d' % st_type)
                print()

##            for streamname in ole.listdir():
##                # print name using repr() to convert binary chars to \xNN:
##                print('-', repr('/'.join(streamname)),'-', end=' ')
##                print(ole.getmtime(streamname))
##            print()

            print('Modification/Creation times of all directory entries:')
            for entry in ole.direntries:
                if entry is not None:
                    print('- %s: mtime=%s ctime=%s' % (entry.name,
                        entry.getmtime(), entry.getctime()))
            print()

            # parse and display metadata:
            try:
                meta = ole.get_metadata()
                meta.dump()
            # catch Exception only, so Ctrl+C still interrupts the script:
            except Exception:
                log.exception('Error while parsing metadata')
            print()
            #[PL] Test a few new methods:
            root = ole.get_rootentry_name()
            print('Root entry name: "%s"' % root)
            if ole.exists('worddocument'):
                print("This is a Word document.")
                print("type of stream 'WordDocument':", ole.get_type('worddocument'))
                print("size :", ole.get_size('worddocument'))
                if ole.exists('macros/vba'):
                    print("This document may contain VBA macros.")

            # print parsing issues:
            print('\nNon-fatal issues raised during parsing:')
            if ole.parsing_issues:
                for exctype, msg in ole.parsing_issues:
                    print('- %s: %s' % (exctype.__name__, msg))
            else:
                print('None')
        # catch Exception only (was a bare except), so the script can still
        # be aborted with Ctrl+C while processing a list of files:
        except Exception:
            log.exception('Error while parsing file %r' % filename)

# this code was developed while listening to The Wedding Present "Sea Monsters"
| 1 | +#!/usr/bin/env python | |
| 2 | + | |
| 3 | +# olefile (formerly OleFileIO_PL) | |
| 4 | +# | |
| 5 | +# Module to read/write Microsoft OLE2 files (also called Structured Storage or | |
| 6 | +# Microsoft Compound Document File Format), such as Microsoft Office 97-2003 | |
| 7 | +# documents, Image Composer and FlashPix files, Outlook messages, ... | |
| 8 | +# This version is compatible with Python 2.6+ and 3.x | |
| 9 | +# | |
| 10 | +# Project website: http://www.decalage.info/olefile | |
| 11 | +# | |
| 12 | +# olefile is copyright (c) 2005-2016 Philippe Lagadec (http://www.decalage.info) | |
| 13 | +# | |
| 14 | +# olefile is based on the OleFileIO module from the PIL library v1.1.6 | |
| 15 | +# See: http://www.pythonware.com/products/pil/index.htm | |
| 16 | +# | |
| 17 | +# The Python Imaging Library (PIL) is | |
| 18 | +# Copyright (c) 1997-2005 by Secret Labs AB | |
| 19 | +# Copyright (c) 1995-2005 by Fredrik Lundh | |
| 20 | +# | |
| 21 | +# See source code and LICENSE.txt for information on usage and redistribution. | |
| 22 | + | |
| 23 | + | |
| 24 | +# Since OleFileIO_PL v0.30, only Python 2.6+ and 3.x is supported | |
| 25 | +# This import enables print() as a function rather than a keyword | |
| 26 | +# (main requirement to be compatible with Python 3.x) | |
| 27 | +# The comment on the line below should be printed on Python 2.5 or older: | |
| 28 | +from __future__ import print_function # This version of olefile requires Python 2.6+ or 3.x. | |
| 29 | + | |
| 30 | + | |
| 31 | +__author__ = "Philippe Lagadec" | |
| 32 | +__date__ = "2016-04-26" | |
| 33 | +__version__ = '0.44' | |
| 34 | + | |
| 35 | +#--- LICENSE ------------------------------------------------------------------ | |
| 36 | + | |
| 37 | +# olefile (formerly OleFileIO_PL) is copyright (c) 2005-2016 Philippe Lagadec | |
| 38 | +# (http://www.decalage.info) | |
| 39 | +# | |
| 40 | +# All rights reserved. | |
| 41 | +# | |
| 42 | +# Redistribution and use in source and binary forms, with or without modification, | |
| 43 | +# are permitted provided that the following conditions are met: | |
| 44 | +# | |
| 45 | +# * Redistributions of source code must retain the above copyright notice, this | |
| 46 | +# list of conditions and the following disclaimer. | |
| 47 | +# * Redistributions in binary form must reproduce the above copyright notice, | |
| 48 | +# this list of conditions and the following disclaimer in the documentation | |
| 49 | +# and/or other materials provided with the distribution. | |
| 50 | +# | |
| 51 | +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
| 52 | +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| 53 | +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| 54 | +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | |
| 55 | +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 56 | +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
| 57 | +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
| 58 | +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
| 59 | +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| 60 | +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 61 | + | |
| 62 | +# ---------- | |
| 63 | +# PIL License: | |
| 64 | +# | |
| 65 | +# olefile is based on source code from the OleFileIO module of the Python | |
| 66 | +# Imaging Library (PIL) published by Fredrik Lundh under the following license: | |
| 67 | + | |
| 68 | +# The Python Imaging Library (PIL) is | |
| 69 | +# Copyright (c) 1997-2005 by Secret Labs AB | |
| 70 | +# Copyright (c) 1995-2005 by Fredrik Lundh | |
| 71 | +# | |
| 72 | +# By obtaining, using, and/or copying this software and/or its associated | |
| 73 | +# documentation, you agree that you have read, understood, and will comply with | |
| 74 | +# the following terms and conditions: | |
| 75 | +# | |
| 76 | +# Permission to use, copy, modify, and distribute this software and its | |
| 77 | +# associated documentation for any purpose and without fee is hereby granted, | |
| 78 | +# provided that the above copyright notice appears in all copies, and that both | |
| 79 | +# that copyright notice and this permission notice appear in supporting | |
| 80 | +# documentation, and that the name of Secret Labs AB or the author(s) not be used | |
| 81 | +# in advertising or publicity pertaining to distribution of the software | |
| 82 | +# without specific, written prior permission. | |
| 83 | +# | |
| 84 | +# SECRET LABS AB AND THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS | |
| 85 | +# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. | |
| 86 | +# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHORS BE LIABLE FOR ANY SPECIAL, | |
| 87 | +# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM | |
| 88 | +# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR | |
| 89 | +# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR | |
| 90 | +# PERFORMANCE OF THIS SOFTWARE. | |
| 91 | + | |
| 92 | +#----------------------------------------------------------------------------- | |
| 93 | +# CHANGELOG: (only olefile/OleFileIO_PL changes compared to PIL 1.1.6) | |
| 94 | +# 2005-05-11 v0.10 PL: - a few fixes for Python 2.4 compatibility | |
| 95 | +# (all changes flagged with [PL]) | |
| 96 | +# 2006-02-22 v0.11 PL: - a few fixes for some Office 2003 documents which raise | |
| 97 | +# exceptions in OleStream.__init__() | |
| 98 | +# 2006-06-09 v0.12 PL: - fixes for files above 6.8MB (DIFAT in loadfat) | |
| 99 | +# - added some constants | |
| 100 | +# - added header values checks | |
| 101 | +# - added some docstrings | |
| 102 | +# - getsect: bugfix in case sectors >512 bytes | |
| 103 | +# - getsect: added conformity checks | |
| 104 | +# - DEBUG_MODE constant to activate debug display | |
| 105 | +# 2007-09-04 v0.13 PL: - improved/translated (lots of) comments | |
| 106 | +# - updated license | |
| 107 | +# - converted tabs to 4 spaces | |
| 108 | +# 2007-11-19 v0.14 PL: - added OleFileIO._raise_defect() to adapt sensitivity | |
| 109 | +# - improved _unicode() to use Python 2.x unicode support | |
| 110 | +# - fixed bug in OleDirectoryEntry | |
| 111 | +# 2007-11-25 v0.15 PL: - added safety checks to detect FAT loops | |
| 112 | +# - fixed OleStream which didn't check stream size | |
| 113 | +# - added/improved many docstrings and comments | |
| 114 | +# - moved helper functions _unicode and _clsid out of | |
| 115 | +# OleFileIO class | |
| 116 | +# - improved OleFileIO._find() to add Unix path syntax | |
| 117 | +# - OleFileIO._find() is now case-insensitive | |
| 118 | +# - added get_type() and get_rootentry_name() | |
| 119 | +# - rewritten loaddirectory and OleDirectoryEntry | |
| 120 | +# 2007-11-27 v0.16 PL: - added OleDirectoryEntry.kids_dict | |
| 121 | +# - added detection of duplicate filenames in storages | |
| 122 | +# - added detection of duplicate references to streams | |
| 123 | +# - added get_size() and exists() to OleDirectoryEntry | |
| 124 | +# - added isOleFile to check header before parsing | |
| 125 | +# - added __all__ list to control public keywords in pydoc | |
| 126 | +# 2007-12-04 v0.17 PL: - added _load_direntry to fix a bug in loaddirectory | |
| 127 | +# - improved _unicode(), added workarounds for Python <2.3 | |
| 128 | +# - added set_debug_mode and -d option to set debug mode | |
| 129 | +# - fixed bugs in OleFileIO.open and OleDirectoryEntry | |
| 130 | +# - added safety check in main for large or binary | |
| 131 | +# properties | |
| 132 | +# - allow size>0 for storages for some implementations | |
| 133 | +# 2007-12-05 v0.18 PL: - fixed several bugs in handling of FAT, MiniFAT and | |
| 134 | +# streams | |
| 135 | +# - added option '-c' in main to check all streams | |
| 136 | +# 2009-12-10 v0.19 PL: - bugfix for 32 bit arrays on 64 bits platforms | |
| 137 | +# (thanks to Ben G. and Martijn for reporting the bug) | |
| 138 | +# 2009-12-11 v0.20 PL: - bugfix in OleFileIO.open when filename is not plain str | |
| 139 | +# 2010-01-22 v0.21 PL: - added support for big-endian CPUs such as PowerPC Macs | |
| 140 | +# 2012-02-16 v0.22 PL: - fixed bug in getproperties, patch by chuckleberryfinn | |
| 141 | +# (https://bitbucket.org/decalage/olefileio_pl/issue/7) | |
| 142 | +# - added close method to OleFileIO (fixed issue #2) | |
| 143 | +# 2012-07-25 v0.23 PL: - added support for file-like objects (patch by mete0r_kr) | |
| 144 | +# 2013-05-05 v0.24 PL: - getproperties: added conversion from filetime to python | |
| 145 | +# datetime | |
| 146 | +# - main: displays properties with date format | |
| 147 | +# - new class OleMetadata to parse standard properties | |
| 148 | +# - added get_metadata method | |
| 149 | +# 2013-05-07 v0.24 PL: - a few improvements in OleMetadata | |
| 150 | +# 2013-05-24 v0.25 PL: - getproperties: option to not convert some timestamps | |
| 151 | +# - OleMetaData: total_edit_time is now a number of seconds, | |
| 152 | +# not a timestamp | |
| 153 | +# - getproperties: added support for VT_BOOL, VT_INT, V_UINT | |
| 154 | +# - getproperties: filter out null chars from strings | |
| 155 | +# - getproperties: raise non-fatal defects instead of | |
| 156 | +# exceptions when properties cannot be parsed properly | |
| 157 | +# 2013-05-27 PL: - getproperties: improved exception handling | |
| 158 | +# - _raise_defect: added option to set exception type | |
| 159 | +# - all non-fatal issues are now recorded, and displayed | |
| 160 | +# when run as a script | |
| 161 | +# 2013-07-11 v0.26 PL: - added methods to get modification and creation times | |
| 162 | +# of a directory entry or a storage/stream | |
| 163 | +# - fixed parsing of direntry timestamps | |
| 164 | +# 2013-07-24 PL: - new options in listdir to list storages and/or streams | |
| 165 | +# 2014-02-04 v0.30 PL: - upgraded code to support Python 3.x by Martin Panter | |
| 166 | +# - several fixes for Python 2.6 (xrange, MAGIC) | |
| 167 | +# - reused i32 from Pillow's _binary | |
| 168 | +# 2014-07-18 v0.31 - preliminary support for 4K sectors | |
| 169 | +# 2014-07-27 v0.31 PL: - a few improvements in OleFileIO.open (header parsing) | |
| 170 | +# - Fixed loadfat for large files with 4K sectors (issue #3) | |
| 171 | +# 2014-07-30 v0.32 PL: - added write_sect to write sectors to disk | |
| 172 | +# - added write_mode option to OleFileIO.__init__ and open | |
| 173 | +# 2014-07-31 PL: - fixed padding in write_sect for Python 3, added checks | |
| 174 | +# - added write_stream to write a stream to disk | |
| 175 | +# 2014-09-26 v0.40 PL: - renamed OleFileIO_PL to olefile | |
| 176 | +# 2014-11-09 NE: - added support for Jython (Niko Ehrenfeuchter) | |
| 177 | +# 2014-11-13 v0.41 PL: - improved isOleFile and OleFileIO.open to support OLE | |
| 178 | +# data in a string buffer and file-like objects. | |
| 179 | +# 2014-11-21 PL: - updated comments according to Pillow's commits | |
| 180 | +# 2015-01-24 v0.42 PL: - changed the default path name encoding from Latin-1 | |
| 181 | +# to UTF-8 on Python 2.x (Unicode on Python 3.x) | |
| 182 | +# - added path_encoding option to override the default | |
| 183 | +# - fixed a bug in _list when a storage is empty | |
| 184 | +# 2015-04-17 v0.43 PL: - slight changes in OleDirectoryEntry | |
| 185 | +# 2015-10-19 - fixed issue #26 in OleFileIO.getproperties | |
| 186 | +# (using id and type as local variable names) | |
| 187 | +# 2015-10-29 - replaced debug() with proper logging | |
| 188 | +# - use optparse to handle command line options | |
| 189 | +# - improved attribute names in OleFileIO class | |
| 190 | +# 2015-11-05 - fixed issue #27 by correcting the MiniFAT sector | |
| 191 | +# cutoff size if invalid. | |
| 192 | +# 2016-02-02 - logging is disabled by default | |
| 193 | +# 2016-04-26 v0.44 PL: - added enable_logging | |
| 194 | +# - renamed _OleDirectoryEntry and _OleStream without '_' | |
| 195 | +# - in OleStream use _raise_defect instead of exceptions | |
| 196 | +# 2016-04-27 - added support for incomplete streams and incorrect | |
| 197 | +# directory entries (to read malformed documents) | |
| 198 | +# 2016-05-04 - fixed slight bug in OleStream | |
| 199 | + | |
| 200 | +#----------------------------------------------------------------------------- | |
| 201 | +# TODO (for version 1.0): | |
| 202 | +# + get rid of print statements, to simplify Python 2.x and 3.x support | |
| 203 | +# + add is_stream and is_storage | |
| 204 | +# + remove leading and trailing slashes where a path is used | |
| 205 | +# + add functions path_list2str and path_str2list | |
| 206 | +# + fix how all the methods handle unicode str and/or bytes as arguments | |
| 207 | +# + add path attrib to _OleDirEntry, set it once and for all in init or | |
| 208 | +# append_kids (then listdir/_list can be simplified) | |
| 209 | +# - TESTS with Linux, MacOSX, Python 1.5.2, various files, PIL, ... | |
| 210 | +# - add underscore to each private method, to avoid their display in | |
| 211 | +# pydoc/epydoc documentation - Remove it for classes to be documented | |
| 212 | +# - replace all raised exceptions with _raise_defect (at least in OleFileIO) | |
| 213 | +# - merge code from OleStream and OleFileIO.getsect to read sectors | |
| 214 | +# (maybe add a class for FAT and MiniFAT ?) | |
| 215 | +# - add method to check all streams (follow sectors chains without storing all | |
| 216 | +# stream in memory, and report anomalies) | |
| 217 | +# - use OleDirectoryEntry.kids_dict to improve _find and _list ? | |
| 218 | +# - fix Unicode names handling (find some way to stay compatible with Py1.5.2) | |
| 219 | +# => if possible avoid converting names to Latin-1 | |
| 220 | +# - review DIFAT code: fix handling of DIFSECT blocks in FAT (not stop) | |
| 221 | +# - rewrite OleFileIO.getproperties | |
| 222 | +# - improve docstrings to show more sample uses | |
| 223 | +# - see also original notes and FIXME below | |
| 224 | +# - remove all obsolete FIXMEs | |
| 225 | +# - OleMetadata: fix version attrib according to | |
| 226 | +# http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx | |
| 227 | + | |
| 228 | +# IDEAS: | |
| 229 | +# - in OleFileIO._open and OleStream, use size=None instead of 0x7FFFFFFF for | |
| 230 | +# streams with unknown size | |
| 231 | +# - use arrays of int instead of long integers for FAT/MiniFAT, to improve | |
| 232 | +# performance and reduce memory usage ? (possible issue with values >2^31) | |
| 233 | +# - provide tests with unittest (may need write support to create samples) | |
| 234 | +# - move all debug code (and maybe dump methods) to a separate module, with | |
| 235 | +# a class which inherits OleFileIO ? | |
| 236 | +# - fix docstrings to follow epydoc format | |
| 237 | +# - add support for big endian byte order ? | |
| 238 | +# - create a simple OLE explorer with wxPython | |
| 239 | + | |
| 240 | +# FUTURE EVOLUTIONS to add write support: | |
| 241 | +# see issue #6 on Bitbucket: | |
| 242 | +# https://bitbucket.org/decalage/olefileio_pl/issue/6/improve-olefileio_pl-to-write-ole-files | |
| 243 | + | |
| 244 | +#----------------------------------------------------------------------------- | |
| 245 | +# NOTES from PIL 1.1.6: | |
| 246 | + | |
| 247 | +# History: | |
| 248 | +# 1997-01-20 fl Created | |
| 249 | +# 1997-01-22 fl Fixed 64-bit portability quirk | |
| 250 | +# 2003-09-09 fl Fixed typo in OleFileIO.loadfat (noted by Daniel Haertle) | |
| 251 | +# 2004-02-29 fl Changed long hex constants to signed integers | |
| 252 | +# | |
| 253 | +# Notes: | |
| 254 | +# FIXME: sort out sign problem (eliminate long hex constants) | |
| 255 | +# FIXME: change filename to use "a/b/c" instead of ["a", "b", "c"] | |
| 256 | +# FIXME: provide a glob mechanism function (using fnmatchcase) | |
| 257 | +# | |
| 258 | +# Literature: | |
| 259 | +# | |
| 260 | +# "FlashPix Format Specification, Appendix A", Kodak and Microsoft, | |
| 261 | +# September 1996. | |
| 262 | +# | |
| 263 | +# Quotes: | |
| 264 | +# | |
| 265 | +# "If this document and functionality of the Software conflict, | |
| 266 | +# the actual functionality of the Software represents the correct | |
| 267 | +# functionality" -- Microsoft, in the OLE format specification | |
| 268 | + | |
| 269 | +#------------------------------------------------------------------------------ | |
| 270 | + | |
| 271 | + | |
| 272 | +import io | |
| 273 | +import sys | |
| 274 | +import struct, array, os.path, datetime, logging | |
| 275 | + | |
| 276 | +#=== COMPATIBILITY WORKAROUNDS ================================================ | |
| 277 | + | |
| 278 | +#[PL] Define explicitly the public API to avoid private objects in pydoc: | |
| 279 | +#TODO: add more | |
| 280 | +# __all__ = ['OleFileIO', 'isOleFile', 'MAGIC'] | |
| 281 | + | |
# For Python 3.x, need to redefine long as int:
if str is not bytes:
    long = int

# Need to make sure we use xrange both on Python 2 and 3.x:
try:
    # on Python 2 we need xrange:
    iterrange = xrange
except NameError:
    # catch NameError only (was a bare except), so that any unexpected error
    # is not silently hidden; on Python 3 xrange was renamed as range:
    iterrange = range

#[PL] workaround to fix an issue with array item size on 64 bits systems:
if array.array('L').itemsize == 4:
    # on 32 bits platforms, long integers in an array are 32 bits:
    UINT32 = 'L'
elif array.array('I').itemsize == 4:
    # on 64 bits platforms, integers in an array are 32 bits:
    UINT32 = 'I'
elif array.array('i').itemsize == 4:
    # On 64 bit Jython, signed integers ('i') are the only way to store our 32
    # bit values in an array in a *somewhat* reasonable way, as the otherwise
    # perfectly suited 'H' (unsigned int, 32 bits) results in a completely
    # unusable behaviour. This is most likely caused by the fact that Java
    # doesn't have unsigned values, and thus Jython's "array" implementation,
    # which is based on "jarray", doesn't have them either.
    # NOTE: to trick Jython into converting the values it would normally
    # interpret as "signed" into "unsigned", a binary-and operation with
    # 0xFFFFFFFF can be used. This way it is possible to use the same comparing
    # operations on all platforms / implementations. The corresponding code
    # lines are flagged with a 'JYTHON-WORKAROUND' tag below.
    UINT32 = 'i'
else:
    raise ValueError('Need to fix a bug with 32 bit arrays, please contact author...')


#[PL] These workarounds were inspired from the Path module
# (see http://www.jorendorff.com/articles/python/path/)
#TODO: test with old Python versions

# Pre-2.3 workaround for basestring.
try:
    basestring
except NameError:
    try:
        # is Unicode supported (Python >2.0 or >1.6 ?)
        basestring = (str, unicode)
    except NameError:
        basestring = str

#[PL] Experimental setting: if True, OLE filenames will be kept in Unicode
# if False (default PIL behaviour), all filenames are converted to Latin-1.
KEEP_UNICODE_NAMES = True

if sys.version_info[0] < 3:
    # On Python 2.x, the default encoding for path names is UTF-8:
    DEFAULT_PATH_ENCODING = 'utf-8'
else:
    # On Python 3.x, the default encoding for path names is Unicode (None):
    DEFAULT_PATH_ENCODING = None
| 342 | + | |
| 343 | + | |
# === LOGGING =================================================================

class NullHandler(logging.Handler):
    """
    Logging handler that silently discards every record.

    Attached to this module's logger so that nothing is printed unless the
    host application configures logging itself. logging.NullHandler exists
    since Python 2.7, but this module still supports 2.6:
    see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library
    """
    def emit(self, record):
        # intentionally drop the record
        pass

def get_logger(name, level=logging.CRITICAL+1):
    """
    Return a logger object suitable for this module.
    The root logger is never touched, so other modules' logs do not end up
    on the screen.
    An existing logger with the same name is reused; otherwise a fresh one
    is created with a single NullHandler attached. (Attaching a handler to
    an already-configured logger would duplicate every message.)
    The default level CRITICAL+1 disables all output.
    """
    # Determine up-front whether this logger already exists, because a new
    # one must get a NullHandler while an existing one must not:
    #NOTE: another less intrusive but more "hackish" solution would be to
    # use getLogger then test if its effective level is not default.
    already_registered = name in logging.Logger.manager.loggerDict
    logger = logging.getLogger(name)
    if not already_registered:
        # only add a NullHandler for this logger, it is up to the application
        # to configure its own logging:
        logger.addHandler(NullHandler())
    # in both cases, make sure the requested level is applied:
    logger.setLevel(level)
    return logger


# a global logger object used for debugging:
log = get_logger('olefile')


def enable_logging():
    """
    Enable logging for this module (disabled by default).
    This will set the module-specific logger level to NOTSET, which
    means the main application controls the actual logging level.
    """
    log.setLevel(logging.NOTSET)
| 394 | + | |
| 395 | + | |
#=== CONSTANTS ===============================================================

# magic bytes that should be at the beginning of every OLE file:
MAGIC = b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'

#[PL]: added constants for Sector IDs (from AAF specifications)
MAXREGSECT = 0xFFFFFFFA # (-6) maximum SECT
DIFSECT    = 0xFFFFFFFC # (-4) denotes a DIFAT sector in a FAT
FATSECT    = 0xFFFFFFFD # (-3) denotes a FAT sector in a FAT
ENDOFCHAIN = 0xFFFFFFFE # (-2) end of a virtual stream chain
FREESECT   = 0xFFFFFFFF # (-1) unallocated sector

#[PL]: added constants for Directory Entry IDs (from AAF specifications)
MAXREGSID = 0xFFFFFFFA # (-6) maximum directory entry ID
NOSTREAM  = 0xFFFFFFFF # (-1) unallocated directory entry

#[PL] object types in storage (from AAF specifications)
STGTY_EMPTY     = 0 # empty directory entry (according to OpenOffice.org doc)
STGTY_STORAGE   = 1 # element is a storage object
STGTY_STREAM    = 2 # element is a stream object
STGTY_LOCKBYTES = 3 # element is an ILockBytes object
STGTY_PROPERTY  = 4 # element is an IPropertyStorage object
STGTY_ROOT      = 5 # element is a root storage

# Unknown size for a stream (used by OleStream):
UNKNOWN_SIZE = 0x7FFFFFFF

#
# --------------------------------------------------------------------
# property types

VT_EMPTY=0; VT_NULL=1; VT_I2=2; VT_I4=3; VT_R4=4; VT_R8=5; VT_CY=6;
VT_DATE=7; VT_BSTR=8; VT_DISPATCH=9; VT_ERROR=10; VT_BOOL=11;
VT_VARIANT=12; VT_UNKNOWN=13; VT_DECIMAL=14; VT_I1=16; VT_UI1=17;
VT_UI2=18; VT_UI4=19; VT_I8=20; VT_UI8=21; VT_INT=22; VT_UINT=23;
VT_VOID=24; VT_HRESULT=25; VT_PTR=26; VT_SAFEARRAY=27; VT_CARRAY=28;
VT_USERDEFINED=29; VT_LPSTR=30; VT_LPWSTR=31; VT_FILETIME=64;
VT_BLOB=65; VT_STREAM=66; VT_STORAGE=67; VT_STREAMED_OBJECT=68;
VT_STORED_OBJECT=69; VT_BLOB_OBJECT=70; VT_CF=71; VT_CLSID=72;
VT_VECTOR=0x1000;

# map property id to name (for debugging purposes)

VT = {}
for keyword, var in list(vars().items()):
    if keyword[:3] == "VT_":
        VT[var] = keyword

#
# --------------------------------------------------------------------
# Some common document types (root.clsid fields)

WORD_CLSID = "00020900-0000-0000-C000-000000000046"
#TODO: check Excel, PPT, ...

#[PL]: Defect levels to classify parsing errors - see OleFileIO._raise_defect()
DEFECT_UNSURE =    10    # a case which looks weird, but not sure it's a defect
DEFECT_POTENTIAL = 20    # a potential defect
DEFECT_INCORRECT = 30    # an error according to specifications, but parsing
                         # can go on
DEFECT_FATAL =     40    # an error which cannot be ignored, parsing is
                         # impossible

# Minimal size of an empty OLE file, with 512-bytes sectors = 1536 bytes
# (this is used in isOleFile and OleFile.open)
MINIMAL_OLEFILE_SIZE = 1536

#[PL] add useful constants to __all__:
# for key in list(vars().keys()):
#     if key.startswith('STGTY_') or key.startswith('DEFECT_'):
#         __all__.append(key)


#=== FUNCTIONS ===============================================================

def isOleFile (filename):
    """
    Test if a file is an OLE container (according to the magic bytes in its header).

    :param filename: string-like or file-like object, OLE file to parse

        - if filename is a string smaller than 1536 bytes, it is the path
          of the file to open. (bytes or unicode string)
        - if filename is a string longer than 1535 bytes, it is parsed
          as the content of an OLE file in memory. (bytes type only)
        - if filename is a file-like object (with read and seek methods),
          it is parsed as-is.

    :returns: True if OLE, False otherwise.
    """
    # check if filename is a string-like or file-like object:
    if hasattr(filename, 'read'):
        # file-like object: use it directly
        header = filename.read(len(MAGIC))
        # just in case, seek back to start of file:
        filename.seek(0)
    elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE:
        # filename is a bytes string containing the OLE file to be parsed:
        header = filename[:len(MAGIC)]
    else:
        # string-like object: filename of file on disk
        # use a context manager so the file handle is always closed
        # (the previous version relied on the garbage collector):
        with open(filename, 'rb') as f:
            header = f.read(len(MAGIC))
    return header == MAGIC
| 502 | + | |
| 503 | + | |
if bytes is str:
    # Python 2.x: indexing a byte string yields a 1-char str, convert via ord()
    def i8(c):
        return ord(c)
else:
    # Python 3.x: indexing bytes already yields an int; accept either an int
    # (item of a bytes object) or a length-1 bytes slice
    def i8(c):
        if c.__class__ is int:
            return c
        return c[0]
| 512 | + | |
| 513 | + | |
| 514 | +#TODO: replace i16 and i32 with more readable struct.unpack equivalent? | |
| 515 | + | |
def i16(c, o=0):
    """
    Converts a 2-bytes (16 bits) string to an unsigned integer (little-endian).

    :param c: string/bytes containing the bytes to convert
    :param o: offset of bytes to convert in string
    :returns: int in range 0..0xFFFF
    """
    # single struct call instead of the previous i8()-based bit twiddling
    # (as suggested by the module TODO; '<H' = little-endian uint16):
    return struct.unpack_from('<H', c, o)[0]
| 524 | + | |
| 525 | + | |
def i32(c, o=0):
    """
    Converts a 4-bytes (32 bits) string to an unsigned integer (little-endian).

    :param c: string/bytes containing the bytes to convert
    :param o: offset of bytes to convert in string
    :returns: int in range 0..0xFFFFFFFF
    """
    # single struct call instead of the previous i8()-based bit twiddling
    # (as suggested by the module TODO; '<I' = little-endian uint32, which
    # matches the old OR-of-shifts result, always non-negative):
    return struct.unpack_from('<I', c, o)[0]
| 537 | + | |
| 538 | + | |
| 539 | +def _clsid(clsid): | |
| 540 | + """ | |
| 541 | + Converts a CLSID to a human-readable string. | |
| 542 | + | |
| 543 | + :param clsid: string of length 16. | |
| 544 | + """ | |
| 545 | + assert len(clsid) == 16 | |
| 546 | + # if clsid is only made of null bytes, return an empty string: | |
| 547 | + # (PL: why not simply return the string with zeroes?) | |
| 548 | + if not clsid.strip(b"\0"): | |
| 549 | + return "" | |
| 550 | + return (("%08X-%04X-%04X-%02X%02X-" + "%02X" * 6) % | |
| 551 | + ((i32(clsid, 0), i16(clsid, 4), i16(clsid, 6)) + | |
| 552 | + tuple(map(i8, clsid[8:16])))) | |
| 553 | + | |
| 554 | + | |
| 555 | + | |
def filetime2datetime(filetime):
    """
    Convert a FILETIME value (64-bit int, number of 100ns ticks since
    1601-01-01 00:00:00) to a Python datetime.datetime object.
    """
    # TODO: manage exception when microseconds is too large
    # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/
    # FILETIME counts 100ns ticks, so dividing by 10 yields microseconds:
    epoch = datetime.datetime(1601, 1, 1, 0, 0, 0)
    return epoch + datetime.timedelta(microseconds=filetime // 10)
| 565 | + | |
| 566 | + | |
| 567 | + | |
| 568 | +#=== CLASSES ================================================================== | |
| 569 | + | |
class OleMetadata:
    """
    class to parse and store metadata from standard properties of OLE files.

    Available attributes:
    codepage, title, subject, author, keywords, comments, template,
    last_saved_by, revision_number, total_edit_time, last_printed, create_time,
    last_saved_time, num_pages, num_words, num_chars, thumbnail,
    creating_application, security, codepage_doc, category, presentation_target,
    bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips,
    scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty,
    chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed,
    version, dig_sig, content_type, content_status, language, doc_version

    Note: an attribute is set to None when not present in the properties of the
    OLE file.

    References for SummaryInformation stream:
    - http://msdn.microsoft.com/en-us/library/dd942545.aspx
    - http://msdn.microsoft.com/en-us/library/dd925819%28v=office.12%29.aspx
    - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380376%28v=vs.85%29.aspx
    - http://msdn.microsoft.com/en-us/library/aa372045.aspx
    - http://sedna-soft.de/summary-information-stream/
    - http://poi.apache.org/apidocs/org/apache/poi/hpsf/SummaryInformation.html

    References for DocumentSummaryInformation stream:
    - http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx
    - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380374%28v=vs.85%29.aspx
    - http://poi.apache.org/apidocs/org/apache/poi/hpsf/DocumentSummaryInformation.html

    new in version 0.25
    """

    # attribute names for SummaryInformation stream properties:
    # (ordered by property id, starting at 1)
    SUMMARY_ATTRIBS = ['codepage', 'title', 'subject', 'author', 'keywords', 'comments',
                       'template', 'last_saved_by', 'revision_number', 'total_edit_time',
                       'last_printed', 'create_time', 'last_saved_time', 'num_pages',
                       'num_words', 'num_chars', 'thumbnail', 'creating_application',
                       'security']

    # attribute names for DocumentSummaryInformation stream properties:
    # (ordered by property id, starting at 1)
    DOCSUM_ATTRIBS = ['codepage_doc', 'category', 'presentation_target', 'bytes', 'lines', 'paragraphs',
                      'slides', 'notes', 'hidden_slides', 'mm_clips',
                      'scale_crop', 'heading_pairs', 'titles_of_parts', 'manager',
                      'company', 'links_dirty', 'chars_with_spaces', 'unused', 'shared_doc',
                      'link_base', 'hlinks', 'hlinks_changed', 'version', 'dig_sig',
                      'content_type', 'content_status', 'language', 'doc_version']

    def __init__(self):
        """
        Constructor for OleMetadata.
        All attributes are set to None by default.
        """
        # DRY: instead of 47 explicit "self.x = None" assignments, derive the
        # attribute set from the two class-level lists (same names, same
        # None defaults as before):
        for attrib in (self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS):
            setattr(self, attrib, None)

    def parse_properties(self, olefile):
        """
        Parse standard properties of an OLE file, from the streams
        "\x05SummaryInformation" and "\x05DocumentSummaryInformation",
        if present.
        Properties are converted to strings, integers or python datetime objects.
        If a property is not present, its value is set to None.

        :param olefile: OleFileIO object from which the property streams are read
        """
        # first set all attributes to None:
        for attrib in (self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS):
            setattr(self, attrib, None)
        # SummaryInformation: timestamps are converted to python datetime,
        # except total_edit_time which is a duration (property id #10):
        self._parse_stream(olefile, "\x05SummaryInformation",
                           self.SUMMARY_ATTRIBS, no_conversion=[10])
        self._parse_stream(olefile, "\x05DocumentSummaryInformation",
                           self.DOCSUM_ATTRIBS)

    def _parse_stream(self, olefile, stream_name, attribs, no_conversion=None):
        """
        Read one property stream (if present) and store its values into the
        attributes listed in attribs. (private helper for parse_properties)

        :param olefile: OleFileIO object to read from
        :param stream_name: name of the property stream
        :param attribs: ordered list of attribute names (property ids start at 1)
        :param no_conversion: optional list of property ids to keep unconverted
        """
        if not olefile.exists(stream_name):
            return
        # get properties from the stream, converting timestamps to datetime:
        if no_conversion is None:
            props = olefile.getproperties(stream_name, convert_time=True)
        else:
            props = olefile.getproperties(stream_name, convert_time=True,
                                          no_conversion=no_conversion)
        # store them into this object's attributes:
        # (ids for standard properties start at 0x01)
        for i, attrib in enumerate(attribs):
            setattr(self, attrib, props.get(i + 1, None))

    def dump(self):
        """
        Dump all metadata, for debugging purposes.
        """
        print('Properties from SummaryInformation stream:')
        for prop in self.SUMMARY_ATTRIBS:
            value = getattr(self, prop)
            print('- %s: %s' % (prop, repr(value)))
        print('Properties from DocumentSummaryInformation stream:')
        for prop in self.DOCSUM_ATTRIBS:
            value = getattr(self, prop)
            print('- %s: %s' % (prop, repr(value)))
| 720 | + | |
| 721 | + | |
| 722 | +#--- OleStream --------------------------------------------------------------- | |
| 723 | + | |
class OleStream(io.BytesIO):
    """
    OLE2 Stream

    Returns a read-only file object which can be used to read
    the contents of a OLE stream (instance of the BytesIO class).
    To open a stream, use the openstream method in the OleFile class.

    This function can be used with either ordinary streams,
    or ministreams, depending on the offset, sectorsize, and
    fat table arguments.

    Attributes:

        - size: actual size of data stream, after it was opened.
    """
    # FIXME: should store the list of sects obtained by following
    # the fat chain, and load new sectors on demand instead of
    # loading it all in one go.

    def __init__(self, fp, sect, size, offset, sectorsize, fat, filesize, olefileio):
        """
        Constructor for OleStream class.

        Reads the whole sector chain into memory up front, then exposes it
        through the inherited BytesIO interface. Structural problems are
        reported via olefileio._raise_defect with level DEFECT_INCORRECT
        (whether that raises or is only recorded depends on the OleFileIO
        configuration, defined elsewhere in this module).

        :param fp: file object, the OLE container or the MiniFAT stream
        :param sect: sector index of first sector in the stream
        :param size: total size of the stream
        :param offset: offset in bytes for the first FAT or MiniFAT sector
        :param sectorsize: size of one sector
        :param fat: array/list of sector indexes (FAT or MiniFAT)
        :param filesize: size of OLE file (for debugging)
        :param olefileio: OleFileIO object containing this stream
        :returns: a BytesIO instance containing the OLE stream
        """
        log.debug('OleStream.__init__:')
        log.debug(' sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s'
            %(sect,sect,size,offset,sectorsize,len(fat), repr(fp)))
        self.ole = olefileio
        #[PL] To detect malformed documents with FAT loops, we compute the
        # expected number of sectors in the stream:
        unknown_size = False
        if size == UNKNOWN_SIZE:
            # this is the case when called from OleFileIO._open(), and stream
            # size is not known in advance (for example when reading the
            # Directory stream). Then we can only guess maximum size:
            size = len(fat)*sectorsize
            # and we keep a record that size was unknown:
            unknown_size = True
            log.debug(' stream with UNKNOWN SIZE')
        # ceiling division: number of sectors needed to hold `size` bytes
        nb_sectors = (size + (sectorsize-1)) // sectorsize
        log.debug('nb_sectors = %d' % nb_sectors)
        # This number should (at least) be less than the total number of
        # sectors in the given FAT:
        if nb_sectors > len(fat):
            self.ole._raise_defect(DEFECT_INCORRECT, 'malformed OLE document, stream too large')
        # optimization(?): data is first a list of strings, and join() is called
        # at the end to concatenate all in one string.
        # (this may not be really useful with recent Python versions)
        data = []
        # if size is zero, then first sector index should be ENDOFCHAIN:
        if size == 0 and sect != ENDOFCHAIN:
            log.debug('size == 0 and sect != ENDOFCHAIN:')
            self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE sector index for empty stream')
        #[PL] A fixed-length for loop is used instead of an undefined while
        # loop to avoid DoS attacks: a FAT loop can never make us read more
        # than nb_sectors sectors.
        for i in range(nb_sectors):
            log.debug('Reading stream sector[%d] = %Xh' % (i, sect))
            # Sector index may be ENDOFCHAIN, but only if size was unknown
            if sect == ENDOFCHAIN:
                if unknown_size:
                    log.debug('Reached ENDOFCHAIN sector for stream with unknown size')
                    break
                else:
                    # else this means that the stream is smaller than declared:
                    log.debug('sect=ENDOFCHAIN before expected size')
                    self.ole._raise_defect(DEFECT_INCORRECT, 'incomplete OLE stream')
            # sector index should be within FAT:
            if sect<0 or sect>=len(fat):
                log.debug('sect=%d (%X) / len(fat)=%d' % (sect, sect, len(fat)))
                log.debug('i=%d / nb_sectors=%d' %(i, nb_sectors))
##                tmp_data = b"".join(data)
##                f = open('test_debug.bin', 'wb')
##                f.write(tmp_data)
##                f.close()
##                log.debug('data read so far: %d bytes' % len(tmp_data))
                self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE FAT, sector index out of range')
                # stop reading here if the exception is ignored:
                break
            #TODO: merge this code with OleFileIO.getsect() ?
            #TODO: check if this works with 4K sectors:
            try:
                fp.seek(offset + sectorsize * sect)
            # NOTE(review): bare except — also swallows unrelated errors
            # (e.g. KeyboardInterrupt); consider narrowing to OSError/ValueError
            except:
                log.debug('sect=%d, seek=%d, filesize=%d' %
                    (sect, offset+sectorsize*sect, filesize))
                self.ole._raise_defect(DEFECT_INCORRECT, 'OLE sector index out of range')
                # stop reading here if the exception is ignored:
                break
            sector_data = fp.read(sectorsize)
            # [PL] check if there was enough data:
            # Note: if sector is the last of the file, sometimes it is not a
            # complete sector (of 512 or 4K), so we may read less than
            # sectorsize.
            if len(sector_data)!=sectorsize and sect!=(len(fat)-1):
                log.debug('sect=%d / len(fat)=%d, seek=%d / filesize=%d, len read=%d' %
                    (sect, len(fat), offset+sectorsize*sect, filesize, len(sector_data)))
                log.debug('seek+len(read)=%d' % (offset+sectorsize*sect+len(sector_data)))
                self.ole._raise_defect(DEFECT_INCORRECT, 'incomplete OLE sector')
            data.append(sector_data)
            # jump to next sector in the FAT:
            try:
                # mask keeps the index non-negative on Jython, where array
                # items can come back as signed ints:
                sect = fat[sect] & 0xFFFFFFFF  # JYTHON-WORKAROUND
            except IndexError:
                # [PL] if pointer is out of the FAT an exception is raised
                self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE FAT, sector index out of range')
                # stop reading here if the exception is ignored:
                break
        #[PL] Last sector should be a "end of chain" marker:
        # if sect != ENDOFCHAIN:
        #     raise IOError('incorrect last sector index in OLE stream')
        data = b"".join(data)
        # Data is truncated to the actual stream size:
        if len(data) >= size:
            log.debug('Read data of length %d, truncated to stream size %d' % (len(data), size))
            data = data[:size]
            # actual stream size is stored for future use:
            self.size = size
        elif unknown_size:
            # actual stream size was not known, now we know the size of read
            # data:
            log.debug('Read data of length %d, the stream size was unkown' % len(data))
            self.size = len(data)
        else:
            # read data is less than expected:
            log.debug('Read data of length %d, less than expected stream size %d' % (len(data), size))
            # TODO: provide details in exception message
            self.ole._raise_defect(DEFECT_INCORRECT, 'OLE stream size is less than declared')
            self.size = len(data)
        # when all data is read in memory, BytesIO constructor is called
        io.BytesIO.__init__(self, data)
        # Then the OleStream object can be used as a read-only file object.
| 865 | + | |
| 866 | + | |
| 867 | +#--- OleDirectoryEntry ------------------------------------------------------- | |
| 868 | + | |
class OleDirectoryEntry:

    """
    OLE2 Directory Entry

    Represents one 128-byte entry of the OLE Directory stream: either the
    root entry, a storage (folder-like node), or a stream. Entries form a
    red-black tree via sid_left/sid_right/sid_child indexes.
    """
    #[PL] parsing code moved from OleFileIO.loaddirectory

    # struct to parse directory entries:
    # <: little-endian byte order, standard sizes
    # (note: this should guarantee that Q returns a 64 bits int)
    # 64s: string containing entry name in unicode UTF-16 (max 31 chars) + null char = 64 bytes
    # H: uint16, number of bytes used in name buffer, including null = (len+1)*2
    # B: uint8, dir entry type (between 0 and 5)
    # B: uint8, color: 0=black, 1=red
    # I: uint32, index of left child node in the red-black tree, NOSTREAM if none
    # I: uint32, index of right child node in the red-black tree, NOSTREAM if none
    # I: uint32, index of child root node if it is a storage, else NOSTREAM
    # 16s: CLSID, unique identifier (only used if it is a storage)
    # I: uint32, user flags
    # Q (was 8s): uint64, creation timestamp or zero
    # Q (was 8s): uint64, modification timestamp or zero
    # I: uint32, SID of first sector if stream or ministream, SID of 1st sector
    #    of stream containing ministreams if root entry, 0 otherwise
    # I: uint32, total stream size in bytes if stream (low 32 bits), 0 otherwise
    # I: uint32, total stream size in bytes if stream (high 32 bits), 0 otherwise
    STRUCT_DIRENTRY = '<64sHBBIII16sIQQIII'
    # size of a directory entry: 128 bytes
    DIRENTRY_SIZE = 128
    # sanity check at class-definition time: format must describe 128 bytes
    assert struct.calcsize(STRUCT_DIRENTRY) == DIRENTRY_SIZE


    def __init__(self, entry, sid, olefile):
        """
        Constructor for an OleDirectoryEntry object.
        Parses a 128-bytes entry from the OLE Directory stream.

        Structural anomalies are reported through olefile._raise_defect
        (may raise or only be recorded, depending on configuration).

        :param entry : string (must be 128 bytes long)
        :param sid : index of this directory entry in the OLE file directory
        :param olefile: OleFileIO containing this directory entry
        """
        self.sid = sid
        # ref to olefile is stored for future use
        self.olefile = olefile
        # kids is a list of children entries, if this entry is a storage:
        # (list of OleDirectoryEntry objects)
        self.kids = []
        # kids_dict is a dictionary of children entries, indexed by their
        # name in lowercase: used to quickly find an entry, and to detect
        # duplicates
        self.kids_dict = {}
        # flag used to detect if the entry is referenced more than once in
        # directory:
        self.used = False
        # decode DirEntry
        (
            self.name_raw,  # 64s: string containing entry name in unicode UTF-16 (max 31 chars) + null char = 64 bytes
            self.namelength,  # H: uint16, number of bytes used in name buffer, including null = (len+1)*2
            self.entry_type,
            self.color,
            self.sid_left,
            self.sid_right,
            self.sid_child,
            clsid,
            self.dwUserFlags,
            self.createTime,
            self.modifyTime,
            self.isectStart,
            self.sizeLow,
            self.sizeHigh
        ) = struct.unpack(OleDirectoryEntry.STRUCT_DIRENTRY, entry)
        if self.entry_type not in [STGTY_ROOT, STGTY_STORAGE, STGTY_STREAM, STGTY_EMPTY]:
            olefile._raise_defect(DEFECT_INCORRECT, 'unhandled OLE storage type')
        # only first directory entry can (and should) be root:
        if self.entry_type == STGTY_ROOT and sid != 0:
            olefile._raise_defect(DEFECT_INCORRECT, 'duplicate OLE root entry')
        if sid == 0 and self.entry_type != STGTY_ROOT:
            olefile._raise_defect(DEFECT_INCORRECT, 'incorrect OLE root entry')
        #log.debug(struct.unpack(fmt_entry, entry[:len_entry]))
        # name should be at most 31 unicode characters + null character,
        # so 64 bytes in total (31*2 + 2):
        if self.namelength>64:
            olefile._raise_defect(DEFECT_INCORRECT, 'incorrect DirEntry name length >64 bytes')
            # if exception not raised, namelength is set to the maximum value:
            self.namelength = 64
        # only characters without ending null char are kept:
        self.name_utf16 = self.name_raw[:(self.namelength-2)]
        #TODO: check if the name is actually followed by a null unicode character ([MS-CFB] 2.6.1)
        #TODO: check if the name does not contain forbidden characters:
        # [MS-CFB] 2.6.1: "The following characters are illegal and MUST NOT be part of the name: '/', '\', ':', '!'."
        # name is converted from UTF-16LE to the path encoding specified in the OleFileIO:
        self.name = olefile._decode_utf16_str(self.name_utf16)

        log.debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name)))
        log.debug(' - type: %d' % self.entry_type)
        log.debug(' - sect: %Xh' % self.isectStart)
        log.debug(' - SID left: %d, right: %d, child: %d' % (self.sid_left,
            self.sid_right, self.sid_child))

        # sizeHigh is only used for 4K sectors, it should be zero for 512 bytes
        # sectors, BUT apparently some implementations set it as 0xFFFFFFFF, 1
        # or some other value so it cannot be raised as a defect in general:
        if olefile.sectorsize == 512:
            if self.sizeHigh != 0 and self.sizeHigh != 0xFFFFFFFF:
                log.debug('sectorsize=%d, sizeLow=%d, sizeHigh=%d (%X)' %
                    (olefile.sectorsize, self.sizeLow, self.sizeHigh, self.sizeHigh))
                olefile._raise_defect(DEFECT_UNSURE, 'incorrect OLE stream size')
            self.size = self.sizeLow
        else:
            # NOTE(review): 'long' is a Python 2 name; presumably aliased to
            # int elsewhere in this module for Python 3 — verify.
            self.size = self.sizeLow + (long(self.sizeHigh)<<32)
        log.debug(' - size: %d (sizeLow=%d, sizeHigh=%d)' % (self.size, self.sizeLow, self.sizeHigh))

        self.clsid = _clsid(clsid)
        # a storage should have a null size, BUT some implementations such as
        # Word 8 for Mac seem to allow non-null values => Potential defect:
        if self.entry_type == STGTY_STORAGE and self.size != 0:
            olefile._raise_defect(DEFECT_POTENTIAL, 'OLE storage with size>0')
        # check if stream is not already referenced elsewhere:
        if self.entry_type in (STGTY_ROOT, STGTY_STREAM) and self.size>0:
            if self.size < olefile.minisectorcutoff \
            and self.entry_type==STGTY_STREAM: # only streams can be in MiniFAT
                # ministream object
                minifat = True
            else:
                minifat = False
            olefile._check_duplicate_stream(self.isectStart, minifat)



    def build_storage_tree(self):
        """
        Read and build the red-black tree attached to this OleDirectoryEntry
        object, if it is a storage.
        Note that this method builds a tree of all subentries, so it should
        only be called for the root object once.
        """
        log.debug('build_storage_tree: SID=%d - %s - sid_child=%d'
            % (self.sid, repr(self.name), self.sid_child))
        if self.sid_child != NOSTREAM:
            # if child SID is not NOSTREAM, then this entry is a storage.
            # Let's walk through the tree of children to fill the kids list:
            self.append_kids(self.sid_child)

            # Note from OpenOffice documentation: the safest way is to
            # recreate the tree because some implementations may store broken
            # red-black trees...

            # in the OLE file, entries are sorted on (length, name).
            # for convenience, we sort them on name instead:
            # (see rich comparison methods in this class)
            self.kids.sort()


    def append_kids(self, child_sid):
        """
        Walk through red-black tree of children of this directory entry to add
        all of them to the kids list. (recursive method)

        :param child_sid : index of child directory entry to use, or None when called
            first time for the root. (only used during recursion)
        """
        log.debug('append_kids: child_sid=%d' % child_sid)
        #[PL] this method was added to use simple recursion instead of a complex
        # algorithm.
        # if this is not a storage or a leaf of the tree, nothing to do:
        if child_sid == NOSTREAM:
            return
        # check if child SID is in the proper range:
        if child_sid<0 or child_sid>=len(self.olefile.direntries):
            self.olefile._raise_defect(DEFECT_INCORRECT, 'OLE DirEntry index out of range')
        else:
            # get child direntry:
            child = self.olefile._load_direntry(child_sid) #direntries[child_sid]
            log.debug('append_kids: child_sid=%d - %s - sid_left=%d, sid_right=%d, sid_child=%d'
                % (child.sid, repr(child.name), child.sid_left, child.sid_right, child.sid_child))
            # the directory entries are organized as a red-black tree.
            # (cf. Wikipedia for details)
            # First walk through left side of the tree:
            self.append_kids(child.sid_left)
            # Check if its name is not already used (case-insensitive):
            name_lower = child.name.lower()
            if name_lower in self.kids_dict:
                self.olefile._raise_defect(DEFECT_INCORRECT,
                    "Duplicate filename in OLE storage")
            # Then the child_sid OleDirectoryEntry object is appended to the
            # kids list and dictionary:
            self.kids.append(child)
            self.kids_dict[name_lower] = child
            # Check if kid was not already referenced in a storage:
            if child.used:
                self.olefile._raise_defect(DEFECT_INCORRECT,
                    'OLE Entry referenced more than once')
            child.used = True
            # Finally walk through right side of the tree:
            self.append_kids(child.sid_right)
            # Afterwards build kid's own tree if it's also a storage:
            child.build_storage_tree()


    def __eq__(self, other):
        "Compare entries by name"
        return self.name == other.name

    def __lt__(self, other):
        "Compare entries by name"
        return self.name < other.name

    # NOTE(review): defining __eq__ without __hash__ makes instances
    # unhashable on Python 3 — confirm no caller uses entries as dict keys.
    def __ne__(self, other):
        "Compare entries by name (negation of __eq__)"
        return not self.__eq__(other)

    def __le__(self, other):
        "Compare entries by name (composition of __eq__ and __lt__)"
        return self.__eq__(other) or self.__lt__(other)

    # Reflected __lt__() and __le__() will be used for __gt__() and __ge__()

    #TODO: replace by the same function as MS implementation ?
    # (order by name length first, then case-insensitive order)


    def dump(self, tab = 0):
        "Dump this entry, and all its subentries (for debug purposes only)"
        TYPES = ["(invalid)", "(storage)", "(stream)", "(lockbytes)",
                 "(property)", "(root)"]
        print(" "*tab + repr(self.name), TYPES[self.entry_type], end=' ')
        if self.entry_type in (STGTY_STREAM, STGTY_ROOT):
            print(self.size, "bytes", end=' ')
        print()
        if self.entry_type in (STGTY_STORAGE, STGTY_ROOT) and self.clsid:
            print(" "*tab + "{%s}" % self.clsid)

        for kid in self.kids:
            kid.dump(tab + 2)


    def getmtime(self):
        """
        Return modification time of a directory entry.

        :returns: None if modification time is null, a python datetime object
            otherwise (UTC timezone)

        new in version 0.26
        """
        if self.modifyTime == 0:
            return None
        return filetime2datetime(self.modifyTime)


    def getctime(self):
        """
        Return creation time of a directory entry.

        :returns: None if modification time is null, a python datetime object
            otherwise (UTC timezone)

        new in version 0.26
        """
        if self.createTime == 0:
            return None
        return filetime2datetime(self.createTime)
| 1128 | + | |
| 1129 | + | |
| 1130 | +#--- OleFileIO ---------------------------------------------------------------- | |
| 1131 | + | |
| 1132 | +class OleFileIO: | |
| 1133 | + """ | |
| 1134 | + OLE container object | |
| 1135 | + | |
| 1136 | + This class encapsulates the interface to an OLE 2 structured | |
| 1137 | + storage file. Use the listdir and openstream methods to | |
| 1138 | + access the contents of this file. | |
| 1139 | + | |
| 1140 | + Object names are given as a list of strings, one for each subentry | |
| 1141 | + level. The root entry should be omitted. For example, the following | |
| 1142 | + code extracts all image streams from a Microsoft Image Composer file:: | |
| 1143 | + | |
| 1144 | + ole = OleFileIO("fan.mic") | |
| 1145 | + | |
| 1146 | + for entry in ole.listdir(): | |
| 1147 | + if entry[1:2] == "Image": | |
| 1148 | + fin = ole.openstream(entry) | |
| 1149 | + fout = open(entry[0:1], "wb") | |
| 1150 | + while True: | |
| 1151 | + s = fin.read(8192) | |
| 1152 | + if not s: | |
| 1153 | + break | |
| 1154 | + fout.write(s) | |
| 1155 | + | |
| 1156 | + You can use the viewer application provided with the Python Imaging | |
| 1157 | + Library to view the resulting files (which happens to be standard | |
| 1158 | + TIFF files). | |
| 1159 | + """ | |
| 1160 | + | |
| 1161 | + def __init__(self, filename=None, raise_defects=DEFECT_FATAL, | |
| 1162 | + write_mode=False, debug=False, path_encoding=DEFAULT_PATH_ENCODING): | |
| 1163 | + """ | |
| 1164 | + Constructor for the OleFileIO class. | |
| 1165 | + | |
| 1166 | + :param filename: file to open. | |
| 1167 | + | |
| 1168 | + - if filename is a string smaller than 1536 bytes, it is the path | |
| 1169 | + of the file to open. (bytes or unicode string) | |
| 1170 | + - if filename is a string longer than 1535 bytes, it is parsed | |
| 1171 | + as the content of an OLE file in memory. (bytes type only) | |
| 1172 | + - if filename is a file-like object (with read, seek and tell methods), | |
| 1173 | + it is parsed as-is. | |
| 1174 | + | |
| 1175 | + :param raise_defects: minimal level for defects to be raised as exceptions. | |
| 1176 | + (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a | |
| 1177 | + security-oriented application, see source code for details) | |
| 1178 | + | |
| 1179 | + :param write_mode: bool, if True the file is opened in read/write mode instead | |
| 1180 | + of read-only by default. | |
| 1181 | + | |
| 1182 | + :param debug: bool, set debug mode (deprecated, not used anymore) | |
| 1183 | + | |
| 1184 | + :param path_encoding: None or str, name of the codec to use for path | |
| 1185 | + names (streams and storages), or None for Unicode. | |
| 1186 | + Unicode by default on Python 3+, UTF-8 on Python 2.x. | |
| 1187 | + (new in olefile 0.42, was hardcoded to Latin-1 until olefile v0.41) | |
| 1188 | + """ | |
| 1189 | + # minimal level for defects to be raised as exceptions: | |
| 1190 | + self._raise_defects_level = raise_defects | |
| 1191 | + # list of defects/issues not raised as exceptions: | |
| 1192 | + # tuples of (exception type, message) | |
| 1193 | + self.parsing_issues = [] | |
| 1194 | + self.write_mode = write_mode | |
| 1195 | + self.path_encoding = path_encoding | |
| 1196 | + self._filesize = None | |
| 1197 | + self.fp = None | |
| 1198 | + if filename: | |
| 1199 | + self.open(filename, write_mode=write_mode) | |
| 1200 | + | |
| 1201 | + | |
| 1202 | + def _raise_defect(self, defect_level, message, exception_type=IOError): | |
| 1203 | + """ | |
| 1204 | + This method should be called for any defect found during file parsing. | |
| 1205 | + It may raise an IOError exception according to the minimal level chosen | |
| 1206 | + for the OleFileIO object. | |
| 1207 | + | |
| 1208 | + :param defect_level: defect level, possible values are: | |
| 1209 | + | |
| 1210 | + - DEFECT_UNSURE : a case which looks weird, but not sure it's a defect | |
| 1211 | + - DEFECT_POTENTIAL : a potential defect | |
| 1212 | + - DEFECT_INCORRECT : an error according to specifications, but parsing can go on | |
| 1213 | + - DEFECT_FATAL : an error which cannot be ignored, parsing is impossible | |
| 1214 | + | |
| 1215 | + :param message: string describing the defect, used with raised exception. | |
| 1216 | + :param exception_type: exception class to be raised, IOError by default | |
| 1217 | + """ | |
| 1218 | + # added by [PL] | |
| 1219 | + if defect_level >= self._raise_defects_level: | |
| 1220 | + log.error(message) | |
| 1221 | + raise exception_type(message) | |
| 1222 | + else: | |
| 1223 | + # just record the issue, no exception raised: | |
| 1224 | + self.parsing_issues.append((exception_type, message)) | |
| 1225 | + log.warning(message) | |
| 1226 | + | |
| 1227 | + | |
| 1228 | + def _decode_utf16_str(self, utf16_str, errors='replace'): | |
| 1229 | + """ | |
| 1230 | + Decode a string encoded in UTF-16 LE format, as found in the OLE | |
| 1231 | + directory or in property streams. Return a string encoded | |
| 1232 | + according to the path_encoding specified for the OleFileIO object. | |
| 1233 | + | |
| 1234 | + :param utf16_str: bytes string encoded in UTF-16 LE format | |
| 1235 | + :param errors: str, see python documentation for str.decode() | |
| 1236 | + :return: str, encoded according to path_encoding | |
| 1237 | + """ | |
| 1238 | + unicode_str = utf16_str.decode('UTF-16LE', errors) | |
| 1239 | + if self.path_encoding: | |
| 1240 | + # an encoding has been specified for path names: | |
| 1241 | + return unicode_str.encode(self.path_encoding, errors) | |
| 1242 | + else: | |
| 1243 | + # path_encoding=None, return the Unicode string as-is: | |
| 1244 | + return unicode_str | |
| 1245 | + | |
| 1246 | + | |
    def open(self, filename, write_mode=False):
        """
        Open an OLE2 file in read-only or read/write mode.
        Read and parse the header, FAT and directory.

        :param filename: string-like or file-like object, OLE file to parse

            - if filename is a string smaller than 1536 bytes, it is the path
              of the file to open. (bytes or unicode string)
            - if filename is a string longer than 1535 bytes, it is parsed
              as the content of an OLE file in memory. (bytes type only)
            - if filename is a file-like object (with read, seek and tell methods),
              it is parsed as-is.

        :param write_mode: bool, if True the file is opened in read/write mode instead
            of read-only by default. (ignored if filename is not a path)
        """
        self.write_mode = write_mode
        #[PL] check if filename is a string-like or file-like object:
        # (it is better to check for a read() method)
        if hasattr(filename, 'read'):
            #TODO: also check seek and tell methods?
            # file-like object: use it directly
            self.fp = filename
        elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE:
            # filename is a bytes string containing the OLE file to be parsed:
            # convert it to BytesIO
            self.fp = io.BytesIO(filename)
        else:
            # string-like object: filename of file on disk
            if self.write_mode:
                # open file in mode 'read with update, binary'
                # According to https://docs.python.org/2/library/functions.html#open
                # 'w' would truncate the file, 'a' may only append on some Unixes
                mode = 'r+b'
            else:
                # read-only mode by default
                mode = 'rb'
            self.fp = open(filename, mode)
        # obtain the filesize by using seek and tell, which should work on most
        # file-like objects:
        #TODO: do it above, using getsize with filename when possible?
        #TODO: fix code to fail with clear exception when filesize cannot be obtained
        filesize=0
        self.fp.seek(0, os.SEEK_END)
        try:
            filesize = self.fp.tell()
        finally:
            self.fp.seek(0)
        self._filesize = filesize
        log.debug('File size: %d bytes (%Xh)' % (self._filesize, self._filesize))

        # lists of streams in FAT and MiniFAT, to detect duplicate references
        # (list of indexes of first sectors of each stream)
        self._used_streams_fat = []
        self._used_streams_minifat = []

        # NOTE: the header sector is always 512 bytes, even when sector_size is 4KB
        header = self.fp.read(512)

        if len(header) != 512 or header[:8] != MAGIC:
            log.debug('Magic = %r instead of %r' % (header[:8], MAGIC))
            self._raise_defect(DEFECT_FATAL, "not an OLE2 structured storage file")

        # [PL] header structure according to AAF specifications:
        ##Header
        ##struct StructuredStorageHeader { // [offset from start (bytes), length (bytes)]
        ##BYTE _abSig[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1,
        ## // 0x1a, 0xe1} for current version
        ##CLSID _clsid; // [08H,16] reserved must be zero (WriteClassStg/
        ## // GetClassFile uses root directory class id)
        ##USHORT _uMinorVersion; // [18H,02] minor version of the format: 33 is
        ## // written by reference implementation
        ##USHORT _uDllVersion; // [1AH,02] major version of the dll/format: 3 for
        ## // 512-byte sectors, 4 for 4 KB sectors
        ##USHORT _uByteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering
        ##USHORT _uSectorShift; // [1EH,02] size of sectors in power-of-two;
        ## // typically 9 indicating 512-byte sectors
        ##USHORT _uMiniSectorShift; // [20H,02] size of mini-sectors in power-of-two;
        ## // typically 6 indicating 64-byte mini-sectors
        ##USHORT _usReserved; // [22H,02] reserved, must be zero
        ##ULONG _ulReserved1; // [24H,04] reserved, must be zero
        ##FSINDEX _csectDir; // [28H,04] must be zero for 512-byte sectors,
        ## // number of SECTs in directory chain for 4 KB
        ## // sectors
        ##FSINDEX _csectFat; // [2CH,04] number of SECTs in the FAT chain
        ##SECT _sectDirStart; // [30H,04] first SECT in the directory chain
        ##DFSIGNATURE _signature; // [34H,04] signature used for transactions; must
        ## // be zero. The reference implementation
        ## // does not support transactions
        ##ULONG _ulMiniSectorCutoff; // [38H,04] maximum size for a mini stream;
        ## // typically 4096 bytes
        ##SECT _sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain
        ##FSINDEX _csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain
        ##SECT _sectDifStart; // [44H,04] first SECT in the DIFAT chain
        ##FSINDEX _csectDif; // [48H,04] number of SECTs in the DIFAT chain
        ##SECT _sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors
        ##};

        # [PL] header decoding:
        # '<' indicates little-endian byte ordering for Intel (cf. struct module help)
        fmt_header = '<8s16sHHHHHHLLLLLLLLLL'
        header_size = struct.calcsize(fmt_header)
        log.debug( "fmt_header size = %d, +FAT = %d" % (header_size, header_size + 109*4) )
        header1 = header[:header_size]
        (
            self.header_signature,
            self.header_clsid,
            self.minor_version,
            self.dll_version,
            self.byte_order,
            self.sector_shift,
            self.mini_sector_shift,
            self.reserved1,
            self.reserved2,
            self.num_dir_sectors,
            self.num_fat_sectors,
            self.first_dir_sector,
            self.transaction_signature_number,
            self.mini_stream_cutoff_size,
            self.first_mini_fat_sector,
            self.num_mini_fat_sectors,
            self.first_difat_sector,
            self.num_difat_sectors
        ) = struct.unpack(fmt_header, header1)
        log.debug( struct.unpack(fmt_header, header1))

        if self.header_signature != MAGIC:
            # OLE signature should always be present
            self._raise_defect(DEFECT_FATAL, "incorrect OLE signature")
        if self.header_clsid != bytearray(16):
            # according to AAF specs, CLSID should always be zero
            self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header")
        log.debug( "Minor Version = %d" % self.minor_version )
        # TODO: according to MS-CFB, minor version should be 0x003E
        log.debug( "DLL Version = %d (expected: 3 or 4)" % self.dll_version )
        if self.dll_version not in [3, 4]:
            # version 3: usual format, 512 bytes per sector
            # version 4: large format, 4K per sector
            self._raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header")
        log.debug( "Byte Order = %X (expected: FFFE)" % self.byte_order )
        if self.byte_order != 0xFFFE:
            # For now only common little-endian documents are handled correctly
            self._raise_defect(DEFECT_FATAL, "incorrect ByteOrder in OLE header")
            # TODO: add big-endian support for documents created on Mac ?
            # But according to [MS-CFB] ? v20140502, ByteOrder MUST be 0xFFFE.
        self.sector_size = 2**self.sector_shift
        log.debug( "Sector Size = %d bytes (expected: 512 or 4096)" % self.sector_size )
        if self.sector_size not in [512, 4096]:
            self._raise_defect(DEFECT_INCORRECT, "incorrect sector_size in OLE header")
        if (self.dll_version==3 and self.sector_size!=512) \
        or (self.dll_version==4 and self.sector_size!=4096):
            self._raise_defect(DEFECT_INCORRECT, "sector_size does not match DllVersion in OLE header")
        self.mini_sector_size = 2**self.mini_sector_shift
        log.debug( "MiniFAT Sector Size = %d bytes (expected: 64)" % self.mini_sector_size )
        if self.mini_sector_size not in [64]:
            self._raise_defect(DEFECT_INCORRECT, "incorrect mini_sector_size in OLE header")
        if self.reserved1 != 0 or self.reserved2 != 0:
            self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null reserved bytes)")
        log.debug( "Number of Directory sectors = %d" % self.num_dir_sectors )
        # Number of directory sectors (only allowed if DllVersion != 3)
        if self.sector_size==512 and self.num_dir_sectors!=0:
            self._raise_defect(DEFECT_INCORRECT, "incorrect number of directory sectors in OLE header")
        log.debug( "Number of FAT sectors = %d" % self.num_fat_sectors )
        # num_fat_sectors = number of FAT sectors in the file
        log.debug( "First Directory sector = %Xh" % self.first_dir_sector )
        # first_dir_sector = 1st sector containing the directory
        log.debug( "Transaction Signature Number = %d" % self.transaction_signature_number )
        # Signature should be zero, BUT some implementations do not follow this
        # rule => only a potential defect:
        # (according to MS-CFB, may be != 0 for applications supporting file
        # transactions)
        if self.transaction_signature_number != 0:
            self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header (transaction_signature_number>0)")
        log.debug( "Mini Stream cutoff size = %Xh (expected: 1000h)" % self.mini_stream_cutoff_size )
        # MS-CFB: This integer field MUST be set to 0x00001000. This field
        # specifies the maximum size of a user-defined data stream allocated
        # from the mini FAT and mini stream, and that cutoff is 4096 bytes.
        # Any user-defined data stream larger than or equal to this cutoff size
        # must be allocated as normal sectors from the FAT.
        if self.mini_stream_cutoff_size != 0x1000:
            self._raise_defect(DEFECT_INCORRECT, "incorrect mini_stream_cutoff_size in OLE header")
            # if no exception is raised, the cutoff size is fixed to 0x1000
            log.warning('Fixing the mini_stream_cutoff_size to 4096 (mandatory value) instead of %d' %
                        self.mini_stream_cutoff_size)
            self.mini_stream_cutoff_size = 0x1000
        # TODO: check if these values are OK
        log.debug( "First MiniFAT sector = %Xh" % self.first_mini_fat_sector )
        log.debug( "Number of MiniFAT sectors = %d" % self.num_mini_fat_sectors )
        log.debug( "First DIFAT sector = %Xh" % self.first_difat_sector )
        log.debug( "Number of DIFAT sectors = %d" % self.num_difat_sectors )

        # calculate the number of sectors in the file
        # (-1 because header doesn't count)
        self.nb_sect = ( (filesize + self.sector_size-1) // self.sector_size) - 1
        log.debug( "Maximum number of sectors in the file: %d (%Xh)" % (self.nb_sect, self.nb_sect))
        #TODO: change this test, because an OLE file MAY contain other data
        # after the last sector.

        # file clsid
        # (overwrites the raw bytes unpacked above with a formatted string)
        self.header_clsid = _clsid(header[8:24])

        #TODO: remove redundant attributes, and fix the code which uses them?
        self.sectorsize = self.sector_size #1 << i16(header, 30)
        self.minisectorsize = self.mini_sector_size #1 << i16(header, 32)
        self.minisectorcutoff = self.mini_stream_cutoff_size # i32(header, 56)

        # check known streams for duplicate references (these are always in FAT,
        # never in MiniFAT):
        self._check_duplicate_stream(self.first_dir_sector)
        # check MiniFAT only if it is not empty:
        if self.num_mini_fat_sectors:
            self._check_duplicate_stream(self.first_mini_fat_sector)
        # check DIFAT only if it is not empty:
        if self.num_difat_sectors:
            self._check_duplicate_stream(self.first_difat_sector)

        # Load file allocation tables
        self.loadfat(header)
        # Load directory.  This sets both the direntries list (ordered by sid)
        # and the root (ordered by hierarchy) members.
        self.loaddirectory(self.first_dir_sector)
        self.ministream = None
        self.minifatsect = self.first_mini_fat_sector
| 1470 | + | |
| 1471 | + | |
| 1472 | + def close(self): | |
| 1473 | + """ | |
| 1474 | + close the OLE file, to release the file object | |
| 1475 | + """ | |
| 1476 | + self.fp.close() | |
| 1477 | + | |
| 1478 | + | |
| 1479 | + def _check_duplicate_stream(self, first_sect, minifat=False): | |
| 1480 | + """ | |
| 1481 | + Checks if a stream has not been already referenced elsewhere. | |
| 1482 | + This method should only be called once for each known stream, and only | |
| 1483 | + if stream size is not null. | |
| 1484 | + | |
| 1485 | + :param first_sect: int, index of first sector of the stream in FAT | |
| 1486 | + :param minifat: bool, if True, stream is located in the MiniFAT, else in the FAT | |
| 1487 | + """ | |
| 1488 | + if minifat: | |
| 1489 | + log.debug('_check_duplicate_stream: sect=%Xh in MiniFAT' % first_sect) | |
| 1490 | + used_streams = self._used_streams_minifat | |
| 1491 | + else: | |
| 1492 | + log.debug('_check_duplicate_stream: sect=%Xh in FAT' % first_sect) | |
| 1493 | + # some values can be safely ignored (not a real stream): | |
| 1494 | + if first_sect in (DIFSECT,FATSECT,ENDOFCHAIN,FREESECT): | |
| 1495 | + return | |
| 1496 | + used_streams = self._used_streams_fat | |
| 1497 | + #TODO: would it be more efficient using a dict or hash values, instead | |
| 1498 | + # of a list of long ? | |
| 1499 | + if first_sect in used_streams: | |
| 1500 | + self._raise_defect(DEFECT_INCORRECT, 'Stream referenced twice') | |
| 1501 | + else: | |
| 1502 | + used_streams.append(first_sect) | |
| 1503 | + | |
| 1504 | + | |
| 1505 | + def dumpfat(self, fat, firstindex=0): | |
| 1506 | + """ | |
| 1507 | + Display a part of FAT in human-readable form for debugging purposes | |
| 1508 | + """ | |
| 1509 | + # dictionary to convert special FAT values in human-readable strings | |
| 1510 | + VPL = 8 # values per line (8+1 * 8+1 = 81) | |
| 1511 | + fatnames = { | |
| 1512 | + FREESECT: "..free..", | |
| 1513 | + ENDOFCHAIN: "[ END. ]", | |
| 1514 | + FATSECT: "FATSECT ", | |
| 1515 | + DIFSECT: "DIFSECT " | |
| 1516 | + } | |
| 1517 | + nbsect = len(fat) | |
| 1518 | + nlines = (nbsect+VPL-1)//VPL | |
| 1519 | + print("index", end=" ") | |
| 1520 | + for i in range(VPL): | |
| 1521 | + print("%8X" % i, end=" ") | |
| 1522 | + print() | |
| 1523 | + for l in range(nlines): | |
| 1524 | + index = l*VPL | |
| 1525 | + print("%6X:" % (firstindex+index), end=" ") | |
| 1526 | + for i in range(index, index+VPL): | |
| 1527 | + if i>=nbsect: | |
| 1528 | + break | |
| 1529 | + sect = fat[i] | |
| 1530 | + aux = sect & 0xFFFFFFFF # JYTHON-WORKAROUND | |
| 1531 | + if aux in fatnames: | |
| 1532 | + name = fatnames[aux] | |
| 1533 | + else: | |
| 1534 | + if sect == i+1: | |
| 1535 | + name = " --->" | |
| 1536 | + else: | |
| 1537 | + name = "%8X" % sect | |
| 1538 | + print(name, end=" ") | |
| 1539 | + print() | |
| 1540 | + | |
| 1541 | + | |
| 1542 | + def dumpsect(self, sector, firstindex=0): | |
| 1543 | + """ | |
| 1544 | + Display a sector in a human-readable form, for debugging purposes | |
| 1545 | + """ | |
| 1546 | + VPL=8 # number of values per line (8+1 * 8+1 = 81) | |
| 1547 | + tab = array.array(UINT32, sector) | |
| 1548 | + if sys.byteorder == 'big': | |
| 1549 | + tab.byteswap() | |
| 1550 | + nbsect = len(tab) | |
| 1551 | + nlines = (nbsect+VPL-1)//VPL | |
| 1552 | + print("index", end=" ") | |
| 1553 | + for i in range(VPL): | |
| 1554 | + print("%8X" % i, end=" ") | |
| 1555 | + print() | |
| 1556 | + for l in range(nlines): | |
| 1557 | + index = l*VPL | |
| 1558 | + print("%6X:" % (firstindex+index), end=" ") | |
| 1559 | + for i in range(index, index+VPL): | |
| 1560 | + if i>=nbsect: | |
| 1561 | + break | |
| 1562 | + sect = tab[i] | |
| 1563 | + name = "%8X" % sect | |
| 1564 | + print(name, end=" ") | |
| 1565 | + print() | |
| 1566 | + | |
| 1567 | + def sect2array(self, sect): | |
| 1568 | + """ | |
| 1569 | + convert a sector to an array of 32 bits unsigned integers, | |
| 1570 | + swapping bytes on big endian CPUs such as PowerPC (old Macs) | |
| 1571 | + """ | |
| 1572 | + a = array.array(UINT32, sect) | |
| 1573 | + # if CPU is big endian, swap bytes: | |
| 1574 | + if sys.byteorder == 'big': | |
| 1575 | + a.byteswap() | |
| 1576 | + return a | |
| 1577 | + | |
| 1578 | + | |
    def loadfat_sect(self, sect):
        """
        Adds the indexes of the given sector to the FAT.

        Walks the sector chain listed in sect, reading each referenced FAT
        sector and appending its contents to self.fat, until ENDOFCHAIN or
        FREESECT is found.

        :param sect: string containing the first FAT sector, or array of long integers
        :returns: index of last FAT sector processed (the terminator value,
            or None if sect was empty).
        """
        # a FAT sector is an array of ulong integers.
        if isinstance(sect, array.array):
            # if sect is already an array it is directly used
            fat1 = sect
        else:
            # if it's a raw sector, it is parsed in an array
            fat1 = self.sect2array(sect)
            # Display the sector contents only if the logging level is debug:
            if log.isEnabledFor(logging.DEBUG):
                self.dumpsect(sect)
        # The FAT is a sector chain starting at the first index of itself.
        # initialize isect, just in case:
        isect = None
        for isect in fat1:
            isect = isect & 0xFFFFFFFF  # JYTHON-WORKAROUND
            log.debug("isect = %X" % isect)
            if isect == ENDOFCHAIN or isect == FREESECT:
                # the end of the sector chain has been reached
                log.debug("found end of sector chain")
                break
            # read the FAT sector
            s = self.getsect(isect)
            # parse it as an array of 32 bits integers, and add it to the
            # global FAT array
            nextfat = self.sect2array(s)
            self.fat = self.fat + nextfat
        return isect
| 1613 | + | |
| 1614 | + | |
    def loadfat(self, header):
        """
        Load the FAT table into self.fat.

        :param header: bytes string, the 512-byte header sector of the file
            (bytes 76-511 contain the indexes of the first 109 FAT sectors).
        """
        # The 1st sector of the file contains sector numbers for the first 109
        # FAT sectors, right after the header which is 76 bytes long.
        # (always 109, whatever the sector size: 512 bytes = 76+4*109)
        # Additional sectors are described by DIF blocks

        log.debug('Loading the FAT table, starting with the 1st sector after the header')
        sect = header[76:512]
        log.debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)//4) )
        #fat = []
        # [PL] FAT is an array of 32 bits unsigned ints, it's more effective
        # to use an array than a list in Python.
        # It's initialized as empty first:
        self.fat = array.array(UINT32)
        self.loadfat_sect(sect)
        #self.dumpfat(self.fat)
##      for i in range(0, len(sect), 4):
##          ix = i32(sect, i)
##          #[PL] if ix == -2 or ix == -1: # ix == 0xFFFFFFFE or ix == 0xFFFFFFFF:
##          if ix == 0xFFFFFFFE or ix == 0xFFFFFFFF:
##              break
##          s = self.getsect(ix)
##          #fat = fat + [i32(s, i) for i in range(0, len(s), 4)]
##          fat = fat + array.array(UINT32, s)
        if self.num_difat_sectors != 0:
            log.debug('DIFAT is used, because file size > 6.8MB.')
            # [PL] There's a DIFAT because file is larger than 6.8MB
            # some checks just in case:
            if self.num_fat_sectors <= 109:
                # there must be at least 109 blocks in header and the rest in
                # DIFAT, so number of sectors must be >109.
                self._raise_defect(DEFECT_INCORRECT, 'incorrect DIFAT, not enough sectors')
            if self.first_difat_sector >= self.nb_sect:
                # initial DIFAT block index must be valid
                self._raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out of range')
            log.debug( "DIFAT analysis..." )
            # We compute the necessary number of DIFAT sectors :
            # Number of pointers per DIFAT sector = (sectorsize/4)-1
            # (-1 because the last pointer is the next DIFAT sector number)
            nb_difat_sectors = (self.sectorsize//4)-1
            # (if 512 bytes: each DIFAT sector = 127 pointers + 1 towards next DIFAT sector)
            nb_difat = (self.num_fat_sectors-109 + nb_difat_sectors-1)//nb_difat_sectors
            log.debug( "nb_difat = %d" % nb_difat )
            if self.num_difat_sectors != nb_difat:
                raise IOError('incorrect DIFAT')
            isect_difat = self.first_difat_sector
            for i in iterrange(nb_difat):
                log.debug( "DIFAT block %d, sector %X" % (i, isect_difat) )
                #TODO: check if corresponding FAT SID = DIFSECT
                sector_difat = self.getsect(isect_difat)
                difat = self.sect2array(sector_difat)
                # Display the sector contents only if the logging level is debug:
                if log.isEnabledFor(logging.DEBUG):
                    self.dumpsect(sector_difat)
                self.loadfat_sect(difat[:nb_difat_sectors])
                # last DIFAT pointer is next DIFAT sector:
                isect_difat = difat[nb_difat_sectors]
                log.debug( "next DIFAT sector: %X" % isect_difat )
            # checks:
            if isect_difat not in [ENDOFCHAIN, FREESECT]:
                # last DIFAT pointer value must be ENDOFCHAIN or FREESECT
                raise IOError('incorrect end of DIFAT')
##          if len(self.fat) != self.num_fat_sectors:
##              # FAT should contain num_fat_sectors blocks
##              print("FAT length: %d instead of %d" % (len(self.fat), self.num_fat_sectors))
##              raise IOError('incorrect DIFAT')
        else:
            log.debug('No DIFAT, because file size < 6.8MB.')
        # since FAT is read from fixed-size sectors, it may contain more values
        # than the actual number of sectors in the file.
        # Keep only the relevant sector indexes:
        if len(self.fat) > self.nb_sect:
            log.debug('len(fat)=%d, shrunk to nb_sect=%d' % (len(self.fat), self.nb_sect))
            self.fat = self.fat[:self.nb_sect]
        log.debug('FAT references %d sectors / Maximum %d sectors in file' % (len(self.fat), self.nb_sect))
        # Display the FAT contents only if the logging level is debug:
        if log.isEnabledFor(logging.DEBUG):
            log.debug('\nFAT:')
            self.dumpfat(self.fat)
| 1697 | + | |
| 1698 | + | |
    def loadminifat(self):
        """
        Load the MiniFAT table into self.minifat.

        Requires self.root to be set (i.e. loaddirectory() must have run).
        """
        # MiniFAT is stored in a standard  sub-stream, pointed to by a header
        # field.
        # NOTE: there are two sizes to take into account for this stream:
        # 1) Stream size is calculated according to the number of sectors
        # declared in the OLE header. This allocated stream may be more than
        # needed to store the actual sector indexes.
        # (self.num_mini_fat_sectors is the number of sectors of size self.sector_size)
        stream_size = self.num_mini_fat_sectors * self.sector_size
        # 2) Actually used size is calculated by dividing the MiniStream size
        # (given by root entry size) by the size of mini sectors, *4 for
        # 32 bits indexes:
        nb_minisectors = (self.root.size + self.mini_sector_size-1) // self.mini_sector_size
        used_size = nb_minisectors * 4
        log.debug('loadminifat(): minifatsect=%d, nb FAT sectors=%d, used_size=%d, stream_size=%d, nb MiniSectors=%d' %
            (self.minifatsect, self.num_mini_fat_sectors, used_size, stream_size, nb_minisectors))
        if used_size > stream_size:
            # This is not really a problem, but may indicate a wrong implementation:
            self._raise_defect(DEFECT_INCORRECT, 'OLE MiniStream is larger than MiniFAT')
        # In any case, first read stream_size:
        s = self._open(self.minifatsect, stream_size, force_FAT=True).read()
        #[PL] Old code replaced by an array:
        #self.minifat = [i32(s, i) for i in range(0, len(s), 4)]
        self.minifat = self.sect2array(s)
        # Then shrink the array to used size, to avoid indexes out of MiniStream:
        log.debug('MiniFAT shrunk from %d to %d sectors' % (len(self.minifat), nb_minisectors))
        self.minifat = self.minifat[:nb_minisectors]
        log.debug('loadminifat(): len=%d' % len(self.minifat))
        # Display the FAT contents only if the logging level is debug:
        if log.isEnabledFor(logging.DEBUG):
            log.debug('\nMiniFAT:')
            self.dumpfat(self.minifat)
| 1734 | + | |
| 1735 | + def getsect(self, sect): | |
| 1736 | + """ | |
| 1737 | + Read given sector from file on disk. | |
| 1738 | + | |
| 1739 | + :param sect: int, sector index | |
| 1740 | + :returns: a string containing the sector data. | |
| 1741 | + """ | |
| 1742 | + # From [MS-CFB]: A sector number can be converted into a byte offset | |
| 1743 | + # into the file by using the following formula: | |
| 1744 | + # (sector number + 1) x Sector Size. | |
| 1745 | + # This implies that sector #0 of the file begins at byte offset Sector | |
| 1746 | + # Size, not at 0. | |
| 1747 | + | |
| 1748 | + # [PL] the original code in PIL was wrong when sectors are 4KB instead of | |
| 1749 | + # 512 bytes: | |
| 1750 | + #self.fp.seek(512 + self.sectorsize * sect) | |
| 1751 | + #[PL]: added safety checks: | |
| 1752 | + #print("getsect(%X)" % sect) | |
| 1753 | + try: | |
| 1754 | + self.fp.seek(self.sectorsize * (sect+1)) | |
| 1755 | + except: | |
| 1756 | + log.debug('getsect(): sect=%X, seek=%d, filesize=%d' % | |
| 1757 | + (sect, self.sectorsize*(sect+1), self._filesize)) | |
| 1758 | + self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range') | |
| 1759 | + sector = self.fp.read(self.sectorsize) | |
| 1760 | + if len(sector) != self.sectorsize: | |
| 1761 | + log.debug('getsect(): sect=%X, read=%d, sectorsize=%d' % | |
| 1762 | + (sect, len(sector), self.sectorsize)) | |
| 1763 | + self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector') | |
| 1764 | + return sector | |
| 1765 | + | |
| 1766 | + | |
| 1767 | + def write_sect(self, sect, data, padding=b'\x00'): | |
| 1768 | + """ | |
| 1769 | + Write given sector to file on disk. | |
| 1770 | + | |
| 1771 | + :param sect: int, sector index | |
| 1772 | + :param data: bytes, sector data | |
| 1773 | + :param padding: single byte, padding character if data < sector size | |
| 1774 | + """ | |
| 1775 | + if not isinstance(data, bytes): | |
| 1776 | + raise TypeError("write_sect: data must be a bytes string") | |
| 1777 | + if not isinstance(padding, bytes) or len(padding)!=1: | |
| 1778 | + raise TypeError("write_sect: padding must be a bytes string of 1 char") | |
| 1779 | + #TODO: we could allow padding=None for no padding at all | |
| 1780 | + try: | |
| 1781 | + self.fp.seek(self.sectorsize * (sect+1)) | |
| 1782 | + except: | |
| 1783 | + log.debug('write_sect(): sect=%X, seek=%d, filesize=%d' % | |
| 1784 | + (sect, self.sectorsize*(sect+1), self._filesize)) | |
| 1785 | + self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range') | |
| 1786 | + if len(data) < self.sectorsize: | |
| 1787 | + # add padding | |
| 1788 | + data += padding * (self.sectorsize - len(data)) | |
| 1789 | + elif len(data) < self.sectorsize: | |
| 1790 | + raise ValueError("Data is larger than sector size") | |
| 1791 | + self.fp.write(data) | |
| 1792 | + | |
| 1793 | + | |
| 1794 | + def loaddirectory(self, sect): | |
| 1795 | + """ | |
| 1796 | + Load the directory. | |
| 1797 | + | |
| 1798 | + :param sect: sector index of directory stream. | |
| 1799 | + """ | |
| 1800 | + log.debug('Loading the Directory:') | |
| 1801 | + # The directory is stored in a standard | |
| 1802 | + # substream, independent of its size. | |
| 1803 | + | |
| 1804 | + # open directory stream as a read-only file: | |
| 1805 | + # (stream size is not known in advance) | |
| 1806 | + self.directory_fp = self._open(sect) | |
| 1807 | + | |
| 1808 | + #[PL] to detect malformed documents and avoid DoS attacks, the maximum | |
| 1809 | + # number of directory entries can be calculated: | |
| 1810 | + max_entries = self.directory_fp.size // 128 | |
| 1811 | + log.debug('loaddirectory: size=%d, max_entries=%d' % | |
| 1812 | + (self.directory_fp.size, max_entries)) | |
| 1813 | + | |
| 1814 | + # Create list of directory entries | |
| 1815 | + #self.direntries = [] | |
| 1816 | + # We start with a list of "None" object | |
| 1817 | + self.direntries = [None] * max_entries | |
| 1818 | +## for sid in iterrange(max_entries): | |
| 1819 | +## entry = fp.read(128) | |
| 1820 | +## if not entry: | |
| 1821 | +## break | |
| 1822 | +## self.direntries.append(OleDirectoryEntry(entry, sid, self)) | |
| 1823 | + # load root entry: | |
| 1824 | + root_entry = self._load_direntry(0) | |
| 1825 | + # Root entry is the first entry: | |
| 1826 | + self.root = self.direntries[0] | |
| 1827 | + # TODO: read ALL directory entries (ignore bad entries?) | |
| 1828 | + # TODO: adapt build_storage_tree to avoid duplicate reads | |
| 1829 | + # for i in range(1, max_entries): | |
| 1830 | + # self._load_direntry(i) | |
| 1831 | + # read and build all storage trees, starting from the root: | |
| 1832 | + self.root.build_storage_tree() | |
| 1833 | + | |
| 1834 | + | |
| 1835 | + def _load_direntry (self, sid): | |
| 1836 | + """ | |
| 1837 | + Load a directory entry from the directory. | |
| 1838 | + This method should only be called once for each storage/stream when | |
| 1839 | + loading the directory. | |
| 1840 | + | |
| 1841 | + :param sid: index of storage/stream in the directory. | |
| 1842 | + :returns: a OleDirectoryEntry object | |
| 1843 | + | |
| 1844 | + :exception IOError: if the entry has always been referenced. | |
| 1845 | + """ | |
| 1846 | + # check if SID is OK: | |
| 1847 | + if sid<0 or sid>=len(self.direntries): | |
| 1848 | + self._raise_defect(DEFECT_FATAL, "OLE directory index out of range") | |
| 1849 | + # check if entry was already referenced: | |
| 1850 | + if self.direntries[sid] is not None: | |
| 1851 | + self._raise_defect(DEFECT_INCORRECT, | |
| 1852 | + "double reference for OLE stream/storage") | |
| 1853 | + # if exception not raised, return the object | |
| 1854 | + return self.direntries[sid] | |
| 1855 | + self.directory_fp.seek(sid * 128) | |
| 1856 | + entry = self.directory_fp.read(128) | |
| 1857 | + self.direntries[sid] = OleDirectoryEntry(entry, sid, self) | |
| 1858 | + return self.direntries[sid] | |
| 1859 | + | |
| 1860 | + | |
| 1861 | + def dumpdirectory(self): | |
| 1862 | + """ | |
| 1863 | + Dump directory (for debugging only) | |
| 1864 | + """ | |
| 1865 | + self.root.dump() | |
| 1866 | + | |
| 1867 | + | |
| 1868 | + def _open(self, start, size = UNKNOWN_SIZE, force_FAT=False): | |
| 1869 | + """ | |
| 1870 | + Open a stream, either in FAT or MiniFAT according to its size. | |
| 1871 | + (openstream helper) | |
| 1872 | + | |
| 1873 | + :param start: index of first sector | |
| 1874 | + :param size: size of stream (or nothing if size is unknown) | |
| 1875 | + :param force_FAT: if False (default), stream will be opened in FAT or MiniFAT | |
| 1876 | + according to size. If True, it will always be opened in FAT. | |
| 1877 | + """ | |
| 1878 | + log.debug('OleFileIO.open(): sect=%Xh, size=%d, force_FAT=%s' % | |
| 1879 | + (start, size, str(force_FAT))) | |
| 1880 | + # stream size is compared to the mini_stream_cutoff_size threshold: | |
| 1881 | + if size < self.minisectorcutoff and not force_FAT: | |
| 1882 | + # ministream object | |
| 1883 | + if not self.ministream: | |
| 1884 | + # load MiniFAT if it wasn't already done: | |
| 1885 | + self.loadminifat() | |
| 1886 | + # The first sector index of the miniFAT stream is stored in the | |
| 1887 | + # root directory entry: | |
| 1888 | + size_ministream = self.root.size | |
| 1889 | + log.debug('Opening MiniStream: sect=%Xh, size=%d' % | |
| 1890 | + (self.root.isectStart, size_ministream)) | |
| 1891 | + self.ministream = self._open(self.root.isectStart, | |
| 1892 | + size_ministream, force_FAT=True) | |
| 1893 | + return OleStream(fp=self.ministream, sect=start, size=size, | |
| 1894 | + offset=0, sectorsize=self.minisectorsize, | |
| 1895 | + fat=self.minifat, filesize=self.ministream.size, | |
| 1896 | + olefileio=self) | |
| 1897 | + else: | |
| 1898 | + # standard stream | |
| 1899 | + return OleStream(fp=self.fp, sect=start, size=size, | |
| 1900 | + offset=self.sectorsize, | |
| 1901 | + sectorsize=self.sectorsize, fat=self.fat, | |
| 1902 | + filesize=self._filesize, | |
| 1903 | + olefileio=self) | |
| 1904 | + | |
| 1905 | + | |
| 1906 | + def _list(self, files, prefix, node, streams=True, storages=False): | |
| 1907 | + """ | |
| 1908 | + listdir helper | |
| 1909 | + | |
| 1910 | + :param files: list of files to fill in | |
| 1911 | + :param prefix: current location in storage tree (list of names) | |
| 1912 | + :param node: current node (OleDirectoryEntry object) | |
| 1913 | + :param streams: bool, include streams if True (True by default) - new in v0.26 | |
| 1914 | + :param storages: bool, include storages if True (False by default) - new in v0.26 | |
| 1915 | + (note: the root storage is never included) | |
| 1916 | + """ | |
| 1917 | + prefix = prefix + [node.name] | |
| 1918 | + for entry in node.kids: | |
| 1919 | + if entry.entry_type == STGTY_STORAGE: | |
| 1920 | + # this is a storage | |
| 1921 | + if storages: | |
| 1922 | + # add it to the list | |
| 1923 | + files.append(prefix[1:] + [entry.name]) | |
| 1924 | + # check its kids | |
| 1925 | + self._list(files, prefix, entry, streams, storages) | |
| 1926 | + elif entry.entry_type == STGTY_STREAM: | |
| 1927 | + # this is a stream | |
| 1928 | + if streams: | |
| 1929 | + # add it to the list | |
| 1930 | + files.append(prefix[1:] + [entry.name]) | |
| 1931 | + else: | |
| 1932 | + self._raise_defect(DEFECT_INCORRECT, 'The directory tree contains an entry which is not a stream nor a storage.') | |
| 1933 | + | |
| 1934 | + | |
| 1935 | + def listdir(self, streams=True, storages=False): | |
| 1936 | + """ | |
| 1937 | + Return a list of streams and/or storages stored in this file | |
| 1938 | + | |
| 1939 | + :param streams: bool, include streams if True (True by default) - new in v0.26 | |
| 1940 | + :param storages: bool, include storages if True (False by default) - new in v0.26 | |
| 1941 | + (note: the root storage is never included) | |
| 1942 | + :returns: list of stream and/or storage paths | |
| 1943 | + """ | |
| 1944 | + files = [] | |
| 1945 | + self._list(files, [], self.root, streams, storages) | |
| 1946 | + return files | |
| 1947 | + | |
| 1948 | + | |
| 1949 | + def _find(self, filename): | |
| 1950 | + """ | |
| 1951 | + Returns directory entry of given filename. (openstream helper) | |
| 1952 | + Note: this method is case-insensitive. | |
| 1953 | + | |
| 1954 | + :param filename: path of stream in storage tree (except root entry), either: | |
| 1955 | + | |
| 1956 | + - a string using Unix path syntax, for example: | |
| 1957 | + 'storage_1/storage_1.2/stream' | |
| 1958 | + - or a list of storage filenames, path to the desired stream/storage. | |
| 1959 | + Example: ['storage_1', 'storage_1.2', 'stream'] | |
| 1960 | + | |
| 1961 | + :returns: sid of requested filename | |
| 1962 | + :exception IOError: if file not found | |
| 1963 | + """ | |
| 1964 | + | |
| 1965 | + # if filename is a string instead of a list, split it on slashes to | |
| 1966 | + # convert to a list: | |
| 1967 | + if isinstance(filename, basestring): | |
| 1968 | + filename = filename.split('/') | |
| 1969 | + # walk across storage tree, following given path: | |
| 1970 | + node = self.root | |
| 1971 | + for name in filename: | |
| 1972 | + for kid in node.kids: | |
| 1973 | + if kid.name.lower() == name.lower(): | |
| 1974 | + break | |
| 1975 | + else: | |
| 1976 | + raise IOError("file not found") | |
| 1977 | + node = kid | |
| 1978 | + return node.sid | |
| 1979 | + | |
| 1980 | + | |
| 1981 | + def openstream(self, filename): | |
| 1982 | + """ | |
| 1983 | + Open a stream as a read-only file object (BytesIO). | |
| 1984 | + Note: filename is case-insensitive. | |
| 1985 | + | |
| 1986 | + :param filename: path of stream in storage tree (except root entry), either: | |
| 1987 | + | |
| 1988 | + - a string using Unix path syntax, for example: | |
| 1989 | + 'storage_1/storage_1.2/stream' | |
| 1990 | + - or a list of storage filenames, path to the desired stream/storage. | |
| 1991 | + Example: ['storage_1', 'storage_1.2', 'stream'] | |
| 1992 | + | |
| 1993 | + :returns: file object (read-only) | |
| 1994 | + :exception IOError: if filename not found, or if this is not a stream. | |
| 1995 | + """ | |
| 1996 | + sid = self._find(filename) | |
| 1997 | + entry = self.direntries[sid] | |
| 1998 | + if entry.entry_type != STGTY_STREAM: | |
| 1999 | + raise IOError("this file is not a stream") | |
| 2000 | + return self._open(entry.isectStart, entry.size) | |
| 2001 | + | |
| 2002 | + | |
| 2003 | + def write_stream(self, stream_name, data): | |
| 2004 | + """ | |
| 2005 | + Write a stream to disk. For now, it is only possible to replace an | |
| 2006 | + existing stream by data of the same size. | |
| 2007 | + | |
| 2008 | + :param stream_name: path of stream in storage tree (except root entry), either: | |
| 2009 | + | |
| 2010 | + - a string using Unix path syntax, for example: | |
| 2011 | + 'storage_1/storage_1.2/stream' | |
| 2012 | + - or a list of storage filenames, path to the desired stream/storage. | |
| 2013 | + Example: ['storage_1', 'storage_1.2', 'stream'] | |
| 2014 | + | |
| 2015 | + :param data: bytes, data to be written, must be the same size as the original | |
| 2016 | + stream. | |
| 2017 | + """ | |
| 2018 | + if not isinstance(data, bytes): | |
| 2019 | + raise TypeError("write_stream: data must be a bytes string") | |
| 2020 | + sid = self._find(stream_name) | |
| 2021 | + entry = self.direntries[sid] | |
| 2022 | + if entry.entry_type != STGTY_STREAM: | |
| 2023 | + raise IOError("this is not a stream") | |
| 2024 | + size = entry.size | |
| 2025 | + if size != len(data): | |
| 2026 | + raise ValueError("write_stream: data must be the same size as the existing stream") | |
| 2027 | + if size < self.minisectorcutoff: | |
| 2028 | + raise NotImplementedError("Writing a stream in MiniFAT is not implemented yet") | |
| 2029 | + sect = entry.isectStart | |
| 2030 | + # number of sectors to write | |
| 2031 | + nb_sectors = (size + (self.sectorsize-1)) // self.sectorsize | |
| 2032 | + log.debug('nb_sectors = %d' % nb_sectors) | |
| 2033 | + for i in range(nb_sectors): | |
| 2034 | +## try: | |
| 2035 | +## self.fp.seek(offset + self.sectorsize * sect) | |
| 2036 | +## except: | |
| 2037 | +## log.debug('sect=%d, seek=%d' % | |
| 2038 | +## (sect, offset+self.sectorsize*sect)) | |
| 2039 | +## raise IOError('OLE sector index out of range') | |
| 2040 | + # extract one sector from data, the last one being smaller: | |
| 2041 | + if i<(nb_sectors-1): | |
| 2042 | + data_sector = data [i*self.sectorsize : (i+1)*self.sectorsize] | |
| 2043 | + #TODO: comment this if it works | |
| 2044 | + assert(len(data_sector)==self.sectorsize) | |
| 2045 | + else: | |
| 2046 | + data_sector = data [i*self.sectorsize:] | |
| 2047 | + #TODO: comment this if it works | |
| 2048 | + log.debug('write_stream: size=%d sectorsize=%d data_sector=%Xh size%%sectorsize=%d' | |
| 2049 | + % (size, self.sectorsize, len(data_sector), size % self.sectorsize)) | |
| 2050 | + assert(len(data_sector) % self.sectorsize==size % self.sectorsize) | |
| 2051 | + self.write_sect(sect, data_sector) | |
| 2052 | +## self.fp.write(data_sector) | |
| 2053 | + # jump to next sector in the FAT: | |
| 2054 | + try: | |
| 2055 | + sect = self.fat[sect] | |
| 2056 | + except IndexError: | |
| 2057 | + # [PL] if pointer is out of the FAT an exception is raised | |
| 2058 | + raise IOError('incorrect OLE FAT, sector index out of range') | |
| 2059 | + #[PL] Last sector should be a "end of chain" marker: | |
| 2060 | + if sect != ENDOFCHAIN: | |
| 2061 | + raise IOError('incorrect last sector index in OLE stream') | |
| 2062 | + | |
| 2063 | + | |
| 2064 | + def get_type(self, filename): | |
| 2065 | + """ | |
| 2066 | + Test if given filename exists as a stream or a storage in the OLE | |
| 2067 | + container, and return its type. | |
| 2068 | + | |
| 2069 | + :param filename: path of stream in storage tree. (see openstream for syntax) | |
| 2070 | + :returns: False if object does not exist, its entry type (>0) otherwise: | |
| 2071 | + | |
| 2072 | + - STGTY_STREAM: a stream | |
| 2073 | + - STGTY_STORAGE: a storage | |
| 2074 | + - STGTY_ROOT: the root entry | |
| 2075 | + """ | |
| 2076 | + try: | |
| 2077 | + sid = self._find(filename) | |
| 2078 | + entry = self.direntries[sid] | |
| 2079 | + return entry.entry_type | |
| 2080 | + except: | |
| 2081 | + return False | |
| 2082 | + | |
| 2083 | + | |
| 2084 | + def getmtime(self, filename): | |
| 2085 | + """ | |
| 2086 | + Return modification time of a stream/storage. | |
| 2087 | + | |
| 2088 | + :param filename: path of stream/storage in storage tree. (see openstream for | |
| 2089 | + syntax) | |
| 2090 | + :returns: None if modification time is null, a python datetime object | |
| 2091 | + otherwise (UTC timezone) | |
| 2092 | + | |
| 2093 | + new in version 0.26 | |
| 2094 | + """ | |
| 2095 | + sid = self._find(filename) | |
| 2096 | + entry = self.direntries[sid] | |
| 2097 | + return entry.getmtime() | |
| 2098 | + | |
| 2099 | + | |
| 2100 | + def getctime(self, filename): | |
| 2101 | + """ | |
| 2102 | + Return creation time of a stream/storage. | |
| 2103 | + | |
| 2104 | + :param filename: path of stream/storage in storage tree. (see openstream for | |
| 2105 | + syntax) | |
| 2106 | + :returns: None if creation time is null, a python datetime object | |
| 2107 | + otherwise (UTC timezone) | |
| 2108 | + | |
| 2109 | + new in version 0.26 | |
| 2110 | + """ | |
| 2111 | + sid = self._find(filename) | |
| 2112 | + entry = self.direntries[sid] | |
| 2113 | + return entry.getctime() | |
| 2114 | + | |
| 2115 | + | |
| 2116 | + def exists(self, filename): | |
| 2117 | + """ | |
| 2118 | + Test if given filename exists as a stream or a storage in the OLE | |
| 2119 | + container. | |
| 2120 | + Note: filename is case-insensitive. | |
| 2121 | + | |
| 2122 | + :param filename: path of stream in storage tree. (see openstream for syntax) | |
| 2123 | + :returns: True if object exist, else False. | |
| 2124 | + """ | |
| 2125 | + try: | |
| 2126 | + sid = self._find(filename) | |
| 2127 | + return True | |
| 2128 | + except: | |
| 2129 | + return False | |
| 2130 | + | |
| 2131 | + | |
| 2132 | + def get_size(self, filename): | |
| 2133 | + """ | |
| 2134 | + Return size of a stream in the OLE container, in bytes. | |
| 2135 | + | |
| 2136 | + :param filename: path of stream in storage tree (see openstream for syntax) | |
| 2137 | + :returns: size in bytes (long integer) | |
| 2138 | + :exception IOError: if file not found | |
| 2139 | + :exception TypeError: if this is not a stream. | |
| 2140 | + """ | |
| 2141 | + sid = self._find(filename) | |
| 2142 | + entry = self.direntries[sid] | |
| 2143 | + if entry.entry_type != STGTY_STREAM: | |
| 2144 | + #TODO: Should it return zero instead of raising an exception ? | |
| 2145 | + raise TypeError('object is not an OLE stream') | |
| 2146 | + return entry.size | |
| 2147 | + | |
| 2148 | + | |
| 2149 | + def get_rootentry_name(self): | |
| 2150 | + """ | |
| 2151 | + Return root entry name. Should usually be 'Root Entry' or 'R' in most | |
| 2152 | + implementations. | |
| 2153 | + """ | |
| 2154 | + return self.root.name | |
| 2155 | + | |
| 2156 | + | |
| 2157 | + def getproperties(self, filename, convert_time=False, no_conversion=None): | |
| 2158 | + """ | |
| 2159 | + Return properties described in substream. | |
| 2160 | + | |
| 2161 | + :param filename: path of stream in storage tree (see openstream for syntax) | |
| 2162 | + :param convert_time: bool, if True timestamps will be converted to Python datetime | |
| 2163 | + :param no_conversion: None or list of int, timestamps not to be converted | |
| 2164 | + (for example total editing time is not a real timestamp) | |
| 2165 | + | |
| 2166 | + :returns: a dictionary of values indexed by id (integer) | |
| 2167 | + """ | |
| 2168 | + #REFERENCE: [MS-OLEPS] https://msdn.microsoft.com/en-us/library/dd942421.aspx | |
| 2169 | + # make sure no_conversion is a list, just to simplify code below: | |
| 2170 | + if no_conversion == None: | |
| 2171 | + no_conversion = [] | |
| 2172 | + # stream path as a string to report exceptions: | |
| 2173 | + streampath = filename | |
| 2174 | + if not isinstance(streampath, str): | |
| 2175 | + streampath = '/'.join(streampath) | |
| 2176 | + | |
| 2177 | + fp = self.openstream(filename) | |
| 2178 | + | |
| 2179 | + data = {} | |
| 2180 | + | |
| 2181 | + try: | |
| 2182 | + # header | |
| 2183 | + s = fp.read(28) | |
| 2184 | + clsid = _clsid(s[8:24]) | |
| 2185 | + | |
| 2186 | + # format id | |
| 2187 | + s = fp.read(20) | |
| 2188 | + fmtid = _clsid(s[:16]) | |
| 2189 | + fp.seek(i32(s, 16)) | |
| 2190 | + | |
| 2191 | + # get section | |
| 2192 | + s = b"****" + fp.read(i32(fp.read(4))-4) | |
| 2193 | + # number of properties: | |
| 2194 | + num_props = i32(s, 4) | |
| 2195 | + except BaseException as exc: | |
| 2196 | + # catch exception while parsing property header, and only raise | |
| 2197 | + # a DEFECT_INCORRECT then return an empty dict, because this is not | |
| 2198 | + # a fatal error when parsing the whole file | |
| 2199 | + msg = 'Error while parsing properties header in stream %s: %s' % ( | |
| 2200 | + repr(streampath), exc) | |
| 2201 | + self._raise_defect(DEFECT_INCORRECT, msg, type(exc)) | |
| 2202 | + return data | |
| 2203 | + | |
| 2204 | + for i in range(num_props): | |
| 2205 | + property_id = 0 # just in case of an exception | |
| 2206 | + try: | |
| 2207 | + property_id = i32(s, 8+i*8) | |
| 2208 | + offset = i32(s, 12+i*8) | |
| 2209 | + property_type = i32(s, offset) | |
| 2210 | + | |
| 2211 | + log.debug('property id=%d: type=%d offset=%X' % (property_id, property_type, offset)) | |
| 2212 | + | |
| 2213 | + # test for common types first (should perhaps use | |
| 2214 | + # a dictionary instead?) | |
| 2215 | + | |
| 2216 | + if property_type == VT_I2: # 16-bit signed integer | |
| 2217 | + value = i16(s, offset+4) | |
| 2218 | + if value >= 32768: | |
| 2219 | + value = value - 65536 | |
| 2220 | + elif property_type == VT_UI2: # 2-byte unsigned integer | |
| 2221 | + value = i16(s, offset+4) | |
| 2222 | + elif property_type in (VT_I4, VT_INT, VT_ERROR): | |
| 2223 | + # VT_I4: 32-bit signed integer | |
| 2224 | + # VT_ERROR: HRESULT, similar to 32-bit signed integer, | |
| 2225 | + # see http://msdn.microsoft.com/en-us/library/cc230330.aspx | |
| 2226 | + value = i32(s, offset+4) | |
| 2227 | + elif property_type in (VT_UI4, VT_UINT): # 4-byte unsigned integer | |
| 2228 | + value = i32(s, offset+4) # FIXME | |
| 2229 | + elif property_type in (VT_BSTR, VT_LPSTR): | |
| 2230 | + # CodePageString, see http://msdn.microsoft.com/en-us/library/dd942354.aspx | |
| 2231 | + # size is a 32 bits integer, including the null terminator, and | |
| 2232 | + # possibly trailing or embedded null chars | |
| 2233 | + #TODO: if codepage is unicode, the string should be converted as such | |
| 2234 | + count = i32(s, offset+4) | |
| 2235 | + value = s[offset+8:offset+8+count-1] | |
| 2236 | + # remove all null chars: | |
| 2237 | + value = value.replace(b'\x00', b'') | |
| 2238 | + elif property_type == VT_BLOB: | |
| 2239 | + # binary large object (BLOB) | |
| 2240 | + # see http://msdn.microsoft.com/en-us/library/dd942282.aspx | |
| 2241 | + count = i32(s, offset+4) | |
| 2242 | + value = s[offset+8:offset+8+count] | |
| 2243 | + elif property_type == VT_LPWSTR: | |
| 2244 | + # UnicodeString | |
| 2245 | + # see http://msdn.microsoft.com/en-us/library/dd942313.aspx | |
| 2246 | + # "the string should NOT contain embedded or additional trailing | |
| 2247 | + # null characters." | |
| 2248 | + count = i32(s, offset+4) | |
| 2249 | + value = self._decode_utf16_str(s[offset+8:offset+8+count*2]) | |
| 2250 | + elif property_type == VT_FILETIME: | |
| 2251 | + value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32) | |
| 2252 | + # FILETIME is a 64-bit int: "number of 100ns periods | |
| 2253 | + # since Jan 1,1601". | |
| 2254 | + if convert_time and property_id not in no_conversion: | |
| 2255 | + log.debug('Converting property #%d to python datetime, value=%d=%fs' | |
| 2256 | + %(property_id, value, float(value)/10000000)) | |
| 2257 | + # convert FILETIME to Python datetime.datetime | |
| 2258 | + # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/ | |
| 2259 | + _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0) | |
| 2260 | + log.debug('timedelta days=%d' % (value//(10*1000000*3600*24))) | |
| 2261 | + value = _FILETIME_null_date + datetime.timedelta(microseconds=value//10) | |
| 2262 | + else: | |
| 2263 | + # legacy code kept for backward compatibility: returns a | |
| 2264 | + # number of seconds since Jan 1,1601 | |
| 2265 | + value = value // 10000000 # seconds | |
| 2266 | + elif property_type == VT_UI1: # 1-byte unsigned integer | |
| 2267 | + value = i8(s[offset+4]) | |
| 2268 | + elif property_type == VT_CLSID: | |
| 2269 | + value = _clsid(s[offset+4:offset+20]) | |
| 2270 | + elif property_type == VT_CF: | |
| 2271 | + # PropertyIdentifier or ClipboardData?? | |
| 2272 | + # see http://msdn.microsoft.com/en-us/library/dd941945.aspx | |
| 2273 | + count = i32(s, offset+4) | |
| 2274 | + value = s[offset+8:offset+8+count] | |
| 2275 | + elif property_type == VT_BOOL: | |
| 2276 | + # VARIANT_BOOL, 16 bits bool, 0x0000=Fals, 0xFFFF=True | |
| 2277 | + # see http://msdn.microsoft.com/en-us/library/cc237864.aspx | |
| 2278 | + value = bool(i16(s, offset+4)) | |
| 2279 | + else: | |
| 2280 | + value = None # everything else yields "None" | |
| 2281 | + log.debug('property id=%d: type=%d not implemented in parser yet' % (property_id, property_type)) | |
| 2282 | + | |
| 2283 | + # missing: VT_EMPTY, VT_NULL, VT_R4, VT_R8, VT_CY, VT_DATE, | |
| 2284 | + # VT_DECIMAL, VT_I1, VT_I8, VT_UI8, | |
| 2285 | + # see http://msdn.microsoft.com/en-us/library/dd942033.aspx | |
| 2286 | + | |
| 2287 | + # FIXME: add support for VT_VECTOR | |
| 2288 | + # VT_VECTOR is a 32 uint giving the number of items, followed by | |
| 2289 | + # the items in sequence. The VT_VECTOR value is combined with the | |
| 2290 | + # type of items, e.g. VT_VECTOR|VT_BSTR | |
| 2291 | + # see http://msdn.microsoft.com/en-us/library/dd942011.aspx | |
| 2292 | + | |
| 2293 | + #print("%08x" % property_id, repr(value), end=" ") | |
| 2294 | + #print("(%s)" % VT[i32(s, offset) & 0xFFF]) | |
| 2295 | + | |
| 2296 | + data[property_id] = value | |
| 2297 | + except BaseException as exc: | |
| 2298 | + # catch exception while parsing each property, and only raise | |
| 2299 | + # a DEFECT_INCORRECT, because parsing can go on | |
| 2300 | + msg = 'Error while parsing property id %d in stream %s: %s' % ( | |
| 2301 | + property_id, repr(streampath), exc) | |
| 2302 | + self._raise_defect(DEFECT_INCORRECT, msg, type(exc)) | |
| 2303 | + | |
| 2304 | + return data | |
| 2305 | + | |
| 2306 | + def get_metadata(self): | |
| 2307 | + """ | |
| 2308 | + Parse standard properties streams, return an OleMetadata object | |
| 2309 | + containing all the available metadata. | |
| 2310 | + (also stored in the metadata attribute of the OleFileIO object) | |
| 2311 | + | |
| 2312 | + new in version 0.25 | |
| 2313 | + """ | |
| 2314 | + self.metadata = OleMetadata() | |
| 2315 | + self.metadata.parse_properties(self) | |
| 2316 | + return self.metadata | |
| 2317 | + | |
| 2318 | +# | |
| 2319 | +# -------------------------------------------------------------------- | |
| 2320 | +# This script can be used to dump the directory of any OLE2 structured | |
| 2321 | +# storage file. | |
| 2322 | + | |
if __name__ == "__main__":
    # Command-line entry point: dump the directory, properties, timestamps
    # and metadata of each OLE2 structured storage file given as argument.

    import sys, optparse

    # mapping of --loglevel option values to logging module constants:
    DEFAULT_LOG_LEVEL = "warning" # Default log level
    LOG_LEVELS = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL
    }

    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("-c", action="store_true", dest="check_streams",
        help='check all streams (for debugging purposes)')
    parser.add_option("-d", action="store_true", dest="debug_mode",
        help='debug mode, shortcut for -l debug (displays a lot of debug information, for developers only)')
    parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
        help="logging level debug/info/warning/error/critical (default=%default)")

    (options, args) = parser.parse_args()

    print('olefile version %s %s - http://www.decalage.info/en/olefile\n' % (__version__, __date__))

    # Print help if no arguments are passed
    if len(args) == 0:
        print(__doc__)
        parser.print_help()
        sys.exit()

    # -d overrides -l:
    if options.debug_mode:
        options.loglevel = 'debug'

    # setup logging to the console
    logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s')

    # also enable the module's logger:
    enable_logging()

    for filename in args:
        try:
            ole = OleFileIO(filename)#, raise_defects=DEFECT_INCORRECT)
            print("-" * 68)
            print(filename)
            print("-" * 68)
            ole.dumpdirectory()
            # streams whose name starts with \005 are property set streams:
            for streamname in ole.listdir():
                if streamname[-1][0] == "\005":
                    print("%r: properties" % streamname)
                    try:
                        props = ole.getproperties(streamname, convert_time=True)
                        props = sorted(props.items())
                        for k, v in props:
                            #[PL]: avoid to display too large or binary values:
                            if isinstance(v, (basestring, bytes)):
                                if len(v) > 50:
                                    v = v[:50]
                            if isinstance(v, bytes):
                                # quick and dirty binary check:
                                for c in (1,2,3,4,5,6,7,11,12,14,15,16,17,18,19,20,
                                    21,22,23,24,25,26,27,28,29,30,31):
                                    if c in bytearray(v):
                                        v = '(binary data)'
                                        break
                            print(" ", k, v)
                    except:
                        log.exception('Error while parsing property stream %r' % streamname)

            if options.check_streams:
                # Read all streams to check if there are errors:
                print('\nChecking streams...')
                for streamname in ole.listdir():
                    # print name using repr() to convert binary chars to \xNN:
                    print('-', repr('/'.join(streamname)),'-', end=' ')
                    st_type = ole.get_type(streamname)
                    if st_type == STGTY_STREAM:
                        print('size %d' % ole.get_size(streamname))
                        # just try to read stream in memory:
                        ole.openstream(streamname)
                    else:
                        print('NOT a stream : type=%d' % st_type)
                print()

##            for streamname in ole.listdir():
##                # print name using repr() to convert binary chars to \xNN:
##                print('-', repr('/'.join(streamname)),'-', end=' ')
##                print(ole.getmtime(streamname))
##            print()

            print('Modification/Creation times of all directory entries:')
            for entry in ole.direntries:
                if entry is not None:
                    print('- %s: mtime=%s ctime=%s' % (entry.name,
                        entry.getmtime(), entry.getctime()))
            print()

            # parse and display metadata:
            try:
                meta = ole.get_metadata()
                meta.dump()
            except:
                log.exception('Error while parsing metadata')
            print()
            #[PL] Test a few new methods:
            root = ole.get_rootentry_name()
            print('Root entry name: "%s"' % root)
            if ole.exists('worddocument'):
                print("This is a Word document.")
                print("type of stream 'WordDocument':", ole.get_type('worddocument'))
                print("size :", ole.get_size('worddocument'))
            if ole.exists('macros/vba'):
                print("This document may contain VBA macros.")

            # print parsing issues:
            print('\nNon-fatal issues raised during parsing:')
            if ole.parsing_issues:
                for exctype, msg in ole.parsing_issues:
                    print('- %s: %s' % (exctype.__name__, msg))
            else:
                print('None')
        except:
            log.exception('Error while parsing file %r' % filename)

# this code was developed while listening to The Wedding Present "Sea Monsters"
oletools/thirdparty/xglob/xglob.py
| ... | ... | @@ -52,14 +52,25 @@ For more info and updates: http://www.decalage.info/xglob |
| 52 | 52 | # 2015-01-03 v0.04 PL: - fixed issues in iter_files + yield container name |
| 53 | 53 | # 2016-02-24 v0.05 PL: - do not stop on exceptions, return them as data |
| 54 | 54 | # - fixed issue when using wildcards with empty path |
| 55 | +# 2016-04-28 v0.06 CH: - improved handling of non-existing files | |
| 56 | +# (by Christian Herdtweck) | |
| 55 | 57 | |
| 56 | -__version__ = '0.05' | |
| 58 | +__version__ = '0.06' | |
| 57 | 59 | |
| 58 | 60 | |
| 59 | 61 | #=== IMPORTS ================================================================= |
| 60 | 62 | |
| 61 | 63 | import os, fnmatch, glob, zipfile |
| 62 | 64 | |
| 65 | +#=== EXCEPTIONS ============================================================== | |
| 66 | + | |
| 67 | +class PathNotFoundException(Exception): | |
| 68 | + """ raised if given a fixed file/dir (not a glob) that does not exist """ | |
| 69 | + def __init__(self, path): | |
| 70 | + super(PathNotFoundException, self).__init__( | |
| 71 | + 'Given path does not exist: %r' % path) | |
| 72 | + | |
| 73 | + | |
| 63 | 74 | #=== FUNCTIONS =============================================================== |
| 64 | 75 | |
| 65 | 76 | # recursive glob function to find files in any subfolder: |
| ... | ... | @@ -118,8 +129,11 @@ def iter_files(files, recursive=False, zip_password=None, zip_fname='*'): |
| 118 | 129 | - then files matching zip_fname are opened from the zip archive |
| 119 | 130 | |
| 120 | 131 | Iterator: yields (container, filename, data) for each file. If zip_password is None, then |
| 121 | - only the filename is returned, container and data=None. Otherwise container si the | |
| 122 | - filename of the container (zip file), and data is the file content. | |
| 132 | + only the filename is returned, container and data=None. Otherwise container is the | |
| 133 | + filename of the container (zip file), and data is the file content (or an exception). | |
| 134 | + If a given filename is not a glob and does not exist, the triplet | |
| 135 | + (None, filename, PathNotFoundException instance) is yielded. (Globs matching nothing | |
| 136 | + do not trigger exceptions) | |
| 123 | 137 | """ |
| 124 | 138 | #TODO: catch exceptions and yield them for the caller (no file found, file is not zip, wrong password, etc) |
| 125 | 139 | #TODO: use logging instead of printing |
| ... | ... | @@ -131,6 +145,9 @@ def iter_files(files, recursive=False, zip_password=None, zip_fname='*'): |
| 131 | 145 | else: |
| 132 | 146 | iglob = glob.iglob |
| 133 | 147 | for filespec in files: |
| 148 | + if not is_glob(filespec) and not os.path.exists(filespec): | |
| 149 | + yield None, filespec, PathNotFoundException(filespec) | |
| 150 | + continue | |
| 134 | 151 | for filename in iglob(filespec): |
| 135 | 152 | if zip_password is not None: |
| 136 | 153 | # Each file is expected to be a zip archive: |
| ... | ... | @@ -153,3 +170,39 @@ def iter_files(files, recursive=False, zip_password=None, zip_fname='*'): |
| 153 | 170 | #data = open(filename, 'rb').read() |
| 154 | 171 | #yield None, filename, data |
| 155 | 172 | |
| 173 | + | |
| 174 | +def is_glob(filespec): | |
| 175 | + """ determine if given file specification is a single file name or a glob | |
| 176 | + | |
| 177 | + Python's glob and fnmatch can only interpret ?, *, [list], and [ra-nge], | |
| 178 | + (and combinations: hex_*_[A-Fabcdef0-9]). | |
| 179 | + The special chars *?[-] can only be escaped using [] | |
| 180 | + --> file_name is not a glob | |
| 181 | + --> file?name is a glob | |
| 182 | + --> file* is a glob | |
| 183 | + --> file[-._]name is a glob | |
| 184 | + --> file[?]name is not a glob (matches literal "file?name") | |
| 185 | + --> file[*]name is not a glob (matches literal "file*name") | |
| 186 | + --> file[-]name is not a glob (matches literal "file-name") | |
| 187 | + --> file-name is not a glob | |
| 188 | + | |
| 189 | + Also, obviously incorrect globs are treated as non-globs | |
| 190 | + --> file[name is not a glob (matches literal "file[name") | |
| 191 | + --> file]-[name is treated as a glob | |
| 192 | + (it is not a valid glob but detecting errors like this requires | |
| 193 | + sophisticated regular expression matching) | |
| 194 | + | |
| 195 | + Python's glob also works with globs in directory-part of path | |
| 196 | + --> dir-part of path is analyzed just like filename-part | |
| 197 | + --> thirdparty/*/xglob.py is a (valid) glob | |
| 198 | + | |
| 199 | + TODO: create a correct regexp to test for validity of ranges | |
| 200 | + """ | |
| 201 | + | |
| 202 | + # remove escaped special chars | |
| 203 | + cleaned = filespec.replace('[*]', '').replace('[?]', '') \ | |
| 204 | + .replace('[[]', '').replace('[]]', '').replace('[-]', '') | |
| 205 | + | |
| 206 | + # check if special chars remain | |
| 207 | + return '*' in cleaned or '?' in cleaned or \ | |
| 208 | + ('[' in cleaned and ']' in cleaned) | ... | ... |