Commit 383ae415084903920c31ef787cb343b8572187e5
1 parent
43f6a95c
rtfobj: extract OLE 1.0 objects and files from OLE Package objects, improved CLI…
… options and logging. Added new module oleobj to parse OLE structures.
Showing
2 changed files
with
554 additions
and
15 deletions
oletools/oleobj.py
0 → 100644
| 1 | +#!/usr/bin/env python | ||
| 2 | +""" | ||
| 3 | +oleobj.py | ||
| 4 | + | ||
| 5 | +oleobj is a Python script and module to parse OLE objects and files stored | ||
| 6 | +into various file formats such as RTF or MS Office documents (e.g. Word, Excel). | ||
| 7 | + | ||
| 8 | +Author: Philippe Lagadec - http://www.decalage.info | ||
| 9 | +License: BSD, see source code or documentation | ||
| 10 | + | ||
| 11 | +oleobj is part of the python-oletools package: | ||
| 12 | +http://www.decalage.info/python/oletools | ||
| 13 | +""" | ||
| 14 | + | ||
| 15 | +# === LICENSE ================================================================== | ||
| 16 | + | ||
| 17 | +# oleobj is copyright (c) 2015 Philippe Lagadec (http://www.decalage.info) | ||
| 18 | +# All rights reserved. | ||
| 19 | +# | ||
| 20 | +# Redistribution and use in source and binary forms, with or without modification, | ||
| 21 | +# are permitted provided that the following conditions are met: | ||
| 22 | +# | ||
| 23 | +# * Redistributions of source code must retain the above copyright notice, this | ||
| 24 | +# list of conditions and the following disclaimer. | ||
| 25 | +# * Redistributions in binary form must reproduce the above copyright notice, | ||
| 26 | +# this list of conditions and the following disclaimer in the documentation | ||
| 27 | +# and/or other materials provided with the distribution. | ||
| 28 | +# | ||
| 29 | +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | ||
| 30 | +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||
| 31 | +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||
| 32 | +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | ||
| 33 | +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 34 | +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||
| 35 | +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | ||
| 36 | +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||
| 37 | +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
| 38 | +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 39 | + | ||
| 40 | + | ||
| 41 | +#------------------------------------------------------------------------------ | ||
| 42 | +# CHANGELOG: | ||
| 43 | +# 2015-12-05 v0.01 PL: - first version | ||
| 44 | + | ||
| 45 | +__version__ = '0.01' | ||
| 46 | + | ||
| 47 | +#------------------------------------------------------------------------------ | ||
| 48 | +# TODO: | ||
| 49 | +# + setup logging (common with other oletools) | ||
| 50 | + | ||
| 51 | + | ||
| 52 | +#------------------------------------------------------------------------------ | ||
| 53 | +# REFERENCES: | ||
| 54 | + | ||
| 55 | +# Reference for the storage of embedded OLE objects/files: | ||
| 56 | +# [MS-OLEDS]: Object Linking and Embedding (OLE) Data Structures | ||
| 57 | +# https://msdn.microsoft.com/en-us/library/dd942265.aspx | ||
| 58 | + | ||
| 59 | +# - officeparser: https://github.com/unixfreak0037/officeparser | ||
| 60 | +# TODO: oledump | ||
| 61 | + | ||
| 62 | + | ||
| 63 | +#--- IMPORTS ------------------------------------------------------------------ | ||
| 64 | + | ||
| 65 | +import logging, struct | ||
| 66 | + | ||
| 67 | + | ||
| 68 | +# === LOGGING ================================================================= | ||
| 69 | + | ||
class NullHandler(logging.Handler):
    """
    Logging handler that silently discards every record it receives.

    Attached by default so this module produces no output unless the
    importing application configures logging itself. Python 2.7 ships
    logging.NullHandler, but Python 2.6 does not, hence this local copy:
    see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library
    """
    def emit(self, record):
        # Deliberately drop the record: no formatting, no output.
        pass
| 79 | + | ||
def get_logger(name, level=logging.CRITICAL+1):
    """
    Return a logger for this module without touching the root logger,
    so that other modules' log output is not affected.

    If a logger with the same name is already registered it is reused
    (creating it again would attach a second handler and double every
    message); otherwise a new logger is created with a single NullHandler,
    leaving actual output configuration to the application. The default
    level CRITICAL+1 disables all output.
    """
    # Check registration BEFORE calling getLogger, since getLogger itself
    # registers the name in loggerDict.
    already_registered = name in logging.Logger.manager.loggerDict
    #NOTE: a less intrusive but more "hackish" alternative would be to call
    # getLogger and test whether its effective level differs from the default.
    logger = logging.getLogger(name)
    if not already_registered:
        # Fresh logger: attach only a NullHandler; real handlers are the
        # application's responsibility.
        logger.addHandler(NullHandler())
    logger.setLevel(level)
    return logger
| 105 | + | ||
# Module-level logger, silent by default; the CLI (or any application)
# re-enables it by configuring logging and lowering this logger's level.
log = get_logger('oleobj')


# === GLOBAL VARIABLES =======================================================

# Pre-compiled struct to parse a little-endian unsigned 32-bit integer:
struct_uint32 = struct.Struct('<L')
assert struct_uint32.size == 4  # make sure it matches 4 bytes

# Pre-compiled struct to parse a little-endian unsigned 16-bit integer:
struct_uint16 = struct.Struct('<H')
assert struct_uint16.size == 2  # make sure it matches 2 bytes
| 119 | + | ||
| 120 | + | ||
| 121 | +# === FUNCTIONS ============================================================== | ||
| 122 | + | ||
def read_uint32(data):
    """
    Parse the first 4 bytes of data as a little-endian unsigned 32-bit int.

    :param data: bytes string containing the data to be extracted.
    :return: tuple (value, new_data) with the parsed integer and the
        remainder of data after the 4 consumed bytes.
    """
    # '<L' = little-endian unsigned long (4 bytes), same layout as the
    # module-level struct_uint32.
    return (struct.unpack('<L', data[0:4])[0], data[4:])
| 134 | + | ||
| 135 | + | ||
def read_uint16(data):
    """
    Parse the first 2 bytes of data as a little-endian unsigned 16-bit int.

    :param data: bytes string containing the data to be extracted.
    :return: tuple (value, new_data) with the parsed integer and the
        remainder of data after the 2 consumed bytes.
    """
    # '<H' = little-endian unsigned short (2 bytes), same layout as the
    # module-level struct_uint16.
    return (struct.unpack('<H', data[0:2])[0], data[2:])
| 147 | + | ||
| 148 | + | ||
def read_LengthPrefixedAnsiString(data):
    """
    Read a length-prefixed ANSI string from data.
    (see MS-OLEDS 2.1.4 LengthPrefixedAnsiString: a 32-bit length, counting
    the terminating null character, followed by the string bytes)

    :param data: bytes string containing the data to be extracted.
    :return: tuple (value, new_data) containing the read string WITHOUT its
        terminating null character, and the remaining data after it.
    """
    # length prefix (inlined little-endian uint32 read):
    length = struct.unpack('<L', data[0:4])[0]
    data = data[4:]
    # if length = 0, return a null string (nothing stored, not even a null char)
    if length == 0:
        # data[:0] yields an empty string of data's own type (py2 str / py3 bytes)
        return (data[:0], data)
    # extract the string without the last (null) character
    ansi_string = data[:length-1]
    # TODO: only check this in strict mode
    # BUGFIX: the null terminator is the LAST of the `length` stored bytes,
    # i.e. at index length-1, not length. A one-byte slice is compared so
    # the check works for both py2 str and py3 bytes.
    assert data[length-1:length] == b'\x00'
    new_data = data[length:]
    return (ansi_string, new_data)
| 168 | + | ||
| 169 | + | ||
| 170 | +# === CLASSES ================================================================ | ||
| 171 | + | ||
class OleNativeStream (object):
    """
    OLE object contained into an OLENativeStream structure.
    (see MS-OLEDS 2.3.6 OLENativeStream)
    """
    # constants for the type attribute:
    # see MS-OLEDS 2.2.4 ObjectHeader
    TYPE_LINKED = 0x01
    TYPE_EMBEDDED = 0x02


    def __init__(self, bindata=None):
        """
        Constructor for OleNativeStream.
        If bindata is provided, it will be parsed using the parse() method.

        :param bindata: bytes, OLENativeStream structure containing an OLE object
        """
        # attributes filled in by parse():
        self.filename = None        # embedded file name (null-terminated in stream)
        self.src_path = None        # original source path of the embedded file
        self.unknown_short = None   # first 16-bit field; meaning not established here
        self.unknown_long_1 = None  # 32-bit field; meaning not established here
        self.unknown_long_2 = None  # 32-bit field; meaning not established here
        self.temp_path = None       # third null-terminated path found in the stream
        self.actual_size = None     # declared size of the embedded file data
        self.data = None            # the embedded file content itself
        if bindata is not None:
            self.parse(data=bindata)

    def parse(self, data):
        """
        Parse binary data containing an OLENativeStream structure,
        to extract the OLE object it contains.
        (see MS-OLEDS 2.3.6 OLENativeStream)

        :param data: bytes, OLENativeStream structure containing an OLE object
        :return: None; results are stored in the instance attributes.
        """
        # NOTE(review): the data.split('\x00', ...) calls below assume a
        # Python 2 str input; under Python 3 bytes they would raise
        # TypeError — confirm the targeted Python version.
        # TODO: strict mode to raise exceptions when values are incorrect
        # (permissive mode by default)
        # self.native_data_size = struct.unpack('<L', data[0:4])[0]
        # data = data[4:]
        # log.debug('OLE native data size = {0:08X} ({0} bytes)'.format(self.native_data_size))
        # I thought this might be an OLE type specifier ???
        self.unknown_short, data = read_uint16(data)
        # embedded file name, null-terminated:
        self.filename, data = data.split('\x00', 1)
        # source path
        self.src_path, data = data.split('\x00', 1)
        # TODO I bet these next 8 bytes are a timestamp => FILETIME from olefile
        self.unknown_long_1, data = read_uint32(data)
        self.unknown_long_2, data = read_uint32(data)
        # temp path?
        self.temp_path, data = data.split('\x00', 1)
        # size of the rest of the data
        self.actual_size, data = read_uint32(data)
        self.data = data[0:self.actual_size]
        # TODO: exception when size > remaining data
        # TODO: SLACK DATA
| 231 | + | ||
class OleObject (object):
    """
    OLE 1.0 Object

    see MS-OLEDS 2.2 OLE1.0 Format Structures
    """

    # constants for the format_id attribute:
    # see MS-OLEDS 2.2.4 ObjectHeader
    TYPE_LINKED = 0x01
    TYPE_EMBEDDED = 0x02


    def __init__(self, bindata=None):
        """
        Constructor for OleObject.
        If bindata is provided, it will be parsed using the parse() method.

        :param bindata: bytes, OLE 1.0 Object structure containing an OLE object
        """
        # header fields, see MS-OLEDS 2.2.4 ObjectHeader:
        self.ole_version = None
        self.format_id = None
        self.class_name = None
        self.topic_name = None
        self.item_name = None
        # embedded object payload and its declared size:
        self.data = None
        self.data_size = None
        # slack bytes found after the declared object data:
        self.extra_data = None
        # BUGFIX: actually parse bindata when provided, as the docstring
        # promises (consistent with OleNativeStream.__init__, which does):
        if bindata is not None:
            self.parse(data=bindata)

    def parse(self, data):
        """
        Parse binary data containing an OLE 1.0 Object structure,
        to extract the OLE object it contains.
        (see MS-OLEDS 2.2 OLE1.0 Format Structures)

        :param data: bytes, OLE 1.0 Object structure containing an OLE object
        :return: None; results are stored in the instance attributes.
        """
        # Header: see MS-OLEDS 2.2.4 ObjectHeader
        self.ole_version, data = read_uint32(data)
        self.format_id, data = read_uint32(data)
        log.debug('OLE version=%08X - Format ID=%08X' % (self.ole_version, self.format_id))
        # TODO: in strict mode, raise instead of assert (assert disappears with -O)
        assert self.format_id in (self.TYPE_EMBEDDED, self.TYPE_LINKED)
        self.class_name, data = read_LengthPrefixedAnsiString(data)
        self.topic_name, data = read_LengthPrefixedAnsiString(data)
        self.item_name, data = read_LengthPrefixedAnsiString(data)
        log.debug('Class name=%r - Topic name=%r - Item name=%r'
                  % (self.class_name, self.topic_name, self.item_name))
        if self.format_id == self.TYPE_EMBEDDED:
            # Embedded object: see MS-OLEDS 2.2.5 EmbeddedObject
            #assert self.topic_name != '' and self.item_name != ''
            self.data_size, data = read_uint32(data)
            log.debug('Declared data size=%d - remaining size=%d' % (self.data_size, len(data)))
            # TODO: handle incorrect size to avoid exception
            self.data = data[:self.data_size]
            assert len(self.data) == self.data_size
            # keep any slack bytes beyond the declared size:
            self.extra_data = data[self.data_size:]
oletools/rtfobj.py
| 1 | #!/usr/bin/env python | 1 | #!/usr/bin/env python |
| 2 | """ | 2 | """ |
| 3 | -rtfobj.py - Philippe Lagadec 2013-04-02 | 3 | +rtfobj.py |
| 4 | 4 | ||
| 5 | rtfobj is a Python module to extract embedded objects from RTF files, such as | 5 | rtfobj is a Python module to extract embedded objects from RTF files, such as |
| 6 | OLE objects. It can be used as a Python library or a command-line tool. | 6 | OLE objects. It can be used as a Python library or a command-line tool. |
| @@ -43,8 +43,11 @@ http://www.decalage.info/python/oletools | @@ -43,8 +43,11 @@ http://www.decalage.info/python/oletools | ||
| 43 | # CHANGELOG: | 43 | # CHANGELOG: |
| 44 | # 2012-11-09 v0.01 PL: - first version | 44 | # 2012-11-09 v0.01 PL: - first version |
| 45 | # 2013-04-02 v0.02 PL: - fixed bug in main | 45 | # 2013-04-02 v0.02 PL: - fixed bug in main |
| 46 | +# 2015-12-09 v0.03 PL: - configurable logging, CLI options | ||
| 47 | +# - extract OLE 1.0 objects | ||
| 48 | +# - extract files from OLE Package objects | ||
| 46 | 49 | ||
| 47 | -__version__ = '0.02' | 50 | +__version__ = '0.03' |
| 48 | 51 | ||
| 49 | #------------------------------------------------------------------------------ | 52 | #------------------------------------------------------------------------------ |
| 50 | # TODO: | 53 | # TODO: |
| @@ -52,9 +55,55 @@ __version__ = '0.02' | @@ -52,9 +55,55 @@ __version__ = '0.02' | ||
| 52 | # - allow semicolon within hex, as found in this sample: | 55 | # - allow semicolon within hex, as found in this sample: |
| 53 | # http://contagiodump.blogspot.nl/2011/10/sep-28-cve-2010-3333-manuscript-with.html | 56 | # http://contagiodump.blogspot.nl/2011/10/sep-28-cve-2010-3333-manuscript-with.html |
| 54 | 57 | ||
| 58 | + | ||
| 55 | #=== IMPORTS ================================================================= | 59 | #=== IMPORTS ================================================================= |
| 56 | 60 | ||
| 57 | -import re, sys, string, binascii | 61 | +import re, sys, string, binascii, logging, optparse |
| 62 | + | ||
| 63 | +from thirdparty.xglob import xglob | ||
| 64 | +from oleobj import OleObject, OleNativeStream | ||
| 65 | +import oleobj | ||
| 66 | + | ||
| 67 | +# === LOGGING ================================================================= | ||
| 68 | + | ||
class NullHandler(logging.Handler):
    """
    Do-nothing log handler: every record sent to it is dropped.

    This keeps the module quiet unless the hosting application sets up its
    own logging. logging.NullHandler exists in Python 2.7 but not 2.6:
    see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library
    """
    def emit(self, record):
        # No-op by design: discard the record without formatting it.
        pass
| 78 | + | ||
def get_logger(name, level=logging.CRITICAL+1):
    """
    Build (or fetch) a logger for this module without altering the root
    logger, so other modules' output stays untouched.

    A logger already registered under this name is reused rather than
    re-created, since attaching a second NullHandler would make every
    message appear twice. The default level (CRITICAL+1) suppresses all
    output until the application configures logging.
    """
    # Test registration first: getLogger itself would register the name.
    existing = name in logging.Logger.manager.loggerDict
    #NOTE: another less intrusive but more "hackish" option would be to call
    # getLogger, then test whether its effective level is still the default.
    logger = logging.getLogger(name)
    if not existing:
        # Brand-new logger: only a NullHandler is attached here; output
        # handlers are left to the application.
        logger.addHandler(NullHandler())
    logger.setLevel(level)
    return logger
| 104 | + | ||
# Module-level logger, silent by default; the CLI below re-enables it by
# configuring logging and resetting this logger's level.
log = get_logger('rtfobj')
| 58 | 107 | ||
| 59 | 108 | ||
#=== CONSTANTS=================================================================

# REGEX pattern to extract embedded OLE objects in hexadecimal format:
# alphanum digit: [0-9A-Fa-f]
# hex char = two alphanum digits: [0-9A-Fa-f]{2}
HEX_CHAR = r'[0-9A-Fa-f]{2}'
# several hex chars, at least 4: (?:[0-9A-Fa-f]{2}){4,}
# + word boundaries
HEX_CHARS_4orMORE = r'\b(?:' + HEX_CHAR + r'){4,}\b'
# at least 1 hex char:
HEX_CHARS_1orMORE = r'(?:' + HEX_CHAR + r')+'
# at least 1 hex char, followed by whitespace or CR/LF:
HEX_CHARS_1orMORE_WHITESPACES = r'(?:' + HEX_CHAR + r')+\s+'
# + word boundaries around hex block
# HEX_CHARS_1orMORE_WHITESPACES = r'\b(?:' + HEX_CHAR + r')+\b\s*'
# at least one block of hex and whitespace chars, followed by closing curly bracket:
# HEX_BLOCK_CURLY_BRACKET = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')+\}'
PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE

# at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s*
# PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}'
# improved pattern, allowing semicolons within hex:
#PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}'

# a dummy translation table for str.translate, which does not change anything:
# NOTE(review): string.maketrans is Python 2 only (removed in Python 3,
# where str.translate takes different arguments) — confirm target version.
TRANSTABLE_NOCHANGE = string.maketrans('', '')

# compiled regex matching one run of hex characters (possibly split across
# whitespace-separated chunks):
re_hexblock = re.compile(PATTERN)
re_decimal = re.compile(r'\d+')

re_delimiter = re.compile(r'[ \t\r\n\f\v]')

DELIMITER = r'[ \t\r\n\f\v]'
DELIMITERS_ZeroOrMore = r'[ \t\r\n\f\v]*'
ANTISLASH_BIN = r'\\bin'
# According to my tests, Word accepts up to 250 digits (leading zeroes)
DECIMAL_GROUP = r'(\d{1,250})'

# matches an RTF "\bin N" control word (optionally preceded by delimiters),
# capturing its decimal length argument:
re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + ANTISLASH_BIN
                                   + DECIMAL_GROUP + DELIMITER)
# matches a delimiter followed by another hex block (used to resume hex
# scanning right after a \bin block):
re_delim_hexblock = re.compile(DELIMITER + PATTERN)
| 152 | +#=== FUNCTIONS =============================================================== | ||
| 153 | + | ||
def rtf_iter_objects_old (filename, min_size=32):
    """
    Open a RTF file, extract each embedded object encoded in hexadecimal of
    size > min_size, and yield (index, orig_len, data) tuples: the index of
    the object in the RTF file, the length of its hex-encoded form, and its
    decoded binary content.
    This is an iterator. (Legacy implementation, superseded by
    rtf_iter_objects which also handles the \\bin control word.)
    """
    data = open(filename, 'rb').read()
    for m in re.finditer(PATTERN, data):
        found = m.group(0)
        # length of the hex-encoded match, before decoding:
        orig_len = len(found)
        # remove all whitespace and line feeds:
        #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
        # (closing curly brackets are also stripped here, unlike the new version)
        found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v}')
        found = binascii.unhexlify(found)
        #print repr(found)
        if len(found)>min_size:
            yield m.start(), orig_len, found
| 172 | + | ||
| 173 | +# TODO: backward-compatible API? | ||
| 174 | + | ||
| 175 | + | ||
def search_hex_block(data, pos=0, min_size=32, first=True):
    # NOTE(review): this helper appears unfinished (work in progress at this
    # commit): it computes `match` but never returns or yields it, so it
    # always returns None; `min_size` is unused. Kept as-is.
    if first:
        # Search 1st occurence of a hex block:
        match = re_hexblock.search(data, pos=pos)
    else:
        # Match next occurences of a hex block, from the current position only:
        match = re_hexblock.match(data, pos=pos)
def rtf_iter_objects (data, min_size=32):
    """
    Scan RTF file content (already read into `data`), extract each embedded
    object encoded in hexadecimal of size > min_size, and yield
    (start, length, objdata) tuples: the index of the object in the RTF
    data, the length of its encoded form in the RTF source, and its decoded
    binary content. Hex blocks interrupted by the RTF "\\bin N" control word
    (raw binary bytes, sometimes used for obfuscation) are stitched back
    together.
    This is an iterator.
    """
    # Search 1st occurence of a hex block:
    match = re_hexblock.search(data)
    if match is None:
        # no hex block found
        return
    while match is not None:
        found = match.group(0)
        # start index
        start = match.start()
        # current position
        current = match.end()
        if len(found) < min_size:
            # too small to be an OLE object: skip to the next hex block
            match = re_hexblock.search(data, pos=current)
            continue
        log.debug('Found hex block starting at %08X, end %08X' % (start, current))
        # remove all whitespace and line feeds:
        #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
        found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
        # object data extracted from the RTF file
        objdata = binascii.unhexlify(found)
        # Detect the "\bin" control word, which is sometimes used for obfuscation:
        bin_match = re_delims_bin_decimal.match(data, pos=current)
        while bin_match is not None:
            log.debug('Found \\bin block starting at %08X : %r'
                      % (bin_match.start(), bin_match.group(0)))
            # extract the decimal integer following '\bin'
            bin_len = int(bin_match.group(1))
            log.debug('\\bin block length = %d' % bin_len)
            if current+bin_len > len(data):
                log.error('\\bin block length is larger than the remaining data')
                # move the current index, ignore the \bin block
                current += len(bin_match.group(0))
                break
            # read that number of bytes:
            # NOTE(review): this reads from `current` (the start of the \bin
            # control word itself), not from bin_match.end() where the raw
            # binary bytes actually begin — looks off by len(bin_match.group(0));
            # confirm against test samples.
            objdata += data[current:current+bin_len]
            # TODO: handle exception
            current += len(bin_match.group(0)) + bin_len
            # TODO: check if current is out of range
            # TODO: is Word limiting the \bin length to a number of digits?
            log.debug('Current position = %08X' % current)
            # after the raw bytes, a further hex block may continue the object:
            match = re_delim_hexblock.match(data, pos=current)
            if match is not None:
                log.debug('Found next hex block starting at %08X, end %08X'
                          % (match.start(), match.end()))
                found = match.group(0)
                # remove all whitespace and line feeds:
                #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
                found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
                objdata += binascii.unhexlify(found)
                current = match.end()
            # keep looping while more \bin blocks follow:
            bin_match = re_delims_bin_decimal.match(data, pos=current)

        # print repr(found)
        if len(objdata)>min_size:
            yield start, current-start, objdata
        # Search next occurence of a hex block:
        match = re_hexblock.search(data, pos=current)
| 251 | + | ||
| 252 | +def process_file(container, filename, data): | ||
| 253 | + # TODO: option to extract objects to files (false by default) | ||
| 254 | + if data is None: | ||
| 255 | + data = open(filename, 'rb').read() | ||
| 256 | + print '-'*79 | ||
| 257 | + print 'File: %r - %d bytes' % (filename, len(data)) | ||
| 258 | + for index, orig_len, objdata in rtf_iter_objects(data): | ||
| 259 | + print 'found object size %d at index %08X - end %08X' % (len(objdata), index, index+orig_len) | ||
| 260 | + fname = '%s_object_%08X.raw' % (filename, index) | ||
| 261 | + print 'saving object to file %s' % fname | ||
| 262 | + open(fname, 'wb').write(objdata) | ||
| 263 | + # TODO: check if all hex data is extracted properly | ||
| 264 | + | ||
| 265 | + obj = OleObject() | ||
| 266 | + try: | ||
| 267 | + obj.parse(objdata) | ||
| 268 | + print 'extract file embedded in OLE object:' | ||
| 269 | + print 'format_id = %d' % obj.format_id | ||
| 270 | + print 'class name = %r' % obj.class_name | ||
| 271 | + print 'data size = %d' % obj.data_size | ||
| 272 | + # set a file extension according to the class name: | ||
| 273 | + class_name = obj.class_name.lower() | ||
| 274 | + if class_name.startswith('word'): | ||
| 275 | + ext = 'doc' | ||
| 276 | + elif class_name.startswith('package'): | ||
| 277 | + ext = 'package' | ||
| 278 | + else: | ||
| 279 | + ext = 'bin' | ||
| 280 | + fname = '%s_object_%08X.%s' % (filename, index, ext) | ||
| 281 | + print 'saving to file %s' % fname | ||
| 282 | + open(fname, 'wb').write(obj.data) | ||
| 283 | + if obj.class_name.lower() == 'package': | ||
| 284 | + print 'Parsing OLE Package' | ||
| 285 | + opkg = OleNativeStream(bindata=obj.data) | ||
| 286 | + print 'Filename = %r' % opkg.filename | ||
| 287 | + print 'Source path = %r' % opkg.src_path | ||
| 288 | + print 'Temp path = %r' % opkg.temp_path | ||
| 289 | + if opkg.filename: | ||
| 290 | + fname = '%s_%s' % (filename, opkg.filename) | ||
| 291 | + else: | ||
| 292 | + fname = '%s_object_%08X.noname' % (filename, index) | ||
| 293 | + print 'saving to file %s' % fname | ||
| 294 | + open(fname, 'wb').write(opkg.data) | ||
| 295 | + except: | ||
| 296 | + pass | ||
| 297 | + # log.exception('*** Not an OLE 1.0 Object') | ||
| 298 | + | ||
| 94 | 299 | ||
| 95 | 300 | ||
#=== MAIN =================================================================

if __name__ == '__main__':
    # print banner with version
    print ('rtfobj %s - http://decalage.info/python/oletools' % __version__)
    print ('THIS IS WORK IN PROGRESS - Check updates regularly!')
    print ('Please report any issue at https://bitbucket.org/decalage/oletools/issues')
    print ('')

    DEFAULT_LOG_LEVEL = "warning" # Default log level
    # map of CLI log-level names to logging module constants:
    LOG_LEVELS = {'debug': logging.DEBUG,
                  'info': logging.INFO,
                  'warning': logging.WARNING,
                  'error': logging.ERROR,
                  'critical': logging.CRITICAL
                  }

    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)
    # parser.add_option('-o', '--outfile', dest='outfile',
    #     help='output file')
    # parser.add_option('-c', '--csv', dest='csv',
    #     help='export results to a CSV file')
    parser.add_option("-r", action="store_true", dest="recursive",
                      help='find files recursively in subdirectories.')
    parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
                      help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)')
    parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
                      help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
    parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
                      help="logging level debug/info/warning/error/critical (default=%default)")

    (options, args) = parser.parse_args()

    # Print help if no arguments are passed
    if len(args) == 0:
        print __doc__
        parser.print_help()
        sys.exit()

    # setup logging to the console
    logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s')
    # enable logging in the modules (their loggers are silenced by default
    # with level CRITICAL+1; NOTSET defers to the root logger configured above):
    log.setLevel(logging.NOTSET)
    oleobj.log.setLevel(logging.NOTSET)


    # iterate over all input files (optionally recursing into directories
    # and zip archives, per the CLI options):
    for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
                                                      zip_password=options.zip_password, zip_fname=options.zip_fname):
        # ignore directory names stored in zip files:
        if container and filename.endswith('/'):
            continue
        process_file(container, filename, data)