Commit dad20c2ce0134e0f9e9f3cecbbc82ace58c8f18a

Authored by Christian Herdtweck
1 parent 2b3f8d3e

oleobj: change data parsing to change index rather than data

This is more efficient (the remaining data is no longer copied after every read) and simplifies generalization to using byte streams
instead of byte arrays as data input.
Showing 1 changed file with 61 additions and 43 deletions
oletools/oleobj.py
@@ -162,51 +162,64 @@ assert struct_uint16.size == 2 # make sure it matches 2 bytes @@ -162,51 +162,64 @@ assert struct_uint16.size == 2 # make sure it matches 2 bytes
162 162
163 # === FUNCTIONS ============================================================== 163 # === FUNCTIONS ==============================================================
164 164
165 -def read_uint32(data): 165 +def read_uint32(data, index):
166 """ 166 """
167 Read an unsigned integer from the first 32 bits of data. 167 Read an unsigned integer from the first 32 bits of data.
168 168
169 :param data: bytes string containing the data to be extracted. 169 :param data: bytes string containing the data to be extracted.
170 - :return: tuple (value, new_data) containing the read value (int),  
171 - and the new data without the bytes read. 170 + :param index: index to start reading from.
  171 + :return: tuple (value, index) containing the read value (int),
  172 + and the index to continue reading next time.
172 """ 173 """
173 - value = struct_uint32.unpack(data[0:4])[0]  
174 - new_data = data[4:]  
175 - return (value, new_data) 174 + value = struct_uint32.unpack(data[index:index+4])[0]
  175 + return (value, index+4)
176 176
177 177
178 -def read_uint16(data): 178 +def read_uint16(data, index):
179 """ 179 """
180 - Read an unsigned integer from the first 16 bits of data. 180 + Read an unsigned integer from the 16 bits of data following index.
181 181
182 :param data: bytes string containing the data to be extracted. 182 :param data: bytes string containing the data to be extracted.
183 - :return: tuple (value, new_data) containing the read value (int),  
184 - and the new data without the bytes read. 183 + :param index: index to start reading from.
  184 + :return: tuple (value, index) containing the read value (int),
  185 + and the index to continue reading next time.
185 """ 186 """
186 - value = struct_uint16.unpack(data[0:2])[0]  
187 - new_data = data[2:]  
188 - return (value, new_data) 187 + value = struct_uint16.unpack(data[index:index+2])[0]
  188 + return (value, index+2)
189 189
190 190
191 -def read_LengthPrefixedAnsiString(data): 191 +def read_LengthPrefixedAnsiString(data, index):
192 """ 192 """
193 Read a length-prefixed ANSI string from data. 193 Read a length-prefixed ANSI string from data.
194 194
195 :param data: bytes string containing the data to be extracted. 195 :param data: bytes string containing the data to be extracted.
196 - :return: tuple (value, new_data) containing the read value (bytes string),  
197 - and the new data without the bytes read. 196 + :param index: index in data where string size starts
  197 + :return: tuple (value, index) containing the read value (bytes string),
  198 + and the index to start reading from next time.
198 """ 199 """
199 - length, data = read_uint32(data) 200 + length, index = read_uint32(data, index)
200 # if length = 0, return a null string (no null character) 201 # if length = 0, return a null string (no null character)
201 if length == 0: 202 if length == 0:
202 - return ('', data) 203 + return ('', index)
203 # extract the string without the last null character 204 # extract the string without the last null character
204 - ansi_string = data[:length-1] 205 + ansi_string = data[index:index+length-1]
205 # TODO: only in strict mode: 206 # TODO: only in strict mode:
206 # check the presence of the null char: 207 # check the presence of the null char:
207 - assert data[length] == NULL_CHAR  
208 - new_data = data[length:]  
209 - return (ansi_string, new_data) 208 + assert data[index+length] == NULL_CHAR
  209 + return (ansi_string, index+length)
  210 +
  211 +
  212 +def read_zero_terminated_ansi_string(data, index):
  213 + """
  214 + Read a zero-terminated ANSI string from data
  215 +
  216 + :param data: bytes string containing an ansi string
  217 + :param index: index at which the string should start
  218 + :return: tuple (string, index) containing the read string (bytes string),
  219 + and the index to start reading from next time.
  220 + """
  221 + end_idx = data.find(b'\x00', index)
  222 + return data[index:end_idx], end_idx+1 # return index after the 0-byte
210 223
211 224
212 # === CLASSES ================================================================ 225 # === CLASSES ================================================================
@@ -254,25 +267,30 @@ class OleNativeStream (object): @@ -254,25 +267,30 @@ class OleNativeStream (object):
254 # TODO: strict mode to raise exceptions when values are incorrect 267 # TODO: strict mode to raise exceptions when values are incorrect
255 # (permissive mode by default) 268 # (permissive mode by default)
256 # An OLE Package object does not have the native data size field 269 # An OLE Package object does not have the native data size field
  270 + index = 0
257 if not self.package: 271 if not self.package:
258 - self.native_data_size = struct.unpack('<L', data[0:4])[0]  
259 - data = data[4:] 272 + self.native_data_size, index = read_uint32(data, index)
260 log.debug('OLE native data size = {0:08X} ({0} bytes)'.format(self.native_data_size)) 273 log.debug('OLE native data size = {0:08X} ({0} bytes)'.format(self.native_data_size))
261 # I thought this might be an OLE type specifier ??? 274 # I thought this might be an OLE type specifier ???
262 - self.unknown_short, data = read_uint16(data)  
263 - self.filename, data = data.split(b'\x00', 1) 275 + self.unknown_short, index = read_uint16(data, index)
  276 + self.filename, index = read_zero_terminated_ansi_string(data, index)
264 # source path 277 # source path
265 - self.src_path, data = data.split(b'\x00', 1) 278 + self.src_path, index = read_zero_terminated_ansi_string(data, index)
266 # TODO I bet these next 8 bytes are a timestamp => FILETIME from olefile 279 # TODO I bet these next 8 bytes are a timestamp => FILETIME from olefile
267 - self.unknown_long_1, data = read_uint32(data)  
268 - self.unknown_long_2, data = read_uint32(data) 280 + self.unknown_long_1, index = read_uint32(data, index)
  281 + self.unknown_long_2, index = read_uint32(data, index)
269 # temp path? 282 # temp path?
270 - self.temp_path, data = data.split(b'\x00', 1) 283 + self.temp_path, index = read_zero_terminated_ansi_string(data, index)
271 # size of the rest of the data 284 # size of the rest of the data
272 - self.actual_size, data = read_uint32(data)  
273 - self.data = data[0:self.actual_size]  
274 - # TODO: exception when size > remaining data  
275 - # TODO: SLACK DATA 285 + try:
  286 + self.actual_size, index = read_uint32(data, index)
  287 + self.data = data[index:index+self.actual_size]
  288 + # TODO: exception when size > remaining data
  289 + # TODO: SLACK DATA
  290 + except IOError: # data is not embedded but only linked to
  291 + logging.debug('data is not embedded but only a link')
  292 + self.actual_size = 0
  293 + self.data = None
276 294
277 295
278 class OleObject (object): 296 class OleObject (object):
@@ -316,24 +334,24 @@ class OleObject (object): @@ -316,24 +334,24 @@ class OleObject (object):
316 # print("Parsing OLE object data:") 334 # print("Parsing OLE object data:")
317 # print(hexdump3(data, length=16)) 335 # print(hexdump3(data, length=16))
318 # Header: see MS-OLEDS 2.2.4 ObjectHeader 336 # Header: see MS-OLEDS 2.2.4 ObjectHeader
319 - self.ole_version, data = read_uint32(data)  
320 - self.format_id, data = read_uint32(data) 337 + self.ole_version, index = read_uint32(data, index)
  338 + self.format_id, index = read_uint32(data, index)
321 log.debug('OLE version=%08X - Format ID=%08X' % (self.ole_version, self.format_id)) 339 log.debug('OLE version=%08X - Format ID=%08X' % (self.ole_version, self.format_id))
322 assert self.format_id in (self.TYPE_EMBEDDED, self.TYPE_LINKED) 340 assert self.format_id in (self.TYPE_EMBEDDED, self.TYPE_LINKED)
323 - self.class_name, data = read_LengthPrefixedAnsiString(data)  
324 - self.topic_name, data = read_LengthPrefixedAnsiString(data)  
325 - self.item_name, data = read_LengthPrefixedAnsiString(data) 341 + self.class_name, index = read_LengthPrefixedAnsiString(data, index)
  342 + self.topic_name, index = read_LengthPrefixedAnsiString(data, index)
  343 + self.item_name, index = read_LengthPrefixedAnsiString(data, index)
326 log.debug('Class name=%r - Topic name=%r - Item name=%r' 344 log.debug('Class name=%r - Topic name=%r - Item name=%r'
327 % (self.class_name, self.topic_name, self.item_name)) 345 % (self.class_name, self.topic_name, self.item_name))
328 if self.format_id == self.TYPE_EMBEDDED: 346 if self.format_id == self.TYPE_EMBEDDED:
329 # Embedded object: see MS-OLEDS 2.2.5 EmbeddedObject 347 # Embedded object: see MS-OLEDS 2.2.5 EmbeddedObject
330 #assert self.topic_name != '' and self.item_name != '' 348 #assert self.topic_name != '' and self.item_name != ''
331 - self.data_size, data = read_uint32(data)  
332 - log.debug('Declared data size=%d - remaining size=%d' % (self.data_size, len(data))) 349 + self.data_size, index = read_uint32(data, index)
  350 + log.debug('Declared data size=%d - remaining size=%d' % (self.data_size, len(data)-index))
333 # TODO: handle incorrect size to avoid exception 351 # TODO: handle incorrect size to avoid exception
334 - self.data = data[:self.data_size] 352 + self.data = data[index:index+self.data_size]
335 assert len(self.data) == self.data_size 353 assert len(self.data) == self.data_size
336 - self.extra_data = data[self.data_size:] 354 + self.extra_data = data[index+self.data_size:]
337 355
338 356
339 357