Commit dad20c2ce0134e0f9e9f3cecbbc82ace58c8f18a

Authored by Christian Herdtweck
1 parent 2b3f8d3e

oleobj: change data parsing to change index rather than data

This is more efficient and simplifies generalization to using byte-streams
instead of byte arrays as data input.
Showing 1 changed file with 61 additions and 43 deletions
oletools/oleobj.py
... ... @@ -162,51 +162,64 @@ assert struct_uint16.size == 2 # make sure it matches 2 bytes
162 162  
163 163 # === FUNCTIONS ==============================================================
164 164  
def read_uint32(data, index):
    """
    Read an unsigned integer from the 32 bits of data following index.

    :param data: bytes string containing the data to be extracted.
    :param index: index to start reading from.
    :return: tuple (value, index) containing the read value (int),
             and the index to continue reading next time.
    """
    # Read in place at the given offset instead of first copying a 4-byte
    # slice out of data.
    # NOTE(review): assumes struct_uint32 is a struct.Struct instance (it is
    # only used through .unpack here) -- confirm at its definition.
    value = struct_uint32.unpack_from(data, index)[0]
    return (value, index+4)
176 176  
177 177  
def read_uint16(data, index):
    """
    Read an unsigned integer from the 16 bits of data following index.

    :param data: bytes string containing the data to be extracted.
    :param index: index to start reading from.
    :return: tuple (value, index) containing the read value (int),
             and the index to continue reading next time.
    """
    # Read in place at the given offset instead of first copying a 2-byte
    # slice out of data.
    # NOTE(review): assumes struct_uint16 is a struct.Struct instance (the
    # file checks struct_uint16.size == 2) -- confirm at its definition.
    value = struct_uint16.unpack_from(data, index)[0]
    return (value, index+2)
189 189  
190 190  
def read_LengthPrefixedAnsiString(data, index):
    """
    Read a length-prefixed ANSI string from data.

    The string is stored as a 32-bit length followed by that many bytes; the
    last of those bytes is expected to be the terminating null character,
    which is not included in the returned value.

    :param data: bytes string containing the data to be extracted.
    :param index: index in data where string size starts
    :return: tuple (value, index) containing the read value (bytes string),
             and the index to start reading from next time.
    """
    length, index = read_uint32(data, index)
    # if length = 0, return a null string (no null character)
    if length == 0:
        return ('', index)
    # the terminating null character is the last of the `length` bytes
    null_index = index + length - 1
    # extract the string without the last null character
    ansi_string = data[index:null_index]
    # TODO: only in strict mode:
    # check the presence of the null char. It has to be looked up at
    # index+length-1; index+length is already the first byte of whatever
    # follows the string (or past the end of data).
    assert data[null_index] == NULL_CHAR
    return (ansi_string, index+length)
  210 +
  211 +
def read_zero_terminated_ansi_string(data, index):
    """
    Read a zero-terminated ANSI string from data

    :param data: bytes string containing an ansi string
    :param index: index at which the string should start
    :return: tuple (string, index) containing the read string (bytes string),
             and the index to start reading from next time.
    :raises ValueError: if there is no 0-byte in data at or after index
    """
    end_idx = data.find(b'\x00', index)
    if end_idx < 0:
        # find() signals "not found" with -1. Using that value would silently
        # drop the last byte of data and hand back 0 as the next index, so
        # the caller would start parsing from the beginning again.
        raise ValueError('no terminating 0-byte found in data after index %d'
                         % index)
    return data[index:end_idx], end_idx+1   # return index after the 0-byte
210 223  
211 224  
212 225 # === CLASSES ================================================================
... ... @@ -254,25 +267,30 @@ class OleNativeStream (object):
254 267 # TODO: strict mode to raise exceptions when values are incorrect
255 268 # (permissive mode by default)
256 269 # An OLE Package object does not have the native data size field
  270 + index = 0
257 271 if not self.package:
258   - self.native_data_size = struct.unpack('<L', data[0:4])[0]
259   - data = data[4:]
  272 + self.native_data_size, index = read_uint32(data, index)
260 273 log.debug('OLE native data size = {0:08X} ({0} bytes)'.format(self.native_data_size))
261 274 # I thought this might be an OLE type specifier ???
262   - self.unknown_short, data = read_uint16(data)
263   - self.filename, data = data.split(b'\x00', 1)
  275 + self.unknown_short, index = read_uint16(data, index)
  276 + self.filename, index = read_zero_terminated_ansi_string(data, index)
264 277 # source path
265   - self.src_path, data = data.split(b'\x00', 1)
  278 + self.src_path, index = read_zero_terminated_ansi_string(data, index)
266 279 # TODO I bet these next 8 bytes are a timestamp => FILETIME from olefile
267   - self.unknown_long_1, data = read_uint32(data)
268   - self.unknown_long_2, data = read_uint32(data)
  280 + self.unknown_long_1, index = read_uint32(data, index)
  281 + self.unknown_long_2, index = read_uint32(data, index)
269 282 # temp path?
270   - self.temp_path, data = data.split(b'\x00', 1)
  283 + self.temp_path, index = read_zero_terminated_ansi_string(data, index)
271 284 # size of the rest of the data
272   - self.actual_size, data = read_uint32(data)
273   - self.data = data[0:self.actual_size]
274   - # TODO: exception when size > remaining data
275   - # TODO: SLACK DATA
  285 + try:
  286 + self.actual_size, index = read_uint32(data, index)
  287 + self.data = data[index:index+self.actual_size]
  288 + # TODO: exception when size > remaining data
  289 + # TODO: SLACK DATA
  290 + except IOError: # data is not embedded but only linked to
  291 + logging.debug('data is not embedded but only a link')
  292 + self.actual_size = 0
  293 + self.data = None
276 294  
277 295  
278 296 class OleObject (object):
... ... @@ -316,24 +334,24 @@ class OleObject (object):
316 334 # print("Parsing OLE object data:")
317 335 # print(hexdump3(data, length=16))
318 336 # Header: see MS-OLEDS 2.2.4 ObjectHeader
319   - self.ole_version, data = read_uint32(data)
320   - self.format_id, data = read_uint32(data)
  337 + self.ole_version, index = read_uint32(data, index)
  338 + self.format_id, index = read_uint32(data, index)
321 339 log.debug('OLE version=%08X - Format ID=%08X' % (self.ole_version, self.format_id))
322 340 assert self.format_id in (self.TYPE_EMBEDDED, self.TYPE_LINKED)
323   - self.class_name, data = read_LengthPrefixedAnsiString(data)
324   - self.topic_name, data = read_LengthPrefixedAnsiString(data)
325   - self.item_name, data = read_LengthPrefixedAnsiString(data)
  341 + self.class_name, index = read_LengthPrefixedAnsiString(data, index)
  342 + self.topic_name, index = read_LengthPrefixedAnsiString(data, index)
  343 + self.item_name, index = read_LengthPrefixedAnsiString(data, index)
326 344 log.debug('Class name=%r - Topic name=%r - Item name=%r'
327 345 % (self.class_name, self.topic_name, self.item_name))
328 346 if self.format_id == self.TYPE_EMBEDDED:
329 347 # Embedded object: see MS-OLEDS 2.2.5 EmbeddedObject
330 348 #assert self.topic_name != '' and self.item_name != ''
331   - self.data_size, data = read_uint32(data)
332   - log.debug('Declared data size=%d - remaining size=%d' % (self.data_size, len(data)))
  349 + self.data_size, index = read_uint32(data, index)
  350 + log.debug('Declared data size=%d - remaining size=%d' % (self.data_size, len(data)-index))
333 351 # TODO: handle incorrect size to avoid exception
334   - self.data = data[:self.data_size]
  352 + self.data = data[index:index+self.data_size]
335 353 assert len(self.data) == self.data_size
336   - self.extra_data = data[self.data_size:]
  354 + self.extra_data = data[index+self.data_size:]
337 355  
338 356  
339 357  
... ...