Commit dad20c2ce0134e0f9e9f3cecbbc82ace58c8f18a
1 parent 2b3f8d3e
oleobj: change data parsing to change index rather than data
This is more efficient and simplifies generalization to using byte-streams instead of byte arrays as data input.
Showing 1 changed file with 61 additions and 43 deletions
oletools/oleobj.py
| ... | ... | @@ -162,51 +162,64 @@ assert struct_uint16.size == 2 # make sure it matches 2 bytes |
| 162 | 162 | |
| 163 | 163 | # === FUNCTIONS ============================================================== |
| 164 | 164 | |
| 165 | -def read_uint32(data): | |
| 165 | +def read_uint32(data, index): | |
| 166 | 166 | """ |
| 167 | 167 | Read an unsigned integer from the 32 bits of data following index. |
| 168 | 168 | |
| 169 | 169 | :param data: bytes string containing the data to be extracted. |
| 170 | - :return: tuple (value, new_data) containing the read value (int), | |
| 171 | - and the new data without the bytes read. | |
| 170 | + :param index: index to start reading from. | |
| 171 | + :return: tuple (value, index) containing the read value (int), | |
| 172 | + and the index to continue reading next time. | |
| 172 | 173 | """ |
| 173 | - value = struct_uint32.unpack(data[0:4])[0] | |
| 174 | - new_data = data[4:] | |
| 175 | - return (value, new_data) | |
| 174 | + value = struct_uint32.unpack(data[index:index+4])[0] | |
| 175 | + return (value, index+4) | |
| 176 | 176 | |
| 177 | 177 | |
| 178 | -def read_uint16(data): | |
| 178 | +def read_uint16(data, index): | |
| 179 | 179 | """ |
| 180 | - Read an unsigned integer from the first 16 bits of data. | |
| 180 | + Read an unsigned integer from the 16 bits of data following index. | |
| 181 | 181 | |
| 182 | 182 | :param data: bytes string containing the data to be extracted. |
| 183 | - :return: tuple (value, new_data) containing the read value (int), | |
| 184 | - and the new data without the bytes read. | |
| 183 | + :param index: index to start reading from. | |
| 184 | + :return: tuple (value, index) containing the read value (int), | |
| 185 | + and the index to continue reading next time. | |
| 185 | 186 | """ |
| 186 | - value = struct_uint16.unpack(data[0:2])[0] | |
| 187 | - new_data = data[2:] | |
| 188 | - return (value, new_data) | |
| 187 | + value = struct_uint16.unpack(data[index:index+2])[0] | |
| 188 | + return (value, index+2) | |
| 189 | 189 | |
| 190 | 190 | |
| 191 | -def read_LengthPrefixedAnsiString(data): | |
| 191 | +def read_LengthPrefixedAnsiString(data, index): | |
| 192 | 192 | """ |
| 193 | 193 | Read a length-prefixed ANSI string from data. |
| 194 | 194 | |
| 195 | 195 | :param data: bytes string containing the data to be extracted. |
| 196 | - :return: tuple (value, new_data) containing the read value (bytes string), | |
| 197 | - and the new data without the bytes read. | |
| 196 | + :param index: index in data where string size starts | |
| 197 | + :return: tuple (value, index) containing the read value (bytes string), | |
| 198 | + and the index to start reading from next time. | |
| 198 | 199 | """ |
| 199 | - length, data = read_uint32(data) | |
| 200 | + length, index = read_uint32(data, index) | |
| 200 | 201 | # if length = 0, return a null string (no null character) |
| 201 | 202 | if length == 0: |
| 202 | - return ('', data) | |
| 203 | + return ('', index) | |
| 203 | 204 | # extract the string without the last null character |
| 204 | - ansi_string = data[:length-1] | |
| 205 | + ansi_string = data[index:index+length-1] | |
| 205 | 206 | # TODO: only in strict mode: |
| 206 | 207 | # check the presence of the null char: |
| 207 | - assert data[length] == NULL_CHAR | |
| 208 | - new_data = data[length:] | |
| 209 | - return (ansi_string, new_data) | |
| 208 | + assert data[index+length] == NULL_CHAR | |
| 209 | + return (ansi_string, index+length) | |
| 210 | + | |
| 211 | + | |
| 212 | +def read_zero_terminated_ansi_string(data, index): | |
| 213 | + """ | |
| 214 | + Read a zero-terminated ANSI string from data | |
| 215 | + | |
| 216 | + :param data: bytes string containing an ansi string | |
| 217 | + :param index: index at which the string should start | |
| 218 | + :return: tuple (string, index) containing the read string (bytes string), | |
| 219 | + and the index to start reading from next time. | |
| 220 | + """ | |
| 221 | + end_idx = data.find(b'\x00', index) | |
| 222 | + return data[index:end_idx], end_idx+1 # return index after the 0-byte | |
| 210 | 223 | |
| 211 | 224 | |
| 212 | 225 | # === CLASSES ================================================================ |
| ... | ... | @@ -254,25 +267,30 @@ class OleNativeStream (object): |
| 254 | 267 | # TODO: strict mode to raise exceptions when values are incorrect |
| 255 | 268 | # (permissive mode by default) |
| 256 | 269 | # An OLE Package object does not have the native data size field |
| 270 | + index = 0 | |
| 257 | 271 | if not self.package: |
| 258 | - self.native_data_size = struct.unpack('<L', data[0:4])[0] | |
| 259 | - data = data[4:] | |
| 272 | + self.native_data_size, index = read_uint32(data, index) | |
| 260 | 273 | log.debug('OLE native data size = {0:08X} ({0} bytes)'.format(self.native_data_size)) |
| 261 | 274 | # I thought this might be an OLE type specifier ??? |
| 262 | - self.unknown_short, data = read_uint16(data) | |
| 263 | - self.filename, data = data.split(b'\x00', 1) | |
| 275 | + self.unknown_short, index = read_uint16(data, index) | |
| 276 | + self.filename, index = read_zero_terminated_ansi_string(data, index) | |
| 264 | 277 | # source path |
| 265 | - self.src_path, data = data.split(b'\x00', 1) | |
| 278 | + self.src_path, index = read_zero_terminated_ansi_string(data, index) | |
| 266 | 279 | # TODO I bet these next 8 bytes are a timestamp => FILETIME from olefile |
| 267 | - self.unknown_long_1, data = read_uint32(data) | |
| 268 | - self.unknown_long_2, data = read_uint32(data) | |
| 280 | + self.unknown_long_1, index = read_uint32(data, index) | |
| 281 | + self.unknown_long_2, index = read_uint32(data, index) | |
| 269 | 282 | # temp path? |
| 270 | - self.temp_path, data = data.split(b'\x00', 1) | |
| 283 | + self.temp_path, index = read_zero_terminated_ansi_string(data, index) | |
| 271 | 284 | # size of the rest of the data |
| 272 | - self.actual_size, data = read_uint32(data) | |
| 273 | - self.data = data[0:self.actual_size] | |
| 274 | - # TODO: exception when size > remaining data | |
| 275 | - # TODO: SLACK DATA | |
| 285 | + try: | |
| 286 | + self.actual_size, index = read_uint32(data, index) | |
| 287 | + self.data = data[index:index+self.actual_size] | |
| 288 | + # TODO: exception when size > remaining data | |
| 289 | + # TODO: SLACK DATA | |
| 290 | + except IOError: # data is not embedded but only linked to | |
| 291 | + logging.debug('data is not embedded but only a link') | |
| 292 | + self.actual_size = 0 | |
| 293 | + self.data = None | |
| 276 | 294 | |
| 277 | 295 | |
| 278 | 296 | class OleObject (object): |
| ... | ... | @@ -316,24 +334,24 @@ class OleObject (object): |
| 316 | 334 | # print("Parsing OLE object data:") |
| 317 | 335 | # print(hexdump3(data, length=16)) |
| 318 | 336 | # Header: see MS-OLEDS 2.2.4 ObjectHeader |
| 319 | - self.ole_version, data = read_uint32(data) | |
| 320 | - self.format_id, data = read_uint32(data) | |
| 337 | + self.ole_version, index = read_uint32(data, index) | |
| 338 | + self.format_id, index = read_uint32(data, index) | |
| 321 | 339 | log.debug('OLE version=%08X - Format ID=%08X' % (self.ole_version, self.format_id)) |
| 322 | 340 | assert self.format_id in (self.TYPE_EMBEDDED, self.TYPE_LINKED) |
| 323 | - self.class_name, data = read_LengthPrefixedAnsiString(data) | |
| 324 | - self.topic_name, data = read_LengthPrefixedAnsiString(data) | |
| 325 | - self.item_name, data = read_LengthPrefixedAnsiString(data) | |
| 341 | + self.class_name, index = read_LengthPrefixedAnsiString(data, index) | |
| 342 | + self.topic_name, index = read_LengthPrefixedAnsiString(data, index) | |
| 343 | + self.item_name, index = read_LengthPrefixedAnsiString(data, index) | |
| 326 | 344 | log.debug('Class name=%r - Topic name=%r - Item name=%r' |
| 327 | 345 | % (self.class_name, self.topic_name, self.item_name)) |
| 328 | 346 | if self.format_id == self.TYPE_EMBEDDED: |
| 329 | 347 | # Embedded object: see MS-OLEDS 2.2.5 EmbeddedObject |
| 330 | 348 | #assert self.topic_name != '' and self.item_name != '' |
| 331 | - self.data_size, data = read_uint32(data) | |
| 332 | - log.debug('Declared data size=%d - remaining size=%d' % (self.data_size, len(data))) | |
| 349 | + self.data_size, index = read_uint32(data, index) | |
| 350 | + log.debug('Declared data size=%d - remaining size=%d' % (self.data_size, len(data)-index)) | |
| 333 | 351 | # TODO: handle incorrect size to avoid exception |
| 334 | - self.data = data[:self.data_size] | |
| 352 | + self.data = data[index:index+self.data_size] | |
| 335 | 353 | assert len(self.data) == self.data_size |
| 336 | - self.extra_data = data[self.data_size:] | |
| 354 | + self.extra_data = data[index+self.data_size:] | |
| 337 | 355 | |
| 338 | 356 | |
| 339 | 357 | ... | ... |