Commit c8a4b6a9267a51d345fdd0e9a60f12db8b48e3f2
1 parent
aa95f26a
oleobj: parse and dump from stream
This way we do not have to keep a whole big office file in memory. (Olefile might do that, anyway, but then we have one copy less.) Also merge subfunction process_native_stream back into process_file (harder to read but makes more sense for exception handling)
Showing
1 changed file
with
56 additions
and
26 deletions
oletools/oleobj.py
| ... | ... | @@ -162,6 +162,10 @@ assert struct_uint16.size == 2 # make sure it matches 2 bytes |
| 162 | 162 | # max length of a zero-terminated ansi string. Not sure what this really is |
| 163 | 163 | STR_MAX_LEN = 1024 |
| 164 | 164 | |
| 165 | +# size of chunks to copy from ole stream to file | |
| 166 | +DUMP_CHUNK_SIZE = 4096 | |
| 167 | + | |
| 168 | + | |
| 165 | 169 | # === FUNCTIONS ============================================================== |
| 166 | 170 | |
| 167 | 171 | def read_uint32(data, index): |
| ... | ... | @@ -324,8 +328,9 @@ class OleNativeStream (object): |
| 324 | 328 | self.data = data[index:index+self.actual_size] |
| 325 | 329 | # TODO: exception when size > remaining data |
| 326 | 330 | # TODO: SLACK DATA |
| 327 | - except IOError, struct.error: # no data to read actual_size | |
| 331 | + except (IOError, struct.error): # no data to read actual_size | |
| 328 | 332 | logging.debug('data is not embedded but only a link') |
| 333 | + self.is_link = True | |
| 329 | 334 | self.actual_size = 0 |
| 330 | 335 | self.data = None |
| 331 | 336 | |
| ... | ... | @@ -519,38 +524,63 @@ def process_file(container, filename, data, output_dir=None): |
| 519 | 524 | |
| 520 | 525 | # look for ole files inside file (e.g. unzip docx) |
| 521 | 526 | flag_no_ole = False |
| 527 | + flag_stream_error = False | |
| 522 | 528 | for ole in find_ole(filename, data): |
| 523 | 529 | if ole is None: # no ole file found |
| 524 | 530 | flag_no_ole = True |
| 525 | 531 | continue |
| 526 | 532 | |
| 527 | - for stream in ole.listdir(): | |
| 528 | - if stream[-1] == '\x01Ole10Native': | |
| 529 | - process_native_stream(ole, stream, fname_prefix, index) | |
| 530 | - index += 1 | |
| 533 | + for path_parts in ole.listdir(): | |
| 534 | + if path_parts[-1] == '\x01Ole10Native': | |
| 535 | + stream_path = '/'.join(path_parts) | |
| 536 | + log.debug('Checking stream %r' % stream_path) | |
| 537 | + stream = None | |
| 538 | + try: | |
| 539 | + stream = ole.openstream(path_parts) | |
| 540 | + print('extract file embedded in OLE object from stream %r:' % stream_path) | |
| 541 | + print ('Parsing OLE Package') | |
| 542 | + opkg = OleNativeStream(stream) | |
| 543 | + # leave stream open until dumping is finished | |
| 544 | + except Exception as exc: | 
| 545 | + log.warning('*** Not an OLE 1.0 Object ({0})'.format(exc)) | |
| 546 | + flag_stream_error = True | |
| 547 | + if stream is not None: | |
| 548 | + stream.close() | |
| 549 | + continue | |
| 531 | 550 | |
| 551 | + # print info | |
| 552 | + print ('Filename = %r' % opkg.filename) | |
| 553 | + print ('Source path = %r' % opkg.src_path) | |
| 554 | + print ('Temp path = %r' % opkg.temp_path) | |
| 555 | + if opkg.filename: | |
| 556 | + fname = '%s_%s' % (fname_prefix, | |
| 557 | + sanitize_filename(opkg.filename)) | |
| 558 | + else: | |
| 559 | + fname = '%s_object_%03d.noname' % (fname_prefix, index) | |
| 532 | 560 | |
| 533 | -def process_native_stream(ole, stream, fname_prefix, index): | |
| 534 | - """ Dump data from OLE embedded object stream """ | |
| 535 | - objdata = ole.openstream(stream).read() | |
| 536 | - stream_path = '/'.join(stream) | |
| 537 | - log.debug('Checking stream %r' % stream_path) | |
| 538 | - try: | |
| 539 | - print('extract file embedded in OLE object from stream %r:' % stream_path) | |
| 540 | - print ('Parsing OLE Package') | |
| 541 | - opkg = OleNativeStream(bindata=objdata) | |
| 542 | - print ('Filename = %r' % opkg.filename) | |
| 543 | - print ('Source path = %r' % opkg.src_path) | |
| 544 | - print ('Temp path = %r' % opkg.temp_path) | |
| 545 | - if opkg.filename: | |
| 546 | - fname = '%s_%s' % (fname_prefix, | |
| 547 | - sanitize_filename(opkg.filename)) | |
| 548 | - else: | |
| 549 | - fname = '%s_object_%03d.noname' % (fname_prefix, index) | |
| 550 | - print ('saving to file %s' % fname) | |
| 551 | - open(fname, 'wb').write(opkg.data) | |
| 552 | - except Exception: | |
| 553 | - log.debug('*** Not an OLE 1.0 Object') | |
| 561 | + # dump | |
| 562 | + try: | |
| 563 | + print ('saving to file %s' % fname) | |
| 564 | + with open(fname, 'wb') as writer: | |
| 565 | + n_dumped = 0 | |
| 566 | + next_size = min(DUMP_CHUNK_SIZE, opkg.actual_size) | |
| 567 | + while next_size: | |
| 568 | + data = stream.read(next_size) | |
| 569 | + writer.write(data) | |
| 570 | + n_dumped += len(data) | |
| 571 | + if len(data) != next_size: | |
| 572 | log.warning('Wanted to read {0}, got {1}' | 
| 573 | + .format(next_size, len(data))) | |
| 574 | + break | |
| 575 | + next_size = min(DUMP_CHUNK_SIZE, | |
| 576 | + opkg.actual_size - n_dumped) | |
| 577 | + except Exception as exc: | |
| 578 | + log.warning('error dumping to {0} ({1})' | |
| 579 | + .format(fname, exc)) | |
| 580 | + finally: | |
| 581 | + stream.close() | |
| 582 | + | |
| 583 | + index += 1 | |
| 554 | 584 | |
| 555 | 585 | |
| 556 | 586 | #=== MAIN ================================================================= | ... | ... |