Commit c8a4b6a9267a51d345fdd0e9a60f12db8b48e3f2

Authored by Christian Herdtweck
1 parent aa95f26a

oleobj: parse and dump from stream

This way we do not have to keep an entire large Office file in memory.
(olefile might do that anyway, but then we at least hold one copy fewer.)

Also merge the subfunction process_native_stream back into process_file
(harder to read, but makes more sense for exception handling).
Showing 1 changed file with 56 additions and 26 deletions
oletools/oleobj.py
@@ -162,6 +162,10 @@ assert struct_uint16.size == 2 # make sure it matches 2 bytes @@ -162,6 +162,10 @@ assert struct_uint16.size == 2 # make sure it matches 2 bytes
162 # max length of a zero-terminated ansi string. Not sure what this really is 162 # max length of a zero-terminated ansi string. Not sure what this really is
163 STR_MAX_LEN = 1024 163 STR_MAX_LEN = 1024
164 164
  165 +# size of chunks to copy from ole stream to file
  166 +DUMP_CHUNK_SIZE = 4096
  167 +
  168 +
165 # === FUNCTIONS ============================================================== 169 # === FUNCTIONS ==============================================================
166 170
167 def read_uint32(data, index): 171 def read_uint32(data, index):
@@ -324,8 +328,9 @@ class OleNativeStream (object): @@ -324,8 +328,9 @@ class OleNativeStream (object):
324 self.data = data[index:index+self.actual_size] 328 self.data = data[index:index+self.actual_size]
325 # TODO: exception when size > remaining data 329 # TODO: exception when size > remaining data
326 # TODO: SLACK DATA 330 # TODO: SLACK DATA
327 - except IOError, struct.error: # no data to read actual_size 331 + except (IOError, struct.error): # no data to read actual_size
328 logging.debug('data is not embedded but only a link') 332 logging.debug('data is not embedded but only a link')
  333 + self.is_link = True
329 self.actual_size = 0 334 self.actual_size = 0
330 self.data = None 335 self.data = None
331 336
@@ -519,38 +524,63 @@ def process_file(container, filename, data, output_dir=None): @@ -519,38 +524,63 @@ def process_file(container, filename, data, output_dir=None):
519 524
520 # look for ole files inside file (e.g. unzip docx) 525 # look for ole files inside file (e.g. unzip docx)
521 flag_no_ole = False 526 flag_no_ole = False
  527 + flag_stream_error = False
522 for ole in find_ole(filename, data): 528 for ole in find_ole(filename, data):
523 if ole is None: # no ole file found 529 if ole is None: # no ole file found
524 flag_no_ole = True 530 flag_no_ole = True
525 continue 531 continue
526 532
527 - for stream in ole.listdir():  
528 - if stream[-1] == '\x01Ole10Native':  
529 - process_native_stream(ole, stream, fname_prefix, index)  
530 - index += 1 533 + for path_parts in ole.listdir():
  534 + if path_parts[-1] == '\x01Ole10Native':
  535 + stream_path = '/'.join(path_parts)
  536 + log.debug('Checking stream %r' % stream_path)
  537 + stream = None
  538 + try:
  539 + stream = ole.openstream(path_parts)
  540 + print('extract file embedded in OLE object from stream %r:' % stream_path)
  541 + print ('Parsing OLE Package')
  542 + opkg = OleNativeStream(stream)
  543 + # leave stream open until dumping is finished
  544 + except Exception:
  545 + log.warning('*** Not an OLE 1.0 Object ({0})'.format(exc))
  546 + flag_stream_error = True
  547 + if stream is not None:
  548 + stream.close()
  549 + continue
531 550
  551 + # print info
  552 + print ('Filename = %r' % opkg.filename)
  553 + print ('Source path = %r' % opkg.src_path)
  554 + print ('Temp path = %r' % opkg.temp_path)
  555 + if opkg.filename:
  556 + fname = '%s_%s' % (fname_prefix,
  557 + sanitize_filename(opkg.filename))
  558 + else:
  559 + fname = '%s_object_%03d.noname' % (fname_prefix, index)
532 560
533 -def process_native_stream(ole, stream, fname_prefix, index):  
534 - """ Dump data from OLE embedded object stream """  
535 - objdata = ole.openstream(stream).read()  
536 - stream_path = '/'.join(stream)  
537 - log.debug('Checking stream %r' % stream_path)  
538 - try:  
539 - print('extract file embedded in OLE object from stream %r:' % stream_path)  
540 - print ('Parsing OLE Package')  
541 - opkg = OleNativeStream(bindata=objdata)  
542 - print ('Filename = %r' % opkg.filename)  
543 - print ('Source path = %r' % opkg.src_path)  
544 - print ('Temp path = %r' % opkg.temp_path)  
545 - if opkg.filename:  
546 - fname = '%s_%s' % (fname_prefix,  
547 - sanitize_filename(opkg.filename))  
548 - else:  
549 - fname = '%s_object_%03d.noname' % (fname_prefix, index)  
550 - print ('saving to file %s' % fname)  
551 - open(fname, 'wb').write(opkg.data)  
552 - except Exception:  
553 - log.debug('*** Not an OLE 1.0 Object') 561 + # dump
  562 + try:
  563 + print ('saving to file %s' % fname)
  564 + with open(fname, 'wb') as writer:
  565 + n_dumped = 0
  566 + next_size = min(DUMP_CHUNK_SIZE, opkg.actual_size)
  567 + while next_size:
  568 + data = stream.read(next_size)
  569 + writer.write(data)
  570 + n_dumped += len(data)
  571 + if len(data) != next_size:
  572 + logging.warning('Wanted to read {0}, got {1}'
  573 + .format(next_size, len(data)))
  574 + break
  575 + next_size = min(DUMP_CHUNK_SIZE,
  576 + opkg.actual_size - n_dumped)
  577 + except Exception as exc:
  578 + log.warning('error dumping to {0} ({1})'
  579 + .format(fname, exc))
  580 + finally:
  581 + stream.close()
  582 +
  583 + index += 1
554 584
555 585
556 #=== MAIN ================================================================= 586 #=== MAIN =================================================================