Commit 795647119dfba810e0c136e93de6a5c27f1abb00

Authored by Christian Herdtweck
1 parent a93b2109

ppt_record_parser: provide OleFileIO from embedded files

This was not easy to do if we want to avoid having the complete embedded file
in uncompressed form in memory. Had to create a stream around an iterable,
kind of fun :-)
Showing 1 changed file with 133 additions and 5 deletions
oletools/ppt_record_parser.py
... ... @@ -432,6 +432,117 @@ class PptRecordExOleObjAtom(PptRecord):
432 432 self.SUB_TYPES.get(self.sub_type, str(self.sub_type)))
433 433  
434 434  
class IterStream(io.RawIOBase):
    """ make a read-only, seekable bytes-stream from an iterable

    copied from stackoverflow answer by Mechanical snail from Nov 18th 2013
    https://stackoverflow.com/a/20260030/4405656 and extended

    The stream never holds more than one chunk of the iterable in memory.
    Seeking forward is done by reading and discarding data; seeking backward
    is done by re-creating the iterable through `iterable_creator` and reading
    forward again. Seeking to the very end is "faked" (no data is read) and
    therefore requires that `size` was given.
    """

    def __init__(self, iterable_creator, size=None):
        """ create a Stream using a function that creates the iterable

        :param iterable_creator: callable without arguments; every call must
                                 return a fresh iterable over byte chunks
        :param size: total number of bytes the iterable will yield, or None
                     if unknown (seeking relative to the end then fails)
        """
        super(IterStream, self).__init__()
        self.iterable_creator = iterable_creator
        self.size = size
        logging.debug('IterStream.size is %s', self.size)
        self.reset()

    def reset(self):
        """ re-set stream to state right after creation """
        self.iterable = None    # created lazily on first read
        self.leftover = None    # rest of last chunk that did not fit target
        self.at_end = False     # True after exhaustion or fake seek-to-end
        self.curr_pos = 0       # number of bytes handed out so far

    def writable(self):
        """ this stream cannot be written to """
        return False

    def readable(self):
        """ this stream can be read from """
        return True

    def seekable(self):
        """ this stream supports seek() (with the restrictions noted above) """
        return True

    def _check_open(self):
        """ raise ValueError if stream was closed, like built-in streams """
        if self.closed:
            raise ValueError('I/O operation on closed file')

    def _position(self):
        """ return the position to report for tell() / relative seeks """
        if self.at_end and self.size is not None:
            return self.size
        # size unknown: curr_pos is the only thing we know (and it is exact
        # if the end was reached by actually reading)
        return self.curr_pos

    def readinto(self, target):
        """ read data from iterable into target, at most one chunk per call

        :param target: writable bytes-like buffer (e.g. bytearray)
        :returns: number of bytes written to target; 0 signals end of data
                  (or that target has length 0)
        :raises ValueError: if stream is closed
        """
        self._check_open()
        target_len = len(target)    # we should return at most this much
        logging.debug('IterStream.readinto size %s', target_len)
        if self.at_end:
            logging.debug('IterStream: we are at (fake) end')
            return 0
        if target_len == 0:
            return 0    # nothing requested; do not consume anything
        if self.iterable is None:
            # iter() so creators may return any iterable, not only iterators
            self.iterable = iter(self.iterable_creator())
            logging.debug('IterStream: created iterable %s', self.iterable)
            self.curr_pos = 0

        # Skip empty chunks: returning 0 for them would wrongly signal EOF.
        chunk = self.leftover
        while not chunk:
            try:
                chunk = next(self.iterable)
            except StopIteration:
                logging.debug('IterStream: source iterable exhausted')
                self.at_end = True
                return 0    # indicate EOF

        logging.debug('IterStream: chunk is size %s', len(chunk))
        output, self.leftover = chunk[:target_len], chunk[target_len:]
        logging.debug('IterStream: output is size %s, leftover is %s',
                      len(output), len(self.leftover))
        target[:len(output)] = output
        self.curr_pos += len(output)
        logging.debug('IterStream: pos updated to %s', self.curr_pos)
        return len(output)

    def _skip(self, count):
        """ read and discard up to count bytes using a bounded scratch buffer

        Stops early if the data ends; caller must check curr_pos afterwards.
        """
        remaining = count
        scratch = bytearray(min(remaining, io.DEFAULT_BUFFER_SIZE))
        while remaining > 0:
            if remaining < len(scratch):
                scratch = bytearray(remaining)
            n_read = self.readinto(scratch)
            if n_read == 0:     # end of data reached before count bytes
                break
            remaining -= n_read

    def _seek_to(self, target):
        """ move to absolute position target by reading (and maybe restart)

        :raises ValueError: if target is negative
        :raises RuntimeError: if target lies beyond the available data
        """
        if target < 0:
            raise ValueError('negative seek position {0}'.format(target))
        if target == 0:
            logging.debug('IterStream: seek to start')
            self.reset()
            return 0

        logging.debug('IterStream: trying to seek to offset %s.', target)
        if target < self.curr_pos or \
                (self.at_end and target != self.curr_pos):
            # cannot go backwards in an iterable (and after a fake jump to
            # the end there is no iterable at all) --> start over
            self.reset()
        self._skip(target - self.curr_pos)
        if self.curr_pos != target:
            logging.debug('IterStream: curr_pos %s != offset %s!',
                          self.curr_pos, target)
            raise RuntimeError('IterStream.seek: could not reach offset {0}, '
                               'data ends at {1}'
                               .format(target, self.curr_pos))
        return self.curr_pos

    def seek(self, offset, whence=io.SEEK_SET):
        """ change stream position; returns the new absolute position

        Seeking to (0, SEEK_END) is cheap since it is only faked; all other
        seeks may need to read data and possibly re-create the iterable.

        :param offset: byte offset relative to the position given by whence
        :param whence: io.SEEK_SET (default), io.SEEK_CUR or io.SEEK_END
        :raises IOError: if whence is illegal or is SEEK_END with unknown size
        :raises ValueError: if stream is closed or resulting position is < 0
        :raises RuntimeError: if resulting position is beyond end of data
        """
        self._check_open()
        if whence == io.SEEK_SET:
            return self._seek_to(offset)

        if whence == io.SEEK_CUR:    # e.g. called by tell()
            if offset == 0:
                logging.debug('IterStream: seek to curr pos')
                return self._position()
            return self._seek_to(self._position() + offset)

        if whence == io.SEEK_END:
            logging.debug('IterStream: seek to end')
            if self.size is None:
                logging.debug('IterStream: trying to seek to end but size '
                              'unknown --> raise IOError')
                raise IOError('size unknown, cannot seek to end')
            if offset == 0:
                self.at_end = True      # fake jumping to the end
                self.iterable = None    # cannot safely be used any more
                self.leftover = None
                self.curr_pos = self.size   # keep position consistent
                return self.size
            return self._seek_to(self.size + offset)

        logging.debug('Illegal 2nd argument to seek(): %s', whence)
        raise IOError('Illegal 2nd argument to seek(): {0}'.format(whence))

    def close(self):
        """ drop reference to iterable and mark stream as closed """
        self.iterable = None
        self.leftover = None
        self.at_end = False
        self.curr_pos = 0
        # let the base class set the `closed` flag (safe to call repeatedly)
        super(IterStream, self).close()
  544 +
  545 +
435 546 class PptRecordExOleVbaActiveXAtom(PptRecord):
436 547 """ record that contains and ole object / vba storage / active x control
437 548  
... ... @@ -509,6 +620,18 @@ class PptRecordExOleVbaActiveXAtom(PptRecord):
509 620 logging.warning('Decompressed data has wrong size {0} != {1}'
510 621 .format(out_size, self.get_uncompressed_size()))
511 622  
  623 + def get_data_as_olefile(self, debug_output=False):
  624 + """ return an OleFileIO that streams from iter_uncompressed
  625 +
  626 + probably only works if data is an OLE object, otherwise expect
  627 + exception
  628 + """
  629 + if debug_output:
  630 + record_base.enable_olefile_logging()
  631 + return record_base.OleFileIO(IterStream(self.iter_uncompressed,
  632 + self.get_uncompressed_size()),
  633 + debug=debug_output)
  634 +
512 635 def __str__(self):
513 636 text = super(PptRecordExOleVbaActiveXAtom, self).__str__()
514 637 compr_text = 'compressed' if self.is_compressed() else 'uncompressed'
... ... @@ -546,11 +669,16 @@ def print_records(record, print_fn, indent, do_print_record):
546 669 # for chunk in record.iter_uncompressed():
547 670 # logging.info('{0}--> "{1}"'.format(' ' * indent, chunk))
548 671 # writer.write(chunk)
549   - chunk1 = next(record.iter_uncompressed())
550   - logging.info('{0}--> decompressed size {1}, data {2}...'
551   - .format(' ' * indent, record.get_uncompressed_size(),
552   - ', '.join('{0:02x}'.format(ord(c))
553   - for c in chunk1[:32])))
  672 +
  673 + #chunk1 = next(record.iter_uncompressed())
  674 + #logging.info('{0}--> decompressed size {1}, data {2}...'
  675 + # .format(' ' * indent, record.get_uncompressed_size(),
  676 + # ', '.join('{0:02x}'.format(ord(c))
  677 + # for c in chunk1[:32])))
  678 +
  679 + ole = record.get_data_as_olefile()
  680 + for entry in ole.listdir():
  681 + logging.info('{0}ole entry {1}'.format(' ' * indent, entry))
554 682  
555 683  
556 684 if __name__ == '__main__':
... ...