Commit 795647119dfba810e0c136e93de6a5c27f1abb00
1 parent
a93b2109
ppt_record_parser: provide OleFileIO from embedded files
This was not easy to do without holding the complete embedded file in memory in uncompressed form. I had to create a stream around an iterable, kind of fun :-)
Showing
1 changed file
with
133 additions
and
5 deletions
oletools/ppt_record_parser.py
| ... | ... | @@ -432,6 +432,117 @@ class PptRecordExOleObjAtom(PptRecord): |
| 432 | 432 | self.SUB_TYPES.get(self.sub_type, str(self.sub_type))) |
| 433 | 433 | |
| 434 | 434 | |
class IterStream(io.RawIOBase):
    """ make a read-only, seekable bytes-stream from an iterable

    copied from stackoverflow answer by Mechanical snail from Nov 18th 2013
    https://stackoverflow.com/a/20260030/4405656 and extended

    The stream never holds more than one chunk of the iterable in memory.
    Seeking forward is done by reading and discarding data; seeking backward
    re-creates the iterable through `iterable_creator` and reads forward from
    the start.  Seeking to the end (offset 0) is only faked using the `size`
    given to the constructor, no data is read for that.
    """

    # upper bound for the scratch buffer used to discard data while seeking,
    # so that a far seek does not allocate memory proportional to the offset
    _SKIP_BUFFER_SIZE = 64 * 1024

    def __init__(self, iterable_creator, size=None):
        """ create a Stream using a function that creates the iterable

        :param iterable_creator: callable without arguments; every call must
                                 return a fresh iterable of bytes chunks
        :param size: total number of bytes the iterable yields, if known;
                     required for seeking relative to the end
        """
        super(IterStream, self).__init__()
        self.iterable_creator = iterable_creator
        self.size = size
        logging.debug('IterStream.size is {0}'.format(self.size))
        self.reset()

    def reset(self):
        """ re-set stream to state right after creation """
        self.iterable = None     # created lazily by first readinto
        self.leftover = None     # rest of last chunk not yet handed out
        self.at_end = False      # True after exhaustion or faked seek-to-end
        self.curr_pos = 0        # current stream position in bytes

    def writable(self):
        """ this stream cannot be written to """
        return False

    def readable(self):
        """ this stream can be read from """
        return True

    def seekable(self):
        """ this stream supports seek (by re-reading where necessary) """
        return True

    def _check_open(self):
        """ raise ValueError if stream was closed, like other io streams """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def readinto(self, target):
        """ read data from iterable into target, at most one chunk per call

        :param target: writable bytes-like buffer, e.g. a bytearray
        :returns: number of bytes written to target; 0 only at end of data
                  (or if target has length 0)
        :raises ValueError: if stream is closed
        """
        self._check_open()
        target_len = len(target)   # we should return at most this much
        logging.debug('IterStream.readinto size {0}'.format(target_len))
        if self.at_end:
            logging.debug('IterStream: we are at (fake) end')
            return 0
        if target_len == 0:
            return 0     # nothing requested; do not confuse this with EOF
        if self.iterable is None:
            # iter() allows creators that return a list or similar
            self.iterable = iter(self.iterable_creator())
            logging.debug('IterStream: created iterable {0}'
                          .format(self.iterable))
            self.curr_pos = 0

        chunk = self.leftover
        try:
            # skip empty chunks: returning 0 for them would signal EOF
            while not chunk:
                chunk = next(self.iterable)
        except StopIteration:
            logging.debug('IterStream: source iterable exhausted')
            self.leftover = None
            self.at_end = True
            return 0    # indicate EOF

        output, self.leftover = chunk[:target_len], chunk[target_len:]
        logging.debug('IterStream: output is size {0}, leftover is {1}'
                      .format(len(output), len(self.leftover)))
        target[:len(output)] = output
        self.curr_pos += len(output)
        logging.debug('IterStream: pos updated to {0}'.format(self.curr_pos))
        return len(output)

    def _skip_forward(self, count):
        """ read and discard up to count bytes; stops early at end of data """
        scratch = bytearray(min(count, self._SKIP_BUFFER_SIZE))
        while count > 0:
            if count < len(scratch):
                del scratch[count:]    # never read past the wanted position
            n_read = self.readinto(scratch)
            if not n_read:
                break                  # end of data
            count -= n_read

    def seek(self, offset, whence=io.SEEK_SET):
        """ move to given position, reading or re-reading data as necessary

        :param offset: byte offset relative to position given by whence
        :param whence: io.SEEK_SET (default), io.SEEK_CUR or io.SEEK_END
        :returns: new absolute position
        :raises IOError: for illegal whence or if seeking relative to the end
                         but size is unknown
        :raises ValueError: if resulting position is negative or stream is
                            closed
        :raises RuntimeError: if data ends before requested position
        """
        self._check_open()
        if whence == io.SEEK_SET:
            position = offset
        elif whence == io.SEEK_CUR:    # e.g. called by tell()
            position = self.curr_pos + offset
        elif whence == io.SEEK_END:
            if self.size is None:
                logging.debug('IterStream: trying to seek to end but size '
                              'unknown --> raise IOError')
                raise IOError('size unknown, cannot seek to end')
            if offset == 0:
                # fake jumping to the end, this avoids reading all the data
                logging.debug('IterStream: seek to end')
                self.iterable = None   # cannot safely be used any more
                self.leftover = None
                self.at_end = True
                self.curr_pos = self.size
                return self.curr_pos
            position = self.size + offset
        else:
            logging.debug('Illegal 2nd argument to seek(): {0}'.format(whence))
            raise IOError('Illegal 2nd argument to seek(): {0}'.format(whence))

        if position < 0:
            raise ValueError('negative seek position {0}'.format(position))
        logging.debug('IterStream: trying to seek to offset {0}.'
                      .format(position))

        if position == self.curr_pos:
            return self.curr_pos
        if position < self.curr_pos:   # need to re-create iterable
            self.reset()
        # a single readinto returns at most one chunk, so need to loop here
        self._skip_forward(position - self.curr_pos)
        if self.curr_pos != position:
            logging.debug('IterStream: curr_pos {0} != offset {1}!'
                          .format(self.curr_pos, position))
            raise RuntimeError('IterStream.seek: cannot seek to {0}, data '
                               'ended at {1}'.format(position, self.curr_pos))
        return self.curr_pos

    def close(self):
        """ release the iterable and mark stream as closed """
        self.reset()
        super(IterStream, self).close()
| 545 | + | |
| 435 | 546 | class PptRecordExOleVbaActiveXAtom(PptRecord): |
| 436 | 547 | """ record that contains and ole object / vba storage / active x control |
| 437 | 548 | |
| ... | ... | @@ -509,6 +620,18 @@ class PptRecordExOleVbaActiveXAtom(PptRecord): |
| 509 | 620 | logging.warning('Decompressed data has wrong size {0} != {1}' |
| 510 | 621 | .format(out_size, self.get_uncompressed_size())) |
| 511 | 622 | |
| 623 | + def get_data_as_olefile(self, debug_output=False): | |
| 624 | + """ return an OleFileIO that streams from iter_uncompressed | |
| 625 | + | |
| 626 | + probably only works if data is an OLE object, otherwise expect | |
| 627 | + exception | |
| 628 | + """ | |
| 629 | + if debug_output: | |
| 630 | + record_base.enable_olefile_logging() | |
| 631 | + return record_base.OleFileIO(IterStream(self.iter_uncompressed, | |
| 632 | + self.get_uncompressed_size()), | |
| 633 | + debug=debug_output) | |
| 634 | + | |
| 512 | 635 | def __str__(self): |
| 513 | 636 | text = super(PptRecordExOleVbaActiveXAtom, self).__str__() |
| 514 | 637 | compr_text = 'compressed' if self.is_compressed() else 'uncompressed' |
| ... | ... | @@ -546,11 +669,16 @@ def print_records(record, print_fn, indent, do_print_record): |
| 546 | 669 | # for chunk in record.iter_uncompressed(): |
| 547 | 670 | # logging.info('{0}--> "{1}"'.format(' ' * indent, chunk)) |
| 548 | 671 | # writer.write(chunk) |
| 549 | - chunk1 = next(record.iter_uncompressed()) | |
| 550 | - logging.info('{0}--> decompressed size {1}, data {2}...' | |
| 551 | - .format(' ' * indent, record.get_uncompressed_size(), | |
| 552 | - ', '.join('{0:02x}'.format(ord(c)) | |
| 553 | - for c in chunk1[:32]))) | |
| 672 | + | |
| 673 | + #chunk1 = next(record.iter_uncompressed()) | |
| 674 | + #logging.info('{0}--> decompressed size {1}, data {2}...' | |
| 675 | + # .format(' ' * indent, record.get_uncompressed_size(), | |
| 676 | + # ', '.join('{0:02x}'.format(ord(c)) | |
| 677 | + # for c in chunk1[:32]))) | |
| 678 | + | |
| 679 | + ole = record.get_data_as_olefile() | |
| 680 | + for entry in ole.listdir(): | |
| 681 | + logging.info('{0}ole entry {1}'.format(' ' * indent, entry)) | |
| 554 | 682 | |
| 555 | 683 | |
| 556 | 684 | if __name__ == '__main__': | ... | ... |