Commit 795647119dfba810e0c136e93de6a5c27f1abb00
1 parent
a93b2109
ppt_record_parser: provide OleFileIO from embedded files
This was not easy to do while avoiding having the complete embedded file in uncompressed form in memory. It required creating a stream around an iterable, kind of fun :-)
Showing
1 changed file
with
133 additions
and
5 deletions
oletools/ppt_record_parser.py
| @@ -432,6 +432,117 @@ class PptRecordExOleObjAtom(PptRecord): | @@ -432,6 +432,117 @@ class PptRecordExOleObjAtom(PptRecord): | ||
| 432 | self.SUB_TYPES.get(self.sub_type, str(self.sub_type))) | 432 | self.SUB_TYPES.get(self.sub_type, str(self.sub_type))) |
| 433 | 433 | ||
| 434 | 434 | ||
class IterStream(io.RawIOBase):
    """ make a read-only, seekable bytes-stream from an iterable

    Data is pulled from the iterable chunk by chunk; apart from the unread
    rest of the current chunk nothing is buffered. Seeking forward reads and
    discards data, seeking backwards re-creates the iterable through
    `iterable_creator` and reads forward again from the start.

    copied from stackoverflow answer by Mechanical snail from Nov 18th 2013
    https://stackoverflow.com/a/20260030/4405656 and extended
    """

    # upper bound for the scratch buffer used to discard data in seek(), so
    # that skipping far ahead does not allocate a buffer of that size
    _SKIP_BUFFER_SIZE = 64 * 1024

    def __init__(self, iterable_creator, size=None):
        """ create a Stream using a function that creates the iterable

        :param iterable_creator: callable without arguments; each call must
                                 return a fresh iterable yielding the stream
                                 contents as chunks of bytes
        :param size: total number of bytes in the stream if known, else
                     None; needed for seeking relative to the end
        """
        super(IterStream, self).__init__()
        self.iterable_creator = iterable_creator
        self.size = size
        logging.debug('IterStream.size is %s', self.size)
        self.reset()

    def reset(self):
        """ re-set stream to state right after creation """
        self.iterable = None    # created lazily on first read
        self.leftover = None    # unread rest of the last chunk
        self.at_end = False     # True after exhaustion or jump to the end
        self.curr_pos = 0       # absolute position within the stream

    def writable(self):
        """ this stream cannot be written to """
        return False

    def readable(self):
        """ this stream can be read from """
        return True

    def seekable(self):
        """ this stream supports seek() (with the limits described there) """
        return True

    def _check_open(self):
        """ raise ValueError if the stream has been closed """
        if self.closed:
            raise ValueError('I/O operation on closed IterStream')

    def readinto(self, target):
        """ read data from iterable into target

        Copies data from at most one chunk per call, so fewer than
        len(target) bytes may be written even if more data is available.

        :param target: writable buffer (e.g. bytearray) to fill
        :returns: number of bytes written; 0 means end of stream (or that
                  target has length 0)
        :raises ValueError: if the stream is closed
        """
        self._check_open()
        target_len = len(target)    # we should return at most this much
        logging.debug('IterStream.readinto size %s', target_len)
        if self.at_end:
            logging.debug('IterStream: we are at (fake) end')
            return 0
        if target_len == 0:
            return 0
        if self.iterable is None:
            # iter() so that creators returning e.g. a list work as well
            self.iterable = iter(self.iterable_creator())
            logging.debug('IterStream: created iterable %s', self.iterable)

        # Skip empty chunks: returning 0 for them would wrongly signal EOF.
        chunk = self.leftover
        while not chunk:
            try:
                chunk = next(self.iterable)
            except StopIteration:
                logging.debug('IterStream: source iterable exhausted')
                self.leftover = None
                self.at_end = True
                return 0    # indicate EOF

        output, self.leftover = chunk[:target_len], chunk[target_len:]
        n_bytes = len(output)
        target[:n_bytes] = output
        self.curr_pos += n_bytes
        logging.debug('IterStream: returning %s bytes, %s left over, pos '
                      'updated to %s',
                      n_bytes, len(self.leftover), self.curr_pos)
        return n_bytes

    def seek(self, offset, whence=io.SEEK_SET):
        """ change the stream position

        Forward seeks read and discard data, backward seeks restart the
        iterable. Seeking relative to the end needs the size given at
        construction.

        :param offset: offset relative to the position selected by whence
        :param whence: io.SEEK_SET (default), io.SEEK_CUR or io.SEEK_END
        :returns: new absolute position
        :raises IOError: if whence is invalid, or is SEEK_END while size is
                         unknown
        :raises ValueError: if the resulting position is negative or the
                            stream is closed
        :raises RuntimeError: if the stream ends before the requested
                              position is reached
        """
        self._check_open()

        if whence == io.SEEK_CUR:
            if offset == 0:     # e.g. called by tell()
                return self.curr_pos
            return self.seek(self.curr_pos + offset, io.SEEK_SET)

        if whence == io.SEEK_END:
            logging.debug('IterStream: seek to end')
            if self.size is None:
                logging.debug('IterStream: trying to seek to end but size '
                              'unknown --> raise IOError')
                raise IOError('size unknown, cannot seek to end')
            if offset != 0:
                return self.seek(self.size + offset, io.SEEK_SET)
            self.at_end = True      # fake jumping to the end
            self.iterable = None    # cannot safely be used any more
            self.leftover = None
            self.curr_pos = self.size
            return self.size

        if whence != io.SEEK_SET:
            logging.debug('Illegal 2nd argument to seek(): %s', whence)
            raise IOError('Illegal 2nd argument to seek(): {0}'
                          .format(whence))

        if offset < 0:
            raise ValueError('negative seek position {0}'.format(offset))
        if offset == 0:
            logging.debug('IterStream: seek to start')
            self.reset()
            return 0

        logging.debug('IterStream: trying to seek to offset %s.', offset)
        if offset < self.curr_pos:  # need to re-create iterable
            self.reset()
        remaining = offset - self.curr_pos
        # readinto() delivers at most one chunk per call, so keep reading
        # until the target position is reached or the data runs out
        while remaining > 0:
            scratch = bytearray(min(remaining, self._SKIP_BUFFER_SIZE))
            n_skipped = self.readinto(scratch)
            if n_skipped == 0:
                break
            remaining -= n_skipped
        if self.curr_pos != offset:
            logging.debug('IterStream: curr_pos %s != offset %s!',
                          self.curr_pos, offset)
            raise RuntimeError('IterStream.seek: could not reach offset {0}, '
                               'stream ended at {1}'
                               .format(offset, self.curr_pos))
        return self.curr_pos

    def close(self):
        """ drop the iterable and mark the stream as closed """
        self.reset()
        # let the base class set the `closed` flag
        super(IterStream, self).close()
| 544 | + | ||
| 545 | + | ||
| 435 | class PptRecordExOleVbaActiveXAtom(PptRecord): | 546 | class PptRecordExOleVbaActiveXAtom(PptRecord): |
| 436 | """ record that contains and ole object / vba storage / active x control | 547 | """ record that contains and ole object / vba storage / active x control |
| 437 | 548 | ||
| @@ -509,6 +620,18 @@ class PptRecordExOleVbaActiveXAtom(PptRecord): | @@ -509,6 +620,18 @@ class PptRecordExOleVbaActiveXAtom(PptRecord): | ||
| 509 | logging.warning('Decompressed data has wrong size {0} != {1}' | 620 | logging.warning('Decompressed data has wrong size {0} != {1}' |
| 510 | .format(out_size, self.get_uncompressed_size())) | 621 | .format(out_size, self.get_uncompressed_size())) |
| 511 | 622 | ||
def get_data_as_olefile(self, debug_output=False):
    """ wrap the uncompressed record data in an OleFileIO

    The data is not decompressed into memory up front; an IterStream built
    on `iter_uncompressed` is handed to OleFileIO instead.

    Probably only works if the data is an OLE object, otherwise expect an
    exception.

    :param debug_output: if True, enable olefile logging and pass debug=True
                         to OleFileIO
    :returns: record_base.OleFileIO reading from the streamed data
    """
    if debug_output:
        record_base.enable_olefile_logging()
    data_stream = IterStream(self.iter_uncompressed,
                             self.get_uncompressed_size())
    return record_base.OleFileIO(data_stream, debug=debug_output)
| 634 | + | ||
| 512 | def __str__(self): | 635 | def __str__(self): |
| 513 | text = super(PptRecordExOleVbaActiveXAtom, self).__str__() | 636 | text = super(PptRecordExOleVbaActiveXAtom, self).__str__() |
| 514 | compr_text = 'compressed' if self.is_compressed() else 'uncompressed' | 637 | compr_text = 'compressed' if self.is_compressed() else 'uncompressed' |
| @@ -546,11 +669,16 @@ def print_records(record, print_fn, indent, do_print_record): | @@ -546,11 +669,16 @@ def print_records(record, print_fn, indent, do_print_record): | ||
| 546 | # for chunk in record.iter_uncompressed(): | 669 | # for chunk in record.iter_uncompressed(): |
| 547 | # logging.info('{0}--> "{1}"'.format(' ' * indent, chunk)) | 670 | # logging.info('{0}--> "{1}"'.format(' ' * indent, chunk)) |
| 548 | # writer.write(chunk) | 671 | # writer.write(chunk) |
| 549 | - chunk1 = next(record.iter_uncompressed()) | ||
| 550 | - logging.info('{0}--> decompressed size {1}, data {2}...' | ||
| 551 | - .format(' ' * indent, record.get_uncompressed_size(), | ||
| 552 | - ', '.join('{0:02x}'.format(ord(c)) | ||
| 553 | - for c in chunk1[:32]))) | 672 | + |
| 673 | + #chunk1 = next(record.iter_uncompressed()) | ||
| 674 | + #logging.info('{0}--> decompressed size {1}, data {2}...' | ||
| 675 | + # .format(' ' * indent, record.get_uncompressed_size(), | ||
| 676 | + # ', '.join('{0:02x}'.format(ord(c)) | ||
| 677 | + # for c in chunk1[:32]))) | ||
| 678 | + | ||
| 679 | + ole = record.get_data_as_olefile() | ||
| 680 | + for entry in ole.listdir(): | ||
| 681 | + logging.info('{0}ole entry {1}'.format(' ' * indent, entry)) | ||
| 554 | 682 | ||
| 555 | 683 | ||
| 556 | if __name__ == '__main__': | 684 | if __name__ == '__main__': |