Commit 27e0a1c88ce0c6da147fb6f9b6224e0e9429bb10
1 parent
87a69ade
bugfixing but failed to correctly parse DocumentContainer :-(
Showing 1 changed file with 185 additions and 47 deletions
oletools/ppt_parser.py
| @@ -121,6 +121,9 @@ class RecordHeader(object): | @@ -121,6 +121,9 @@ class RecordHeader(object): | ||
| 121 | obj.rec_instance, obj.rec_ver = divmod(version_instance, 2**4) | 121 | obj.rec_instance, obj.rec_ver = divmod(version_instance, 2**4) |
| 122 | obj.rec_type, = struct.unpack('<H', stream.read(2)) | 122 | obj.rec_type, = struct.unpack('<H', stream.read(2)) |
| 123 | obj.rec_len, = struct.unpack('<L', stream.read(4)) | 123 | obj.rec_len, = struct.unpack('<L', stream.read(4)) |
| 124 | + log.debug('type is {0:04X}, instance {1:04X}, version {2:04X}, len {3}' | ||
| 125 | + .format(obj.rec_type, obj.rec_instance, obj.rec_ver, | ||
| 126 | + obj.rec_len)) | ||
| 124 | return obj | 127 | return obj |
| 125 | 128 | ||
| 126 | 129 | ||
| @@ -273,35 +276,28 @@ class CurrentUserAtom(PptType): | @@ -273,35 +276,28 @@ class CurrentUserAtom(PptType): | ||
@classmethod
def extract_from(clz, stream):
    """ create instance with info from stream

    Reads the record header, then the fixed-size CurrentUserAtom fields
    (all little-endian) and the ANSI + Unicode user-name strings.
    """

    def read_one(fmt, n_bytes):
        # read n_bytes from stream and unpack exactly one value
        value, = struct.unpack(fmt, stream.read(n_bytes))
        return value

    obj = clz()

    # parse record header
    obj.rec_head = RecordHeader.extract_from(stream)

    obj.size = read_one('<L', 4)
    obj.header_token = read_one('<L', 4)
    obj.offset_to_current_edit = read_one('<L', 4)
    obj.len_user_name = read_one('<H', 2)
    obj.doc_file_version = read_one('<H', 2)
    obj.major_version = read_one('<B', 1)
    obj.minor_version = read_one('<B', 1)
    stream.read(2)  # unused
    obj.ansi_user_name = stream.read(obj.len_user_name)
    obj.rel_version = read_one('<L', 4)
    obj.unicode_user_name = stream.read(2 * obj.len_user_name)

    return obj
| 301 | 297 | ||
| 302 | def check_validity(self): | 298 | def check_validity(self): |
| 303 | errs = self.check_rec_head() | 299 | errs = self.check_rec_head() |
| 304 | - errs.extend(self.check_value('size', size, self.SIZE) | 300 | + errs.extend(self.check_value('size', self.size, self.SIZE)) |
| 305 | errs.extend(self.check_value('headerToken', self.header_token, | 301 | errs.extend(self.check_value('headerToken', self.header_token, |
| 306 | [clz.HEADER_TOKEN_ENCRYPT, | 302 | [clz.HEADER_TOKEN_ENCRYPT, |
| 307 | clz.HEADER_TOKEN_NOCRYPT])) | 303 | clz.HEADER_TOKEN_NOCRYPT])) |
| @@ -567,6 +563,52 @@ class PersistDirectoryEntry(object): | @@ -567,6 +563,52 @@ class PersistDirectoryEntry(object): | ||
| 567 | return errs | 563 | return errs |
| 568 | 564 | ||
| 569 | 565 | ||
class DocInfoListSubContainerOrAtom(PptType):
    """ one of various types found in a DocInfoListContainer

    https://msdn.microsoft.com/en-us/library/dd921705%28v=office.12%29.aspx

    The actual type of this object is determined by the recType field in its
    record header (extract_from dispatches on rec_type; the values listed in
    VALID_RECORD_TYPES are record types).

    Similar to DummyType, RECORD_TYPE varies from instance to instance for
    this type.
    """

    # RECORD_TYPE varies, is specified only in extract_from
    VALID_RECORD_TYPES = [0x1388,  # RT_ProgTags
                          0x0414,  # RT_NormalViewSetInfo9
                          0x0413,  # RT_NotesTextViewInfo9
                          0x0407,  # RT_OutlineViewInfo
                          0x03FA,  # RT_SlideViewInfo
                          0x0408]  # RT_SorterViewInfo

    def __init__(self):
        super(DocInfoListSubContainerOrAtom, self).__init__()

    @classmethod
    def extract_from(clz, stream):
        """ build instance with info read from stream

        If the record header announces a VBAInfoContainer, delegate to that
        class (handing over the already-read header); otherwise skip over the
        record's payload.
        """

        log.debug('Parsing DocInfoListSubContainerOrAtom from stream')

        obj = clz()
        obj.read_rec_head(stream)
        if obj.rec_head.rec_type == VBAInfoContainer.RECORD_TYPE:
            obj = VBAInfoContainer.extract_from(stream, obj.rec_head)
        else:
            log.debug('skipping over {} Byte in DocInfoListSubContainerOrAtom'
                      .format(obj.rec_head.rec_len))
            log.debug('start at pos {}'.format(stream.tell()))
            stream.seek(obj.rec_head.rec_len, os.SEEK_CUR)
            log.debug('now at pos {}'.format(stream.tell()))
        return obj

    def check_validity(self):
        """ can be any of multiple types; check rec_type, return error list

        BUG FIX: check_value takes (name, value, expected) -- see e.g.
        CurrentUserAtom.check_validity -- and its result must be returned
        (DocInfoListContainer.check_validity extends a list with it), not
        discarded.
        """
        return self.check_value('recType', self.rec_head.rec_type,
                                self.VALID_RECORD_TYPES)
| 570 | class DocInfoListContainer(PptType): | 612 | class DocInfoListContainer(PptType): |
| 571 | """ information about the document and document display settings | 613 | """ information about the document and document display settings |
| 572 | 614 | ||
| @@ -579,6 +621,40 @@ class DocInfoListContainer(PptType): | @@ -579,6 +621,40 @@ class DocInfoListContainer(PptType): | ||
| 579 | def __init__(self): | 621 | def __init__(self): |
| 580 | super(DocInfoListContainer, self).__init__() | 622 | super(DocInfoListContainer, self).__init__() |
| 581 | 623 | ||
@classmethod
def extract_from(clz, stream):
    """ build instance with info read from stream

    Reads the record header, then parses DocInfoListSubContainerOrAtom
    children until rh.recLen bytes have been consumed.

    BUG FIX: the parsed object was never returned, so callers always
    received None.
    """

    log.debug('Parsing DocInfoListContainer from stream')
    obj = clz()
    obj.read_rec_head(stream)

    # rgChildRec (variable): An array of DocInfoListSubContainerOrAtom
    # records (section 2.4.5) that specifies information about the document
    # or how the document is displayed. The size, in bytes, of the array is
    # specified by rh.recLen
    curr_pos = stream.tell()
    end_pos = curr_pos + obj.rec_head.rec_len
    log.debug('start reading at pos {}, will read until {}'
              .format(curr_pos, end_pos))
    obj.rg_child_rec = []

    while curr_pos < end_pos:
        # extract_from is a classmethod; call it on the class directly
        # (was: DocInfoListSubContainerOrAtom().extract_from(stream))
        new_obj = DocInfoListSubContainerOrAtom.extract_from(stream)
        obj.rg_child_rec.append(new_obj)
        curr_pos = stream.tell()
        log.debug('now at pos {}'.format(curr_pos))

    log.debug('reached end pos {} ({}). stop reading DocInfoListContainer'
              .format(end_pos, curr_pos))

    return obj
| 651 | + | ||
def check_validity(self):
    """ validate own record header and every parsed child record """
    errs = self.check_rec_head()
    for child in self.rg_child_rec:
        errs.extend(child.check_validity())
    return errs
| 657 | + | ||
| 582 | 658 | ||
| 583 | class DocumentContainer(PptType): | 659 | class DocumentContainer(PptType): |
| 584 | """ a DocumentContainer record | 660 | """ a DocumentContainer record |
| @@ -618,26 +694,32 @@ class DocumentContainer(PptType): | @@ -618,26 +694,32 @@ class DocumentContainer(PptType): | ||
| 618 | 694 | ||
| 619 | this container contains lots of data we are not interested in. | 695 | this container contains lots of data we are not interested in. |
| 620 | """ | 696 | """ |
| 697 | + | ||
| 698 | + log.debug('Parsing DocumentContainer from stream') | ||
| 621 | obj = clz() | 699 | obj = clz() |
| 622 | 700 | ||
| 623 | # parse record header | 701 | # parse record header |
| 624 | obj.read_rec_head(stream) | 702 | obj.read_rec_head(stream) |
| 703 | + log.info('validity: {} errs'.format(len(obj.check_rec_head()))) | ||
| 625 | 704 | ||
| 626 | # documentAtom (48 bytes): A DocumentAtom record (section 2.4.2) that | 705 | # documentAtom (48 bytes): A DocumentAtom record (section 2.4.2) that |
| 627 | # specifies size information for presentation slides and notes slides. | 706 | # specifies size information for presentation slides and notes slides. |
| 628 | obj.document_atom = DummyType('DocumentAtom', 0x03E9, rec_ver=0x1, | 707 | obj.document_atom = DummyType('DocumentAtom', 0x03E9, rec_ver=0x1, |
| 629 | rec_len=0x28).extract_from(stream) | 708 | rec_len=0x28).extract_from(stream) |
| 709 | + log.info('validity: {} errs'.format(len(obj.document_atom.check_validity()))) | ||
| 630 | 710 | ||
| 631 | # exObjList (variable): An optional ExObjListContainer record (section | 711 | # exObjList (variable): An optional ExObjListContainer record (section |
| 632 | # 2.10.1) that specifies the list of external objects in the document. | 712 | # 2.10.1) that specifies the list of external objects in the document. |
| 633 | obj.ex_obj_list = DummyType('ExObjListContainer', 0x0409, rec_ver=0xF)\ | 713 | obj.ex_obj_list = DummyType('ExObjListContainer', 0x0409, rec_ver=0xF)\ |
| 634 | .extract_from(stream) | 714 | .extract_from(stream) |
| 715 | + log.info('validity: {} errs'.format(len(obj.ex_obj_list.check_validity()))) | ||
| 635 | 716 | ||
| 636 | # documentTextInfo (variable): A DocumentTextInfoContainer record | 717 | # documentTextInfo (variable): A DocumentTextInfoContainer record |
| 637 | # (section 2.9.1) that specifies the default text styles for the | 718 | # (section 2.9.1) that specifies the default text styles for the |
| 638 | # document. | 719 | # document. |
| 639 | obj.document_text_info = DummyType('DocumentTextInfoContainer', 0x03F2, | 720 | obj.document_text_info = DummyType('DocumentTextInfoContainer', 0x03F2, |
| 640 | rec_ver=0xF).extract_from(stream) | 721 | rec_ver=0xF).extract_from(stream) |
| 722 | + log.info('validity: {} errs'.format(len(obj.document_text_info.check_validity()))) | ||
| 641 | 723 | ||
| 642 | # soundCollection (variable): An optional SoundCollectionContainer | 724 | # soundCollection (variable): An optional SoundCollectionContainer |
| 643 | # record (section 2.4.16.1) that specifies the list of sounds in the | 725 | # record (section 2.4.16.1) that specifies the list of sounds in the |
| @@ -645,17 +727,20 @@ class DocumentContainer(PptType): | @@ -645,17 +727,20 @@ class DocumentContainer(PptType): | ||
| 645 | obj.sound_collection = DummyType('SoundCollectionContainer', 0x07E4, | 727 | obj.sound_collection = DummyType('SoundCollectionContainer', 0x07E4, |
| 646 | rec_ver=0xF, rec_instance=0x005)\ | 728 | rec_ver=0xF, rec_instance=0x005)\ |
| 647 | .extract_from(stream) | 729 | .extract_from(stream) |
| 730 | + log.info('validity: {} errs'.format(len(obj.sound_collection.check_validity()))) | ||
| 648 | 731 | ||
| 649 | # drawingGroup (variable): A DrawingGroupContainer record (section | 732 | # drawingGroup (variable): A DrawingGroupContainer record (section |
| 650 | # 2.4.3) that specifies drawing information for the document. | 733 | # 2.4.3) that specifies drawing information for the document. |
| 651 | obj.drawing_group = DummyType('DrawingGroupContainer', 0x040B, | 734 | obj.drawing_group = DummyType('DrawingGroupContainer', 0x040B, |
| 652 | rec_ver=0xF).extract_from(stream) | 735 | rec_ver=0xF).extract_from(stream) |
| 736 | + log.info('validity: {} errs'.format(len(obj.drawing_group.check_validity()))) | ||
| 653 | 737 | ||
| 654 | # masterList (variable): A MasterListWithTextContainer record (section | 738 | # masterList (variable): A MasterListWithTextContainer record (section |
| 655 | # 2.4.14.1) that specifies the list of main master slides and title | 739 | # 2.4.14.1) that specifies the list of main master slides and title |
| 656 | # master slides. | 740 | # master slides. |
| 657 | obj.master_list = DummyType('MasterListWithContainer', 0x0FF0, | 741 | obj.master_list = DummyType('MasterListWithContainer', 0x0FF0, |
| 658 | rec_ver=0xF).extract_from(stream) | 742 | rec_ver=0xF).extract_from(stream) |
| 743 | + log.info('validity: {} errs'.format(len(obj.master_list.check_validity()))) | ||
| 659 | 744 | ||
| 660 | # docInfoList (variable): An optional DocInfoListContainer record | 745 | # docInfoList (variable): An optional DocInfoListContainer record |
| 661 | # (section 2.4.4) that specifies additional document information. | 746 | # (section 2.4.4) that specifies additional document information. |
| @@ -747,15 +832,16 @@ class VBAInfoContainer(PptType): | @@ -747,15 +832,16 @@ class VBAInfoContainer(PptType): | ||
| 747 | self.vba_info_atom = None | 832 | self.vba_info_atom = None |
| 748 | 833 | ||
@classmethod
def extract_from(clz, stream, rec_head):
    """ build instance from stream; record header is passed as argument

    The caller can only recognize a VBAInfoContainer after it has already
    read the record header, so the header is received here instead of
    being read from the stream.
    """
    log.debug('parsing VBAInfoContainer')
    obj = clz()
    obj.rec_head = rec_head
    obj.vba_info_atom = VBAInfoAtom.extract_from(stream)
    return obj
| 756 | 843 | ||
def check_validity(self):
    """ validate own record header (fixed length 0x14) and contained atom """
    errs = self.check_rec_head(length=0x14)
    errs.extend(self.vba_info_atom.check_validity())
    return errs
| @@ -768,6 +854,7 @@ class VBAInfoAtom(PptType): | @@ -768,6 +854,7 @@ class VBAInfoAtom(PptType): | ||
| 768 | """ | 854 | """ |
| 769 | 855 | ||
| 770 | RECORD_TYPE = 0x0400 | 856 | RECORD_TYPE = 0x0400 |
| 857 | + RECORD_VERSION = 0x2 | ||
| 771 | 858 | ||
| 772 | def __init__(self): | 859 | def __init__(self): |
| 773 | super(VBAInfoAtom, self).__init__() | 860 | super(VBAInfoAtom, self).__init__() |
| @@ -803,9 +890,9 @@ class VBAInfoAtom(PptType): | @@ -803,9 +890,9 @@ class VBAInfoAtom(PptType): | ||
| 803 | errs = self.check_rec_head(length=0x14) | 890 | errs = self.check_rec_head(length=0x14) |
| 804 | 891 | ||
| 805 | # must be 0 or 1: | 892 | # must be 0 or 1: |
| 806 | - errs.extend(self.check_range('fHasMacros', self.f_has_macros, None, 2) | ||
| 807 | - errs.extend(self.check_value('version', self.version, 2) | ||
| 808 | - return errs | 893 | + errs.extend(self.check_range('fHasMacros', self.f_has_macros, None, 2)) |
| 894 | + errs.extend(self.check_value('version', self.version, 2)) | ||
| 895 | + return errs | ||
| 809 | 896 | ||
| 810 | # === PptParser =============================================================== | 897 | # === PptParser =============================================================== |
| 811 | 898 | ||
| @@ -919,6 +1006,9 @@ class PptParser(object): | @@ -919,6 +1006,9 @@ class PptParser(object): | ||
| 919 | log.warning('re-reading and overwriting ' | 1006 | log.warning('re-reading and overwriting ' |
| 920 | 'previously read persist_object_directory') | 1007 | 'previously read persist_object_directory') |
| 921 | 1008 | ||
| 1009 | + # Step 1: Read the CurrentUserAtom record (section 2.3.2) from the | ||
| 1010 | + # Current User Stream (section 2.1.1). All seek operations in the steps | ||
| 1011 | + # that follow this step are in the PowerPoint Document Stream. | ||
| 922 | if self.current_user_atom is None: | 1012 | if self.current_user_atom is None: |
| 923 | self.parse_current_user() | 1013 | self.parse_current_user() |
| 924 | 1014 | ||
| @@ -931,9 +1021,17 @@ class PptParser(object): | @@ -931,9 +1021,17 @@ class PptParser(object): | ||
| 931 | try: | 1021 | try: |
| 932 | log.debug('opening stream') | 1022 | log.debug('opening stream') |
| 933 | stream = self.ole.openstream(MAIN_STREAM_NAME) | 1023 | stream = self.ole.openstream(MAIN_STREAM_NAME) |
| 1024 | + | ||
| 1025 | + # Repeat steps 3 through 6 until offsetLastEdit is 0x00000000. | ||
| 934 | while offset != 0: | 1026 | while offset != 0: |
| 935 | 1027 | ||
| 1028 | + # Step 2: Seek, in the PowerPoint Document Stream, to the | ||
| 1029 | + # offset specified by the offsetToCurrentEdit field of the | ||
| 1030 | + # CurrentUserAtom record identified in step 1. | ||
| 936 | stream.seek(offset, os.SEEK_SET) | 1031 | stream.seek(offset, os.SEEK_SET) |
| 1032 | + | ||
| 1033 | + # Step 3: Read the UserEditAtom record at the current offset. | ||
| 1034 | + # Let this record be a live record. | ||
| 937 | user_edit = UserEditAtom.extract_from(stream, is_encrypted) | 1035 | user_edit = UserEditAtom.extract_from(stream, is_encrypted) |
| 938 | if self.newest_user_edit is None: | 1036 | if self.newest_user_edit is None: |
| 939 | self.newest_user_edit = user_edit | 1037 | self.newest_user_edit = user_edit |
| @@ -948,10 +1046,15 @@ class PptParser(object): | @@ -948,10 +1046,15 @@ class PptParser(object): | ||
| 948 | if errs and self.fast_fail: | 1046 | if errs and self.fast_fail: |
| 949 | raise errs[0] | 1047 | raise errs[0] |
| 950 | 1048 | ||
| 1049 | + # Step 4: Seek to the offset specified by the | ||
| 1050 | + # offsetPersistDirectory field of the UserEditAtom record | ||
| 1051 | + # identified in step 3. | ||
| 951 | log.debug('seeking to pos {}' | 1052 | log.debug('seeking to pos {}' |
| 952 | .format(user_edit.offset_persist_directory)) | 1053 | .format(user_edit.offset_persist_directory)) |
| 953 | stream.seek(user_edit.offset_persist_directory, os.SEEK_SET) | 1054 | stream.seek(user_edit.offset_persist_directory, os.SEEK_SET) |
| 954 | 1055 | ||
| 1056 | + # Step 5: Read the PersistDirectoryAtom record at the current | ||
| 1057 | + # offset. Let this record be a live record. | ||
| 955 | persist_dir_atom = PersistDirectoryAtom.extract_from(stream) | 1058 | persist_dir_atom = PersistDirectoryAtom.extract_from(stream) |
| 956 | 1059 | ||
| 957 | log.debug('checking validity') | 1060 | log.debug('checking validity') |
| @@ -965,14 +1068,37 @@ class PptParser(object): | @@ -965,14 +1068,37 @@ class PptParser(object): | ||
| 965 | if errs and self.fast_fail: | 1068 | if errs and self.fast_fail: |
| 966 | raise errs[0] | 1069 | raise errs[0] |
| 967 | 1070 | ||
| 1071 | + | ||
| 1072 | + # Construct the complete persist object directory for this file | ||
| 1073 | + # as follows: | ||
| 1074 | + # - For each PersistDirectoryAtom record previously identified | ||
| 1075 | + # in step 5, add the persist object identifier and persist | ||
| 1076 | + # object stream offset pairs to the persist object directory | ||
| 1077 | + # starting with the PersistDirectoryAtom record last | ||
| 1078 | + # identified, that is, the one closest to the beginning of the | ||
| 1079 | + # stream. | ||
| 1080 | + # - Continue adding these pairs to the persist object directory | ||
| 1081 | + # for each PersistDirectoryAtom record in the reverse order | ||
| 1082 | + # that they were identified in step 5; that is, the pairs from | ||
| 1083 | + # the PersistDirectoryAtom record closest to the end of the | ||
| 1084 | + # stream are added last. | ||
| 1085 | + # - When adding a new pair to the persist object directory, if | ||
| 1086 | + # the persist object identifier already exists in the persist | ||
| 1087 | + # object directory, the persist object stream offset from the | ||
| 1088 | + # new pair replaces the existing persist object stream offset | ||
| 1089 | + # for that persist object identifier. | ||
| 968 | for entry in persist_dir_atom.rg_persist_dir_entry: | 1090 | for entry in persist_dir_atom.rg_persist_dir_entry: |
| 969 | - log.debug('saving {} offsets for persist_id {}' | ||
| 970 | - .format(len(entry.rg_persist_offset), | ||
| 971 | - entry.persist_id)) | ||
| 972 | - self.persist_object_directory[entry.persist_id] = \ | ||
| 973 | - entry.rg_persist_offset | 1091 | + last_id = entry.persist_id+len(entry.rg_persist_offset)-1 |
| 1092 | + log.debug('for persist IDs {}-{}, save offsets {}' | ||
| 1093 | + .format(entry.persist_id, last_id, | ||
| 1094 | + entry.rg_persist_offset)) | ||
| 1095 | + for count, offset in enumerate(entry.rg_persist_offset): | ||
| 1096 | + self.persist_object_directory[entry.persist_id+count] \ | ||
| 1097 | + = offset | ||
| 974 | 1098 | ||
| 975 | # check for more | 1099 | # check for more |
| 1100 | + # Step 6: Seek to the offset specified by the offsetLastEdit | ||
| 1101 | + # field in the UserEditAtom record identified in step 3. | ||
| 976 | offset = user_edit.offset_last_edit | 1102 | offset = user_edit.offset_last_edit |
| 977 | except Exception: | 1103 | except Exception: |
| 978 | if self.fast_fail: | 1104 | if self.fast_fail: |
| @@ -985,27 +1111,36 @@ class PptParser(object): | @@ -985,27 +1111,36 @@ class PptParser(object): | ||
| 985 | stream.close() | 1111 | stream.close() |
| 986 | 1112 | ||
| 987 | def parse_document_persist_object(self): | 1113 | def parse_document_persist_object(self): |
| 988 | - """ """ | 1114 | + """ Part 2: Identify the document persist object """ |
| 989 | if self.document_persist_obj is not None: | 1115 | if self.document_persist_obj is not None: |
| 990 | log.warning('re-reading and overwriting ' | 1116 | log.warning('re-reading and overwriting ' |
| 991 | 'previously read document_persist_object') | 1117 | 'previously read document_persist_object') |
| 992 | 1118 | ||
| 1119 | + # Step 1: Read the docPersistIdRef field of the UserEditAtom record | ||
| 1120 | + # first identified in step 3 of Part 1, that is, the UserEditAtom | ||
| 1121 | + # record closest to the end of the stream. | ||
| 993 | if self.persist_object_directory is None: | 1122 | if self.persist_object_directory is None: |
| 994 | self.parse_persist_object_directory() | 1123 | self.parse_persist_object_directory() |
| 995 | 1124 | ||
| 996 | - # find the offset of the document container | 1125 | + # Step 2: Lookup the value of the docPersistIdRef field in the persist |
| 1126 | + # object directory constructed in step 8 of Part 1 to find the stream | ||
| 1127 | + # offset of a persist object. | ||
| 997 | newest_ref = self.newest_user_edit.doc_persist_id_ref | 1128 | newest_ref = self.newest_user_edit.doc_persist_id_ref |
| 998 | offset = self.persist_object_directory[newest_ref] | 1129 | offset = self.persist_object_directory[newest_ref] |
| 999 | - raise NotImplementedError('should have 1 offset here!') | 1130 | + log.debug('newest user edit ID is {}, offset is {}' |
| 1131 | + .format(newest_ref, offset)) | ||
| 1000 | 1132 | ||
| 1001 | stream = None | 1133 | stream = None |
| 1002 | 1134 | ||
| 1003 | try: | 1135 | try: |
| 1136 | + # Step 3: Seek to the stream offset specified in step 2. | ||
| 1004 | log.debug('opening stream') | 1137 | log.debug('opening stream') |
| 1005 | stream = self.ole.openstream(MAIN_STREAM_NAME) | 1138 | stream = self.ole.openstream(MAIN_STREAM_NAME) |
| 1006 | - log.debug('stream pos: {}'.format(stream.tell())) | ||
| 1007 | - stream.seek(offset) | ||
| 1008 | - log.debug('seek by {} to {}'.format(offset, stream.tell())) | 1139 | + log.debug('seek to {}'.format(offset)) |
| 1140 | + stream.seek(offset, os.SEEK_SET) | ||
| 1141 | + | ||
| 1142 | + # Step 4: Read the DocumentContainer record at the current offset. | ||
| 1143 | + # Let this record be a live record. | ||
| 1009 | self.document_persist_obj = DocumentContainer.extract_from(stream) | 1144 | self.document_persist_obj = DocumentContainer.extract_from(stream) |
| 1010 | except Exception: | 1145 | except Exception: |
| 1011 | if self.fast_fail: | 1146 | if self.fast_fail: |
| @@ -1032,17 +1167,20 @@ class PptParser(object): | @@ -1032,17 +1167,20 @@ class PptParser(object): | ||
| 1032 | def test(): | 1167 | def test(): |
| 1033 | """ for testing and debugging """ | 1168 | """ for testing and debugging """ |
| 1034 | 1169 | ||
| 1170 | + from glob import glob | ||
| 1171 | + | ||
| 1035 | # setup logging | 1172 | # setup logging |
| 1036 | logging.basicConfig(level=logging.DEBUG, | 1173 | logging.basicConfig(level=logging.DEBUG, |
| 1037 | format='%(levelname)-8s %(message)s') | 1174 | format='%(levelname)-8s %(message)s') |
| 1038 | log.setLevel(logging.NOTSET) | 1175 | log.setLevel(logging.NOTSET) |
| 1039 | 1176 | ||
| 1040 | - # test file with some autostart macros | ||
| 1041 | - test_file = 'gelaber_autostart.ppt' | ||
| 1042 | - | ||
| 1043 | - # parse | ||
| 1044 | - ppt = PptParser(test_file, fast_fail=False) | ||
| 1045 | - ppt.parse_document_persist_object() | 1177 | + #test_file = 'gelaber_autostart.ppt' |
| 1178 | + for file_name in glob('*.ppt'): | ||
| 1179 | + # parse | ||
| 1180 | + log.info('-' * 72) | ||
| 1181 | + log.info('test file: {}'.format(file_name)) | ||
| 1182 | + ppt = PptParser(file_name, fast_fail=False) | ||
| 1183 | + ppt.parse_document_persist_object() | ||
| 1046 | 1184 | ||
| 1047 | 1185 | ||
| 1048 | if __name__ == '__main__': | 1186 | if __name__ == '__main__': |