Commit 27e0a1c88ce0c6da147fb6f9b6224e0e9429bb10

Authored by Christian Herdtweck
1 parent 87a69ade

Bug fixing, but still fails to correctly parse DocumentContainer :-(

Showing 1 changed file with 185 additions and 47 deletions
oletools/ppt_parser.py
... ... @@ -121,6 +121,9 @@ class RecordHeader(object):
121 121 obj.rec_instance, obj.rec_ver = divmod(version_instance, 2**4)
122 122 obj.rec_type, = struct.unpack('<H', stream.read(2))
123 123 obj.rec_len, = struct.unpack('<L', stream.read(4))
  124 + log.debug('type is {0:04X}, instance {1:04X}, version {2:04X}, len {3}'
  125 + .format(obj.rec_type, obj.rec_instance, obj.rec_ver,
  126 + obj.rec_len))
124 127 return obj
125 128  
126 129  
... ... @@ -273,35 +276,28 @@ class CurrentUserAtom(PptType):
273 276 def extract_from(clz, stream):
274 277 """ create instance with info from stream """
275 278  
276   - stream = None
277   - try:
278   - obj = clz()
279   -
280   - # parse record header
281   - obj.rec_head = RecordHeader.extract_from(stream)
  279 + obj = clz()
282 280  
283   - size, = struct.unpack('<L', stream.read(4))
284   - obj.header_token, = struct.unpack('<L', stream.read(4))
285   - obj.offset_to_current_edit, = struct.unpack('<L', stream.read(4))
286   - obj.len_user_name, = struct.unpack('<H', stream.read(2))
287   - obj.doc_file_version, = struct.unpack('<H', stream.read(2))
288   - obj.major_version, = struct.unpack('<B', stream.read(1))
289   - obj.minor_version, = struct.unpack('<B', stream.read(1))
290   - stream.read(2) # unused
291   - obj.ansi_user_name = stream.read(obj.len_user_name)
292   - obj.rel_version, = struct.unpack('<L', stream.read(4))
293   - obj.unicode_user_name = stream.read(2 * obj.len_user_name)
  281 + # parse record header
  282 + obj.rec_head = RecordHeader.extract_from(stream)
294 283  
295   - return obj
  284 + obj.size, = struct.unpack('<L', stream.read(4))
  285 + obj.header_token, = struct.unpack('<L', stream.read(4))
  286 + obj.offset_to_current_edit, = struct.unpack('<L', stream.read(4))
  287 + obj.len_user_name, = struct.unpack('<H', stream.read(2))
  288 + obj.doc_file_version, = struct.unpack('<H', stream.read(2))
  289 + obj.major_version, = struct.unpack('<B', stream.read(1))
  290 + obj.minor_version, = struct.unpack('<B', stream.read(1))
  291 + stream.read(2) # unused
  292 + obj.ansi_user_name = stream.read(obj.len_user_name)
  293 + obj.rel_version, = struct.unpack('<L', stream.read(4))
  294 + obj.unicode_user_name = stream.read(2 * obj.len_user_name)
296 295  
297   - finally:
298   - if stream is not None:
299   - log.debug('closing stream')
300   - stream.close()
  296 + return obj
301 297  
302 298 def check_validity(self):
303 299 errs = self.check_rec_head()
304   - errs.extend(self.check_value('size', size, self.SIZE)
  300 + errs.extend(self.check_value('size', self.size, self.SIZE))
305 301 errs.extend(self.check_value('headerToken', self.header_token,
306 302 [clz.HEADER_TOKEN_ENCRYPT,
307 303 clz.HEADER_TOKEN_NOCRYPT]))
... ... @@ -567,6 +563,52 @@ class PersistDirectoryEntry(object):
567 563 return errs
568 564  
569 565  
class DocInfoListSubContainerOrAtom(PptType):
    """ one of various types found in a DocInfoListContainer

    https://msdn.microsoft.com/en-us/library/dd921705%28v=office.12%29.aspx

    actual type of this object is determined by the recType field in its
    Record Head (the original docstring said recVersion, but the code below
    dispatches on rec_type)

    Similar to DummyType, RECORD_TYPE varies from instance to instance for this
    type
    """

    # RECORD_TYPE varies, is specified only in extract_from
    VALID_RECORD_TYPES = [0x1388,  # self.RECORD_TYPE_PROG_TAGS, \
                          0x0414,  # self.RECORD_TYPE_NORMAL_VIEW_SET_INFO_9, \
                          0x0413,  # self.RECORD_TYPE_NOTES_TEXT_VIEW_INFO_9, \
                          0x0407,  # self.RECORD_TYPE_OUTLINE_VIEW_INFO, \
                          0x03FA,  # self.RECORD_TYPE_SLIDE_VIEW_INFO, \
                          0x0408]  # self.RECORD_TYPE_SORTER_VIEW_INFO

    def __init__(self):
        super(DocInfoListSubContainerOrAtom, self).__init__()

    @classmethod
    def extract_from(clz, stream):
        """ build instance with info read from stream

        If the record header identifies a VBAInfoContainer, delegate to
        VBAInfoContainer.extract_from (passing the already-read header);
        otherwise skip over the record's payload without parsing it.
        """

        log.debug('Parsing DocInfoListSubContainerOrAtom from stream')

        obj = clz()
        obj.read_rec_head(stream)
        if obj.rec_head.rec_type == VBAInfoContainer.RECORD_TYPE:
            obj = VBAInfoContainer.extract_from(stream, obj.rec_head)
        else:
            log.debug('skipping over {} Byte in DocInfoListSubContainerOrAtom'
                      .format(obj.rec_head.rec_len))
            log.debug('start at pos {}'.format(stream.tell()))
            stream.seek(obj.rec_head.rec_len, os.SEEK_CUR)
            log.debug('now at pos {}'.format(stream.tell()))
        return obj

    def check_validity(self):
        """ can be any of multiple types; check rec_type is one of them

        Returns a list of errors, like the other check_validity methods.

        bug fix: check_value takes (name, value, expected) -- the two-argument
        call raised TypeError -- and its result was silently discarded instead
        of being returned to the caller.
        """
        return self.check_value('rec_type', self.rec_head.rec_type,
                                self.VALID_RECORD_TYPES)
  610 +
  611 +
570 612 class DocInfoListContainer(PptType):
571 613 """ information about the document and document display settings
572 614  
... ... @@ -579,6 +621,40 @@ class DocInfoListContainer(PptType):
579 621 def __init__(self):
580 622 super(DocInfoListContainer, self).__init__()
581 623  
  624 + @classmethod
  625 + def extract_from(clz, stream):
  626 + """ build instance with info read from stream """
  627 +
  628 + log.debug('Parsing DocInfoListContainer from stream')
  629 + obj = clz()
  630 + obj.read_rec_head(stream)
  631 +
  632 + # rgChildRec (variable): An array of DocInfoListSubContainerOrAtom
  633 + # records (section 2.4.5) that specifies information about the document
  634 + # or how the document is displayed. The size, in bytes, of the array is
  635 + # specified by rh.recLen
  636 + curr_pos = stream.tell()
  637 + end_pos = curr_pos + obj.rec_head.rec_len
  638 + log.debug('start reading at pos {}, will read until {}'
  639 + .format(curr_pos, end_pos))
  640 + bytes_read = 0
  641 + obj.rg_child_rec = []
  642 +
  643 + while curr_pos < end_pos:
  644 + new_obj = DocInfoListSubContainerOrAtom().extract_from(stream)
  645 + obj.rg_child_rec.append(new_obj)
  646 + curr_pos = stream.tell()
  647 + log.debug('now at pos {}'.format(curr_pos))
  648 +
  649 + log.debug('reached end pos {} ({}). stop reading DocInfoListContainer'
  650 + .format(end_pos, curr_pos))
  651 +
  652 + def check_validity(self):
  653 + errs = self.check_rec_head()
  654 + for obj in self.rg_child_rec:
  655 + errs.extend(obj.check_validity())
  656 + return errs
  657 +
582 658  
583 659 class DocumentContainer(PptType):
584 660 """ a DocumentContainer record
... ... @@ -618,26 +694,32 @@ class DocumentContainer(PptType):
618 694  
619 695 this container contains lots of data we are not interested in.
620 696 """
  697 +
  698 + log.debug('Parsing DocumentContainer from stream')
621 699 obj = clz()
622 700  
623 701 # parse record header
624 702 obj.read_rec_head(stream)
  703 + log.info('validity: {} errs'.format(len(obj.check_rec_head())))
625 704  
626 705 # documentAtom (48 bytes): A DocumentAtom record (section 2.4.2) that
627 706 # specifies size information for presentation slides and notes slides.
628 707 obj.document_atom = DummyType('DocumentAtom', 0x03E9, rec_ver=0x1,
629 708 rec_len=0x28).extract_from(stream)
  709 + log.info('validity: {} errs'.format(len(obj.document_atom.check_validity())))
630 710  
631 711 # exObjList (variable): An optional ExObjListContainer record (section
632 712 # 2.10.1) that specifies the list of external objects in the document.
633 713 obj.ex_obj_list = DummyType('ExObjListContainer', 0x0409, rec_ver=0xF)\
634 714 .extract_from(stream)
  715 + log.info('validity: {} errs'.format(len(obj.ex_obj_list.check_validity())))
635 716  
636 717 # documentTextInfo (variable): A DocumentTextInfoContainer record
637 718 # (section 2.9.1) that specifies the default text styles for the
638 719 # document.
639 720 obj.document_text_info = DummyType('DocumentTextInfoContainer', 0x03F2,
640 721 rec_ver=0xF).extract_from(stream)
  722 + log.info('validity: {} errs'.format(len(obj.document_text_info.check_validity())))
641 723  
642 724 # soundCollection (variable): An optional SoundCollectionContainer
643 725 # record (section 2.4.16.1) that specifies the list of sounds in the
... ... @@ -645,17 +727,20 @@ class DocumentContainer(PptType):
645 727 obj.sound_collection = DummyType('SoundCollectionContainer', 0x07E4,
646 728 rec_ver=0xF, rec_instance=0x005)\
647 729 .extract_from(stream)
  730 + log.info('validity: {} errs'.format(len(obj.sound_collection.check_validity())))
648 731  
649 732 # drawingGroup (variable): A DrawingGroupContainer record (section
650 733 # 2.4.3) that specifies drawing information for the document.
651 734 obj.drawing_group = DummyType('DrawingGroupContainer', 0x040B,
652 735 rec_ver=0xF).extract_from(stream)
  736 + log.info('validity: {} errs'.format(len(obj.drawing_group.check_validity())))
653 737  
654 738 # masterList (variable): A MasterListWithTextContainer record (section
655 739 # 2.4.14.1) that specifies the list of main master slides and title
656 740 # master slides.
657 741 obj.master_list = DummyType('MasterListWithContainer', 0x0FF0,
658 742 rec_ver=0xF).extract_from(stream)
  743 + log.info('validity: {} errs'.format(len(obj.master_list.check_validity())))
659 744  
660 745 # docInfoList (variable): An optional DocInfoListContainer record
661 746 # (section 2.4.4) that specifies additional document information.
... ... @@ -747,15 +832,16 @@ class VBAInfoContainer(PptType):
747 832 self.vba_info_atom = None
748 833  
749 834 @classmethod
750   - def extract_from(clz, stream):
  835 + def extract_from(clz, stream, rec_head):
  836 + """ since can determine this type only after reading header, it is arg
  837 + """
751 838 log.debug('parsing VBAInfoContainer')
752 839 obj = clz()
753   - obj.read_rec_head()
  840 + obj.rec_head = rec_head
754 841 obj.vba_info_atom = VBAInfoAtom.extract_from(stream)
755 842 return obj
756 843  
757   - def check_validty(self):
758   -
  844 + def check_validity(self):
759 845 errs = self.check_rec_head(length=0x14)
760 846 errs.extend(self.vba_info_atom.check_validity())
761 847 return errs
... ... @@ -768,6 +854,7 @@ class VBAInfoAtom(PptType):
768 854 """
769 855  
770 856 RECORD_TYPE = 0x0400
  857 + RECORD_VERSION = 0x2
771 858  
772 859 def __init__(self):
773 860 super(VBAInfoAtom, self).__init__()
... ... @@ -803,9 +890,9 @@ class VBAInfoAtom(PptType):
803 890 errs = self.check_rec_head(length=0x14)
804 891  
805 892 # must be 0 or 1:
806   - errs.extend(self.check_range('fHasMacros', self.f_has_macros, None, 2)
807   - errs.extend(self.check_value('version', self.version, 2)
808   - return errs
  893 + errs.extend(self.check_range('fHasMacros', self.f_has_macros, None, 2))
  894 + errs.extend(self.check_value('version', self.version, 2))
  895 + return errs
809 896  
810 897 # === PptParser ===============================================================
811 898  
... ... @@ -919,6 +1006,9 @@ class PptParser(object):
919 1006 log.warning('re-reading and overwriting '
920 1007 'previously read persist_object_directory')
921 1008  
  1009 + # Step 1: Read the CurrentUserAtom record (section 2.3.2) from the
  1010 + # Current User Stream (section 2.1.1). All seek operations in the steps
  1011 + # that follow this step are in the PowerPoint Document Stream.
922 1012 if self.current_user_atom is None:
923 1013 self.parse_current_user()
924 1014  
... ... @@ -931,9 +1021,17 @@ class PptParser(object):
931 1021 try:
932 1022 log.debug('opening stream')
933 1023 stream = self.ole.openstream(MAIN_STREAM_NAME)
  1024 +
  1025 + # Repeat steps 3 through 6 until offsetLastEdit is 0x00000000.
934 1026 while offset != 0:
935 1027  
  1028 + # Step 2: Seek, in the PowerPoint Document Stream, to the
  1029 + # offset specified by the offsetToCurrentEdit field of the
  1030 + # CurrentUserAtom record identified in step 1.
936 1031 stream.seek(offset, os.SEEK_SET)
  1032 +
  1033 + # Step 3: Read the UserEditAtom record at the current offset.
  1034 + # Let this record be a live record.
937 1035 user_edit = UserEditAtom.extract_from(stream, is_encrypted)
938 1036 if self.newest_user_edit is None:
939 1037 self.newest_user_edit = user_edit
... ... @@ -948,10 +1046,15 @@ class PptParser(object):
948 1046 if errs and self.fast_fail:
949 1047 raise errs[0]
950 1048  
  1049 + # Step 4: Seek to the offset specified by the
  1050 + # offsetPersistDirectory field of the UserEditAtom record
  1051 + # identified in step 3.
951 1052 log.debug('seeking to pos {}'
952 1053 .format(user_edit.offset_persist_directory))
953 1054 stream.seek(user_edit.offset_persist_directory, os.SEEK_SET)
954 1055  
  1056 + # Step 5: Read the PersistDirectoryAtom record at the current
  1057 + # offset. Let this record be a live record.
955 1058 persist_dir_atom = PersistDirectoryAtom.extract_from(stream)
956 1059  
957 1060 log.debug('checking validity')
... ... @@ -965,14 +1068,37 @@ class PptParser(object):
965 1068 if errs and self.fast_fail:
966 1069 raise errs[0]
967 1070  
  1071 +
  1072 + # Construct the complete persist object directory for this file
  1073 + # as follows:
  1074 + # - For each PersistDirectoryAtom record previously identified
  1075 + # in step 5, add the persist object identifier and persist
  1076 + # object stream offset pairs to the persist object directory
  1077 + # starting with the PersistDirectoryAtom record last
  1078 + # identified, that is, the one closest to the beginning of the
  1079 + # stream.
  1080 + # - Continue adding these pairs to the persist object directory
  1081 + # for each PersistDirectoryAtom record in the reverse order
  1082 + # that they were identified in step 5; that is, the pairs from
  1083 + # the PersistDirectoryAtom record closest to the end of the
  1084 + # stream are added last.
  1085 + # - When adding a new pair to the persist object directory, if
  1086 + # the persist object identifier already exists in the persist
  1087 + # object directory, the persist object stream offset from the
  1088 + # new pair replaces the existing persist object stream offset
  1089 + # for that persist object identifier.
968 1090 for entry in persist_dir_atom.rg_persist_dir_entry:
969   - log.debug('saving {} offsets for persist_id {}'
970   - .format(len(entry.rg_persist_offset),
971   - entry.persist_id))
972   - self.persist_object_directory[entry.persist_id] = \
973   - entry.rg_persist_offset
  1091 + last_id = entry.persist_id+len(entry.rg_persist_offset)-1
  1092 + log.debug('for persist IDs {}-{}, save offsets {}'
  1093 + .format(entry.persist_id, last_id,
  1094 + entry.rg_persist_offset))
  1095 + for count, offset in enumerate(entry.rg_persist_offset):
  1096 + self.persist_object_directory[entry.persist_id+count] \
  1097 + = offset
974 1098  
975 1099 # check for more
  1100 + # Step 6: Seek to the offset specified by the offsetLastEdit
  1101 + # field in the UserEditAtom record identified in step 3.
976 1102 offset = user_edit.offset_last_edit
977 1103 except Exception:
978 1104 if self.fast_fail:
... ... @@ -985,27 +1111,36 @@ class PptParser(object):
985 1111 stream.close()
986 1112  
987 1113 def parse_document_persist_object(self):
988   - """ """
  1114 + """ Part 2: Identify the document persist object """
989 1115 if self.document_persist_obj is not None:
990 1116 log.warning('re-reading and overwriting '
991 1117 'previously read document_persist_object')
992 1118  
  1119 + # Step 1: Read the docPersistIdRef field of the UserEditAtom record
  1120 + # first identified in step 3 of Part 1, that is, the UserEditAtom
  1121 + # record closest to the end of the stream.
993 1122 if self.persist_object_directory is None:
994 1123 self.parse_persist_object_directory()
995 1124  
996   - # find the offset of the document container
  1125 + # Step 2: Lookup the value of the docPersistIdRef field in the persist
  1126 + # object directory constructed in step 8 of Part 1 to find the stream
  1127 + # offset of a persist object.
997 1128 newest_ref = self.newest_user_edit.doc_persist_id_ref
998 1129 offset = self.persist_object_directory[newest_ref]
999   - raise NotImplementedError('should have 1 offset here!')
  1130 + log.debug('newest user edit ID is {}, offset is {}'
  1131 + .format(newest_ref, offset))
1000 1132  
1001 1133 stream = None
1002 1134  
1003 1135 try:
  1136 + # Step 3: Seek to the stream offset specified in step 2.
1004 1137 log.debug('opening stream')
1005 1138 stream = self.ole.openstream(MAIN_STREAM_NAME)
1006   - log.debug('stream pos: {}'.format(stream.tell()))
1007   - stream.seek(offset)
1008   - log.debug('seek by {} to {}'.format(offset, stream.tell()))
  1139 + log.debug('seek to {}'.format(offset))
  1140 + stream.seek(offset, os.SEEK_SET)
  1141 +
  1142 + # Step 4: Read the DocumentContainer record at the current offset.
  1143 + # Let this record be a live record.
1009 1144 self.document_persist_obj = DocumentContainer.extract_from(stream)
1010 1145 except Exception:
1011 1146 if self.fast_fail:
... ... @@ -1032,17 +1167,20 @@ class PptParser(object):
1032 1167 def test():
1033 1168 """ for testing and debugging """
1034 1169  
  1170 + from glob import glob
  1171 +
1035 1172 # setup logging
1036 1173 logging.basicConfig(level=logging.DEBUG,
1037 1174 format='%(levelname)-8s %(message)s')
1038 1175 log.setLevel(logging.NOTSET)
1039 1176  
1040   - # test file with some autostart macros
1041   - test_file = 'gelaber_autostart.ppt'
1042   -
1043   - # parse
1044   - ppt = PptParser(test_file, fast_fail=False)
1045   - ppt.parse_document_persist_object()
  1177 + #test_file = 'gelaber_autostart.ppt'
  1178 + for file_name in glob('*.ppt'):
  1179 + # parse
  1180 + log.info('-' * 72)
  1181 + log.info('test file: {}'.format(file_name))
  1182 + ppt = PptParser(file_name, fast_fail=False)
  1183 + ppt.parse_document_persist_object()
1046 1184  
1047 1185  
1048 1186 if __name__ == '__main__':
... ...