From 63dafd09eec50aa4a50b2cbe5752c3a3c5cd0f0c Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Fri, 6 May 2016 17:48:01 +0200 Subject: [PATCH] added base type PptType, parse all of persist dir (not just first), added DummyType and DocumentContainer --- oletools/ppt_parser.py | 647 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------ 1 file changed, 605 insertions(+), 42 deletions(-) diff --git a/oletools/ppt_parser.py b/oletools/ppt_parser.py index 8aefdec..8c136ee 100644 --- a/oletools/ppt_parser.py +++ b/oletools/ppt_parser.py @@ -16,6 +16,8 @@ References: # TODO #------------------------------------------------------------------------------ # TODO: +# - make CurrentUserAtom and UserEditAtom PptTypes; adjust parse +# - make stream optional in PptUnexpectedData # - license # - create a AtomBase class that defines check_value and parses RecordHead? # @@ -30,6 +32,7 @@ import sys import logging import struct import traceback +import os import thirdparty.olefile as olefile from olevba import get_logger @@ -41,6 +44,8 @@ log = get_logger('ppt') #--- CONSTANTS ---------------------------------------------------------------- +# name of main stream +MAIN_STREAM_NAME = 'PowerPoint Document' # URL and message to report issues: URL_OLEVBA_ISSUES = 'https://bitbucket.org/decalage/oletools/issues' @@ -66,7 +71,7 @@ def check_value(name, value, expected): """ simplify verification of values in extract_from """ if isinstance(expected, (list, tuple)): if value not in expected: - exp_str = '[' + ' OR '.join('{0:04X}'.format(val) + exp_str = '[' + ' OR '.join('{0:04X}'.format(val) for val in expected) + ']' raise PptUnexpectedData( 'Current User', name, @@ -78,7 +83,7 @@ def check_value(name, value, expected): class RecordHeader(object): - """ a record header, often found in ppt files + """ a record header, at start of many types found in ppt files https://msdn.microsoft.com/en-us/library/dd926377%28v=office.12%29.aspx https://msdn.microsoft.com/en-us/library/dd948895%28v=office.12%29.aspx @@ -97,7 +102,7 @@ class RecordHeader(object): obj = clz() # first half byte is version, next 3 half bytes are instance version_instance, = struct.unpack(' 255: raise PptUnexpectedData( 'Current User', 'CurrentUserAtom.lenUserName', @@ -185,24 +186,146 @@ class CurrentUserAtom(object): check_value('minorVersion', obj.minor_version, clz.MINOR_VERSION) stream.read(2) # unused obj.ansi_user_name = stream.read(obj.len_user_name) - log.debug('ansiUserName: {!r}'.format(obj.ansi_user_name)) obj.rel_version, = struct.unpack('= expect_upper: + is_err = True + + if is_err: + clz_name = self.__class__.__name__ + if expect_lower is None: + expect_str = '< {0:04X}'.format(expect_upper) + elif expect_upper is None: + expect_str = '> {0:04X}'.format(expect_lower) + else: + expect_str = 'within ({0:04X}, {1:04X})'.format(expect_lower, + expect_upper) + return [PptUnexpectedData(self.stream_name, clz_name + '.' + name, + '{0:04X}'.format(value), expect_str), ] + else: + return [] + + def check_rec_head(self, length=None): + """ to be called by check_validity to check the self.rec_head + + uses self.RECORD_... constants, (not quite that constant for DummyType) + """ + + errs = [] + errs.extend(self.check_value('rec_head.recVer', self.rec_head.rec_ver, + self.RECORD_VERSION)) + errs.extend(self.check_value('rec_head.recInstance', + self.rec_head.rec_instance, + self.RECORD_INSTANCE)) + if self.RECORD_TYPE is None: + raise NotImplementedError('RECORD_TYPE not specified!') + errs.extend(self.check_value('rec_head.recType', + self.rec_head.rec_type, + self.RECORD_TYPE)) + if length is not None: + errs.extend(self.check_value('rec_head.recLen', + self.rec_head.rec_len, length)) + return errs + + +class UserEditAtom(PptType): """ An atom record that specifies information about a user edit https://msdn.microsoft.com/en-us/library/dd945746%28v=office.12%29.aspx @@ -213,6 +336,7 @@ class UserEditAtom(object): MAJOR_VERSION = 0x03 def __init__(self): + super(UserEditAtom, self).__init__() self.rec_head = None self.last_slide_id_ref = None self.version = None @@ -235,35 +359,383 @@ class UserEditAtom(object): # parse record header obj.rec_head = RecordHeader.extract_from(stream) - check_value('rec_version', obj.rec_head.rec_ver, 0) - check_value('rec_instance', obj.rec_head.rec_ver, 0) - check_value('rec_type', obj.rec_head.rec_type, clz.RECORD_TYPE) obj.last_slide_id_ref, = struct.unpack('= offset: + errs.append(PptUnexpectedData( + 'PowerPoint Document', 'UserEditAtom.offsetLastEdit', + self.offset_last_edit, '< {}'.format(offset))) + if self.offset_persist_directory >= offset or \ + self.offset_persist_directory <= self.offset_last_edit: + errs.append(PptUnexpectedData( + 'PowerPoint Document', + 'UserEditAtom.offsetPersistDirectory', + self.offset_last_edit, + 'in ({}, {})'.format(self.offset_last_edit, offset))) + errs.extend(self.check_value('docPersistIdRef', + self.doc_persist_id_ref, 1)) + return errs + + # TODO: offer to check persist_id_seed given PersistDirectoryAtom) + + +class DummyType(PptType): + """ a type that is found in ppt documents we are not interested in + + instead of parsing many uninteresting types, we just read their + RecordHeader and set the RECORD_... values on an instance- (instead of + class-) level + + used to skip over uninteresting types in e.g. DocumentContainer + """ + + def __init__(self, type_name, record_type, rec_ver=0, rec_instance=0, + rec_len=None): + super(DummyType, self).__init__() + self.type_name = type_name + self.RECORD_TYPE = record_type + self.RECORD_VERSION = rec_ver + self.RECORD_INSTANCE = rec_instance + self.record_length = rec_len + + def extract_from(self, stream): + """ extract record header and just skip as many bytes as header says + + Since this requires RECORD_... values set in constructor, this is NOT + a classmethod like all the other extract_from! + + Otherwise this tries to be compatible with other extract_from methods + (e.g. returns self) + """ + self.read_rec_head(stream) + log.debug('skipping over {} Byte for type {}' + .format(self.rec_head.rec_len, self.type_name)) + log.debug('start at pos {}'.format(stream.tell())) + stream.seek(self.rec_head.rec_len, os.SEEK_CUR) + log.debug('now at pos {}'.format(stream.tell())) + return self + + def check_validity(self): + return self.check_rec_head(self.record_length) + + +class PersistDirectoryAtom(PptType): + """ one part of a persist object directory with unique persist object id + + contains PersistDirectoryEntry objects + + https://msdn.microsoft.com/en-us/library/dd952680%28v=office.12%29.aspx + """ + + RECORD_TYPE = 0x1772 + + def __init__(self): + super(PersistDirectoryAtom, self).__init__() + self.rg_persist_dir_entry = None # actually, this will be an array + self.stream_offset = None + + @classmethod + def extract_from(clz, stream): + """ create and return object with data from given stream """ + + log.debug("Extracting a PersistDirectoryAtom from stream") + obj = clz() + + # remember own offset for checking validity + obj.stream_offset = stream.tell() + + # parse record header + obj.read_rec_head(stream) + + # read directory entries from list until reach size for this object + curr_pos = stream.tell() + stop_pos = curr_pos + obj.rec_head.rec_len + log.debug('start reading at pos {}, read until {}' + .format(curr_pos, stop_pos)) + obj.rg_persist_dir_entry = [] + + while curr_pos < stop_pos: + new_entry = PersistDirectoryEntry.extract_from(stream) + obj.rg_persist_dir_entry.append(new_entry) + curr_pos = stream.tell() + log.debug('at pos {}'.format(curr_pos)) + return obj + + def check_validity(self, user_edit_last_offset=None): + errs = self.check_rec_head() + for entry in self.rg_persist_dir_entry: + errs.extend(entry.check_validity(user_edit_last_offset, + self.stream_offset)) + return errs + + +class PersistDirectoryEntry(object): + """ an entry contained in a PersistDirectoryAtom.rg_persist_dir_entry + + A structure that specifies a compressed table of sequential persist object + identifiers and stream offsets to associated persist objects. + + NOT a subclass of PptType because has no RecordHeader + + https://msdn.microsoft.com/en-us/library/dd947347%28v=office.12%29.aspx + """ + + def __init__(self): + self.persist_id = None + self.c_persist = None + self.rg_persist_offset = None + + @classmethod + def extract_from(clz, stream): + # take a 4-byte (=32bit) number, divide into 20bit and 12 bit) + log.debug("Extracting a PersistDirectoryEntry from stream") + obj = clz() + + # persistId (20 bits): An unsigned integer that specifies a starting + # persist object identifier. It MUST be less than or equal to 0xFFFFE. + # The first entry in rgPersistOffset is associated with persistId. The + # next entry, if present, is associated with persistId plus 1. Each + # entry in rgPersistOffset is associated with a persist object + # identifier in this manner, with the final entry associated with + # persistId + cPersist - 1. + + # cPersist (12 bits): An unsigned integer that specifies the count of + # items in rgPersistOffset. It MUST be greater than or equal to 0x001. + temp, = struct.unpack(' id is {1}, reading {2} offsets' + .format(temp, obj.persist_id, obj.c_persist)) + + # rgPersistOffset (variable): An array of PersistOffsetEntry (section + # 2.3.6) that specifies stream offsets to persist objects. The count of + # items in the array is specified by cPersist. The value of each item + # MUST be greater than or equal to offsetLastEdit in the corresponding + # user edit and MUST be less than the offset, in bytes, of the + # corresponding persist object directory. + # PersistOffsetEntry: An unsigned 4-byte integer that specifies an + # offset, in bytes, from the beginning of the PowerPoint Document + # Stream (section 2.1.2) to a persist object. + obj.rg_persist_offset = [struct.unpack(' 0xFFFFE: # (--> == 0xFFFFF since 20bit) + errs.append(PptUnexpectedData( + MAIN_STREAM_NAME, 'PersistDirectoryEntry.persist_id', + self.persist_id, '< 0xFFFFE (dec: {})'.format(0xFFFFE))) + if self.c_persist == 0: + errs.append(PptUnexpectedData( + MAIN_STREAM_NAME, 'PersistDirectoryEntry.c_persist', + self.c_persist, '> 0')) + if user_edit_last_offset is not None \ + and min(self.rg_persist_offset) < user_edit_last_offset: + errs.append(PptUnexpectedData( + MAIN_STREAM_NAME, 'PersistDirectoryEntry.rg_persist_offset', + min(self.rg_persist_offset), + '> UserEdit.offsetLastEdit ({})' + .format(user_edit_last_offset))) + if persist_obj_dir_offset is not None \ + and max(self.rg_persist_offset) > persist_obj_dir_offset: + errs.append(PptUnexpectedData( + MAIN_STREAM_NAME, 'PersistDirectoryEntry.rg_persist_offset', + max(self.rg_persist_offset), + '> PersistObjectDirectory offset ({})' + .format(persist_obj_dir_offset))) + return errs + + +class DocInfoListContainer(PptType): + """ information about the document and document display settings + + https://msdn.microsoft.com/en-us/library/dd926767%28v=office.12%29.aspx + """ + + RECORD_VERSION = 0xF + RECORD_TYPE = 0x07D0 + + def __init__(self): + super(DocInfoListContainer, self).__init__() + + +class DocumentContainer(PptType): + """ a DocumentContainer record + + https://msdn.microsoft.com/en-us/library/dd947357%28v=office.12%29.aspx + """ + + RECORD_TYPE = 0x03E8 + + def __init__(self): + super(DocumentContainer, self).__init__() + self.document_atom = None + self.ex_obj_list = None + self.document_text_info = None + self.sound_collection = None + self.drawing_group = None + self.master_list = None + self.doc_info_list = None + self.slide_hf = None + self.notes_hf = None + self.slide_list = None + self.notes_list = None + self.slide_show_doc_info = None + self.named_shows = None + self.summary = None + self.doc_routing_slip = None + self.print_options = None + self.rt_custom_table_styles_1 = None + self.end_document = None + self.rt_custom_table_styles_2 = None + + @classmethod + def extract_from(clz, stream): + """ created object with values from given stream + + stream is assumed to be positioned correctly + + this container contains lots of data we are not interested in. + """ + obj = clz() + + # parse record header + obj.read_rec_head(stream) + + # documentAtom (48 bytes): A DocumentAtom record (section 2.4.2) that + # specifies size information for presentation slides and notes slides. + obj.document_atom = DummyType('DocumentAtom', 0x03E9, rec_ver=0x1, + rec_len=0x28).extract_from(stream) + + # exObjList (variable): An optional ExObjListContainer record (section + # 2.10.1) that specifies the list of external objects in the document. + obj.ex_obj_list = DummyType('ExObjListContainer', 0x0409, rec_ver=0xF)\ + .extract_from(stream) + + # documentTextInfo (variable): A DocumentTextInfoContainer record + # (section 2.9.1) that specifies the default text styles for the + # document. + obj.document_text_info = DummyType('DocumentTextInfoContainer', 0x03F2, + rec_ver=0xF).extract_from(stream) + + # soundCollection (variable): An optional SoundCollectionContainer + # record (section 2.4.16.1) that specifies the list of sounds in the + # file. + obj.sound_collection = DummyType('SoundCollectionContainer', 0x07E4, + rec_ver=0xF, rec_instance=0x005)\ + .extract_from(stream) + + # drawingGroup (variable): A DrawingGroupContainer record (section + # 2.4.3) that specifies drawing information for the document. + obj.drawing_group = DummyType('DrawingGroupContainer', 0x040B, + rec_ver=0xF).extract_from(stream) + + # masterList (variable): A MasterListWithTextContainer record (section + # 2.4.14.1) that specifies the list of main master slides and title + # master slides. + obj.master_list = DummyType('MasterListWithContainer', 0x0FF0, + rec_ver=0xF).extract_from(stream) + + # docInfoList (variable): An optional DocInfoListContainer record + # (section 2.4.4) that specifies additional document information. + # this is the variable we are interested in! + obj.doc_info_list = DocInfoListContainer.extract_from(stream) + + # slideHF (variable): An optional SlideHeadersFootersContainer record + # (section 2.4.15.1) that specifies the default header and footer + # information for presentation slides. + obj.slide_hf = None + + # notesHF (variable): An optional NotesHeadersFootersContainer record + # (section 2.4.15.6) that specifies the default header and footer + # information for notes slides. + obj.notes_hf = None + + # slideList (variable): An optional SlideListWithTextContainer record + # (section 2.4.14.3) that specifies the list of presentation slides. + obj.slide_list = None + + # notesList (variable): An optional NotesListWithTextContainer record + # (section 2.4.14.6) that specifies the list of notes slides. + obj.notes_list = None + + # slideShowDocInfoAtom (88 bytes): An optional SlideShowDocInfoAtom + # record (section 2.6.1) that specifies slide show information for the + # document. + obj.slide_show_doc_info = None + + # namedShows (variable): An optional NamedShowsContainer record + # (section 2.6.2) that specifies named shows in the document. + obj.named_shows = None + + # summary (variable): An optional SummaryContainer record (section + # 2.4.22.3) that specifies bookmarks for the document. + obj.summary = None + + # docRoutingSlipAtom (variable): An optional DocRoutingSlipAtom record + # (section 2.11.1) that specifies document routing information. + obj.doc_routing_slip = None + + # printOptionsAtom (13 bytes): An optional PrintOptionsAtom record + # (section 2.4.12) that specifies default print options. + obj.print_options = None + + # rtCustomTableStylesAtom1 (variable): An optional + # RoundTripCustomTableStyles12Atom record (section 2.11.13) that + # specifies round-trip information for custom table styles. + obj.rt_custom_table_styles_1 = None + + # endDocumentAtom (8 bytes): An EndDocumentAtom record (section 2.4.13) + # that specifies the end of the information for the document. + obj.end_document = None + + # rtCustomTableStylesAtom2 (variable): An optional + # RoundTripCustomTableStyles12Atom record that specifies round-trip + # information for custom table styles. It MUST NOT exist if + # rtCustomTableStylesAtom1 exists. + obj.rt_custom_table_styles_2 = None + + return obj + + + def check_validity(self): + """ check all values in object for valid values """ + errs = self.check_rec_head() + errs.extend(self.document_atom.check_validity()) + errs.extend(self.ex_obj_list.check_validity()) + errs.extend(self.document_text_info.check_validity()) + errs.extend(self.sound_collection.check_validity()) + errs.extend(self.drawing_group.check_validity()) + errs.extend(self.master_list.check_validity()) + errs.extend(self.doc_info_list.check_validity()) + return errs # === PptParser =============================================================== @@ -276,7 +748,7 @@ class PptParser(object): def __init__(self, ole, fast_fail=False): """ constructor - + :param ole: OleFileIO or anything that OleFileIO constructor accepts :param bool fast_fail: if True, all unexpected data will raise a PptUnexpectedData; if False will only log error @@ -290,6 +762,8 @@ class PptParser(object): self.fast_fail = fast_fail self.current_user_atom = None + self.document_persist_obj = None + self.persist_object_directory = None # basic compatibility check: root directory structure is # [['\x05DocumentSummaryInformation'], @@ -304,12 +778,12 @@ class PptParser(object): root_streams = [stream[0].lower() for stream in root_streams] if not 'current user' in root_streams: self._fail('root', 'listdir', root_streams, 'Current User') - if not 'powerpoint document' in root_streams: - self._fail('root', 'listdir', root_streams, 'PowerPoint Document') + if not MAIN_STREAM_NAME.lower() in root_streams: + self._fail('root', 'listdir', root_streams, MAIN_STREAM_NAME) def _log_exception(self, msg=None): """ log an exception instead of raising it - + call in one of 2 ways: try: if fail(): @@ -348,7 +822,7 @@ class PptParser(object): if self.current_user_atom is not None: log.warning('re-reading and overwriting ' - 'previously read CurrentUserAtom') + 'previously read current_user_atom') try: self.current_user_atom = CurrentUserAtom.extract_from(self.ole) @@ -358,41 +832,130 @@ class PptParser(object): else: self._log_exception() - def construct_persist_object_directory(self): - """ part 2 """ - + def parse_persist_object_directory(self): + """ Part 1: Construct the persist object directory """ + + if self.persist_object_directory is not None: + log.warning('re-reading and overwriting ' + 'previously read persist_object_directory') + if self.current_user_atom is None: self.parse_current_user() offset = self.current_user_atom.offset_to_current_edit is_encrypted = self.current_user_atom.is_encrypted() + self.persist_object_directory = {} + + stream = None + try: + log.debug('opening stream') + stream = self.ole.openstream(MAIN_STREAM_NAME) + while offset != 0: + + stream.seek(offset, os.SEEK_SET) + user_edit = UserEditAtom.extract_from(stream, is_encrypted) + + log.debug('checking validity') + errs = user_edit.check_validity() + if errs: + log.warning('check_validity found {} issues' + .format(len(errs))) + for err in errs: + log.warning('UserEditAtom.check_validity: {}'.format(err)) + if errs and self.fast_fail: + raise errs[0] + + log.debug('seeking to pos {}' + .format(user_edit.offset_persist_directory)) + stream.seek(user_edit.offset_persist_directory, os.SEEK_SET) + + persist_dir_atom = PersistDirectoryAtom.extract_from(stream) + + log.debug('checking validity') + errs = persist_dir_atom.check_validity(offset) + if errs: + log.warning('check_validity found {} issues' + .format(len(errs))) + for err in errs: + log.warning('PersistDirectoryAtom.check_validity: {}' + .format(err)) + if errs and self.fast_fail: + raise errs[0] + + for entry in persist_dir_atom.rg_persist_dir_entry: + log.debug('saving {} offsets for persist_id {}' + .format(len(entry.rg_persist_offset), + entry.persist_id)) + self.persist_object_directory[entry.persist_id] = \ + entry.rg_persist_offset + + # check for more + offset = user_edit.offset_last_edit + except Exception: + if self.fast_fail: + raise + else: + self._log_exception() + finally: + if stream is not None: + log.debug('closing stream') + stream.close() + + def parse_document_persist_object(self): + """ """ + if self.document_persist_obj is not None: + log.warning('re-reading and overwriting ' + 'previously read document_persist_object') + + if self.persist_object_directory is None: + self.parse_persist_object_directory() + + offset = None # TODO: read from object directory stream = None try: - stream = self.ole.openstream('PowerPoint Document') + log.debug('opening stream') + stream = self.ole.openstream(MAIN_STREAM_NAME) + log.debug('stream pos: {}'.format(stream.tell())) stream.seek(offset) - user_edit = UserEditAtom.extract_from(stream, is_encrypted) + log.debug('seek by {} to {}'.format(offset, stream.tell())) + self.document_persist_obj = DocumentContainer.extract_from(stream) + except Exception: + if self.fast_fail: + raise + else: + self._log_exception() finally: if stream is not None: log.debug('closing stream') stream.close() + log.debug('checking validity') + errs = self.document_persist_obj.check_validity() + if errs: + log.warning('check_validity found {} issues'.format(len(errs))) + for err in errs: + log.warning('check_validity(document_persist_obj): {}' + .format(err)) + if errs and self.fast_fail: + raise errs[0] + # === TESTING ================================================================= def test(): """ for testing and debugging """ # setup logging - logging.basicConfig(level=logging.DEBUG, format='%(levelname)-8s %(message)s') + logging.basicConfig(level=logging.DEBUG, + format='%(levelname)-8s %(message)s') log.setLevel(logging.NOTSET) # test file with some autostart macros test_file = 'gelaber_autostart.ppt' # parse - ppt = PptParser(test_file) - ppt.parse_current_user() - ppt.construct_persist_object_directory() + ppt = PptParser(test_file, fast_fail=False) + ppt.parse_document_persist_object() if __name__ == '__main__': -- libgit2 0.21.4