From 823d07b35bee7ad7705e8a0ce73d8c1ca72ce758 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 12 May 2016 18:03:36 +0200 Subject: [PATCH] make many ppt_parser functions generators; use decorator for try-open-except-close(stream) --- oletools/olevba.py | 23 +++-------------------- oletools/ppt_parser.py | 582 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 2 files changed, 306 insertions(+), 299 deletions(-) diff --git a/oletools/olevba.py b/oletools/olevba.py index dea06fe..c3a2e38 100755 --- a/oletools/olevba.py +++ b/oletools/olevba.py @@ -2203,27 +2203,10 @@ class VBA_Parser(object): ppt_parser.enable_logging() try: ppt = ppt_parser.PptParser(self.ole_file, fast_fail=True) - info_container = ppt.search_vba_info() - n_infos = len(info_container) - n_macros = sum(1 for info in info_container - if info.vba_info_atom.f_has_macros > 0) - n_infos = len(ppt.search_vba_info()) - # TODO: does it make sense at all to continue if n_macros == 0? - # --> no vba-info, so all storages probably ActiveX or other OLE - storages = ppt.search_vba_storage() - n_storages = len(storages) - n_compressed = 0 - for storage in storages: - if storage.is_compressed: - storage_decomp = ppt.decompress_vba_storage(storage) - n_compressed += 1 - else: - storage_decomp = ppt.read_vba_storage_data(storage) - self.ole_subfiles.append(VBA_Parser(None, storage_decomp, + for vba_data in ppt.iter_vba_data(): + self.ole_subfiles.append(VBA_Parser(None, vba_data, container='PptParser')) - log.info('File is PPT with {} vba infos ({} with macros) and {} ' - 'vba storages ({} compressed)' - .format(n_infos, n_macros, n_storages, n_compressed)) + log.info('File is PPT') self.ole_file.close() # just in case self.ole_file = None # required to make other methods look at ole_subfiles self.type = TYPE_PPT diff --git a/oletools/ppt_parser.py b/oletools/ppt_parser.py index 9417e5a..0cb551d 100644 --- a/oletools/ppt_parser.py +++ b/oletools/ppt_parser.py @@ -21,12 +21,12 @@ References: # - can speed-up by using less bigger struct.parse calls? # - license # - make buffered stream from output of iterative_decompress -# - less stream open/close, possibly through decorator for open+closing? +# - maybe can merge the 2 decorators into 1? (with_opened_main_stream) # # CHANGELOG: # 2016-05-04 v0.01 CH: - start parsing "Current User" stream -__version__ = '0.01' +__version__ = '0.02' #--- IMPORTS ------------------------------------------------------------------ @@ -1030,6 +1030,78 @@ class ExternalObjectStorageCompressed(ExternalObjectStorage): # === PptParser =============================================================== +def with_opened_main_stream(func): + """ a decorator that can open and close the default stream for func + + to be applied only to functions in PptParser that read from default stream + (:py:data:`MAIN_STREAM_NAME`) + + Decorated functions need to accept args (self, stream, ...) + """ + + def wrapped(self, *args, **kwargs): + # remember who opened the stream so that function also closes it + stream_opened_by_me = False + try: + # open stream if required + if self._open_main_stream is None: + log.debug('opening stream {!r} for {}' + .format(MAIN_STREAM_NAME, func.__name__)) + self._open_main_stream = self.ole.openstream(MAIN_STREAM_NAME) + stream_opened_by_me = True + + # run wrapped function + return func(self, self._open_main_stream, *args, **kwargs) + + # error handling + except Exception: + if self.fast_fail: + raise + else: + self._log_exception() + finally: + # ensure stream is closed by the one who opened it (even if error) + if stream_opened_by_me: + log.debug('closing stream {!r} after {}' + .format(MAIN_STREAM_NAME, func.__name__)) + self._open_main_stream.close() + self._open_main_stream = None + return wrapped + + +def generator_with_opened_main_stream(func): + """ same as with_opened_main_stream but with yield instead of return """ + + def wrapped(self, *args, **kwargs): + # remember who opened the stream so that function also closes it + stream_opened_by_me = False + try: + # open stream if required + if self._open_main_stream is None: + log.debug('opening stream {!r} for {}' + .format(MAIN_STREAM_NAME, func.__name__)) + self._open_main_stream = self.ole.openstream(MAIN_STREAM_NAME) + stream_opened_by_me = True + + # run actual function + for result in func(self, self._open_main_stream, *args, **kwargs): + yield result + + # error handling + except Exception: + if self.fast_fail: + raise + else: + self._log_exception() + finally: + # ensure stream is closed by the one who opened it (even if error) + if stream_opened_by_me: + log.debug('closing stream {!r} after {}' + .format(MAIN_STREAM_NAME, func.__name__)) + self._open_main_stream.close() + self._open_main_stream = None + return wrapped + class PptParser(object): """ Parser for PowerPoint 97-2003 specific data structures @@ -1074,6 +1146,8 @@ class PptParser(object): if not MAIN_STREAM_NAME.lower() in root_streams: self._fail('root', 'listdir', root_streams, MAIN_STREAM_NAME) + self._open_main_stream = None + def _log_exception(self, msg=None): """ log an exception instead of raising it @@ -1121,7 +1195,7 @@ class PptParser(object): stream = None try: - log.debug('opening stream') + log.debug('opening stream "Current User"') stream = self.ole.openstream('Current User') self.current_user_atom = CurrentUserAtom.extract_from(stream) except Exception: @@ -1131,10 +1205,11 @@ class PptParser(object): self._log_exception() finally: if stream is not None: - log.debug('closing stream') + log.debug('closing stream "Current User"') stream.close() - def parse_persist_object_directory(self): + @with_opened_main_stream + def parse_persist_object_directory(self, stream): """ Part 1: Construct the persist object directory """ if self.persist_object_directory is not None: @@ -1152,100 +1227,87 @@ class PptParser(object): self.persist_object_directory = {} self.newest_user_edit = None - stream = None - try: - log.debug('opening stream') - stream = self.ole.openstream(MAIN_STREAM_NAME) - - # Repeat steps 3 through 6 until offsetLastEdit is 0x00000000. - while offset != 0: - - # Step 2: Seek, in the PowerPoint Document Stream, to the - # offset specified by the offsetToCurrentEdit field of the - # CurrentUserAtom record identified in step 1. - stream.seek(offset, os.SEEK_SET) - - # Step 3: Read the UserEditAtom record at the current offset. - # Let this record be a live record. - user_edit = UserEditAtom.extract_from(stream, is_encrypted) - if self.newest_user_edit is None: - self.newest_user_edit = user_edit - - log.debug('checking validity') - errs = user_edit.check_validity() - if errs: - log.warning('check_validity found {} issues' - .format(len(errs))) - for err in errs: - log.warning('UserEditAtom.check_validity: {}'.format(err)) - if errs and self.fast_fail: - raise errs[0] - - # Step 4: Seek to the offset specified by the - # offsetPersistDirectory field of the UserEditAtom record - # identified in step 3. - log.debug('seeking to pos {}' - .format(user_edit.offset_persist_directory)) - stream.seek(user_edit.offset_persist_directory, os.SEEK_SET) - - # Step 5: Read the PersistDirectoryAtom record at the current - # offset. Let this record be a live record. - persist_dir_atom = PersistDirectoryAtom.extract_from(stream) + # Repeat steps 3 through 6 until offsetLastEdit is 0x00000000. + while offset != 0: - log.debug('checking validity') - errs = persist_dir_atom.check_validity(offset) - if errs: - log.warning('check_validity found {} issues' - .format(len(errs))) - for err in errs: - log.warning('PersistDirectoryAtom.check_validity: {}' - .format(err)) - if errs and self.fast_fail: - raise errs[0] - - - # Construct the complete persist object directory for this file - # as follows: - # - For each PersistDirectoryAtom record previously identified - # in step 5, add the persist object identifier and persist - # object stream offset pairs to the persist object directory - # starting with the PersistDirectoryAtom record last - # identified, that is, the one closest to the beginning of the - # stream. - # - Continue adding these pairs to the persist object directory - # for each PersistDirectoryAtom record in the reverse order - # that they were identified in step 5; that is, the pairs from - # the PersistDirectoryAtom record closest to the end of the - # stream are added last. - # - When adding a new pair to the persist object directory, if - # the persist object identifier already exists in the persist - # object directory, the persist object stream offset from the - # new pair replaces the existing persist object stream offset - # for that persist object identifier. - for entry in persist_dir_atom.rg_persist_dir_entry: - last_id = entry.persist_id+len(entry.rg_persist_offset)-1 - log.debug('for persist IDs {}-{}, save offsets {}' - .format(entry.persist_id, last_id, - entry.rg_persist_offset)) - for count, offset in enumerate(entry.rg_persist_offset): - self.persist_object_directory[entry.persist_id+count] \ - = offset - - # check for more - # Step 6: Seek to the offset specified by the offsetLastEdit - # field in the UserEditAtom record identified in step 3. - offset = user_edit.offset_last_edit - except Exception: - if self.fast_fail: - raise - else: - self._log_exception() - finally: - if stream is not None: - log.debug('closing stream') - stream.close() + # Step 2: Seek, in the PowerPoint Document Stream, to the + # offset specified by the offsetToCurrentEdit field of the + # CurrentUserAtom record identified in step 1. + stream.seek(offset, os.SEEK_SET) - def parse_document_persist_object(self): + # Step 3: Read the UserEditAtom record at the current offset. + # Let this record be a live record. + user_edit = UserEditAtom.extract_from(stream, is_encrypted) + if self.newest_user_edit is None: + self.newest_user_edit = user_edit + + log.debug('checking validity') + errs = user_edit.check_validity() + if errs: + log.warning('check_validity found {} issues' + .format(len(errs))) + for err in errs: + log.warning('UserEditAtom.check_validity: {}'.format(err)) + if errs and self.fast_fail: + raise errs[0] + + # Step 4: Seek to the offset specified by the + # offsetPersistDirectory field of the UserEditAtom record + # identified in step 3. + log.debug('seeking to pos {}' + .format(user_edit.offset_persist_directory)) + stream.seek(user_edit.offset_persist_directory, os.SEEK_SET) + + # Step 5: Read the PersistDirectoryAtom record at the current + # offset. Let this record be a live record. + persist_dir_atom = PersistDirectoryAtom.extract_from(stream) + + log.debug('checking validity') + errs = persist_dir_atom.check_validity(offset) + if errs: + log.warning('check_validity found {} issues' + .format(len(errs))) + for err in errs: + log.warning('PersistDirectoryAtom.check_validity: {}' + .format(err)) + if errs and self.fast_fail: + raise errs[0] + + + # Construct the complete persist object directory for this file + # as follows: + # - For each PersistDirectoryAtom record previously identified + # in step 5, add the persist object identifier and persist + # object stream offset pairs to the persist object directory + # starting with the PersistDirectoryAtom record last + # identified, that is, the one closest to the beginning of the + # stream. + # - Continue adding these pairs to the persist object directory + # for each PersistDirectoryAtom record in the reverse order + # that they were identified in step 5; that is, the pairs from + # the PersistDirectoryAtom record closest to the end of the + # stream are added last. + # - When adding a new pair to the persist object directory, if + # the persist object identifier already exists in the persist + # object directory, the persist object stream offset from the + # new pair replaces the existing persist object stream offset + # for that persist object identifier. + for entry in persist_dir_atom.rg_persist_dir_entry: + last_id = entry.persist_id+len(entry.rg_persist_offset)-1 + log.debug('for persist IDs {}-{}, save offsets {}' + .format(entry.persist_id, last_id, + entry.rg_persist_offset)) + for count, offset in enumerate(entry.rg_persist_offset): + self.persist_object_directory[entry.persist_id+count] \ + = offset + + # check for more + # Step 6: Seek to the offset specified by the offsetLastEdit + # field in the UserEditAtom record identified in step 3. + offset = user_edit.offset_last_edit + + @with_opened_main_stream + def parse_document_persist_object(self, stream): """ Part 2: Identify the document persist object """ if self.document_persist_obj is not None: log.warning('re-reading and overwriting ' @@ -1265,27 +1327,13 @@ class PptParser(object): log.debug('newest user edit ID is {}, offset is {}' .format(newest_ref, offset)) - stream = None + # Step 3: Seek to the stream offset specified in step 2. + log.debug('seek to {}'.format(offset)) + stream.seek(offset, os.SEEK_SET) - try: - # Step 3: Seek to the stream offset specified in step 2. - log.debug('opening stream') - stream = self.ole.openstream(MAIN_STREAM_NAME) - log.debug('seek to {}'.format(offset)) - stream.seek(offset, os.SEEK_SET) - - # Step 4: Read the DocumentContainer record at the current offset. - # Let this record be a live record. - self.document_persist_obj = DocumentContainer.extract_from(stream) - except Exception: - if self.fast_fail: - raise - else: - self._log_exception() - finally: - if stream is not None: - log.debug('closing stream') - stream.close() + # Step 4: Read the DocumentContainer record at the current offset. + # Let this record be a live record. + self.document_persist_obj = DocumentContainer.extract_from(stream) log.debug('checking validity') errs = self.document_persist_obj.check_validity() @@ -1297,7 +1345,13 @@ class PptParser(object): if errs and self.fast_fail: raise errs[0] - def search_pattern(self, pattern, stream): + #-------------------------------------------------------------------------- + # 2nd attempt: do not parse whole structure but search through stream and + # yield results as they become available + # Keep in mind that after every yield the stream position may be anything! + + @generator_with_opened_main_stream + def search_pattern(self, stream, pattern): """ search for pattern in stream, return indices """ BUF_SIZE = 1024 @@ -1308,30 +1362,28 @@ class PptParser(object): raise ValueError('need buf > pattern to search!') n_reads = 0 - candidates = [] while True: start_pos = stream.tell() n_reads += 1 - #log.debug('read {} starting from {}' - # .format(BUF_SIZE, start_pos)) + log.debug('read {} starting from {}' + .format(BUF_SIZE, start_pos)) buf = stream.read(BUF_SIZE) idx = buf.find(pattern) while idx != -1: log.debug('found pattern at index {}'.format(start_pos+idx)) - candidates.append(start_pos+idx) + yield start_pos + idx idx = buf.find(pattern, idx+1) if len(buf) == BUF_SIZE: # move back a bit to avoid splitting of pattern through buf - stream.seek(-1 * pattern_len, os.SEEK_CUR) + stream.seek(start_pos + BUF_SIZE - pattern_len, os.SEEK_SET) else: log.debug('reached end of buf (read {}<{}) after {} reads' .format(len(buf), BUF_SIZE, n_reads)) break - return candidates - - def search_vba_info(self): + @generator_with_opened_main_stream + def search_vba_info(self, stream): """ search through stream for VBAInfoContainer, alternative to parse... quick-and-dirty: do not parse everything, just look for right bytes @@ -1348,51 +1400,37 @@ class PptParser(object): rec_len=VBAInfoContainer.RECORD_LENGTH) \ + VBAInfoAtom.generate_pattern( rec_len=VBAInfoAtom.RECORD_LENGTH) - stream = None - try: - log.debug('opening stream') - stream = self.ole.openstream(MAIN_STREAM_NAME) - - # look for candidate positions - candidates = self.search_pattern(pattern, stream) - - # try parse - containers = [] - for idx in candidates: - # assume that in stream at idx there is a VBAInfoContainer - stream.seek(idx) - log.debug('extracting at idx {}'.format(idx)) - try: - container = VBAInfoContainer.extract_from(stream) - except Exception: - self._log_exception() - continue - - errs = container.check_validity() - if errs: - log.warning('check_validity found {} issues' - .format(len(errs))) - else: - log.debug('container is ok') - atom = container.vba_info_atom - log.debug('persist id ref is {}, has_macros {}, version {}' - .format(atom.persist_id_ref, atom.f_has_macros, - atom.version)) - containers.append(container) - for err in errs: - log.warning('check_validity(VBAInfoContainer): {}' - .format(err)) - if errs and self.fast_fail: - raise errs[0] - - return containers - finally: - if stream is not None: - log.debug('closing stream') - stream.close() + # try parse + for idx in self.search_pattern(pattern): + # assume that in stream at idx there is a VBAInfoContainer + stream.seek(idx) + log.debug('extracting at idx {}'.format(idx)) + try: + container = VBAInfoContainer.extract_from(stream) + except Exception: + self._log_exception() + continue - def search_vba_storage(self): + errs = container.check_validity() + if errs: + log.warning('check_validity found {} issues' + .format(len(errs))) + else: + log.debug('container is ok') + atom = container.vba_info_atom + log.debug('persist id ref is {}, has_macros {}, version {}' + .format(atom.persist_id_ref, atom.f_has_macros, + atom.version)) + yield container + for err in errs: + log.warning('check_validity(VBAInfoContainer): {}' + .format(err)) + if errs and self.fast_fail: + raise errs[0] + + @generator_with_opened_main_stream + def search_vba_storage(self, stream): """ search through stream for VBAProjectStg, alternative to parse... quick-and-dirty: do not parse everything, just look for right bytes @@ -1403,120 +1441,113 @@ class PptParser(object): The storages found could also contain (instead of VBA data): ActiveX data or general OLE data + yields results as it finds them + .. seealso:: :py:meth:`search_vba_info` """ logging.debug('looking for VBA storage objects') - stream = None - try: - log.debug('opening stream') - stream = self.ole.openstream(MAIN_STREAM_NAME) - - storages = [] - for obj_type in (ExternalObjectStorageUncompressed, - ExternalObjectStorageCompressed): - # re-position stream at start - stream.seek(0, os.SEEK_SET) - - # look for candidate positions - pattern = obj_type.generate_pattern() - candidates = self.search_pattern(pattern, stream) - - # try parse - for idx in candidates: - # assume a ExternalObjectStorage in stream at idx - stream.seek(idx) - log.debug('extracting at idx {}'.format(idx)) - try: - storage = obj_type.extract_from(stream) - except Exception: - self._log_exception() - continue - - errs = storage.check_validity() - if errs: - log.warning('check_validity found {} issues' - .format(len(errs))) - else: - log.debug('storage is ok; compressed={}, size={}, ' - 'size_decomp={}' - .format(storage.is_compressed, - storage.rec_head.rec_len, - storage.uncompressed_size)) - storages.append(storage) - for err in errs: - log.warning('check_validity({}): {}' - .format(obj_type.__name__, err)) - if errs and self.fast_fail: - raise errs[0] - - return storages + for obj_type in (ExternalObjectStorageUncompressed, + ExternalObjectStorageCompressed): + # re-position stream at start + stream.seek(0, os.SEEK_SET) - finally: - if stream is not None: - log.debug('closing stream') - stream.close() + pattern = obj_type.generate_pattern() + # try parse + for idx in self.search_pattern(pattern): + # assume a ExternalObjectStorage in stream at idx + stream.seek(idx) + log.debug('extracting at idx {}'.format(idx)) + try: + storage = obj_type.extract_from(stream) + except Exception: + self._log_exception() + continue + + errs = storage.check_validity() + if errs: + log.warning('check_validity found {} issues' + .format(len(errs))) + else: + log.debug('storage is ok; compressed={}, size={}, ' + 'size_decomp={}' + .format(storage.is_compressed, + storage.rec_head.rec_len, + storage.uncompressed_size)) + yield storage + for err in errs: + log.warning('check_validity({}): {}' + .format(obj_type.__name__, err)) + if errs and self.fast_fail: + raise errs[0] - def decompress_vba_storage(self, storage): + @with_opened_main_stream + def decompress_vba_storage(self, stream, storage): """ return decompressed data from search_vba_storage """ log.debug('decompressing storage for VBA OLE data stream ') - stream = None - try: - log.debug('opening stream') - stream = self.ole.openstream(MAIN_STREAM_NAME) - - # decompress iteratively; a zlib.decompress of all data - # failed with Error -5 (incomplete or truncated stream) - stream.seek(storage.data_offset, os.SEEK_SET) - decomp, n_read, err = \ - iterative_decompress(stream, storage.data_size) - log.debug('decompressed {} to {} bytes; found err: {}' - .format(n_read, len(decomp), err)) - if err and self.fast_fail: - raise err - # otherwise try to continue with partial data - - return decomp - - ## create OleFileIO from decompressed data - #ole = olefile.OleFileIO(decomp) - #root_streams = [entry[0].lower() for entry in ole.listdir()] - #for required in 'project', 'projectwm', 'vba': - # if required not in root_streams: - # raise ValueError('storage seems to not be a VBA storage ' - # '({} not found in root streams)' - # .format(required)) - #log.debug('tests succeeded') - #return ole - - finally: - if stream is not None: - log.debug('closing stream') - stream.close() - - def read_vba_storage_data(self, storage): + # decompress iteratively; a zlib.decompress of all data + # failed with Error -5 (incomplete or truncated stream) + stream.seek(storage.data_offset, os.SEEK_SET) + decomp, n_read, err = \ + iterative_decompress(stream, storage.data_size) + log.debug('decompressed {} to {} bytes; found err: {}' + .format(n_read, len(decomp), err)) + if err and self.fast_fail: + raise err + # otherwise try to continue with partial data + + return decomp + + ## create OleFileIO from decompressed data + #ole = olefile.OleFileIO(decomp) + #root_streams = [entry[0].lower() for entry in ole.listdir()] + #for required in 'project', 'projectwm', 'vba': + # if required not in root_streams: + # raise ValueError('storage seems to not be a VBA storage ' + # '({} not found in root streams)' + # .format(required)) + #log.debug('tests succeeded') + #return ole + + @with_opened_main_stream + def read_vba_storage_data(self, stream, storage): """ return data pointed to by uncompressed storage """ - log.debug('reading uncompressed VBA OLE data stream') - stream = None - try: - log.debug('opening stream') - stream = self.ole.openstream(MAIN_STREAM_NAME) - - log.debug('reading {} bytes starting at {}' - .format(storage.data_size, storage.data_offset)) - stream.seek(storage.data_offset, os.SEEK_SET) - data = stream.read(storage.data_size) - - return data + log.debug('reading uncompressed VBA OLE data stream: ' + '{} bytes starting at {}' + .format(storage.data_size, storage.data_offset)) + stream.seek(storage.data_offset, os.SEEK_SET) + data = stream.read(storage.data_size) + return data + + @generator_with_opened_main_stream + def iter_vba_data(self, stream): + """ search vba infos and storages, yield uncompressed storage data """ + + n_infos = 0 + n_macros = 0 + for info in self.search_vba_info(): + n_infos += 1 + if info.vba_info_atom.f_has_macros > 0: + n_macros += 1 + # TODO: does it make sense at all to continue if n_macros == 0? + # --> no vba-info, so all storages probably ActiveX or other OLE + n_storages = 0 + n_compressed = 0 + for storage in self.search_vba_storage(): + n_storages += 1 + if storage.is_compressed: + n_compressed += 1 + yield self.decompress_vba_storage(storage) + else: + yield self.read_vba_storage_data(storage) - finally: - if stream is not None: - log.debug('closing stream') - stream.close() + log.info('found {} infos ({} with macros) and {} storages ' + '({} compressed)' + .format(n_infos, n_macros, n_storages, n_compressed)) def iterative_decompress(stream, size, chunk_size=4096): @@ -1559,16 +1590,9 @@ def test(): try: ppt = PptParser(file_name, fast_fail=False) #ppt.parse_document_persist_object() - n_infos = len(ppt.search_vba_info()) - storages = ppt.search_vba_storage() - n_storages = len(storages) - log.debug('found {} infos and {} storages'.format(n_infos, - n_storages)) - if n_infos != n_storages: - log.warning('found different number of vba infos and storages') - for storage in storages: - parser = VBA_Parser(None, ppt.decompress_vba_storage(storage), - container='PptParser') + + for vba_data in ppt.iter_vba_data(): + parser = VBA_Parser(None, vba_data, container='PptParser') for vba_root, project_path, dir_path in \ parser.find_vba_projects(): log.info('found vba project: root={}, proj={}, dir={}' -- libgit2 0.21.4