From 05a27a43643562e52dbc3710e7e3dd044adf872f Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Wed, 11 May 2016 17:57:35 +0200 Subject: [PATCH] managed to extract vba stream from ppt by byte-search for ExternalObjectStorage --- oletools/ppt_parser.py | 343 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------- 1 file changed, 300 insertions(+), 43 deletions(-) diff --git a/oletools/ppt_parser.py b/oletools/ppt_parser.py index 569cbfc..5d71d1f 100644 --- a/oletools/ppt_parser.py +++ b/oletools/ppt_parser.py @@ -19,7 +19,8 @@ References: # - make stream optional in PptUnexpectedData # - can speed-up by using less bigger struct.parse calls? # - license -# - create a AtomBase class that defines check_value and parses RecordHead? +# - make buffered stream from output of iterative_decompress +# - less stream open/close, possibly through decorator for open+closing? 
# # CHANGELOG: # 2016-05-04 v0.01 CH: - start parsing "Current User" stream @@ -33,9 +34,11 @@ import logging import struct import traceback import os +import cStringIO import thirdparty.olefile as olefile from olevba import get_logger +import zlib # a global logger object used for debugging: @@ -132,6 +135,8 @@ class RecordHeader(object): length of result depends on rec_len being given or not """ + if rec_type is None: + raise ValueError('RECORD_TYPE not set!') version_instance = rec_ver + 2**4 * rec_instance if rec_len is None: return struct.pack(' reduce size') + self.uncompressed_size = read_4(stream) + self.data_size -= 4 + self.data_offset = stream.tell() + + def check_validity(self): + return self.check_rec_head() + + +class ExternalObjectStorageUncompressed(ExternalObjectStorage): + """ subclass of ExternalObjectStorage for uncompressed objects """ + RECORD_INSTANCE = ExternalObjectStorage.RECORD_INSTANCE_UNCOMPRESSED + + def __init__(self): + super(ExternalObjectStorageUncompressed, self).__init__(False) + + @classmethod + def extract_from(clz, stream): + """ note the usage of super here: call instance method of super class! + """ + obj = clz() + super(ExternalObjectStorageUncompressed, obj).extract_from(stream) + return obj + + +class ExternalObjectStorageCompressed(ExternalObjectStorage): + """ subclass of ExternalObjectStorage for compressed objects """ + RECORD_INSTANCE = ExternalObjectStorage.RECORD_INSTANCE_COMPRESSED + + def __init__(self): + super(ExternalObjectStorageCompressed, self).__init__(True) + + @classmethod + def extract_from(clz, stream): + """ note the usage of super here: call instance method of super class! 
+ """ + obj = clz() + super(ExternalObjectStorageCompressed, obj).extract_from(stream) + return obj + + # === PptParser =============================================================== @@ -1180,59 +1282,65 @@ class PptParser(object): if errs and self.fast_fail: raise errs[0] - def search_vba(self): - """ quick-and-dirty: do not parse everything, just look for right bytes - - "quick" here means quick to program. Runtime now is linear is document - size (--> for big documents the other method might be faster) - """ + def search_pattern(self, pattern, stream): + """ search for pattern in stream, return indices """ BUF_SIZE = 1024 - pattern = RecordHeader.generate( - VBAInfoContainer.RECORD_TYPE, - rec_len=VBAInfoContainer.RECORD_LENGTH, - rec_instance=VBAInfoContainer.RECORD_INSTANCE, - rec_ver=VBAInfoContainer.RECORD_VERSION) \ - + RecordHeader.generate( - VBAInfoAtom.RECORD_TYPE, - rec_len=VBAInfoAtom.RECORD_LENGTH, - rec_instance=VBAInfoAtom.RECORD_INSTANCE, - rec_ver=VBAInfoAtom.RECORD_VERSION) pattern_len = len(pattern) log.debug('pattern length is {}'.format(pattern_len)) if pattern_len > BUF_SIZE: raise ValueError('need buf > pattern to search!') + n_reads = 0 + candidates = [] + while True: + start_pos = stream.tell() + n_reads += 1 + #log.debug('read {} starting from {}' + # .format(BUF_SIZE, start_pos)) + buf = stream.read(BUF_SIZE) + idx = buf.find(pattern) + while idx != -1: + log.info('found pattern at index {}'.format(start_pos+idx)) + candidates.append(start_pos+idx) + idx = buf.find(pattern, idx+1) + + if len(buf) == BUF_SIZE: + # move back a bit to avoid splitting of pattern through buf + stream.seek(-1 * pattern_len, os.SEEK_CUR) + else: + log.debug('reached end of buf (read {}<{}) after {} reads' + .format(len(buf), BUF_SIZE, n_reads)) + break + return candidates + + + def search_vba_info(self): + """ search through stream for VBAInfoContainer, alternative to parse... 
+ + quick-and-dirty: do not parse everything, just look for right bytes + + "quick" here means quick to program. Runtime now is linear in document + size (--> for big documents the other method might be faster) + + .. seealso:: search_vba_storage + """ + + pattern = VBAInfoContainer.generate_pattern( + rec_len=VBAInfoContainer.RECORD_LENGTH) \ + VBAInfoAtom.generate_pattern( + rec_len=VBAInfoAtom.RECORD_LENGTH) stream = None try: log.debug('opening stream') stream = self.ole.openstream(MAIN_STREAM_NAME) # look for candidate positions - n_reads = 0 - candidates = [] - while True: - start_pos = stream.tell() - n_reads += 1 - #log.debug('read {} starting from {}' - # .format(BUF_SIZE, start_pos)) - buf = stream.read(BUF_SIZE) - idx = buf.find(pattern) - while idx != -1: - log.info('found pattern at index {}'.format(start_pos+idx)) - candidates.append(start_pos+idx) - idx = buf.find(pattern, idx+1) - - if len(buf) == BUF_SIZE: - # move back a bit to avoid splitting of pattern through buf - stream.seek(-1 * pattern_len, os.SEEK_CUR) - else: - log.debug('reached end of buf (read {}<{}) after {} reads' - .format(len(buf), BUF_SIZE, n_reads)) - break + candidates = self.search_pattern(pattern, stream) # try parse + containers = [] for idx in candidates: # assume that in stream at idx there is a VBAInfoContainer stream.seek(idx) @@ -1252,23 +1360,149 @@ class PptParser(object): log.info('persist id ref is {}, has_macros {}, version {}' .format(atom.persist_id_ref, atom.f_has_macros, atom.version)) + containers.append(container) for err in errs: log.warning('check_validity(VBAInfoContainer): {}' .format(err)) if errs and self.fast_fail: raise errs[0] + return containers + finally: if stream is not None: log.debug('closing stream') stream.close() + def search_vba_storage(self): + """ search through stream for VBAProjectStg, alternative to parse... + + quick-and-dirty: do not parse everything, just look for right bytes + + "quick" here means quick to program. 
Runtime now is linear in document + size (--> for big documents the other method might be faster) + + The storages found could also contain (instead of VBA data): ActiveX + data or general OLE data + + .. seealso:: :py:meth:`search_vba_info` + """ + + stream = None + try: + log.debug('opening stream') + stream = self.ole.openstream(MAIN_STREAM_NAME) + + storages = [] + for obj_type in (ExternalObjectStorageUncompressed, + ExternalObjectStorageCompressed): + # re-position stream at start + stream.seek(0, os.SEEK_SET) + + # look for candidate positions + pattern = obj_type.generate_pattern() + candidates = self.search_pattern(pattern, stream) + + # try parse + for idx in candidates: + # assume an ExternalObjectStorage in stream at idx + stream.seek(idx) + log.info('extracting at idx {}'.format(idx)) + try: + storage = obj_type.extract_from(stream) + except Exception: + self._log_exception() + continue + + errs = storage.check_validity() + if errs: + log.warning('check_validity found {} issues' + .format(len(errs))) + else: + log.info('storage is ok; compressed={}, size={}, ' + 'size_decomp={}' + .format(storage.compressed, + storage.rec_head.rec_len, + storage.uncompressed_size)) + storages.append(storage) + for err in errs: + log.warning('check_validity({}): {}' + .format(obj_type.__name__, err)) + if errs and self.fast_fail: + raise errs[0] + + return storages + + finally: + if stream is not None: + log.debug('closing stream') + stream.close() + + + def decompress_vba_storage(self, storage): + """ return decompressed data from search_vba_storage """ + + log.debug('decompressing storage for VBA OLE data stream ') + stream = None + try: + log.debug('opening stream') + stream = self.ole.openstream(MAIN_STREAM_NAME) + + # decompress iteratively; a zlib.decompress of all data + # failed with Error -5 (incomplete or truncated stream) + stream.seek(storage.data_offset, os.SEEK_SET) + decomp, n_read, err = \ + iterative_decompress(stream, storage.data_size) + 
log.info('decompressed {} to {} bytes, err is {}' + .format(n_read, len(decomp), err)) + if err and self.fast_fail: + raise err + # otherwise try to continue with partial data + + return decomp + + ## create OleFileIO from decompressed data + #ole = olefile.OleFileIO(decomp) + #root_streams = [entry[0].lower() for entry in ole.listdir()] + #for required in 'project', 'projectwm', 'vba': + # if required not in root_streams: + # raise ValueError('storage seems to not be a VBA storage ' + # '({} not found in root streams)' + # .format(required)) + #log.debug('tests succeeded') + #return ole + + finally: + if stream is not None: + log.debug('closing stream') + stream.close() + + +def iterative_decompress(stream, size, chunk_size=4096): + """ decompress data from stream chunk-wise """ + + decompressor = zlib.decompressobj() + n_read = 0 + decomp = '' + return_err = None + + try: + while n_read < size: + n_new = min(size-n_read, chunk_size) + decomp += decompressor.decompress(stream.read(n_new)) + n_read += n_new + except zlib.error as err: + return_err = err + + return decomp, n_read, return_err + # === TESTING ================================================================= def test(): """ for testing and debugging """ from glob import glob + from olevba import VBA_Parser # setup logging logging.basicConfig(level=logging.DEBUG, @@ -1280,9 +1514,32 @@ def test(): # parse log.info('-' * 72) log.info('test file: {}'.format(file_name)) - ppt = PptParser(file_name, fast_fail=False) - #ppt.parse_document_persist_object() - ppt.search_vba() + try: + ppt = PptParser(file_name, fast_fail=False) + #ppt.parse_document_persist_object() + n_infos = len(ppt.search_vba_info()) + storages = ppt.search_vba_storage() + n_storages = len(storages) + log.debug('found {} infos and {} storages'.format(n_infos, + n_storages)) + if n_infos != n_storages: + log.warning('found different number of vba infos and storages') + for storage in storages: + parser = VBA_Parser(None, 
ppt.decompress_vba_storage(storage), + container=file_name) + for vba_root, project_path, dir_path in parser.find_vba_projects(): + log.info('found vba project: root={}, proj={}, dir={}' + .format(vba_root, project_path, dir_path)) + for subfilename, stream_path, vba_filename, vba_code in \ + parser.extract_all_macros(): + log.info('found macro: subfile={}, stream={}, vbafile={}' + .format(subfilename, stream_path, vba_filename)) + for line in vba_code.splitlines(): + log.info('code: {}'.format(line.rstrip())) + + + except Exception: + log.exception('exception') if __name__ == '__main__': -- libgit2 0.21.4