Commit b21c146505d6e6f83d2a2c0105a16da83791e142

Authored by Christian Herdtweck
1 parent 44737a1c

continue integration of ppt into olevba: works now!

Showing 1 changed file with 66 additions and 32 deletions
oletools/olevba.py
... ... @@ -241,6 +241,7 @@ import thirdparty.olefile as olefile
241 241 from thirdparty.prettytable import prettytable
242 242 from thirdparty.xglob import xglob
243 243 from thirdparty.pyparsing.pyparsing import *
  244 +import ppt_parser
244 245  
245 246 # monkeypatch email to fix issue #32:
246 247 # allow header lines without ":"
... ... @@ -1970,6 +1971,8 @@ class VBA_Parser(object):
1970 1971 if olefile.isOleFile(_file):
1971 1972 # This looks like an OLE file
1972 1973 self.open_ole(_file)
  1974 + # if this worked, check whether it is a ppt file (a special kind of ole file)
  1975 + self.open_ppt()
1973 1976 if self.type is None and zipfile.is_zipfile(_file):
1974 1977 # Zip file, which may be an OpenXML document
1975 1978 self.open_openxml(_file)
... ... @@ -2184,29 +2187,44 @@ class VBA_Parser(object):
2184 2187 % (self.filename, MSG_OLEVBA_ISSUES))
2185 2188 pass
2186 2189  
2187   - def open_ppt(self, ole):
2188   - """ try to interpret ole file as PowerPoint 97-2003 using PptParser """
  2190 + def open_ppt(self):
  2191 + """ try to interpret self.ole_file as PowerPoint 97-2003 using PptParser
  2192 +
  2193 + Although self.ole_file is a valid olefile.OleFileIO, we set
  2194 + self.ole_file = None in here and instead fill self.ole_subfiles with
  2195 + the VBA ole streams found within the main ole file. That makes most of
  2196 + the code below treat this like an OpenXML file and only look at the
  2197 + ole_subfiles (except the find_vba_* methods, which need to check
  2198 + self.type explicitly)
  2199 + """
  2200 + log.info('Check whether OLE file is PPT')
  2201 + ppt_parser.enable_logging()
2189 2202 try:
2190   - ppt_parser = ppt_parser.PptParser(ole)
2191   - n_infos = len(ppt_parser.search_vba_info())
2192   - storages = ppt_parser.search_vba_storage()
  2203 + ppt = ppt_parser.PptParser(self.ole_file, fast_fail=True)
  2204 + n_infos = len(ppt.search_vba_info())
  2205 + storages = ppt.search_vba_storage()
2193 2206 n_storages = len(storages)
2194 2207 log.debug('ppt: found {} infos and {} storages'.format(n_infos,
2195 2208 n_storages))
2196 2209 if n_infos != n_storages:
2197   - log.warning('ppt: found different number of vba infos and storages!')
  2210 + # probably, some storages are ActiveX or other OLE types
  2211 + log.warning('ppt: found different number of vba infos ({} and '
  2212 + 'storages ({}) --> subfiles might make trouble'
  2213 + .format(n_infos, n_storages))
2198 2214 for storage in storages:
2199 2215 if storage.is_compressed:
2200   - storage_decomp = self.ole_file.decompress_vba_storage(storage)
  2216 + storage_decomp = ppt.decompress_vba_storage(storage)
2201 2217 else:
2202 2218 log.warning('just guessing here: decompressed storage = storage?')
2203 2219 storage_decomp = storage.read_all() # not implemented yet
2204 2220 self.ole_subfiles.append(VBA_Parser(None, storage_decomp,
2205 2221 container='PptParser'))
  2222 + self.ole_file.close() # just in case
2206 2223 self.ole_file = None # required to make other methods look at ole_subfiles
2207 2224 self.type = TYPE_PPT
2208   - except Exception:
2209   - log.exception('Failed PPT parsing for file %r' % self.filename)
  2225 + except Exception as exc:
  2226 + log.debug("File appears not to be a ppt file (%s)")
  2227 + log.debug('Exception from opening attempt:', exc_info=True)
2210 2228  
2211 2229  
2212 2230 def open_text(self, data):
... ... @@ -2251,22 +2269,27 @@ class VBA_Parser(object):
2251 2269 """
2252 2270 log.debug('VBA_Parser.find_vba_projects')
2253 2271  
2254   - # if this is a ppt file (PowerPoint 97-2003):
2255   - # let ppt_parser do its job
2256   - if self.type == TYPE_PPT:
2257   - self.vba_projects = []
2258   - for subfile in self.ole_subfiles:
2259   - self.vba_projects.extend(subfile.find_vba_projects())
2260   - return self.vba_projects
2261   -
2262 2272 # if the file is not OLE but OpenXML, return None:
2263   - if self.ole_file is None:
  2273 + if self.ole_file is None and self.type != TYPE_PPT:
2264 2274 return None
2265 2275  
2266 2276 # if this method has already been called, return previous result:
2267 2277 if self.vba_projects is not None:
2268 2278 return self.vba_projects
2269 2279  
  2280 + # if this is a ppt file (PowerPoint 97-2003):
  2281 + # self.ole_file is None but the ole_subfiles do contain vba_projects
  2282 + # (like for OpenXML files).
  2283 + if self.type == TYPE_PPT:
  2284 + # TODO: so far, this function is never called for PPT files, but
  2285 + # if that happens, the information about which ole file contains
  2286 + # which storage is lost!
  2287 + log.warning('Returned info is not complete for PPT types!')
  2288 + self.vba_projects = []
  2289 + for subfile in self.ole_subfiles:
  2290 + self.vba_projects.extend(subfile.find_vba_projects())
  2291 + return self.vba_projects
  2292 +
2270 2293 # Find the VBA project root (different in MS Word, Excel, etc):
2271 2294 # - Word 97-2003: Macros
2272 2295 # - Excel 97-2003: _VBA_PROJECT_CUR
... ... @@ -2475,7 +2498,7 @@ class VBA_Parser(object):
2475 2498 """
2476 2499 log.debug('VBA_Parser.find_vba_forms')
2477 2500 # if the file is not OLE but OpenXML, return None:
2478   - if self.ole_file is None:
  2501 + if self.ole_file is None and self.type != TYPE_PPT:
2479 2502 return None
2480 2503  
2481 2504 # if this method has already been called, return previous result:
... ... @@ -2494,21 +2517,32 @@ class VBA_Parser(object):
2494 2517 # The name of this stream (1) MUST be "o".
2495 2518 # - all names are case-insensitive
2496 2519  
  2520 + if self.type == TYPE_PPT:
  2521 + # TODO: so far, this function is never called for PPT files, but
  2522 + # if that happens, the information about which ole file contains
  2523 + # which storage is lost!
  2524 + ole_files = self.ole_subfiles
  2525 + log.warning('Returned info is not complete for PPT types!')
  2526 + else:
  2527 + ole_files = [self.ole_file, ]
  2528 +
2497 2529 # start with an empty list:
2498 2530 self.vba_forms = []
2499   - # Look for any storage containing those storage/streams:
2500   - ole = self.ole_file
2501   - for storage in ole.listdir(streams=False, storages=True):
2502   - log.debug('Checking storage %r' % storage)
2503   - # Look for two streams named 'o' and 'f':
2504   - o_stream = storage + ['o']
2505   - f_stream = storage + ['f']
2506   - log.debug('Checking if streams %r and %r exist' % (f_stream, o_stream))
2507   - if ole.exists(o_stream) and ole.get_type(o_stream) == olefile.STGTY_STREAM \
2508   - and ole.exists(f_stream) and ole.get_type(f_stream) == olefile.STGTY_STREAM:
2509   - form_path = '/'.join(storage)
2510   - log.debug('Found VBA Form: %r' % form_path)
2511   - self.vba_forms.append(storage)
  2531 +
  2532 + # Loop over ole streams
  2533 + for ole in ole_files:
  2534 + # Look for any storage containing those storage/streams:
  2535 + for storage in ole.listdir(streams=False, storages=True):
  2536 + log.debug('Checking storage %r' % storage)
  2537 + # Look for two streams named 'o' and 'f':
  2538 + o_stream = storage + ['o']
  2539 + f_stream = storage + ['f']
  2540 + log.debug('Checking if streams %r and %r exist' % (f_stream, o_stream))
  2541 + if ole.exists(o_stream) and ole.get_type(o_stream) == olefile.STGTY_STREAM \
  2542 + and ole.exists(f_stream) and ole.get_type(f_stream) == olefile.STGTY_STREAM:
  2543 + form_path = '/'.join(storage)
  2544 + log.debug('Found VBA Form: %r' % form_path)
  2545 + self.vba_forms.append(storage)
2512 2546 return self.vba_forms
2513 2547  
2514 2548 def extract_form_strings(self):
... ...