Commit 64f57029292e6dd169925dc5a874a5ef25319cf9

Authored by Philippe Lagadec
1 parent b23bfde7

olevba: added VBA Form strings extraction and analysis

Showing 1 changed file with 102 additions and 1 deletions
oletools/olevba.py
... ... @@ -163,8 +163,9 @@ https://github.com/unixfreak0037/officeparser
163 163 # 2016-02-07 PL: - KeyboardInterrupt is now raised properly
164 164 # 2016-02-20 v0.43 PL: - fixed issue #34 in the VBA parser and vba_chr
165 165 # 2016-02-29 PL: - added Workbook_Activate to suspicious keywords
  166 +# 2016-03-08 v0.44 PL: - added VBA Form strings extraction and analysis
166 167  
167   -__version__ = '0.43'
  168 +__version__ = '0.44'
168 169  
169 170 #------------------------------------------------------------------------------
170 171 # TODO:
... ... @@ -510,6 +511,9 @@ re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"')
510 511 # regex to check that it is not just a hex string:
511 512 re_nothex_check = re.compile(r'[G-Zg-z]')
512 513  
  514 +# regex to extract printable strings (at least 5 chars) from VBA Forms:
  515 +re_printable_string = re.compile(r'[\t\r\n\x20-\xFF]{5,}')
  516 +
513 517  
514 518 # === PARTIAL VBA GRAMMAR ====================================================
515 519  
... ... @@ -1861,6 +1865,7 @@ class VBA_Parser(object):
1861 1865 self.container = container
1862 1866 self.type = None
1863 1867 self.vba_projects = None
  1868 + self.vba_forms = None
1864 1869 self.contains_macros = None # will be set to True or False by detect_macros
1865 1870 self.vba_code_all_modules = None # to store the source code of all modules
1866 1871 # list of tuples for each module: (subfilename, stream_path, vba_filename, vba_code)
... ... @@ -2304,6 +2309,8 @@ class VBA_Parser(object):
2304 2309 for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros():
2305 2310 #TODO: filter code? (each module)
2306 2311 self.vba_code_all_modules += vba_code + '\n'
  2312 + for (subfilename, form_path, form_string) in self.extract_form_strings():
  2313 + self.vba_code_all_modules += form_string + '\n'
2307 2314 # Analyze the whole code at once:
2308 2315 scanner = VBA_Scanner(self.vba_code_all_modules)
2309 2316 self.analysis_results = scanner.scan(show_decoded_strings)
... ... @@ -2338,6 +2345,95 @@ class VBA_Parser(object):
2338 2345 #TODO: repasser l'analyse plusieurs fois si des chaines hex ou base64 sont revelees
2339 2346  
2340 2347  
  2348 + def find_vba_forms(self):
  2349 + """
  2350 + Finds all the VBA forms stored in an OLE file.
  2351 +
  2352 + Return None if the file is not OLE but OpenXML.
  2353 + Return a list of tuples (vba_root, project_path, dir_path) for each VBA project.
  2354 + vba_root is the path of the root OLE storage containing the VBA project,
  2355 + including a trailing slash unless it is the root of the OLE file.
  2356 + project_path is the path of the OLE stream named "PROJECT" within the VBA project.
  2357 + dir_path is the path of the OLE stream named "VBA/dir" within the VBA project.
  2358 +
  2359 + If this function returns an empty list for one of the supported formats
  2360 + (i.e. Word, Excel, Powerpoint except Powerpoint 97-2003), then the
  2361 + file does not contain VBA macros.
  2362 +
  2363 + :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path)
  2364 + for each VBA project found if OLE file
  2365 + """
  2366 + log.debug('VBA_Parser.find_vba_forms')
  2367 + # if the file is not OLE but OpenXML, return None:
  2368 + if self.ole_file is None:
  2369 + return None
  2370 +
  2371 + # if this method has already been called, return previous result:
  2372 + # if self.vba_projects is not None:
  2373 + # return self.vba_projects
  2374 +
  2375 + # According to MS-OFORMS section 2.1.2 Control Streams:
  2376 + # - A parent control, that is, a control that can contain embedded controls,
  2377 + # MUST be persisted as a storage that contains multiple streams.
  2378 + # - All parent controls MUST contain a FormControl. The FormControl
  2379 + # properties are persisted to a stream (1) as specified in section 2.1.1.2.
  2380 + # The name of this stream (1) MUST be "f".
  2381 + # - Embedded controls that cannot themselves contain other embedded
  2382 + # controls are persisted sequentially as FormEmbeddedActiveXControls
  2383 + # to a stream (1) contained in the same storage as the parent control.
  2384 + # The name of this stream (1) MUST be "o".
  2385 + # - all names are case-insensitive
  2386 +
  2387 + # start with an empty list:
  2388 + self.vba_forms = []
  2389 + # Look for any storage containing those storage/streams:
  2390 + ole = self.ole_file
  2391 + for storage in ole.listdir(streams=False, storages=True):
  2392 + log.debug('Checking storage %r' % storage)
  2393 + # Look for two streams named 'o' and 'f':
  2394 + o_stream = storage + ['o']
  2395 + f_stream = storage + ['f']
  2396 + log.debug('Checking if streams %r and %r exist' % (f_stream, o_stream))
  2397 + if ole.exists(o_stream) and ole.get_type(o_stream) == olefile.STGTY_STREAM \
  2398 + and ole.exists(f_stream) and ole.get_type(f_stream) == olefile.STGTY_STREAM:
  2399 + form_path = '/'.join(storage)
  2400 + log.debug('Found VBA Form: %r' % form_path)
  2401 + self.vba_forms.append(storage)
  2402 + return self.vba_forms
  2403 +
  2404 + def extract_form_strings(self):
  2405 + """
  2406 + Extract printable strings from each VBA Form found in the file
  2407 +
  2408 + Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found
  2409 + If the file is OLE, filename is the path of the file.
  2410 + If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros
  2411 + within the zip archive, e.g. word/vbaProject.bin.
  2412 + """
  2413 + if self.ole_file is None:
  2414 + # This may be either an OpenXML or a text file:
  2415 + if self.type == TYPE_TEXT:
  2416 + # This is a text file, return no results:
  2417 + return
  2418 + else:
  2419 + # OpenXML: recursively yield results from each OLE subfile:
  2420 + for ole_subfile in self.ole_subfiles:
  2421 + for results in ole_subfile.extract_form_strings():
  2422 + yield results
  2423 + else:
  2424 + # This is an OLE file:
  2425 + self.find_vba_forms()
  2426 + ole = self.ole_file
  2427 + for form_storage in self.vba_forms:
  2428 + o_stream = form_storage + ['o']
  2429 + log.debug('Opening form object stream %r' % '/'.join(o_stream))
  2430 + form_data = ole.openstream(o_stream).read()
  2431 + # Extract printable strings from the form object stream "o":
  2432 + for m in re_printable_string.finditer(form_data):
  2433 + log.debug('Printable string found in form: %r' % m.group())
  2434 + yield (self.filename, '/'.join(o_stream), m.group())
  2435 +
  2436 +
2341 2437 def close(self):
2342 2438 """
2343 2439 Close all the open files. This method must be called after usage, if
... ... @@ -2463,6 +2559,11 @@ class VBA_Parser_CLI(VBA_Parser):
2463 2559 print 'ANALYSIS:'
2464 2560 # analyse each module's code, filtered to avoid false positives:
2465 2561 self.print_analysis(show_decoded_strings)
  2562 + for (subfilename, stream_path, form_string) in self.extract_form_strings():
  2563 + print '-' * 79
  2564 + print 'VBA FORM STRING IN %r - OLE stream: %r' % (subfilename, stream_path)
  2565 + print '- ' * 39
  2566 + print form_string
2466 2567 if global_analysis and not vba_code_only:
2467 2568 # analyse the code from all modules at once:
2468 2569 self.print_analysis(show_decoded_strings)
... ...