Commit 64f57029292e6dd169925dc5a874a5ef25319cf9
1 parent
b23bfde7
olevba: added VBA Form strings extraction and analysis
Showing
1 changed file
with
102 additions
and
1 deletions
oletools/olevba.py
| @@ -163,8 +163,9 @@ https://github.com/unixfreak0037/officeparser | @@ -163,8 +163,9 @@ https://github.com/unixfreak0037/officeparser | ||
| 163 | # 2016-02-07 PL: - KeyboardInterrupt is now raised properly | 163 | # 2016-02-07 PL: - KeyboardInterrupt is now raised properly |
| 164 | # 2016-02-20 v0.43 PL: - fixed issue #34 in the VBA parser and vba_chr | 164 | # 2016-02-20 v0.43 PL: - fixed issue #34 in the VBA parser and vba_chr |
| 165 | # 2016-02-29 PL: - added Workbook_Activate to suspicious keywords | 165 | # 2016-02-29 PL: - added Workbook_Activate to suspicious keywords |
| 166 | +# 2016-03-08 v0.44 PL: - added VBA Form strings extraction and analysis | ||
| 166 | 167 | ||
| 167 | -__version__ = '0.43' | 168 | +__version__ = '0.44' |
| 168 | 169 | ||
| 169 | #------------------------------------------------------------------------------ | 170 | #------------------------------------------------------------------------------ |
| 170 | # TODO: | 171 | # TODO: |
| @@ -510,6 +511,9 @@ re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"') | @@ -510,6 +511,9 @@ re_dridex_string = re.compile(r'"[0-9A-Za-z]{20,}"') | ||
| 510 | # regex to check that it is not just a hex string: | 511 | # regex to check that it is not just a hex string: |
| 511 | re_nothex_check = re.compile(r'[G-Zg-z]') | 512 | re_nothex_check = re.compile(r'[G-Zg-z]') |
| 512 | 513 | ||
| 514 | +# regex to extract printable strings (at least 5 chars) from VBA Forms: | ||
| 515 | +re_printable_string = re.compile(r'[\t\r\n\x20-\xFF]{5,}') | ||
| 516 | + | ||
| 513 | 517 | ||
| 514 | # === PARTIAL VBA GRAMMAR ==================================================== | 518 | # === PARTIAL VBA GRAMMAR ==================================================== |
| 515 | 519 | ||
| @@ -1861,6 +1865,7 @@ class VBA_Parser(object): | @@ -1861,6 +1865,7 @@ class VBA_Parser(object): | ||
| 1861 | self.container = container | 1865 | self.container = container |
| 1862 | self.type = None | 1866 | self.type = None |
| 1863 | self.vba_projects = None | 1867 | self.vba_projects = None |
| 1868 | + self.vba_forms = None | ||
| 1864 | self.contains_macros = None # will be set to True or False by detect_macros | 1869 | self.contains_macros = None # will be set to True or False by detect_macros |
| 1865 | self.vba_code_all_modules = None # to store the source code of all modules | 1870 | self.vba_code_all_modules = None # to store the source code of all modules |
| 1866 | # list of tuples for each module: (subfilename, stream_path, vba_filename, vba_code) | 1871 | # list of tuples for each module: (subfilename, stream_path, vba_filename, vba_code) |
| @@ -2304,6 +2309,8 @@ class VBA_Parser(object): | @@ -2304,6 +2309,8 @@ class VBA_Parser(object): | ||
| 2304 | for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros(): | 2309 | for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros(): |
| 2305 | #TODO: filter code? (each module) | 2310 | #TODO: filter code? (each module) |
| 2306 | self.vba_code_all_modules += vba_code + '\n' | 2311 | self.vba_code_all_modules += vba_code + '\n' |
| 2312 | + for (subfilename, form_path, form_string) in self.extract_form_strings(): | ||
| 2313 | + self.vba_code_all_modules += form_string + '\n' | ||
| 2307 | # Analyze the whole code at once: | 2314 | # Analyze the whole code at once: |
| 2308 | scanner = VBA_Scanner(self.vba_code_all_modules) | 2315 | scanner = VBA_Scanner(self.vba_code_all_modules) |
| 2309 | self.analysis_results = scanner.scan(show_decoded_strings) | 2316 | self.analysis_results = scanner.scan(show_decoded_strings) |
| @@ -2338,6 +2345,95 @@ class VBA_Parser(object): | @@ -2338,6 +2345,95 @@ class VBA_Parser(object): | ||
| 2338 | #TODO: repasser l'analyse plusieurs fois si des chaines hex ou base64 sont revelees | 2345 | #TODO: repasser l'analyse plusieurs fois si des chaines hex ou base64 sont revelees |
| 2339 | 2346 | ||
| 2340 | 2347 | ||
| 2348 | + def find_vba_forms(self): | ||
| 2349 | + """ | ||
| 2350 | + Finds all the VBA forms stored in an OLE file. | ||
| 2351 | + | ||
| 2352 | + A form is detected as any OLE storage which directly contains both a | ||
| 2353 | + stream named "f" and a stream named "o" (see MS-OFORMS notes below). | ||
| 2354 | + | ||
| 2355 | + Return None if self.ole_file is None, i.e. the file is not OLE | ||
| 2356 | + (OpenXML or text file). | ||
| 2357 | + Otherwise return a list with one entry for each form found, which is | ||
| 2358 | + also stored in self.vba_forms. Each entry is the path of the form | ||
| 2359 | + storage as a list of names, as provided by ole.listdir(). | ||
| 2360 | + The list is empty if no form is found. | ||
| 2361 | + | ||
| 2362 | + :return: None if self.ole_file is None, else the list of the paths of | ||
| 2363 | + all form storages found (each path is a list of names) | ||
| 2364 | + :rtype: list or None | ||
| 2365 | + """ | ||
| 2366 | + log.debug('VBA_Parser.find_vba_forms') | ||
| 2367 | + # if the file is not OLE but OpenXML, return None: | ||
| 2368 | + if self.ole_file is None: | ||
| 2369 | + return None | ||
| 2370 | + | ||
| 2371 | + # NOTE: the result is not cached, self.vba_forms is rebuilt on every call. | ||
| 2372 | + # TODO: return self.vba_forms directly when it is not None (the disabled | ||
| 2373 | + # check that was here tested self.vba_projects, the wrong attribute). | ||
| 2374 | + | ||
| 2375 | + # According to MS-OFORMS section 2.1.2 Control Streams: | ||
| 2376 | + # - A parent control, that is, a control that can contain embedded controls, | ||
| 2377 | + # MUST be persisted as a storage that contains multiple streams. | ||
| 2378 | + # - All parent controls MUST contain a FormControl. The FormControl | ||
| 2379 | + # properties are persisted to a stream (1) as specified in section 2.1.1.2. | ||
| 2380 | + # The name of this stream (1) MUST be "f". | ||
| 2381 | + # - Embedded controls that cannot themselves contain other embedded | ||
| 2382 | + # controls are persisted sequentially as FormEmbeddedActiveXControls | ||
| 2383 | + # to a stream (1) contained in the same storage as the parent control. | ||
| 2384 | + # The name of this stream (1) MUST be "o". | ||
| 2385 | + # - all names are case-insensitive | ||
| 2386 | + | ||
| 2387 | + # start with an empty list: | ||
| 2388 | + self.vba_forms = [] | ||
| 2389 | + # Look for any storage containing those storage/streams: | ||
| 2390 | + ole = self.ole_file | ||
| 2391 | + for storage in ole.listdir(streams=False, storages=True): | ||
| 2392 | + log.debug('Checking storage %r' % storage) | ||
| 2393 | + # Look for two streams named 'o' and 'f': | ||
| 2394 | + o_stream = storage + ['o'] | ||
| 2395 | + f_stream = storage + ['f'] | ||
| 2396 | + log.debug('Checking if streams %r and %r exist' % (f_stream, o_stream)) | ||
| 2397 | + if ole.exists(o_stream) and ole.get_type(o_stream) == olefile.STGTY_STREAM \ | ||
| 2398 | + and ole.exists(f_stream) and ole.get_type(f_stream) == olefile.STGTY_STREAM: | ||
| 2399 | + form_path = '/'.join(storage) | ||
| 2400 | + log.debug('Found VBA Form: %r' % form_path) | ||
| 2401 | + self.vba_forms.append(storage) | ||
| 2402 | + return self.vba_forms | ||
| 2403 | + | ||
| 2404 | + def extract_form_strings(self): | ||
| 2405 | + """ | ||
| 2406 | + Extract printable strings from each VBA Form found in the file | ||
| 2407 | + | ||
| 2408 | + Iterator: yields (filename, stream_path, form_string) for each printable string found | ||
| 2409 | + in the "o" stream of a form (match of re_printable_string). Nothing is yielded for a text file. | ||
| 2410 | + filename is self.filename of the parser holding the OLE data: for OpenXML, results come | ||
| 2411 | + from the parsers in self.ole_subfiles. stream_path is the path of the "o" stream, joined with "/". | ||
| 2412 | + """ | ||
| 2413 | + if self.ole_file is None: | ||
| 2414 | + # This may be either an OpenXML or a text file: | ||
| 2415 | + if self.type == TYPE_TEXT: | ||
| 2416 | + # This is a text file, return no results: | ||
| 2417 | + return | ||
| 2418 | + else: | ||
| 2419 | + # OpenXML: recursively yield results from each OLE subfile: | ||
| 2420 | + for ole_subfile in self.ole_subfiles: | ||
| 2421 | + for results in ole_subfile.extract_form_strings(): | ||
| 2422 | + yield results | ||
| 2423 | + else: | ||
| 2424 | + # This is an OLE file: | ||
| 2425 | + self.find_vba_forms() | ||
| 2426 | + ole = self.ole_file | ||
| 2427 | + for form_storage in self.vba_forms: | ||
| 2428 | + o_stream = form_storage + ['o'] | ||
| 2429 | + log.debug('Opening form object stream %r' % '/'.join(o_stream)) | ||
| 2430 | + form_data = ole.openstream(o_stream).read() | ||
| 2431 | + # Extract printable strings from the form object stream "o": | ||
| 2432 | + for m in re_printable_string.finditer(form_data): | ||
| 2433 | + log.debug('Printable string found in form: %r' % m.group()) | ||
| 2434 | + yield (self.filename, '/'.join(o_stream), m.group()) | ||
| 2435 | + | ||
| 2436 | + | ||
| 2341 | def close(self): | 2437 | def close(self): |
| 2342 | """ | 2438 | """ |
| 2343 | Close all the open files. This method must be called after usage, if | 2439 | Close all the open files. This method must be called after usage, if |
| @@ -2463,6 +2559,11 @@ class VBA_Parser_CLI(VBA_Parser): | @@ -2463,6 +2559,11 @@ class VBA_Parser_CLI(VBA_Parser): | ||
| 2463 | print 'ANALYSIS:' | 2559 | print 'ANALYSIS:' |
| 2464 | # analyse each module's code, filtered to avoid false positives: | 2560 | # analyse each module's code, filtered to avoid false positives: |
| 2465 | self.print_analysis(show_decoded_strings) | 2561 | self.print_analysis(show_decoded_strings) |
| 2562 | + for (subfilename, stream_path, form_string) in self.extract_form_strings(): | ||
| 2563 | + print '-' * 79 | ||
| 2564 | + print 'VBA FORM STRING IN %r - OLE stream: %r' % (subfilename, stream_path) | ||
| 2565 | + print '- ' * 39 | ||
| 2566 | + print form_string | ||
| 2466 | if global_analysis and not vba_code_only: | 2567 | if global_analysis and not vba_code_only: |
| 2467 | # analyse the code from all modules at once: | 2568 | # analyse the code from all modules at once: |
| 2468 | self.print_analysis(show_decoded_strings) | 2569 | self.print_analysis(show_decoded_strings) |