Commit 20e6670e5587fff790b904623520d9fd2f513bdc

Authored by Philippe Lagadec
2 parents 4b202e66 2ff5c0b1

olevba: added JSON output (by Christian Herdtweck)

rtfobj: fixed logging output to use stdout instead of stderr
oletools/olevba.py
... ... @@ -164,8 +164,9 @@ https://github.com/unixfreak0037/officeparser
164 164 # 2016-02-20 v0.43 PL: - fixed issue #34 in the VBA parser and vba_chr
165 165 # 2016-02-29 PL: - added Workbook_Activate to suspicious keywords
166 166 # 2016-03-08 v0.44 PL: - added VBA Form strings extraction and analysis
  167 +# 2016-03-04 v0.45 CH: - added JSON output (by Christian Herdtweck)
167 168  
168   -__version__ = '0.44'
  169 +__version__ = '0.45'
169 170  
170 171 #------------------------------------------------------------------------------
171 172 # TODO:
... ... @@ -215,6 +216,7 @@ import traceback
215 216 import zlib
216 217 import email # for MHTML parsing
217 218 import string # for printable
  219 +import json # for json output mode (argument --json)
218 220  
219 221 # import lxml or ElementTree for XML parsing:
220 222 try:
... ... @@ -1655,6 +1657,42 @@ def detect_vba_strings(vba_code):
1655 1657 return results
1656 1658  
1657 1659  
def json2ascii(json_obj, encoding='utf8', errors='replace'):
    """
    Recursively make every string in a JSON-like structure safe to dump.

    Byte strings are decoded and re-encoded with `encoding`, so that invalid
    byte sequences are handled according to `errors`; unicode strings are
    encoded with `encoding`. Dicts and lists are updated in place and
    returned; tuples are immutable and therefore rebuilt. None, bool, int and
    float are returned unchanged, and so is any other type (after a debug log
    message). Dict keys are left untouched, only values are converted.

    :param json_obj: object to sanitize (None, bool, number, string, dict,
        list or tuple, possibly nested)
    :param encoding: str, codec used for decoding and encoding (default 'utf8')
    :param errors: str, codec error handler (default 'replace')
    :return: the sanitized object (the same object for dicts and lists)
    """
    if json_obj is None or isinstance(json_obj, (bool, int, float)):
        return json_obj

    if isinstance(json_obj, dict):
        # only values are reassigned, so iterating over the keys is safe
        for key in json_obj:
            json_obj[key] = json2ascii(json_obj[key], encoding, errors)
        return json_obj

    if isinstance(json_obj, list):
        # store the converted items back by index: rebinding the loop
        # variable alone would leave the list unchanged
        for index, item in enumerate(json_obj):
            json_obj[index] = json2ascii(item, encoding, errors)
        return json_obj

    if isinstance(json_obj, tuple):
        return tuple(json2ascii(item, encoding, errors) for item in json_obj)

    # `bytes` is the same type as `str` on Python 2.6+
    if isinstance(json_obj, bytes):
        dencoded = json_obj.decode(encoding, errors).encode(encoding, errors)
        # report only strings that were actually altered by the round trip
        if dencoded != json_obj:
            logging.info('json2ascii: replaced: %s (len %d)',
                         json_obj, len(json_obj))
            logging.info('json2ascii: with:     %s (len %d)',
                         dencoded, len(dencoded))
        return dencoded

    if isinstance(json_obj, unicode):
        encoded = json_obj.encode(encoding, errors)
        # the original unicode object cannot safely be passed to the logger,
        # so only the encoded form is reported
        logging.info('json2ascii: replaced: %s', encoded)
        return encoded

    logging.debug('unexpected type in json2ascii: %s -- leave as is',
                  type(json_obj))
    return json_obj
  1694 +
  1695 +
1658 1696 class VBA_Scanner(object):
1659 1697 """
1660 1698 Class to scan the source code of a VBA module to find obfuscated strings,
... ... @@ -2506,6 +2544,20 @@ class VBA_Parser_CLI(VBA_Parser):
2506 2544 else:
2507 2545 print 'No suspicious keyword or IOC found.'
2508 2546  
  2547 + def print_analysis_json(self, show_decoded_strings=False):
  2548 + """
  2549 + Analyze the provided VBA code, and return the results in json format
  2550 +
  2551 + :param vba_code: str, VBA source code to be analyzed
  2552 + :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
  2553 + :return: dict
  2554 + """
  2555 + # print a waiting message only if the output is not redirected to a file:
  2556 + if sys.stdout.isatty():
  2557 + print 'Analysis...\r',
  2558 + sys.stdout.flush()
  2559 + return [dict(type=kw_type, keyword=keyword, description=description)
  2560 + for kw_type, keyword, description in self.analyze_macros(show_decoded_strings)]
2509 2561  
2510 2562 def process_file(self, show_decoded_strings=False,
2511 2563 display_code=True, global_analysis=True, hide_attributes=True,
... ... @@ -2584,6 +2636,81 @@ class VBA_Parser_CLI(VBA_Parser):
2584 2636 print ''
2585 2637  
2586 2638  
  2639 + def process_file_json(self, show_decoded_strings=False,
  2640 + display_code=True, global_analysis=True, hide_attributes=True,
  2641 + vba_code_only=False, show_deobfuscated_code=False):
  2642 + """
  2643 + Process a single file
  2644 +
  2645 + every "show" or "print" here is to be translated as "add to json"
  2646 +
  2647 + :param filename: str, path and filename of file on disk, or within the container.
  2648 + :param data: bytes, content of the file if it is in a container, None if it is a file on disk.
  2649 + :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
  2650 + :param display_code: bool, if False VBA source code is not displayed (default True)
  2651 + :param global_analysis: bool, if True all modules are merged for a single analysis (default),
  2652 + otherwise each module is analyzed separately (old behaviour)
  2653 + :param hide_attributes: bool, if True the first lines starting with "Attribute VB" are hidden (default)
  2654 + """
  2655 + #TODO: fix conflicting parameters (?)
  2656 +
  2657 + if vba_code_only and not display_code:
  2658 + display_code = True
  2659 +
  2660 + result = {}
  2661 +
  2662 + if self.container:
  2663 + result['container'] = self.container
  2664 + else:
  2665 + result['container'] = None
  2666 + result['file'] = self.filename
  2667 + result['json_conversion_successful'] = False
  2668 + result['analysis'] = None
  2669 + result['code_deobfuscated'] = None
  2670 +
  2671 + try:
  2672 + #TODO: handle olefile errors, when an OLE file is malformed
  2673 + result['type'] = self.type
  2674 + macros = []
  2675 + if self.detect_vba_macros():
  2676 + for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros():
  2677 + curr_macro = {}
  2678 + if hide_attributes:
  2679 + # hide attribute lines:
  2680 + vba_code_filtered = filter_vba(vba_code)
  2681 + else:
  2682 + vba_code_filtered = vba_code
  2683 +
  2684 + curr_macro['vba_filename'] = vba_filename
  2685 + curr_macro['subfilename'] = subfilename
  2686 + curr_macro['ole_stream'] = stream_path
  2687 + if display_code:
  2688 + curr_macro['code'] = vba_code_filtered.strip()
  2689 + if not global_analysis and not vba_code_only:
  2690 + # analyse each module's code, filtered to avoid false positives:
  2691 + #TODO: remove this option
  2692 + curr_macro['analysis'] = self.print_analysis_json(show_decoded_strings)
  2693 + macros.append(curr_macro)
  2694 + if global_analysis and not vba_code_only:
  2695 + # analyse the code from all modules at once:
  2696 + result['analysis'] = self.print_analysis_json(show_decoded_strings)
  2697 + if show_deobfuscated_code:
  2698 + result['code_deobfuscated'] = self.reveal()
  2699 + result['macros'] = macros
  2700 + result['json_conversion_successful'] = True
  2701 + except KeyboardInterrupt:
  2702 + # do not ignore exceptions when the user presses Ctrl+C/Pause:
  2703 + raise
  2704 + except: #TypeError:
  2705 + #raise
  2706 + #TODO: print more info if debug mode
  2707 + #print sys.exc_value
  2708 + # display the exception with full stack trace for debugging, but do not stop:
  2709 + traceback.print_exc()
  2710 +
  2711 + return result
  2712 +
  2713 +
2587 2714 def process_file_triage(self, show_decoded_strings=False):
2588 2715 """
2589 2716 Process a file in triage mode, showing only summary results on one line.
... ... @@ -2682,10 +2809,19 @@ def main():
2682 2809 help='if the file is a zip archive, open all files from it, using the provided password (requires Python 2.6+)')
2683 2810 parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
2684 2811 help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
2685   - parser.add_option("-t", '--triage', action="store_true", dest="triage_mode",
2686   - help='triage mode, display results as a summary table (default for multiple files)')
2687   - parser.add_option("-d", '--detailed', action="store_true", dest="detailed_mode",
2688   - help='detailed mode, display full results (default for single file)')
  2812 + # output mode; could make this even simpler with add_option(type='choice') but that would make
  2813 + # cmd line interface incompatible...
  2814 + modes = optparse.OptionGroup(parser, title='Output mode (mutually exclusive)')
  2815 + modes.add_option("-t", '--triage', action="store_const", dest="output_mode",
  2816 + const='triage', default='unspecified',
  2817 + help='triage mode, display results as a summary table (default for multiple files)')
  2818 + modes.add_option("-d", '--detailed', action="store_const", dest="output_mode",
  2819 + const='detailed', default='unspecified',
  2820 + help='detailed mode, display full results (default for single file)')
  2821 + modes.add_option("-j", '--json', action="store_const", dest="output_mode",
  2822 + const='json', default='unspecified',
  2823 + help='json mode, detailed in json format (never default)')
  2824 + parser.add_option_group(modes)
2689 2825 parser.add_option("-a", '--analysis', action="store_false", dest="display_code", default=True,
2690 2826 help='display only analysis results, not the macro source code')
2691 2827 parser.add_option("-c", '--code', action="store_true", dest="vba_code_only", default=False,
... ... @@ -2715,8 +2851,13 @@ def main():
2715 2851 parser.print_help()
2716 2852 sys.exit()
2717 2853  
2718   - # print banner with version
2719   - print 'olevba %s - http://decalage.info/python/oletools' % __version__
  2854 + # provide info about tool and its version
  2855 + if options.output_mode == 'json':
  2856 + json_results = [dict(script_name='olevba', version=__version__,
  2857 + url='http://decalage.info/python/oletools',
  2858 + type='MetaInformation'), ]
  2859 + else:
  2860 + print 'olevba %s - http://decalage.info/python/oletools' % __version__
2720 2861  
2721 2862 logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s')
2722 2863 # enable logging in the modules:
... ... @@ -2735,8 +2876,9 @@ def main():
2735 2876 # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('Type', 'Macros', 'AutoEx', 'Susp.', 'IOCs', 'HexStr')
2736 2877 # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('-'*8, '-'*7, '-'*7, '-'*7, '-'*7, '-'*7)
2737 2878  
2738   - # Column headers (except if detailed mode)
2739   - if not options.detailed_mode or options.triage_mode:
  2879 + # Column headers (do not know how many files there will be yet, so if no output_mode
  2880 + # was specified, we will print triage for first file --> need these headers)
  2881 + if options.output_mode in ('triage', 'unspecified'):
2740 2882 print '%-12s %-65s' % ('Flags', 'Filename')
2741 2883 print '%-12s %-65s' % ('-' * 11, '-' * 65)
2742 2884  
... ... @@ -2751,13 +2893,13 @@ def main():
2751 2893 continue
2752 2894 # Open the file
2753 2895 vba_parser = VBA_Parser_CLI(filename, data=data, container=container)
2754   - if options.detailed_mode and not options.triage_mode:
  2896 + if options.output_mode == 'detailed':
2755 2897 # fully detailed output
2756 2898 vba_parser.process_file(show_decoded_strings=options.show_decoded_strings,
2757 2899 display_code=options.display_code, global_analysis=True, #options.global_analysis,
2758 2900 hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only,
2759 2901 show_deobfuscated_code=options.show_deobfuscated_code)
2760   - else:
  2902 + elif options.output_mode in ('triage', 'unspecified'):
2761 2903 # print container name when it changes:
2762 2904 if container != previous_container:
2763 2905 if container is not None:
... ... @@ -2765,19 +2907,58 @@ def main():
2765 2907 previous_container = container
2766 2908 # summarized output for triage:
2767 2909 vba_parser.process_file_triage(show_decoded_strings=options.show_decoded_strings)
  2910 + elif options.output_mode == 'json':
  2911 + json_results.append(
  2912 + vba_parser.process_file_json(show_decoded_strings=options.show_decoded_strings,
  2913 + display_code=options.display_code, global_analysis=True, #options.global_analysis,
  2914 + hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only,
  2915 + show_deobfuscated_code=options.show_deobfuscated_code))
  2916 + else: # (should be impossible)
  2917 + raise ValueError('unexpected output mode: "{0}"!'.format(options.output_mode))
2768 2918 count += 1
2769   - if not options.detailed_mode or options.triage_mode:
  2919 + if options.output_mode == 'triage':
2770 2920 print '\n(Flags: OpX=OpenXML, XML=Word2003XML, MHT=MHTML, TXT=Text, M=Macros, ' \
2771 2921 'A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, ' \
2772 2922 'B=Base64 strings, D=Dridex strings, V=VBA strings, ?=Unknown)\n'
2773 2923  
2774   - if count == 1 and not options.triage_mode and not options.detailed_mode:
2775   - # if options -t and -d were not specified and it's a single file, print details:
  2924 + if count == 1 and options.output_mode == 'unspecified':
  2925 + # if options -t, -d and -j were not specified and it's a single file, print details:
2776 2926 vba_parser.process_file(show_decoded_strings=options.show_decoded_strings,
2777 2927 display_code=options.display_code, global_analysis=True, #options.global_analysis,
2778 2928 hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only,
2779 2929 show_deobfuscated_code=options.show_deobfuscated_code)
2780 2930  
  2931 + if options.output_mode == 'json':
  2932 + json_options = dict(check_circular=False, indent=4, ensure_ascii=False)
  2933 +
  2934 + # json.dump[s] cannot deal with unicode objects that are not properly
  2935 + # encoded --> encode in own function:
  2936 + json_results = json2ascii(json_results)
  2937 + #print_json(json_results)
  2938 +
  2939 + if False: # options.outfile: # (option currently commented out)
  2940 + with open(outfile, 'w') as write_handle:
  2941 + json.dump(write_handle, **json_options)
  2942 + else:
  2943 + print json.dumps(json_results, **json_options)
  2944 +
  2945 +
  2946 +def print_json(j):
  2947 + if isinstance(j, dict):
  2948 + for key, val in j.items():
  2949 + print_json(key)
  2950 + print_json(val)
  2951 + elif isinstance(j, list):
  2952 + for elem in j:
  2953 + print_json(elem)
  2954 + else:
  2955 + try:
  2956 + if len(j) > 20:
  2957 + print type(j), repr(j[:20]), '...(len {0})'.format(len(j))
  2958 + else:
  2959 + print type(j), repr(j)
  2960 + except TypeError:
  2961 + print type(j), repr(j)
2781 2962  
2782 2963 if __name__ == '__main__':
2783 2964 main()
... ...
oletools/rtfobj.py
... ... @@ -15,7 +15,7 @@ http://www.decalage.info/python/oletools
15 15  
16 16 #=== LICENSE =================================================================
17 17  
18   -# rtfobj is copyright (c) 2012-2015, Philippe Lagadec (http://www.decalage.info)
  18 +# rtfobj is copyright (c) 2012-2016, Philippe Lagadec (http://www.decalage.info)
19 19 # All rights reserved.
20 20 #
21 21 # Redistribution and use in source and binary forms, with or without modification,
... ... @@ -46,8 +46,9 @@ http://www.decalage.info/python/oletools
46 46 # 2015-12-09 v0.03 PL: - configurable logging, CLI options
47 47 # - extract OLE 1.0 objects
48 48 # - extract files from OLE Package objects
  49 +# 2016-04-01 v0.04 PL: - fixed logging output to use stdout instead of stderr
49 50  
50   -__version__ = '0.03'
  51 +__version__ = '0.04'
51 52  
52 53 #------------------------------------------------------------------------------
53 54 # TODO:
... ... @@ -338,8 +339,11 @@ if __name__ == '__main__':
338 339 parser.print_help()
339 340 sys.exit()
340 341  
341   - # setup logging to the console
342   - logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s')
  342 + # Setup logging to the console:
  343 + # here we use stdout instead of stderr by default, so that the output
  344 + # can be redirected properly.
  345 + logging.basicConfig(level=LOG_LEVELS[options.loglevel], stream=sys.stdout,
  346 + format='%(levelname)-8s %(message)s')
343 347 # enable logging in the modules:
344 348 log.setLevel(logging.NOTSET)
345 349 oleobj.log.setLevel(logging.NOTSET)
... ...