From 7374be1e6f51334f235d58c21e072069750c054b Mon Sep 17 00:00:00 2001 From: decalage2 Date: Thu, 13 Dec 2018 11:26:53 +0100 Subject: [PATCH] olevba: added class VBA_Project --- oletools/olevba.py | 985 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 1 file changed, 531 insertions(+), 454 deletions(-) diff --git a/oletools/olevba.py b/oletools/olevba.py index 89bc895..6792353 100644 --- a/oletools/olevba.py +++ b/oletools/olevba.py @@ -262,6 +262,7 @@ import zlib import email # for MHTML parsing import string # for printable import json # for json output mode (argument --json) +import codecs # import lxml or ElementTree for XML parsing: try: @@ -1337,6 +1338,525 @@ def decompress_stream(compressed_container): return bytes(decompressed_container) +class VBA_Project(object): + """ + Class to parse a VBA project from an OLE file, and to store all the corresponding + metadata and VBA modules. + """ + + def __init__(self, ole, vba_root, project_path, dir_path, relaxed=False): + """ + Extract VBA macros from an OleFileIO object. + + :param vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream + :param vba_project: path to the PROJECT stream + :param relaxed: If True, only create info/debug log entry if data is not as expected + (e.g. opening substream fails); if False, raise an error in this case + """ + self.ole = ole + self.vba_root = vba_root + self. project_path = project_path + self.dir_path = dir_path + self.relaxed = relaxed + log.debug('Parsing the dir stream from %r' % dir_path) + # read data from dir stream (compressed) + dir_compressed = ole.openstream(dir_path).read() + # decompress it: + dir_stream = BytesIO(decompress_stream(bytearray(dir_compressed))) + # store reference for later use: + self.dir_stream = dir_stream + + # reference: MS-VBAL 2.3.4.2 dir Stream: Version Independent Project Information + + # PROJECTSYSKIND Record + # Specifies the platform for which the VBA project is created. + projectsyskind_id = struct.unpack(" 128: + # TODO: raise an actual error? What is MS Office's behaviour? + log.error("PROJECTNAME_SizeOfProjectName value not in range [1-128]: {0}".format(sizeof_projectname)) + projectname_bytes = dir_stream.read(sizeof_projectname) + self.projectname = self.decode_bytes(projectname_bytes) + + + # PROJECTDOCSTRING Record + # Specifies the description for the VBA project. + projectdocstring_id = struct.unpack(" 2000: + log.error( + "PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(projectdocstring_sizeof_docstring)) + # DocString (variable): An array of SizeOfDocString bytes that specifies the description for the VBA project. + # MUST contain MBCS characters encoded using the code page specified in PROJECTCODEPAGE (section 2.3.4.2.1.4). + # MUST NOT contain null characters. + docstring_bytes = dir_stream.read(projectdocstring_sizeof_docstring) + self.docstring = self.decode_bytes(docstring_bytes) + projectdocstring_reserved = struct.unpack(" 260: + log.error( + "PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(projecthelpfilepath_sizeof_helpfile1)) + projecthelpfilepath_helpfile1 = dir_stream.read(projecthelpfilepath_sizeof_helpfile1) + projecthelpfilepath_reserved = struct.unpack(" 1015: + log.error( + "PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(projectconstants_sizeof_constants)) + projectconstants_constants = dir_stream.read(projectconstants_sizeof_constants) + projectconstants_reserved = struct.unpack(" 0: + code_data = decompress_stream(bytearray(code_data)) + # case-insensitive search in the code_modules dict to find the file extension: + # filext = code_modules.get(modulename_modulename.lower(), 'bin') + filext = 'vba' + filename = '{0}.{1}'.format(modulename_modulename, filext) + #TODO: also yield the codepage so that callers can decode it properly + yield (code_path, filename, code_data) + # print '-'*79 + # print filename + # print '' + # print code_data + # print '' + log.debug('extracted file {0}'.format(filename)) + else: + log.warning("module stream {0} has code data length 0".format(modulestreamname_streamname)) + except (UnexpectedDataError, SubstreamOpenError): + raise + except Exception as exc: + log.info('Error parsing module {0} of {1} in _extract_vba:' + .format(projectmodule_index, projectmodules_count), + exc_info=True) + if not self.relaxed: + raise + _ = unused # make pylint happy: now variable "unused" is being used ;-) + return + + def decode_bytes(self, bytes_string, errors='replace'): + """ + Decode a bytes string to a unicode string, using the project code page + :param bytes_string: bytes, bytes string to be decoded + :param errors: str, mode to handle unicode conversion errors + :return: str/unicode, decoded string + """ + return bytes_string.decode(self.codec, errors=errors) + + + def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): """ Extract VBA macros from an OleFileIO object. @@ -1348,10 +1868,15 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): (e.g. opening substream fails); if False, raise an error in this case This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream """ - # Open the PROJECT stream: - project = ole.openstream(project_path) log.debug('relaxed is %s' % relaxed) + project = VBA_Project(ole, vba_root, project_path, dir_path, relaxed=False) + + # Open the PROJECT stream: + # reference: [MS-OVBA] 2.3.1 PROJECT Stream + # TODO: in fact the PROJECT stream is encoded using the code page specified in the dir stream, should be read afterwards + project_stream = ole.openstream(project_path) + # sample content of the PROJECT stream: ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}" @@ -1374,7 +1899,8 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): code_modules = {} - for line in project: + for line in project_stream: + line = project.decode_bytes(line) log.debug('PROJECT: %r' % line) line = line.strip() if '=' in line: @@ -1396,457 +1922,8 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): elif name == 'BaseClass': code_modules[value] = FORM_EXTENSION - # read data from dir stream (compressed) - dir_compressed = ole.openstream(dir_path).read() - - def check_value(name, expected, value): - if expected != value: - if relaxed: - log.error("invalid value for {0} expected {1:04X} got {2:04X}" - .format(name, expected, value)) - else: - raise UnexpectedDataError(dir_path, name, expected, value) - - dir_stream = BytesIO(decompress_stream(bytearray(dir_compressed))) - - # PROJECTSYSKIND Record - projectsyskind_id = struct.unpack(" 128: - log.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(projectname_sizeof_projectname)) - projectname_projectname = dir_stream.read(projectname_sizeof_projectname) - unused = projectname_projectname - - # PROJECTDOCSTRING Record - projectdocstring_id = struct.unpack(" 2000: - log.error( - "PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(projectdocstring_sizeof_docstring)) - projectdocstring_docstring = dir_stream.read(projectdocstring_sizeof_docstring) - projectdocstring_reserved = struct.unpack(" 260: - log.error( - "PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(projecthelpfilepath_sizeof_helpfile1)) - projecthelpfilepath_helpfile1 = dir_stream.read(projecthelpfilepath_sizeof_helpfile1) - projecthelpfilepath_reserved = struct.unpack(" 1015: - log.error( - "PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(projectconstants_sizeof_constants)) - projectconstants_constants = dir_stream.read(projectconstants_sizeof_constants) - projectconstants_reserved = struct.unpack(" 0: - code_data = decompress_stream(bytearray(code_data)) - # case-insensitive search in the code_modules dict to find the file extension: - filext = code_modules.get(modulename_modulename.lower(), 'bin') - filename = '{0}.{1}'.format(modulename_modulename, filext) - #TODO: also yield the codepage so that callers can decode it properly - yield (code_path, filename, code_data) - # print '-'*79 - # print filename - # print '' - # print code_data - # print '' - log.debug('extracted file {0}'.format(filename)) - else: - log.warning("module stream {0} has code data length 0".format(modulestreamname_streamname)) - except (UnexpectedDataError, SubstreamOpenError): - raise - except Exception as exc: - log.info('Error parsing module {0} of {1} in _extract_vba:' - .format(projectmodule_index, projectmodules_count), - exc_info=True) - if not relaxed: - raise - _ = unused # make pylint happy: now variable "unused" is being used ;-) - return + for code_path, filename, code_data in project.parse_modules(): + yield (code_path, filename, code_data) def vba_collapse_long_lines(vba_code): -- libgit2 0.21.4