diff --git a/oletools/olevba.py b/oletools/olevba.py new file mode 100644 index 0000000..de6c14d --- /dev/null +++ b/oletools/olevba.py @@ -0,0 +1,658 @@ +#!/usr/bin/env python +""" +olevba.py v0.02 2014-08-15 + +olevba is a script to parse OLE files such as MS Office documents (e.g. Word, +Excel), to extract VBA Macro code in clear text. + +olevba project website: http://www.decalage.info/python/olevba + +olevba is part of the python-oletools package: +http://www.decalage.info/python/oletools + +Usage: olevba.py +""" + +__version__ = '0.02' + +#=== LICENSE ================================================================== + +# olevba is copyright (c) 2014 Philippe Lagadec (http://www.decalage.info) +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +# olevba contains modified source code from the officeparser project, published +# under the following MIT License (MIT): +# +# officeparser is copyright (c) 2014 John William Davison +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
def copytoken_help(decompressed_current, decompressed_chunk_start):
    """
    Compute the bit masks needed to decode a CopyToken, according to
    MS-OVBA 2.4.1.3.19.1 CopyToken Help.

    decompressed_current: number of decompressed bytes so far, i.e. len(decompressed_container)
    decompressed_chunk_start: offset of the current chunk in the decompressed container
    return length_mask, offset_mask, bit_count, maximum_length
    raise ValueError if no byte of the current chunk has been decompressed yet
        (a CopyToken has nothing to refer to in that case)
    """
    difference = decompressed_current - decompressed_chunk_start
    if difference < 1:
        raise ValueError('CopyToken found before any decompressed byte in the current chunk')
    # bit_count = max(ceil(log2(difference)), 4), computed with integers only so
    # that the result never depends on floating-point rounding of math.log()
    bit_count = 4
    while (1 << bit_count) < difference:
        bit_count += 1
    length_mask = 0xFFFF >> bit_count
    # NOTE: offset_mask is a negative number; ANDing it with a 16-bit token still
    # keeps exactly the bit_count high-order bits of that token
    offset_mask = ~length_mask
    maximum_length = (0xFFFF >> bit_count) + 3
    return length_mask, offset_mask, bit_count, maximum_length


def decompress_stream(compressed_container):
    """
    Decompress a stream according to MS-OVBA section 2.4.1

    compressed_container: string (bytes) compressed according to the
        MS-OVBA 2.4.1.3.6 Compression algorithm
    return the decompressed container as a string (bytes)
    raise ValueError if the container is empty, if its signature byte or a chunk
        header is invalid, or if a chunk header or CopyToken is truncated or
        refers to data outside of the current decompressed chunk
    """
    # State variables (MS-OVBA 2.4.1.2), as named in the specification:
    # - compressed_current: location of the next byte to read in the CompressedContainer
    # - compressed_chunk_start: location of the first byte of the current CompressedChunk
    # - compressed_end: location of the byte after the last byte of the current chunk
    # - decompressed_chunk_start: location of the first byte of the current
    #   DecompressedChunk in the decompressed container
    # NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that
    # CompressedRecordEnd = len(compressed_container)

    # a bytearray yields integers when indexed, on Python 2 as well as Python 3
    data = bytearray(compressed_container)
    compressed_length = len(data)
    if compressed_length == 0:
        raise ValueError('empty compressed container')

    sig_byte = data[0]
    if sig_byte != 0x01:
        raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))

    # result: a mutable buffer, so that appending one byte is cheap
    decompressed_container = bytearray()
    compressed_current = 1

    while compressed_current < compressed_length:
        # 2.4.1.1.5
        compressed_chunk_start = compressed_current
        if compressed_chunk_start + 2 > compressed_length:
            raise ValueError('truncated CompressedChunkHeader in VBA compressed stream')
        # chunk header = first 16 bits, little endian
        compressed_chunk_header = data[compressed_chunk_start] | (data[compressed_chunk_start + 1] << 8)
        # chunk size = 12 first bits of header + 3
        chunk_size = (compressed_chunk_header & 0x0FFF) + 3
        # chunk signature = 3 next bits - should always be 0b011
        chunk_signature = (compressed_chunk_header >> 12) & 0x07
        if chunk_signature != 0b011:
            raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream')
        # chunk flag = next bit - 1 == compressed, 0 == uncompressed
        chunk_flag = (compressed_chunk_header >> 15) & 0x01
        logging.debug("chunk size = {0}, compressed flag = {1}".format(chunk_size, chunk_flag))

        # MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is
        # 4098 bytes (header 2 + data 4096). The minimum size is 3 bytes.
        # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098,
        # not 4095 (which is the max value in chunk header before adding 3).
        # Also the first test is not useful since a 12 bits value cannot be larger
        # than 4095.
        if chunk_flag == 1 and chunk_size > 4098:
            raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1')
        if chunk_flag == 0 and chunk_size != 4098:
            raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0')

        # warn if chunk_size goes beyond the compressed data, instead of silently
        # cutting it; whatever data is available is still decoded (best effort)
        if compressed_chunk_start + chunk_size > compressed_length:
            logging.warning('Chunk size is larger than remaining compressed data')
        compressed_end = min(compressed_length, compressed_chunk_start + chunk_size)
        # read after chunk header:
        compressed_current = compressed_chunk_start + 2

        if chunk_flag == 0:
            # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
            # uncompressed chunk: copy the next 4096 bytes as-is (fewer if the
            # container is truncated, see the warning above)
            decompressed_container += data[compressed_current:compressed_current + 4096]
            compressed_current += 4096
            continue

        # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
        decompressed_chunk_start = len(decompressed_container)
        while compressed_current < compressed_end:
            # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence
            # FlagByte: 8 bits indicating if the following 8 tokens are either
            # literal (1 byte of plain text) or copy tokens (reference to
            # previously decompressed bytes)
            flag_byte = data[compressed_current]
            compressed_current += 1
            for bit_index in range(8):
                if compressed_current >= compressed_end:
                    break
                # MS-OVBA 2.4.1.3.5 Decompressing a Token
                # MS-OVBA 2.4.1.3.17 Extract FlagBit
                flag_bit = (flag_byte >> bit_index) & 1
                if flag_bit == 0:
                    # LiteralToken: copy one byte directly to output
                    decompressed_container.append(data[compressed_current])
                    compressed_current += 1
                    continue
                # CopyToken: 16 bits, little endian, must fit inside the chunk
                if compressed_current + 2 > compressed_end:
                    raise ValueError('truncated CopyToken in VBA compressed stream')
                # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
                copy_token = data[compressed_current] | (data[compressed_current + 1] << 8)
                length_mask, offset_mask, bit_count, maximum_length = copytoken_help(
                    len(decompressed_container), decompressed_chunk_start)
                length = (copy_token & length_mask) + 3
                temp1 = copy_token & offset_mask
                temp2 = 16 - bit_count
                offset = (temp1 >> temp2) + 1
                copy_source = len(decompressed_container) - offset
                # a negative index would silently wrap around to the end of the
                # buffer, so an out-of-chunk reference has to be rejected here
                if copy_source < decompressed_chunk_start:
                    raise ValueError('CopyToken offset points before the start of the decompressed chunk')
                # byte by byte on purpose: source and destination may overlap
                # (offset < length), which repeats the bytes just written
                for index in range(copy_source, copy_source + length):
                    decompressed_container.append(decompressed_container[index])
                compressed_current += 2
    return bytes(decompressed_container)
code_modules[value] = CLASS_EXTENSION + elif name == 'BaseClass': + code_modules[value] = FORM_EXTENSION + + # Find the dir stream + dir_path = vba_root + '/VBA/dir' + if not ole.exists(dir_path): + logging.debug('missing dir stream') + return None + # read data from dir stream (compressed) + dir_compressed = ole.openstream(dir_path).read() + + def check_value(name, expected, value): + if expected != value: + logging.error("invalid value for {0} expected {1:04X} got {2:04X}".format(name, expected, value)) + + dir_stream = cStringIO.StringIO(decompress_stream(dir_compressed)) + + # PROJECTSYSKIND Record + PROJECTSYSKIND_Id = struct.unpack(" 128: + logging.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(PROJECTNAME_SizeOfProjectName)) + PROJECTNAME_ProjectName = dir_stream.read(PROJECTNAME_SizeOfProjectName) + + # PROJECTDOCSTRING Record + PROJECTDOCSTRING_Id = struct.unpack(" 2000: + logging.error("PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(PROJECTDOCSTRING_SizeOfDocString)) + PROJECTDOCSTRING_DocString = dir_stream.read(PROJECTDOCSTRING_SizeOfDocString) + PROJECTDOCSTRING_Reserved = struct.unpack(" 260: + logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(PROJECTHELPFILEPATH_SizeOfHelpFile1)) + PROJECTHELPFILEPATH_HelpFile1 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile1) + PROJECTHELPFILEPATH_Reserved = struct.unpack(" 1015: + logging.error("PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(PROJECTCONSTANTS_SizeOfConstants)) + PROJECTCONSTANTS_Constants = dir_stream.read(PROJECTCONSTANTS_SizeOfConstants) + PROJECTCONSTANTS_Reserved = struct.unpack(" 0: + code_data = decompress_stream(code_data) + filext = code_modules[MODULENAME_ModuleName] + filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext) + #TODO: return list of strings or dict instead of printing + print '-'*79 + print filename + print '' + print code_data + print '' + logging.debug('extracted file 
#=== MAIN =====================================================================

if __name__ == '__main__':

    # a file name is required: without it, show the module usage text and exit
    # with a non-zero status
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)

    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING)

    ole = OleFileIO_PL.OleFileIO(sys.argv[1])
    try:
        extract_macros(ole)
    finally:
        # release the file even when extraction fails on a malformed document
        ole.close()