Commit 5a23b490f898cc6a7901f6d3ad3449c2505f6850

Authored by Philippe Lagadec
1 parent ac2f7443

added olevba: a new tool to extract VBA macro code from MS Office documents

Showing 1 changed file with 658 additions and 0 deletions
oletools/olevba.py 0 → 100644
  1 +#!/usr/bin/env python
  2 +"""
  3 +olevba.py v0.02 2014-08-15
  4 +
  5 +olevba is a script to parse OLE files such as MS Office documents (e.g. Word,
  6 +Excel), to extract VBA Macro code in clear text.
  7 +
  8 +olevba project website: http://www.decalage.info/python/olevba
  9 +
  10 +olevba is part of the python-oletools package:
  11 +http://www.decalage.info/python/oletools
  12 +
  13 +Usage: olevba.py <file>
  14 +"""
  15 +
  16 +__version__ = '0.02'
  17 +
  18 +#=== LICENSE ==================================================================
  19 +
  20 +# olevba is copyright (c) 2014 Philippe Lagadec (http://www.decalage.info)
  21 +# All rights reserved.
  22 +#
  23 +# Redistribution and use in source and binary forms, with or without modification,
  24 +# are permitted provided that the following conditions are met:
  25 +#
  26 +# * Redistributions of source code must retain the above copyright notice, this
  27 +# list of conditions and the following disclaimer.
  28 +# * Redistributions in binary form must reproduce the above copyright notice,
  29 +# this list of conditions and the following disclaimer in the documentation
  30 +# and/or other materials provided with the distribution.
  31 +#
  32 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  33 +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  34 +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  35 +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  36 +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  37 +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  38 +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  39 +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  40 +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  41 +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  42 +
  43 +
  44 +# olevba contains modified source code from the officeparser project, published
  45 +# under the following MIT License (MIT):
  46 +#
  47 +# officeparser is copyright (c) 2014 John William Davison
  48 +#
  49 +# Permission is hereby granted, free of charge, to any person obtaining a copy
  50 +# of this software and associated documentation files (the "Software"), to deal
  51 +# in the Software without restriction, including without limitation the rights
  52 +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  53 +# copies of the Software, and to permit persons to whom the Software is
  54 +# furnished to do so, subject to the following conditions:
  55 +#
  56 +# The above copyright notice and this permission notice shall be included in all
  57 +# copies or substantial portions of the Software.
  58 +#
  59 +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  60 +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  61 +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  62 +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  63 +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  64 +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  65 +# SOFTWARE.
  66 +
  67 +#------------------------------------------------------------------------------
  68 +# CHANGELOG:
  69 +# 2014-08-05 v0.01 PL: - first version based on officeparser code
  70 +# 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser
  71 +# 2014-08-15 PL: - fixed incorrect value check in PROJECTHELPFILEPATH Record
  72 +
  73 +#------------------------------------------------------------------------------
  74 +# TODO:
  75 +# + optparse
  76 +# + nicer output
  77 +# + output to file
  78 +# + setup logging (common with other oletools)
  79 +# + support OpenXML files
  80 +# + process several files in dirs or zips with password
  81 +# + look for VBA in embedded documents (e.g. Excel in Word)
  82 +# - python 3.x support
  83 +# - add support for PowerPoint macros (see libclamav, libgsf)
  84 +# - check VBA macros in Visio, Access, Project, etc
  85 +# - extract_macros: convert to a class, split long function into smaller methods
  86 +# - extract_macros: read bytes from stream file objects instead of strings
  87 +# - extract_macros: use combined struct.unpack instead of many calls
  88 +
  89 +#------------------------------------------------------------------------------
  90 +# REFERENCES:
  91 +# - [MS-OVBA]: Microsoft Office VBA File Format Structure
  92 +# http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx
  93 +# - officeparser: https://github.com/unixfreak0037/officeparser
  94 +
  95 +
  96 +#--- IMPORTS ------------------------------------------------------------------
  97 +
  98 +import sys, logging
  99 +import struct
  100 +import cStringIO
  101 +import math
  102 +
  103 +from thirdparty.OleFileIO_PL import OleFileIO_PL
  104 +
  105 +#--- CONSTANTS ----------------------------------------------------------------
  106 +
# File extensions associated with each kind of VBA code module declared in the
# PROJECT stream (see extract_macros):
# - "Module=" entries are standard modules
MODULE_EXTENSION = "bas"
# - "Document=" and "Class=" entries are class modules
CLASS_EXTENSION = "cls"
# - "BaseClass=" entries are forms
FORM_EXTENSION = "frm"

# NOTE(review): presumably the location of the VBA project storage inside an
# OpenXML (zip) Excel file - confirm against the code that uses it.
BINFILE_PATH = "xl/vbaProject.bin"
  112 +
  113 +
  114 +#--- FUNCTIONS ----------------------------------------------------------------
  115 +
def copytoken_help(decompressed_current, decompressed_chunk_start):
    """
    Compute the bit masks needed to decode a CopyToken, according to
    MS-OVBA 2.4.1.3.19.1 CopyToken Help.

    decompressed_current: number of decompressed bytes so far, i.e. len(decompressed_container)
    decompressed_chunk_start: offset of the current chunk in the decompressed container
    return length_mask, offset_mask, bit_count, maximum_length
    raise ValueError if decompressed_current is not strictly greater than
        decompressed_chunk_start: nothing has been decompressed in the current
        chunk yet, so a CopyToken has no data to refer to.
    """
    difference = decompressed_current - decompressed_chunk_start
    if difference < 1:
        # math.log() used to fail here with an opaque "math domain error"
        raise ValueError(
            'invalid CopyToken: no decompressed data in the current chunk '
            '(decompressed_current={0}, decompressed_chunk_start={1})'.format(
                decompressed_current, decompressed_chunk_start))
    # bit_count = smallest integer >= log2(difference), with a minimum of 4.
    # Computed with integer arithmetic only, to avoid floating point rounding
    # issues with ceil(log(x, 2)).
    bit_count = 4
    while (1 << bit_count) < difference:
        bit_count += 1
    # the low (16 - bit_count) bits of a CopyToken encode the length:
    length_mask = 0xFFFF >> bit_count
    # the remaining high bits encode the offset.
    # NOTE: ~length_mask is a negative int in Python; ANDing it with a 16-bit
    # CopyToken still selects exactly the high bit_count bits of the token.
    offset_mask = ~length_mask
    maximum_length = length_mask + 3
    return length_mask, offset_mask, bit_count, maximum_length
  131 +
  132 +
def decompress_stream(compressed_container):
    """
    Decompress a stream according to MS-OVBA section 2.4.1

    compressed_container: string (bytes) compressed according to the
        MS-OVBA 2.4.1.3.6 Compression algorithm
    return the decompressed container as a string (bytes)
    raise ValueError if the container is empty, if its signature byte or a
        chunk header is invalid, if it is truncated in the middle of a chunk
        header or of a CopyToken, or if a CopyToken refers to data located
        before the start of the decompressed container.
    """
    # The local names follow the state variables of MS-OVBA 2.4.1.2:
    # - compressed_current: location of the next byte to be read in the
    #   CompressedContainer (section 2.4.1.1.1)
    # - compressed_chunk_start: location of the first byte of the current
    #   CompressedChunk (section 2.4.1.1.4) within the CompressedContainer
    # - compressed_end: location of the byte after the last byte of the
    #   current CompressedChunk
    # - decompressed_chunk_start: location of the first byte of the current
    #   DecompressedChunk (section 2.4.1.1.3) within the decompressed data
    # DecompressedCurrent is simply len(decompressed_container).

    # bytearrays give integer indexing for both Python 2 str and Python 3
    # bytes, and cheap appends instead of repeated string concatenation:
    compressed = bytearray(compressed_container)
    compressed_length = len(compressed)
    if compressed_length == 0:
        raise ValueError('empty compressed container')

    decompressed_container = bytearray()  # result

    sig_byte = compressed[0]
    if sig_byte != 0x01:
        raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))
    compressed_current = 1

    #NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that
    # CompressedRecordEnd = len(compressed_container)
    while compressed_current < compressed_length:
        # 2.4.1.1.5
        compressed_chunk_start = compressed_current
        if compressed_chunk_start + 2 > compressed_length:
            raise ValueError('Truncated CompressedChunkHeader in VBA compressed stream')
        # chunk header = first 16 bits (little endian)
        compressed_chunk_header = (compressed[compressed_chunk_start]
                                   | (compressed[compressed_chunk_start + 1] << 8))
        # chunk size = 12 first bits of header + 3
        chunk_size = (compressed_chunk_header & 0x0FFF) + 3
        # chunk signature = 3 next bits - should always be 0b011
        chunk_signature = (compressed_chunk_header >> 12) & 0x07
        if chunk_signature != 0b011:
            raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream')
        # chunk flag = next bit - 1 == compressed, 0 == uncompressed
        chunk_flag = (compressed_chunk_header >> 15) & 0x01
        logging.debug("chunk size = {0}, compressed flag = {1}".format(chunk_size, chunk_flag))

        #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096)
        # The minimum size is 3 bytes
        # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value
        # in chunk header before adding 3).
        # Also the first test is not useful since a 12 bits value cannot be larger than 4095.
        if chunk_flag == 1 and chunk_size > 4098:
            raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1')
        if chunk_flag == 0 and chunk_size != 4098:
            raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0')

        # check if chunk_size goes beyond the compressed data, instead of silently cutting it:
        #TODO: raise an exception?
        if compressed_chunk_start + chunk_size > compressed_length:
            logging.warning('Chunk size is larger than remaining compressed data')
        compressed_end = min(compressed_length, compressed_chunk_start + chunk_size)
        # read after chunk header:
        compressed_current = compressed_chunk_start + 2

        if chunk_flag == 0:
            # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
            # uncompressed chunk: read the next 4096 bytes as-is
            # (fewer if the data is truncated: see the warning above)
            decompressed_container += compressed[compressed_current:compressed_current + 4096]
            compressed_current += 4096
            continue

        # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
        decompressed_chunk_start = len(decompressed_container)
        while compressed_current < compressed_end:
            # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence
            # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
            # copy tokens (reference to a previous literal token)
            flag_byte = compressed[compressed_current]
            compressed_current += 1
            for bit_index in range(8):
                if compressed_current >= compressed_end:
                    break
                # MS-OVBA 2.4.1.3.5 Decompressing a Token
                # MS-OVBA 2.4.1.3.17 Extract FlagBit
                flag_bit = (flag_byte >> bit_index) & 1
                if flag_bit == 0:  # LiteralToken
                    # copy one byte directly to output
                    decompressed_container.append(compressed[compressed_current])
                    compressed_current += 1
                    continue
                # CopyToken
                # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
                if compressed_current + 2 > compressed_end:
                    # a 16-bit token cannot be read across the end of the chunk
                    raise ValueError('Truncated CopyToken in VBA compressed stream')
                copy_token = (compressed[compressed_current]
                              | (compressed[compressed_current + 1] << 8))
                # maximum_length is returned by the helper but not needed here
                length_mask, offset_mask, bit_count, _ = copytoken_help(
                    len(decompressed_container), decompressed_chunk_start)
                length = (copy_token & length_mask) + 3
                offset = ((copy_token & offset_mask) >> (16 - bit_count)) + 1
                copy_source = len(decompressed_container) - offset
                if copy_source < 0:
                    # a negative index would silently wrap around and copy
                    # unrelated bytes from the end of the output
                    raise ValueError('Invalid CopyToken offset in VBA compressed stream')
                # copy byte per byte: source and destination may overlap
                # (the copy can read bytes that it has just written)
                for index in range(copy_source, copy_source + length):
                    decompressed_container.append(decompressed_container[index])
                compressed_current += 2
    return bytes(decompressed_container)
  249 +
  250 +
  251 +def extract_macros(ole):
  252 + """
  253 + Extract VBA macros from an OLE file
  254 + """
  255 + # Find the VBA project root (different in MS Word, Excel, etc):
  256 + vba_root = None
  257 + for stream in ('Macros', '_VBA_PROJECT_CUR'):
  258 + if ole.exists(stream):
  259 + logging.debug('found VBA root stream: %s' % stream)
  260 + vba_root = stream
  261 + break
  262 + if vba_root is None:
  263 + logging.debug('VBA root stream not found')
  264 + return None
  265 + # Find the PROJECT stream:
  266 + project = None
  267 + project_path = vba_root + '/PROJECT'
  268 + if ole.exists(project_path):
  269 + logging.debug('found PROJECT stream: %s' % project_path)
  270 + project = ole.openstream(project_path)
  271 + else:
  272 + logging.debug('missing PROJECT stream')
  273 + return None
  274 +
  275 + # sample content of the PROJECT stream:
  276 +
  277 + ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}"
  278 + ## Document=ThisDocument/&H00000000
  279 + ## Module=NewMacros
  280 + ## Name="Project"
  281 + ## HelpContextID="0"
  282 + ## VersionCompatible32="393222000"
  283 + ## CMG="F1F301E705E705E705E705"
  284 + ## DPB="8F8D7FE3831F2020202020"
  285 + ## GC="2D2FDD81E51EE61EE6E1"
  286 + ##
  287 + ## [Host Extender Info]
  288 + ## &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000
  289 + ## &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000
  290 + ##
  291 + ## [Workspace]
  292 + ## ThisDocument=22, 29, 339, 477, Z
  293 + ## NewMacros=-4, 42, 832, 510, C
  294 +
  295 + code_modules = {}
  296 +
  297 + for line in project:
  298 + line = line.strip()
  299 + if '=' in line:
  300 + # split line at the 1st equal sign:
  301 + name, value = line.split('=', 1)
  302 + # looking for code modules
  303 + # add the code module as a key in the dictionary
  304 + # the value will be the extension needed later
  305 + if name == 'Document':
  306 + # split value at the 1st slash, keep 1st part:
  307 + value = value.split('/', 1)[0]
  308 + code_modules[value] = CLASS_EXTENSION
  309 + elif name == 'Module':
  310 + code_modules[value] = MODULE_EXTENSION
  311 + elif name == 'Class':
  312 + code_modules[value] = CLASS_EXTENSION
  313 + elif name == 'BaseClass':
  314 + code_modules[value] = FORM_EXTENSION
  315 +
  316 + # Find the dir stream
  317 + dir_path = vba_root + '/VBA/dir'
  318 + if not ole.exists(dir_path):
  319 + logging.debug('missing dir stream')
  320 + return None
  321 + # read data from dir stream (compressed)
  322 + dir_compressed = ole.openstream(dir_path).read()
  323 +
  324 + def check_value(name, expected, value):
  325 + if expected != value:
  326 + logging.error("invalid value for {0} expected {1:04X} got {2:04X}".format(name, expected, value))
  327 +
  328 + dir_stream = cStringIO.StringIO(decompress_stream(dir_compressed))
  329 +
  330 + # PROJECTSYSKIND Record
  331 + PROJECTSYSKIND_Id = struct.unpack("<H", dir_stream.read(2))[0]
  332 + check_value('PROJECTSYSKIND_Id', 0x0001, PROJECTSYSKIND_Id)
  333 + PROJECTSYSKIND_Size = struct.unpack("<L", dir_stream.read(4))[0]
  334 + check_value('PROJECTSYSKIND_Size', 0x0004, PROJECTSYSKIND_Size)
  335 + PROJECTSYSKIND_SysKind = struct.unpack("<L", dir_stream.read(4))[0]
  336 + if PROJECTSYSKIND_SysKind == 0x00:
  337 + logging.debug("16-bit Windows")
  338 + elif PROJECTSYSKIND_SysKind == 0x01:
  339 + logging.debug("32-bit Windows")
  340 + elif PROJECTSYSKIND_SysKind == 0x02:
  341 + logging.debug("Macintosh")
  342 + elif PROJECTSYSKIND_SysKind == 0x03:
  343 + logging.debug("64-bit Windows")
  344 + else:
  345 + logging.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(PROJECTSYSKIND_SysKind))
  346 +
  347 + # PROJECTLCID Record
  348 + PROJECTLCID_Id = struct.unpack("<H", dir_stream.read(2))[0]
  349 + check_value('PROJECTLCID_Id', 0x0002, PROJECTLCID_Id)
  350 + PROJECTLCID_Size = struct.unpack("<L", dir_stream.read(4))[0]
  351 + check_value('PROJECTLCID_Size', 0x0004, PROJECTLCID_Size)
  352 + PROJECTLCID_Lcid = struct.unpack("<L", dir_stream.read(4))[0]
  353 + check_value('PROJECTLCID_Lcid', 0x409, PROJECTLCID_Lcid)
  354 +
  355 + # PROJECTLCIDINVOKE Record
  356 + PROJECTLCIDINVOKE_Id = struct.unpack("<H", dir_stream.read(2))[0]
  357 + check_value('PROJECTLCIDINVOKE_Id', 0x0014, PROJECTLCIDINVOKE_Id)
  358 + PROJECTLCIDINVOKE_Size = struct.unpack("<L", dir_stream.read(4))[0]
  359 + check_value('PROJECTLCIDINVOKE_Size', 0x0004, PROJECTLCIDINVOKE_Size)
  360 + PROJECTLCIDINVOKE_LcidInvoke = struct.unpack("<L", dir_stream.read(4))[0]
  361 + check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, PROJECTLCIDINVOKE_LcidInvoke)
  362 +
  363 + # PROJECTCODEPAGE Record
  364 + PROJECTCODEPAGE_Id = struct.unpack("<H", dir_stream.read(2))[0]
  365 + check_value('PROJECTCODEPAGE_Id', 0x0003, PROJECTCODEPAGE_Id)
  366 + PROJECTCODEPAGE_Size = struct.unpack("<L", dir_stream.read(4))[0]
  367 + check_value('PROJECTCODEPAGE_Size', 0x0002, PROJECTCODEPAGE_Size)
  368 + PROJECTCODEPAGE_CodePage = struct.unpack("<H", dir_stream.read(2))[0]
  369 +
  370 + # PROJECTNAME Record
  371 + PROJECTNAME_Id = struct.unpack("<H", dir_stream.read(2))[0]
  372 + check_value('PROJECTNAME_Id', 0x0004, PROJECTNAME_Id)
  373 + PROJECTNAME_SizeOfProjectName = struct.unpack("<L", dir_stream.read(4))[0]
  374 + if PROJECTNAME_SizeOfProjectName < 1 or PROJECTNAME_SizeOfProjectName > 128:
  375 + logging.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(PROJECTNAME_SizeOfProjectName))
  376 + PROJECTNAME_ProjectName = dir_stream.read(PROJECTNAME_SizeOfProjectName)
  377 +
  378 + # PROJECTDOCSTRING Record
  379 + PROJECTDOCSTRING_Id = struct.unpack("<H", dir_stream.read(2))[0]
  380 + check_value('PROJECTDOCSTRING_Id', 0x0005, PROJECTDOCSTRING_Id)
  381 + PROJECTDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0]
  382 + if PROJECTNAME_SizeOfProjectName > 2000:
  383 + logging.error("PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(PROJECTDOCSTRING_SizeOfDocString))
  384 + PROJECTDOCSTRING_DocString = dir_stream.read(PROJECTDOCSTRING_SizeOfDocString)
  385 + PROJECTDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  386 + check_value('PROJECTDOCSTRING_Reserved', 0x0040, PROJECTDOCSTRING_Reserved)
  387 + PROJECTDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  388 + if PROJECTDOCSTRING_SizeOfDocStringUnicode % 2 != 0:
  389 + logging.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even")
  390 + PROJECTDOCSTRING_DocStringUnicode = dir_stream.read(PROJECTDOCSTRING_SizeOfDocStringUnicode)
  391 +
  392 + # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7
  393 + PROJECTHELPFILEPATH_Id = struct.unpack("<H", dir_stream.read(2))[0]
  394 + check_value('PROJECTHELPFILEPATH_Id', 0x0006, PROJECTHELPFILEPATH_Id)
  395 + PROJECTHELPFILEPATH_SizeOfHelpFile1 = struct.unpack("<L", dir_stream.read(4))[0]
  396 + if PROJECTHELPFILEPATH_SizeOfHelpFile1 > 260:
  397 + logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(PROJECTHELPFILEPATH_SizeOfHelpFile1))
  398 + PROJECTHELPFILEPATH_HelpFile1 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile1)
  399 + PROJECTHELPFILEPATH_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  400 + check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, PROJECTHELPFILEPATH_Reserved)
  401 + PROJECTHELPFILEPATH_SizeOfHelpFile2 = struct.unpack("<L", dir_stream.read(4))[0]
  402 + if PROJECTHELPFILEPATH_SizeOfHelpFile2 != PROJECTHELPFILEPATH_SizeOfHelpFile1:
  403 + logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2")
  404 + PROJECTHELPFILEPATH_HelpFile2 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile2)
  405 + if PROJECTHELPFILEPATH_HelpFile2 != PROJECTHELPFILEPATH_HelpFile1:
  406 + logging.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2")
  407 +
  408 + # PROJECTHELPCONTEXT Record
  409 + PROJECTHELPCONTEXT_Id = struct.unpack("<H", dir_stream.read(2))[0]
  410 + check_value('PROJECTHELPCONTEXT_Id', 0x0007, PROJECTHELPCONTEXT_Id)
  411 + PROJECTHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0]
  412 + check_value('PROJECTHELPCONTEXT_Size', 0x0004, PROJECTHELPCONTEXT_Size)
  413 + PROJECTHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0]
  414 +
  415 + # PROJECTLIBFLAGS Record
  416 + PROJECTLIBFLAGS_Id = struct.unpack("<H", dir_stream.read(2))[0]
  417 + check_value('PROJECTLIBFLAGS_Id', 0x0008, PROJECTLIBFLAGS_Id)
  418 + PROJECTLIBFLAGS_Size = struct.unpack("<L", dir_stream.read(4))[0]
  419 + check_value('PROJECTLIBFLAGS_Size', 0x0004, PROJECTLIBFLAGS_Size)
  420 + PROJECTLIBFLAGS_ProjectLibFlags = struct.unpack("<L", dir_stream.read(4))[0]
  421 + check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, PROJECTLIBFLAGS_ProjectLibFlags)
  422 +
  423 + # PROJECTVERSION Record
  424 + PROJECTVERSION_Id = struct.unpack("<H", dir_stream.read(2))[0]
  425 + check_value('PROJECTVERSION_Id', 0x0009, PROJECTVERSION_Id)
  426 + PROJECTVERSION_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
  427 + check_value('PROJECTVERSION_Reserved', 0x0004, PROJECTVERSION_Reserved)
  428 + PROJECTVERSION_VersionMajor = struct.unpack("<L", dir_stream.read(4))[0]
  429 + PROJECTVERSION_VersionMinor = struct.unpack("<H", dir_stream.read(2))[0]
  430 +
  431 + # PROJECTCONSTANTS Record
  432 + PROJECTCONSTANTS_Id = struct.unpack("<H", dir_stream.read(2))[0]
  433 + check_value('PROJECTCONSTANTS_Id', 0x000C, PROJECTCONSTANTS_Id)
  434 + PROJECTCONSTANTS_SizeOfConstants = struct.unpack("<L", dir_stream.read(4))[0]
  435 + if PROJECTCONSTANTS_SizeOfConstants > 1015:
  436 + logging.error("PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(PROJECTCONSTANTS_SizeOfConstants))
  437 + PROJECTCONSTANTS_Constants = dir_stream.read(PROJECTCONSTANTS_SizeOfConstants)
  438 + PROJECTCONSTANTS_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  439 + check_value('PROJECTCONSTANTS_Reserved', 0x003C, PROJECTCONSTANTS_Reserved)
  440 + PROJECTCONSTANTS_SizeOfConstantsUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  441 + if PROJECTCONSTANTS_SizeOfConstantsUnicode % 2 != 0:
  442 + logging.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even")
  443 + PROJECTCONSTANTS_ConstantsUnicode = dir_stream.read(PROJECTCONSTANTS_SizeOfConstantsUnicode)
  444 +
  445 + # array of REFERENCE records
  446 + check = None
  447 + while True:
  448 + check = struct.unpack("<H", dir_stream.read(2))[0]
  449 + logging.debug("reference type = {0:04X}".format(check))
  450 + if check == 0x000F:
  451 + break
  452 +
  453 + if check == 0x0016:
  454 + # REFERENCENAME
  455 + REFERENCE_Id = check
  456 + REFERENCE_SizeOfName = struct.unpack("<L", dir_stream.read(4))[0]
  457 + REFERENCE_Name = dir_stream.read(REFERENCE_SizeOfName)
  458 + REFERENCE_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  459 + check_value('REFERENCE_Reserved', 0x003E, REFERENCE_Reserved)
  460 + REFERENCE_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  461 + REFERENCE_NameUnicode = dir_stream.read(REFERENCE_SizeOfNameUnicode)
  462 + continue
  463 +
  464 + if check == 0x0033:
  465 + # REFERENCEORIGINAL (followed by REFERENCECONTROL)
  466 + REFERENCEORIGINAL_Id = check
  467 + REFERENCEORIGINAL_SizeOfLibidOriginal = struct.unpack("<L", dir_stream.read(4))[0]
  468 + REFERENCEORIGINAL_LibidOriginal = dir_stream.read(REFERENCEORIGINAL_SizeOfLibidOriginal)
  469 + continue
  470 +
  471 + if check == 0x002F:
  472 + # REFERENCECONTROL
  473 + REFERENCECONTROL_Id = check
  474 + REFERENCECONTROL_SizeTwiddled = struct.unpack("<L", dir_stream.read(4))[0] # ignore
  475 + REFERENCECONTROL_SizeOfLibidTwiddled = struct.unpack("<L", dir_stream.read(4))[0]
  476 + REFERENCECONTROL_LibidTwiddled = dir_stream.read(REFERENCECONTROL_SizeOfLibidTwiddled)
  477 + REFERENCECONTROL_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0] # ignore
  478 + check_value('REFERENCECONTROL_Reserved1', 0x0000, REFERENCECONTROL_Reserved1)
  479 + REFERENCECONTROL_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0] # ignore
  480 + check_value('REFERENCECONTROL_Reserved2', 0x0000, REFERENCECONTROL_Reserved2)
  481 + # optional field
  482 + check2 = struct.unpack("<H", dir_stream.read(2))[0]
  483 + if check2 == 0x0016:
  484 + REFERENCECONTROL_NameRecordExtended_Id = check
  485 + REFERENCECONTROL_NameRecordExtended_SizeofName = struct.unpack("<L", dir_stream.read(4))[0]
  486 + REFERENCECONTROL_NameRecordExtended_Name = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeofName)
  487 + REFERENCECONTROL_NameRecordExtended_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  488 + check_value('REFERENCECONTROL_NameRecordExtended_Reserved', 0x003E, REFERENCECONTROL_NameRecordExtended_Reserved)
  489 + REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  490 + REFERENCECONTROL_NameRecordExtended_NameUnicode = dir_stream.read(REFERENCECONTROL_NameRecordExtended_SizeOfNameUnicode)
  491 + REFERENCECONTROL_Reserved3 = struct.unpack("<H", dir_stream.read(2))[0]
  492 + else:
  493 + REFERENCECONTROL_Reserved3 = check2
  494 +
  495 + check_value('REFERENCECONTROL_Reserved3', 0x0030, REFERENCECONTROL_Reserved3)
  496 + REFERENCECONTROL_SizeExtended = struct.unpack("<L", dir_stream.read(4))[0]
  497 + REFERENCECONTROL_SizeOfLibidExtended = struct.unpack("<L", dir_stream.read(4))[0]
  498 + REFERENCECONTROL_LibidExtended = dir_stream.read(REFERENCECONTROL_SizeOfLibidExtended)
  499 + REFERENCECONTROL_Reserved4 = struct.unpack("<L", dir_stream.read(4))[0]
  500 + REFERENCECONTROL_Reserved5 = struct.unpack("<H", dir_stream.read(2))[0]
  501 + REFERENCECONTROL_OriginalTypeLib = dir_stream.read(16)
  502 + REFERENCECONTROL_Cookie = struct.unpack("<L", dir_stream.read(4))[0]
  503 + continue
  504 +
  505 + if check == 0x000D:
  506 + # REFERENCEREGISTERED
  507 + REFERENCEREGISTERED_Id = check
  508 + REFERENCEREGISTERED_Size = struct.unpack("<L", dir_stream.read(4))[0]
  509 + REFERENCEREGISTERED_SizeOfLibid = struct.unpack("<L", dir_stream.read(4))[0]
  510 + REFERENCEREGISTERED_Libid = dir_stream.read(REFERENCEREGISTERED_SizeOfLibid)
  511 + REFERENCEREGISTERED_Reserved1 = struct.unpack("<L", dir_stream.read(4))[0]
  512 + check_value('REFERENCEREGISTERED_Reserved1', 0x0000, REFERENCEREGISTERED_Reserved1)
  513 + REFERENCEREGISTERED_Reserved2 = struct.unpack("<H", dir_stream.read(2))[0]
  514 + check_value('REFERENCEREGISTERED_Reserved2', 0x0000, REFERENCEREGISTERED_Reserved2)
  515 + continue
  516 +
  517 + if check == 0x000E:
  518 + # REFERENCEPROJECT
  519 + REFERENCEPROJECT_Id = check
  520 + REFERENCEPROJECT_Size = struct.unpack("<L", dir_stream.read(4))[0]
  521 + REFERENCEPROJECT_SizeOfLibidAbsolute = struct.unpack("<L", dir_stream.read(4))[0]
  522 + REFERENCEPROJECT_LibidAbsolute = dir_stream.read(REFERENCEPROJECT_SizeOfLibidAbsolute)
  523 + REFERENCEPROJECT_SizeOfLibidRelative = struct.unpack("<L", dir_stream.read(4))[0]
  524 + REFERENCEPROJECT_LibidRelative = dir_stream.read(REFERENCEPROJECT_SizeOfLibidRelative)
  525 + REFERENCEPROJECT_MajorVersion = struct.unpack("<L", dir_stream.read(4))[0]
  526 + REFERENCEPROJECT_MinorVersion = struct.unpack("<H", dir_stream.read(2))[0]
  527 + continue
  528 +
  529 + logging.error('invalid or unknown check Id {0:04X}'.format(check))
  530 + sys.exit(0)
  531 +
  532 + PROJECTMODULES_Id = check #struct.unpack("<H", dir_stream.read(2))[0]
  533 + check_value('PROJECTMODULES_Id', 0x000F, PROJECTMODULES_Id)
  534 + PROJECTMODULES_Size = struct.unpack("<L", dir_stream.read(4))[0]
  535 + check_value('PROJECTMODULES_Size', 0x0002, PROJECTMODULES_Size)
  536 + PROJECTMODULES_Count = struct.unpack("<H", dir_stream.read(2))[0]
  537 + PROJECTMODULES_ProjectCookieRecord_Id = struct.unpack("<H", dir_stream.read(2))[0]
  538 + check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, PROJECTMODULES_ProjectCookieRecord_Id)
  539 + PROJECTMODULES_ProjectCookieRecord_Size = struct.unpack("<L", dir_stream.read(4))[0]
  540 + check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, PROJECTMODULES_ProjectCookieRecord_Size)
  541 + PROJECTMODULES_ProjectCookieRecord_Cookie = struct.unpack("<H", dir_stream.read(2))[0]
  542 +
  543 + logging.debug("parsing {0} modules".format(PROJECTMODULES_Count))
  544 + for x in xrange(0, PROJECTMODULES_Count):
  545 + MODULENAME_Id = struct.unpack("<H", dir_stream.read(2))[0]
  546 + check_value('MODULENAME_Id', 0x0019, MODULENAME_Id)
  547 + MODULENAME_SizeOfModuleName = struct.unpack("<L", dir_stream.read(4))[0]
  548 + MODULENAME_ModuleName = dir_stream.read(MODULENAME_SizeOfModuleName)
  549 + # account for optional sections
  550 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  551 + if section_id == 0x0047:
  552 + MODULENAMEUNICODE_Id = section_id
  553 + MODULENAMEUNICODE_SizeOfModuleNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  554 + MODULENAMEUNICODE_ModuleNameUnicode = dir_stream.read(MODULENAMEUNICODE_SizeOfModuleNameUnicode)
  555 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  556 + if section_id == 0x001A:
  557 + MODULESTREAMNAME_id = section_id
  558 + MODULESTREAMNAME_SizeOfStreamName = struct.unpack("<L", dir_stream.read(4))[0]
  559 + MODULESTREAMNAME_StreamName = dir_stream.read(MODULESTREAMNAME_SizeOfStreamName)
  560 + MODULESTREAMNAME_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  561 + check_value('MODULESTREAMNAME_Reserved', 0x0032, MODULESTREAMNAME_Reserved)
  562 + MODULESTREAMNAME_SizeOfStreamNameUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  563 + MODULESTREAMNAME_StreamNameUnicode = dir_stream.read(MODULESTREAMNAME_SizeOfStreamNameUnicode)
  564 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  565 + if section_id == 0x001C:
  566 + MODULEDOCSTRING_Id = section_id
  567 + check_value('MODULEDOCSTRING_Id', 0x001C, MODULEDOCSTRING_Id)
  568 + MODULEDOCSTRING_SizeOfDocString = struct.unpack("<L", dir_stream.read(4))[0]
  569 + MODULEDOCSTRING_DocString = dir_stream.read(MODULEDOCSTRING_SizeOfDocString)
  570 + MODULEDOCSTRING_Reserved = struct.unpack("<H", dir_stream.read(2))[0]
  571 + check_value('MODULEDOCSTRING_Reserved', 0x0048, MODULEDOCSTRING_Reserved)
  572 + MODULEDOCSTRING_SizeOfDocStringUnicode = struct.unpack("<L", dir_stream.read(4))[0]
  573 + MODULEDOCSTRING_DocStringUnicode = dir_stream.read(MODULEDOCSTRING_SizeOfDocStringUnicode)
  574 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  575 + if section_id == 0x0031:
  576 + MODULEOFFSET_Id = section_id
  577 + check_value('MODULEOFFSET_Id', 0x0031, MODULEOFFSET_Id)
  578 + MODULEOFFSET_Size = struct.unpack("<L", dir_stream.read(4))[0]
  579 + check_value('MODULEOFFSET_Size', 0x0004, MODULEOFFSET_Size)
  580 + MODULEOFFSET_TextOffset = struct.unpack("<L", dir_stream.read(4))[0]
  581 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  582 + if section_id == 0x001E:
  583 + MODULEHELPCONTEXT_Id = section_id
  584 + check_value('MODULEHELPCONTEXT_Id', 0x001E, MODULEHELPCONTEXT_Id)
  585 + MODULEHELPCONTEXT_Size = struct.unpack("<L", dir_stream.read(4))[0]
  586 + check_value('MODULEHELPCONTEXT_Size', 0x0004, MODULEHELPCONTEXT_Size)
  587 + MODULEHELPCONTEXT_HelpContext = struct.unpack("<L", dir_stream.read(4))[0]
  588 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  589 + if section_id == 0x002C:
  590 + MODULECOOKIE_Id = section_id
  591 + check_value('MODULECOOKIE_Id', 0x002C, MODULECOOKIE_Id)
  592 + MODULECOOKIE_Size = struct.unpack("<L", dir_stream.read(4))[0]
  593 + check_value('MODULECOOKIE_Size', 0x0002, MODULECOOKIE_Size)
  594 + MODULECOOKIE_Cookie = struct.unpack("<H", dir_stream.read(2))[0]
  595 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  596 + if section_id == 0x0021 or section_id == 0x0022:
  597 + MODULETYPE_Id = section_id
  598 + MODULETYPE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
  599 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  600 + if section_id == 0x0025:
  601 + MODULEREADONLY_Id = section_id
  602 + check_value('MODULEREADONLY_Id', 0x0025, MODULEREADONLY_Id)
  603 + MODULEREADONLY_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
  604 + check_value('MODULEREADONLY_Reserved', 0x0000, MODULEREADONLY_Reserved)
  605 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  606 + if section_id == 0x0028:
  607 + MODULEPRIVATE_Id = section_id
  608 + check_value('MODULEPRIVATE_Id', 0x0028, MODULEPRIVATE_Id)
  609 + MODULEPRIVATE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
  610 + check_value('MODULEPRIVATE_Reserved', 0x0000, MODULEPRIVATE_Reserved)
  611 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  612 + if section_id == 0x002B: # TERMINATOR
  613 + MODULE_Reserved = struct.unpack("<L", dir_stream.read(4))[0]
  614 + check_value('MODULE_Reserved', 0x0000, MODULE_Reserved)
  615 + section_id = None
  616 + if section_id != None:
  617 + logging.warning('unknown or invalid module section id {0:04X}'.format(section_id))
  618 +
  619 + logging.debug("ModuleName = {0}".format(MODULENAME_ModuleName))
  620 + logging.debug("StreamName = {0}".format(MODULESTREAMNAME_StreamName))
  621 + logging.debug("TextOffset = {0}".format(MODULEOFFSET_TextOffset))
  622 +
  623 + code_path = vba_root + '/VBA/' + MODULESTREAMNAME_StreamName
  624 + #TODO: test if stream exists
  625 + code_data = ole.openstream(code_path).read()
  626 + logging.debug("length of code_data = {0}".format(len(code_data)))
  627 + logging.debug("offset of code_data = {0}".format(MODULEOFFSET_TextOffset))
  628 + code_data = code_data[MODULEOFFSET_TextOffset:]
  629 + if len(code_data) > 0:
  630 + code_data = decompress_stream(code_data)
  631 + filext = code_modules[MODULENAME_ModuleName]
  632 + filename = '{0}.{1}'.format(MODULENAME_ModuleName, filext)
  633 + #TODO: return list of strings or dict instead of printing
  634 + print '-'*79
  635 + print filename
  636 + print ''
  637 + print code_data
  638 + print ''
  639 + logging.debug('extracted file {0}'.format(filename))
  640 + else:
  641 + logging.warning("module stream {0} has code data length 0".format(MODULESTREAMNAME_StreamName))
  642 + return
  643 +
  644 +
  645 +#=== MAIN =====================================================================
  646 +
if __name__ == '__main__':
    # Command-line entry point (usage: olevba.py <file>).
    # Opens the file named by the first command-line argument as an OLE
    # container and hands it to extract_macros().

    if len(sys.argv) < 2:
        # No file argument: show the module docstring as usage help and
        # exit with a non-zero status.
        # print(...) with a single argument behaves the same on Python 2
        # and Python 3.
        print(__doc__)
        sys.exit(1)

    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING)

    ole = OleFileIO_PL.OleFileIO(sys.argv[1])
    try:
        extract_macros(ole)
    finally:
        # Always release the OLE file, even if extraction raises.
        ole.close()
... ...