Commit 730c5088b35eb235db5dd58481df785dd78c6b03

Authored by Christian Herdtweck
1 parent 3781f711

ppt_parser: create new alternative based on records

So far, ppt_parser is rather naive: it does not understand the structure
of the streams but just looks for a certain byte sequence anywhere in the
stream (search_* methods).

There was another attempt to understand and parse the stream structure
but that failed (parse_* methods).

Encouraged by xls_parser, which also parses the data as a series of
records, I tried the same with ppt files and it works nicely so far. It
might be able to replace ppt_parser soon.
Showing 1 changed file with 303 additions and 0 deletions
oletools/ppt_record_parser.py 0 → 100644
  1 +#!/usr/bin/env python
  2 +
  3 +"""
  4 +ppt_record_parser.py
  5 +
  6 +Alternative to ppt_parser.py that works on records
  7 +"""
  8 +
  9 +# === LICENSE =================================================================
  10 +#
  11 +# Redistribution and use in source and binary forms, with or without
  12 +# modification, are permitted provided that the following conditions are met:
  13 +#
  14 +# * Redistributions of source code must retain the above copyright notice,
  15 +# this list of conditions and the following disclaimer.
  16 +# * Redistributions in binary form must reproduce the above copyright notice,
  17 +# this list of conditions and the following disclaimer in the documentation
  18 +# and/or other materials provided with the distribution.
  19 +#
  20 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  21 +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22 +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23 +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  24 +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  25 +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  26 +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  27 +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28 +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  29 +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  30 +# POSSIBILITY OF SUCH DAMAGE.
  31 +
  32 +from __future__ import print_function
  33 +
  34 +#------------------------------------------------------------------------------
  35 +# CHANGELOG:
  36 +# 2017-11-30 v0.01 CH: - first version based on xls_parser
  37 +
  38 +#------------------------------------------------------------------------------
  39 +# TODO:
  40 +
  41 +# -----------------------------------------------------------------------------
  42 +# REFERENCES:
  43 +# - [MS-PPT]
  44 +
  45 +
  46 +import sys
  47 +from struct import unpack
  48 +import logging
  49 +import record_base
  50 +import io
  51 +
  52 +
class PptFile(record_base.OleRecordFile):
    """ Record-based view on a PowerPoint ppt file """

    @classmethod
    def stream_class_for_name(cls, stream_name):
        """ return the stream class to use for the stream of given name

        All streams are treated alike here, so `stream_name` is ignored and
        the result is always :py:class:`PptStream`.
        """
        return PptStream
  59 +
class PptStream(record_base.OleRecordStream):
    """ a stream of records in a ppt file """

    def read_record_head(self):
        """ read the 8-byte header that precedes every record

        The header is little-endian: a 16bit field combining version (lowest
        4 bits) and instance (remaining 12 bits), a 16bit record type and a
        32bit record size.

        returns (type, size, other) where other is (instance, version)
        """
        header = self.stream.read(8)
        ver_inst, rec_type, rec_size = unpack('<HHL', header)
        version = ver_inst & 0x0f
        instance = ver_inst >> 4
        return rec_type, rec_size, (instance, version)

    @classmethod
    def record_class_for_type(cls, rec_type):
        """ determine a class for given record type

        Types listed in RECORD_TYPES are classified by the suffix of their
        name: "...Container" records get the container class, everything else
        (including types not listed at all) is a plain record.

        returns (clz, force_read)
        """
        # the only record type with a dedicated class
        if rec_type == PptRecordCurrentUser.TYPE:
            return PptRecordCurrentUser, True

        record_name = RECORD_TYPES.get(rec_type)
        if record_name is None:
            # type we do not know (or care) about
            return PptRecord, False

        if record_name.endswith('Container'):
            return PptContainerRecord, True

        if not record_name.endswith(('Atom', 'Blob')):
            logging.warning('Unexpected name for record type "{0}". typo?'
                            .format(record_name))
        return PptRecord, False
  100 +
  101 +
class PptRecord(record_base.OleRecordBase):
    """ A Record within a ppt file; has instance and version fields

    Subclasses that represent exactly one record type can set `INSTANCE`
    and/or `VERSION` to the only value allowed for that type; a mismatch then
    raises a ValueError in :py:meth:`parse`. If left at `None`, unusual values
    only trigger a warning.
    """

    # fixed values for instance and version (usually ver is 0 or 0xf, inst 0/1)
    INSTANCE = None
    VERSION = None

    def parse(self, more_data):
        """ check and remember instance and version of this record

        :param more_data: tuple (instance, version) as returned in 3rd place
                          by :py:meth:`PptStream.read_record_head`
        :raises ValueError: if the class defines a fixed INSTANCE or VERSION
                            and the record has a different value
        """
        instance, version = more_data

        if self.INSTANCE is not None:
            # class demands one specific instance value
            if self.INSTANCE != instance:
                raise ValueError('invalid instance {0} for {1}'
                                 .format(instance, self))
        elif instance not in (0, 1):
            # generic record with unusual instance: only ok if record type is
            # known to use instances in this (inclusive) range
            try:
                min_val, max_val = INSTANCE_EXCEPTIONS[self.type]
                is_ok = (min_val <= instance <= max_val)
            except KeyError:
                is_ok = False
            if not is_ok:
                logging.warning('unexpected instance {0} for {1}'
                                .format(instance, self))
        self.instance = instance

        if self.VERSION is not None:
            # class demands one specific version value
            if self.VERSION != version:
                raise ValueError('invalid version {0} for {1}'
                                 .format(version, self))
        elif version not in (0x0, 0x1, 0xf):
            # generic record with unusual version: only ok if record type is
            # known to use exactly this version
            try:
                is_ok = version == VERSION_EXCEPTIONS[self.type]
            except KeyError:
                is_ok = False
            if not is_ok:
                logging.warning('unexpected version {0} for {1}'
                                .format(version, self))
        self.version = version

    def _type_str(self):
        """ helper for __str__, base implementation

        Uses the name from RECORD_TYPES if the type is listed there, otherwise
        the class name together with the type as hex number.
        """
        try:
            record_name = RECORD_TYPES[self.type]
            return '{0} record'.format(record_name)
        except KeyError:
            return '{0} type 0x{1:04x}'.format(self.__class__.__name__,
                                               self.type)
  145 +
  146 +
class PptContainerRecord(PptRecord):
    """ A record whose data is itself a sequence of records """

    def parse(self, more_data):
        """ parse instance/version, then parse self.data into self.records

        :param more_data: tuple (instance, version), handed on to base class
        """
        # base class sets self.version and self.instance
        super(PptContainerRecord, self).parse(more_data)

        logging.debug('parsing contents of container record {0}'.format(self))

        # wrap the payload into a file-like object, so it can be treated like
        # any other stream of records
        substream = PptStream(io.BytesIO(self.data), self.size,
                              'PptContainerRecordSubstream',
                              record_base.STGTY_SUBSTREAM)
        self.records = [record for record in substream.iter_records()]

        logging.debug('done parsing contents of container record {0}'
                      .format(self))
  164 +
  165 +
class PptRecordCurrentUser(PptRecord):
    """ The CurrentUserAtom record

    After parsing, the fields of the record are available as attributes:
    size2, header_token, offset_to_current_edit, len_user_name,
    doc_file_version, major_version, minor_version, ansi_user_name (bytes),
    release_version and unicode_user_name (text or None if omitted).
    """
    TYPE = 0x0ff6
    VERSION = 0
    INSTANCE = 0

    def parse(self, more_data):
        """ parse and validate the contents of the record

        :param more_data: tuple (instance, version), checked by base class
        :raises ValueError: if any field has an unexpected value or the record
                            is too short for the fields it should contain
        """
        super(PptRecordCurrentUser, self).parse(more_data)

        # need at least the 20 bytes of fixed fields + 4 bytes release version
        if self.size < 24:
            raise ValueError('CurrentUser record is too small ({0})'
                             .format(self.size))
        self.size2, self.header_token, self.offset_to_current_edit, \
            self.len_user_name, self.doc_file_version, self.major_version, \
            self.minor_version, _ = unpack('<IIIHHBBH', self.data[0:20])
        if self.size2 != 0x14:
            raise ValueError('Wrong size2 ({0}) in CurrentUser record'
                             .format(self.size2))
        elif self.header_token not in (0xE391C05F, 0xF3D1C4DF):
            # second value marks an encrypted doc, see is_document_encrypted
            raise ValueError('Wrong header_token ({0}) in CurrentUser record'
                             .format(self.header_token))
        elif self.doc_file_version != 0x03F4:
            raise ValueError('Wrong doc file version ({0}) in CurrentUser '
                             'record'.format(self.doc_file_version))
        elif self.major_version != 0x03:
            raise ValueError('Wrong major version ({0}) in CurrentUser record'
                             .format(self.major_version))
        elif self.minor_version != 0x00:
            raise ValueError('Wrong minor version ({0}) in CurrentUser record'
                             .format(self.minor_version))

        # user name as single-byte text follows right after the fixed fields
        self.ansi_user_name = self.data[20:20+self.len_user_name]
        if len(self.ansi_user_name) != self.len_user_name:
            raise ValueError('CurrentUser record is too small for user name '
                             '({0} != {1})'.format(len(self.ansi_user_name),
                                                   self.len_user_name))

        # next: 4 bytes release version; check length first, so a truncated
        # record gives a ValueError like all other problems (not struct.error)
        offset = 20 + self.len_user_name
        release_bytes = self.data[offset:offset+4]
        if len(release_bytes) != 4:
            raise ValueError('CurrentUser record is too small for release '
                             'version ({0} != 4)'.format(len(release_bytes)))
        self.release_version = unpack('<I', release_bytes)[0]
        if self.release_version not in (8, 9):
            raise ValueError('CurrentUser record has wrong release version {0}'
                             .format(self.release_version))

        # rest is either nothing or the user name again with 2 bytes per char
        offset += 4
        if self.size == offset:
            self.unicode_user_name = None    # may be omitted
        elif self.size == offset + 2*self.len_user_name:
            # be explicit about byte order: plain 'utf-16' would fall back to
            # the byte order of the machine running this code and would
            # interpret (and drop) a leading byte order mark
            self.unicode_user_name = self.data[offset:].decode('utf-16-le')
        else:
            raise ValueError('CurrentUser record has wrong size ({0} left)'
                             .format(self.size - offset))

    def is_document_encrypted(self):
        """ return True if header token has the value 0xF3D1C4DF """
        return self.header_token == 0xF3D1C4DF
  216 +
  217 +
# types of relevant records (there are many more than listed here)
# Maps record type to record name. PptStream.record_class_for_type relies on
# the name suffix ("Container", "Atom" or "Blob") to classify the record.
RECORD_TYPES = {
    # file structure types
    0x0ff5: 'UserEditAtom',
    0x0ff6: 'CurrentUserAtom',       # --> use PptRecordCurrentUser instead
    0x1772: 'PersistDirectoryAtom',
    0x2f14: 'CryptSession10Container',
    # document types
    0x03e8: 'DocumentContainer',
    0x0fc9: 'HandoutContainer',
    0x03f0: 'NotesContainer',
    0x03ff: 'VbaInfoContainer',
    0x03e9: 'DocumentAtom',
    0x03ea: 'EndDocumentAtom',
    # slide types
    0x03ee: 'SlideContainer',
    0x03f8: 'MainMasterContainer',
    # external object types
    0x0409: 'ExObjListContainer',
    0x1011: 'ExOleVbaActiveXAtom',   # [ExOleObj|VbaProject|ExControl]Stg[Unc|C]ompressedAtom
    0x1006: 'ExAviMovieContainer',
    0x100e: 'ExCDAudioContainer',
    0x0fee: 'ExControlContainer',
    0x0fd7: 'ExHyperlinkContainer',
    0x1007: 'ExMCIMovieContainer',
    0x100d: 'ExMIDIAudioContainer',
    0x0fcc: 'ExOleEmbedContainer',
    0x0fce: 'ExOleLinkContainer',
    0x100f: 'ExWAVAudioEmbeddedContainer',
    0x1010: 'ExWAVAudioLinkContainer',
    0x1004: 'ExMediaAtom',
    # other types
    0x0fc1: 'MetafileBlob',
    0x0fb8: 'FontEmbedDataBlob',
    0x07e7: 'SoundDataBlob',
    0x138b: 'BinaryTagDataBlob',
}
  255 +
# record types where version is not 0x0, 0x1 or 0xf
# Maps record type to the one other version value that is fine for that type.
VERSION_EXCEPTIONS = {
    0x0400: 2,   # rt_vbainfoatom
    0x03ef: 2,   # rt_slideatom
}
  261 +
# record types where instance is not 0x0 or 0x1
# Maps record type to (min, max), the inclusive range of instance values that
# are fine for that type.
INSTANCE_EXCEPTIONS = {
    0x0fba: (2, 0x14),   # rt_cstring
    0x0ff0: (2, 2),      # rt_slidelistwithtext
    0x0fd9: (3, 4),      # rt_headersfooters
    0x07e4: (5, 5),      # rt_soundcollection
    0x03fb: (7, 7),      # rt_guideatom
    0x07e9: (2, 2),      # rt_bookmarkseedatom
    0x07f0: (6, 6),      # rt_colorschemeatom
    0xf125: (0, 5),      # rt_timeconditioncontainer
    0xf13d: (0, 0xa),    # rt_timepropertylist
    0x0fc8: (2, 2),      # rt_kinsoku
    0x0fd2: (3, 3),      # rt_kinsokuatom
    0x0f9f: (0, 5),      # rt_textheaderatom
    0x0fb7: (0, 128),    # rt_fontentityatom
    0x0fa3: (0, 8),      # rt_textmasterstyleatom
    0x0fad: (0, 8),      # rt_textmasterstyle9atom
    0x0fb2: (0, 8),      # rt_textmasterstyle10atom
    0x07f9: (0, 0x80),   # rt_blipentity9atom
    0x0faf: (0, 5),      # rt_outlinetextpropsheader9atom
    0x0fb8: (0, 3),      # rt_fontembeddatablob
}
  284 +
  285 +
  286 +###############################################################################
  287 +# TESTING
  288 +###############################################################################
  289 +
  290 +
if __name__ == '__main__':
    def print_subrecords(record):
        """ log extra info on container and CurrentUser records """
        if isinstance(record, PptContainerRecord):
            # one line per record directly contained in this one
            for child in record.records:
                logging.info('  {0}'.format(child))
            return

        if isinstance(record, PptRecordCurrentUser):
            details = '  crypt: {0}, offset {1}, user {2}/{3}'.format(
                record.is_document_encrypted(),
                record.offset_to_current_edit,
                repr(record.ansi_user_name),
                repr(record.unicode_user_name))
            logging.info(details)

    exit_code = record_base.test(sys.argv[1:], PptFile,
                                 do_per_record=print_subrecords)
    sys.exit(exit_code)
... ...