Commit 730c5088b35eb235db5dd58481df785dd78c6b03
1 parent
3781f711
ppt_parser: create new alternative based on records
So far, ppt_parser is rather simplistic: it does not understand the structure of the streams but just looks for a certain byte sequence anywhere in the stream (search_* methods). There was another attempt to understand and parse the stream structure, but that failed (parse_* methods). Encouraged by xls_parser, which also parses the data as a series of records, I tried the same with ppt files, and it works nicely so far. This might be able to replace ppt_parser soon.
Showing
1 changed file
with
303 additions
and
0 deletions
oletools/ppt_record_parser.py
0 → 100644
| 1 | +#!/usr/bin/env python | |
| 2 | + | |
| 3 | +""" | |
| 4 | +ppt_record_parser.py | |
| 5 | + | |
| 6 | +Alternative to ppt_parser.py that works on records | |
| 7 | +""" | |
| 8 | + | |
| 9 | +# === LICENSE ================================================================= | |
| 10 | +# | |
| 11 | +# Redistribution and use in source and binary forms, with or without | |
| 12 | +# modification, are permitted provided that the following conditions are met: | |
| 13 | +# | |
| 14 | +# * Redistributions of source code must retain the above copyright notice, | |
| 15 | +# this list of conditions and the following disclaimer. | |
| 16 | +# * Redistributions in binary form must reproduce the above copyright notice, | |
| 17 | +# this list of conditions and the following disclaimer in the documentation | |
| 18 | +# and/or other materials provided with the distribution. | |
| 19 | +# | |
| 20 | +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
| 21 | +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 22 | +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 23 | +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE | |
| 24 | +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
| 25 | +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
| 26 | +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
| 27 | +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
| 28 | +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
| 29 | +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
| 30 | +# POSSIBILITY OF SUCH DAMAGE. | |
| 31 | + | |
| 32 | +from __future__ import print_function | |
| 33 | + | |
| 34 | +#------------------------------------------------------------------------------ | |
| 35 | +# CHANGELOG: | |
| 36 | +# 2017-11-30 v0.01 CH: - first version based on xls_parser | |
| 37 | + | |
| 38 | +#------------------------------------------------------------------------------ | |
| 39 | +# TODO: | |
| 40 | + | |
| 41 | +# ----------------------------------------------------------------------------- | |
| 42 | +# REFERENCES: | |
| 43 | +# - [MS-PPT] | |
| 44 | + | |
| 45 | + | |
| 46 | +import sys | |
| 47 | +from struct import unpack | |
| 48 | +import logging | |
| 49 | +import record_base | |
| 50 | +import io | |
| 51 | + | |
| 52 | + | |
class PptFile(record_base.OleRecordFile):
    """ Record-based view on a PowerPoint ppt file """

    @classmethod
    def stream_class_for_name(cls, stream_name):
        """ determine the class used to parse the stream of given name

        Every stream is handled by :py:class:`PptStream`, so `stream_name`
        is not evaluated here.

        returns the class PptStream (not an instance)
        """
        return PptStream
| 59 | + | |
class PptStream(record_base.OleRecordStream):
    """ a stream of records in a ppt file """

    def read_record_head(self):
        """ read the 8 header bytes of the next record from self.stream

        The header is unpacked as little-endian: a 16bit field holding the
        version (lowest 4 bits) and the instance (remaining 12 bits), the
        16bit record type and the 32bit record size.

        returns (type, size, other) where other is (instance, version)
        """
        header = self.stream.read(8)
        ver_inst, rec_type, rec_size = unpack('<HHL', header)
        version = ver_inst & 0x0f
        instance = ver_inst >> 4
        return rec_type, rec_size, (instance, version)

    @classmethod
    def record_class_for_type(cls, rec_type):
        """ determine a class for given record type

        The CurrentUserAtom has its own class; all other types are classified
        by the suffix of their name in RECORD_TYPES. Types that are not listed
        there are treated like atoms.

        returns (clz, force_read); force_read is True for the CurrentUser
        record and for containers, False otherwise
        """
        if rec_type == PptRecordCurrentUser.TYPE:
            return PptRecordCurrentUser, True

        record_name = RECORD_TYPES.get(rec_type)
        if record_name is None:
            # type is not in our list of relevant types
            return PptRecord, False

        if record_name.endswith('Container'):
            return PptContainerRecord, True

        if not record_name.endswith(('Atom', 'Blob')):
            # all names in RECORD_TYPES should end in Container/Atom/Blob
            logging.warning('Unexpected name for record type "{0}". typo?'
                            .format(record_name))
        return PptRecord, False
| 100 | + | |
| 101 | + | |
class PptRecord(record_base.OleRecordBase):
    """ A Record within a ppt file; has instance and version fields """

    # fixed values for instance and version (usually ver is 0 or 0xf, inst 0/1)
    # None means that no fixed value is required for this class
    INSTANCE = None
    VERSION = None

    def parse(self, more_data):
        """ check and remember instance and version of this record

        If the class defines a fixed INSTANCE / VERSION, the given value must
        match it exactly. Otherwise unusual values are only reported through
        a log warning, unless they are listed for this record type in
        INSTANCE_EXCEPTIONS / VERSION_EXCEPTIONS.

        :param more_data: pair (instance, version), as returned as third
                          element by PptStream.read_record_head
        :raises ValueError: if instance or version differ from a fixed value
                            required by the class

        Sets self.instance and self.version
        """
        instance, version = more_data

        if self.INSTANCE is not None:
            if self.INSTANCE != instance:
                raise ValueError('invalid instance {0} for {1}'
                                 .format(instance, self))
        elif instance not in (0, 1):
            # no fixed value required, so consult the table of known
            # exceptions (same logic as for the version below)
            try:
                min_val, max_val = INSTANCE_EXCEPTIONS[self.type]
            except KeyError:
                is_ok = False
            else:
                is_ok = (min_val <= instance <= max_val)
            if not is_ok:
                logging.warning('unexpected instance {0} for {1}'
                                .format(instance, self))
        self.instance = instance

        if self.VERSION is not None:
            if self.VERSION != version:
                raise ValueError('invalid version {0} for {1}'
                                 .format(version, self))
        elif version not in (0x0, 0x1, 0xf):
            try:
                is_ok = version == VERSION_EXCEPTIONS[self.type]
            except KeyError:
                is_ok = False
            if not is_ok:
                logging.warning('unexpected version {0} for {1}'
                                .format(version, self))
        self.version = version

    def _type_str(self):
        """ helper for __str__, base implementation

        returns the name from RECORD_TYPES if the type is listed there,
        otherwise the class name together with the type as hex number
        """
        try:
            record_name = RECORD_TYPES[self.type]
        except KeyError:
            return '{0} type 0x{1:04x}'.format(self.__class__.__name__,
                                               self.type)
        return '{0} record'.format(record_name)
| 145 | + | |
| 146 | + | |
class PptContainerRecord(PptRecord):
    """ A record whose data is itself a sequence of records """

    def parse(self, more_data):
        """ check instance/version, then parse self.data into sub-records

        The sub-records are saved as a list in self.records.
        """
        # let the base class set self.version and self.instance
        super(PptContainerRecord, self).parse(more_data)

        logging.debug('parsing contents of container record {0}'.format(self))

        # wrap our data into a stream of its own and parse it like any other
        substream = PptStream(io.BytesIO(self.data), self.size,
                              'PptContainerRecordSubstream',
                              record_base.STGTY_SUBSTREAM)
        sub_records = list(substream.iter_records())
        self.records = sub_records

        logging.debug('done parsing contents of container record {0}'
                      .format(self))
| 164 | + | |
| 165 | + | |
class PptRecordCurrentUser(PptRecord):
    """ The CurrentUserAtom record """
    TYPE = 0x0ff6
    VERSION = 0
    INSTANCE = 0

    def parse(self, more_data):
        """ parse and validate the fields of the CurrentUserAtom

        Layout as parsed here (all integers little-endian): 20 bytes of fixed
        fields, the ansi user name (len_user_name bytes), a 4-byte release
        version and optionally the user name again as 2*len_user_name bytes
        of unicode text.

        Sets attributes size2, header_token, offset_to_current_edit,
        len_user_name, doc_file_version, major_version, minor_version,
        ansi_user_name (bytes), release_version and unicode_user_name
        (text or None if omitted).

        :param more_data: pair (instance, version), checked by base class
        :raises ValueError: if the record is too small or a field has an
                            unexpected value
        """
        super(PptRecordCurrentUser, self).parse(more_data)
        if self.size < 24:
            raise ValueError('CurrentUser record is too small ({0})'
                             .format(self.size))

        # fixed-size part; the last 2 bytes are not evaluated
        (self.size2, self.header_token, self.offset_to_current_edit,
         self.len_user_name, self.doc_file_version, self.major_version,
         self.minor_version, _) = unpack('<IIIHHBBH', self.data[0:20])
        if self.size2 != 0x14:
            raise ValueError('Wrong size2 ({0}) in CurrentUser record'
                             .format(self.size2))
        elif self.header_token not in (0xE391C05F, 0xF3D1C4DF):
            raise ValueError('Wrong header_token ({0}) in CurrentUser record'
                             .format(self.header_token))
        elif self.doc_file_version != 0x03F4:
            raise ValueError('Wrong doc file version ({0}) in CurrentUser '
                             'record'.format(self.doc_file_version))
        elif self.major_version != 0x03:
            raise ValueError('Wrong major version ({0}) in CurrentUser record'
                             .format(self.major_version))
        elif self.minor_version != 0x00:
            raise ValueError('Wrong minor version ({0}) in CurrentUser record'
                             .format(self.minor_version))

        # variable-size part: ansi user name ...
        self.ansi_user_name = self.data[20:20+self.len_user_name]
        if len(self.ansi_user_name) != self.len_user_name:
            raise ValueError('CurrentUser record is too small for user name '
                             '({0} != {1})'.format(len(self.ansi_user_name),
                                                   self.len_user_name))

        # ... release version; check length first so that truncated data
        # gives a ValueError like all other problems, not a struct.error
        offset = 20 + self.len_user_name
        release_bytes = self.data[offset:offset+4]
        if len(release_bytes) != 4:
            raise ValueError('CurrentUser record is too small for release '
                             'version ({0} bytes left)'
                             .format(len(release_bytes)))
        self.release_version = unpack('<I', release_bytes)[0]
        if self.release_version not in (8, 9):
            raise ValueError('CurrentUser record has wrong release version {0}'
                             .format(self.release_version))
        offset += 4

        # ... and optional unicode user name
        if self.size == offset:
            self.unicode_user_name = None    # may be omitted
        elif self.size == offset + 2*self.len_user_name:
            # data has no byte order mark, so name the byte order explicitly;
            # plain 'utf-16' would fall back to the byte order of the platform
            self.unicode_user_name = self.data[offset:].decode('utf-16-le')
        else:
            raise ValueError('CurrentUser record has wrong size ({0} left)'
                             .format(self.size - offset))

    def is_document_encrypted(self):
        """ return True if header_token has the value used for encryption

        Only valid after parse() has set self.header_token.
        """
        return self.header_token == 0xF3D1C4DF
| 216 | + | |
| 217 | + | |
# types of relevant records (there are many more than listed here)
# Maps record type number to name; the name suffix (Container / Atom / Blob)
# is evaluated by PptStream.record_class_for_type
RECORD_TYPES = {
    # file structure types
    0x0ff5: 'UserEditAtom',
    0x0ff6: 'CurrentUserAtom',      # --> use PptRecordCurrentUser instead
    0x1772: 'PersistDirectoryAtom',
    0x2f14: 'CryptSession10Container',
    # document types
    0x03e8: 'DocumentContainer',
    0x0fc9: 'HandoutContainer',
    0x03f0: 'NotesContainer',
    0x03ff: 'VbaInfoContainer',
    0x03e9: 'DocumentAtom',
    0x03ea: 'EndDocumentAtom',
    # slide types
    0x03ee: 'SlideContainer',
    0x03f8: 'MainMasterContainer',
    # external object types
    0x0409: 'ExObjListContainer',
    0x1011: 'ExOleVbaActiveXAtom',  # ExOleObj|VbaProject|ExControl]Stg[Unc|C]ompressedAtom
    0x1006: 'ExAviMovieContainer',
    0x100e: 'ExCDAudioContainer',
    0x0fee: 'ExControlContainer',
    0x0fd7: 'ExHyperlinkContainer',
    0x1007: 'ExMCIMovieContainer',
    0x100d: 'ExMIDIAudioContainer',
    0x0fcc: 'ExOleEmbedContainer',
    0x0fce: 'ExOleLinkContainer',
    0x100f: 'ExWAVAudioEmbeddedContainer',
    0x1010: 'ExWAVAudioLinkContainer',
    0x1004: 'ExMediaAtom',
    # other types
    0x0fc1: 'MetafileBlob',
    0x0fb8: 'FontEmbedDataBlob',
    0x07e7: 'SoundDataBlob',
    0x138b: 'BinaryTagDataBlob',
}
| 255 | + | |
# record types where version is not 0x0, 0x1 or 0xf
# Maps record type number to the one version value accepted for that type
VERSION_EXCEPTIONS = {
    0x0400: 2,    # rt_vbainfoatom
    0x03ef: 2,    # rt_slideatom
}
| 261 | + | |
# record types where instance is not 0x0 or 0x1
# Maps record type number to (min, max), the inclusive range of instance
# values accepted for that type
INSTANCE_EXCEPTIONS = {
    0x0fba: (2, 0x14),    # rt_cstring
    0x0ff0: (2, 2),       # rt_slidelistwithtext
    0x0fd9: (3, 4),       # rt_headersfooters
    0x07e4: (5, 5),       # rt_soundcollection
    0x03fb: (7, 7),       # rt_guideatom
    0x07e9: (2, 2),       # rt_bookmarkseeatom
    0x07f0: (6, 6),       # rt_colorschemeatom
    0xf125: (0, 5),       # rt_timeconditioncontainer
    0xf13d: (0, 0xa),     # rt_timepropertylist
    0x0fc8: (2, 2),       # rt_kinsoku
    0x0fd2: (3, 3),       # rt_kinsokuatom
    0x0f9f: (0, 5),       # rt_textheaderatom
    0x0fb7: (0, 128),     # rt_fontentityatom
    0x0fa3: (0, 8),       # rt_textmasterstyleatom
    0x0fad: (0, 8),       # rt_textmasterstyle9atom
    0x0fb2: (0, 8),       # rt_textmasterstyle10atom
    0x07f9: (0, 0x80),    # rt_blipentity9atom
    0x0faf: (0, 5),       # rt_outlinetextpropsheader9atom
    0x0fb8: (0, 3),       # rt_fontembeddatablob
}
| 284 | + | |
| 285 | + | |
| 286 | +############################################################################### | |
| 287 | +# TESTING | |
| 288 | +############################################################################### | |
| 289 | + | |
| 290 | + | |
if __name__ == '__main__':
    def print_subrecords(record):
        """ log extra info on containers and on the CurrentUser record """
        if isinstance(record, PptContainerRecord):
            # list what the container holds
            for child in record.records:
                logging.info(' {0}'.format(child))
            return

        if isinstance(record, PptRecordCurrentUser):
            details = (record.is_document_encrypted(),
                       record.offset_to_current_edit,
                       repr(record.ansi_user_name),
                       repr(record.unicode_user_name))
            logging.info(' crypt: {0}, offset {1}, user {2}/{3}'
                         .format(*details))

    # run test function of record_base on the files given on command line
    exit_code = record_base.test(sys.argv[1:], PptFile,
                                 do_per_record=print_subrecords)
    sys.exit(exit_code)