Commit 63dafd09eec50aa4a50b2cbe5752c3a3c5cd0f0c

Authored by Christian Herdtweck
1 parent dd5ee6df

added base type PptType, parse all of persist dir (not just first), added DummyType and DocumentContainer

also ran through pylint

rather non-atomic, sorry
Showing 1 changed file with 605 additions and 42 deletions
oletools/ppt_parser.py
@@ -16,6 +16,8 @@ References: @@ -16,6 +16,8 @@ References:
16 # TODO 16 # TODO
17 #------------------------------------------------------------------------------ 17 #------------------------------------------------------------------------------
18 # TODO: 18 # TODO:
  19 +# - make CurrentUserAtom and UserEditAtom PptTypes; adjust parse
  20 +# - make stream optional in PptUnexpectedData
19 # - license 21 # - license
20 # - create a AtomBase class that defines check_value and parses RecordHead? 22 # - create a AtomBase class that defines check_value and parses RecordHead?
21 # 23 #
@@ -30,6 +32,7 @@ import sys @@ -30,6 +32,7 @@ import sys
30 import logging 32 import logging
31 import struct 33 import struct
32 import traceback 34 import traceback
  35 +import os
33 36
34 import thirdparty.olefile as olefile 37 import thirdparty.olefile as olefile
35 from olevba import get_logger 38 from olevba import get_logger
@@ -41,6 +44,8 @@ log = get_logger('ppt') @@ -41,6 +44,8 @@ log = get_logger('ppt')
41 44
42 #--- CONSTANTS ---------------------------------------------------------------- 45 #--- CONSTANTS ----------------------------------------------------------------
43 46
  47 +# name of main stream
  48 +MAIN_STREAM_NAME = 'PowerPoint Document'
44 49
45 # URL and message to report issues: 50 # URL and message to report issues:
46 URL_OLEVBA_ISSUES = 'https://bitbucket.org/decalage/oletools/issues' 51 URL_OLEVBA_ISSUES = 'https://bitbucket.org/decalage/oletools/issues'
@@ -66,7 +71,7 @@ def check_value(name, value, expected): @@ -66,7 +71,7 @@ def check_value(name, value, expected):
66 """ simplify verification of values in extract_from """ 71 """ simplify verification of values in extract_from """
67 if isinstance(expected, (list, tuple)): 72 if isinstance(expected, (list, tuple)):
68 if value not in expected: 73 if value not in expected:
69 - exp_str = '[' + ' OR '.join('{0:04X}'.format(val) 74 + exp_str = '[' + ' OR '.join('{0:04X}'.format(val)
70 for val in expected) + ']' 75 for val in expected) + ']'
71 raise PptUnexpectedData( 76 raise PptUnexpectedData(
72 'Current User', name, 77 'Current User', name,
@@ -78,7 +83,7 @@ def check_value(name, value, expected): @@ -78,7 +83,7 @@ def check_value(name, value, expected):
78 83
79 84
80 class RecordHeader(object): 85 class RecordHeader(object):
81 - """ a record header, often found in ppt files 86 + """ a record header, at start of many types found in ppt files
82 87
83 https://msdn.microsoft.com/en-us/library/dd926377%28v=office.12%29.aspx 88 https://msdn.microsoft.com/en-us/library/dd926377%28v=office.12%29.aspx
84 https://msdn.microsoft.com/en-us/library/dd948895%28v=office.12%29.aspx 89 https://msdn.microsoft.com/en-us/library/dd948895%28v=office.12%29.aspx
@@ -97,7 +102,7 @@ class RecordHeader(object): @@ -97,7 +102,7 @@ class RecordHeader(object):
97 obj = clz() 102 obj = clz()
98 # first half byte is version, next 3 half bytes are instance 103 # first half byte is version, next 3 half bytes are instance
99 version_instance, = struct.unpack('<H', stream.read(2)) 104 version_instance, = struct.unpack('<H', stream.read(2))
100 - obj.rec_ver, obj.rec_instance = divmod(version_instance, 16) 105 + obj.rec_instance, obj.rec_ver = divmod(version_instance, 2**4)
101 obj.rec_type, = struct.unpack('<H', stream.read(2)) 106 obj.rec_type, = struct.unpack('<H', stream.read(2))
102 obj.rec_len, = struct.unpack('<L', stream.read(4)) 107 obj.rec_len, = struct.unpack('<L', stream.read(4))
103 return obj 108 return obj
@@ -151,13 +156,14 @@ class CurrentUserAtom(object): @@ -151,13 +156,14 @@ class CurrentUserAtom(object):
151 stream = None 156 stream = None
152 try: 157 try:
153 # open stream 158 # open stream
  159 + log.debug('opening stream')
154 stream = ole.openstream('Current User') 160 stream = ole.openstream('Current User')
155 obj = clz() 161 obj = clz()
156 162
157 # parse record header 163 # parse record header
158 obj.rec_head = RecordHeader.extract_from(stream) 164 obj.rec_head = RecordHeader.extract_from(stream)
159 check_value('rec_version', obj.rec_head.rec_ver, 0) 165 check_value('rec_version', obj.rec_head.rec_ver, 0)
160 - check_value('rec_instance', obj.rec_head.rec_ver, 0) 166 + check_value('rec_instance', obj.rec_head.rec_instance, 0)
161 check_value('rec_type', obj.rec_head.rec_type, clz.RECORD_TYPE) 167 check_value('rec_type', obj.rec_head.rec_type, clz.RECORD_TYPE)
162 168
163 size, = struct.unpack('<L', stream.read(4)) 169 size, = struct.unpack('<L', stream.read(4))
@@ -165,13 +171,8 @@ class CurrentUserAtom(object): @@ -165,13 +171,8 @@ class CurrentUserAtom(object):
165 obj.header_token, = struct.unpack('<L', stream.read(4)) 171 obj.header_token, = struct.unpack('<L', stream.read(4))
166 check_value('headerToken', obj.header_token, 172 check_value('headerToken', obj.header_token,
167 [clz.HEADER_TOKEN_ENCRYPT, clz.HEADER_TOKEN_NOCRYPT]) 173 [clz.HEADER_TOKEN_ENCRYPT, clz.HEADER_TOKEN_NOCRYPT])
168 - log.debug('headerToken is encrypt: {}'  
169 - .format(obj.header_token == clz.HEADER_TOKEN_ENCRYPT))  
170 obj.offset_to_current_edit, = struct.unpack('<L', stream.read(4)) 174 obj.offset_to_current_edit, = struct.unpack('<L', stream.read(4))
171 - log.debug('offsetToCurrentEdit: {0} (0x{0:04X})'  
172 - .format(obj.offset_to_current_edit))  
173 obj.len_user_name, = struct.unpack('<H', stream.read(2)) 175 obj.len_user_name, = struct.unpack('<H', stream.read(2))
174 - log.debug('lenUserName: {}'.format(obj.len_user_name))  
175 if obj.len_user_name > 255: 176 if obj.len_user_name > 255:
176 raise PptUnexpectedData( 177 raise PptUnexpectedData(
177 'Current User', 'CurrentUserAtom.lenUserName', 178 'Current User', 'CurrentUserAtom.lenUserName',
@@ -185,24 +186,146 @@ class CurrentUserAtom(object): @@ -185,24 +186,146 @@ class CurrentUserAtom(object):
185 check_value('minorVersion', obj.minor_version, clz.MINOR_VERSION) 186 check_value('minorVersion', obj.minor_version, clz.MINOR_VERSION)
186 stream.read(2) # unused 187 stream.read(2) # unused
187 obj.ansi_user_name = stream.read(obj.len_user_name) 188 obj.ansi_user_name = stream.read(obj.len_user_name)
188 - log.debug('ansiUserName: {!r}'.format(obj.ansi_user_name))  
189 obj.rel_version, = struct.unpack('<L', stream.read(4)) 189 obj.rel_version, = struct.unpack('<L', stream.read(4))
190 - log.debug('relVersion: {0:04X}'.format(obj.rel_version))  
191 check_value('relVersion', obj.rel_version, 190 check_value('relVersion', obj.rel_version,
192 [clz.REL_VERSION_CAN_USE, clz.REL_VERSION_NO_USE]) 191 [clz.REL_VERSION_CAN_USE, clz.REL_VERSION_NO_USE])
193 obj.unicode_user_name = stream.read(2 * obj.len_user_name) 192 obj.unicode_user_name = stream.read(2 * obj.len_user_name)
194 - log.debug('unicodeUserName: {!r}'.format(obj.unicode_user_name))  
195 193
196 return obj 194 return obj
197 195
198 - except Exception:  
199 - raise  
200 finally: 196 finally:
201 if stream is not None: 197 if stream is not None:
202 log.debug('closing stream') 198 log.debug('closing stream')
203 stream.close() 199 stream.close()
204 200
205 -class UserEditAtom(object): 201 +
  202 +class PptType(object):
  203 + """ base class of data types found in ppt ole files
  204 +
  205 + starts with a RecordHeader, has an extract_from and a check_validity method
  206 + """
  207 +
  208 + RECORD_TYPE = None # must be specified in subclasses
  209 + RECORD_INSTANCE = 0x0 # can be overwritten in subclasses
  210 + RECORD_VERSION = 0x000 # can be overwritten in subclasses
  211 +
  212 + @classmethod
  213 + def extract_from(clz, stream):
  214 + raise NotImplementedError('abstract base function!')
  215 +
  216 + def __init__(self, stream_name=MAIN_STREAM_NAME):
  217 + self.stream = None
  218 + self.stream_name = stream_name
  219 + self.rec_head = None
  220 +
  221 + def read_rec_head(self, stream):
  222 + self.rec_head = RecordHeader.extract_from(stream)
  223 +
  224 + def set_stream(self, stream):
  225 + """ need to call before any read_... method """
  226 + self.stream = stream
  227 +
  228 + def unset_stream(self):
  229 + """ should call after any read_... method """
  230 + self.stream = None
  231 +
  232 + def read_1(self):
  233 + """ read 1 byte from stream """
  234 + return struct.unpack('<B', self.stream.read(1))[0]
  235 +
  236 + def read_2(self):
  237 + """ read 2 byte (short) from stream """
  238 + return struct.unpack('<H', self.stream.read(2))[0]
  239 +
  240 + def read_4(self):
  241 + """ read 4 byte (long) from stream """
  242 + return struct.unpack('<L', self.stream.read(4))[0]
  243 +
  244 + def check_validity(self):
  245 + """ to be overwritten in subclasses
  246 +
  247 + :returns: list of PptUnexpectedData
  248 + """
  249 + raise NotImplementedError('abstract base function!')
  250 +
  251 + def check_value(self, name, value, expected):
  252 + """ simplify verification of values: check value equals/is in expected
  253 +
  254 + :returns: list of PptUnexpectedData exceptions
  255 + """
  256 + if isinstance(expected, (list, tuple)):
  257 + if value not in expected:
  258 + clz_name = self.__class__.__name__
  259 + exp_str = '[' + ' OR '.join('{0:04X}'.format(val)
  260 + for val in expected) + ']'
  261 + return [PptUnexpectedData(
  262 + self.stream_name, clz_name + '.' + name,
  263 + '{0:04X}'.format(value), exp_str), ]
  264 + elif expected != value:
  265 + clz_name = self.__class__.__name__
  266 + return [PptUnexpectedData(
  267 + self.stream_name, clz_name + '.' + name,
  268 + '{0:04X}'.format(value), '{0:04X}'.format(expected)), ]
  269 + return []
  270 +
  271 + def check_range(self, name, value, expect_lower, expect_upper):
  272 + """ simplify verification of values: check value is in given range
  273 +
  274 + expect_lower or expect_upper can be given as None to check only one
  275 + boundary. If value equals one of the boundaries, that is also an error
  276 + (boundaries form an open interval)
  277 +
  278 + :returns: list of PptUnexpectedData exceptions
  279 + """
  280 +
  281 + is_err = False
  282 + if expect_upper is None and expect_lower is None:
  283 + raise ValueError('need at least one non-None boundary!')
  284 + if expect_lower is not None:
  285 + if value <= expect_lower:
  286 + is_err = True
  287 + if expect_upper is not None:
  288 + if value >= expect_upper:
  289 + is_err = True
  290 +
  291 + if is_err:
  292 + clz_name = self.__class__.__name__
  293 + if expect_lower is None:
  294 + expect_str = '< {0:04X}'.format(expect_upper)
  295 + elif expect_upper is None:
  296 + expect_str = '> {0:04X}'.format(expect_lower)
  297 + else:
  298 + expect_str = 'within ({0:04X}, {1:04X})'.format(expect_lower,
  299 + expect_upper)
  300 + return [PptUnexpectedData(self.stream_name, clz_name + '.' + name,
  301 + '{0:04X}'.format(value), expect_str), ]
  302 + else:
  303 + return []
  304 +
  305 + def check_rec_head(self, length=None):
  306 + """ to be called by check_validity to check the self.rec_head
  307 +
  308 + uses self.RECORD_... constants, (not quite that constant for DummyType)
  309 + """
  310 +
  311 + errs = []
  312 + errs.extend(self.check_value('rec_head.recVer', self.rec_head.rec_ver,
  313 + self.RECORD_VERSION))
  314 + errs.extend(self.check_value('rec_head.recInstance',
  315 + self.rec_head.rec_instance,
  316 + self.RECORD_INSTANCE))
  317 + if self.RECORD_TYPE is None:
  318 + raise NotImplementedError('RECORD_TYPE not specified!')
  319 + errs.extend(self.check_value('rec_head.recType',
  320 + self.rec_head.rec_type,
  321 + self.RECORD_TYPE))
  322 + if length is not None:
  323 + errs.extend(self.check_value('rec_head.recLen',
  324 + self.rec_head.rec_len, length))
  325 + return errs
  326 +
  327 +
  328 +class UserEditAtom(PptType):
206 """ An atom record that specifies information about a user edit 329 """ An atom record that specifies information about a user edit
207 330
208 https://msdn.microsoft.com/en-us/library/dd945746%28v=office.12%29.aspx 331 https://msdn.microsoft.com/en-us/library/dd945746%28v=office.12%29.aspx
@@ -213,6 +336,7 @@ class UserEditAtom(object): @@ -213,6 +336,7 @@ class UserEditAtom(object):
213 MAJOR_VERSION = 0x03 336 MAJOR_VERSION = 0x03
214 337
215 def __init__(self): 338 def __init__(self):
  339 + super(UserEditAtom, self).__init__()
216 self.rec_head = None 340 self.rec_head = None
217 self.last_slide_id_ref = None 341 self.last_slide_id_ref = None
218 self.version = None 342 self.version = None
@@ -235,35 +359,383 @@ class UserEditAtom(object): @@ -235,35 +359,383 @@ class UserEditAtom(object):
235 359
236 # parse record header 360 # parse record header
237 obj.rec_head = RecordHeader.extract_from(stream) 361 obj.rec_head = RecordHeader.extract_from(stream)
238 - check_value('rec_version', obj.rec_head.rec_ver, 0)  
239 - check_value('rec_instance', obj.rec_head.rec_ver, 0)  
240 - check_value('rec_type', obj.rec_head.rec_type, clz.RECORD_TYPE)  
241 362
242 obj.last_slide_id_ref, = struct.unpack('<L', stream.read(4)) 363 obj.last_slide_id_ref, = struct.unpack('<L', stream.read(4))
243 obj.version, = struct.unpack('<H', stream.read(2)) 364 obj.version, = struct.unpack('<H', stream.read(2))
244 obj.minor_version, = struct.unpack('<B', stream.read(1)) 365 obj.minor_version, = struct.unpack('<B', stream.read(1))
245 - check_value('minorVersion', obj.minor_version, clz.MINOR_VERSION)  
246 obj.major_version, = struct.unpack('<B', stream.read(1)) 366 obj.major_version, = struct.unpack('<B', stream.read(1))
247 - check_value('majorVersion', obj.major_version, clz.MAJOR_VERSION)  
248 obj.offset_last_edit, = struct.unpack('<L', stream.read(4)) 367 obj.offset_last_edit, = struct.unpack('<L', stream.read(4))
249 - log.debug('offsetLastEdit: {0} (0x{0:04X})'.format(obj.offset_last_edit))  
250 - # todo: check that this is before start pos / prev pos; 0x000 is end  
251 obj.offset_persist_directory, = struct.unpack('<L', stream.read(4)) 368 obj.offset_persist_directory, = struct.unpack('<L', stream.read(4))
252 - log.debug('offsetPersistDir: {0} (0x{0:04X})'  
253 - .format(obj.offset_persist_directory))  
254 obj.doc_persist_id_ref, = struct.unpack('<L', stream.read(4)) 369 obj.doc_persist_id_ref, = struct.unpack('<L', stream.read(4))
255 - check_value('docPersistIdRef', obj.doc_persist_id_ref, 1)  
256 obj.persist_id_seed, = struct.unpack('<L', stream.read(4)) 370 obj.persist_id_seed, = struct.unpack('<L', stream.read(4))
  371 + # (can only check once have the PersistDirectoryAtom)
257 obj.last_view, = struct.unpack('<H', stream.read(2)) 372 obj.last_view, = struct.unpack('<H', stream.read(2))
258 stream.read(2) # unused 373 stream.read(2) # unused
259 if is_encrypted: 374 if is_encrypted:
260 - obj.encrypt_session_persist_id_ref, = struct.unpack('<L',  
261 - stream.read(4))  
262 - else: 375 + obj.encrypt_session_persist_id_ref, = \
  376 + struct.unpack('<L', stream.read(4))
  377 + else: # this entry may be there or may not
263 obj.encrypt_session_persist_id_ref = None 378 obj.encrypt_session_persist_id_ref = None
264 379
265 return obj 380 return obj
266 381
  382 + def check_validity(self, offset=None):
  383 + errs = self.check_rec_head()
  384 + errs.extend(self.check_value('minorVersion', self.minor_version,
  385 + self.MINOR_VERSION))
  386 + errs.extend(self.check_value('majorVersion', self.major_version,
  387 + self.MAJOR_VERSION))
  388 + if offset is not None:
  389 + if self.offset_last_edit >= offset:
  390 + errs.append(PptUnexpectedData(
  391 + 'PowerPoint Document', 'UserEditAtom.offsetLastEdit',
  392 + self.offset_last_edit, '< {}'.format(offset)))
  393 + if self.offset_persist_directory >= offset or \
  394 + self.offset_persist_directory <= self.offset_last_edit:
  395 + errs.append(PptUnexpectedData(
  396 + 'PowerPoint Document',
  397 + 'UserEditAtom.offsetPersistDirectory',
  398 + self.offset_last_edit,
  399 + 'in ({}, {})'.format(self.offset_last_edit, offset)))
  400 + errs.extend(self.check_value('docPersistIdRef',
  401 + self.doc_persist_id_ref, 1))
  402 + return errs
  403 +
  404 + # TODO: offer to check persist_id_seed given PersistDirectoryAtom)
  405 +
  406 +
  407 +class DummyType(PptType):
  408 + """ a type that is found in ppt documents we are not interested in
  409 +
  410 + instead of parsing many uninteresting types, we just read their
  411 + RecordHeader and set the RECORD_... values on an instance- (instead of
  412 + class-) level
  413 +
  414 + used to skip over uninteresting types in e.g. DocumentContainer
  415 + """
  416 +
  417 + def __init__(self, type_name, record_type, rec_ver=0, rec_instance=0,
  418 + rec_len=None):
  419 + super(DummyType, self).__init__()
  420 + self.type_name = type_name
  421 + self.RECORD_TYPE = record_type
  422 + self.RECORD_VERSION = rec_ver
  423 + self.RECORD_INSTANCE = rec_instance
  424 + self.record_length = rec_len
  425 +
  426 + def extract_from(self, stream):
  427 + """ extract record header and just skip as many bytes as header says
  428 +
  429 + Since this requires RECORD_... values set in constructor, this is NOT
  430 + a classmethod like all the other extract_from!
  431 +
  432 + Otherwise this tries to be compatible with other extract_from methods
  433 + (e.g. returns self)
  434 + """
  435 + self.read_rec_head(stream)
  436 + log.debug('skipping over {} Byte for type {}'
  437 + .format(self.rec_head.rec_len, self.type_name))
  438 + log.debug('start at pos {}'.format(stream.tell()))
  439 + stream.seek(self.rec_head.rec_len, os.SEEK_CUR)
  440 + log.debug('now at pos {}'.format(stream.tell()))
  441 + return self
  442 +
  443 + def check_validity(self):
  444 + return self.check_rec_head(self.record_length)
  445 +
  446 +
  447 +class PersistDirectoryAtom(PptType):
  448 + """ one part of a persist object directory with unique persist object id
  449 +
  450 + contains PersistDirectoryEntry objects
  451 +
  452 + https://msdn.microsoft.com/en-us/library/dd952680%28v=office.12%29.aspx
  453 + """
  454 +
  455 + RECORD_TYPE = 0x1772
  456 +
  457 + def __init__(self):
  458 + super(PersistDirectoryAtom, self).__init__()
  459 + self.rg_persist_dir_entry = None # actually, this will be an array
  460 + self.stream_offset = None
  461 +
  462 + @classmethod
  463 + def extract_from(clz, stream):
  464 + """ create and return object with data from given stream """
  465 +
  466 + log.debug("Extracting a PersistDirectoryAtom from stream")
  467 + obj = clz()
  468 +
  469 + # remember own offset for checking validity
  470 + obj.stream_offset = stream.tell()
  471 +
  472 + # parse record header
  473 + obj.read_rec_head(stream)
  474 +
  475 + # read directory entries from list until reach size for this object
  476 + curr_pos = stream.tell()
  477 + stop_pos = curr_pos + obj.rec_head.rec_len
  478 + log.debug('start reading at pos {}, read until {}'
  479 + .format(curr_pos, stop_pos))
  480 + obj.rg_persist_dir_entry = []
  481 +
  482 + while curr_pos < stop_pos:
  483 + new_entry = PersistDirectoryEntry.extract_from(stream)
  484 + obj.rg_persist_dir_entry.append(new_entry)
  485 + curr_pos = stream.tell()
  486 + log.debug('at pos {}'.format(curr_pos))
  487 + return obj
  488 +
  489 + def check_validity(self, user_edit_last_offset=None):
  490 + errs = self.check_rec_head()
  491 + for entry in self.rg_persist_dir_entry:
  492 + errs.extend(entry.check_validity(user_edit_last_offset,
  493 + self.stream_offset))
  494 + return errs
  495 +
  496 +
  497 +class PersistDirectoryEntry(object):
  498 + """ an entry contained in a PersistDirectoryAtom.rg_persist_dir_entry
  499 +
  500 + A structure that specifies a compressed table of sequential persist object
  501 + identifiers and stream offsets to associated persist objects.
  502 +
  503 + NOT a subclass of PptType because has no RecordHeader
  504 +
  505 + https://msdn.microsoft.com/en-us/library/dd947347%28v=office.12%29.aspx
  506 + """
  507 +
  508 + def __init__(self):
  509 + self.persist_id = None
  510 + self.c_persist = None
  511 + self.rg_persist_offset = None
  512 +
  513 + @classmethod
  514 + def extract_from(clz, stream):
  515 + # take a 4-byte (=32bit) number, divide into 20bit and 12 bit)
  516 + log.debug("Extracting a PersistDirectoryEntry from stream")
  517 + obj = clz()
  518 +
  519 + # persistId (20 bits): An unsigned integer that specifies a starting
  520 + # persist object identifier. It MUST be less than or equal to 0xFFFFE.
  521 + # The first entry in rgPersistOffset is associated with persistId. The
  522 + # next entry, if present, is associated with persistId plus 1. Each
  523 + # entry in rgPersistOffset is associated with a persist object
  524 + # identifier in this manner, with the final entry associated with
  525 + # persistId + cPersist - 1.
  526 +
  527 + # cPersist (12 bits): An unsigned integer that specifies the count of
  528 + # items in rgPersistOffset. It MUST be greater than or equal to 0x001.
  529 + temp, = struct.unpack('<L', stream.read(4))
  530 + obj.c_persist, obj.persist_id = divmod(temp, 2**20)
  531 + log.debug('temp is 0x{0:04X} --> id is {1}, reading {2} offsets'
  532 + .format(temp, obj.persist_id, obj.c_persist))
  533 +
  534 + # rgPersistOffset (variable): An array of PersistOffsetEntry (section
  535 + # 2.3.6) that specifies stream offsets to persist objects. The count of
  536 + # items in the array is specified by cPersist. The value of each item
  537 + # MUST be greater than or equal to offsetLastEdit in the corresponding
  538 + # user edit and MUST be less than the offset, in bytes, of the
  539 + # corresponding persist object directory.
  540 + # PersistOffsetEntry: An unsigned 4-byte integer that specifies an
  541 + # offset, in bytes, from the beginning of the PowerPoint Document
  542 + # Stream (section 2.1.2) to a persist object.
  543 + obj.rg_persist_offset = [struct.unpack('<L', stream.read(4))[0] \
  544 + for _ in range(obj.c_persist)]
  545 + log.debug('offsets are: {}'.format(obj.rg_persist_offset))
  546 + return obj
  547 +
  548 + def check_validity(self, user_edit_last_offset=None,
  549 + persist_obj_dir_offset=None):
  550 + errs = []
  551 + if self.persist_id > 0xFFFFE: # (--> == 0xFFFFF since 20bit)
  552 + errs.append(PptUnexpectedData(
  553 + MAIN_STREAM_NAME, 'PersistDirectoryEntry.persist_id',
  554 + self.persist_id, '< 0xFFFFE (dec: {})'.format(0xFFFFE)))
  555 + if self.c_persist == 0:
  556 + errs.append(PptUnexpectedData(
  557 + MAIN_STREAM_NAME, 'PersistDirectoryEntry.c_persist',
  558 + self.c_persist, '> 0'))
  559 + if user_edit_last_offset is not None \
  560 + and min(self.rg_persist_offset) < user_edit_last_offset:
  561 + errs.append(PptUnexpectedData(
  562 + MAIN_STREAM_NAME, 'PersistDirectoryEntry.rg_persist_offset',
  563 + min(self.rg_persist_offset),
  564 + '> UserEdit.offsetLastEdit ({})'
  565 + .format(user_edit_last_offset)))
  566 + if persist_obj_dir_offset is not None \
  567 + and max(self.rg_persist_offset) > persist_obj_dir_offset:
  568 + errs.append(PptUnexpectedData(
  569 + MAIN_STREAM_NAME, 'PersistDirectoryEntry.rg_persist_offset',
  570 + max(self.rg_persist_offset),
  571 + '> PersistObjectDirectory offset ({})'
  572 + .format(persist_obj_dir_offset)))
  573 + return errs
  574 +
  575 +
  576 +class DocInfoListContainer(PptType):
  577 + """ information about the document and document display settings
  578 +
  579 + https://msdn.microsoft.com/en-us/library/dd926767%28v=office.12%29.aspx
  580 + """
  581 +
  582 + RECORD_VERSION = 0xF
  583 + RECORD_TYPE = 0x07D0
  584 +
  585 + def __init__(self):
  586 + super(DocInfoListContainer, self).__init__()
  587 +
  588 +
  589 +class DocumentContainer(PptType):
  590 + """ a DocumentContainer record
  591 +
  592 + https://msdn.microsoft.com/en-us/library/dd947357%28v=office.12%29.aspx
  593 + """
  594 +
  595 + RECORD_TYPE = 0x03E8
  596 +
  597 + def __init__(self):
  598 + super(DocumentContainer, self).__init__()
  599 + self.document_atom = None
  600 + self.ex_obj_list = None
  601 + self.document_text_info = None
  602 + self.sound_collection = None
  603 + self.drawing_group = None
  604 + self.master_list = None
  605 + self.doc_info_list = None
  606 + self.slide_hf = None
  607 + self.notes_hf = None
  608 + self.slide_list = None
  609 + self.notes_list = None
  610 + self.slide_show_doc_info = None
  611 + self.named_shows = None
  612 + self.summary = None
  613 + self.doc_routing_slip = None
  614 + self.print_options = None
  615 + self.rt_custom_table_styles_1 = None
  616 + self.end_document = None
  617 + self.rt_custom_table_styles_2 = None
  618 +
  619 + @classmethod
  620 + def extract_from(clz, stream):
  621 + """ create object with values from given stream
  622 +
  623 + stream is assumed to be positioned correctly
  624 +
  625 + this container contains lots of data we are not interested in.
  626 + """
  627 + obj = clz()
  628 +
  629 + # parse record header
  630 + obj.read_rec_head(stream)
  631 +
  632 + # documentAtom (48 bytes): A DocumentAtom record (section 2.4.2) that
  633 + # specifies size information for presentation slides and notes slides.
  634 + obj.document_atom = DummyType('DocumentAtom', 0x03E9, rec_ver=0x1,
  635 + rec_len=0x28).extract_from(stream)
  636 +
  637 + # exObjList (variable): An optional ExObjListContainer record (section
  638 + # 2.10.1) that specifies the list of external objects in the document.
  639 + obj.ex_obj_list = DummyType('ExObjListContainer', 0x0409, rec_ver=0xF)\
  640 + .extract_from(stream)
  641 +
  642 + # documentTextInfo (variable): A DocumentTextInfoContainer record
  643 + # (section 2.9.1) that specifies the default text styles for the
  644 + # document.
  645 + obj.document_text_info = DummyType('DocumentTextInfoContainer', 0x03F2,
  646 + rec_ver=0xF).extract_from(stream)
  647 +
  648 + # soundCollection (variable): An optional SoundCollectionContainer
  649 + # record (section 2.4.16.1) that specifies the list of sounds in the
  650 + # file.
  651 + obj.sound_collection = DummyType('SoundCollectionContainer', 0x07E4,
  652 + rec_ver=0xF, rec_instance=0x005)\
  653 + .extract_from(stream)
  654 +
  655 + # drawingGroup (variable): A DrawingGroupContainer record (section
  656 + # 2.4.3) that specifies drawing information for the document.
  657 + obj.drawing_group = DummyType('DrawingGroupContainer', 0x040B,
  658 + rec_ver=0xF).extract_from(stream)
  659 +
  660 + # masterList (variable): A MasterListWithTextContainer record (section
  661 + # 2.4.14.1) that specifies the list of main master slides and title
  662 + # master slides.
  663 + obj.master_list = DummyType('MasterListWithContainer', 0x0FF0,
  664 + rec_ver=0xF).extract_from(stream)
  665 +
  666 + # docInfoList (variable): An optional DocInfoListContainer record
  667 + # (section 2.4.4) that specifies additional document information.
  668 + # this is the variable we are interested in!
  669 + obj.doc_info_list = DocInfoListContainer.extract_from(stream)
  670 +
  671 + # slideHF (variable): An optional SlideHeadersFootersContainer record
  672 + # (section 2.4.15.1) that specifies the default header and footer
  673 + # information for presentation slides.
  674 + obj.slide_hf = None
  675 +
  676 + # notesHF (variable): An optional NotesHeadersFootersContainer record
  677 + # (section 2.4.15.6) that specifies the default header and footer
  678 + # information for notes slides.
  679 + obj.notes_hf = None
  680 +
  681 + # slideList (variable): An optional SlideListWithTextContainer record
  682 + # (section 2.4.14.3) that specifies the list of presentation slides.
  683 + obj.slide_list = None
  684 +
  685 + # notesList (variable): An optional NotesListWithTextContainer record
  686 + # (section 2.4.14.6) that specifies the list of notes slides.
  687 + obj.notes_list = None
  688 +
  689 + # slideShowDocInfoAtom (88 bytes): An optional SlideShowDocInfoAtom
  690 + # record (section 2.6.1) that specifies slide show information for the
  691 + # document.
  692 + obj.slide_show_doc_info = None
  693 +
  694 + # namedShows (variable): An optional NamedShowsContainer record
  695 + # (section 2.6.2) that specifies named shows in the document.
  696 + obj.named_shows = None
  697 +
  698 + # summary (variable): An optional SummaryContainer record (section
  699 + # 2.4.22.3) that specifies bookmarks for the document.
  700 + obj.summary = None
  701 +
  702 + # docRoutingSlipAtom (variable): An optional DocRoutingSlipAtom record
  703 + # (section 2.11.1) that specifies document routing information.
  704 + obj.doc_routing_slip = None
  705 +
  706 + # printOptionsAtom (13 bytes): An optional PrintOptionsAtom record
  707 + # (section 2.4.12) that specifies default print options.
  708 + obj.print_options = None
  709 +
  710 + # rtCustomTableStylesAtom1 (variable): An optional
  711 + # RoundTripCustomTableStyles12Atom record (section 2.11.13) that
  712 + # specifies round-trip information for custom table styles.
  713 + obj.rt_custom_table_styles_1 = None
  714 +
  715 + # endDocumentAtom (8 bytes): An EndDocumentAtom record (section 2.4.13)
  716 + # that specifies the end of the information for the document.
  717 + obj.end_document = None
  718 +
  719 + # rtCustomTableStylesAtom2 (variable): An optional
  720 + # RoundTripCustomTableStyles12Atom record that specifies round-trip
  721 + # information for custom table styles. It MUST NOT exist if
  722 + # rtCustomTableStylesAtom1 exists.
  723 + obj.rt_custom_table_styles_2 = None
  724 +
  725 + return obj
  726 +
  727 +
  728 + def check_validity(self):
  729 + """ check all values in object for valid values """
  730 + errs = self.check_rec_head()
  731 + errs.extend(self.document_atom.check_validity())
  732 + errs.extend(self.ex_obj_list.check_validity())
  733 + errs.extend(self.document_text_info.check_validity())
  734 + errs.extend(self.sound_collection.check_validity())
  735 + errs.extend(self.drawing_group.check_validity())
  736 + errs.extend(self.master_list.check_validity())
  737 + errs.extend(self.doc_info_list.check_validity())
  738 + return errs
267 739
268 # === PptParser =============================================================== 740 # === PptParser ===============================================================
269 741
@@ -276,7 +748,7 @@ class PptParser(object): @@ -276,7 +748,7 @@ class PptParser(object):
276 748
277 def __init__(self, ole, fast_fail=False): 749 def __init__(self, ole, fast_fail=False):
278 """ constructor 750 """ constructor
279 - 751 +
280 :param ole: OleFileIO or anything that OleFileIO constructor accepts 752 :param ole: OleFileIO or anything that OleFileIO constructor accepts
281 :param bool fast_fail: if True, all unexpected data will raise a 753 :param bool fast_fail: if True, all unexpected data will raise a
282 PptUnexpectedData; if False will only log error 754 PptUnexpectedData; if False will only log error
@@ -290,6 +762,8 @@ class PptParser(object): @@ -290,6 +762,8 @@ class PptParser(object):
290 self.fast_fail = fast_fail 762 self.fast_fail = fast_fail
291 763
292 self.current_user_atom = None 764 self.current_user_atom = None
  765 + self.document_persist_obj = None
  766 + self.persist_object_directory = None
293 767
294 # basic compatibility check: root directory structure is 768 # basic compatibility check: root directory structure is
295 # [['\x05DocumentSummaryInformation'], 769 # [['\x05DocumentSummaryInformation'],
@@ -304,12 +778,12 @@ class PptParser(object): @@ -304,12 +778,12 @@ class PptParser(object):
304 root_streams = [stream[0].lower() for stream in root_streams] 778 root_streams = [stream[0].lower() for stream in root_streams]
305 if not 'current user' in root_streams: 779 if not 'current user' in root_streams:
306 self._fail('root', 'listdir', root_streams, 'Current User') 780 self._fail('root', 'listdir', root_streams, 'Current User')
307 - if not 'powerpoint document' in root_streams:  
308 - self._fail('root', 'listdir', root_streams, 'PowerPoint Document') 781 + if not MAIN_STREAM_NAME.lower() in root_streams:
  782 + self._fail('root', 'listdir', root_streams, MAIN_STREAM_NAME)
309 783
310 def _log_exception(self, msg=None): 784 def _log_exception(self, msg=None):
311 """ log an exception instead of raising it 785 """ log an exception instead of raising it
312 - 786 +
313 call in one of 2 ways: 787 call in one of 2 ways:
314 try: 788 try:
315 if fail(): 789 if fail():
@@ -348,7 +822,7 @@ class PptParser(object): @@ -348,7 +822,7 @@ class PptParser(object):
348 822
349 if self.current_user_atom is not None: 823 if self.current_user_atom is not None:
350 log.warning('re-reading and overwriting ' 824 log.warning('re-reading and overwriting '
351 - 'previously read CurrentUserAtom') 825 + 'previously read current_user_atom')
352 826
353 try: 827 try:
354 self.current_user_atom = CurrentUserAtom.extract_from(self.ole) 828 self.current_user_atom = CurrentUserAtom.extract_from(self.ole)
@@ -358,41 +832,130 @@ class PptParser(object): @@ -358,41 +832,130 @@ class PptParser(object):
358 else: 832 else:
359 self._log_exception() 833 self._log_exception()
360 834
def parse_persist_object_directory(self):
    """ Part 1: Construct the persist object directory

    Starts at the UserEditAtom that the CurrentUserAtom points to and
    follows the chain of offset_last_edit links through the main stream
    until an offset of 0 is reached. For every UserEditAtom, the
    PersistDirectoryAtom it refers to is read and its entries are saved in
    self.persist_object_directory (persist_id --> rg_persist_offset).

    Edits are visited from newest to oldest, so an entry is only saved if
    its persist_id is not in the directory yet; this way newer edits take
    precedence over older ones.

    Parses the CurrentUserAtom first if that has not been done yet.

    Problems are raised if self.fast_fail is set and are only logged
    otherwise; in the latter case the directory keeps whatever was read
    before the problem occurred.
    """

    if self.persist_object_directory is not None:
        log.warning('re-reading and overwriting '
                    'previously read persist_object_directory')

    if self.current_user_atom is None:
        self.parse_current_user()
    if self.current_user_atom is None:
        # without fast_fail, parse_current_user only logs its errors and
        # leaves the atom unset; there is nothing to start from then
        log.warning('could not read current_user_atom, '
                    'skip parsing of persist_object_directory')
        return

    offset = self.current_user_atom.offset_to_current_edit
    is_encrypted = self.current_user_atom.is_encrypted()
    self.persist_object_directory = {}

    # offsets of UserEditAtoms parsed so far; a malformed file could link
    # edits in a circle which would keep the loop below running forever
    seen_offsets = set()

    stream = None
    try:
        log.debug('opening stream')
        stream = self.ole.openstream(MAIN_STREAM_NAME)
        while offset != 0:
            if offset in seen_offsets:
                self._fail(MAIN_STREAM_NAME,
                           'UserEditAtom.offset_last_edit', offset,
                           'offset of an edit not parsed before')
                break
            seen_offsets.add(offset)

            stream.seek(offset, os.SEEK_SET)
            user_edit = UserEditAtom.extract_from(stream, is_encrypted)

            log.debug('checking validity')
            errs = user_edit.check_validity()
            if errs:
                log.warning('check_validity found {} issues'
                            .format(len(errs)))
            for err in errs:
                log.warning('UserEditAtom.check_validity: {}'.format(err))
            if errs and self.fast_fail:
                raise errs[0]

            log.debug('seeking to pos {}'
                      .format(user_edit.offset_persist_directory))
            stream.seek(user_edit.offset_persist_directory, os.SEEK_SET)

            persist_dir_atom = PersistDirectoryAtom.extract_from(stream)

            log.debug('checking validity')
            errs = persist_dir_atom.check_validity(offset)
            if errs:
                log.warning('check_validity found {} issues'
                            .format(len(errs)))
            for err in errs:
                log.warning('PersistDirectoryAtom.check_validity: {}'
                            .format(err))
            if errs and self.fast_fail:
                raise errs[0]

            for entry in persist_dir_atom.rg_persist_dir_entry:
                if entry.persist_id in self.persist_object_directory:
                    # already set by a newer edit which must not be
                    # replaced by data from an older one
                    log.debug('keeping newer offsets for persist_id {}'
                              .format(entry.persist_id))
                    continue
                log.debug('saving {} offsets for persist_id {}'
                          .format(len(entry.rg_persist_offset),
                                  entry.persist_id))
                self.persist_object_directory[entry.persist_id] = \
                    entry.rg_persist_offset

            # check for more
            offset = user_edit.offset_last_edit
    except Exception:
        if self.fast_fail:
            raise
        else:
            self._log_exception()
    finally:
        if stream is not None:
            log.debug('closing stream')
            stream.close()
  903 +
  904 + def parse_document_persist_object(self):
  905 + """ """
  906 + if self.document_persist_obj is not None:
  907 + log.warning('re-reading and overwriting '
  908 + 'previously read document_persist_object')
  909 +
  910 + if self.persist_object_directory is None:
  911 + self.parse_persist_object_directory()
  912 +
  913 + offset = None # TODO: read from object directory
369 stream = None 914 stream = None
370 915
371 try: 916 try:
372 - stream = self.ole.openstream('PowerPoint Document') 917 + log.debug('opening stream')
  918 + stream = self.ole.openstream(MAIN_STREAM_NAME)
  919 + log.debug('stream pos: {}'.format(stream.tell()))
373 stream.seek(offset) 920 stream.seek(offset)
374 - user_edit = UserEditAtom.extract_from(stream, is_encrypted) 921 + log.debug('seek by {} to {}'.format(offset, stream.tell()))
  922 + self.document_persist_obj = DocumentContainer.extract_from(stream)
  923 + except Exception:
  924 + if self.fast_fail:
  925 + raise
  926 + else:
  927 + self._log_exception()
375 finally: 928 finally:
376 if stream is not None: 929 if stream is not None:
377 log.debug('closing stream') 930 log.debug('closing stream')
378 stream.close() 931 stream.close()
379 932
  933 + log.debug('checking validity')
  934 + errs = self.document_persist_obj.check_validity()
  935 + if errs:
  936 + log.warning('check_validity found {} issues'.format(len(errs)))
  937 + for err in errs:
  938 + log.warning('check_validity(document_persist_obj): {}'
  939 + .format(err))
  940 + if errs and self.fast_fail:
  941 + raise errs[0]
  942 +
380 # === TESTING ================================================================= 943 # === TESTING =================================================================
381 944
def test():
    """ for testing and debugging

    Enables debug logging and runs the parser on a fixed sample file that
    is expected in the current working directory.
    """
    # show everything, prefixed with the level name
    log_format = '%(levelname)-8s %(message)s'
    logging.basicConfig(format=log_format, level=logging.DEBUG)
    log.setLevel(logging.NOTSET)

    # sample presentation with some autostart macros
    test_file = 'gelaber_autostart.ppt'

    # parsing the document persist object triggers all earlier parse steps
    parser = PptParser(test_file, fast_fail=False)
    parser.parse_document_persist_object()
396 959
397 960
398 if __name__ == '__main__': 961 if __name__ == '__main__':