Commit 63dafd09eec50aa4a50b2cbe5752c3a3c5cd0f0c

Authored by Christian Herdtweck
1 parent dd5ee6df

added base type PptType, parse all of persist dir (not just first), added DummyType and DocumentContainer

also ran through pylint

rather non-atomic, sorry
Showing 1 changed file with 605 additions and 42 deletions
oletools/ppt_parser.py
... ... @@ -16,6 +16,8 @@ References:
16 16 # TODO
17 17 #------------------------------------------------------------------------------
18 18 # TODO:
  19 +# - make CurrentUserAtom and UserEditAtom PptTypes; adjust parse
  20 +# - make stream optional in PptUnexpectedData
19 21 # - license
20 22 # - create a AtomBase class that defines check_value and parses RecordHead?
21 23 #
... ... @@ -30,6 +32,7 @@ import sys
30 32 import logging
31 33 import struct
32 34 import traceback
  35 +import os
33 36  
34 37 import thirdparty.olefile as olefile
35 38 from olevba import get_logger
... ... @@ -41,6 +44,8 @@ log = get_logger('ppt')
41 44  
42 45 #--- CONSTANTS ----------------------------------------------------------------
43 46  
  47 +# name of main stream
  48 +MAIN_STREAM_NAME = 'PowerPoint Document'
44 49  
45 50 # URL and message to report issues:
46 51 URL_OLEVBA_ISSUES = 'https://bitbucket.org/decalage/oletools/issues'
... ... @@ -66,7 +71,7 @@ def check_value(name, value, expected):
66 71 """ simplify verification of values in extract_from """
67 72 if isinstance(expected, (list, tuple)):
68 73 if value not in expected:
69   - exp_str = '[' + ' OR '.join('{0:04X}'.format(val)
  74 + exp_str = '[' + ' OR '.join('{0:04X}'.format(val)
70 75 for val in expected) + ']'
71 76 raise PptUnexpectedData(
72 77 'Current User', name,
... ... @@ -78,7 +83,7 @@ def check_value(name, value, expected):
78 83  
79 84  
80 85 class RecordHeader(object):
81   - """ a record header, often found in ppt files
  86 + """ a record header, at start of many types found in ppt files
82 87  
83 88 https://msdn.microsoft.com/en-us/library/dd926377%28v=office.12%29.aspx
84 89 https://msdn.microsoft.com/en-us/library/dd948895%28v=office.12%29.aspx
... ... @@ -97,7 +102,7 @@ class RecordHeader(object):
97 102 obj = clz()
98 103 # first half byte is version, next 3 half bytes are instance
99 104 version_instance, = struct.unpack('<H', stream.read(2))
100   - obj.rec_ver, obj.rec_instance = divmod(version_instance, 16)
  105 + obj.rec_instance, obj.rec_ver = divmod(version_instance, 2**4)
101 106 obj.rec_type, = struct.unpack('<H', stream.read(2))
102 107 obj.rec_len, = struct.unpack('<L', stream.read(4))
103 108 return obj
... ... @@ -151,13 +156,14 @@ class CurrentUserAtom(object):
151 156 stream = None
152 157 try:
153 158 # open stream
  159 + log.debug('opening stream')
154 160 stream = ole.openstream('Current User')
155 161 obj = clz()
156 162  
157 163 # parse record header
158 164 obj.rec_head = RecordHeader.extract_from(stream)
159 165 check_value('rec_version', obj.rec_head.rec_ver, 0)
160   - check_value('rec_instance', obj.rec_head.rec_ver, 0)
  166 + check_value('rec_instance', obj.rec_head.rec_instance, 0)
161 167 check_value('rec_type', obj.rec_head.rec_type, clz.RECORD_TYPE)
162 168  
163 169 size, = struct.unpack('<L', stream.read(4))
... ... @@ -165,13 +171,8 @@ class CurrentUserAtom(object):
165 171 obj.header_token, = struct.unpack('<L', stream.read(4))
166 172 check_value('headerToken', obj.header_token,
167 173 [clz.HEADER_TOKEN_ENCRYPT, clz.HEADER_TOKEN_NOCRYPT])
168   - log.debug('headerToken is encrypt: {}'
169   - .format(obj.header_token == clz.HEADER_TOKEN_ENCRYPT))
170 174 obj.offset_to_current_edit, = struct.unpack('<L', stream.read(4))
171   - log.debug('offsetToCurrentEdit: {0} (0x{0:04X})'
172   - .format(obj.offset_to_current_edit))
173 175 obj.len_user_name, = struct.unpack('<H', stream.read(2))
174   - log.debug('lenUserName: {}'.format(obj.len_user_name))
175 176 if obj.len_user_name > 255:
176 177 raise PptUnexpectedData(
177 178 'Current User', 'CurrentUserAtom.lenUserName',
... ... @@ -185,24 +186,146 @@ class CurrentUserAtom(object):
185 186 check_value('minorVersion', obj.minor_version, clz.MINOR_VERSION)
186 187 stream.read(2) # unused
187 188 obj.ansi_user_name = stream.read(obj.len_user_name)
188   - log.debug('ansiUserName: {!r}'.format(obj.ansi_user_name))
189 189 obj.rel_version, = struct.unpack('<L', stream.read(4))
190   - log.debug('relVersion: {0:04X}'.format(obj.rel_version))
191 190 check_value('relVersion', obj.rel_version,
192 191 [clz.REL_VERSION_CAN_USE, clz.REL_VERSION_NO_USE])
193 192 obj.unicode_user_name = stream.read(2 * obj.len_user_name)
194   - log.debug('unicodeUserName: {!r}'.format(obj.unicode_user_name))
195 193  
196 194 return obj
197 195  
198   - except Exception:
199   - raise
200 196 finally:
201 197 if stream is not None:
202 198 log.debug('closing stream')
203 199 stream.close()
204 200  
205   -class UserEditAtom(object):
  201 +
  202 +class PptType(object):
  203 + """ base class of data types found in ppt ole files
  204 +
  205 + starts with a RecordHeader, has a extract_from and a check_validity method
  206 + """
  207 +
  208 + RECORD_TYPE = None # must be specified in subclasses
  209 + RECORD_INSTANCE = 0x0 # can be overwritten in subclasses
  210 + RECORD_VERSION = 0x000 # can be overwritten in subclasses
  211 +
  212 + @classmethod
  213 + def extract_from(clz, stream):
  214 + raise NotImplementedError('abstract base function!')
  215 +
  216 + def __init__(self, stream_name=MAIN_STREAM_NAME):
  217 + self.stream = None
  218 + self.stream_name = stream_name
  219 + self.rec_head = None
  220 +
  221 + def read_rec_head(self, stream):
  222 + self.rec_head = RecordHeader.extract_from(stream)
  223 +
  224 + def set_stream(self, stream):
  225 + """ need to call before any read_... method """
  226 + self.stream = stream
  227 +
  228 + def unset_stream(self):
  229 + """ should call after any read_... method """
  230 + self.stream = None
  231 +
  232 + def read_1(self):
  233 + """ read 1 byte from stream """
  234 + return struct.unpack('<B', self.stream.read(1))[0]
  235 +
  236 + def read_2(self):
  237 + """ read 2 byte (short) from stream """
  238 + return struct.unpack('<H', self.stream.read(2))[0]
  239 +
  240 + def read_4(self):
  241 + """ read 4 byte (long) from stream """
  242 + return struct.unpack('<L', self.stream.read(4))[0]
  243 +
  244 + def check_validity(self):
  245 + """ to be overwritten in subclasses
  246 +
  247 + :returns: list of PptUnexpectedData
  248 + """
  249 + raise NotImplementedError('abstract base function!')
  250 +
  251 + def check_value(self, name, value, expected):
  252 + """ simplify verification of values: check value equals/is in expected
  253 +
  254 + :returns: list of PptUnexpectedData exceptions
  255 + """
  256 + if isinstance(expected, (list, tuple)):
  257 + if value not in expected:
  258 + clz_name = self.__class__.__name__
  259 + exp_str = '[' + ' OR '.join('{0:04X}'.format(val)
  260 + for val in expected) + ']'
  261 + return [PptUnexpectedData(
  262 + self.stream_name, clz_name + '.' + name,
  263 + '{0:04X}'.format(value), exp_str), ]
  264 + elif expected != value:
  265 + clz_name = self.__class__.__name__
  266 + return [PptUnexpectedData(
  267 + self.stream_name, clz_name + '.' + name,
  268 + '{0:04X}'.format(value), '{0:04X}'.format(expected)), ]
  269 + return []
  270 +
  271 + def check_range(self, name, value, expect_lower, expect_upper):
  272 + """ simplify verification of values: check value is in given range
  273 +
  274 + expect_lower or expected_upper can be given as None to check only one
  275 + boundary. If value equals one of the boundaries, that is also an error
  276 + (boundaries form an open interval)
  277 +
  278 + :returns: list of PptUnexpectedData exceptions
  279 + """
  280 +
  281 + is_err = False
  282 + if expect_upper is None and expect_lower is None:
  283 + raise ValueError('need at least one non-None boundary!')
  284 + if expect_lower is not None:
  285 + if value <= expect_lower:
  286 + is_err = True
  287 + if expect_upper is not None:
  288 + if value >= expect_upper:
  289 + is_err = True
  290 +
  291 + if is_err:
  292 + clz_name = self.__class__.__name__
  293 + if expect_lower is None:
  294 + expect_str = '< {0:04X}'.format(expect_upper)
  295 + elif expect_upper is None:
  296 + expect_str = '> {0:04X}'.format(expect_lower)
  297 + else:
  298 + expect_str = 'within ({0:04X}, {1:04X})'.format(expect_lower,
  299 + expect_upper)
  300 + return [PptUnexpectedData(self.stream_name, clz_name + '.' + name,
  301 + '{0:04X}'.format(value), expect_str), ]
  302 + else:
  303 + return []
  304 +
  305 + def check_rec_head(self, length=None):
  306 + """ to be called by check_validity to check the self.rec_head
  307 +
  308 + uses self.RECORD_... constants, (not quite that constant for DummyType)
  309 + """
  310 +
  311 + errs = []
  312 + errs.extend(self.check_value('rec_head.recVer', self.rec_head.rec_ver,
  313 + self.RECORD_VERSION))
  314 + errs.extend(self.check_value('rec_head.recInstance',
  315 + self.rec_head.rec_instance,
  316 + self.RECORD_INSTANCE))
  317 + if self.RECORD_TYPE is None:
  318 + raise NotImplementedError('RECORD_TYPE not specified!')
  319 + errs.extend(self.check_value('rec_head.recType',
  320 + self.rec_head.rec_type,
  321 + self.RECORD_TYPE))
  322 + if length is not None:
  323 + errs.extend(self.check_value('rec_head.recLen',
  324 + self.rec_head.rec_len, length))
  325 + return errs
  326 +
  327 +
  328 +class UserEditAtom(PptType):
206 329 """ An atom record that specifies information about a user edit
207 330  
208 331 https://msdn.microsoft.com/en-us/library/dd945746%28v=office.12%29.aspx
... ... @@ -213,6 +336,7 @@ class UserEditAtom(object):
213 336 MAJOR_VERSION = 0x03
214 337  
215 338 def __init__(self):
  339 + super(UserEditAtom, self).__init__()
216 340 self.rec_head = None
217 341 self.last_slide_id_ref = None
218 342 self.version = None
... ... @@ -235,35 +359,383 @@ class UserEditAtom(object):
235 359  
236 360 # parse record header
237 361 obj.rec_head = RecordHeader.extract_from(stream)
238   - check_value('rec_version', obj.rec_head.rec_ver, 0)
239   - check_value('rec_instance', obj.rec_head.rec_ver, 0)
240   - check_value('rec_type', obj.rec_head.rec_type, clz.RECORD_TYPE)
241 362  
242 363 obj.last_slide_id_ref, = struct.unpack('<L', stream.read(4))
243 364 obj.version, = struct.unpack('<H', stream.read(2))
244 365 obj.minor_version, = struct.unpack('<B', stream.read(1))
245   - check_value('minorVersion', obj.minor_version, clz.MINOR_VERSION)
246 366 obj.major_version, = struct.unpack('<B', stream.read(1))
247   - check_value('majorVersion', obj.major_version, clz.MAJOR_VERSION)
248 367 obj.offset_last_edit, = struct.unpack('<L', stream.read(4))
249   - log.debug('offsetLastEdit: {0} (0x{0:04X})'.format(obj.offset_last_edit))
250   - # todo: check that this is before start pos / prev pos; 0x000 is end
251 368 obj.offset_persist_directory, = struct.unpack('<L', stream.read(4))
252   - log.debug('offsetPersistDir: {0} (0x{0:04X})'
253   - .format(obj.offset_persist_directory))
254 369 obj.doc_persist_id_ref, = struct.unpack('<L', stream.read(4))
255   - check_value('docPersistIdRef', obj.doc_persist_id_ref, 1)
256 370 obj.persist_id_seed, = struct.unpack('<L', stream.read(4))
  371 + # (can only check once have the PersistDirectoryAtom)
257 372 obj.last_view, = struct.unpack('<H', stream.read(2))
258 373 stream.read(2) # unused
259 374 if is_encrypted:
260   - obj.encrypt_session_persist_id_ref, = struct.unpack('<L',
261   - stream.read(4))
262   - else:
  375 + obj.encrypt_session_persist_id_ref, = \
  376 + struct.unpack('<L', stream.read(4))
  377 + else: # this entry may be there or may not
263 378 obj.encrypt_session_persist_id_ref = None
264 379  
265 380 return obj
266 381  
  382 + def check_validity(self, offset=None):
  383 + errs = self.check_rec_head()
  384 + errs.extend(self.check_value('minorVersion', self.minor_version,
  385 + self.MINOR_VERSION))
  386 + errs.extend(self.check_value('majorVersion', self.major_version,
  387 + self.MAJOR_VERSION))
  388 + if offset is not None:
  389 + if self.offset_last_edit >= offset:
  390 + errs.append(PptUnexpectedData(
  391 + 'PowerPoint Document', 'UserEditAtom.offsetLastEdit',
  392 + self.offset_last_edit, '< {}'.format(offset)))
  393 + if self.offset_persist_directory >= offset or \
  394 + self.offset_persist_directory <= self.offset_last_edit:
  395 + errs.append(PptUnexpectedData(
  396 + 'PowerPoint Document',
  397 + 'UserEditAtom.offsetPersistDirectory',
  398 + self.offset_last_edit,
  399 + 'in ({}, {})'.format(self.offset_last_edit, offset)))
  400 + errs.extend(self.check_value('docPersistIdRef',
  401 + self.doc_persist_id_ref, 1))
  402 + return errs
  403 +
  404 + # TODO: offer to check persist_id_seed given PersistDirectoryAtom)
  405 +
  406 +
  407 +class DummyType(PptType):
  408 + """ a type that is found in ppt documents we are not interested in
  409 +
  410 + instead of parsing many uninteresting types, we just read their
  411 + RecordHeader and set the RECORD_... values on an instance- (instead of
  412 + class-) level
  413 +
  414 + used to skip over uninteresting types in e.g. DocumentContainer
  415 + """
  416 +
  417 + def __init__(self, type_name, record_type, rec_ver=0, rec_instance=0,
  418 + rec_len=None):
  419 + super(DummyType, self).__init__()
  420 + self.type_name = type_name
  421 + self.RECORD_TYPE = record_type
  422 + self.RECORD_VERSION = rec_ver
  423 + self.RECORD_INSTANCE = rec_instance
  424 + self.record_length = rec_len
  425 +
  426 + def extract_from(self, stream):
  427 + """ extract record header and just skip as many bytes as header says
  428 +
  429 + Since this requires RECORD_... values set in constructor, this is NOT
  430 + a classmethod like all the other extract_from!
  431 +
  432 + Otherwise this tries to be compatible with other extract_from methods
  433 + (e.g. returns self)
  434 + """
  435 + self.read_rec_head(stream)
  436 + log.debug('skipping over {} Byte for type {}'
  437 + .format(self.rec_head.rec_len, self.type_name))
  438 + log.debug('start at pos {}'.format(stream.tell()))
  439 + stream.seek(self.rec_head.rec_len, os.SEEK_CUR)
  440 + log.debug('now at pos {}'.format(stream.tell()))
  441 + return self
  442 +
  443 + def check_validity(self):
  444 + return self.check_rec_head(self.record_length)
  445 +
  446 +
  447 +class PersistDirectoryAtom(PptType):
  448 + """ one part of a persist object directory with unique persist object id
  449 +
  450 + contains PersistDirectoryEntry objects
  451 +
  452 + https://msdn.microsoft.com/en-us/library/dd952680%28v=office.12%29.aspx
  453 + """
  454 +
  455 + RECORD_TYPE = 0x1772
  456 +
  457 + def __init__(self):
  458 + super(PersistDirectoryAtom, self).__init__()
  459 + self.rg_persist_dir_entry = None # actually, this will be an array
  460 + self.stream_offset = None
  461 +
  462 + @classmethod
  463 + def extract_from(clz, stream):
  464 + """ create and return object with data from given stream """
  465 +
  466 + log.debug("Extracting a PersistDirectoryAtom from stream")
  467 + obj = clz()
  468 +
  469 + # remember own offset for checking validity
  470 + obj.stream_offset = stream.tell()
  471 +
  472 + # parse record header
  473 + obj.read_rec_head(stream)
  474 +
  475 + # read directory entries from list until reach size for this object
  476 + curr_pos = stream.tell()
  477 + stop_pos = curr_pos + obj.rec_head.rec_len
  478 + log.debug('start reading at pos {}, read until {}'
  479 + .format(curr_pos, stop_pos))
  480 + obj.rg_persist_dir_entry = []
  481 +
  482 + while curr_pos < stop_pos:
  483 + new_entry = PersistDirectoryEntry.extract_from(stream)
  484 + obj.rg_persist_dir_entry.append(new_entry)
  485 + curr_pos = stream.tell()
  486 + log.debug('at pos {}'.format(curr_pos))
  487 + return obj
  488 +
  489 + def check_validity(self, user_edit_last_offset=None):
  490 + errs = self.check_rec_head()
  491 + for entry in self.rg_persist_dir_entry:
  492 + errs.extend(entry.check_validity(user_edit_last_offset,
  493 + self.stream_offset))
  494 + return errs
  495 +
  496 +
  497 +class PersistDirectoryEntry(object):
  498 + """ an entry contained in a PersistDirectoryAtom.rg_persist_dir_entry
  499 +
  500 + A structure that specifies a compressed table of sequential persist object
  501 + identifiers and stream offsets to associated persist objects.
  502 +
  503 + NOT a subclass of PptType because has no RecordHeader
  504 +
  505 + https://msdn.microsoft.com/en-us/library/dd947347%28v=office.12%29.aspx
  506 + """
  507 +
  508 + def __init__(self):
  509 + self.persist_id = None
  510 + self.c_persist = None
  511 + self.rg_persist_offset = None
  512 +
  513 + @classmethod
  514 + def extract_from(clz, stream):
  515 + # take a 4-byte (=32bit) number, divide into 20bit and 12 bit)
  516 + log.debug("Extracting a PersistDirectoryEntry from stream")
  517 + obj = clz()
  518 +
  519 + # persistId (20 bits): An unsigned integer that specifies a starting
  520 + # persist object identifier. It MUST be less than or equal to 0xFFFFE.
  521 + # The first entry in rgPersistOffset is associated with persistId. The
  522 + # next entry, if present, is associated with persistId plus 1. Each
  523 + # entry in rgPersistOffset is associated with a persist object
  524 + # identifier in this manner, with the final entry associated with
  525 + # persistId + cPersist - 1.
  526 +
  527 + # cPersist (12 bits): An unsigned integer that specifies the count of
  528 + # items in rgPersistOffset. It MUST be greater than or equal to 0x001.
  529 + temp, = struct.unpack('<L', stream.read(4))
  530 + obj.c_persist, obj.persist_id = divmod(temp, 2**20)
  531 + log.debug('temp is 0x{0:04X} --> id is {1}, reading {2} offsets'
  532 + .format(temp, obj.persist_id, obj.c_persist))
  533 +
  534 + # rgPersistOffset (variable): An array of PersistOffsetEntry (section
  535 + # 2.3.6) that specifies stream offsets to persist objects. The count of
  536 + # items in the array is specified by cPersist. The value of each item
  537 + # MUST be greater than or equal to offsetLastEdit in the corresponding
  538 + # user edit and MUST be less than the offset, in bytes, of the
  539 + # corresponding persist object directory.
  540 + # PersistOffsetEntry: An unsigned 4-byte integer that specifies an
  541 + # offset, in bytes, from the beginning of the PowerPoint Document
  542 + # Stream (section 2.1.2) to a persist object.
  543 + obj.rg_persist_offset = [struct.unpack('<L', stream.read(4))[0] \
  544 + for _ in range(obj.c_persist)]
  545 + log.debug('offsets are: {}'.format(obj.rg_persist_offset))
  546 + return obj
  547 +
  548 + def check_validity(self, user_edit_last_offset=None,
  549 + persist_obj_dir_offset=None):
  550 + errs = []
  551 + if self.persist_id > 0xFFFFE: # (--> == 0xFFFFF since 20bit)
  552 + errs.append(PptUnexpectedData(
  553 + MAIN_STREAM_NAME, 'PersistDirectoryEntry.persist_id',
  554 + self.persist_id, '< 0xFFFFE (dec: {})'.format(0xFFFFE)))
  555 + if self.c_persist == 0:
  556 + errs.append(PptUnexpectedData(
  557 + MAIN_STREAM_NAME, 'PersistDirectoryEntry.c_persist',
  558 + self.c_persist, '> 0'))
  559 + if user_edit_last_offset is not None \
  560 + and min(self.rg_persist_offset) < user_edit_last_offset:
  561 + errs.append(PptUnexpectedData(
  562 + MAIN_STREAM_NAME, 'PersistDirectoryEntry.rg_persist_offset',
  563 + min(self.rg_persist_offset),
  564 + '> UserEdit.offsetLastEdit ({})'
  565 + .format(user_edit_last_offset)))
  566 + if persist_obj_dir_offset is not None \
  567 + and max(self.rg_persist_offset) > persist_obj_dir_offset:
  568 + errs.append(PptUnexpectedData(
  569 + MAIN_STREAM_NAME, 'PersistDirectoryEntry.rg_persist_offset',
  570 + max(self.rg_persist_offset),
  571 + '> PersistObjectDirectory offset ({})'
  572 + .format(persist_obj_dir_offset)))
  573 + return errs
  574 +
  575 +
  576 +class DocInfoListContainer(PptType):
  577 + """ information about the document and document display settings
  578 +
  579 + https://msdn.microsoft.com/en-us/library/dd926767%28v=office.12%29.aspx
  580 + """
  581 +
  582 + RECORD_VERSION = 0xF
  583 + RECORD_TYPE = 0x07D0
  584 +
  585 + def __init__(self):
  586 + super(DocInfoListContainer, self).__init__()
  587 +
  588 +
  589 +class DocumentContainer(PptType):
  590 + """ a DocumentContainer record
  591 +
  592 + https://msdn.microsoft.com/en-us/library/dd947357%28v=office.12%29.aspx
  593 + """
  594 +
  595 + RECORD_TYPE = 0x03E8
  596 +
  597 + def __init__(self):
  598 + super(DocumentContainer, self).__init__()
  599 + self.document_atom = None
  600 + self.ex_obj_list = None
  601 + self.document_text_info = None
  602 + self.sound_collection = None
  603 + self.drawing_group = None
  604 + self.master_list = None
  605 + self.doc_info_list = None
  606 + self.slide_hf = None
  607 + self.notes_hf = None
  608 + self.slide_list = None
  609 + self.notes_list = None
  610 + self.slide_show_doc_info = None
  611 + self.named_shows = None
  612 + self.summary = None
  613 + self.doc_routing_slip = None
  614 + self.print_options = None
  615 + self.rt_custom_table_styles_1 = None
  616 + self.end_document = None
  617 + self.rt_custom_table_styles_2 = None
  618 +
  619 + @classmethod
  620 + def extract_from(clz, stream):
  621 + """ created object with values from given stream
  622 +
  623 + stream is assumed to be positioned correctly
  624 +
  625 + this container contains lots of data we are not interested in.
  626 + """
  627 + obj = clz()
  628 +
  629 + # parse record header
  630 + obj.read_rec_head(stream)
  631 +
  632 + # documentAtom (48 bytes): A DocumentAtom record (section 2.4.2) that
  633 + # specifies size information for presentation slides and notes slides.
  634 + obj.document_atom = DummyType('DocumentAtom', 0x03E9, rec_ver=0x1,
  635 + rec_len=0x28).extract_from(stream)
  636 +
  637 + # exObjList (variable): An optional ExObjListContainer record (section
  638 + # 2.10.1) that specifies the list of external objects in the document.
  639 + obj.ex_obj_list = DummyType('ExObjListContainer', 0x0409, rec_ver=0xF)\
  640 + .extract_from(stream)
  641 +
  642 + # documentTextInfo (variable): A DocumentTextInfoContainer record
  643 + # (section 2.9.1) that specifies the default text styles for the
  644 + # document.
  645 + obj.document_text_info = DummyType('DocumentTextInfoContainer', 0x03F2,
  646 + rec_ver=0xF).extract_from(stream)
  647 +
  648 + # soundCollection (variable): An optional SoundCollectionContainer
  649 + # record (section 2.4.16.1) that specifies the list of sounds in the
  650 + # file.
  651 + obj.sound_collection = DummyType('SoundCollectionContainer', 0x07E4,
  652 + rec_ver=0xF, rec_instance=0x005)\
  653 + .extract_from(stream)
  654 +
  655 + # drawingGroup (variable): A DrawingGroupContainer record (section
  656 + # 2.4.3) that specifies drawing information for the document.
  657 + obj.drawing_group = DummyType('DrawingGroupContainer', 0x040B,
  658 + rec_ver=0xF).extract_from(stream)
  659 +
  660 + # masterList (variable): A MasterListWithTextContainer record (section
  661 + # 2.4.14.1) that specifies the list of main master slides and title
  662 + # master slides.
  663 + obj.master_list = DummyType('MasterListWithContainer', 0x0FF0,
  664 + rec_ver=0xF).extract_from(stream)
  665 +
  666 + # docInfoList (variable): An optional DocInfoListContainer record
  667 + # (section 2.4.4) that specifies additional document information.
  668 + # this is the variable we are interested in!
  669 + obj.doc_info_list = DocInfoListContainer.extract_from(stream)
  670 +
  671 + # slideHF (variable): An optional SlideHeadersFootersContainer record
  672 + # (section 2.4.15.1) that specifies the default header and footer
  673 + # information for presentation slides.
  674 + obj.slide_hf = None
  675 +
  676 + # notesHF (variable): An optional NotesHeadersFootersContainer record
  677 + # (section 2.4.15.6) that specifies the default header and footer
  678 + # information for notes slides.
  679 + obj.notes_hf = None
  680 +
  681 + # slideList (variable): An optional SlideListWithTextContainer record
  682 + # (section 2.4.14.3) that specifies the list of presentation slides.
  683 + obj.slide_list = None
  684 +
  685 + # notesList (variable): An optional NotesListWithTextContainer record
  686 + # (section 2.4.14.6) that specifies the list of notes slides.
  687 + obj.notes_list = None
  688 +
  689 + # slideShowDocInfoAtom (88 bytes): An optional SlideShowDocInfoAtom
  690 + # record (section 2.6.1) that specifies slide show information for the
  691 + # document.
  692 + obj.slide_show_doc_info = None
  693 +
  694 + # namedShows (variable): An optional NamedShowsContainer record
  695 + # (section 2.6.2) that specifies named shows in the document.
  696 + obj.named_shows = None
  697 +
  698 + # summary (variable): An optional SummaryContainer record (section
  699 + # 2.4.22.3) that specifies bookmarks for the document.
  700 + obj.summary = None
  701 +
  702 + # docRoutingSlipAtom (variable): An optional DocRoutingSlipAtom record
  703 + # (section 2.11.1) that specifies document routing information.
  704 + obj.doc_routing_slip = None
  705 +
  706 + # printOptionsAtom (13 bytes): An optional PrintOptionsAtom record
  707 + # (section 2.4.12) that specifies default print options.
  708 + obj.print_options = None
  709 +
  710 + # rtCustomTableStylesAtom1 (variable): An optional
  711 + # RoundTripCustomTableStyles12Atom record (section 2.11.13) that
  712 + # specifies round-trip information for custom table styles.
  713 + obj.rt_custom_table_styles_1 = None
  714 +
  715 + # endDocumentAtom (8 bytes): An EndDocumentAtom record (section 2.4.13)
  716 + # that specifies the end of the information for the document.
  717 + obj.end_document = None
  718 +
  719 + # rtCustomTableStylesAtom2 (variable): An optional
  720 + # RoundTripCustomTableStyles12Atom record that specifies round-trip
  721 + # information for custom table styles. It MUST NOT exist if
  722 + # rtCustomTableStylesAtom1 exists.
  723 + obj.rt_custom_table_styles_2 = None
  724 +
  725 + return obj
  726 +
  727 +
  728 + def check_validity(self):
  729 + """ check all values in object for valid values """
  730 + errs = self.check_rec_head()
  731 + errs.extend(self.document_atom.check_validity())
  732 + errs.extend(self.ex_obj_list.check_validity())
  733 + errs.extend(self.document_text_info.check_validity())
  734 + errs.extend(self.sound_collection.check_validity())
  735 + errs.extend(self.drawing_group.check_validity())
  736 + errs.extend(self.master_list.check_validity())
  737 + errs.extend(self.doc_info_list.check_validity())
  738 + return errs
267 739  
268 740 # === PptParser ===============================================================
269 741  
... ... @@ -276,7 +748,7 @@ class PptParser(object):
276 748  
277 749 def __init__(self, ole, fast_fail=False):
278 750 """ constructor
279   -
  751 +
280 752 :param ole: OleFileIO or anything that OleFileIO constructor accepts
281 753 :param bool fast_fail: if True, all unexpected data will raise a
282 754 PptUnexpectedData; if False will only log error
... ... @@ -290,6 +762,8 @@ class PptParser(object):
290 762 self.fast_fail = fast_fail
291 763  
292 764 self.current_user_atom = None
  765 + self.document_persist_obj = None
  766 + self.persist_object_directory = None
293 767  
294 768 # basic compatibility check: root directory structure is
295 769 # [['\x05DocumentSummaryInformation'],
... ... @@ -304,12 +778,12 @@ class PptParser(object):
304 778 root_streams = [stream[0].lower() for stream in root_streams]
305 779 if not 'current user' in root_streams:
306 780 self._fail('root', 'listdir', root_streams, 'Current User')
307   - if not 'powerpoint document' in root_streams:
308   - self._fail('root', 'listdir', root_streams, 'PowerPoint Document')
  781 + if not MAIN_STREAM_NAME.lower() in root_streams:
  782 + self._fail('root', 'listdir', root_streams, MAIN_STREAM_NAME)
309 783  
310 784 def _log_exception(self, msg=None):
311 785 """ log an exception instead of raising it
312   -
  786 +
313 787 call in one of 2 ways:
314 788 try:
315 789 if fail():
... ... @@ -348,7 +822,7 @@ class PptParser(object):
348 822  
349 823 if self.current_user_atom is not None:
350 824 log.warning('re-reading and overwriting '
351   - 'previously read CurrentUserAtom')
  825 + 'previously read current_user_atom')
352 826  
353 827 try:
354 828 self.current_user_atom = CurrentUserAtom.extract_from(self.ole)
... ... @@ -358,41 +832,130 @@ class PptParser(object):
358 832 else:
359 833 self._log_exception()
360 834  
361   - def construct_persist_object_directory(self):
362   - """ part 2 """
363   -
  835 + def parse_persist_object_directory(self):
  836 + """ Part 1: Construct the persist object directory """
  837 +
  838 + if self.persist_object_directory is not None:
  839 + log.warning('re-reading and overwriting '
  840 + 'previously read persist_object_directory')
  841 +
364 842 if self.current_user_atom is None:
365 843 self.parse_current_user()
366 844  
367 845 offset = self.current_user_atom.offset_to_current_edit
368 846 is_encrypted = self.current_user_atom.is_encrypted()
  847 + self.persist_object_directory = {}
  848 +
  849 + stream = None
  850 + try:
  851 + log.debug('opening stream')
  852 + stream = self.ole.openstream(MAIN_STREAM_NAME)
  853 + while offset != 0:
  854 +
  855 + stream.seek(offset, os.SEEK_SET)
  856 + user_edit = UserEditAtom.extract_from(stream, is_encrypted)
  857 +
  858 + log.debug('checking validity')
  859 + errs = user_edit.check_validity()
  860 + if errs:
  861 + log.warning('check_validity found {} issues'
  862 + .format(len(errs)))
  863 + for err in errs:
  864 + log.warning('UserEditAtom.check_validity: {}'.format(err))
  865 + if errs and self.fast_fail:
  866 + raise errs[0]
  867 +
  868 + log.debug('seeking to pos {}'
  869 + .format(user_edit.offset_persist_directory))
  870 + stream.seek(user_edit.offset_persist_directory, os.SEEK_SET)
  871 +
  872 + persist_dir_atom = PersistDirectoryAtom.extract_from(stream)
  873 +
  874 + log.debug('checking validity')
  875 + errs = persist_dir_atom.check_validity(offset)
  876 + if errs:
  877 + log.warning('check_validity found {} issues'
  878 + .format(len(errs)))
  879 + for err in errs:
  880 + log.warning('PersistDirectoryAtom.check_validity: {}'
  881 + .format(err))
  882 + if errs and self.fast_fail:
  883 + raise errs[0]
  884 +
  885 + for entry in persist_dir_atom.rg_persist_dir_entry:
  886 + log.debug('saving {} offsets for persist_id {}'
  887 + .format(len(entry.rg_persist_offset),
  888 + entry.persist_id))
  889 + self.persist_object_directory[entry.persist_id] = \
  890 + entry.rg_persist_offset
  891 +
  892 + # check for more
  893 + offset = user_edit.offset_last_edit
  894 + except Exception:
  895 + if self.fast_fail:
  896 + raise
  897 + else:
  898 + self._log_exception()
  899 + finally:
  900 + if stream is not None:
  901 + log.debug('closing stream')
  902 + stream.close()
  903 +
  904 + def parse_document_persist_object(self):
  905 + """ """
  906 + if self.document_persist_obj is not None:
  907 + log.warning('re-reading and overwriting '
  908 + 'previously read document_persist_object')
  909 +
  910 + if self.persist_object_directory is None:
  911 + self.parse_persist_object_directory()
  912 +
  913 + offset = None # TODO: read from object directory
369 914 stream = None
370 915  
371 916 try:
372   - stream = self.ole.openstream('PowerPoint Document')
  917 + log.debug('opening stream')
  918 + stream = self.ole.openstream(MAIN_STREAM_NAME)
  919 + log.debug('stream pos: {}'.format(stream.tell()))
373 920 stream.seek(offset)
374   - user_edit = UserEditAtom.extract_from(stream, is_encrypted)
  921 + log.debug('seek by {} to {}'.format(offset, stream.tell()))
  922 + self.document_persist_obj = DocumentContainer.extract_from(stream)
  923 + except Exception:
  924 + if self.fast_fail:
  925 + raise
  926 + else:
  927 + self._log_exception()
375 928 finally:
376 929 if stream is not None:
377 930 log.debug('closing stream')
378 931 stream.close()
379 932  
  933 + log.debug('checking validity')
  934 + errs = self.document_persist_obj.check_validity()
  935 + if errs:
  936 + log.warning('check_validity found {} issues'.format(len(errs)))
  937 + for err in errs:
  938 + log.warning('check_validity(document_persist_obj): {}'
  939 + .format(err))
  940 + if errs and self.fast_fail:
  941 + raise errs[0]
  942 +
380 943 # === TESTING =================================================================
381 944  
def test():
    """ for testing and debugging: parse a sample file with debug logging """

    # show everything, prefixed with the log level
    logging.basicConfig(format='%(levelname)-8s %(message)s',
                        level=logging.DEBUG)
    log.setLevel(logging.NOTSET)

    # sample presentation containing some autostart macros
    sample = 'gelaber_autostart.ppt'

    # parsing the document object triggers all preceding parsing steps
    parser = PptParser(sample, fast_fail=False)
    parser.parse_document_persist_object()
396 959  
397 960  
398 961 if __name__ == '__main__':
... ...