Commit dd5ee6df43c6ec547b428089ba4d5ea5e28f1ae4

Authored by Christian Herdtweck
1 parent 8ae664a2

continue with UserEditAtom

Showing 1 changed file with 119 additions and 31 deletions
oletools/ppt_parser.py
... ... @@ -17,6 +17,7 @@ References:
17 17 #------------------------------------------------------------------------------
18 18 # TODO:
19 19 # - license
  20 +# - create an AtomBase class that defines check_value and parses RecordHeader?
20 21 #
21 22 # CHANGELOG:
22 23 # 2016-05-04 v0.01 CH: - start parsing "Current User" stream
... ... @@ -61,6 +62,21 @@ class PptUnexpectedData(Exception):
61 62 # === STRUCTS =================================================================
62 63  
63 64  
def check_value(name, value, expected, stream_name='Current User'):
    """ simplify verification of values in extract_from

    Compares an integer field read from a stream with the value(s) it is
    allowed to have and raises if it does not match.

    :param str name: name of the field, used in the error
    :param int value: value found in the stream
    :param expected: one allowed value or a list/tuple of allowed values
    :param str stream_name: name of the stream the field was read from; it is
                            handed to the exception. Defaults to
                            'Current User' so existing callers are unaffected
    :raises PptUnexpectedData: if value is not (one of the) expected value(s)
    """
    if isinstance(expected, (list, tuple)):
        if value in expected:
            return
        exp_str = '[' + ' OR '.join('{0:04X}'.format(val)
                                    for val in expected) + ']'
    elif expected == value:
        return
    else:
        exp_str = '{0:04X}'.format(expected)

    # single exit for both the "one value" and the "several values" case
    raise PptUnexpectedData(stream_name, name,
                            '{0:04X}'.format(value), exp_str)
  78 +
  79 +
64 80 class RecordHeader(object):
65 81 """ a record header, often found in ppt files
66 82  
... ... @@ -123,6 +139,9 @@ class CurrentUserAtom(object):
123 139 self.unicode_user_name = None
124 140 self.rel_version = None
125 141  
def is_encrypted(self):
    """ return True if header_token equals HEADER_TOKEN_ENCRYPT """
    token = self.header_token
    return token == self.HEADER_TOKEN_ENCRYPT
  144 +
126 145 @classmethod
127 146 def extract_from(clz, ole):
128 147 """ extract info from olefile """
... ... @@ -137,21 +156,19 @@ class CurrentUserAtom(object):
137 156  
138 157 # parse record header
139 158 obj.rec_head = RecordHeader.extract_from(stream)
140   - obj.check_value('rec_version', obj.rec_head.rec_ver, 0)
141   - obj.check_value('rec_instance', obj.rec_head.rec_ver, 0)
142   - obj.check_value('rec_instance', obj.rec_head.rec_type,
143   - clz.RECORD_TYPE)
  159 + check_value('rec_version', obj.rec_head.rec_ver, 0)
  160 + check_value('rec_instance', obj.rec_head.rec_ver, 0)
  161 + check_value('rec_type', obj.rec_head.rec_type, clz.RECORD_TYPE)
144 162  
145 163 size, = struct.unpack('<L', stream.read(4))
146   - obj.check_value('size', size, obj.SIZE)
  164 + check_value('size', size, obj.SIZE)
147 165 obj.header_token, = struct.unpack('<L', stream.read(4))
148   - obj.check_value('headerToken', obj.header_token,
149   - [clz.HEADER_TOKEN_ENCRYPT,
150   - clz.HEADER_TOKEN_NOCRYPT])
  166 + check_value('headerToken', obj.header_token,
  167 + [clz.HEADER_TOKEN_ENCRYPT, clz.HEADER_TOKEN_NOCRYPT])
151 168 log.debug('headerToken is encrypt: {}'
152 169 .format(obj.header_token == clz.HEADER_TOKEN_ENCRYPT))
153 170 obj.offset_to_current_edit, = struct.unpack('<L', stream.read(4))
154   - log.debug('offsetToCurrentEdit: {0} ({0:04X})'
  171 + log.debug('offsetToCurrentEdit: {0} (0x{0:04X})'
155 172 .format(obj.offset_to_current_edit))
156 173 obj.len_user_name, = struct.unpack('<H', stream.read(2))
157 174 log.debug('lenUserName: {}'.format(obj.len_user_name))
... ... @@ -160,22 +177,19 @@ class CurrentUserAtom(object):
160 177 'Current User', 'CurrentUserAtom.lenUserName',
161 178 obj.len_user_name, '< 256')
162 179 obj.doc_file_version, = struct.unpack('<H', stream.read(2))
163   - obj.check_value('docFileVersion', obj.doc_file_version,
164   - clz.DOC_FILE_VERSION)
  180 + check_value('docFileVersion', obj.doc_file_version,
  181 + clz.DOC_FILE_VERSION)
165 182 obj.major_version, = struct.unpack('<B', stream.read(1))
166   - obj.check_value('majorVersion', obj.major_version,
167   - clz.MAJOR_VERSION)
  183 + check_value('majorVersion', obj.major_version, clz.MAJOR_VERSION)
168 184 obj.minor_version, = struct.unpack('<B', stream.read(1))
169   - obj.check_value('minorVersion', obj.minor_version,
170   - clz.MINOR_VERSION)
  185 + check_value('minorVersion', obj.minor_version, clz.MINOR_VERSION)
171 186 stream.read(2) # unused
172 187 obj.ansi_user_name = stream.read(obj.len_user_name)
173 188 log.debug('ansiUserName: {!r}'.format(obj.ansi_user_name))
174 189 obj.rel_version, = struct.unpack('<L', stream.read(4))
175 190 log.debug('relVersion: {0:04X}'.format(obj.rel_version))
176   - obj.check_value('relVersion', obj.rel_version,
177   - [clz.REL_VERSION_CAN_USE,
178   - clz.REL_VERSION_NO_USE])
  191 + check_value('relVersion', obj.rel_version,
  192 + [clz.REL_VERSION_CAN_USE, clz.REL_VERSION_NO_USE])
179 193 obj.unicode_user_name = stream.read(2 * obj.len_user_name)
180 194 log.debug('unicodeUserName: {!r}'.format(obj.unicode_user_name))
181 195  
... ... @@ -188,19 +202,67 @@ class CurrentUserAtom(object):
188 202 log.debug('closing stream')
189 203 stream.close()
190 204  
191   - def check_value(self, name, value, expected):
192   - """ simplify verification of values in extract_from """
193   - if isinstance(expected, (list, tuple)):
194   - if value not in expected:
195   - exp_str = '[' + ' OR '.join('{0:04X}'.format(val)
196   - for val in expected) + ']'
197   - raise PptUnexpectedData(
198   - 'Current User', 'CurrentUserAtom.' + name,
199   - '{0:04X}'.format(value), exp_str)
200   - elif expected != value:
201   - raise PptUnexpectedData(
202   - 'Current User', 'CurrentUserAtom.' + name,
203   - '{0:04X}'.format(value), '{0:04X}'.format(expected))
class UserEditAtom(object):
    """ An atom record that specifies information about a user edit

    https://msdn.microsoft.com/en-us/library/dd945746%28v=office.12%29.aspx
    """

    # values the corresponding fields are checked against in extract_from
    RECORD_TYPE = 0x0FF5
    MINOR_VERSION = 0x00
    MAJOR_VERSION = 0x03

    def __init__(self):
        """ create an empty atom; all fields are filled by extract_from """
        self.rec_head = None
        self.last_slide_id_ref = None
        self.version = None
        self.minor_version = None
        self.major_version = None
        self.offset_last_edit = None
        self.offset_persist_directory = None
        self.doc_persist_id_ref = None
        self.persist_id_seed = None
        self.last_view = None
        self.encrypt_session_persist_id_ref = None

    @classmethod
    def extract_from(clz, stream, is_encrypted):
        """ extract info from given stream (already positioned correctly!)

        :param stream: file-like object providing read(), positioned at the
                       record header of the atom
        :param bool is_encrypted: if True, the trailing 4-byte
                                  encryptSessionPersistIdRef is read as well
        :returns: a new UserEditAtom with all fields set
        :raises PptUnexpectedData: (through check_value) if a checked field
                                   has an unexpected value
        """

        log.debug('extract UserEditAtom from stream')

        obj = clz()

        # parse record header
        obj.rec_head = RecordHeader.extract_from(stream)
        check_value('rec_version', obj.rec_head.rec_ver, 0)
        # NOTE(review): this used to test rec_ver a second time; assumes that
        # RecordHeader provides the instance as rec_instance -- confirm
        check_value('rec_instance', obj.rec_head.rec_instance, 0)
        check_value('rec_type', obj.rec_head.rec_type, clz.RECORD_TYPE)

        obj.last_slide_id_ref, = struct.unpack('<L', stream.read(4))
        obj.version, = struct.unpack('<H', stream.read(2))
        obj.minor_version, = struct.unpack('<B', stream.read(1))
        check_value('minorVersion', obj.minor_version, clz.MINOR_VERSION)
        obj.major_version, = struct.unpack('<B', stream.read(1))
        check_value('majorVersion', obj.major_version, clz.MAJOR_VERSION)
        obj.offset_last_edit, = struct.unpack('<L', stream.read(4))
        log.debug('offsetLastEdit: {0} (0x{0:04X})'
                  .format(obj.offset_last_edit))
        # todo: check that this is before start pos / prev pos; 0x000 is end
        obj.offset_persist_directory, = struct.unpack('<L', stream.read(4))
        log.debug('offsetPersistDir: {0} (0x{0:04X})'
                  .format(obj.offset_persist_directory))
        obj.doc_persist_id_ref, = struct.unpack('<L', stream.read(4))
        check_value('docPersistIdRef', obj.doc_persist_id_ref, 1)
        obj.persist_id_seed, = struct.unpack('<L', stream.read(4))
        obj.last_view, = struct.unpack('<H', stream.read(2))
        stream.read(2)   # unused

        # without encryption the field keeps the None set in __init__
        if is_encrypted:
            obj.encrypt_session_persist_id_ref, = \
                struct.unpack('<L', stream.read(4))

        return obj
204 266  
205 267  
206 268 # === PptParser ===============================================================
... ... @@ -227,6 +289,8 @@ class PptParser(object):
227 289  
228 290 self.fast_fail = fast_fail
229 291  
  292 + self.current_user_atom = None
  293 +
230 294 # basic compatibility check: root directory structure is
231 295 # [['\x05DocumentSummaryInformation'],
232 296 # ['\x05SummaryInformation'],
... ... @@ -282,6 +346,10 @@ class PptParser(object):
282 346 https://msdn.microsoft.com/en-us/library/dd948895%28v=office.12%29.aspx
283 347 """
284 348  
  349 + if self.current_user_atom is not None:
  350 + log.warning('re-reading and overwriting '
  351 + 'previously read CurrentUserAtom')
  352 +
285 353 try:
286 354 self.current_user_atom = CurrentUserAtom.extract_from(self.ole)
287 355 except Exception:
... ... @@ -290,6 +358,25 @@ class PptParser(object):
290 358 else:
291 359 self._log_exception()
292 360  
def construct_persist_object_directory(self):
    """ part 2: read the UserEditAtom the CurrentUserAtom points to

    Parses the CurrentUserAtom first if that has not been done yet. Then
    opens stream 'PowerPoint Document', seeks to offset_to_current_edit and
    parses a UserEditAtom from there. So far the atom is only parsed (and
    thereby checked), it is neither stored nor returned.
    """

    if self.current_user_atom is None:
        self.parse_current_user()

    # parse_current_user may just log an exception instead of raising it;
    # in that case there still is no atom to take offset and token from
    if self.current_user_atom is None:
        log.warning('no CurrentUserAtom available, '
                    'cannot read UserEditAtom')
        return

    offset = self.current_user_atom.offset_to_current_edit
    is_encrypted = self.current_user_atom.is_encrypted()
    stream = None

    try:
        stream = self.ole.openstream('PowerPoint Document')
        stream.seek(offset)
        UserEditAtom.extract_from(stream, is_encrypted)
    finally:
        # stream is still None if openstream itself failed
        if stream is not None:
            log.debug('closing stream')
            stream.close()
  379 +
293 380 # === TESTING =================================================================
294 381  
295 382 def test():
... ... @@ -305,6 +392,7 @@ def test():
305 392 # parse
306 393 ppt = PptParser(test_file)
307 394 ppt.parse_current_user()
  395 + ppt.construct_persist_object_directory()
308 396  
309 397  
310 398 if __name__ == '__main__':
... ...