ppt records: compensate wrong size in CurrentUserAtom

This compensates for an inconsistency that is probably just an error in some ppt versions. The size attribute of the CurrentUserAtom "forgets" about the optional unicode user name, which then shows up as unexplained data behind the record (where nothing should be).

ppt records: compensate wrong size in CurrentUserAtom
This compensates for an inconsistency that is probably just an error in some ppt versions. The size attribute of the CurrentUserAtom "forgets" about the optional unicode user name, which then shows up as unexplained data behind the record (where nothing should be).
Christian Herdtweck
1 parent 470d0806
Showing 2 changed files with 35 additions and 1 deletions
oletools/ppt_record_parser.py
oletools/record_base.py
@@ -214,6 +214,20 @@ class PptRecordCurrentUser(PptRecord):
     def is_document_encrypted(self):
         return self.header_token == 0xF3D1C4DF
+    def read_some_more(self, stream):
+        """ Pick up a unicode user name that trails the current user atom
+
+        In some ppt files the record size does not account for the optional
+        unicode user name, so the name ends up in the stream *behind* the
+        record data. Can safely read on here since no data should come after
+        this record.
+
+        If the user name is still missing and the trailing data has exactly
+        2 bytes per character of the user name, it is decoded and stored in
+        `self.unicode_user_name`. Any other non-empty trailing data is only
+        reported in a warning.
+
+        :param stream: file-like object to read the trailing bytes from
+        :returns: None
+        """
+        more_data = stream.read(3*self.len_user_name)   # limit data to read
+        if self.unicode_user_name is None and \
+                len(more_data) == 2*self.len_user_name:
+            # NOTE(review): 'utf-16' without a BOM presumably decodes in the
+            # byte order of the machine running this -- consider 'utf-16-le'
+            self.unicode_user_name = more_data.decode('utf-16')
+            logging.debug('found unicode user name BEHIND current user atom')
+        elif more_data:
+            # only complain if there actually is something behind the record
+            logging.warning('Unexplained data of size {0} in "Current User" '
+                            'stream'.format(len(more_data)))
+
 # types of relevant records (there are much more than listed here)
 RECORD_TYPES = dict([
@@ -203,7 +203,12 @@ class OleRecordStream(object):
             else:
                 self.stream.seek(rec_size, SEEK_CUR)
                 data = None
-            yield rec_clz(rec_type, rec_size, other, pos, data)
+            rec_object = rec_clz(rec_type, rec_size, other, pos, data)
+
+            # "We are microsoft, we do not have to adhere to our specifications"
+            rec_object.read_some_more(self.stream)
+            yield rec_object
+
     def __str__(self):
         return '[{0} {1} (type {2}, size {3})' \
@@ -265,6 +270,21 @@ class OleRecordBase(object):
         """
         pass
+    def read_some_more(self, stream):
+        """ Hook for consuming data that trails this record in the stream
+
+        Exists because for the CurrentUserAtom in the "Current User" stream
+        of ppt files, the last attribute (user name in unicode) was found
+        *behind* the record data instead of inside it.
+
+        Subclasses may override this (like PptRecordCurrentUser, where no
+        record should follow). Only read on if you are certain that this
+        will not mess up the following records!
+
+        This base implementation ignores `stream` and does nothing.
+        """
+        pass
+
     def _type_str(self):
         """ helper for __str__, base implementation """
         return '{0} type {1}'.format(self.__class__.__name__, self.type)