Commit acfb36b32ffa3b670fd7cda922cb570fa7ef9bcd

Authored by Christian Herdtweck
1 parent 38418c29

ppt_record_parser: find and decompress embedded ole streams

Showing 1 changed file with 223 additions and 22 deletions
oletools/ppt_record_parser.py
@@ -29,8 +29,6 @@ Alternative to ppt_parser.py that works on records @@ -29,8 +29,6 @@ Alternative to ppt_parser.py that works on records
29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 # POSSIBILITY OF SUCH DAMAGE. 30 # POSSIBILITY OF SUCH DAMAGE.
31 31
32 -from __future__ import print_function  
33 -  
34 #------------------------------------------------------------------------------ 32 #------------------------------------------------------------------------------
35 # CHANGELOG: 33 # CHANGELOG:
36 # 2017-11-30 v0.01 CH: - first version based on xls_parser 34 # 2017-11-30 v0.01 CH: - first version based on xls_parser
@@ -44,10 +42,11 @@ from __future__ import print_function @@ -44,10 +42,11 @@ from __future__ import print_function
44 42
45 43
46 import sys 44 import sys
47 -from struct import unpack 45 +from struct import unpack # unsigned: 1 Byte = B, 2 Byte = H, 4 Byte = L
48 import logging 46 import logging
49 import record_base 47 import record_base
50 import io 48 import io
  49 +import zlib
51 50
52 51
53 class PptFile(record_base.OleRecordFile): 52 class PptFile(record_base.OleRecordFile):
@@ -77,6 +76,10 @@ class PptStream(record_base.OleRecordStream): @@ -77,6 +76,10 @@ class PptStream(record_base.OleRecordStream):
77 """ 76 """
78 if rec_type == PptRecordCurrentUser.TYPE: 77 if rec_type == PptRecordCurrentUser.TYPE:
79 return PptRecordCurrentUser, True 78 return PptRecordCurrentUser, True
  79 + elif rec_type == PptRecordExOleObjAtom.TYPE:
  80 + return PptRecordExOleObjAtom, True
  81 + elif rec_type == PptRecordExOleVbaActiveXAtom.TYPE:
  82 + return PptRecordExOleVbaActiveXAtom, True
80 83
81 try: 84 try:
82 record_name = RECORD_TYPES[rec_type] 85 record_name = RECORD_TYPES[rec_type]
@@ -86,6 +89,8 @@ class PptStream(record_base.OleRecordStream): @@ -86,6 +89,8 @@ class PptStream(record_base.OleRecordStream):
86 is_container = False 89 is_container = False
87 elif record_name.endswith('Blob'): 90 elif record_name.endswith('Blob'):
88 is_container = False 91 is_container = False
  92 + elif record_name == 'CString':
  93 + is_container = False
89 else: 94 else:
90 logging.warning('Unexpected name for record type "{0}". typo?' 95 logging.warning('Unexpected name for record type "{0}". typo?'
91 .format(record_name)) 96 .format(record_name))
@@ -106,7 +111,8 @@ class PptRecord(record_base.OleRecordBase): @@ -106,7 +111,8 @@ class PptRecord(record_base.OleRecordBase):
106 INSTANCE = None 111 INSTANCE = None
107 VERSION = None 112 VERSION = None
108 113
109 - def parse(self, more_data): 114 + def finish_constructing(self, more_data):
  115 + """ check and save instance and version """
110 instance, version = more_data 116 instance, version = more_data
111 if self.INSTANCE is not None and self.INSTANCE != instance: 117 if self.INSTANCE is not None and self.INSTANCE != instance:
112 raise ValueError('invalid instance {0} for {1}' 118 raise ValueError('invalid instance {0} for {1}'
@@ -147,9 +153,13 @@ class PptRecord(record_base.OleRecordBase): @@ -147,9 +153,13 @@ class PptRecord(record_base.OleRecordBase):
147 class PptContainerRecord(PptRecord): 153 class PptContainerRecord(PptRecord):
148 """ A record that contains other records """ 154 """ A record that contains other records """
149 155
150 - def parse(self, more_data): 156 + def finish_constructing(self, more_data):
  157 + """ parse records from self.data """
151 # set self.version and self.instance 158 # set self.version and self.instance
152 - super(PptContainerRecord, self).parse(more_data) 159 + super(PptContainerRecord, self).finish_constructing(more_data)
  160 + self.records = None
  161 + if not self.data:
  162 + return
153 163
154 logging.debug('parsing contents of container record {0}'.format(self)) 164 logging.debug('parsing contents of container record {0}'.format(self))
155 165
@@ -162,6 +172,16 @@ class PptContainerRecord(PptRecord): @@ -162,6 +172,16 @@ class PptContainerRecord(PptRecord):
162 logging.debug('done parsing contents of container record {0}' 172 logging.debug('done parsing contents of container record {0}'
163 .format(self)) 173 .format(self))
164 174
  175 + def __str__(self):
  176 + text = super(PptContainerRecord, self).__str__()
  177 + if self.records is None:
  178 + return '{0}, unparsed{1}'.format(text[:-2], text[-2:])
  179 + elif self.records:
  180 + return '{0}, contains {1} recs{2}' \
  181 + .format(text[:-2], len(self.records), text[-2:])
  182 + else:
  183 + return text
  184 +
165 185
166 class PptRecordCurrentUser(PptRecord): 186 class PptRecordCurrentUser(PptRecord):
167 """ The CurrentUserAtom record """ 187 """ The CurrentUserAtom record """
@@ -169,14 +189,28 @@ class PptRecordCurrentUser(PptRecord): @@ -169,14 +189,28 @@ class PptRecordCurrentUser(PptRecord):
169 VERSION = 0 189 VERSION = 0
170 INSTANCE = 0 190 INSTANCE = 0
171 191
172 - def parse(self, more_data):  
173 - super(PptRecordCurrentUser, self).parse(more_data) 192 + def finish_constructing(self, more_data):
  193 + """ read various attributes from data """
  194 + super(PptRecordCurrentUser, self).finish_constructing(more_data)
174 if self.size < 24: 195 if self.size < 24:
175 raise ValueError('CurrentUser record is too small ({0})' 196 raise ValueError('CurrentUser record is too small ({0})'
176 .format(self.size)) 197 .format(self.size))
  198 + self.size2 = None
  199 + self.header_token = None
  200 + self.offset_to_current_edit = None
  201 + self.len_user_name = None
  202 + self.doc_file_version = None
  203 + self.major_version = None
  204 + self.minor_version = None
  205 + self.ansi_user_name = None
  206 + self.unicode_user_name = None
  207 +
  208 + if not self.data:
  209 + return
  210 +
177 self.size2, self.header_token, self.offset_to_current_edit, \ 211 self.size2, self.header_token, self.offset_to_current_edit, \
178 self.len_user_name, self.doc_file_version, self.major_version, \ 212 self.len_user_name, self.doc_file_version, self.major_version, \
179 - self.minor_version, _ = unpack('<IIIHHBBH', self.data[0:20]) 213 + self.minor_version, _ = unpack('<LLLHHBBH', self.data[0:20])
180 if self.size2 != 0x14: 214 if self.size2 != 0x14:
181 raise ValueError('Wrong size2 ({0}) in CurrentUser record' 215 raise ValueError('Wrong size2 ({0}) in CurrentUser record'
182 .format(self.size2)) 216 .format(self.size2))
@@ -198,7 +232,7 @@ class PptRecordCurrentUser(PptRecord): @@ -198,7 +232,7 @@ class PptRecordCurrentUser(PptRecord):
198 '({0} != {1})'.format(len(self.ansi_user_name), 232 '({0} != {1})'.format(len(self.ansi_user_name),
199 self.len_user_name)) 233 self.len_user_name))
200 offset = 20 + self.len_user_name 234 offset = 20 + self.len_user_name
201 - self.release_version = unpack('<I', self.data[offset:offset+4])[0] 235 + self.release_version = unpack('<L', self.data[offset:offset+4])[0]
202 if self.release_version not in (8, 9): 236 if self.release_version not in (8, 9):
203 raise ValueError('CurrentUser record has wrong release version {0}' 237 raise ValueError('CurrentUser record has wrong release version {0}'
204 .format(self.release_version)) 238 .format(self.release_version))
@@ -212,6 +246,8 @@ class PptRecordCurrentUser(PptRecord): @@ -212,6 +246,8 @@ class PptRecordCurrentUser(PptRecord):
212 .format(self.size - offset)) 246 .format(self.size - offset))
213 247
214 def is_document_encrypted(self): 248 def is_document_encrypted(self):
  249 + if self.header_token is None:
  250 + raise ValueError('unknown')
215 return self.header_token == 0xF3D1C4DF 251 return self.header_token == 0xF3D1C4DF
216 252
217 def read_some_more(self, stream): 253 def read_some_more(self, stream):
@@ -229,6 +265,142 @@ class PptRecordCurrentUser(PptRecord): @@ -229,6 +265,142 @@ class PptRecordCurrentUser(PptRecord):
229 'stream'.format(len(data))) 265 'stream'.format(len(data)))
230 266
231 267
class PptRecordExOleObjAtom(PptRecord):
    """ Record that contains info about type of embedded object

    After construction the attributes draw_aspect, obj_type, ex_obj_id,
    sub_type and persist_id_ref hold the values parsed from the record data,
    or None if the record data was not available.
    """

    TYPE = 0x0fc3

    # known values of field obj_type
    OBJ_TYPES = dict([(0, 'embedded'), (1, 'link'), (2, 'ActiveX')])

    # known values of field sub_type
    SUB_TYPES = dict([
        (0x00, 'default'),
        (0x01, 'clipart'),
        (0x02, 'word doc'),
        (0x03, 'excel sheet'),
        (0x04, 'MS graph'),
        (0x05, 'MS org chart'),
        (0x06, 'equation'),
        (0x07, 'word art'),
        (0x08, 'sound'),
        (0x0c, 'MS project'),
        (0x0d, 'note-it'),
        (0x0e, 'excel chart'),
        (0x0f, 'media'),
        (0x10, 'WordPad doc'),
        (0x11, 'visio drawing'),
        (0x12, 'OpenDoc text'),
        (0x13, 'OpenDoc calc'),
        (0x14, 'OpenDoc present'),
    ])

    def finish_constructing(self, more_data):
        """ check instance/version and parse the atom's fields from self.data

        more_data is the (instance, version) pair that is handed on to the
        parent class. Raises a ValueError if the record size is not 0x18.
        Unknown values for obj_type or sub_type only cause a warning.
        """
        # set self.version and self.instance, like the other record classes;
        # without this call these attributes would never be set on this record
        super(PptRecordExOleObjAtom, self).finish_constructing(more_data)

        self.draw_aspect = None
        self.obj_type = None
        self.ex_obj_id = None
        self.sub_type = None
        self.persist_id_ref = None
        if self.size != 0x18:
            raise ValueError('ExOleObjAtom has wrong size {0} != 0x18'
                             .format(self.size))
        if not self.data:
            return     # data was not read, so there is nothing to parse/check

        # 6 unsigned little-endian 4-byte values, the last one is not used
        self.draw_aspect, self.obj_type, self.ex_obj_id, self.sub_type, \
            self.persist_id_ref, _ = unpack('<LLLLLL', self.data)
        if self.obj_type not in self.OBJ_TYPES:
            logging.warning('Unknown "type" value in ExOleObjAtom: {0}'
                            .format(self.obj_type))
        if self.sub_type not in self.SUB_TYPES:
            logging.warning('Unknown sub type value in ExOleObjAtom: {0}'
                            .format(self.sub_type))

    def _type_str(self):
        """ describe record type including object type and sub type

        Values without an entry in OBJ_TYPES / SUB_TYPES (including None for
        unparsed records) are shown through str().
        """
        return 'ExOleObjAtom type {0}/{1}'.format(
            self.OBJ_TYPES.get(self.obj_type, str(self.obj_type)),
            self.SUB_TYPES.get(self.sub_type, str(self.sub_type)))
  319 +
  320 +
class PptRecordExOleVbaActiveXAtom(PptRecord):
    """ record that contains an ole object / vba storage / active x control

    Contains the actual data of the ole object / VBA storage / ActiveX control
    in compressed or uncompressed form.

    Corresponding types in [MS-PPT]:
    ExOleObjStg, ExOleObjStgUncompressedAtom, ExOleObjStgCompressedAtom,
    VbaProjectStg, VbaProjectStgUncompressedAtom, VbaProjectStgCompressedAtom,
    ExControlStg, ExControlStgUncompressedAtom, ExControlStgCompressedAtom.

    self.data is "An array of bytes that specifies a structured storage
    (described in [MSDN-COM]) for the OLE object / ActiveX control / VBA
    project ([MS-OVBA] section 2.2.1)."
    If compressed, "The original bytes of the storage are compressed by the
    algorithm specified in [RFC1950] and are decompressed by the algorithm
    specified in [RFC1951]." (--> meaning zlib)
    "Office Forms ActiveX controls are specified in [MS-OFORMS]."

    To find out whether this is an OLE object, an ActiveX control or a VBA
    storage, the corresponding PptRecordExOleObjAtom has to be found.
    """

    TYPE = 0x1011

    def is_compressed(self):
        """ return True if record instance is 1, i.e. data is compressed """
        return self.instance == 1

    def get_uncompressed_size(self):
        """ Get size of data in uncompressed form

        For uncompressed data, this just returns self.size. For compressed
        data, this reads and returns the decompressedSize field value from
        the first 4 bytes of self.data.
        Raises a ValueError if compressed and data is not available or too
        short to contain the size field.
        """
        if not self.is_compressed():
            return self.size
        if self.data is None:
            raise ValueError('Data not read from record')
        if len(self.data) < 4:
            # unpack would raise a struct.error, but a ValueError is promised
            raise ValueError('Compressed data too short for size field '
                             '({0} < 4 bytes)'.format(len(self.data)))
        return unpack('<L', self.data[:4])[0]

    def iter_uncompressed(self, chunk_size=4096):
        """ iterate over data, decompress data if necessary

        chunk_size is used for input to decompression, so chunks yielded from
        this may well be larger than that. Last chunk is most probably smaller.

        Raises a ValueError if data was not read from record. After the last
        chunk, a warning is logged if the total size of yielded data differs
        from get_uncompressed_size().
        """
        if self.data is None:
            raise ValueError('data not read from record')
        must_decomp = self.is_compressed()
        start_idx = 0
        out_size = 0
        if must_decomp:
            decompressor = zlib.decompressobj()
            start_idx = 4    # skip the decompressedSize field
        while start_idx < self.size:
            end_idx = min(self.size, start_idx+chunk_size)
            if must_decomp:
                # decompress is called without max_length, so there never is
                # an unconsumed_tail that would have to be fed in again
                result = decompressor.decompress(self.data[start_idx:end_idx])
            else:
                result = self.data[start_idx:end_idx]
            yield result
            logging.debug('decompressing from {0} to {1} resulted in {2} new'
                          .format(start_idx, end_idx, len(result)))
            out_size += len(result)
            start_idx = end_idx
        if must_decomp:
            # get whatever output the decompressor still holds back
            result = decompressor.flush()
            out_size += len(result)
            yield result
        expected_size = self.get_uncompressed_size()
        if out_size != expected_size:
            logging.warning('Decompressed data has wrong size {0} != {1}'
                            .format(out_size, expected_size))

    def __str__(self):
        """ parent's text with "compressed"/"uncompressed" inserted

        The remark is inserted in front of the last two characters of the
        parent's text.
        """
        text = super(PptRecordExOleVbaActiveXAtom, self).__str__()
        compr_text = 'compressed' if self.is_compressed() else 'uncompressed'
        return '{0}, {1}{2}'.format(text[:-2], compr_text, text[-2:])
  402 +
  403 +
232 # types of relevant records (there are many more than listed here) 404 # types of relevant records (there are many more than listed here)
233 RECORD_TYPES = dict([ 405 RECORD_TYPES = dict([
234 # file structure types 406 # file structure types
@@ -248,7 +420,7 @@ RECORD_TYPES = dict([ @@ -248,7 +420,7 @@ RECORD_TYPES = dict([
248 (0x03f8, 'MainMasterContainer'), 420 (0x03f8, 'MainMasterContainer'),
249 # external object types 421 # external object types
250 (0x0409, 'ExObjListContainer'), 422 (0x0409, 'ExObjListContainer'),
251 - (0x1011, 'ExOleVbaActiveXAtom'), # ExOleObj|VbaProject|ExControl]Stg[Unc|C]ompressedAtom 423 + (0x1011, 'ExOleVbaActiveXAtom'), # --> use PptRecordExOleVbaActiveXAtom
252 (0x1006, 'ExAviMovieContainer'), 424 (0x1006, 'ExAviMovieContainer'),
253 (0x100e, 'ExCDAudioContainer'), 425 (0x100e, 'ExCDAudioContainer'),
254 (0x0fee, 'ExControlContainer'), 426 (0x0fee, 'ExControlContainer'),
@@ -260,11 +432,15 @@ RECORD_TYPES = dict([ @@ -260,11 +432,15 @@ RECORD_TYPES = dict([
260 (0x100f, 'ExWAVAudioEmbeddedContainer'), 432 (0x100f, 'ExWAVAudioEmbeddedContainer'),
261 (0x1010, 'ExWAVAudioLinkContainer'), 433 (0x1010, 'ExWAVAudioLinkContainer'),
262 (0x1004, 'ExMediaAtom'), 434 (0x1004, 'ExMediaAtom'),
  435 + (0x040a, 'ExObjListAtom'),
  436 + (0x0fcd, 'ExOleEmbedAtom'),
  437 + (0x0fc3, 'ExOleObjAtom'), # --> use PptRecordExOleObjAtom instead
263 # other types 438 # other types
264 (0x0fc1, 'MetafileBlob'), 439 (0x0fc1, 'MetafileBlob'),
265 (0x0fb8, 'FontEmbedDataBlob'), 440 (0x0fb8, 'FontEmbedDataBlob'),
266 (0x07e7, 'SoundDataBlob'), 441 (0x07e7, 'SoundDataBlob'),
267 (0x138b, 'BinaryTagDataBlob'), 442 (0x138b, 'BinaryTagDataBlob'),
  443 + (0x0fba, 'CString'),
268 ]) 444 ])
269 445
270 # record types where version is not 0x0 or 0xf 446 # record types where version is not 0x0 or 0xf
@@ -302,16 +478,41 @@ INSTANCE_EXCEPTIONS = dict([ @@ -302,16 +478,41 @@ INSTANCE_EXCEPTIONS = dict([
302 ############################################################################### 478 ###############################################################################
303 479
304 480
def print_records(record, print_fn, indent, do_print_record):
    """ print additional info for record

    prints additional info for some types and subrecords recursively

    record is the record to show, print_fn a function that is called with one
    line of text per (sub-)record. indent is the number of spaces put in
    front of each line; it grows by 1 per level of sub-records.
    If do_print_record is False, the record itself is not given to print_fn,
    only its sub-records / additional info are shown.
    The additional info on non-container records is always given to
    logging.info, regardless of print_fn.
    """
    prefix = ' ' * indent
    if do_print_record:
        print_fn('{0}{1}'.format(prefix, record))
    if isinstance(record, PptContainerRecord):
        # records is None if contents of the container were not parsed
        for subrec in record.records or ():
            print_records(subrec, print_fn, indent+1, True)
    elif isinstance(record, PptRecordCurrentUser):
        logging.info('{4}--> crypt: {0}, offset {1}, user {2}/{3}'
                     .format(record.is_document_encrypted(),
                             record.offset_to_current_edit,
                             repr(record.ansi_user_name),
                             repr(record.unicode_user_name),
                             prefix))
    elif isinstance(record, PptRecordExOleObjAtom):
        logging.info('{2}--> obj id {0}, persist id ref {1}'
                     .format(record.ex_obj_id, record.persist_id_ref,
                             prefix))
    elif isinstance(record, PptRecordExOleVbaActiveXAtom):
        # for debugging, the complete data can be dumped like this:
        # with open('testdump', 'wb') as writer:
        #     for chunk in record.iter_uncompressed():
        #         logging.info('{0}--> "{1}"'.format(' ' * indent, chunk))
        #         writer.write(chunk)

        # default value avoids a StopIteration if record has no data to yield
        chunk1 = next(record.iter_uncompressed(), b'')
        # bytearray yields ints in python 2 and 3 (ord fails on py3 bytes)
        hex_text = ', '.join('{0:02x}'.format(byte)
                             for byte in bytearray(chunk1[:32]))
        logging.info('{0}--> decompressed size {1}, data {2}...'
                     .format(prefix, record.get_uncompressed_size(),
                             hex_text))
  512 +
  513 +
if __name__ == '__main__':
    # run the generic test from record_base on all files given on the command
    # line; for every record found, log its sub-records / additional info
    # (but not the record itself) with an indent of 2
    exit_code = record_base.test(
        sys.argv[1:], PptFile,
        do_per_record=lambda record: print_records(record, logging.info,
                                                   2, False))
    sys.exit(exit_code)