Commit acfb36b32ffa3b670fd7cda922cb570fa7ef9bcd
1 parent
38418c29
ppt_record_parser: find and decompress embedded ole streams
Showing
1 changed file
with
223 additions
and
22 deletions
oletools/ppt_record_parser.py
| ... | ... | @@ -29,8 +29,6 @@ Alternative to ppt_parser.py that works on records |
| 29 | 29 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 30 | 30 | # POSSIBILITY OF SUCH DAMAGE. |
| 31 | 31 | |
| 32 | -from __future__ import print_function | |
| 33 | - | |
| 34 | 32 | #------------------------------------------------------------------------------ |
| 35 | 33 | # CHANGELOG: |
| 36 | 34 | # 2017-11-30 v0.01 CH: - first version based on xls_parser |
| ... | ... | @@ -44,10 +42,11 @@ from __future__ import print_function |
| 44 | 42 | |
| 45 | 43 | |
| 46 | 44 | import sys |
| 47 | -from struct import unpack | |
| 45 | +from struct import unpack # unsigned: 1 Byte = B, 2 Byte = H, 4 Byte = L | |
| 48 | 46 | import logging |
| 49 | 47 | import record_base |
| 50 | 48 | import io |
| 49 | +import zlib | |
| 51 | 50 | |
| 52 | 51 | |
| 53 | 52 | class PptFile(record_base.OleRecordFile): |
| ... | ... | @@ -77,6 +76,10 @@ class PptStream(record_base.OleRecordStream): |
| 77 | 76 | """ |
| 78 | 77 | if rec_type == PptRecordCurrentUser.TYPE: |
| 79 | 78 | return PptRecordCurrentUser, True |
| 79 | + elif rec_type == PptRecordExOleObjAtom.TYPE: | |
| 80 | + return PptRecordExOleObjAtom, True | |
| 81 | + elif rec_type == PptRecordExOleVbaActiveXAtom.TYPE: | |
| 82 | + return PptRecordExOleVbaActiveXAtom, True | |
| 80 | 83 | |
| 81 | 84 | try: |
| 82 | 85 | record_name = RECORD_TYPES[rec_type] |
| ... | ... | @@ -86,6 +89,8 @@ class PptStream(record_base.OleRecordStream): |
| 86 | 89 | is_container = False |
| 87 | 90 | elif record_name.endswith('Blob'): |
| 88 | 91 | is_container = False |
| 92 | + elif record_name == 'CString': | |
| 93 | + is_container = False | |
| 89 | 94 | else: |
| 90 | 95 | logging.warning('Unexpected name for record type "{0}". typo?' |
| 91 | 96 | .format(record_name)) |
| ... | ... | @@ -106,7 +111,8 @@ class PptRecord(record_base.OleRecordBase): |
| 106 | 111 | INSTANCE = None |
| 107 | 112 | VERSION = None |
| 108 | 113 | |
| 109 | - def parse(self, more_data): | |
| 114 | + def finish_constructing(self, more_data): | |
| 115 | + """ check and save instance and version """ | |
| 110 | 116 | instance, version = more_data |
| 111 | 117 | if self.INSTANCE is not None and self.INSTANCE != instance: |
| 112 | 118 | raise ValueError('invalid instance {0} for {1}' |
| ... | ... | @@ -147,9 +153,13 @@ class PptRecord(record_base.OleRecordBase): |
| 147 | 153 | class PptContainerRecord(PptRecord): |
| 148 | 154 | """ A record that contains other records """ |
| 149 | 155 | |
| 150 | - def parse(self, more_data): | |
| 156 | + def finish_constructing(self, more_data): | |
| 157 | + """ parse records from self.data """ | |
| 151 | 158 | # set self.version and self.instance |
| 152 | - super(PptContainerRecord, self).parse(more_data) | |
| 159 | + super(PptContainerRecord, self).finish_constructing(more_data) | |
| 160 | + self.records = None | |
| 161 | + if not self.data: | |
| 162 | + return | |
| 153 | 163 | |
| 154 | 164 | logging.debug('parsing contents of container record {0}'.format(self)) |
| 155 | 165 | |
| ... | ... | @@ -162,6 +172,16 @@ class PptContainerRecord(PptRecord): |
| 162 | 172 | logging.debug('done parsing contents of container record {0}' |
| 163 | 173 | .format(self)) |
| 164 | 174 | |
| 175 | + def __str__(self): | |
| 176 | + text = super(PptContainerRecord, self).__str__() | |
| 177 | + if self.records is None: | |
| 178 | + return '{0}, unparsed{1}'.format(text[:-2], text[-2:]) | |
| 179 | + elif self.records: | |
| 180 | + return '{0}, contains {1} recs{2}' \ | |
| 181 | + .format(text[:-2], len(self.records), text[-2:]) | |
| 182 | + else: | |
| 183 | + return text | |
| 184 | + | |
| 165 | 185 | |
| 166 | 186 | class PptRecordCurrentUser(PptRecord): |
| 167 | 187 | """ The CurrentUserAtom record """ |
| ... | ... | @@ -169,14 +189,28 @@ class PptRecordCurrentUser(PptRecord): |
| 169 | 189 | VERSION = 0 |
| 170 | 190 | INSTANCE = 0 |
| 171 | 191 | |
| 172 | - def parse(self, more_data): | |
| 173 | - super(PptRecordCurrentUser, self).parse(more_data) | |
| 192 | + def finish_constructing(self, more_data): | |
| 193 | + """ read various attributes from data """ | |
| 194 | + super(PptRecordCurrentUser, self).finish_constructing(more_data) | |
| 174 | 195 | if self.size < 24: |
| 175 | 196 | raise ValueError('CurrentUser record is too small ({0})' |
| 176 | 197 | .format(self.size)) |
| 198 | + self.size2 = None | |
| 199 | + self.header_token = None | |
| 200 | + self.offset_to_current_edit = None | |
| 201 | + self.len_user_name = None | |
| 202 | + self.doc_file_version = None | |
| 203 | + self.major_version = None | |
| 204 | + self.minor_version = None | |
| 205 | + self.ansi_user_name = None | |
| 206 | + self.unicode_user_name = None | |
| 207 | + | |
| 208 | + if not self.data: | |
| 209 | + return | |
| 210 | + | |
| 177 | 211 | self.size2, self.header_token, self.offset_to_current_edit, \ |
| 178 | 212 | self.len_user_name, self.doc_file_version, self.major_version, \ |
| 179 | - self.minor_version, _ = unpack('<IIIHHBBH', self.data[0:20]) | |
| 213 | + self.minor_version, _ = unpack('<LLLHHBBH', self.data[0:20]) | |
| 180 | 214 | if self.size2 != 0x14: |
| 181 | 215 | raise ValueError('Wrong size2 ({0}) in CurrentUser record' |
| 182 | 216 | .format(self.size2)) |
| ... | ... | @@ -198,7 +232,7 @@ class PptRecordCurrentUser(PptRecord): |
| 198 | 232 | '({0} != {1})'.format(len(self.ansi_user_name), |
| 199 | 233 | self.len_user_name)) |
| 200 | 234 | offset = 20 + self.len_user_name |
| 201 | - self.release_version = unpack('<I', self.data[offset:offset+4])[0] | |
| 235 | + self.release_version = unpack('<L', self.data[offset:offset+4])[0] | |
| 202 | 236 | if self.release_version not in (8, 9): |
| 203 | 237 | raise ValueError('CurrentUser record has wrong release version {0}' |
| 204 | 238 | .format(self.release_version)) |
| ... | ... | @@ -212,6 +246,8 @@ class PptRecordCurrentUser(PptRecord): |
| 212 | 246 | .format(self.size - offset)) |
| 213 | 247 | |
| 214 | 248 | def is_document_encrypted(self): |
| 249 | + if self.header_token is None: | |
| 250 | + raise ValueError('unknown') | |
| 215 | 251 | return self.header_token == 0xF3D1C4DF |
| 216 | 252 | |
| 217 | 253 | def read_some_more(self, stream): |
| ... | ... | @@ -229,6 +265,142 @@ class PptRecordCurrentUser(PptRecord): |
| 229 | 265 | 'stream'.format(len(data))) |
| 230 | 266 | |
| 231 | 267 | |
| 268 | +class PptRecordExOleObjAtom(PptRecord): | |
| 269 | + """ Record that contains info about type of embedded object """ | |
| 270 | + | |
| 271 | + TYPE = 0x0fc3 | |
| 272 | + | |
| 273 | + OBJ_TYPES = dict([(0, 'embedded'), (1, 'link'), (2, 'ActiveX')]) | |
| 274 | + SUB_TYPES = dict([ | |
| 275 | + (0x00, 'default'), | |
| 276 | + (0x01, 'clipart'), | |
| 277 | + (0x02, 'word doc'), | |
| 278 | + (0x03, 'excel sheet'), | |
| 279 | + (0x04, 'MS graph'), | |
| 280 | + (0x05, 'MS org chart'), | |
| 281 | + (0x06, 'equation'), | |
| 282 | + (0x07, 'word art'), | |
| 283 | + (0x08, 'sound'), | |
| 284 | + (0x0c, 'MS project'), | |
| 285 | + (0x0d, 'note-it'), | |
| 286 | + (0x0e, 'excel chart'), | |
| 287 | + (0x0f, 'media'), | |
| 288 | + (0x10, 'WordPad doc'), | |
| 289 | + (0x11, 'visio drawing'), | |
| 290 | + (0x12, 'OpenDoc text'), | |
| 291 | + (0x13, 'OpenDoc calc'), | |
| 292 | + (0x14, 'OpenDoc present'), | |
| 293 | + ]) | |
| 294 | + | |
| 295 | + def finish_constructing(self, more_data): | |
| 296 | + """ parse some more data from this """ | |
| 297 | + self.draw_aspect = None | |
| 298 | + self.obj_type = None | |
| 299 | + self.ex_obj_id = None | |
| 300 | + self.sub_type = None | |
| 301 | + self.persist_id_ref = None | |
| 302 | + if self.size != 0x18: | |
| 303 | + raise ValueError('ExOleObjAtom has wrong size {0} != 0x18' | |
| 304 | + .format(self.size)) | |
| 305 | + if self.data: | |
| 306 | + self.draw_aspect, self.obj_type, self.ex_obj_id, self.sub_type, \ | |
| 307 | + self.persist_id_ref, _ = unpack('<LLLLLL', self.data) | |
| 308 | + if self.obj_type not in self.OBJ_TYPES: | |
| 309 | + logging.warning('Unknown "type" value in ExOleObjAtom: {0}' | |
| 310 | + .format(self.obj_type)) | |
| 311 | + if self.sub_type not in self.SUB_TYPES: | |
| 312 | + logging.warning('Unknown sub type value in ExOleObjAtom: {0}' | |
| 313 | + .format(self.sub_type)) | |
| 314 | + | |
| 315 | + def _type_str(self): | |
| 316 | + return 'ExOleObjAtom type {0}/{1}'.format( | |
| 317 | + self.OBJ_TYPES.get(self.obj_type, str(self.obj_type)), | |
| 318 | + self.SUB_TYPES.get(self.sub_type, str(self.sub_type))) | |
| 319 | + | |
| 320 | + | |
| 321 | +class PptRecordExOleVbaActiveXAtom(PptRecord): | |
| 322 | + """ record that contains an OLE object / VBA storage / ActiveX control | |
| 323 | + | |
| 324 | + Contains the actual data of the ole object / VBA storage / ActiveX control | |
| 325 | + in compressed or uncompressed form. | |
| 326 | + | |
| 327 | + Corresponding types in [MS-PPT]: | |
| 328 | + ExOleObjStg, ExOleObjStgUncompressedAtom, ExOleObjStgCompressedAtom, | |
| 329 | + VbaProjectStg, VbaProjectStgUncompressedAtom, VbaProjectStgCompressedAtom, | |
| 330 | + ExControlStg, ExControlStgUncompressedAtom, ExControlStgCompressedAtom. | |
| 331 | + | |
| 332 | + self.data is "An array of bytes that specifies a structured storage | |
| 333 | + (described in [MSDN-COM]) for the OLE object / ActiveX control / VBA | |
| 334 | + project ([MS-OVBA] section 2.2.1)." | |
| 335 | + If compressed, "The original bytes of the storage are compressed by the | |
| 336 | + algorithm specified in [RFC1950] and are decompressed by the algorithm | |
| 337 | + specified in [RFC1951]." (--> meaning zlib) | |
| 338 | + "Office Forms ActiveX controls are specified in [MS-OFORMS]." | |
| 339 | + | |
| 340 | + To tell whether this is an OLE object, an ActiveX control or a VBA storage, | |
| 341 | + find the corresponding PptRecordExOleObjAtom. | |
| 342 | + """ | |
| 343 | + | |
| 344 | + | |
| 345 | + TYPE = 0x1011 | |
| 346 | + | |
| 347 | + def is_compressed(self): | |
| 348 | + return self.instance == 1 | |
| 349 | + | |
| 350 | + def get_uncompressed_size(self): | |
| 351 | + """ Get size of data in uncompressed form | |
| 352 | + | |
| 353 | + For uncompressed data, this just returns self.size. For compressed data, | |
| 354 | + this reads and returns the decompressedSize field value from self.data. | |
| 355 | + Raises a ValueError if compressed and data is not available. | |
| 356 | + """ | |
| 357 | + if not self.is_compressed(): | |
| 358 | + return self.size | |
| 359 | + elif self.data is None: | |
| 360 | + raise ValueError('Data not read from record') | |
| 361 | + else: | |
| 362 | + return unpack('<L', self.data[:4])[0] | |
| 363 | + | |
| 364 | + def iter_uncompressed(self, chunk_size=4096): | |
| 365 | + """ iterate over data, decompress data if necessary | |
| 366 | + | |
| 367 | + chunk_size is used for input to decompression, so chunks yielded from | |
| 368 | + this may well be larger than that. Last chunk is most probably smaller. | |
| 369 | + """ | |
| 370 | + if self.data is None: | |
| 371 | + raise ValueError('data not read from record') | |
| 372 | + must_decomp = self.is_compressed() | |
| 373 | + start_idx = 0 | |
| 374 | + out_size = 0 | |
| 375 | + if must_decomp: | |
| 376 | + decompressor = zlib.decompressobj() | |
| 377 | + start_idx = 4 | |
| 378 | + while start_idx < self.size: | |
| 379 | + end_idx = min(self.size, start_idx+chunk_size) | |
| 380 | + if must_decomp: | |
| 381 | + result = decompressor.decompress(decompressor.unconsumed_tail + | |
| 382 | + self.data[start_idx:end_idx]) | |
| 383 | + else: | |
| 384 | + result = self.data[start_idx:end_idx] | |
| 385 | + yield result | |
| 386 | + logging.debug('decompressing from {0} to {1} resulted in {2} new' | |
| 387 | + .format(start_idx, end_idx, len(result))) | |
| 388 | + out_size += len(result) | |
| 389 | + start_idx = end_idx | |
| 390 | + if must_decomp: | |
| 391 | + result = decompressor.flush() | |
| 392 | + out_size += len(result) | |
| 393 | + yield result | |
| 394 | + if out_size != self.get_uncompressed_size(): | |
| 395 | + logging.warning('Decompressed data has wrong size {0} != {1}' | |
| 396 | + .format(out_size, self.get_uncompressed_size())) | |
| 397 | + | |
| 398 | + def __str__(self): | |
| 399 | + text = super(PptRecordExOleVbaActiveXAtom, self).__str__() | |
| 400 | + compr_text = 'compressed' if self.is_compressed() else 'uncompressed' | |
| 401 | + return '{0}, {1}{2}'.format(text[:-2], compr_text, text[-2:]) | |
| 402 | + | |
| 403 | + | |
| 232 | 404 | # types of relevant records (there are many more than listed here) |
| 233 | 405 | RECORD_TYPES = dict([ |
| 234 | 406 | # file structure types |
| ... | ... | @@ -248,7 +420,7 @@ RECORD_TYPES = dict([ |
| 248 | 420 | (0x03f8, 'MainMasterContainer'), |
| 249 | 421 | # external object types |
| 250 | 422 | (0x0409, 'ExObjListContainer'), |
| 251 | - (0x1011, 'ExOleVbaActiveXAtom'), # ExOleObj|VbaProject|ExControl]Stg[Unc|C]ompressedAtom | |
| 423 | + (0x1011, 'ExOleVbaActiveXAtom'), # --> use PptRecordExOleVbaActiveXAtom | |
| 252 | 424 | (0x1006, 'ExAviMovieContainer'), |
| 253 | 425 | (0x100e, 'ExCDAudioContainer'), |
| 254 | 426 | (0x0fee, 'ExControlContainer'), |
| ... | ... | @@ -260,11 +432,15 @@ RECORD_TYPES = dict([ |
| 260 | 432 | (0x100f, 'ExWAVAudioEmbeddedContainer'), |
| 261 | 433 | (0x1010, 'ExWAVAudioLinkContainer'), |
| 262 | 434 | (0x1004, 'ExMediaAtom'), |
| 435 | + (0x040a, 'ExObjListAtom'), | |
| 436 | + (0x0fcd, 'ExOleEmbedAtom'), | |
| 437 | + (0x0fc3, 'ExOleObjAtom'), # --> use PptRecordExOleObjAtom instead | |
| 263 | 438 | # other types |
| 264 | 439 | (0x0fc1, 'MetafileBlob'), |
| 265 | 440 | (0x0fb8, 'FontEmbedDataBlob'), |
| 266 | 441 | (0x07e7, 'SoundDataBlob'), |
| 267 | 442 | (0x138b, 'BinaryTagDataBlob'), |
| 443 | + (0x0fba, 'CString'), | |
| 268 | 444 | ]) |
| 269 | 445 | |
| 270 | 446 | # record types where version is not 0x0 or 0xf |
| ... | ... | @@ -302,16 +478,41 @@ INSTANCE_EXCEPTIONS = dict([ |
| 302 | 478 | ############################################################################### |
| 303 | 479 | |
| 304 | 480 | |
| 481 | +def print_records(record, print_fn, indent, do_print_record): | |
| 482 | + """ print additional info for record | |
| 483 | + | |
| 484 | + prints additional info for some types and subrecords recursively | |
| 485 | + """ | |
| 486 | + if do_print_record: | |
| 487 | + print_fn('{0}{1}'.format(' ' * indent, record)) | |
| 488 | + if isinstance(record, PptContainerRecord): | |
| 489 | + for subrec in record.records: | |
| 490 | + print_records(subrec, print_fn, indent+1, True) | |
| 491 | + elif isinstance(record, PptRecordCurrentUser): | |
| 492 | + logging.info('{4}--> crypt: {0}, offset {1}, user {2}/{3}' | |
| 493 | + .format(record.is_document_encrypted(), | |
| 494 | + record.offset_to_current_edit, | |
| 495 | + repr(record.ansi_user_name), | |
| 496 | + repr(record.unicode_user_name), | |
| 497 | + ' ' * indent)) | |
| 498 | + elif isinstance(record, PptRecordExOleObjAtom): | |
| 499 | + logging.info('{2}--> obj id {0}, persist id ref {1}' | |
| 500 | + .format(record.ex_obj_id, record.persist_id_ref, | |
| 501 | + ' ' * indent)) | |
| 502 | + elif isinstance(record, PptRecordExOleVbaActiveXAtom): | |
| 503 | + #with open('testdump', 'wb') as writer: | |
| 504 | + # for chunk in record.iter_uncompressed(): | |
| 505 | + # logging.info('{0}--> "{1}"'.format(' ' * indent, chunk)) | |
| 506 | + # writer.write(chunk) | |
| 507 | + chunk1 = next(record.iter_uncompressed()) | |
| 508 | + logging.info('{0}--> decompressed size {1}, data {2}...' | |
| 509 | + .format(' ' * indent, record.get_uncompressed_size(), | |
| 510 | + ', '.join('{0:02x}'.format(ord(c)) | |
| 511 | + for c in chunk1[:32]))) | |
| 512 | + | |
| 513 | + | |
| 305 | 514 | if __name__ == '__main__': |
| 306 | - def print_subrecords(record): | |
| 307 | - if isinstance(record, PptContainerRecord): | |
| 308 | - for subrec in record.records: | |
| 309 | - logging.info(' {0}'.format(subrec)) | |
| 310 | - elif isinstance(record, PptRecordCurrentUser): | |
| 311 | - logging.info(' crypt: {0}, offset {1}, user {2}/{3}' | |
| 312 | - .format(record.is_document_encrypted(), | |
| 313 | - record.offset_to_current_edit, | |
| 314 | - repr(record.ansi_user_name), | |
| 315 | - repr(record.unicode_user_name))) | |
| 515 | + def do_per_record(record): | |
| 516 | + print_records(record, logging.info, 2, False) | |
| 316 | 517 | sys.exit(record_base.test(sys.argv[1:], PptFile, |
| 317 | - do_per_record=print_subrecords)) | |
| 518 | + do_per_record=do_per_record)) | ... | ... |