Commit 2b3f8d3e2d8e25bf61d4a45e09f7fd5f1780b968
1 parent
cc142ee3
oleobj: generalize "opening" of ole files to allow for other types
This way, oleobj can now handle Office 2007+ file types (docx, xlsx, pptx, and their derivatives). Since this adds another loop level to process_file, the innermost part of the code (the actual dumping) has been moved into its own function, process_native_stream.
Showing 1 changed file with 120 additions and 25 deletions
oletools/oleobj.py
| ... | ... | @@ -85,6 +85,7 @@ if not _parent_dir in sys.path: |
| 85 | 85 | |
| 86 | 86 | from oletools.thirdparty.olefile import olefile |
| 87 | 87 | from oletools.thirdparty.xglob import xglob |
| 88 | +from ppt_record_parser import is_ppt, PptFile, PptRecordExOleVbaActiveXAtom | |
| 88 | 89 | |
| 89 | 90 | # === LOGGING ================================================================= |
| 90 | 91 | |
| ... | ... | @@ -358,7 +359,92 @@ def sanitize_filename(filename, replacement='_', max_length=200): |
| 358 | 359 | return sane_fname |
| 359 | 360 | |
| 360 | 361 | |
| 362 | +def find_ole_in_ppt(filename): | |
| 363 | + """ find ole streams in ppt """ | |
| 364 | + for stream in PptFile(filename).iter_streams(): | |
| 365 | + for record in stream.iter_records(): | |
| 366 | + if isinstance(record, PptRecordExOleVbaActiveXAtom): | |
| 367 | + ole = None | |
| 368 | + try: | |
| 369 | + data_start = next(record.iter_uncompressed()) | |
| 370 | + if data_start[:len(olefile.MAGIC)] != olefile.MAGIC: | |
| 371 | + continue # could be an ActiveX control or VBA Storage | |
| 372 | + | |
| 373 | + # otherwise, this should be an OLE object | |
| 374 | + ole = record.get_data_as_olefile() | |
| 375 | + yield ole | |
| 376 | + except IOError: | |
| 377 | + logging.warning('Error reading data from {0} stream or ' | |
| 378 | + 'interpreting it as OLE object' | |
| 379 | + .format(stream.name), exc_info=True) | |
| 380 | + finally: | |
| 381 | + if ole is not None: | |
| 382 | + ole.close() | |
| 383 | + | |
| 384 | + | |
| 385 | +def find_ole(filename, data): | |
| 386 | + """ try to open somehow as zip/ole/rtf/... ; yield None if fail | |
| 387 | + | |
| 388 | + if data is given, filename is ignored | |
| 389 | + """ | |
| 390 | + | |
| 391 | + try: | |
| 392 | + if data is not None: | |
| 393 | + # assume data is a complete OLE file | |
| 394 | + logging.info('working on raw OLE data (filename: {0})' | |
| 395 | + .format(filename)) | |
| 396 | + yield olefile.OleFileIO(data) | |
| 397 | + elif olefile.isOleFile(filename): | |
| 398 | + if is_ppt(filename): | |
| 399 | + logging.info('is ppt file: ' + filename) | |
| 400 | + for ole in find_ole_in_ppt(filename): | |
| 401 | + yield ole | |
| 402 | + ole.close() | |
| 403 | + else: | |
| 404 | + logging.info('is ole file: ' + filename) | |
| 405 | + ole = olefile.OleFileIO(filename) | |
| 406 | + yield ole | |
| 407 | + ole.close() | |
| 408 | + elif is_zipfile(filename): | |
| 409 | + logging.info('is zip file: ' + filename) | |
| 410 | + zipper = ZipFile(filename, 'r') | |
| 411 | + for subfile in zipper.namelist(): | |
| 412 | + head = b'' | |
| 413 | + try: | |
| 414 | + with zipper.open(subfile) as file_handle: | |
| 415 | + head = file_handle.read(len(olefile.MAGIC)) | |
| 416 | + except RuntimeError: | |
| 417 | + logging.error('zip is encrypted: ' + filename) | |
| 418 | + yield None | |
| 419 | + continue | |
| 420 | + | |
| 421 | + if head == olefile.MAGIC: | |
| 422 | + logging.info(' unzipping ole: ' + subfile) | |
| 423 | + with zipper.open(subfile) as file_handle: | |
| 424 | + ole = olefile.OleFileIO(file_handle) | |
| 425 | + yield ole | |
| 426 | + ole.close() | |
| 427 | + else: | |
| 428 | + logging.debug('unzip skip: ' + subfile) | |
| 429 | + else: | |
| 430 | + logging.warning('open failed: ' + filename) | |
| 431 | + yield None # --> leads to non-0 return code | |
| 432 | + except Exception: | |
| 433 | + logging.error('Caught exception opening {0}'.format(filename), | |
| 434 | + exc_info=True) | |
| 435 | + yield None # --> leads to non-0 return code but try next file first | |
| 436 | + | |
| 437 | + | |
| 361 | 438 | def process_file(container, filename, data, output_dir=None): |
| 439 | + """ find embedded objects in given file | |
| 440 | + | |
| 441 | + if data is given (from xglob for encrypted zip files), then filename is | |
| 442 | + not used for reading. If not (usual case), then data is read from filename | |
| 443 | + on demand. | |
| 444 | + | |
| 445 | + If output_dir is given and does not exist, it is created. If it is not | |
| 446 | + given, data is saved to same directory as the input file. | |
| 447 | + """ | |
| 362 | 448 | if output_dir: |
| 363 | 449 | if not os.path.isdir(output_dir): |
| 364 | 450 | log.info('creating output directory %s' % output_dir) |
| ... | ... | @@ -372,36 +458,45 @@ def process_file(container, filename, data, output_dir=None): |
| 372 | 458 | fname_prefix = os.path.join(base_dir, sane_fname) |
| 373 | 459 | |
| 374 | 460 | # TODO: option to extract objects to files (false by default) |
| 375 | - if data is None: | |
| 376 | - data = open(filename, 'rb').read() | |
| 377 | 461 | print ('-'*79) |
| 378 | - print ('File: %r - %d bytes' % (filename, len(data))) | |
| 379 | - ole = olefile.OleFileIO(data) | |
| 462 | + print ('File: %r' % filename) | |
| 380 | 463 | index = 1 |
| 381 | - for stream in ole.listdir(): | |
| 382 | - if stream[-1] == '\x01Ole10Native': | |
| 383 | - objdata = ole.openstream(stream).read() | |
| 384 | - stream_path = '/'.join(stream) | |
| 385 | - log.debug('Checking stream %r' % stream_path) | |
| 386 | - try: | |
| 387 | - print('extract file embedded in OLE object from stream %r:' % stream_path) | |
| 388 | - print ('Parsing OLE Package') | |
| 389 | - opkg = OleNativeStream(bindata=objdata) | |
| 390 | - print ('Filename = %r' % opkg.filename) | |
| 391 | - print ('Source path = %r' % opkg.src_path) | |
| 392 | - print ('Temp path = %r' % opkg.temp_path) | |
| 393 | - if opkg.filename: | |
| 394 | - fname = '%s_%s' % (fname_prefix, | |
| 395 | - sanitize_filename(opkg.filename)) | |
| 396 | - else: | |
| 397 | - fname = '%s_object_%03d.noname' % (fname_prefix, index) | |
| 398 | - print ('saving to file %s' % fname) | |
| 399 | - open(fname, 'wb').write(opkg.data) | |
| 464 | + | |
| 465 | + # look for ole files inside file (e.g. unzip docx) | |
| 466 | + flag_no_ole = False | |
| 467 | + for ole in find_ole(filename, data): | |
| 468 | + if ole is None: # no ole file found | |
| 469 | + flag_no_ole = True | |
| 470 | + continue | |
| 471 | + | |
| 472 | + for stream in ole.listdir(): | |
| 473 | + if stream[-1] == '\x01Ole10Native': | |
| 474 | + process_native_stream(ole, stream, fname_prefix, index) | |
| 400 | 475 | index += 1 |
| 401 | - except: | |
| 402 | - log.debug('*** Not an OLE 1.0 Object') | |
| 403 | 476 | |
| 404 | 477 | |
| 478 | +def process_native_stream(ole, stream, fname_prefix, index): | |
| 479 | + """ Dump data from OLE embedded object stream """ | |
| 480 | + objdata = ole.openstream(stream).read() | |
| 481 | + stream_path = '/'.join(stream) | |
| 482 | + log.debug('Checking stream %r' % stream_path) | |
| 483 | + try: | |
| 484 | + print('extract file embedded in OLE object from stream %r:' % stream_path) | |
| 485 | + print ('Parsing OLE Package') | |
| 486 | + opkg = OleNativeStream(bindata=objdata) | |
| 487 | + print ('Filename = %r' % opkg.filename) | |
| 488 | + print ('Source path = %r' % opkg.src_path) | |
| 489 | + print ('Temp path = %r' % opkg.temp_path) | |
| 490 | + if opkg.filename: | |
| 491 | + fname = '%s_%s' % (fname_prefix, | |
| 492 | + sanitize_filename(opkg.filename)) | |
| 493 | + else: | |
| 494 | + fname = '%s_object_%03d.noname' % (fname_prefix, index) | |
| 495 | + print ('saving to file %s' % fname) | |
| 496 | + open(fname, 'wb').write(opkg.data) | |
| 497 | + except Exception: | |
| 498 | + log.debug('*** Not an OLE 1.0 Object') | |
| 499 | + | |
| 405 | 500 | |
| 406 | 501 | #=== MAIN ================================================================= |
| 407 | 502 | ... | ... |