Commit 2b3f8d3e2d8e25bf61d4a45e09f7fd5f1780b968
1 parent
cc142ee3
oleobj: generalize "opening" of ole files to allow for other types
This way, oleobj can now handle Office 2007+ file types (docx, xlsx, pptx, and their derivatives). Since this adds another loop level to process_file, the innermost part of the code (the actual dumping) was moved into its own function.
Showing 1 changed file with 120 additions and 25 deletions
oletools/oleobj.py
| @@ -85,6 +85,7 @@ if not _parent_dir in sys.path: | @@ -85,6 +85,7 @@ if not _parent_dir in sys.path: | ||
| 85 | 85 | ||
| 86 | from oletools.thirdparty.olefile import olefile | 86 | from oletools.thirdparty.olefile import olefile |
| 87 | from oletools.thirdparty.xglob import xglob | 87 | from oletools.thirdparty.xglob import xglob |
| 88 | +from ppt_record_parser import is_ppt, PptFile, PptRecordExOleVbaActiveXAtom | ||
| 88 | 89 | ||
| 89 | # === LOGGING ================================================================= | 90 | # === LOGGING ================================================================= |
| 90 | 91 | ||
| @@ -358,7 +359,92 @@ def sanitize_filename(filename, replacement='_', max_length=200): | @@ -358,7 +359,92 @@ def sanitize_filename(filename, replacement='_', max_length=200): | ||
| 358 | return sane_fname | 359 | return sane_fname |
| 359 | 360 | ||
| 360 | 361 | ||
| 362 | +def find_ole_in_ppt(filename): | ||
| 363 | + """ find ole streams in ppt """ | ||
| 364 | + for stream in PptFile(filename).iter_streams(): | ||
| 365 | + for record in stream.iter_records(): | ||
| 366 | + if isinstance(record, PptRecordExOleVbaActiveXAtom): | ||
| 367 | + ole = None | ||
| 368 | + try: | ||
| 369 | + data_start = next(record.iter_uncompressed()) | ||
| 370 | + if data_start[:len(olefile.MAGIC)] != olefile.MAGIC: | ||
| 371 | + continue # could be an ActiveX control or VBA Storage | ||
| 372 | + | ||
| 373 | + # otherwise, this should be an OLE object | ||
| 374 | + ole = record.get_data_as_olefile() | ||
| 375 | + yield ole | ||
| 376 | + except IOError: | ||
| 377 | + logging.warning('Error reading data from {0} stream or ' | ||
| 378 | + 'interpreting it as OLE object' | ||
| 379 | + .format(stream.name), exc_info=True) | ||
| 380 | + finally: | ||
| 381 | + if ole is not None: | ||
| 382 | + ole.close() | ||
| 383 | + | ||
| 384 | + | ||
| 385 | +def find_ole(filename, data): | ||
| 386 | + """ try to open somehow as zip/ole/rtf/... ; yield None if fail | ||
| 387 | + | ||
| 388 | + if data is given, filename is ignored | ||
| 389 | + """ | ||
| 390 | + | ||
| 391 | + try: | ||
| 392 | + if data is not None: | ||
| 393 | + # assume data is a complete OLE file | ||
| 394 | + logging.info('working on raw OLE data (filename: {0})' | ||
| 395 | + .format(filename)) | ||
| 396 | + yield olefile.OleFileIO(data) | ||
| 397 | + elif olefile.isOleFile(filename): | ||
| 398 | + if is_ppt(filename): | ||
| 399 | + logging.info('is ppt file: ' + filename) | ||
| 400 | + for ole in find_ole_in_ppt(filename): | ||
| 401 | + yield ole | ||
| 402 | + ole.close() | ||
| 403 | + else: | ||
| 404 | + logging.info('is ole file: ' + filename) | ||
| 405 | + ole = olefile.OleFileIO(filename) | ||
| 406 | + yield ole | ||
| 407 | + ole.close() | ||
| 408 | + elif is_zipfile(filename): | ||
| 409 | + logging.info('is zip file: ' + filename) | ||
| 410 | + zipper = ZipFile(filename, 'r') | ||
| 411 | + for subfile in zipper.namelist(): | ||
| 412 | + head = b'' | ||
| 413 | + try: | ||
| 414 | + with zipper.open(subfile) as file_handle: | ||
| 415 | + head = file_handle.read(len(olefile.MAGIC)) | ||
| 416 | + except RuntimeError: | ||
| 417 | + logging.error('zip is encrypted: ' + filename) | ||
| 418 | + yield None | ||
| 419 | + continue | ||
| 420 | + | ||
| 421 | + if head == olefile.MAGIC: | ||
| 422 | + logging.info(' unzipping ole: ' + subfile) | ||
| 423 | + with zipper.open(subfile) as file_handle: | ||
| 424 | + ole = olefile.OleFileIO(file_handle) | ||
| 425 | + yield ole | ||
| 426 | + ole.close() | ||
| 427 | + else: | ||
| 428 | + logging.debug('unzip skip: ' + subfile) | ||
| 429 | + else: | ||
| 430 | + logging.warning('open failed: ' + filename) | ||
| 431 | + yield None # --> leads to non-0 return code | ||
| 432 | + except Exception: | ||
| 433 | + logging.error('Caught exception opening {0}'.format(filename), | ||
| 434 | + exc_info=True) | ||
| 435 | + yield None # --> leads to non-0 return code but try next file first | ||
| 436 | + | ||
| 437 | + | ||
| 361 | def process_file(container, filename, data, output_dir=None): | 438 | def process_file(container, filename, data, output_dir=None): |
| 439 | + """ find embedded objects in given file | ||
| 440 | + | ||
| 441 | + if data is given (from xglob for encrypted zip files), then filename is | ||
| 442 | + not used for reading. If not (usual case), then data is read from filename | ||
| 443 | + on demand. | ||
| 444 | + | ||
| 445 | + If output_dir is given and does not exist, it is created. If it is not | ||
| 446 | + given, data is saved to same directory as the input file. | ||
| 447 | + """ | ||
| 362 | if output_dir: | 448 | if output_dir: |
| 363 | if not os.path.isdir(output_dir): | 449 | if not os.path.isdir(output_dir): |
| 364 | log.info('creating output directory %s' % output_dir) | 450 | log.info('creating output directory %s' % output_dir) |
| @@ -372,36 +458,45 @@ def process_file(container, filename, data, output_dir=None): | @@ -372,36 +458,45 @@ def process_file(container, filename, data, output_dir=None): | ||
| 372 | fname_prefix = os.path.join(base_dir, sane_fname) | 458 | fname_prefix = os.path.join(base_dir, sane_fname) |
| 373 | 459 | ||
| 374 | # TODO: option to extract objects to files (false by default) | 460 | # TODO: option to extract objects to files (false by default) |
| 375 | - if data is None: | ||
| 376 | - data = open(filename, 'rb').read() | ||
| 377 | print ('-'*79) | 461 | print ('-'*79) |
| 378 | - print ('File: %r - %d bytes' % (filename, len(data))) | ||
| 379 | - ole = olefile.OleFileIO(data) | 462 | + print ('File: %r' % filename) |
| 380 | index = 1 | 463 | index = 1 |
| 381 | - for stream in ole.listdir(): | ||
| 382 | - if stream[-1] == '\x01Ole10Native': | ||
| 383 | - objdata = ole.openstream(stream).read() | ||
| 384 | - stream_path = '/'.join(stream) | ||
| 385 | - log.debug('Checking stream %r' % stream_path) | ||
| 386 | - try: | ||
| 387 | - print('extract file embedded in OLE object from stream %r:' % stream_path) | ||
| 388 | - print ('Parsing OLE Package') | ||
| 389 | - opkg = OleNativeStream(bindata=objdata) | ||
| 390 | - print ('Filename = %r' % opkg.filename) | ||
| 391 | - print ('Source path = %r' % opkg.src_path) | ||
| 392 | - print ('Temp path = %r' % opkg.temp_path) | ||
| 393 | - if opkg.filename: | ||
| 394 | - fname = '%s_%s' % (fname_prefix, | ||
| 395 | - sanitize_filename(opkg.filename)) | ||
| 396 | - else: | ||
| 397 | - fname = '%s_object_%03d.noname' % (fname_prefix, index) | ||
| 398 | - print ('saving to file %s' % fname) | ||
| 399 | - open(fname, 'wb').write(opkg.data) | 464 | + |
| 465 | + # look for ole files inside file (e.g. unzip docx) | ||
| 466 | + flag_no_ole = False | ||
| 467 | + for ole in find_ole(filename, data): | ||
| 468 | + if ole is None: # no ole file found | ||
| 469 | + flag_no_ole = True | ||
| 470 | + continue | ||
| 471 | + | ||
| 472 | + for stream in ole.listdir(): | ||
| 473 | + if stream[-1] == '\x01Ole10Native': | ||
| 474 | + process_native_stream(ole, stream, fname_prefix, index) | ||
| 400 | index += 1 | 475 | index += 1 |
| 401 | - except: | ||
| 402 | - log.debug('*** Not an OLE 1.0 Object') | ||
| 403 | 476 | ||
| 404 | 477 | ||
| 478 | +def process_native_stream(ole, stream, fname_prefix, index): | ||
| 479 | + """ Dump data from OLE embedded object stream """ | ||
| 480 | + objdata = ole.openstream(stream).read() | ||
| 481 | + stream_path = '/'.join(stream) | ||
| 482 | + log.debug('Checking stream %r' % stream_path) | ||
| 483 | + try: | ||
| 484 | + print('extract file embedded in OLE object from stream %r:' % stream_path) | ||
| 485 | + print ('Parsing OLE Package') | ||
| 486 | + opkg = OleNativeStream(bindata=objdata) | ||
| 487 | + print ('Filename = %r' % opkg.filename) | ||
| 488 | + print ('Source path = %r' % opkg.src_path) | ||
| 489 | + print ('Temp path = %r' % opkg.temp_path) | ||
| 490 | + if opkg.filename: | ||
| 491 | + fname = '%s_%s' % (fname_prefix, | ||
| 492 | + sanitize_filename(opkg.filename)) | ||
| 493 | + else: | ||
| 494 | + fname = '%s_object_%03d.noname' % (fname_prefix, index) | ||
| 495 | + print ('saving to file %s' % fname) | ||
| 496 | + open(fname, 'wb').write(opkg.data) | ||
| 497 | + except Exception: | ||
| 498 | + log.debug('*** Not an OLE 1.0 Object') | ||
| 499 | + | ||
| 405 | 500 | ||
| 406 | #=== MAIN ================================================================= | 501 | #=== MAIN ================================================================= |
| 407 | 502 |