Commit 2b3f8d3e2d8e25bf61d4a45e09f7fd5f1780b968

Authored by Christian Herdtweck
1 parent cc142ee3

oleobj: generalize "opening" of ole files to allow for other types

This way, oleobj can now handle Office 2007+ file types (docx, xlsx, pptx, and
their derivatives).

Since this adds another loop level to process_file, the innermost part of the
code (the actual dumping) was moved into its own function.
Showing 1 changed file with 120 additions and 25 deletions
oletools/oleobj.py
@@ -85,6 +85,7 @@ if not _parent_dir in sys.path: @@ -85,6 +85,7 @@ if not _parent_dir in sys.path:
85 85
86 from oletools.thirdparty.olefile import olefile 86 from oletools.thirdparty.olefile import olefile
87 from oletools.thirdparty.xglob import xglob 87 from oletools.thirdparty.xglob import xglob
  88 +from ppt_record_parser import is_ppt, PptFile, PptRecordExOleVbaActiveXAtom
88 89
89 # === LOGGING ================================================================= 90 # === LOGGING =================================================================
90 91
@@ -358,7 +359,92 @@ def sanitize_filename(filename, replacement='_', max_length=200): @@ -358,7 +359,92 @@ def sanitize_filename(filename, replacement='_', max_length=200):
358 return sane_fname 359 return sane_fname
359 360
360 361
def find_ole_in_ppt(filename):
    """ find ole streams in ppt

    Walks over every record of every stream of the ppt file and yields an
    opened OLE object for each `PptRecordExOleVbaActiveXAtom` record whose
    uncompressed data starts with the OLE magic bytes.

    Each yielded object is closed by this generator as soon as the caller
    asks for the next one (or closes the generator), so callers must not use
    an object after advancing the iteration.

    :param filename: name of the ppt file, handed to `PptFile`
    :returns: generator over opened OLE objects
    """
    magic_len = len(olefile.MAGIC)
    for stream in PptFile(filename).iter_streams():
        for record in stream.iter_records():
            if not isinstance(record, PptRecordExOleVbaActiveXAtom):
                continue

            ole = None
            try:
                # Give next() a default: a record without any data must not
                # let StopIteration escape from this generator (that would
                # silently end the iteration, or raise RuntimeError on
                # interpreters implementing PEP 479).
                data_start = next(record.iter_uncompressed(), b'')
                if data_start[:magic_len] != olefile.MAGIC:
                    continue   # could be an ActiveX control or VBA Storage

                # otherwise, this should be an OLE object
                ole = record.get_data_as_olefile()
                yield ole
            except IOError:
                # use the module logger, like the rest of this file
                log.warning('Error reading data from {0} stream or '
                            'interpreting it as OLE object'
                            .format(stream.name), exc_info=True)
            finally:
                if ole is not None:
                    ole.close()
  383 +
  384 +
def find_ole(filename, data):
    """ try to open somehow as zip/ole/rtf/... ; yield None if fail

    if data is given, filename is ignored (it is only used in log messages)

    Yields opened OLE objects found in the input:
      - raw `data`, if given, is interpreted as one complete OLE file
      - a ppt file yields whatever `find_ole_in_ppt` finds in it
      - any other OLE file yields itself
      - a zip file (e.g. docx, xlsx, pptx) yields every member that starts
        with the OLE magic bytes

    Whenever something cannot be opened, `None` is yielded instead so that the
    caller can record the failure and still continue.

    Every yielded OLE object is closed by this generator once the caller asks
    for the next item or closes the generator; do not use it afterwards.

    :param filename: name of input file
    :param data: complete contents of an OLE file or None
    :returns: generator over opened OLE objects and/or None
    """
    import io    # local import, only needed to buffer zip members

    try:
        if data is not None:
            # assume data is a complete OLE file
            log.info('working on raw OLE data (filename: {0})'
                     .format(filename))
            ole = olefile.OleFileIO(data)
            try:
                yield ole
            finally:
                ole.close()
        elif olefile.isOleFile(filename):
            if is_ppt(filename):
                log.info('is ppt file: ' + filename)
                # find_ole_in_ppt closes every object it yields by itself
                for ole in find_ole_in_ppt(filename):
                    yield ole
            else:
                log.info('is ole file: ' + filename)
                ole = olefile.OleFileIO(filename)
                try:
                    yield ole
                finally:
                    ole.close()
        elif is_zipfile(filename):
            log.info('is zip file: ' + filename)
            # context manager ensures the archive is closed again
            with ZipFile(filename, 'r') as zipper:
                for subfile in zipper.namelist():
                    try:
                        with zipper.open(subfile) as file_handle:
                            head = file_handle.read(len(olefile.MAGIC))
                    except RuntimeError:
                        log.error('zip is encrypted: ' + filename)
                        yield None
                        continue

                    if head != olefile.MAGIC:
                        log.debug('unzip skip: ' + subfile)
                        continue

                    log.info(' unzipping ole: ' + subfile)
                    # Handles returned by ZipFile.open() are not seekable on
                    # older python versions, but parsing OLE needs random
                    # access --> buffer the member in memory
                    buffered = io.BytesIO(zipper.read(subfile))
                    ole = olefile.OleFileIO(buffered)
                    try:
                        yield ole
                    finally:
                        ole.close()
        else:
            log.warning('open failed: ' + filename)
            yield None   # --> leads to non-0 return code
    except Exception:
        log.error('Caught exception opening {0}'.format(filename),
                  exc_info=True)
        yield None   # --> leads to non-0 return code but try next file first
  436 +
  437 +
361 def process_file(container, filename, data, output_dir=None): 438 def process_file(container, filename, data, output_dir=None):
  439 + """ find embedded objects in given file
  440 +
  441 + if data is given (from xglob for encrypted zip files), then filename is
  442 + not used for reading. If not (usual case), then data is read from filename
  443 + on demand.
  444 +
  445 + If output_dir is given and does not exist, it is created. If it is not
  446 + given, data is saved to same directory as the input file.
  447 + """
362 if output_dir: 448 if output_dir:
363 if not os.path.isdir(output_dir): 449 if not os.path.isdir(output_dir):
364 log.info('creating output directory %s' % output_dir) 450 log.info('creating output directory %s' % output_dir)
@@ -372,36 +458,45 @@ def process_file(container, filename, data, output_dir=None): @@ -372,36 +458,45 @@ def process_file(container, filename, data, output_dir=None):
372 fname_prefix = os.path.join(base_dir, sane_fname) 458 fname_prefix = os.path.join(base_dir, sane_fname)
373 459
374 # TODO: option to extract objects to files (false by default) 460 # TODO: option to extract objects to files (false by default)
375 - if data is None:  
376 - data = open(filename, 'rb').read()  
377 print ('-'*79) 461 print ('-'*79)
378 - print ('File: %r - %d bytes' % (filename, len(data)))  
379 - ole = olefile.OleFileIO(data) 462 + print ('File: %r' % filename)
380 index = 1 463 index = 1
381 - for stream in ole.listdir():  
382 - if stream[-1] == '\x01Ole10Native':  
383 - objdata = ole.openstream(stream).read()  
384 - stream_path = '/'.join(stream)  
385 - log.debug('Checking stream %r' % stream_path)  
386 - try:  
387 - print('extract file embedded in OLE object from stream %r:' % stream_path)  
388 - print ('Parsing OLE Package')  
389 - opkg = OleNativeStream(bindata=objdata)  
390 - print ('Filename = %r' % opkg.filename)  
391 - print ('Source path = %r' % opkg.src_path)  
392 - print ('Temp path = %r' % opkg.temp_path)  
393 - if opkg.filename:  
394 - fname = '%s_%s' % (fname_prefix,  
395 - sanitize_filename(opkg.filename))  
396 - else:  
397 - fname = '%s_object_%03d.noname' % (fname_prefix, index)  
398 - print ('saving to file %s' % fname)  
399 - open(fname, 'wb').write(opkg.data) 464 +
  465 + # look for ole files inside file (e.g. unzip docx)
  466 + flag_no_ole = False
  467 + for ole in find_ole(filename, data):
  468 + if ole is None: # no ole file found
  469 + flag_no_ole = True
  470 + continue
  471 +
  472 + for stream in ole.listdir():
  473 + if stream[-1] == '\x01Ole10Native':
  474 + process_native_stream(ole, stream, fname_prefix, index)
400 index += 1 475 index += 1
401 - except:  
402 - log.debug('*** Not an OLE 1.0 Object')  
403 476
404 477
def process_native_stream(ole, stream, fname_prefix, index):
    """ Dump data from OLE embedded object stream

    Reads the given stream, parses it as `OleNativeStream`, prints the file
    name and paths found in it and saves the embedded data to a file.

    The output file is named `<fname_prefix>_<sanitized embedded file name>`
    or, if the object has no file name,
    `<fname_prefix>_object_<index>.noname`.

    Any exception raised while parsing or saving is swallowed and only
    reported through a debug log message.

    :param ole: opened OLE object that contains the stream
    :param stream: path of the stream as list of path components
    :param fname_prefix: prefix (including directory) for the output file
    :param index: number of this object, used to name nameless objects
    """
    objdata = ole.openstream(stream).read()
    stream_path = '/'.join(stream)
    log.debug('Checking stream %r' % stream_path)
    try:
        print('extract file embedded in OLE object from stream %r:'
              % stream_path)
        print('Parsing OLE Package')
        opkg = OleNativeStream(bindata=objdata)
        print('Filename = %r' % opkg.filename)
        print('Source path = %r' % opkg.src_path)
        print('Temp path = %r' % opkg.temp_path)
        if opkg.filename:
            fname = '%s_%s' % (fname_prefix,
                               sanitize_filename(opkg.filename))
        else:
            fname = '%s_object_%03d.noname' % (fname_prefix, index)
        print('saving to file %s' % fname)
        # context manager closes (and flushes) the output file right away
        # instead of leaving that to the garbage collector
        with open(fname, 'wb') as writer:
            writer.write(opkg.data)
    except Exception:
        log.debug('*** Not an OLE 1.0 Object')
  499 +
405 500
406 #=== MAIN ================================================================= 501 #=== MAIN =================================================================
407 502