Commit f4fdba01d5f16130058e6e5b82d2078a593f3616

Authored by Samir Aguiar
1 parent 42a369f9

oleobj: reuse xml_parser when looking for olefiles

Since we already parse the XML files earlier, when looking for
external links in the relationships, we can reuse that parser to
avoid iterating over the files twice.
Showing 1 changed file with 32 additions and 25 deletions
oletools/oleobj.py
@@ -50,7 +50,7 @@ import os @@ -50,7 +50,7 @@ import os
50 import re 50 import re
51 import sys 51 import sys
52 import io 52 import io
53 -from zipfile import is_zipfile, ZipFile 53 +from zipfile import is_zipfile
54 54
55 # IMPORTANT: it should be possible to run oletools directly as scripts 55 # IMPORTANT: it should be possible to run oletools directly as scripts
56 # in any directory without installing them with pip or setup.py. 56 # in any directory without installing them with pip or setup.py.
@@ -70,7 +70,7 @@ except ImportError: @@ -70,7 +70,7 @@ except ImportError:
70 from oletools.thirdparty import xglob 70 from oletools.thirdparty import xglob
71 from oletools.ppt_record_parser import (is_ppt, PptFile, 71 from oletools.ppt_record_parser import (is_ppt, PptFile,
72 PptRecordExOleVbaActiveXAtom) 72 PptRecordExOleVbaActiveXAtom)
73 -from oletools.ooxml import XmlParser, ZipSubFile 73 +from oletools.ooxml import XmlParser
74 74
75 # ----------------------------------------------------------------------------- 75 # -----------------------------------------------------------------------------
76 # CHANGELOG: 76 # CHANGELOG:
@@ -615,7 +615,7 @@ class FakeFile(io.RawIOBase): @@ -615,7 +615,7 @@ class FakeFile(io.RawIOBase):
615 return self.pos 615 return self.pos
616 616
617 617
618 -def find_ole(filename, data): 618 +def find_ole(filename, data, xml_parser=None):
619 """ try to open somehow as zip/ole/rtf/... ; yield None if fail 619 """ try to open somehow as zip/ole/rtf/... ; yield None if fail
620 620
621 If data is given, filename is (mostly) ignored. 621 If data is given, filename is (mostly) ignored.
@@ -647,34 +647,40 @@ def find_ole(filename, data): @@ -647,34 +647,40 @@ def find_ole(filename, data):
647 log.info('is ole file: ' + filename) 647 log.info('is ole file: ' + filename)
648 ole = olefile.OleFileIO(arg_for_ole) 648 ole = olefile.OleFileIO(arg_for_ole)
649 yield ole 649 yield ole
650 - elif is_zipfile(arg_for_zip): 650 + elif xml_parser is not None or is_zipfile(arg_for_zip):
  651 + # keep compatibility with 3rd-party code that calls this function
  652 + # directly without providing an XmlParser instance
  653 + if xml_parser is None:
  654 + xml_parser = XmlParser(arg_for_zip)
  655 + # force iteration so XmlParser.iter_non_xml() returns data
  656 + [x for x in xml_parser.iter_xml()]
  657 +
651 log.info('is zip file: ' + filename) 658 log.info('is zip file: ' + filename)
652 - zipper = ZipFile(arg_for_zip, 'r')  
653 - for subfile in zipper.namelist():  
654 - head = b'' 659 + # we looped through the XML files before, now we can
  660 + # iterate the non-XML files looking for ole objects
  661 + for subfile, _, file_handle in xml_parser.iter_non_xml():
655 try: 662 try:
656 - with zipper.open(subfile) as file_handle:  
657 - head = file_handle.read(len(olefile.MAGIC)) 663 + head = file_handle.read(len(olefile.MAGIC))
658 except RuntimeError: 664 except RuntimeError:
659 log.error('zip is encrypted: ' + filename) 665 log.error('zip is encrypted: ' + filename)
660 yield None 666 yield None
661 continue 667 continue
662 668
663 if head == olefile.MAGIC: 669 if head == olefile.MAGIC:
  670 + file_handle.seek(0)
664 log.info(' unzipping ole: ' + subfile) 671 log.info(' unzipping ole: ' + subfile)
665 - with ZipSubFile(zipper, subfile) as file_handle:  
666 - try:  
667 - ole = olefile.OleFileIO(file_handle)  
668 - yield ole  
669 - except IOError:  
670 - log.warning('Error reading data from {0}/{1} or '  
671 - 'interpreting it as OLE object'  
672 - .format(filename, subfile))  
673 - log.debug('', exc_info=True)  
674 - finally:  
675 - if ole is not None:  
676 - ole.close()  
677 - ole = None 672 + try:
  673 + ole = olefile.OleFileIO(file_handle)
  674 + yield ole
  675 + except IOError:
  676 + log.warning('Error reading data from {0}/{1} or '
  677 + 'interpreting it as OLE object'
  678 + .format(filename, subfile))
  679 + log.debug('', exc_info=True)
  680 + finally:
  681 + if ole is not None:
  682 + ole.close()
  683 + ole = None
678 else: 684 else:
679 log.debug('unzip skip: ' + subfile) 685 log.debug('unzip skip: ' + subfile)
680 else: 686 else:
@@ -738,18 +744,19 @@ def process_file(filename, data, output_dir=None): @@ -738,18 +744,19 @@ def process_file(filename, data, output_dir=None):
738 err_dumping = False 744 err_dumping = False
739 did_dump = False 745 did_dump = False
740 746
  747 + xml_parser = None
741 if is_zipfile(filename): 748 if is_zipfile(filename):
742 - log.info('file is a OOXML file, looking for relationships with external files') 749 + log.info('file is a OOXML file, looking for relationships with external links')
743 xml_parser = XmlParser(filename) 750 xml_parser = XmlParser(filename)
744 for relationship, target in find_external_relationships(xml_parser): 751 for relationship, target in find_external_relationships(xml_parser):
745 did_dump = True 752 did_dump = True
746 - print("Found relationship '%s' with external file %s" % (relationship, target)) 753 + print("Found relationship '%s' with external link %s" % (relationship, target))
747 754
748 755
749 # look for ole files inside file (e.g. unzip docx) 756 # look for ole files inside file (e.g. unzip docx)
750 # have to finish work on every ole stream inside iteration, since handles 757 # have to finish work on every ole stream inside iteration, since handles
751 # are closed in find_ole 758 # are closed in find_ole
752 - for ole in find_ole(filename, data): 759 + for ole in find_ole(filename, data, xml_parser):
753 if ole is None: # no ole file found 760 if ole is None: # no ole file found
754 continue 761 continue
755 762