Commit f4fdba01d5f16130058e6e5b82d2078a593f3616

Authored by Samir Aguiar
1 parent 42a369f9

oleobj: reuse xml_parser when looking for olefiles

Since we already parse the XML file earlier, when looking for
external links in the relationships, we can reuse that parser to
avoid iterating over the files twice.
Showing 1 changed file with 32 additions and 25 deletions
oletools/oleobj.py
... ... @@ -50,7 +50,7 @@ import os
50 50 import re
51 51 import sys
52 52 import io
53   -from zipfile import is_zipfile, ZipFile
  53 +from zipfile import is_zipfile
54 54  
55 55 # IMPORTANT: it should be possible to run oletools directly as scripts
56 56 # in any directory without installing them with pip or setup.py.
... ... @@ -70,7 +70,7 @@ except ImportError:
70 70 from oletools.thirdparty import xglob
71 71 from oletools.ppt_record_parser import (is_ppt, PptFile,
72 72 PptRecordExOleVbaActiveXAtom)
73   -from oletools.ooxml import XmlParser, ZipSubFile
  73 +from oletools.ooxml import XmlParser
74 74  
75 75 # -----------------------------------------------------------------------------
76 76 # CHANGELOG:
... ... @@ -615,7 +615,7 @@ class FakeFile(io.RawIOBase):
615 615 return self.pos
616 616  
617 617  
618   -def find_ole(filename, data):
  618 +def find_ole(filename, data, xml_parser=None):
619 619 """ try to open somehow as zip/ole/rtf/... ; yield None if fail
620 620  
621 621 If data is given, filename is (mostly) ignored.
... ... @@ -647,34 +647,40 @@ def find_ole(filename, data):
647 647 log.info('is ole file: ' + filename)
648 648 ole = olefile.OleFileIO(arg_for_ole)
649 649 yield ole
650   - elif is_zipfile(arg_for_zip):
  650 + elif xml_parser is not None or is_zipfile(arg_for_zip):
  651 + # keep compatibility with 3rd-party code that calls this function
  652 + # directly without providing an XmlParser instance
  653 + if xml_parser is None:
  654 + xml_parser = XmlParser(arg_for_zip)
  655 + # force iteration so XmlParser.iter_non_xml() returns data
  656 + [x for x in xml_parser.iter_xml()]
  657 +
651 658 log.info('is zip file: ' + filename)
652   - zipper = ZipFile(arg_for_zip, 'r')
653   - for subfile in zipper.namelist():
654   - head = b''
  659 + # we looped through the XML files before, now we can
  660 + # iterate the non-XML files looking for ole objects
  661 + for subfile, _, file_handle in xml_parser.iter_non_xml():
655 662 try:
656   - with zipper.open(subfile) as file_handle:
657   - head = file_handle.read(len(olefile.MAGIC))
  663 + head = file_handle.read(len(olefile.MAGIC))
658 664 except RuntimeError:
659 665 log.error('zip is encrypted: ' + filename)
660 666 yield None
661 667 continue
662 668  
663 669 if head == olefile.MAGIC:
  670 + file_handle.seek(0)
664 671 log.info(' unzipping ole: ' + subfile)
665   - with ZipSubFile(zipper, subfile) as file_handle:
666   - try:
667   - ole = olefile.OleFileIO(file_handle)
668   - yield ole
669   - except IOError:
670   - log.warning('Error reading data from {0}/{1} or '
671   - 'interpreting it as OLE object'
672   - .format(filename, subfile))
673   - log.debug('', exc_info=True)
674   - finally:
675   - if ole is not None:
676   - ole.close()
677   - ole = None
  672 + try:
  673 + ole = olefile.OleFileIO(file_handle)
  674 + yield ole
  675 + except IOError:
  676 + log.warning('Error reading data from {0}/{1} or '
  677 + 'interpreting it as OLE object'
  678 + .format(filename, subfile))
  679 + log.debug('', exc_info=True)
  680 + finally:
  681 + if ole is not None:
  682 + ole.close()
  683 + ole = None
678 684 else:
679 685 log.debug('unzip skip: ' + subfile)
680 686 else:
... ... @@ -738,18 +744,19 @@ def process_file(filename, data, output_dir=None):
738 744 err_dumping = False
739 745 did_dump = False
740 746  
  747 + xml_parser = None
741 748 if is_zipfile(filename):
742   - log.info('file is a OOXML file, looking for relationships with external files')
  749 + log.info('file is a OOXML file, looking for relationships with external links')
743 750 xml_parser = XmlParser(filename)
744 751 for relationship, target in find_external_relationships(xml_parser):
745 752 did_dump = True
746   - print("Found relationship '%s' with external file %s" % (relationship, target))
  753 + print("Found relationship '%s' with external link %s" % (relationship, target))
747 754  
748 755  
749 756 # look for ole files inside file (e.g. unzip docx)
750 757 # have to finish work on every ole stream inside iteration, since handles
751 758 # are closed in find_ole
752   - for ole in find_ole(filename, data):
  759 + for ole in find_ole(filename, data, xml_parser):
753 760 if ole is None: # no ole file found
754 761 continue
755 762  
... ...