Commit f4fdba01d5f16130058e6e5b82d2078a593f3616
1 parent
42a369f9
oleobj: reuse xml_parser when looking for olefiles
Since we already parse the XML files when looking for external links in the relationships, we can reuse the parser to avoid iterating over the files twice.
Showing 1 changed file with 32 additions and 25 deletions
oletools/oleobj.py
| @@ -50,7 +50,7 @@ import os | @@ -50,7 +50,7 @@ import os | ||
| 50 | import re | 50 | import re |
| 51 | import sys | 51 | import sys |
| 52 | import io | 52 | import io |
| 53 | -from zipfile import is_zipfile, ZipFile | 53 | +from zipfile import is_zipfile |
| 54 | 54 | ||
| 55 | # IMPORTANT: it should be possible to run oletools directly as scripts | 55 | # IMPORTANT: it should be possible to run oletools directly as scripts |
| 56 | # in any directory without installing them with pip or setup.py. | 56 | # in any directory without installing them with pip or setup.py. |
| @@ -70,7 +70,7 @@ except ImportError: | @@ -70,7 +70,7 @@ except ImportError: | ||
| 70 | from oletools.thirdparty import xglob | 70 | from oletools.thirdparty import xglob |
| 71 | from oletools.ppt_record_parser import (is_ppt, PptFile, | 71 | from oletools.ppt_record_parser import (is_ppt, PptFile, |
| 72 | PptRecordExOleVbaActiveXAtom) | 72 | PptRecordExOleVbaActiveXAtom) |
| 73 | -from oletools.ooxml import XmlParser, ZipSubFile | 73 | +from oletools.ooxml import XmlParser |
| 74 | 74 | ||
| 75 | # ----------------------------------------------------------------------------- | 75 | # ----------------------------------------------------------------------------- |
| 76 | # CHANGELOG: | 76 | # CHANGELOG: |
| @@ -615,7 +615,7 @@ class FakeFile(io.RawIOBase): | @@ -615,7 +615,7 @@ class FakeFile(io.RawIOBase): | ||
| 615 | return self.pos | 615 | return self.pos |
| 616 | 616 | ||
| 617 | 617 | ||
| 618 | -def find_ole(filename, data): | 618 | +def find_ole(filename, data, xml_parser=None): |
| 619 | """ try to open somehow as zip/ole/rtf/... ; yield None if fail | 619 | """ try to open somehow as zip/ole/rtf/... ; yield None if fail |
| 620 | 620 | ||
| 621 | If data is given, filename is (mostly) ignored. | 621 | If data is given, filename is (mostly) ignored. |
| @@ -647,34 +647,40 @@ def find_ole(filename, data): | @@ -647,34 +647,40 @@ def find_ole(filename, data): | ||
| 647 | log.info('is ole file: ' + filename) | 647 | log.info('is ole file: ' + filename) |
| 648 | ole = olefile.OleFileIO(arg_for_ole) | 648 | ole = olefile.OleFileIO(arg_for_ole) |
| 649 | yield ole | 649 | yield ole |
| 650 | - elif is_zipfile(arg_for_zip): | 650 | + elif xml_parser is not None or is_zipfile(arg_for_zip): |
| 651 | + # keep compatibility with 3rd-party code that calls this function | ||
| 652 | + # directly without providing an XmlParser instance | ||
| 653 | + if xml_parser is None: | ||
| 654 | + xml_parser = XmlParser(arg_for_zip) | ||
| 655 | + # force iteration so XmlParser.iter_non_xml() returns data | ||
| 656 | + [x for x in xml_parser.iter_xml()] | ||
| 657 | + | ||
| 651 | log.info('is zip file: ' + filename) | 658 | log.info('is zip file: ' + filename) |
| 652 | - zipper = ZipFile(arg_for_zip, 'r') | ||
| 653 | - for subfile in zipper.namelist(): | ||
| 654 | - head = b'' | 659 | + # we looped through the XML files before, now we can |
| 660 | + # iterate the non-XML files looking for ole objects | ||
| 661 | + for subfile, _, file_handle in xml_parser.iter_non_xml(): | ||
| 655 | try: | 662 | try: |
| 656 | - with zipper.open(subfile) as file_handle: | ||
| 657 | - head = file_handle.read(len(olefile.MAGIC)) | 663 | + head = file_handle.read(len(olefile.MAGIC)) |
| 658 | except RuntimeError: | 664 | except RuntimeError: |
| 659 | log.error('zip is encrypted: ' + filename) | 665 | log.error('zip is encrypted: ' + filename) |
| 660 | yield None | 666 | yield None |
| 661 | continue | 667 | continue |
| 662 | 668 | ||
| 663 | if head == olefile.MAGIC: | 669 | if head == olefile.MAGIC: |
| 670 | + file_handle.seek(0) | ||
| 664 | log.info(' unzipping ole: ' + subfile) | 671 | log.info(' unzipping ole: ' + subfile) |
| 665 | - with ZipSubFile(zipper, subfile) as file_handle: | ||
| 666 | - try: | ||
| 667 | - ole = olefile.OleFileIO(file_handle) | ||
| 668 | - yield ole | ||
| 669 | - except IOError: | ||
| 670 | - log.warning('Error reading data from {0}/{1} or ' | ||
| 671 | - 'interpreting it as OLE object' | ||
| 672 | - .format(filename, subfile)) | ||
| 673 | - log.debug('', exc_info=True) | ||
| 674 | - finally: | ||
| 675 | - if ole is not None: | ||
| 676 | - ole.close() | ||
| 677 | - ole = None | 672 | + try: |
| 673 | + ole = olefile.OleFileIO(file_handle) | ||
| 674 | + yield ole | ||
| 675 | + except IOError: | ||
| 676 | + log.warning('Error reading data from {0}/{1} or ' | ||
| 677 | + 'interpreting it as OLE object' | ||
| 678 | + .format(filename, subfile)) | ||
| 679 | + log.debug('', exc_info=True) | ||
| 680 | + finally: | ||
| 681 | + if ole is not None: | ||
| 682 | + ole.close() | ||
| 683 | + ole = None | ||
| 678 | else: | 684 | else: |
| 679 | log.debug('unzip skip: ' + subfile) | 685 | log.debug('unzip skip: ' + subfile) |
| 680 | else: | 686 | else: |
| @@ -738,18 +744,19 @@ def process_file(filename, data, output_dir=None): | @@ -738,18 +744,19 @@ def process_file(filename, data, output_dir=None): | ||
| 738 | err_dumping = False | 744 | err_dumping = False |
| 739 | did_dump = False | 745 | did_dump = False |
| 740 | 746 | ||
| 747 | + xml_parser = None | ||
| 741 | if is_zipfile(filename): | 748 | if is_zipfile(filename): |
| 742 | - log.info('file is a OOXML file, looking for relationships with external files') | 749 | + log.info('file is a OOXML file, looking for relationships with external links') |
| 743 | xml_parser = XmlParser(filename) | 750 | xml_parser = XmlParser(filename) |
| 744 | for relationship, target in find_external_relationships(xml_parser): | 751 | for relationship, target in find_external_relationships(xml_parser): |
| 745 | did_dump = True | 752 | did_dump = True |
| 746 | - print("Found relationship '%s' with external file %s" % (relationship, target)) | 753 | + print("Found relationship '%s' with external link %s" % (relationship, target)) |
| 747 | 754 | ||
| 748 | 755 | ||
| 749 | # look for ole files inside file (e.g. unzip docx) | 756 | # look for ole files inside file (e.g. unzip docx) |
| 750 | # have to finish work on every ole stream inside iteration, since handles | 757 | # have to finish work on every ole stream inside iteration, since handles |
| 751 | # are closed in find_ole | 758 | # are closed in find_ole |
| 752 | - for ole in find_ole(filename, data): | 759 | + for ole in find_ole(filename, data, xml_parser): |
| 753 | if ole is None: # no ole file found | 760 | if ole is None: # no ole file found |
| 754 | continue | 761 | continue |
| 755 | 762 |