Commit f4fdba01d5f16130058e6e5b82d2078a593f3616
1 parent
42a369f9
oleobj: reuse xml_parser when looking for olefiles
Since the XML files are already parsed when looking for external links in the relationships, we can reuse that parser to avoid iterating over the files twice.
Showing 1 changed file with 32 additions and 25 deletions
oletools/oleobj.py
| ... | ... | @@ -50,7 +50,7 @@ import os |
| 50 | 50 | import re |
| 51 | 51 | import sys |
| 52 | 52 | import io |
| 53 | -from zipfile import is_zipfile, ZipFile | |
| 53 | +from zipfile import is_zipfile | |
| 54 | 54 | |
| 55 | 55 | # IMPORTANT: it should be possible to run oletools directly as scripts |
| 56 | 56 | # in any directory without installing them with pip or setup.py. |
| ... | ... | @@ -70,7 +70,7 @@ except ImportError: |
| 70 | 70 | from oletools.thirdparty import xglob |
| 71 | 71 | from oletools.ppt_record_parser import (is_ppt, PptFile, |
| 72 | 72 | PptRecordExOleVbaActiveXAtom) |
| 73 | -from oletools.ooxml import XmlParser, ZipSubFile | |
| 73 | +from oletools.ooxml import XmlParser | |
| 74 | 74 | |
| 75 | 75 | # ----------------------------------------------------------------------------- |
| 76 | 76 | # CHANGELOG: |
| ... | ... | @@ -615,7 +615,7 @@ class FakeFile(io.RawIOBase): |
| 615 | 615 | return self.pos |
| 616 | 616 | |
| 617 | 617 | |
| 618 | -def find_ole(filename, data): | |
| 618 | +def find_ole(filename, data, xml_parser=None): | |
| 619 | 619 | """ try to open somehow as zip/ole/rtf/... ; yield None if fail |
| 620 | 620 | |
| 621 | 621 | If data is given, filename is (mostly) ignored. |
| ... | ... | @@ -647,34 +647,40 @@ def find_ole(filename, data): |
| 647 | 647 | log.info('is ole file: ' + filename) |
| 648 | 648 | ole = olefile.OleFileIO(arg_for_ole) |
| 649 | 649 | yield ole |
| 650 | - elif is_zipfile(arg_for_zip): | |
| 650 | + elif xml_parser is not None or is_zipfile(arg_for_zip): | |
| 651 | + # keep compatibility with 3rd-party code that calls this function | |
| 652 | + # directly without providing an XmlParser instance | |
| 653 | + if xml_parser is None: | |
| 654 | + xml_parser = XmlParser(arg_for_zip) | |
| 655 | + # force iteration so XmlParser.iter_non_xml() returns data | |
| 656 | + [x for x in xml_parser.iter_xml()] | |
| 657 | + | |
| 651 | 658 | log.info('is zip file: ' + filename) |
| 652 | - zipper = ZipFile(arg_for_zip, 'r') | |
| 653 | - for subfile in zipper.namelist(): | |
| 654 | - head = b'' | |
| 659 | + # we looped through the XML files before, now we can | |
| 660 | + # iterate the non-XML files looking for ole objects | |
| 661 | + for subfile, _, file_handle in xml_parser.iter_non_xml(): | |
| 655 | 662 | try: |
| 656 | - with zipper.open(subfile) as file_handle: | |
| 657 | - head = file_handle.read(len(olefile.MAGIC)) | |
| 663 | + head = file_handle.read(len(olefile.MAGIC)) | |
| 658 | 664 | except RuntimeError: |
| 659 | 665 | log.error('zip is encrypted: ' + filename) |
| 660 | 666 | yield None |
| 661 | 667 | continue |
| 662 | 668 | |
| 663 | 669 | if head == olefile.MAGIC: |
| 670 | + file_handle.seek(0) | |
| 664 | 671 | log.info(' unzipping ole: ' + subfile) |
| 665 | - with ZipSubFile(zipper, subfile) as file_handle: | |
| 666 | - try: | |
| 667 | - ole = olefile.OleFileIO(file_handle) | |
| 668 | - yield ole | |
| 669 | - except IOError: | |
| 670 | - log.warning('Error reading data from {0}/{1} or ' | |
| 671 | - 'interpreting it as OLE object' | |
| 672 | - .format(filename, subfile)) | |
| 673 | - log.debug('', exc_info=True) | |
| 674 | - finally: | |
| 675 | - if ole is not None: | |
| 676 | - ole.close() | |
| 677 | - ole = None | |
| 672 | + try: | |
| 673 | + ole = olefile.OleFileIO(file_handle) | |
| 674 | + yield ole | |
| 675 | + except IOError: | |
| 676 | + log.warning('Error reading data from {0}/{1} or ' | |
| 677 | + 'interpreting it as OLE object' | |
| 678 | + .format(filename, subfile)) | |
| 679 | + log.debug('', exc_info=True) | |
| 680 | + finally: | |
| 681 | + if ole is not None: | |
| 682 | + ole.close() | |
| 683 | + ole = None | |
| 678 | 684 | else: |
| 679 | 685 | log.debug('unzip skip: ' + subfile) |
| 680 | 686 | else: |
| ... | ... | @@ -738,18 +744,19 @@ def process_file(filename, data, output_dir=None): |
| 738 | 744 | err_dumping = False |
| 739 | 745 | did_dump = False |
| 740 | 746 | |
| 747 | + xml_parser = None | |
| 741 | 748 | if is_zipfile(filename): |
| 742 | - log.info('file is a OOXML file, looking for relationships with external files') | |
| 749 | + log.info('file is a OOXML file, looking for relationships with external links') | |
| 743 | 750 | xml_parser = XmlParser(filename) |
| 744 | 751 | for relationship, target in find_external_relationships(xml_parser): |
| 745 | 752 | did_dump = True |
| 746 | - print("Found relationship '%s' with external file %s" % (relationship, target)) | |
| 753 | + print("Found relationship '%s' with external link %s" % (relationship, target)) | |
| 747 | 754 | |
| 748 | 755 | |
| 749 | 756 | # look for ole files inside file (e.g. unzip docx) |
| 750 | 757 | # have to finish work on every ole stream inside iteration, since handles |
| 751 | 758 | # are closed in find_ole |
| 752 | - for ole in find_ole(filename, data): | |
| 759 | + for ole in find_ole(filename, data, xml_parser): | |
| 753 | 760 | if ole is None: # no ole file found |
| 754 | 761 | continue |
| 755 | 762 | ... | ... |