Commit 791592425455617979c0a3a1a94f8d6ea452206b

Authored by Philippe Lagadec
Committed by GitHub
2 parents d3b8857d a5ac0720

Merge pull request #317 from samiraguiar/oleobj-detect-external

oleobj: detect external links
oletools/oleobj.py
@@ -50,7 +50,7 @@ import os @@ -50,7 +50,7 @@ import os
50 import re 50 import re
51 import sys 51 import sys
52 import io 52 import io
53 -from zipfile import is_zipfile, ZipFile 53 +from zipfile import is_zipfile
54 54
55 import olefile 55 import olefile
56 56
@@ -72,7 +72,7 @@ except ImportError: @@ -72,7 +72,7 @@ except ImportError:
72 72
73 from oletools.ppt_record_parser import (is_ppt, PptFile, 73 from oletools.ppt_record_parser import (is_ppt, PptFile,
74 PptRecordExOleVbaActiveXAtom) 74 PptRecordExOleVbaActiveXAtom)
75 -from oletools.ooxml import ZipSubFile 75 +from oletools.ooxml import XmlParser
76 76
77 # ----------------------------------------------------------------------------- 77 # -----------------------------------------------------------------------------
78 # CHANGELOG: 78 # CHANGELOG:
@@ -181,6 +181,7 @@ else: @@ -181,6 +181,7 @@ else:
181 NULL_CHAR = 0 # pylint: disable=redefined-variable-type 181 NULL_CHAR = 0 # pylint: disable=redefined-variable-type
182 xrange = range # pylint: disable=redefined-builtin, invalid-name 182 xrange = range # pylint: disable=redefined-builtin, invalid-name
183 183
  184 +OOXML_RELATIONSHIP_TAG = '{http://schemas.openxmlformats.org/package/2006/relationships}Relationship'
184 185
185 # === GLOBAL VARIABLES ======================================================== 186 # === GLOBAL VARIABLES ========================================================
186 187
@@ -206,6 +207,24 @@ RETURN_ERR_ARGS = 2 # reserve for OptionParser.parse_args @@ -206,6 +207,24 @@ RETURN_ERR_ARGS = 2 # reserve for OptionParser.parse_args
206 RETURN_ERR_STREAM = 4 # error opening/parsing a stream 207 RETURN_ERR_STREAM = 4 # error opening/parsing a stream
207 RETURN_ERR_DUMP = 8 # error dumping data from stream to file 208 RETURN_ERR_DUMP = 8 # error dumping data from stream to file
208 209
# Relationship types (last path component of the relationship "Type" URI)
# that are reported when the relationship's TargetMode is "External".
# Not sure if they can all be "External", but just in case.
# NOTE: every entry needs its own comma - Python silently concatenates
# adjacent string literals, which would merge neighbouring entries into one
# bogus name that never matches anything.
BLACKLISTED_RELATIONSHIP_TYPES = [
    'attachedTemplate',
    'externalLink',
    'externalLinkPath',
    'externalReference',
    'frame',
    'hyperlink',
    'officeDocument',
    'oleObject',
    'package',
    'slideUpdateUrl',
    'slideMaster',
    'slide',
    'slideUpdateInfo',
    'subDocument',
    'worksheet',
]
209 228
210 # === FUNCTIONS =============================================================== 229 # === FUNCTIONS ===============================================================
211 230
@@ -599,7 +618,7 @@ class FakeFile(io.RawIOBase): @@ -599,7 +618,7 @@ class FakeFile(io.RawIOBase):
599 return self.pos 618 return self.pos
600 619
601 620
602 -def find_ole(filename, data): 621 +def find_ole(filename, data, xml_parser=None):
603 """ try to open somehow as zip/ole/rtf/... ; yield None if fail 622 """ try to open somehow as zip/ole/rtf/... ; yield None if fail
604 623
605 If data is given, filename is (mostly) ignored. 624 If data is given, filename is (mostly) ignored.
@@ -631,34 +650,40 @@ def find_ole(filename, data): @@ -631,34 +650,40 @@ def find_ole(filename, data):
631 log.info('is ole file: ' + filename) 650 log.info('is ole file: ' + filename)
632 ole = olefile.OleFileIO(arg_for_ole) 651 ole = olefile.OleFileIO(arg_for_ole)
633 yield ole 652 yield ole
634 - elif is_zipfile(arg_for_zip): 653 + elif xml_parser is not None or is_zipfile(arg_for_zip):
  654 + # keep compatibility with 3rd-party code that calls this function
  655 + # directly without providing an XmlParser instance
  656 + if xml_parser is None:
  657 + xml_parser = XmlParser(arg_for_zip)
  658 + # force iteration so XmlParser.iter_non_xml() returns data
  659 + [x for x in xml_parser.iter_xml()]
  660 +
635 log.info('is zip file: ' + filename) 661 log.info('is zip file: ' + filename)
636 - zipper = ZipFile(arg_for_zip, 'r')  
637 - for subfile in zipper.namelist():  
638 - head = b'' 662 + # we looped through the XML files before, now we can
  663 + # iterate the non-XML files looking for ole objects
  664 + for subfile, _, file_handle in xml_parser.iter_non_xml():
639 try: 665 try:
640 - with zipper.open(subfile) as file_handle:  
641 - head = file_handle.read(len(olefile.MAGIC)) 666 + head = file_handle.read(len(olefile.MAGIC))
642 except RuntimeError: 667 except RuntimeError:
643 log.error('zip is encrypted: ' + filename) 668 log.error('zip is encrypted: ' + filename)
644 yield None 669 yield None
645 continue 670 continue
646 671
647 if head == olefile.MAGIC: 672 if head == olefile.MAGIC:
  673 + file_handle.seek(0)
648 log.info(' unzipping ole: ' + subfile) 674 log.info(' unzipping ole: ' + subfile)
649 - with ZipSubFile(zipper, subfile) as file_handle:  
650 - try:  
651 - ole = olefile.OleFileIO(file_handle)  
652 - yield ole  
653 - except IOError:  
654 - log.warning('Error reading data from {0}/{1} or '  
655 - 'interpreting it as OLE object'  
656 - .format(filename, subfile))  
657 - log.debug('', exc_info=True)  
658 - finally:  
659 - if ole is not None:  
660 - ole.close()  
661 - ole = None 675 + try:
  676 + ole = olefile.OleFileIO(file_handle)
  677 + yield ole
  678 + except IOError:
  679 + log.warning('Error reading data from {0}/{1} or '
  680 + 'interpreting it as OLE object'
  681 + .format(filename, subfile))
  682 + log.debug('', exc_info=True)
  683 + finally:
  684 + if ole is not None:
  685 + ole.close()
  686 + ole = None
662 else: 687 else:
663 log.debug('unzip skip: ' + subfile) 688 log.debug('unzip skip: ' + subfile)
664 else: 689 else:
@@ -674,6 +699,22 @@ def find_ole(filename, data): @@ -674,6 +699,22 @@ def find_ole(filename, data):
674 ole.close() 699 ole.close()
675 700
676 701
def find_external_relationships(xml_parser):
    """ iterate XML files looking for relationships to external objects

    Walks the elements produced by
    ``xml_parser.iter_xml(None, False, OOXML_RELATIONSHIP_TAG)`` and yields
    one ``(relationship_type, target)`` tuple for every element whose
    ``TargetMode`` attribute is ``'External'`` and whose relationship type is
    listed in `BLACKLISTED_RELATIONSHIP_TYPES`.

    The relationship type is the part of the ``Type`` attribute after the
    last ``/``; a ``Type`` without any ``/`` is used as-is.

    Elements lacking one of the ``TargetMode``, ``Type`` or ``Target``
    attributes are skipped silently.

    :param xml_parser: object with an ``iter_xml`` method yielding 3-tuples
                       whose second item has an ``attrib`` mapping
                       (presumably an `oletools.ooxml.XmlParser`)
    :returns: generator of ``(relationship_type, target)`` tuples
    """
    for _, elem, _ in xml_parser.iter_xml(None, False, OOXML_RELATIONSHIP_TAG):
        try:
            if elem.attrib['TargetMode'] != 'External':
                continue

            # [-1] rather than [1]: a Type value without any '/' must not
            # raise IndexError and abort the scan of the remaining elements
            relationship_type = elem.attrib['Type'].rsplit('/', 1)[-1]
            target = elem.attrib['Target']
        except (AttributeError, KeyError):
            # ignore missing attributes - Word won't detect
            # external links anyway
            continue

        if relationship_type in BLACKLISTED_RELATIONSHIP_TYPES:
            yield relationship_type, target
  716 +
  717 +
677 def process_file(filename, data, output_dir=None): 718 def process_file(filename, data, output_dir=None):
678 """ find embedded objects in given file 719 """ find embedded objects in given file
679 720
@@ -706,10 +747,19 @@ def process_file(filename, data, output_dir=None): @@ -706,10 +747,19 @@ def process_file(filename, data, output_dir=None):
706 err_dumping = False 747 err_dumping = False
707 did_dump = False 748 did_dump = False
708 749
  750 + xml_parser = None
  751 + if is_zipfile(filename):
  752 + log.info('file is a OOXML file, looking for relationships with external links')
  753 + xml_parser = XmlParser(filename)
  754 + for relationship, target in find_external_relationships(xml_parser):
  755 + did_dump = True
  756 + print("Found relationship '%s' with external link %s" % (relationship, target))
  757 +
  758 +
709 # look for ole files inside file (e.g. unzip docx) 759 # look for ole files inside file (e.g. unzip docx)
710 # have to finish work on every ole stream inside iteration, since handles 760 # have to finish work on every ole stream inside iteration, since handles
711 # are closed in find_ole 761 # are closed in find_ole
712 - for ole in find_ole(filename, data): 762 + for ole in find_ole(filename, data, xml_parser):
713 if ole is None: # no ole file found 763 if ole is None: # no ole file found
714 continue 764 continue
715 765
oletools/ooxml.py
@@ -644,7 +644,7 @@ def test(): @@ -644,7 +644,7 @@ def test():
644 for subfile, elem, depth in parser.iter_xml(): 644 for subfile, elem, depth in parser.iter_xml():
645 if depth < 4: 645 if depth < 4:
646 print(u'{0} {1}{2}'.format(subfile, ' ' * depth, debug_str(elem))) 646 print(u'{0} {1}{2}'.format(subfile, ' ' * depth, debug_str(elem)))
647 - for index, (subfile, content_type) in enumerate(parser.iter_non_xml()): 647 + for index, (subfile, content_type, _) in enumerate(parser.iter_non_xml()):
648 print(u'Non-XML subfile: {0} of type {1}' 648 print(u'Non-XML subfile: {0} of type {1}'
649 .format(subfile, content_type or u'unknown')) 649 .format(subfile, content_type or u'unknown'))
650 if index > 100: 650 if index > 100:
tests/oleobj/test_external_links.py 0 → 100644
  1 +""" Test that oleobj detects external links in relationships files.
  2 +"""
  3 +
  4 +import unittest
  5 +import os
  6 +from os import path
  7 +
  8 +# Directory with test data, independent of current working directory
  9 +from tests.test_utils import DATA_BASE_DIR
  10 +from oletools import oleobj
  11 +
  12 +BASE_DIR = path.join(DATA_BASE_DIR, 'oleobj', 'external_link')
  13 +
  14 +
class TestExternalLinks(unittest.TestCase):
    """ Run oleobj on every sample below BASE_DIR and check its return value
    """

    def test_external_links(self):
        """
        loop through sample files asserting that external links are found
        """
        sample_count = 0

        for dirpath, _, filenames in os.walk(BASE_DIR):
            for filename in filenames:
                file_path = path.join(dirpath, filename)
                sample_count += 1

                ret_val = oleobj.main([file_path])
                # name the offending sample, the bare return codes alone do
                # not tell which file of the loop failed
                self.assertEqual(ret_val, oleobj.RETURN_DID_DUMP,
                                 'unexpected return value for ' + file_path)

        # os.walk() yields nothing for a missing or empty directory, so
        # without this check the test would pass without testing anything
        self.assertTrue(sample_count > 0,
                        'no sample files found in ' + BASE_DIR)
  27 +
  28 +
  29 +# just in case somebody calls this file as a script
  30 +if __name__ == '__main__':
  31 + unittest.main()
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.docm 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.docx 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.dotm 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.dotx 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.potm 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.potx 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.ppsm 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.ppsx 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.pptm 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.pptx 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.xlsb 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.xlsm 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.xlsx 0 → 100644
No preview for this file type