Commit 791592425455617979c0a3a1a94f8d6ea452206b

Authored by Philippe Lagadec
Committed by GitHub
2 parents d3b8857d a5ac0720

Merge pull request #317 from samiraguiar/oleobj-detect-external

oleobj: detect external links
oletools/oleobj.py
... ... @@ -50,7 +50,7 @@ import os
50 50 import re
51 51 import sys
52 52 import io
53   -from zipfile import is_zipfile, ZipFile
  53 +from zipfile import is_zipfile
54 54  
55 55 import olefile
56 56  
... ... @@ -72,7 +72,7 @@ except ImportError:
72 72  
73 73 from oletools.ppt_record_parser import (is_ppt, PptFile,
74 74 PptRecordExOleVbaActiveXAtom)
75   -from oletools.ooxml import ZipSubFile
  75 +from oletools.ooxml import XmlParser
76 76  
77 77 # -----------------------------------------------------------------------------
78 78 # CHANGELOG:
... ... @@ -181,6 +181,7 @@ else:
181 181 NULL_CHAR = 0 # pylint: disable=redefined-variable-type
182 182 xrange = range # pylint: disable=redefined-builtin, invalid-name
183 183  
  184 +OOXML_RELATIONSHIP_TAG = '{http://schemas.openxmlformats.org/package/2006/relationships}Relationship'
184 185  
185 186 # === GLOBAL VARIABLES ========================================================
186 187  
... ... @@ -206,6 +207,24 @@ RETURN_ERR_ARGS = 2 # reserve for OptionParser.parse_args
206 207 RETURN_ERR_STREAM = 4 # error opening/parsing a stream
207 208 RETURN_ERR_DUMP = 8 # error dumping data from stream to file
208 209  
# Relationship types that get reported when a relationship's TargetMode is
# "External". Not sure if they can all be "External", but just in case.
# NOTE: every entry must end with a comma -- adjacent string literals without
# a separating comma are silently concatenated into one (bogus) entry.
BLACKLISTED_RELATIONSHIP_TYPES = [
    'attachedTemplate',
    'externalLink',
    'externalLinkPath',
    'externalReference',
    'frame',
    'hyperlink',
    'officeDocument',
    'oleObject',
    'package',
    'slideUpdateUrl',
    'slideMaster',
    'slide',
    'slideUpdateInfo',
    'subDocument',
    'worksheet',
]
209 228  
210 229 # === FUNCTIONS ===============================================================
211 230  
... ... @@ -599,7 +618,7 @@ class FakeFile(io.RawIOBase):
599 618 return self.pos
600 619  
601 620  
602   -def find_ole(filename, data):
  621 +def find_ole(filename, data, xml_parser=None):
603 622 """ try to open somehow as zip/ole/rtf/... ; yield None if fail
604 623  
605 624 If data is given, filename is (mostly) ignored.
... ... @@ -631,34 +650,40 @@ def find_ole(filename, data):
631 650 log.info('is ole file: ' + filename)
632 651 ole = olefile.OleFileIO(arg_for_ole)
633 652 yield ole
634   - elif is_zipfile(arg_for_zip):
  653 + elif xml_parser is not None or is_zipfile(arg_for_zip):
  654 + # keep compatibility with 3rd-party code that calls this function
  655 + # directly without providing an XmlParser instance
  656 + if xml_parser is None:
  657 + xml_parser = XmlParser(arg_for_zip)
  658 + # force iteration so XmlParser.iter_non_xml() returns data
  659 + [x for x in xml_parser.iter_xml()]
  660 +
635 661 log.info('is zip file: ' + filename)
636   - zipper = ZipFile(arg_for_zip, 'r')
637   - for subfile in zipper.namelist():
638   - head = b''
  662 + # we looped through the XML files before, now we can
  663 + # iterate the non-XML files looking for ole objects
  664 + for subfile, _, file_handle in xml_parser.iter_non_xml():
639 665 try:
640   - with zipper.open(subfile) as file_handle:
641   - head = file_handle.read(len(olefile.MAGIC))
  666 + head = file_handle.read(len(olefile.MAGIC))
642 667 except RuntimeError:
643 668 log.error('zip is encrypted: ' + filename)
644 669 yield None
645 670 continue
646 671  
647 672 if head == olefile.MAGIC:
  673 + file_handle.seek(0)
648 674 log.info(' unzipping ole: ' + subfile)
649   - with ZipSubFile(zipper, subfile) as file_handle:
650   - try:
651   - ole = olefile.OleFileIO(file_handle)
652   - yield ole
653   - except IOError:
654   - log.warning('Error reading data from {0}/{1} or '
655   - 'interpreting it as OLE object'
656   - .format(filename, subfile))
657   - log.debug('', exc_info=True)
658   - finally:
659   - if ole is not None:
660   - ole.close()
661   - ole = None
  675 + try:
  676 + ole = olefile.OleFileIO(file_handle)
  677 + yield ole
  678 + except IOError:
  679 + log.warning('Error reading data from {0}/{1} or '
  680 + 'interpreting it as OLE object'
  681 + .format(filename, subfile))
  682 + log.debug('', exc_info=True)
  683 + finally:
  684 + if ole is not None:
  685 + ole.close()
  686 + ole = None
662 687 else:
663 688 log.debug('unzip skip: ' + subfile)
664 689 else:
... ... @@ -674,6 +699,22 @@ def find_ole(filename, data):
674 699 ole.close()
675 700  
676 701  
def find_external_relationships(xml_parser):
    """ iterate XML files looking for relationships to external objects

    Asks `xml_parser` for all elements tagged OOXML_RELATIONSHIP_TAG and
    inspects their `TargetMode`, `Type` and `Target` attributes.

    :param xml_parser: object providing `iter_xml(...)` that yields 3-tuples
                       whose second item is the XML element (presumably an
                       :py:class:`oletools.ooxml.XmlParser` -- see callers)
    :yields: tuples `(relationship_type, target)` for every relationship that
             has `TargetMode="External"` and whose type (the part of the
             `Type` attribute after the last '/') is listed in
             BLACKLISTED_RELATIONSHIP_TYPES
    """
    for _, elem, _ in xml_parser.iter_xml(None, False, OOXML_RELATIONSHIP_TAG):
        try:
            if elem.attrib['TargetMode'] != 'External':
                continue
            # IndexError here means the Type attribute contains no '/'
            relationship_type = elem.attrib['Type'].rsplit('/', 1)[1]
            target = elem.attrib['Target']
        except (AttributeError, KeyError, IndexError):
            # ignore missing or malformed attributes - Word won't detect
            # external links anyway. A single bad relationship must not
            # abort the scan of the remaining ones.
            continue

        if relationship_type in BLACKLISTED_RELATIONSHIP_TYPES:
            yield relationship_type, target
  716 +
  717 +
677 718 def process_file(filename, data, output_dir=None):
678 719 """ find embedded objects in given file
679 720  
... ... @@ -706,10 +747,19 @@ def process_file(filename, data, output_dir=None):
706 747 err_dumping = False
707 748 did_dump = False
708 749  
  750 + xml_parser = None
  751 + if is_zipfile(filename):
  752 + log.info('file is a OOXML file, looking for relationships with external links')
  753 + xml_parser = XmlParser(filename)
  754 + for relationship, target in find_external_relationships(xml_parser):
  755 + did_dump = True
  756 + print("Found relationship '%s' with external link %s" % (relationship, target))
  757 +
  758 +
709 759 # look for ole files inside file (e.g. unzip docx)
710 760 # have to finish work on every ole stream inside iteration, since handles
711 761 # are closed in find_ole
712   - for ole in find_ole(filename, data):
  762 + for ole in find_ole(filename, data, xml_parser):
713 763 if ole is None: # no ole file found
714 764 continue
715 765  
... ...
oletools/ooxml.py
... ... @@ -644,7 +644,7 @@ def test():
644 644 for subfile, elem, depth in parser.iter_xml():
645 645 if depth < 4:
646 646 print(u'{0} {1}{2}'.format(subfile, ' ' * depth, debug_str(elem)))
647   - for index, (subfile, content_type) in enumerate(parser.iter_non_xml()):
  647 + for index, (subfile, content_type, _) in enumerate(parser.iter_non_xml()):
648 648 print(u'Non-XML subfile: {0} of type {1}'
649 649 .format(subfile, content_type or u'unknown'))
650 650 if index > 100:
... ...
tests/oleobj/test_external_links.py 0 → 100644
  1 +""" Test that oleobj detects external links in relationships files.
  2 +"""
  3 +
  4 +import unittest
  5 +import os
  6 +from os import path
  7 +
  8 +# Directory with test data, independent of current working directory
  9 +from tests.test_utils import DATA_BASE_DIR
  10 +from oletools import oleobj
  11 +
  12 +BASE_DIR = path.join(DATA_BASE_DIR, 'oleobj', 'external_link')
  13 +
  14 +
class TestExternalLinks(unittest.TestCase):
    """ Run oleobj on every sample below BASE_DIR and check its return code """

    def test_external_links(self):
        """
        loop through sample files asserting that external links are found
        """
        n_checked = 0

        for dirpath, _, filenames in os.walk(BASE_DIR):
            for filename in filenames:
                file_path = path.join(dirpath, filename)

                ret_val = oleobj.main([file_path])
                # name the offending sample, otherwise a failure only shows
                # two integers
                self.assertEqual(ret_val, oleobj.RETURN_DID_DUMP,
                                 msg='oleobj.main returned {0} for {1}'
                                     .format(ret_val, file_path))
                n_checked += 1

        # os.walk yields nothing for a missing or empty directory, which
        # would otherwise let this test pass without checking anything
        self.assertTrue(n_checked > 0,
                        msg='no sample files found in {0}'.format(BASE_DIR))
  27 +
  28 +
  29 +# just in case somebody calls this file as a script
  30 +if __name__ == '__main__':
  31 + unittest.main()
... ...
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.docm 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.docx 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.dotm 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.dotx 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.potm 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.potx 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.ppsm 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.ppsx 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.pptm 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.pptx 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.xlsb 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.xlsm 0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.xlsx 0 → 100644
No preview for this file type