diff --git a/oletools/oleobj.py b/oletools/oleobj.py index eac3c89..f6d905a 100644 --- a/oletools/oleobj.py +++ b/oletools/oleobj.py @@ -50,7 +50,7 @@ import os import re import sys import io -from zipfile import is_zipfile, ZipFile +from zipfile import is_zipfile import olefile @@ -72,7 +72,7 @@ except ImportError: from oletools.ppt_record_parser import (is_ppt, PptFile, PptRecordExOleVbaActiveXAtom) -from oletools.ooxml import ZipSubFile +from oletools.ooxml import XmlParser # ----------------------------------------------------------------------------- # CHANGELOG: @@ -181,6 +181,7 @@ else: NULL_CHAR = 0 # pylint: disable=redefined-variable-type xrange = range # pylint: disable=redefined-builtin, invalid-name +OOXML_RELATIONSHIP_TAG = '{http://schemas.openxmlformats.org/package/2006/relationships}Relationship' # === GLOBAL VARIABLES ======================================================== @@ -206,6 +207,24 @@ RETURN_ERR_ARGS = 2 # reserve for OptionParser.parse_args RETURN_ERR_STREAM = 4 # error opening/parsing a stream RETURN_ERR_DUMP = 8 # error dumping data from stream to file +# Not sure if they can all be "External", but just in case +BLACKLISTED_RELATIONSHIP_TYPES = [ + 'attachedTemplate', + 'externalLink', + 'externalLinkPath', + 'externalReference', + 'frame', + 'hyperlink', + 'officeDocument', + 'oleObject', + 'package', + 'slideUpdateUrl', + 'slideMaster', + 'slide', + 'slideUpdateInfo', + 'subDocument', + 'worksheet', +] # === FUNCTIONS =============================================================== @@ -599,7 +618,7 @@ class FakeFile(io.RawIOBase): return self.pos -def find_ole(filename, data): +def find_ole(filename, data, xml_parser=None): """ try to open somehow as zip/ole/rtf/... ; yield None if fail If data is given, filename is (mostly) ignored. 
@@ -631,34 +650,40 @@ def find_ole(filename, data): log.info('is ole file: ' + filename) ole = olefile.OleFileIO(arg_for_ole) yield ole - elif is_zipfile(arg_for_zip): + elif xml_parser is not None or is_zipfile(arg_for_zip): + # keep compatibility with 3rd-party code that calls this function + # directly without providing an XmlParser instance + if xml_parser is None: + xml_parser = XmlParser(arg_for_zip) + # force iteration so XmlParser.iter_non_xml() returns data + [x for x in xml_parser.iter_xml()] + log.info('is zip file: ' + filename) - zipper = ZipFile(arg_for_zip, 'r') - for subfile in zipper.namelist(): - head = b'' + # we looped through the XML files before, now we can + # iterate the non-XML files looking for ole objects + for subfile, _, file_handle in xml_parser.iter_non_xml(): try: - with zipper.open(subfile) as file_handle: - head = file_handle.read(len(olefile.MAGIC)) + head = file_handle.read(len(olefile.MAGIC)) except RuntimeError: log.error('zip is encrypted: ' + filename) yield None continue if head == olefile.MAGIC: + file_handle.seek(0) log.info(' unzipping ole: ' + subfile) - with ZipSubFile(zipper, subfile) as file_handle: - try: - ole = olefile.OleFileIO(file_handle) - yield ole - except IOError: - log.warning('Error reading data from {0}/{1} or ' - 'interpreting it as OLE object' - .format(filename, subfile)) - log.debug('', exc_info=True) - finally: - if ole is not None: - ole.close() - ole = None + try: + ole = olefile.OleFileIO(file_handle) + yield ole + except IOError: + log.warning('Error reading data from {0}/{1} or ' + 'interpreting it as OLE object' + .format(filename, subfile)) + log.debug('', exc_info=True) + finally: + if ole is not None: + ole.close() + ole = None else: log.debug('unzip skip: ' + subfile) else: @@ -674,6 +699,22 @@ def find_ole(filename, data): ole.close() +def find_external_relationships(xml_parser): + """ iterate XML files looking for relationships to external objects + """ + for _, elem, _ in 
xml_parser.iter_xml(None, False, OOXML_RELATIONSHIP_TAG): + try: + if elem.attrib['TargetMode'] == 'External': + relationship_type = elem.attrib['Type'].rsplit('/', 1)[1] + + if relationship_type in BLACKLISTED_RELATIONSHIP_TYPES: + yield relationship_type, elem.attrib['Target'] + except (AttributeError, KeyError, IndexError): + # ignore missing or malformed attributes - Word won't detect + # external links anyway + pass + + def process_file(filename, data, output_dir=None): """ find embedded objects in given file @@ -706,10 +747,19 @@ def process_file(filename, data, output_dir=None): err_dumping = False did_dump = False + xml_parser = None + if is_zipfile(filename): + log.info('file is a OOXML file, looking for relationships with external links') + xml_parser = XmlParser(filename) + for relationship, target in find_external_relationships(xml_parser): + did_dump = True + print("Found relationship '%s' with external link %s" % (relationship, target)) + + # look for ole files inside file (e.g. unzip docx) # have to finish work on every ole stream inside iteration, since handles # are closed in find_ole - for ole in find_ole(filename, data): + for ole in find_ole(filename, data, xml_parser): if ole is None: # no ole file found continue diff --git a/oletools/ooxml.py b/oletools/ooxml.py index d33828b..174c46d 100644 --- a/oletools/ooxml.py +++ b/oletools/ooxml.py @@ -644,7 +644,7 @@ def test(): for subfile, elem, depth in parser.iter_xml(): if depth < 4: print(u'{0} {1}{2}'.format(subfile, ' ' * depth, debug_str(elem))) - for index, (subfile, content_type) in enumerate(parser.iter_non_xml()): + for index, (subfile, content_type, _) in enumerate(parser.iter_non_xml()): print(u'Non-XML subfile: {0} of type {1}' .format(subfile, content_type or u'unknown')) if index > 100: diff --git a/tests/oleobj/test_external_links.py b/tests/oleobj/test_external_links.py new file mode 100644 index 0000000..9c7e632 --- /dev/null +++ b/tests/oleobj/test_external_links.py @@ -0,0 +1,31 @@ +""" Test that oleobj 
detects external links in relationships files. +""" + +import unittest +import os +from os import path + +# Directory with test data, independent of current working directory +from tests.test_utils import DATA_BASE_DIR +from oletools import oleobj + +BASE_DIR = path.join(DATA_BASE_DIR, 'oleobj', 'external_link') + + +class TestExternalLinks(unittest.TestCase): + def test_external_links(self): + """ + loop through sample files asserting that external links are found + """ + + for dirpath, _, filenames in os.walk(BASE_DIR): + for filename in filenames: + file_path = path.join(dirpath, filename) + + ret_val = oleobj.main([file_path]) + self.assertEqual(ret_val, oleobj.RETURN_DID_DUMP) + + +# just in case somebody calls this file as a script +if __name__ == '__main__': + unittest.main() diff --git a/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.docm b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.docm new file mode 100644 index 0000000..ee1e169 --- /dev/null +++ b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.docm diff --git a/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.docx b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.docx new file mode 100644 index 0000000..9f00bbd --- /dev/null +++ b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.docx diff --git a/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.dotm b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.dotm new file mode 100644 index 0000000..fe0c0ea --- /dev/null +++ b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.dotm diff --git a/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.dotx b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.dotx new file mode 100644 index 0000000..a30ccf2 --- /dev/null +++ 
b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.dotx diff --git a/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.potm b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.potm new file mode 100644 index 0000000..aa29f11 --- /dev/null +++ b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.potm diff --git a/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.potx b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.potx new file mode 100644 index 0000000..ebafc78 --- /dev/null +++ b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.potx diff --git a/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.ppsm b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.ppsm new file mode 100644 index 0000000..22f7d97 --- /dev/null +++ b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.ppsm diff --git a/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.ppsx b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.ppsx new file mode 100644 index 0000000..80e6a6e --- /dev/null +++ b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.ppsx diff --git a/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.pptm b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.pptm new file mode 100644 index 0000000..205a489 --- /dev/null +++ b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.pptm diff --git a/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.pptx b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.pptx new file mode 100644 index 0000000..2e1eba3 --- /dev/null +++ b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.pptx diff --git a/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.xlsb 
b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.xlsb new file mode 100644 index 0000000..19d6e66 --- /dev/null +++ b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.xlsb diff --git a/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.xlsm b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.xlsm new file mode 100644 index 0000000..afb6f6d --- /dev/null +++ b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.xlsm diff --git a/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.xlsx b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.xlsx new file mode 100644 index 0000000..00d5db7 --- /dev/null +++ b/tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.xlsx