Commit 42a369f9f75750ea9e16d7c7c47c3577343d82bd

Authored by Samir Aguiar
1 parent 80d72312

oleobj: detect external relationships

By using ooxml we can iterate through the XML files, searching
for external links in the documents' relationships.
Showing 1 changed file with 44 additions and 1 deletions
oletools/oleobj.py
... ... @@ -70,7 +70,7 @@ except ImportError:
70 70 from oletools.thirdparty import xglob
71 71 from oletools.ppt_record_parser import (is_ppt, PptFile,
72 72 PptRecordExOleVbaActiveXAtom)
73   -from oletools.ooxml import ZipSubFile
  73 +from oletools.ooxml import XmlParser, ZipSubFile
74 74  
75 75 # -----------------------------------------------------------------------------
76 76 # CHANGELOG:
... ... @@ -178,6 +178,7 @@ else:
178 178 NULL_CHAR = 0 # pylint: disable=redefined-variable-type
179 179 xrange = range # pylint: disable=redefined-builtin, invalid-name
180 180  
# Fully qualified tag of a relationship element, written in the
# '{namespace-uri}local-name' form; passed to iter_xml() as the tag to
# look for.
OOXML_RELATIONSHIP_TAG = (
    '{http://schemas.openxmlformats.org/package/2006/relationships}'
    'Relationship'
)
181 182  
182 183 # === GLOBAL VARIABLES ========================================================
183 184  
... ... @@ -203,6 +204,24 @@ RETURN_ERR_ARGS = 2 # reserve for OptionParser.parse_args
203 204 RETURN_ERR_STREAM = 4 # error opening/parsing a stream
204 205 RETURN_ERR_DUMP = 8 # error dumping data from stream to file
205 206  
# Relationship types (last path component of the "Type" attribute) that are
# reported when a relationship has TargetMode="External".
# Not sure if they can all be "External", but just in case.
# NOTE: every entry must end with a comma - adjacent string literals without
# one are silently concatenated into a single entry that never matches.
BLACKLISTED_RELATIONSHIP_TYPES = [
    'attachedTemplate',
    'externalLink',
    'externalLinkPath',
    'externalReference',
    'frame',
    'hyperlink',
    'officeDocument',
    'oleObject',
    'package',
    'slideUpdateUrl',
    'slideMaster',
    'slide',
    'slideUpdateInfo',
    'subDocument',
    'worksheet',
]
206 225  
207 226 # === FUNCTIONS ===============================================================
208 227  
... ... @@ -671,6 +690,22 @@ def find_ole(filename, data):
671 690 ole.close()
672 691  
673 692  
def find_external_relationships(xml_parser):
    """ iterate XML files looking for relationships to external objects

    Every element returned by ``xml_parser.iter_xml`` for the tag
    ``OOXML_RELATIONSHIP_TAG`` is inspected. It is reported if its
    ``TargetMode`` attribute equals ``'External'`` and the part of its
    ``Type`` attribute after the last ``/`` is listed in
    ``BLACKLISTED_RELATIONSHIP_TYPES``.

    Elements that lack one of the attributes ``TargetMode``, ``Type`` or
    ``Target``, or whose ``Type`` contains no ``/``, are skipped silently.

    :param xml_parser: object whose ``iter_xml(None, False, tag)`` yields
                       3-tuples with the XML element as second item
                       (process_file passes an ``XmlParser`` instance)
    :returns: generator of tuples ``(relationship_type, target)``
    """
    for _, elem, _ in xml_parser.iter_xml(None, False, OOXML_RELATIONSHIP_TAG):
        try:
            attributes = elem.attrib
            if attributes['TargetMode'] != 'External':
                continue
            # a Type without any '/' makes the [1] raise IndexError; such a
            # malformed entry must not abort the scan of the remaining ones
            relationship_type = attributes['Type'].rsplit('/', 1)[1]
            target = attributes['Target']
        except (AttributeError, KeyError, IndexError):
            # ignore missing or malformed attributes - Word won't detect
            # external links anyway
            continue

        if relationship_type in BLACKLISTED_RELATIONSHIP_TYPES:
            yield relationship_type, target
  707 +
  708 +
674 709 def process_file(filename, data, output_dir=None):
675 710 """ find embedded objects in given file
676 711  
... ... @@ -703,6 +738,14 @@ def process_file(filename, data, output_dir=None):
703 738 err_dumping = False
704 739 did_dump = False
705 740  
  741 + if is_zipfile(filename):
  742 + log.info('file is a OOXML file, looking for relationships with external files')
  743 + xml_parser = XmlParser(filename)
  744 + for relationship, target in find_external_relationships(xml_parser):
  745 + did_dump = True
  746 + print("Found relationship '%s' with external file %s" % (relationship, target))
  747 +
  748 +
706 749 # look for ole files inside file (e.g. unzip docx)
707 750 # have to finish work on every ole stream inside iteration, since handles
708 751 # are closed in find_ole
... ...