Commit 791592425455617979c0a3a1a94f8d6ea452206b
Committed by
GitHub
Merge pull request #317 from samiraguiar/oleobj-detect-external
oleobj: detect external links
Showing
16 changed files
with
105 additions
and
24 deletions
oletools/oleobj.py
| ... | ... | @@ -50,7 +50,7 @@ import os |
| 50 | 50 | import re |
| 51 | 51 | import sys |
| 52 | 52 | import io |
| 53 | -from zipfile import is_zipfile, ZipFile | |
| 53 | +from zipfile import is_zipfile | |
| 54 | 54 | |
| 55 | 55 | import olefile |
| 56 | 56 | |
| ... | ... | @@ -72,7 +72,7 @@ except ImportError: |
| 72 | 72 | |
| 73 | 73 | from oletools.ppt_record_parser import (is_ppt, PptFile, |
| 74 | 74 | PptRecordExOleVbaActiveXAtom) |
| 75 | -from oletools.ooxml import ZipSubFile | |
| 75 | +from oletools.ooxml import XmlParser | |
| 76 | 76 | |
| 77 | 77 | # ----------------------------------------------------------------------------- |
| 78 | 78 | # CHANGELOG: |
| ... | ... | @@ -181,6 +181,7 @@ else: |
| 181 | 181 | NULL_CHAR = 0 # pylint: disable=redefined-variable-type |
| 182 | 182 | xrange = range # pylint: disable=redefined-builtin, invalid-name |
| 183 | 183 | |
| 184 | +OOXML_RELATIONSHIP_TAG = '{http://schemas.openxmlformats.org/package/2006/relationships}Relationship' | |
| 184 | 185 | |
| 185 | 186 | # === GLOBAL VARIABLES ======================================================== |
| 186 | 187 | |
| ... | ... | @@ -206,6 +207,24 @@ RETURN_ERR_ARGS = 2 # reserve for OptionParser.parse_args |
| 206 | 207 | RETURN_ERR_STREAM = 4 # error opening/parsing a stream |
| 207 | 208 | RETURN_ERR_DUMP = 8 # error dumping data from stream to file |
| 208 | 209 | |
| 210 | +# Relationship types reported when their TargetMode is "External". Not sure if they can all be "External", but they are listed just in case | |
| 211 | +BLACKLISTED_RELATIONSHIP_TYPES = [ | |
| 212 | + 'attachedTemplate', | |
| 213 | + 'externalLink', | |
| 214 | + 'externalLinkPath', | |
| 215 | + 'externalReference', | |
| 216 | + 'frame', | |
| 217 | + 'hyperlink', | |
| 218 | + 'officeDocument', | |
| 219 | + 'oleObject', | |
| 220 | + 'package', | |
| 221 | + 'slideUpdateUrl', | |
| 222 | + 'slideMaster', | |
| 223 | + 'slide', | |
| 224 | + 'slideUpdateInfo', | |
| 225 | + 'subDocument', | |
| 226 | + 'worksheet' | |
| 227 | +] | |
| 209 | 228 | |
| 210 | 229 | # === FUNCTIONS =============================================================== |
| 211 | 230 | |
| ... | ... | @@ -599,7 +618,7 @@ class FakeFile(io.RawIOBase): |
| 599 | 618 | return self.pos |
| 600 | 619 | |
| 601 | 620 | |
| 602 | -def find_ole(filename, data): | |
| 621 | +def find_ole(filename, data, xml_parser=None): | |
| 603 | 622 | """ try to open somehow as zip/ole/rtf/... ; yield None if fail |
| 604 | 623 | |
| 605 | 624 | If data is given, filename is (mostly) ignored. |
| ... | ... | @@ -631,34 +650,40 @@ def find_ole(filename, data): |
| 631 | 650 | log.info('is ole file: ' + filename) |
| 632 | 651 | ole = olefile.OleFileIO(arg_for_ole) |
| 633 | 652 | yield ole |
| 634 | - elif is_zipfile(arg_for_zip): | |
| 653 | + elif xml_parser is not None or is_zipfile(arg_for_zip): | |
| 654 | + # keep compatibility with 3rd-party code that calls this function | |
| 655 | + # directly without providing an XmlParser instance | |
| 656 | + if xml_parser is None: | |
| 657 | + xml_parser = XmlParser(arg_for_zip) | |
| 658 | + # force iteration so XmlParser.iter_non_xml() returns data | |
| 659 | + [x for x in xml_parser.iter_xml()] | |
| 660 | + | |
| 635 | 661 | log.info('is zip file: ' + filename) |
| 636 | - zipper = ZipFile(arg_for_zip, 'r') | |
| 637 | - for subfile in zipper.namelist(): | |
| 638 | - head = b'' | |
| 662 | + # we looped through the XML files before, now we can | |
| 663 | + # iterate the non-XML files looking for ole objects | |
| 664 | + for subfile, _, file_handle in xml_parser.iter_non_xml(): | |
| 639 | 665 | try: |
| 640 | - with zipper.open(subfile) as file_handle: | |
| 641 | - head = file_handle.read(len(olefile.MAGIC)) | |
| 666 | + head = file_handle.read(len(olefile.MAGIC)) | |
| 642 | 667 | except RuntimeError: |
| 643 | 668 | log.error('zip is encrypted: ' + filename) |
| 644 | 669 | yield None |
| 645 | 670 | continue |
| 646 | 671 | |
| 647 | 672 | if head == olefile.MAGIC: |
| 673 | + file_handle.seek(0) | |
| 648 | 674 | log.info(' unzipping ole: ' + subfile) |
| 649 | - with ZipSubFile(zipper, subfile) as file_handle: | |
| 650 | - try: | |
| 651 | - ole = olefile.OleFileIO(file_handle) | |
| 652 | - yield ole | |
| 653 | - except IOError: | |
| 654 | - log.warning('Error reading data from {0}/{1} or ' | |
| 655 | - 'interpreting it as OLE object' | |
| 656 | - .format(filename, subfile)) | |
| 657 | - log.debug('', exc_info=True) | |
| 658 | - finally: | |
| 659 | - if ole is not None: | |
| 660 | - ole.close() | |
| 661 | - ole = None | |
| 675 | + try: | |
| 676 | + ole = olefile.OleFileIO(file_handle) | |
| 677 | + yield ole | |
| 678 | + except IOError: | |
| 679 | + log.warning('Error reading data from {0}/{1} or ' | |
| 680 | + 'interpreting it as OLE object' | |
| 681 | + .format(filename, subfile)) | |
| 682 | + log.debug('', exc_info=True) | |
| 683 | + finally: | |
| 684 | + if ole is not None: | |
| 685 | + ole.close() | |
| 686 | + ole = None | |
| 662 | 687 | else: |
| 663 | 688 | log.debug('unzip skip: ' + subfile) |
| 664 | 689 | else: |
| ... | ... | @@ -674,6 +699,22 @@ def find_ole(filename, data): |
| 674 | 699 | ole.close() |
| 675 | 700 | |
| 676 | 701 | |
| 702 | +def find_external_relationships(xml_parser): | |
| 703 | + """ iterate XML files looking for relationships to external objects | |
| 704 | + """ | |
| 705 | + for _, elem, _ in xml_parser.iter_xml(None, False, OOXML_RELATIONSHIP_TAG): | |
| 706 | + try: | |
| 707 | + if elem.attrib['TargetMode'] == 'External': | |
| 708 | + relationship_type = elem.attrib['Type'].rsplit('/', 1)[1] | |
| 709 | + | |
| 710 | + if relationship_type in BLACKLISTED_RELATIONSHIP_TYPES: | |
| 711 | + yield relationship_type, elem.attrib['Target'] | |
| 712 | + except (AttributeError, KeyError, IndexError): | |
| 713 | + # ignore missing or malformed attributes - Word won't detect | |
| 714 | + # external links anyway | |
| 715 | + pass | |
| 716 | + | |
| 717 | + | |
| 677 | 718 | def process_file(filename, data, output_dir=None): |
| 678 | 719 | """ find embedded objects in given file |
| 679 | 720 | |
| ... | ... | @@ -706,10 +747,19 @@ def process_file(filename, data, output_dir=None): |
| 706 | 747 | err_dumping = False |
| 707 | 748 | did_dump = False |
| 708 | 749 | |
| 750 | + xml_parser = None | |
| 751 | + if is_zipfile(filename): | |
| 752 | + log.info('file is a OOXML file, looking for relationships with external links') | |
| 753 | + xml_parser = XmlParser(filename) | |
| 754 | + for relationship, target in find_external_relationships(xml_parser): | |
| 755 | + did_dump = True | |
| 756 | + print("Found relationship '%s' with external link %s" % (relationship, target)) | |
| 757 | + | |
| 758 | + | |
| 709 | 759 | # look for ole files inside file (e.g. unzip docx) |
| 710 | 760 | # have to finish work on every ole stream inside iteration, since handles |
| 711 | 761 | # are closed in find_ole |
| 712 | - for ole in find_ole(filename, data): | |
| 762 | + for ole in find_ole(filename, data, xml_parser): | |
| 713 | 763 | if ole is None: # no ole file found |
| 714 | 764 | continue |
| 715 | 765 | ... | ... |
oletools/ooxml.py
| ... | ... | @@ -644,7 +644,7 @@ def test(): |
| 644 | 644 | for subfile, elem, depth in parser.iter_xml(): |
| 645 | 645 | if depth < 4: |
| 646 | 646 | print(u'{0} {1}{2}'.format(subfile, ' ' * depth, debug_str(elem))) |
| 647 | - for index, (subfile, content_type) in enumerate(parser.iter_non_xml()): | |
| 647 | + for index, (subfile, content_type, _) in enumerate(parser.iter_non_xml()): | |
| 648 | 648 | print(u'Non-XML subfile: {0} of type {1}' |
| 649 | 649 | .format(subfile, content_type or u'unknown')) |
| 650 | 650 | if index > 100: | ... | ... |
tests/oleobj/test_external_links.py
0 → 100644
| 1 | +""" Test that oleobj detects external links in relationships files. | |
| 2 | +""" | |
| 3 | + | |
| 4 | +import unittest | |
| 5 | +import os | |
| 6 | +from os import path | |
| 7 | + | |
| 8 | +# Directory with test data, independent of current working directory | |
| 9 | +from tests.test_utils import DATA_BASE_DIR | |
| 10 | +from oletools import oleobj | |
| 11 | + | |
| 12 | +BASE_DIR = path.join(DATA_BASE_DIR, 'oleobj', 'external_link') | |
| 13 | + | |
| 14 | + | |
| 15 | +class TestExternalLinks(unittest.TestCase): | |
| 16 | + def test_external_links(self): | |
| 17 | + """ | |
| 18 | + loop through sample files asserting that external links are found | |
| 19 | + """ | |
| 20 | + | |
| 21 | + for dirpath, _, filenames in os.walk(BASE_DIR): | |
| 22 | + for filename in filenames: | |
| 23 | + file_path = path.join(dirpath, filename) | |
| 24 | + | |
| 25 | + ret_val = oleobj.main([file_path]) | |
| 26 | + self.assertEqual(ret_val, oleobj.RETURN_DID_DUMP) | |
| 27 | + | |
| 28 | + | |
| 29 | +# just in case somebody calls this file as a script | |
| 30 | +if __name__ == '__main__': | |
| 31 | + unittest.main() | ... | ... |
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.docm
0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.docx
0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.dotm
0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.dotx
0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.potm
0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.potx
0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.ppsm
0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.ppsx
0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.pptm
0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.pptx
0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.xlsb
0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.xlsm
0 → 100644
No preview for this file type
tests/test-data/oleobj/external_link/sample_with_external_link_to_doc.xlsx
0 → 100644
No preview for this file type