diff --git a/tests/ooxml/test_basic.py b/tests/ooxml/test_basic.py index e850389..3231b42 100644 --- a/tests/ooxml/test_basic.py +++ b/tests/ooxml/test_basic.py @@ -7,14 +7,20 @@ from os.path import join, splitext from tests.test_utils import DATA_BASE_DIR from oletools.thirdparty.olefile import isOleFile from oletools import ooxml +import logging class TestOOXML(unittest.TestCase): - """ Tests correct detection of doc type """ + """ Tests correct behaviour of XML parser """ DO_DEBUG = False - def test_all_rough(self): + def setUp(self): + if self.DO_DEBUG: + logging.basicConfig(level=logging.DEBUG) + + + def test_rough_doctype(self): """Checks all samples, expect either ole files or good ooxml output""" # map from extension to expected doctype ext2doc = dict( @@ -27,7 +33,7 @@ class TestOOXML(unittest.TestCase): # files that are neither OLE nor xml: except_files = 'empty', 'text' - except_extns = 'rtf' + except_extns = 'rtf', 'csv' # analyse all files in data dir for base_dir, _, files in os.walk(DATA_BASE_DIR): @@ -62,6 +68,80 @@ class TestOOXML(unittest.TestCase): if self.DO_DEBUG: print('ok: {0} --> {1}'.format(filename, doctype)) + def test_iter_all(self): + """ test iter_xml without args """ + expect_subfiles = dict([ + ('[Content_Types].xml', 11), + ('_rels/.rels', 4), + ('word/_rels/document.xml.rels', 6), + ('word/document.xml', 102), + ('word/theme/theme1.xml', 227), + ('word/settings.xml', 40), + ('word/fontTable.xml', 25), + ('word/webSettings.xml', 3), + ('docProps/app.xml', 26), + ('docProps/core.xml', 10), + ('word/styles.xml', 441), + ]) + n_elems = 0 + testfile = join(DATA_BASE_DIR, 'msodde', 'harmless-clean.docx') + for subfile, elem, depth in ooxml.XmlParser(testfile).iter_xml(): + n_elems += 1 + if depth > 0: + continue + + # now depth == 0; should occur once at end of every subfile + if subfile not in expect_subfiles: + self.fail('Subfile {0} not expected'.format(subfile)) + self.assertEqual(n_elems, expect_subfiles[subfile], + 'wrong number of elems ({0}) yielded from {1}' + .format(n_elems, subfile)) + _ = expect_subfiles.pop(subfile) + n_elems = 0 + + self.assertEqual(len(expect_subfiles), 0, + 'Forgot to iterate through subfile(s) {0}' + .format(expect_subfiles.keys())) + + def test_iter_subfiles(self): + """ test that limitation on few subfiles works """ + testfile = join(DATA_BASE_DIR, 'msodde', 'dde-test.xlsx') + subfiles = ['xl/theme/theme1.xml', 'docProps/app.xml'] + parser = ooxml.XmlParser(testfile) + for subfile, elem, depth in parser.iter_xml(subfiles): + if self.DO_DEBUG: + print(u'{0} {1}{2}'.format(subfile, ' '*depth, + ooxml.debug_str(elem))) + if subfile not in subfiles: + self.fail('should have been skipped: {0}'.format(subfile)) + if depth == 0: + subfiles.remove(subfile) + + self.assertEqual(subfiles, [], 'missed subfile(s) {0}' + .format(subfiles)) + + def test_iter_tags(self): + """ test that limitation to tags works """ + testfile = join(DATA_BASE_DIR, 'msodde', 'harmless-clean.docm') + nmspc = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main' + tag = '{' + nmspc + '}p' + + parser = ooxml.XmlParser(testfile) + n_found = 0 + for subfile, elem, depth in parser.iter_xml(tags=tag): + n_found += 1 + self.assertEqual(elem.tag, tag) + + # also check that children are present + n_children = 0 + for child in elem: + n_children += 1 + self.assertFalse(child.tag == '') + self.assertTrue(n_children > 0, 'no children for elem {0}' + .format(ooxml.debug_str(elem))) + + self.assertEqual(n_found, 7) + # just in case somebody calls this file as a script if __name__ == '__main__':