Commit b79261132b8a9f67351654b06f631b33f54249f8
1 parent: 36272e08
unittest: test new ooxml features (limit to tags/subfiles)
Showing 1 changed file with 83 additions and 3 deletions
tests/ooxml/test_basic.py
| @@ -7,14 +7,20 @@ from os.path import join, splitext | @@ -7,14 +7,20 @@ from os.path import join, splitext | ||
| 7 | from tests.test_utils import DATA_BASE_DIR | 7 | from tests.test_utils import DATA_BASE_DIR |
| 8 | from oletools.thirdparty.olefile import isOleFile | 8 | from oletools.thirdparty.olefile import isOleFile |
| 9 | from oletools import ooxml | 9 | from oletools import ooxml |
| 10 | +import logging | ||
| 10 | 11 | ||
| 11 | 12 | ||
| 12 | class TestOOXML(unittest.TestCase): | 13 | class TestOOXML(unittest.TestCase): |
| 13 | - """ Tests correct detection of doc type """ | 14 | + """ Tests correct behaviour of XML parser """ |
| 14 | 15 | ||
| 15 | DO_DEBUG = False | 16 | DO_DEBUG = False |
| 16 | 17 | ||
| 17 | - def test_all_rough(self): | 18 | + def setUp(self): |
| 19 | + if self.DO_DEBUG: | ||
| 20 | + logging.basicConfig(level=logging.DEBUG) | ||
| 21 | + | ||
| 22 | + | ||
| 23 | + def test_rough_doctype(self): | ||
| 18 | """Checks all samples, expect either ole files or good ooxml output""" | 24 | """Checks all samples, expect either ole files or good ooxml output""" |
| 19 | # map from extension to expected doctype | 25 | # map from extension to expected doctype |
| 20 | ext2doc = dict( | 26 | ext2doc = dict( |
| @@ -27,7 +33,7 @@ class TestOOXML(unittest.TestCase): | @@ -27,7 +33,7 @@ class TestOOXML(unittest.TestCase): | ||
| 27 | 33 | ||
| 28 | # files that are neither OLE nor xml: | 34 | # files that are neither OLE nor xml: |
| 29 | except_files = 'empty', 'text' | 35 | except_files = 'empty', 'text' |
| 30 | - except_extns = 'rtf' | 36 | + except_extns = 'rtf', 'csv' |
| 31 | 37 | ||
| 32 | # analyse all files in data dir | 38 | # analyse all files in data dir |
| 33 | for base_dir, _, files in os.walk(DATA_BASE_DIR): | 39 | for base_dir, _, files in os.walk(DATA_BASE_DIR): |
| @@ -62,6 +68,80 @@ class TestOOXML(unittest.TestCase): | @@ -62,6 +68,80 @@ class TestOOXML(unittest.TestCase): | ||
| 62 | if self.DO_DEBUG: | 68 | if self.DO_DEBUG: |
| 63 | print('ok: {0} --> {1}'.format(filename, doctype)) | 69 | print('ok: {0} --> {1}'.format(filename, doctype)) |
| 64 | 70 | ||
| 71 | + def test_iter_all(self): | ||
| 72 | + """ test iter_xml without args """ | ||
| 73 | + expect_subfiles = dict([ | ||
| 74 | + ('[Content_Types].xml', 11), | ||
| 75 | + ('_rels/.rels', 4), | ||
| 76 | + ('word/_rels/document.xml.rels', 6), | ||
| 77 | + ('word/document.xml', 102), | ||
| 78 | + ('word/theme/theme1.xml', 227), | ||
| 79 | + ('word/settings.xml', 40), | ||
| 80 | + ('word/fontTable.xml', 25), | ||
| 81 | + ('word/webSettings.xml', 3), | ||
| 82 | + ('docProps/app.xml', 26), | ||
| 83 | + ('docProps/core.xml', 10), | ||
| 84 | + ('word/styles.xml', 441), | ||
| 85 | + ]) | ||
| 86 | + n_elems = 0 | ||
| 87 | + testfile = join(DATA_BASE_DIR, 'msodde', 'harmless-clean.docx') | ||
| 88 | + for subfile, elem, depth in ooxml.XmlParser(testfile).iter_xml(): | ||
| 89 | + n_elems += 1 | ||
| 90 | + if depth > 0: | ||
| 91 | + continue | ||
| 92 | + | ||
| 93 | + # now depth == 0; should occur once at end of every subfile | ||
| 94 | + if subfile not in expect_subfiles: | ||
| 95 | + self.fail('Subfile {0} not expected'.format(subfile)) | ||
| 96 | + self.assertEqual(n_elems, expect_subfiles[subfile], | ||
| 97 | + 'wrong number of elems ({0}) yielded from {1}' | ||
| 98 | + .format(n_elems, subfile)) | ||
| 99 | + _ = expect_subfiles.pop(subfile) | ||
| 100 | + n_elems = 0 | ||
| 101 | + | ||
| 102 | + self.assertEqual(len(expect_subfiles), 0, | ||
| 103 | + 'Forgot to iterate through subfile(s) {0}' | ||
| 104 | + .format(expect_subfiles.keys())) | ||
| 105 | + | ||
| 106 | + def test_iter_subfiles(self): | ||
| 107 | + """ test that limitation on few subfiles works """ | ||
| 108 | + testfile = join(DATA_BASE_DIR, 'msodde', 'dde-test.xlsx') | ||
| 109 | + subfiles = ['xl/theme/theme1.xml', 'docProps/app.xml'] | ||
| 110 | + parser = ooxml.XmlParser(testfile) | ||
| 111 | + for subfile, elem, depth in parser.iter_xml(subfiles): | ||
| 112 | + if self.DO_DEBUG: | ||
| 113 | + print(u'{0} {1}{2}'.format(subfile, ' '*depth, | ||
| 114 | + ooxml.debug_str(elem))) | ||
| 115 | + if subfile not in subfiles: | ||
| 116 | + self.fail('should have been skipped: {0}'.format(subfile)) | ||
| 117 | + if depth == 0: | ||
| 118 | + subfiles.remove(subfile) | ||
| 119 | + | ||
| 120 | + self.assertEqual(subfiles, [], 'missed subfile(s) {0}' | ||
| 121 | + .format(subfiles)) | ||
| 122 | + | ||
| 123 | + def test_iter_tags(self): | ||
| 124 | + """ test that limitation to tags works """ | ||
| 125 | + testfile = join(DATA_BASE_DIR, 'msodde', 'harmless-clean.docm') | ||
| 126 | + nmspc = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main' | ||
| 127 | + tag = '{' + nmspc + '}p' | ||
| 128 | + | ||
| 129 | + parser = ooxml.XmlParser(testfile) | ||
| 130 | + n_found = 0 | ||
| 131 | + for subfile, elem, depth in parser.iter_xml(tags=tag): | ||
| 132 | + n_found += 1 | ||
| 133 | + self.assertEqual(elem.tag, tag) | ||
| 134 | + | ||
| 135 | + # also check that children are present | ||
| 136 | + n_children = 0 | ||
| 137 | + for child in elem: | ||
| 138 | + n_children += 1 | ||
| 139 | + self.assertFalse(child.tag == '') | ||
| 140 | + self.assertTrue(n_children > 0, 'no children for elem {0}' | ||
| 141 | + .format(ooxml.debug_str(elem))) | ||
| 142 | + | ||
| 143 | + self.assertEqual(n_found, 7) | ||
| 144 | + | ||
| 65 | 145 | ||
| 66 | # just in case somebody calls this file as a script | 146 | # just in case somebody calls this file as a script |
| 67 | if __name__ == '__main__': | 147 | if __name__ == '__main__': |