Commit b79261132b8a9f67351654b06f631b33f54249f8
1 parent: 36272e08
unittest: test new ooxml features (limit to tags/subfiles)
Showing 1 changed file with 83 additions and 3 deletions
tests/ooxml/test_basic.py
| ... | ... | @@ -7,14 +7,20 @@ from os.path import join, splitext |
| 7 | 7 | from tests.test_utils import DATA_BASE_DIR |
| 8 | 8 | from oletools.thirdparty.olefile import isOleFile |
| 9 | 9 | from oletools import ooxml |
| 10 | +import logging | |
| 10 | 11 | |
| 11 | 12 | |
| 12 | 13 | class TestOOXML(unittest.TestCase): |
| 13 | - """ Tests correct detection of doc type """ | |
| 14 | + """ Tests correct behaviour of XML parser """ | |
| 14 | 15 | |
| 15 | 16 | DO_DEBUG = False |
| 16 | 17 | |
| 17 | - def test_all_rough(self): | |
| 18 | + def setUp(self): | |
| 19 | + if self.DO_DEBUG: | |
| 20 | + logging.basicConfig(level=logging.DEBUG) | |
| 21 | + | |
| 22 | + | |
| 23 | + def test_rough_doctype(self): | |
| 18 | 24 | """Checks all samples, expect either ole files or good ooxml output""" |
| 19 | 25 | # map from extension to expected doctype |
| 20 | 26 | ext2doc = dict( |
| ... | ... | @@ -27,7 +33,7 @@ class TestOOXML(unittest.TestCase): |
| 27 | 33 | |
| 28 | 34 | # files that are neither OLE nor xml: |
| 29 | 35 | except_files = 'empty', 'text' |
| 30 | - except_extns = 'rtf' | |
| 36 | + except_extns = 'rtf', 'csv' | |
| 31 | 37 | |
| 32 | 38 | # analyse all files in data dir |
| 33 | 39 | for base_dir, _, files in os.walk(DATA_BASE_DIR): |
| ... | ... | @@ -62,6 +68,80 @@ class TestOOXML(unittest.TestCase): |
| 62 | 68 | if self.DO_DEBUG: |
| 63 | 69 | print('ok: {0} --> {1}'.format(filename, doctype)) |
| 64 | 70 | |
| 71 | + def test_iter_all(self): | |
| 72 | + """ test iter_xml without args """ | |
| 73 | + expect_subfiles = dict([ | |
| 74 | + ('[Content_Types].xml', 11), | |
| 75 | + ('_rels/.rels', 4), | |
| 76 | + ('word/_rels/document.xml.rels', 6), | |
| 77 | + ('word/document.xml', 102), | |
| 78 | + ('word/theme/theme1.xml', 227), | |
| 79 | + ('word/settings.xml', 40), | |
| 80 | + ('word/fontTable.xml', 25), | |
| 81 | + ('word/webSettings.xml', 3), | |
| 82 | + ('docProps/app.xml', 26), | |
| 83 | + ('docProps/core.xml', 10), | |
| 84 | + ('word/styles.xml', 441), | |
| 85 | + ]) | |
| 86 | + n_elems = 0 | |
| 87 | + testfile = join(DATA_BASE_DIR, 'msodde', 'harmless-clean.docx') | |
| 88 | + for subfile, elem, depth in ooxml.XmlParser(testfile).iter_xml(): | |
| 89 | + n_elems += 1 | |
| 90 | + if depth > 0: | |
| 91 | + continue | |
| 92 | + | |
| 93 | + # now depth == 0; should occur once at end of every subfile | |
| 94 | + if subfile not in expect_subfiles: | |
| 95 | + self.fail('Subfile {0} not expected'.format(subfile)) | |
| 96 | + self.assertEqual(n_elems, expect_subfiles[subfile], | |
| 97 | + 'wrong number of elems ({0}) yielded from {1}' | |
| 98 | + .format(n_elems, subfile)) | |
| 99 | + _ = expect_subfiles.pop(subfile) | |
| 100 | + n_elems = 0 | |
| 101 | + | |
| 102 | + self.assertEqual(len(expect_subfiles), 0, | |
| 103 | + 'Forgot to iterate through subfile(s) {0}' | |
| 104 | + .format(expect_subfiles.keys())) | |
| 105 | + | |
| 106 | + def test_iter_subfiles(self): | |
| 107 | + """ test that limitation on few subfiles works """ | |
| 108 | + testfile = join(DATA_BASE_DIR, 'msodde', 'dde-test.xlsx') | |
| 109 | + subfiles = ['xl/theme/theme1.xml', 'docProps/app.xml'] | |
| 110 | + parser = ooxml.XmlParser(testfile) | |
| 111 | + for subfile, elem, depth in parser.iter_xml(subfiles): | |
| 112 | + if self.DO_DEBUG: | |
| 113 | + print(u'{0} {1}{2}'.format(subfile, ' '*depth, | |
| 114 | + ooxml.debug_str(elem))) | |
| 115 | + if subfile not in subfiles: | |
| 116 | + self.fail('should have been skipped: {0}'.format(subfile)) | |
| 117 | + if depth == 0: | |
| 118 | + subfiles.remove(subfile) | |
| 119 | + | |
| 120 | + self.assertEqual(subfiles, [], 'missed subfile(s) {0}' | |
| 121 | + .format(subfiles)) | |
| 122 | + | |
| 123 | + def test_iter_tags(self): | |
| 124 | + """ test that limitation to tags works """ | |
| 125 | + testfile = join(DATA_BASE_DIR, 'msodde', 'harmless-clean.docm') | |
| 126 | + nmspc = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main' | |
| 127 | + tag = '{' + nmspc + '}p' | |
| 128 | + | |
| 129 | + parser = ooxml.XmlParser(testfile) | |
| 130 | + n_found = 0 | |
| 131 | + for subfile, elem, depth in parser.iter_xml(tags=tag): | |
| 132 | + n_found += 1 | |
| 133 | + self.assertEqual(elem.tag, tag) | |
| 134 | + | |
| 135 | + # also check that children are present | |
| 136 | + n_children = 0 | |
| 137 | + for child in elem: | |
| 138 | + n_children += 1 | |
| 139 | + self.assertFalse(child.tag == '') | |
| 140 | + self.assertTrue(n_children > 0, 'no children for elem {0}' | |
| 141 | + .format(ooxml.debug_str(elem))) | |
| 142 | + | |
| 143 | + self.assertEqual(n_found, 7) | |
| 144 | + | |
| 65 | 145 | |
| 66 | 146 | # just in case somebody calls this file as a script |
| 67 | 147 | if __name__ == '__main__': |
| ... | ... |