Commit b79261132b8a9f67351654b06f631b33f54249f8

Authored by Christian Herdtweck
1 parent 36272e08

unittest: test new ooxml features (limit to tags/subfiles)

Showing 1 changed file with 83 additions and 3 deletions
tests/ooxml/test_basic.py
... ... @@ -7,14 +7,20 @@ from os.path import join, splitext
7 7 from tests.test_utils import DATA_BASE_DIR
8 8 from oletools.thirdparty.olefile import isOleFile
9 9 from oletools import ooxml
  10 +import logging
10 11  
11 12  
12 13 class TestOOXML(unittest.TestCase):
13   - """ Tests correct detection of doc type """
  14 + """ Tests correct behaviour of XML parser """
14 15  
15 16 DO_DEBUG = False
16 17  
17   - def test_all_rough(self):
  18 + def setUp(self):
  19 + if self.DO_DEBUG:
  20 + logging.basicConfig(level=logging.DEBUG)
  21 +
  22 +
  23 + def test_rough_doctype(self):
18 24 """Checks all samples, expect either ole files or good ooxml output"""
19 25 # map from extension to expected doctype
20 26 ext2doc = dict(
... ... @@ -27,7 +33,7 @@ class TestOOXML(unittest.TestCase):
27 33  
28 34 # files that are neither OLE nor xml:
29 35 except_files = 'empty', 'text'
30   - except_extns = 'rtf'
  36 + except_extns = 'rtf', 'csv'
31 37  
32 38 # analyse all files in data dir
33 39 for base_dir, _, files in os.walk(DATA_BASE_DIR):
... ... @@ -62,6 +68,80 @@ class TestOOXML(unittest.TestCase):
62 68 if self.DO_DEBUG:
63 69 print('ok: {0} --> {1}'.format(filename, doctype))
64 70  
  71 + def test_iter_all(self):
  72 + """ test iter_xml without args """
  73 + expect_subfiles = dict([
  74 + ('[Content_Types].xml', 11),
  75 + ('_rels/.rels', 4),
  76 + ('word/_rels/document.xml.rels', 6),
  77 + ('word/document.xml', 102),
  78 + ('word/theme/theme1.xml', 227),
  79 + ('word/settings.xml', 40),
  80 + ('word/fontTable.xml', 25),
  81 + ('word/webSettings.xml', 3),
  82 + ('docProps/app.xml', 26),
  83 + ('docProps/core.xml', 10),
  84 + ('word/styles.xml', 441),
  85 + ])
  86 + n_elems = 0
  87 + testfile = join(DATA_BASE_DIR, 'msodde', 'harmless-clean.docx')
  88 + for subfile, elem, depth in ooxml.XmlParser(testfile).iter_xml():
  89 + n_elems += 1
  90 + if depth > 0:
  91 + continue
  92 +
  93 + # now depth == 0; should occur once at end of every subfile
  94 + if subfile not in expect_subfiles:
  95 + self.fail('Subfile {0} not expected'.format(subfile))
  96 + self.assertEqual(n_elems, expect_subfiles[subfile],
  97 + 'wrong number of elems ({0}) yielded from {1}'
  98 + .format(n_elems, subfile))
  99 + _ = expect_subfiles.pop(subfile)
  100 + n_elems = 0
  101 +
  102 + self.assertEqual(len(expect_subfiles), 0,
  103 + 'Forgot to iterate through subfile(s) {0}'
  104 + .format(expect_subfiles.keys()))
  105 +
  106 + def test_iter_subfiles(self):
  107 + """ test that limitation on few subfiles works """
  108 + testfile = join(DATA_BASE_DIR, 'msodde', 'dde-test.xlsx')
  109 + subfiles = ['xl/theme/theme1.xml', 'docProps/app.xml']
  110 + parser = ooxml.XmlParser(testfile)
  111 + for subfile, elem, depth in parser.iter_xml(subfiles):
  112 + if self.DO_DEBUG:
  113 + print(u'{0} {1}{2}'.format(subfile, ' '*depth,
  114 + ooxml.debug_str(elem)))
  115 + if subfile not in subfiles:
  116 + self.fail('should have been skipped: {0}'.format(subfile))
  117 + if depth == 0:
  118 + subfiles.remove(subfile)
  119 +
  120 + self.assertEqual(subfiles, [], 'missed subfile(s) {0}'
  121 + .format(subfiles))
  122 +
  123 + def test_iter_tags(self):
  124 + """ test that limitation to tags works """
  125 + testfile = join(DATA_BASE_DIR, 'msodde', 'harmless-clean.docm')
  126 + nmspc = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
  127 + tag = '{' + nmspc + '}p'
  128 +
  129 + parser = ooxml.XmlParser(testfile)
  130 + n_found = 0
  131 + for subfile, elem, depth in parser.iter_xml(tags=tag):
  132 + n_found += 1
  133 + self.assertEqual(elem.tag, tag)
  134 +
  135 + # also check that children are present
  136 + n_children = 0
  137 + for child in elem:
  138 + n_children += 1
  139 + self.assertFalse(child.tag == '')
  140 + self.assertTrue(n_children > 0, 'no children for elem {0}'
  141 + .format(ooxml.debug_str(elem)))
  142 +
  143 + self.assertEqual(n_found, 7)
  144 +
65 145  
66 146 # just in case somebody calls this file as a script
67 147 if __name__ == '__main__':
... ...