test_basic.py
4.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
"""Test ftguess"""
import unittest
import os
from os.path import splitext, join
from oletools import ftguess
# Directory with test data, independent of current working directory
from tests.test_utils import DATA_BASE_DIR
from tests.test_utils.testdata_reader import loop_over_files
class TestFTGuess(unittest.TestCase):
    """Check the file types guessed by ftguess for all test samples."""

    def test_all(self):
        """Run all files in test-data and compare to known output.

        For each sample yielded by ``loop_over_files`` the expected file type
        is derived from the file name (mostly from its extension) and then
        compared with what ``ftguess.ftype_guess`` returns for the contents.
        Every failure message names the offending sample.
        """
        # ftguess knows extension for each FType, create a reverse mapping
        used_types = (
            ftguess.FType_RTF, ftguess.FType_Generic_OLE,
            ftguess.FType_Generic_Zip, ftguess.FType_Word97,
            ftguess.FType_Word2007, ftguess.FType_Word2007_Macro,
            ftguess.FType_Word2007_Template,
            ftguess.FType_Word2007_Template_Macro, ftguess.FType_Excel97,
            ftguess.FType_Excel2007, ftguess.FType_Excel2007_XLSB,
            ftguess.FType_Excel2007_XLSX, ftguess.FType_Excel2007_XLSM,
            ftguess.FType_Excel2007_Template,
            ftguess.FType_Excel2007_Template_Macro,
            ftguess.FType_Excel2007_Addin_Macro, ftguess.FType_Powerpoint97,
            ftguess.FType_Powerpoint2007_Presentation,
            ftguess.FType_Powerpoint2007_Slideshow,
            ftguess.FType_Powerpoint2007_Macro,
            ftguess.FType_Powerpoint2007_Slideshow_Macro,
            ftguess.FType_XPS,
        )
        # If several types list the same extension, the one that comes last
        # in used_types wins, so the order of the tuple above matters.
        ftype_for_extension = {}
        for ftype in used_types:
            for extension in ftype.extensions:
                ftype_for_extension[extension] = ftype

        for filename, file_contents in loop_over_files():
            # let the system guess
            guess = ftguess.ftype_guess(data=file_contents)

            # determine what we expect...
            before_dot, extension = splitext(filename)
            if extension == '.zip':
                # zipped sample: the relevant extension precedes ".zip"
                extension = splitext(before_dot)[1]
            if filename in (join('basic', 'empty'), join('basic', 'text')):
                extension = '.csv'      # have just like that
            if not extension:
                # checked after the zip handling, so a zipped sample without
                # an inner extension is reported here as well
                self.fail('Could not find extension for test sample {0}'
                          .format(filename))
            extension = extension[1:]      # remove the leading '.'

            # encrypted files are mostly not recognized (yet?), except .xls
            if filename.startswith('encrypted' + os.sep):
                if extension == 'xls':
                    expect = ftguess.FType_Excel97
                else:
                    expect = ftguess.FType_Generic_OLE
            elif extension in ('xml', 'csv', 'odt', 'ods', 'odp',
                               'potx', 'potm'):
                # not really an office file type
                expect = ftguess.FType_Unknown
            elif extension == 'slk':
                # not implemented yet
                expect = ftguess.FType_Unknown
            elif filename == join('basic', 'encrypted.docx'):
                expect = ftguess.FType_Generic_OLE
            elif 'excel5' in filename:
                # excel5 and excel97 have the same extensions, so we did not
                # include excel5 in "used_types" above.
                expect = ftguess.FType_Excel5
            else:
                # other files behave nicely, so extension determines the type.
                # Fail with a readable message instead of a bare KeyError if
                # a sample has an extension that no used type declares.
                self.assertIn(extension, ftype_for_extension,
                              msg='No expected file type known for '
                                  'extension "{0}" of test sample {1}'
                                  .format(extension, filename))
                expect = ftype_for_extension[extension]

            self.assertEqual(guess.container, expect.container,
                             msg='ftguess guessed container {0} for {1} '
                                 'but we expected {2}'
                                 .format(guess.container, filename,
                                         expect.container))
            self.assertEqual(guess.filetype, expect.filetype,
                             msg='ftguess guessed filetype {0} for {1} '
                                 'but we expected {2}'
                                 .format(guess.filetype, filename,
                                         expect.filetype))
            self.assertEqual(guess.application, expect.application,
                             msg='ftguess guessed application {0} for {1} '
                                 'but we expected {2}'
                                 .format(guess.application, filename,
                                         expect.application))

            # The application helpers are only checked for samples whose
            # type was identified precisely; the expected answer is derived
            # from the first letter of the extension.
            if expect not in (ftguess.FType_Generic_OLE,
                              ftguess.FType_Unknown):
                application_checks = (
                    ('is_excel', guess.is_excel(),
                     extension.startswith('x')
                     and extension not in ('xml', 'xps')),
                    ('is_word', guess.is_word(),
                     extension.startswith('d')),
                    ('is_powerpoint', guess.is_powerpoint(),
                     extension.startswith('p')),
                )
                for check_name, actual, expected in application_checks:
                    self.assertEqual(actual, expected,
                                     msg='ftguess {0}() returned {1} for '
                                         '{2} but we expected {3}'
                                         .format(check_name, actual,
                                                 filename, expected))
# Just in case somebody calls this file as a script instead of going through
# a test runner: hand control to unittest's command-line entry point.
if __name__ == '__main__':
    unittest.main()