"""Test ftguess""" import unittest import os from os.path import splitext, join from oletools import ftguess # Directory with test data, independent of current working directory from tests.test_utils import DATA_BASE_DIR from tests.test_utils.testdata_reader import loop_over_files class TestFTGuess(unittest.TestCase): """Test ftguess""" def test_all(self): """Run all files in test-data and compare to known ouput""" # ftguess knows extension for each FType, create a reverse mapping used_types = ( ftguess.FType_RTF, ftguess.FType_Generic_OLE, ftguess.FType_Generic_Zip, ftguess.FType_Word97, ftguess.FType_Word2007, ftguess.FType_Word2007_Macro, ftguess.FType_Word2007_Template, ftguess.FType_Word2007_Template_Macro, ftguess.FType_Excel97, ftguess.FType_Excel2007, ftguess.FType_Excel2007_XLSB, ftguess.FType_Excel2007_XLSX , ftguess.FType_Excel2007_XLSM , ftguess.FType_Excel2007_Template, ftguess.FType_Excel2007_Template_Macro, ftguess.FType_Excel2007_Addin_Macro, ftguess.FType_Powerpoint97, ftguess.FType_Powerpoint2007_Presentation, ftguess.FType_Powerpoint2007_Slideshow, ftguess.FType_Powerpoint2007_Macro, ftguess.FType_Powerpoint2007_Slideshow_Macro, ftguess.FType_XPS, ) ftype_for_extension = dict() for ftype in used_types: for extension in ftype.extensions: ftype_for_extension[extension] = ftype for filename, file_contents in loop_over_files(): # let the system guess guess = ftguess.ftype_guess(data=file_contents) #print(f'for debugging: {filename} --> {guess}') # determine what we expect... before_dot, extension = splitext(filename) if extension == '.zip': extension = splitext(before_dot)[1] elif filename in (join('basic', 'empty'), join('basic', 'text')): extension = '.csv' # have just like that elif not extension: self.fail('Could not find extension for test sample {0}' .format(filename)) extension = extension[1:] # remove the leading '.' # encrypted files are mostly not recognized (yet?), except .xls if filename.startswith('encrypted' + os.sep): if extension == 'xls': expect = ftguess.FType_Excel97 else: expect = ftguess.FType_Generic_OLE elif extension in ('xml', 'csv', 'odt', 'ods', 'odp', 'potx', 'potm'): # not really an office file type expect = ftguess.FType_Unknown elif extension == 'slk': # not implemented yet expect = ftguess.FType_Unknown elif filename == join('basic', 'encrypted.docx'): expect = ftguess.FType_Generic_OLE elif 'excel5' in filename: # excel5 and excel97 have the same extensions, so we did not # include excel5 in "used_types" above. expect = ftguess.FType_Excel5 else: # other files behave nicely, so extension determines the type expect = ftype_for_extension[extension] self.assertEqual(guess.container, expect.container, msg='ftguess guessed container {0} for {1} ' 'but we expected {2}' .format(guess.container, filename, expect.container)) self.assertEqual(guess.filetype, expect.filetype, msg='ftguess guessed filetype {0} for {1} ' 'but we expected {2}' .format(guess.filetype, filename, expect.filetype)) self.assertEqual(guess.application, expect.application, msg='ftguess guessed application {0} for {1} ' 'but we expected {2}' .format(guess.application, filename, expect.application)) if expect not in (ftguess.FType_Generic_OLE, ftguess.FType_Unknown): self.assertEqual(guess.is_excel(), extension.startswith('x') and extension != 'xml' and extension != 'xps') self.assertEqual(guess.is_word(), extension.startswith('d')) self.assertEqual(guess.is_powerpoint(), extension.startswith('p')) # just in case somebody calls this file as a script if __name__ == '__main__': unittest.main()