Commit cb41b344f001359e56319d631e09d7e70129079a

Authored by Philippe Lagadec
Committed by GitHub
2 parents 20166c47 eb0b5093

Merge pull request #451 from christian-intra2net/oleobj-preserve-suffix

Oleobj preserve file extension
oletools/oleobj.py
@@ -51,6 +51,7 @@ import re @@ -51,6 +51,7 @@ import re
51 import sys 51 import sys
52 import io 52 import io
53 from zipfile import is_zipfile 53 from zipfile import is_zipfile
  54 +import random
54 55
55 import olefile 56 import olefile
56 57
@@ -230,6 +231,12 @@ BLACKLISTED_RELATIONSHIP_TYPES = [ @@ -230,6 +231,12 @@ BLACKLISTED_RELATIONSHIP_TYPES = [
230 'worksheet' 231 'worksheet'
231 ] 232 ]
232 233
  234 +# Safe maximum length of a filename
  235 +MAX_FILENAME_LENGTH = 255
  236 +
  237 +# Max attempts at generating a non-existent random file name
  238 +MAX_FILENAME_ATTEMPTS = 100
  239 +
233 # === FUNCTIONS =============================================================== 240 # === FUNCTIONS ===============================================================
234 241
235 242
@@ -494,11 +501,40 @@ class OleObject(object): @@ -494,11 +501,40 @@ class OleObject(object):
494 self.extra_data = data[index+self.data_size:] 501 self.extra_data = data[index+self.data_size:]
495 502
496 503
497 -def sanitize_filename(filename, replacement='_', max_length=200):  
498 - """compute basename of filename. Replaces all non-whitelisted characters.  
499 - The returned filename is always a ascii basename of the file.""" 504 +def shorten_filename(fname, max_len):
  505 + """Create filename shorter than max_len, trying to preserve suffix."""
  506 + # simple cases:
  507 + if not max_len:
  508 + return fname
  509 + name_len = len(fname)
  510 + if name_len < max_len:
  511 + return fname
  512 +
  513 + idx = fname.rfind('.')
  514 + if idx == -1:
  515 + return fname[:max_len]
  516 +
  517 + suffix_len = name_len - idx # length of suffix including '.'
  518 + if suffix_len > max_len:
  519 + return fname[:max_len]
  520 +
  521 + # great, can preserve suffix
  522 + return fname[:max_len-suffix_len] + fname[idx:]
  523 +
  524 +
  525 +def sanitize_filename(filename, replacement='_',
  526 + max_len=MAX_FILENAME_LENGTH):
  527 + """
  528 + Return filename that is safe to work with.
  529 +
  530 + Removes path components, replaces all non-whitelisted characters (so output
  531 + is always a pure-ascii string), replaces '..' and ' ' and shortens to
  532 + given max length, trying to preserve suffix.
  533 +
  534 + Might return empty string
  535 + """
500 basepath = os.path.basename(filename).strip() 536 basepath = os.path.basename(filename).strip()
501 - sane_fname = re.sub(u'[^a-zA-Z0-9.\\-_ ]', replacement, basepath) 537 + sane_fname = re.sub(u'[^a-zA-Z0-9.\-_ ]', replacement, basepath)
502 sane_fname = str(sane_fname) # py3: does nothing; py2: unicode --> str 538 sane_fname = str(sane_fname) # py3: does nothing; py2: unicode --> str
503 539
504 while ".." in sane_fname: 540 while ".." in sane_fname:
@@ -507,14 +543,71 @@ def sanitize_filename(filename, replacement='_', max_length=200): @@ -507,14 +543,71 @@ def sanitize_filename(filename, replacement='_', max_length=200):
507 while " " in sane_fname: 543 while " " in sane_fname:
508 sane_fname = sane_fname.replace(' ', ' ') 544 sane_fname = sane_fname.replace(' ', ' ')
509 545
510 - if not filename:  
511 - sane_fname = 'NONAME' 546 + # limit filename length, try to preserve suffix
  547 + return shorten_filename(sane_fname, max_len)
  548 +
512 549
513 - # limit filename length  
514 - if max_length:  
515 - sane_fname = sane_fname[:max_length] 550 +def get_sane_embedded_filenames(filename, src_path, tmp_path, max_len,
  551 + noname_index):
  552 + """
  553 + Get some sane filenames out of path information, preserving file suffix.
  554 +
  555 + Returns several candidates, first with suffix, then without, then random
  556 + with suffix and finally one last attempt ignoring max_len using arg
  557 + `noname_index`.
  558 +
  559 + In some malware examples, filename (on which we relied so far exclusively
  560 + for this) is empty or " ", but src_path and tmp_path contain paths with
  561 + proper file names. Try to extract filename from any of those.
  562 +
  563 + Preservation of suffix is especially important since that controls how
  564 + windoze treats the file.
  565 + """
  566 + suffixes = []
  567 + candidates_without_suffix = [] # remember these as fallback
  568 + for candidate in (filename, src_path, tmp_path):
  569 + # remove path component. Could be from linux, mac or windows
  570 + idx = max(candidate.rfind('/'), candidate.rfind('\\'))
  571 + candidate = candidate[idx+1:].strip()
  572 +
  573 + # sanitize
  574 + candidate = sanitize_filename(candidate, max_len=max_len)
  575 +
  576 + if not candidate:
  577 + continue # skip whitespace-only
  578 +
  579 + # identify suffix. Dangerous suffixes are all short
  580 + idx = candidate.rfind('.')
  581 + if idx is -1:
  582 + candidates_without_suffix.append(candidate)
  583 + continue
  584 + elif idx < len(candidate)-5:
  585 + candidates_without_suffix.append(candidate)
  586 + continue
  587 +
  588 + # remember suffix
  589 + suffixes.append(candidate[idx:])
  590 +
  591 + yield candidate
516 592
517 - return sane_fname 593 + # parts with suffix not good enough? try those without one
  594 + for candidate in candidates_without_suffix:
  595 + yield candidate
  596 +
  597 + # then try random
  598 + suffixes.append('') # ensure there is something in there
  599 + for _ in range(MAX_FILENAME_ATTEMPTS):
  600 + for suffix in suffixes:
  601 + leftover_len = max_len - len(suffix)
  602 + if leftover_len < 1:
  603 + continue
  604 + name = ''.join(random.sample('abcdefghijklmnopqrstuvwxyz',
  605 + min(26, leftover_len)))
  606 + yield name + suffix
  607 +
  608 + # still not returned? Then we have to make up a name ourselves
  609 + # do not care any more about max_len (maybe it was 0 or negative)
  610 + yield 'oleobj_%03d' % noname_index
518 611
519 612
520 def find_ole_in_ppt(filename): 613 def find_ole_in_ppt(filename):
@@ -666,7 +759,8 @@ def find_ole(filename, data, xml_parser=None): @@ -666,7 +759,8 @@ def find_ole(filename, data, xml_parser=None):
666 if xml_parser is None: 759 if xml_parser is None:
667 xml_parser = XmlParser(arg_for_zip) 760 xml_parser = XmlParser(arg_for_zip)
668 # force iteration so XmlParser.iter_non_xml() returns data 761 # force iteration so XmlParser.iter_non_xml() returns data
669 - [x for x in xml_parser.iter_xml()] 762 + for _ in xml_parser.iter_xml():
  763 + pass
670 764
671 log.info('is zip file: ' + filename) 765 log.info('is zip file: ' + filename)
672 # we looped through the XML files before, now we can 766 # we looped through the XML files before, now we can
@@ -748,16 +842,17 @@ def process_file(filename, data, output_dir=None): @@ -748,16 +842,17 @@ def process_file(filename, data, output_dir=None):
748 If output_dir is given and does not exist, it is created. If it is not 842 If output_dir is given and does not exist, it is created. If it is not
749 given, data is saved to same directory as the input file. 843 given, data is saved to same directory as the input file.
750 """ 844 """
  845 + # sanitize filename, leave space for embedded filename part
  846 + sane_fname = sanitize_filename(filename, max_len=MAX_FILENAME_LENGTH-5) or\
  847 + 'NONAME'
751 if output_dir: 848 if output_dir:
752 if not os.path.isdir(output_dir): 849 if not os.path.isdir(output_dir):
753 log.info('creating output directory %s', output_dir) 850 log.info('creating output directory %s', output_dir)
754 os.mkdir(output_dir) 851 os.mkdir(output_dir)
755 852
756 - fname_prefix = os.path.join(output_dir,  
757 - sanitize_filename(filename)) 853 + fname_prefix = os.path.join(output_dir, sane_fname)
758 else: 854 else:
759 base_dir = os.path.dirname(filename) 855 base_dir = os.path.dirname(filename)
760 - sane_fname = sanitize_filename(filename)  
761 fname_prefix = os.path.join(base_dir, sane_fname) 856 fname_prefix = os.path.join(base_dir, sane_fname)
762 857
763 # TODO: option to extract objects to files (false by default) 858 # TODO: option to extract objects to files (false by default)
@@ -818,11 +913,12 @@ def process_file(filename, data, output_dir=None): @@ -818,11 +913,12 @@ def process_file(filename, data, output_dir=None):
818 print(u'Filename = "%s"' % opkg.filename) 913 print(u'Filename = "%s"' % opkg.filename)
819 print(u'Source path = "%s"' % opkg.src_path) 914 print(u'Source path = "%s"' % opkg.src_path)
820 print(u'Temp path = "%s"' % opkg.temp_path) 915 print(u'Temp path = "%s"' % opkg.temp_path)
821 - if opkg.filename:  
822 - fname = '%s_%s' % (fname_prefix,  
823 - sanitize_filename(opkg.filename))  
824 - else:  
825 - fname = '%s_object_%03d.noname' % (fname_prefix, index) 916 + for embedded_fname in get_sane_embedded_filenames(
  917 + opkg.filename, opkg.src_path, opkg.temp_path,
  918 + MAX_FILENAME_LENGTH - len(sane_fname) - 1, index):
  919 + fname = fname_prefix + '_' + embedded_fname
  920 + if not os.path.isfile(fname):
  921 + break
826 922
827 # dump 923 # dump
828 try: 924 try:
tests/oleobj/test_basic.py
@@ -159,6 +159,65 @@ class TestOleObj(unittest.TestCase): @@ -159,6 +159,65 @@ class TestOleObj(unittest.TestCase):
159 only_run_every=4) 159 only_run_every=4)
160 160
161 161
  162 +class TestSaneFilenameCreation(unittest.TestCase):
  163 + """ Test sanitization / creation of sane filenames """
  164 + def test_with_empty_inputs(self):
  165 + """Test empty inputs lead to several non-empty distinct outputs"""
  166 + iter = oleobj.get_sane_embedded_filenames('', '', '', 10, 47)
  167 + output = set()
  168 + for attempt in range(10):
  169 + output.add(next(iter))
  170 + self.assertEqual(len(output), 10) # check all 10 are different
  171 + for fname in output:
  172 + self.assertNotEqual(fname, '') # all are non-empty
  173 +
  174 + def test_that_first_has_priority(self):
  175 + iter = oleobj.get_sane_embedded_filenames('fname.sfx', 'do_not.use',
  176 + 'do_not.use', 10, 47)
  177 + self.assertEqual(next(iter), 'fname.sfx')
  178 + [next(iter) for _ in range(10)] # check this does not crash
  179 +
  180 + def test_that_suffixed_have_priority(self):
  181 + iter = oleobj.get_sane_embedded_filenames('no_suffix', 'also_not',
  182 + 'fname.sfx', 10, 47)
  183 + self.assertEqual(next(iter), 'fname.sfx')
  184 + self.assertEqual(next(iter), 'no_suffix')
  185 + self.assertEqual(next(iter), 'also_not')
  186 + [next(iter) for _ in range(10)] # check this does not crash
  187 +
  188 + def test_with_hardly_any_length(self):
  189 + iter = oleobj.get_sane_embedded_filenames('fname.suffx', 'fname.sufx',
  190 + 'fname.sfx', 4, 47)
  191 + self.assertEqual(next(iter), '.sfx')
  192 + [next(iter) for _ in range(10)] # check this does not crash
  193 +
  194 + def test_with_mean_unicode(self):
  195 + uni_name1 = u'\xfcnic\xf6de-\xdftring'
  196 + uni_name2 = u'keyboard:\u2328, Braille:\u2800, Phone:\u260e'
  197 + iter = oleobj.get_sane_embedded_filenames(uni_name1, uni_name2,
  198 + 'regular_txt', 30, 47)
  199 + self.assertEqual(next(iter), '_nic_de-_tring')
  200 + self.assertEqual(next(iter), 'keyboard___ Braille___ Phone__')
  201 + self.assertEqual(next(iter), 'regular_txt')
  202 + [next(iter) for _ in range(10)] # check this does not crash
  203 +
  204 + def test_last_resort(self):
  205 + iter = oleobj.get_sane_embedded_filenames('', '', '', 10, 47)
  206 + all_options = list(iter)
  207 + self.assertEqual(len(all_options), oleobj.MAX_FILENAME_ATTEMPTS+1)
  208 + self.assertIn('47', all_options[-1])
  209 +
  210 + def test_realworld_lnk_example(self):
  211 + fname = ' '
  212 + src_path = 'E:\\tmp\\doc_package\\doc\\6.lnk'
  213 + tmp_path = 'C:\\Users\\1\\AppData\\Local\\Temp\\6.lnk'
  214 + iter = oleobj.get_sane_embedded_filenames(fname, src_path, tmp_path,
  215 + 30, 47)
  216 + self.assertEqual(next(iter), '6.lnk')
  217 + self.assertEqual(next(iter), '6.lnk')
  218 + [next(iter) for _ in range(10)] # check this does not crash
  219 +
  220 +
162 # just in case somebody calls this file as a script 221 # just in case somebody calls this file as a script
163 if __name__ == '__main__': 222 if __name__ == '__main__':
164 unittest.main() 223 unittest.main()