Commit f938a9463ae2fc629f5c7d7127cbf4f86781d0a3

Authored by Christian Herdtweck
1 parent 2e672973

oleobj: smarter way to create dump filename

So far we have only looked at the `filename` attribute, but in malware
samples the path has been empty and Windows used src_path or tmp_path
to determine the type of the dumped file.

Look at all 3 filenames/paths, try to preserve the suffix but still limit
the length of the resulting file name. Deal with multiple objects that
result in the same filename by offering random file names.
Showing 1 changed file with 115 additions and 19 deletions
oletools/oleobj.py
... ... @@ -51,6 +51,7 @@ import re
51 51 import sys
52 52 import io
53 53 from zipfile import is_zipfile
  54 +import random
54 55  
55 56 import olefile
56 57  
... ... @@ -230,6 +231,12 @@ BLACKLISTED_RELATIONSHIP_TYPES = [
230 231 'worksheet'
231 232 ]
232 233  
  234 +# Safe maximum length of a filename
  235 +MAX_FILENAME_LENGTH = 255
  236 +
  237 +# Max attempts at generating a non-existent random file name
  238 +MAX_FILENAME_ATTEMPTS = 100
  239 +
233 240 # === FUNCTIONS ===============================================================
234 241  
235 242  
... ... @@ -494,11 +501,40 @@ class OleObject(object):
494 501 self.extra_data = data[index+self.data_size:]
495 502  
496 503  
497   -def sanitize_filename(filename, replacement='_', max_length=200):
498   - """compute basename of filename. Replaces all non-whitelisted characters.
499   - The returned filename is always a ascii basename of the file."""
def shorten_filename(fname, max_len):
    """
    Return `fname` cut down to at most `max_len` chars, preserving its suffix.

    The suffix is everything from the last '.' to the end of `fname`. It is
    kept intact whenever it fits into `max_len`; the part in front of it is
    truncated instead. If there is no '.' or the suffix alone is longer than
    `max_len`, the name is simply cut off after `max_len` characters.

    :param str fname: file name to shorten (no path handling is done here)
    :param max_len: maximum length of the result. A falsy value (`None`, 0)
                    or a negative value means "no limit", so `fname` is
                    returned unchanged.
    :returns: `fname` itself if it is short enough, otherwise a shortened copy
    """
    # no usable limit given. Negative values must not reach the slicing below
    # since `fname[:negative]` would chop characters off the end instead of
    # limiting the length
    if not max_len or max_len < 0:
        return fname

    # already short enough
    if len(fname) <= max_len:
        return fname

    dot_idx = fname.rfind('.')
    if dot_idx == -1:
        # no suffix to preserve
        return fname[:max_len]

    suffix = fname[dot_idx:]     # suffix including the '.'
    if len(suffix) > max_len:
        # suffix alone does not fit, cannot preserve it
        return fname[:max_len]

    # great, can preserve suffix: shorten only the part in front of it
    return fname[:max_len - len(suffix)] + suffix
  523 +
  524 +
  525 +def sanitize_filename(filename, replacement='_',
  526 + max_len=MAX_FILENAME_LENGTH):
  527 + """
  528 + Return filename that is save to work with.
  529 +
  530 + Removes path components, replaces all non-whitelisted characters (so output
  531 + is always a pure-ascii string), replaces '..' and ' ' and shortens to
  532 + given max length, trying to preserve suffix.
  533 +
  534 + Might return empty string
  535 + """
500 536 basepath = os.path.basename(filename).strip()
501   - sane_fname = re.sub(u'[^a-zA-Z0-9.\\-_ ]', replacement, basepath)
  537 + sane_fname = re.sub(u'[^a-zA-Z0-9.\-_ ]', replacement, basepath)
502 538 sane_fname = str(sane_fname) # py3: does nothing; py2: unicode --> str
503 539  
504 540 while ".." in sane_fname:
... ... @@ -507,14 +543,71 @@ def sanitize_filename(filename, replacement=&#39;_&#39;, max_length=200):
507 543 while " " in sane_fname:
508 544 sane_fname = sane_fname.replace(' ', ' ')
509 545  
510   - if not filename:
511   - sane_fname = 'NONAME'
  546 + # limit filename length, try to preserve suffix
  547 + return shorten_filename(sane_fname, max_len)
  548 +
512 549  
513   - # limit filename length
514   - if max_length:
515   - sane_fname = sane_fname[:max_length]
def get_sane_embedded_filenames(filename, src_path, tmp_path, max_len,
                                noname_index):
    """
    Yield sane candidate file names for an embedded object, best ones first.

    This is a generator. Candidates are produced in this order:

    1. sanitized base names of `filename`, `src_path`, `tmp_path` that end in
       a short suffix (at most 5 characters including the '.')
    2. sanitized base names that have no suffix or only a longer one
    3. random lower-case names (up to 26 letters), combined with each suffix
       found in step 1 and finally with no suffix, repeated
       `MAX_FILENAME_ATTEMPTS` times
    4. one last name built from `noname_index`, ignoring `max_len`

    In some malware examples, `filename` (on which we relied exclusively so
    far) is empty or " ", but `src_path` and `tmp_path` contain paths with
    proper file names. Try to extract a file name from any of those.

    Preservation of the suffix is especially important since that controls
    how Windows treats the file.

    :param filename: file name stored with the embedded object (may be empty)
    :param src_path: source path stored with the embedded object
    :param tmp_path: temp path stored with the embedded object
    :param int max_len: maximum length of candidates; passed on to
                        `sanitize_filename` and used for the random names
    :param int noname_index: number used in the last-resort name
    :returns: generator over candidate file names (str)
    """
    suffixes = []
    candidates_without_suffix = []  # remember these as fallback
    for candidate in (filename, src_path, tmp_path):
        if not candidate:
            continue    # None or empty: nothing to extract a name from

        # remove path component. Could be from linux, mac or windows
        sep_idx = max(candidate.rfind('/'), candidate.rfind('\\'))
        candidate = candidate[sep_idx+1:].strip()

        # sanitize
        candidate = sanitize_filename(candidate, max_len=max_len)

        if not candidate:
            continue    # skip whitespace-only

        # identify suffix. Dangerous suffixes are all short, so anything
        # longer than 5 chars (including the '.') is not treated as a suffix
        dot_idx = candidate.rfind('.')
        if dot_idx == -1 or dot_idx < len(candidate) - 5:
            candidates_without_suffix.append(candidate)
            continue

        # remember suffix
        suffixes.append(candidate[dot_idx:])

        yield candidate

    # parts with suffix not good enough? try those without one
    for candidate in candidates_without_suffix:
        yield candidate

    # then try random
    suffixes.append('')   # ensure there is something in there

    # lengths do not change between attempts, so determine once which
    # suffixes leave room for at least one random character
    usable_suffixes = []
    for suffix in suffixes:
        leftover_len = max_len - len(suffix)
        if leftover_len < 1:
            continue
        usable_suffixes.append((suffix, min(26, leftover_len)))

    for _ in range(MAX_FILENAME_ATTEMPTS):
        for suffix, name_len in usable_suffixes:
            # sample without replacement, hence at most 26 characters
            name = ''.join(random.sample('abcdefghijklmnopqrstuvwxyz',
                                         name_len))
            yield name + suffix

    # still not returned? Then we have to make up a name ourselves
    # do not care any more about max_len (maybe it was 0 or negative)
    yield 'oleobj_%03d' % noname_index
518 611  
519 612  
520 613 def find_ole_in_ppt(filename):
... ... @@ -666,7 +759,8 @@ def find_ole(filename, data, xml_parser=None):
666 759 if xml_parser is None:
667 760 xml_parser = XmlParser(arg_for_zip)
668 761 # force iteration so XmlParser.iter_non_xml() returns data
669   - [x for x in xml_parser.iter_xml()]
  762 + for _ in xml_parser.iter_xml():
  763 + pass
670 764  
671 765 log.info('is zip file: ' + filename)
672 766 # we looped through the XML files before, now we can
... ... @@ -748,16 +842,17 @@ def process_file(filename, data, output_dir=None):
748 842 If output_dir is given and does not exist, it is created. If it is not
749 843 given, data is saved to same directory as the input file.
750 844 """
  845 + # sanitize filename, leave space for embedded filename part
  846 + sane_fname = sanitize_filename(filename, max_len=MAX_FILENAME_LENGTH-5) or\
  847 + 'NONAME'
751 848 if output_dir:
752 849 if not os.path.isdir(output_dir):
753 850 log.info('creating output directory %s', output_dir)
754 851 os.mkdir(output_dir)
755 852  
756   - fname_prefix = os.path.join(output_dir,
757   - sanitize_filename(filename))
  853 + fname_prefix = os.path.join(output_dir, sane_fname)
758 854 else:
759 855 base_dir = os.path.dirname(filename)
760   - sane_fname = sanitize_filename(filename)
761 856 fname_prefix = os.path.join(base_dir, sane_fname)
762 857  
763 858 # TODO: option to extract objects to files (false by default)
... ... @@ -818,11 +913,12 @@ def process_file(filename, data, output_dir=None):
818 913 print(u'Filename = "%s"' % opkg.filename)
819 914 print(u'Source path = "%s"' % opkg.src_path)
820 915 print(u'Temp path = "%s"' % opkg.temp_path)
821   - if opkg.filename:
822   - fname = '%s_%s' % (fname_prefix,
823   - sanitize_filename(opkg.filename))
824   - else:
825   - fname = '%s_object_%03d.noname' % (fname_prefix, index)
  916 + for embedded_fname in get_sane_embedded_filenames(
  917 + opkg.filename, opkg.src_path, opkg.temp_path,
  918 + MAX_FILENAME_LENGTH - len(sane_fname) - 1, index):
  919 + fname = fname_prefix + '_' + embedded_fname
  920 + if not os.path.isfile(fname):
  921 + break
826 922  
827 923 # dump
828 924 try:
... ...