Commit f938a9463ae2fc629f5c7d7127cbf4f86781d0a3
1 parent
2e672973
oleobj: smarter way to create dump filename
So far we have only looked at the `filename` attribute, but in malware samples the path has been empty and Windows used src_path or tmp_path to determine the dumped file type. Look at all 3 filenames/paths, try to preserve the suffix but still limit the length of the resulting file name. Deal with multiple objects with the same resulting filename by offering random file names.
Showing
1 changed file
with
115 additions
and
19 deletions
oletools/oleobj.py
| ... | ... | @@ -51,6 +51,7 @@ import re |
| 51 | 51 | import sys |
| 52 | 52 | import io |
| 53 | 53 | from zipfile import is_zipfile |
| 54 | +import random | |
| 54 | 55 | |
| 55 | 56 | import olefile |
| 56 | 57 | |
| ... | ... | @@ -230,6 +231,12 @@ BLACKLISTED_RELATIONSHIP_TYPES = [ |
| 230 | 231 | 'worksheet' |
| 231 | 232 | ] |
| 232 | 233 | |
| 234 | +# Safe maximum length of a filename | |
| 235 | +MAX_FILENAME_LENGTH = 255 | |
| 236 | + | |
| 237 | +# Max attempts at generating a non-existent random file name | |
| 238 | +MAX_FILENAME_ATTEMPTS = 100 | |
| 239 | + | |
| 233 | 240 | # === FUNCTIONS =============================================================== |
| 234 | 241 | |
| 235 | 242 | |
| ... | ... | @@ -494,11 +501,40 @@ class OleObject(object): |
| 494 | 501 | self.extra_data = data[index+self.data_size:] |
| 495 | 502 | |
| 496 | 503 | |
| 497 | -def sanitize_filename(filename, replacement='_', max_length=200): | |
| 498 | - """compute basename of filename. Replaces all non-whitelisted characters. | |
| 499 | - The returned filename is always a ascii basename of the file.""" | |
| 504 | +def shorten_filename(fname, max_len): | |
| 505 | + """Create filename shorter than max_len, trying to preserve suffix.""" | |
| 506 | + # simple cases: | |
| 507 | + if not max_len: | |
| 508 | + return fname | |
| 509 | + name_len = len(fname) | |
| 510 | + if name_len < max_len: | |
| 511 | + return fname | |
| 512 | + | |
| 513 | + idx = fname.rfind('.') | |
| 514 | + if idx == -1: | |
| 515 | + return fname[:max_len] | |
| 516 | + | |
| 517 | + suffix_len = name_len - idx # length of suffix including '.' | |
| 518 | + if suffix_len > max_len: | |
| 519 | + return fname[:max_len] | |
| 520 | + | |
| 521 | + # great, can preserve suffix | |
| 522 | + return fname[:max_len-suffix_len] + fname[idx:] | |
| 523 | + | |
| 524 | + | |
| 525 | +def sanitize_filename(filename, replacement='_', | |
| 526 | + max_len=MAX_FILENAME_LENGTH): | |
| 527 | + """ | |
| 528 | + Return filename that is safe to work with. | |
| 529 | + | |
| 530 | + Removes path components, replaces all non-whitelisted characters (so output | |
| 531 | + is always a pure-ascii string), replaces '..' and ' ' and shortens to | |
| 532 | + given max length, trying to preserve suffix. | |
| 533 | + | |
| 534 | + Might return empty string | |
| 535 | + """ | |
| 500 | 536 | basepath = os.path.basename(filename).strip() |
| 501 | - sane_fname = re.sub(u'[^a-zA-Z0-9.\\-_ ]', replacement, basepath) | |
| 537 | + sane_fname = re.sub(u'[^a-zA-Z0-9.\-_ ]', replacement, basepath) | |
| 502 | 538 | sane_fname = str(sane_fname) # py3: does nothing; py2: unicode --> str |
| 503 | 539 | |
| 504 | 540 | while ".." in sane_fname: |
| ... | ... | @@ -507,14 +543,71 @@ def sanitize_filename(filename, replacement='_', max_length=200): |
| 507 | 543 | while " " in sane_fname: |
| 508 | 544 | sane_fname = sane_fname.replace(' ', ' ') |
| 509 | 545 | |
| 510 | - if not filename: | |
| 511 | - sane_fname = 'NONAME' | |
| 546 | + # limit filename length, try to preserve suffix | |
| 547 | + return shorten_filename(sane_fname, max_len) | |
| 548 | + | |
| 512 | 549 | |
| 513 | - # limit filename length | |
| 514 | - if max_length: | |
| 515 | - sane_fname = sane_fname[:max_length] | |
| 550 | +def get_sane_embedded_filenames(filename, src_path, tmp_path, max_len, | |
| 551 | + noname_index): | |
| 552 | + """ | |
| 553 | + Get some sane filenames out of path information, preserving file suffix. | |
| 554 | + | |
| 555 | + Returns several candidates, first with suffix, then without, then random | |
| 556 | + with suffix and finally one last attempt ignoring max_len using arg | |
| 557 | + `noname_index`. | |
| 558 | + | |
| 559 | + In some malware examples, filename (on which we relied so far exclusively | |
| 560 | + for this) is empty or " ", but src_path and tmp_path contain paths with | |
| 561 | + proper file names. Try to extract filename from any of those. | |
| 562 | + | |
| 563 | + Preservation of suffix is especially important since that controls how | |
| 564 | + windoze treats the file. | |
| 565 | + """ | |
| 566 | + suffixes = [] | |
| 567 | + candidates_without_suffix = [] # remember these as fallback | |
| 568 | + for candidate in (filename, src_path, tmp_path): | |
| 569 | + # remove path component. Could be from linux, mac or windows | |
| 570 | + idx = max(candidate.rfind('/'), candidate.rfind('\\')) | |
| 571 | + candidate = candidate[idx+1:].strip() | |
| 572 | + | |
| 573 | + # sanitize | |
| 574 | + candidate = sanitize_filename(candidate, max_len=max_len) | |
| 575 | + | |
| 576 | + if not candidate: | |
| 577 | + continue # skip whitespace-only | |
| 578 | + | |
| 579 | + # identify suffix. Dangerous suffixes are all short | |
| 580 | + idx = candidate.rfind('.') | |
| 581 | + if idx is -1: | |
| 582 | + candidates_without_suffix.append(candidate) | |
| 583 | + continue | |
| 584 | + elif idx < len(candidate)-5: | |
| 585 | + candidates_without_suffix.append(candidate) | |
| 586 | + continue | |
| 587 | + | |
| 588 | + # remember suffix | |
| 589 | + suffixes.append(candidate[idx:]) | |
| 590 | + | |
| 591 | + yield candidate | |
| 516 | 592 | |
| 517 | - return sane_fname | |
| 593 | + # parts with suffix not good enough? try those without one | |
| 594 | + for candidate in candidates_without_suffix: | |
| 595 | + yield candidate | |
| 596 | + | |
| 597 | + # then try random | |
| 598 | + suffixes.append('') # ensure there is something in there | |
| 599 | + for _ in range(MAX_FILENAME_ATTEMPTS): | |
| 600 | + for suffix in suffixes: | |
| 601 | + leftover_len = max_len - len(suffix) | |
| 602 | + if leftover_len < 1: | |
| 603 | + continue | |
| 604 | + name = ''.join(random.sample('abcdefghijklmnopqrstuvwxyz', | |
| 605 | + min(26, leftover_len))) | |
| 606 | + yield name + suffix | |
| 607 | + | |
| 608 | + # still not returned? Then we have to make up a name ourselves | |
| 609 | + # do not care any more about max_len (maybe it was 0 or negative) | |
| 610 | + yield 'oleobj_%03d' % noname_index | |
| 518 | 611 | |
| 519 | 612 | |
| 520 | 613 | def find_ole_in_ppt(filename): |
| ... | ... | @@ -666,7 +759,8 @@ def find_ole(filename, data, xml_parser=None): |
| 666 | 759 | if xml_parser is None: |
| 667 | 760 | xml_parser = XmlParser(arg_for_zip) |
| 668 | 761 | # force iteration so XmlParser.iter_non_xml() returns data |
| 669 | - [x for x in xml_parser.iter_xml()] | |
| 762 | + for _ in xml_parser.iter_xml(): | |
| 763 | + pass | |
| 670 | 764 | |
| 671 | 765 | log.info('is zip file: ' + filename) |
| 672 | 766 | # we looped through the XML files before, now we can |
| ... | ... | @@ -748,16 +842,17 @@ def process_file(filename, data, output_dir=None): |
| 748 | 842 | If output_dir is given and does not exist, it is created. If it is not |
| 749 | 843 | given, data is saved to same directory as the input file. |
| 750 | 844 | """ |
| 845 | + # sanitize filename, leave space for embedded filename part | |
| 846 | + sane_fname = sanitize_filename(filename, max_len=MAX_FILENAME_LENGTH-5) or\ | |
| 847 | + 'NONAME' | |
| 751 | 848 | if output_dir: |
| 752 | 849 | if not os.path.isdir(output_dir): |
| 753 | 850 | log.info('creating output directory %s', output_dir) |
| 754 | 851 | os.mkdir(output_dir) |
| 755 | 852 | |
| 756 | - fname_prefix = os.path.join(output_dir, | |
| 757 | - sanitize_filename(filename)) | |
| 853 | + fname_prefix = os.path.join(output_dir, sane_fname) | |
| 758 | 854 | else: |
| 759 | 855 | base_dir = os.path.dirname(filename) |
| 760 | - sane_fname = sanitize_filename(filename) | |
| 761 | 856 | fname_prefix = os.path.join(base_dir, sane_fname) |
| 762 | 857 | |
| 763 | 858 | # TODO: option to extract objects to files (false by default) |
| ... | ... | @@ -818,11 +913,12 @@ def process_file(filename, data, output_dir=None): |
| 818 | 913 | print(u'Filename = "%s"' % opkg.filename) |
| 819 | 914 | print(u'Source path = "%s"' % opkg.src_path) |
| 820 | 915 | print(u'Temp path = "%s"' % opkg.temp_path) |
| 821 | - if opkg.filename: | |
| 822 | - fname = '%s_%s' % (fname_prefix, | |
| 823 | - sanitize_filename(opkg.filename)) | |
| 824 | - else: | |
| 825 | - fname = '%s_object_%03d.noname' % (fname_prefix, index) | |
| 916 | + for embedded_fname in get_sane_embedded_filenames( | |
| 917 | + opkg.filename, opkg.src_path, opkg.temp_path, | |
| 918 | + MAX_FILENAME_LENGTH - len(sane_fname) - 1, index): | |
| 919 | + fname = fname_prefix + '_' + embedded_fname | |
| 920 | + if not os.path.isfile(fname): | |
| 921 | + break | |
| 826 | 922 | |
| 827 | 923 | # dump |
| 828 | 924 | try: | ... | ... |