Commit cb41b344f001359e56319d631e09d7e70129079a
Committed by
GitHub
Merge pull request #451 from christian-intra2net/oleobj-preserve-suffix
Oleobj preserve file extension
Showing
2 changed files
with
174 additions
and
19 deletions
oletools/oleobj.py
| ... | ... | @@ -51,6 +51,7 @@ import re |
| 51 | 51 | import sys |
| 52 | 52 | import io |
| 53 | 53 | from zipfile import is_zipfile |
| 54 | +import random | |
| 54 | 55 | |
| 55 | 56 | import olefile |
| 56 | 57 | |
| ... | ... | @@ -230,6 +231,12 @@ BLACKLISTED_RELATIONSHIP_TYPES = [ |
| 230 | 231 | 'worksheet' |
| 231 | 232 | ] |
| 232 | 233 | |
| 234 | +# Safe maximum length of a filename | |
| 235 | +MAX_FILENAME_LENGTH = 255 | |
| 236 | + | |
| 237 | +# Max attempts at generating a non-existent random file name | |
| 238 | +MAX_FILENAME_ATTEMPTS = 100 | |
| 239 | + | |
| 233 | 240 | # === FUNCTIONS =============================================================== |
| 234 | 241 | |
| 235 | 242 | |
| ... | ... | @@ -494,11 +501,40 @@ class OleObject(object): |
| 494 | 501 | self.extra_data = data[index+self.data_size:] |
| 495 | 502 | |
| 496 | 503 | |
| 497 | -def sanitize_filename(filename, replacement='_', max_length=200): | |
| 498 | - """compute basename of filename. Replaces all non-whitelisted characters. | |
| 499 | - The returned filename is always a ascii basename of the file.""" | |
def shorten_filename(fname, max_len):
    """
    Shorten filename to at most max_len characters, trying to keep its suffix.

    A falsy `max_len` (0 or None) means "no limit" and returns `fname`
    unchanged. A negative `max_len` leaves no room for any character, so the
    empty string is returned (slicing with a negative bound would otherwise
    cut characters off the end of the name instead of limiting its length).

    The suffix is everything from the last '.' on. If the suffix (including
    the dot) fits into `max_len`, the part before it is truncated; otherwise
    the whole name is simply cut after `max_len` characters.
    """
    # simple cases:
    if not max_len:
        return fname        # no limit requested
    if max_len < 0:
        return ''           # nothing can fit
    name_len = len(fname)
    if name_len <= max_len:
        return fname        # fits already, nothing to do

    idx = fname.rfind('.')
    suffix_len = name_len - idx   # length of suffix including '.'
    if idx == -1 or suffix_len > max_len:
        # no suffix at all or suffix alone is too long: plain truncation
        return fname[:max_len]

    # great, can preserve suffix
    return fname[:max_len-suffix_len] + fname[idx:]
| 523 | + | |
| 524 | + | |
| 525 | +def sanitize_filename(filename, replacement='_', | |
| 526 | + max_len=MAX_FILENAME_LENGTH): | |
| 527 | + """ | |
| 528 | + Return filename that is safe to work with. | |
| 529 | + | |
| 530 | + Removes path components, replaces all non-whitelisted characters (so output | |
| 531 | + is always a pure-ascii string), replaces '..' and ' ' and shortens to | |
| 532 | + given max length, trying to preserve suffix. | |
| 533 | + | |
| 534 | + Might return empty string | |
| 535 | + """ | |
| 500 | 536 | basepath = os.path.basename(filename).strip() |
| 501 | - sane_fname = re.sub(u'[^a-zA-Z0-9.\\-_ ]', replacement, basepath) | |
| 537 | + sane_fname = re.sub(u'[^a-zA-Z0-9.\-_ ]', replacement, basepath) | |
| 502 | 538 | sane_fname = str(sane_fname) # py3: does nothing; py2: unicode --> str |
| 503 | 539 | |
| 504 | 540 | while ".." in sane_fname: |
| ... | ... | @@ -507,14 +543,71 @@ def sanitize_filename(filename, replacement='_', max_length=200): |
| 507 | 543 | while " " in sane_fname: |
| 508 | 544 | sane_fname = sane_fname.replace(' ', ' ') |
| 509 | 545 | |
| 510 | - if not filename: | |
| 511 | - sane_fname = 'NONAME' | |
| 546 | + # limit filename length, try to preserve suffix | |
| 547 | + return shorten_filename(sane_fname, max_len) | |
| 548 | + | |
| 512 | 549 | |
| 513 | - # limit filename length | |
| 514 | - if max_length: | |
| 515 | - sane_fname = sane_fname[:max_length] | |
def get_sane_embedded_filenames(filename, src_path, tmp_path, max_len,
                                noname_index):
    """
    Get some sane filenames out of path information, preserving file suffix.

    This is a generator. It yields several candidates in order of preference:

    1. sanitized names (from `filename`, `src_path`, `tmp_path`, in that
       order) that have a short suffix,
    2. sanitized names without a (short) suffix,
    3. random lower-case names, combined with each suffix found in step 1
       and finally with no suffix, at most MAX_FILENAME_ATTEMPTS rounds,
    4. one last attempt that ignores `max_len` and is built from
       `noname_index`.

    Candidates are not de-duplicated, the same name may be yielded twice.

    In some malware examples, filename (on which we relied so far exclusively
    for this) is empty or " ", but src_path and tmp_path contain paths with
    proper file names. Try to extract filename from any of those.

    Preservation of suffix is especially important since that controls how
    windows treats the file.
    """
    # longest suffix (including the '.') that is still treated as a suffix.
    # Dangerous suffixes are all short
    max_suffix_len = 5

    suffixes = []
    candidates_without_suffix = []  # remember these as fallback
    for candidate in (filename, src_path, tmp_path):
        if not candidate:
            continue   # None or empty: nothing to extract a name from

        # remove path component. Could be from linux, mac or windows
        idx = max(candidate.rfind('/'), candidate.rfind('\\'))
        candidate = candidate[idx+1:].strip()

        # sanitize
        candidate = sanitize_filename(candidate, max_len=max_len)

        if not candidate:
            continue    # skip whitespace-only

        # identify suffix; missing or overly long ones go to the fallback list
        idx = candidate.rfind('.')
        if idx == -1 or idx < len(candidate) - max_suffix_len:
            candidates_without_suffix.append(candidate)
            continue

        # remember suffix
        suffixes.append(candidate[idx:])

        yield candidate

    # parts with suffix not good enough? try those without one
    for candidate in candidates_without_suffix:
        yield candidate

    # then try random
    suffixes.append('')  # ensure there is something in there
    for _ in range(MAX_FILENAME_ATTEMPTS):
        for suffix in suffixes:
            leftover_len = max_len - len(suffix)
            if leftover_len < 1:
                continue   # no room for even a single random character
            name = ''.join(random.sample('abcdefghijklmnopqrstuvwxyz',
                                         min(26, leftover_len)))
            yield name + suffix

    # still not returned? Then we have to make up a name ourselves
    # do not care any more about max_len (maybe it was 0 or negative)
    yield 'oleobj_%03d' % noname_index
| 518 | 611 | |
| 519 | 612 | |
| 520 | 613 | def find_ole_in_ppt(filename): |
| ... | ... | @@ -666,7 +759,8 @@ def find_ole(filename, data, xml_parser=None): |
| 666 | 759 | if xml_parser is None: |
| 667 | 760 | xml_parser = XmlParser(arg_for_zip) |
| 668 | 761 | # force iteration so XmlParser.iter_non_xml() returns data |
| 669 | - [x for x in xml_parser.iter_xml()] | |
| 762 | + for _ in xml_parser.iter_xml(): | |
| 763 | + pass | |
| 670 | 764 | |
| 671 | 765 | log.info('is zip file: ' + filename) |
| 672 | 766 | # we looped through the XML files before, now we can |
| ... | ... | @@ -748,16 +842,17 @@ def process_file(filename, data, output_dir=None): |
| 748 | 842 | If output_dir is given and does not exist, it is created. If it is not |
| 749 | 843 | given, data is saved to same directory as the input file. |
| 750 | 844 | """ |
| 845 | + # sanitize filename, leave space for embedded filename part | |
| 846 | + sane_fname = sanitize_filename(filename, max_len=MAX_FILENAME_LENGTH-5) or\ | |
| 847 | + 'NONAME' | |
| 751 | 848 | if output_dir: |
| 752 | 849 | if not os.path.isdir(output_dir): |
| 753 | 850 | log.info('creating output directory %s', output_dir) |
| 754 | 851 | os.mkdir(output_dir) |
| 755 | 852 | |
| 756 | - fname_prefix = os.path.join(output_dir, | |
| 757 | - sanitize_filename(filename)) | |
| 853 | + fname_prefix = os.path.join(output_dir, sane_fname) | |
| 758 | 854 | else: |
| 759 | 855 | base_dir = os.path.dirname(filename) |
| 760 | - sane_fname = sanitize_filename(filename) | |
| 761 | 856 | fname_prefix = os.path.join(base_dir, sane_fname) |
| 762 | 857 | |
| 763 | 858 | # TODO: option to extract objects to files (false by default) |
| ... | ... | @@ -818,11 +913,12 @@ def process_file(filename, data, output_dir=None): |
| 818 | 913 | print(u'Filename = "%s"' % opkg.filename) |
| 819 | 914 | print(u'Source path = "%s"' % opkg.src_path) |
| 820 | 915 | print(u'Temp path = "%s"' % opkg.temp_path) |
| 821 | - if opkg.filename: | |
| 822 | - fname = '%s_%s' % (fname_prefix, | |
| 823 | - sanitize_filename(opkg.filename)) | |
| 824 | - else: | |
| 825 | - fname = '%s_object_%03d.noname' % (fname_prefix, index) | |
| 916 | + for embedded_fname in get_sane_embedded_filenames( | |
| 917 | + opkg.filename, opkg.src_path, opkg.temp_path, | |
| 918 | + MAX_FILENAME_LENGTH - len(sane_fname) - 1, index): | |
| 919 | + fname = fname_prefix + '_' + embedded_fname | |
| 920 | + if not os.path.isfile(fname): | |
| 921 | + break | |
| 826 | 922 | |
| 827 | 923 | # dump |
| 828 | 924 | try: | ... | ... |
tests/oleobj/test_basic.py
| ... | ... | @@ -159,6 +159,65 @@ class TestOleObj(unittest.TestCase): |
| 159 | 159 | only_run_every=4) |
| 160 | 160 | |
| 161 | 161 | |
class TestSaneFilenameCreation(unittest.TestCase):
    """ Test sanitization / creation of sane filenames """

    @staticmethod
    def _consume(names, count=10):
        """Pull `count` more names from generator; checks this does not crash"""
        for _ in range(count):
            next(names)

    def test_with_empty_inputs(self):
        """Test empty inputs lead to several non-empty distinct outputs"""
        names = oleobj.get_sane_embedded_filenames('', '', '', 10, 47)
        output = set()
        for _ in range(10):
            output.add(next(names))
        self.assertEqual(len(output), 10)      # check all 10 are different
        for fname in output:
            self.assertNotEqual(fname, '')     # all are non-empty

    def test_that_first_has_priority(self):
        """Test that a suffixed filename is yielded before the other inputs"""
        names = oleobj.get_sane_embedded_filenames('fname.sfx', 'do_not.use',
                                                   'do_not.use', 10, 47)
        self.assertEqual(next(names), 'fname.sfx')
        self._consume(names)

    def test_that_suffixed_have_priority(self):
        """Test that names with suffix come before names without suffix"""
        names = oleobj.get_sane_embedded_filenames('no_suffix', 'also_not',
                                                   'fname.sfx', 10, 47)
        self.assertEqual(next(names), 'fname.sfx')
        self.assertEqual(next(names), 'no_suffix')
        self.assertEqual(next(names), 'also_not')
        self._consume(names)

    def test_with_hardly_any_length(self):
        """Test that with very small max_len the short suffix still survives"""
        names = oleobj.get_sane_embedded_filenames('fname.suffx', 'fname.sufx',
                                                   'fname.sfx', 4, 47)
        self.assertEqual(next(names), '.sfx')
        self._consume(names)

    def test_with_mean_unicode(self):
        """Test that non-ascii characters are replaced in the candidates"""
        uni_name1 = u'\xfcnic\xf6de-\xdftring'
        uni_name2 = u'keyboard:\u2328, Braille:\u2800, Phone:\u260e'
        names = oleobj.get_sane_embedded_filenames(uni_name1, uni_name2,
                                                   'regular_txt', 30, 47)
        self.assertEqual(next(names), '_nic_de-_tring')
        self.assertEqual(next(names), 'keyboard___ Braille___ Phone__')
        self.assertEqual(next(names), 'regular_txt')
        self._consume(names)

    def test_last_resort(self):
        """Test that the very last candidate is built from the given index"""
        names = oleobj.get_sane_embedded_filenames('', '', '', 10, 47)
        all_options = list(names)
        self.assertEqual(len(all_options), oleobj.MAX_FILENAME_ATTEMPTS+1)
        self.assertIn('47', all_options[-1])

    def test_realworld_lnk_example(self):
        """Test sample where only src_path and tmp_path hold a usable name"""
        fname = ' '
        src_path = 'E:\\tmp\\doc_package\\doc\\6.lnk'
        tmp_path = 'C:\\Users\\1\\AppData\\Local\\Temp\\6.lnk'
        names = oleobj.get_sane_embedded_filenames(fname, src_path, tmp_path,
                                                   30, 47)
        self.assertEqual(next(names), '6.lnk')
        self.assertEqual(next(names), '6.lnk')
        self._consume(names)
| 219 | + | |
| 220 | + | |
| 162 | 221 | # just in case somebody calls this file as a script |
| 163 | 222 | if __name__ == '__main__': |
| 164 | 223 | unittest.main() | ... | ... |