Commit 5268bc721ad2ff42905f39d13e4ba4cf0bae414c
1 parent: 2363247a
rtfobj: improved parsing to handle some malware tricks
Showing 1 changed file with 36 additions and 6 deletions
oletools/rtfobj.py
| @@ -47,8 +47,9 @@ http://www.decalage.info/python/oletools | @@ -47,8 +47,9 @@ http://www.decalage.info/python/oletools | ||
| 47 | # - extract OLE 1.0 objects | 47 | # - extract OLE 1.0 objects |
| 48 | # - extract files from OLE Package objects | 48 | # - extract files from OLE Package objects |
| 49 | # 2016-04-01 v0.04 PL: - fixed logging output to use stdout instead of stderr | 49 | # 2016-04-01 v0.04 PL: - fixed logging output to use stdout instead of stderr |
| 50 | +# 2016-04-07 v0.45 PL: - improved parsing to handle some malware tricks | ||
| 50 | 51 | ||
| 51 | -__version__ = '0.04' | 52 | +__version__ = '0.45' |
| 52 | 53 | ||
| 53 | #------------------------------------------------------------------------------ | 54 | #------------------------------------------------------------------------------ |
| 54 | # TODO: | 55 | # TODO: |
| @@ -110,9 +111,24 @@ log = get_logger('rtfobj') | @@ -110,9 +111,24 @@ log = get_logger('rtfobj') | ||
| 110 | #=== CONSTANTS================================================================= | 111 | #=== CONSTANTS================================================================= |
| 111 | 112 | ||
| 112 | # REGEX pattern to extract embedded OLE objects in hexadecimal format: | 113 | # REGEX pattern to extract embedded OLE objects in hexadecimal format: |
| 114 | + | ||
| 113 | # alphanum digit: [0-9A-Fa-f] | 115 | # alphanum digit: [0-9A-Fa-f] |
| 116 | +HEX_DIGIT = r'[0-9A-Fa-f]' | ||
| 117 | + | ||
| 114 | # hex char = two alphanum digits: [0-9A-Fa-f]{2} | 118 | # hex char = two alphanum digits: [0-9A-Fa-f]{2} |
| 115 | -HEX_CHAR = r'[0-9A-Fa-f]{2}' | 119 | +# HEX_CHAR = r'[0-9A-Fa-f]{2}' |
| 120 | +# in fact MS Word allows whitespaces in between the hex digits! | ||
| 121 | +# HEX_CHAR = r'[0-9A-Fa-f]\s*[0-9A-Fa-f]' | ||
| 122 | +# Even worse, MS Word also allows ANY RTF-style tag {*} in between!! | ||
| 123 | +# AND the tags can be nested... | ||
| 124 | +SINGLE_RTF_TAG = r'[{][^{}]*[}]' | ||
| 125 | +# Nested tags, two levels (because Python's re does not support nested matching): | ||
| 126 | +NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]' | ||
| 127 | +# ignored whitespaces and tags within a hex block: | ||
| 128 | +IGNORED = r'(?:\s|'+NESTED_RTF_TAG+r')*' | ||
| 129 | + | ||
| 130 | +HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT | ||
| 131 | + | ||
| 116 | # several hex chars, at least 4: (?:[0-9A-Fa-f]{2}){4,} | 132 | # several hex chars, at least 4: (?:[0-9A-Fa-f]{2}){4,} |
| 117 | # + word boundaries | 133 | # + word boundaries |
| 118 | HEX_CHARS_4orMORE = r'\b(?:' + HEX_CHAR + r'){4,}\b' | 134 | HEX_CHARS_4orMORE = r'\b(?:' + HEX_CHAR + r'){4,}\b' |
| @@ -124,7 +140,9 @@ HEX_CHARS_1orMORE_WHITESPACES = r'(?:' + HEX_CHAR + r')+\s+' | @@ -124,7 +140,9 @@ HEX_CHARS_1orMORE_WHITESPACES = r'(?:' + HEX_CHAR + r')+\s+' | ||
| 124 | # HEX_CHARS_1orMORE_WHITESPACES = r'\b(?:' + HEX_CHAR + r')+\b\s*' | 140 | # HEX_CHARS_1orMORE_WHITESPACES = r'\b(?:' + HEX_CHAR + r')+\b\s*' |
| 125 | # at least one block of hex and whitespace chars, followed by closing curly bracket: | 141 | # at least one block of hex and whitespace chars, followed by closing curly bracket: |
| 126 | # HEX_BLOCK_CURLY_BRACKET = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')+\}' | 142 | # HEX_BLOCK_CURLY_BRACKET = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')+\}' |
| 127 | -PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE | 143 | +# PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE |
| 144 | + | ||
| 145 | +PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b' | ||
| 128 | 146 | ||
| 129 | # at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s* | 147 | # at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s* |
| 130 | # PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}' | 148 | # PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}' |
| @@ -135,18 +153,19 @@ PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE | @@ -135,18 +153,19 @@ PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE | ||
| 135 | TRANSTABLE_NOCHANGE = string.maketrans('', '') | 153 | TRANSTABLE_NOCHANGE = string.maketrans('', '') |
| 136 | 154 | ||
| 137 | re_hexblock = re.compile(PATTERN) | 155 | re_hexblock = re.compile(PATTERN) |
| 156 | +re_embedded_tags = re.compile(IGNORED) | ||
| 138 | re_decimal = re.compile(r'\d+') | 157 | re_decimal = re.compile(r'\d+') |
| 139 | 158 | ||
| 140 | re_delimiter = re.compile(r'[ \t\r\n\f\v]') | 159 | re_delimiter = re.compile(r'[ \t\r\n\f\v]') |
| 141 | 160 | ||
| 142 | DELIMITER = r'[ \t\r\n\f\v]' | 161 | DELIMITER = r'[ \t\r\n\f\v]' |
| 143 | DELIMITERS_ZeroOrMore = r'[ \t\r\n\f\v]*' | 162 | DELIMITERS_ZeroOrMore = r'[ \t\r\n\f\v]*' |
| 144 | -ANTISLASH_BIN = r'\bin' | 163 | +BACKSLASH_BIN = r'\bin' |
| 145 | # According to my tests, Word accepts up to 250 digits (leading zeroes) | 164 | # According to my tests, Word accepts up to 250 digits (leading zeroes) |
| 146 | DECIMAL_GROUP = r'(\d{1,250})' | 165 | DECIMAL_GROUP = r'(\d{1,250})' |
| 147 | 166 | ||
| 148 | -re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + ANTISLASH_BIN | ||
| 149 | - + DECIMAL_GROUP + DELIMITER) | 167 | +re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + BACKSLASH_BIN |
| 168 | + + DECIMAL_GROUP + DELIMITER) | ||
| 150 | re_delim_hexblock = re.compile(DELIMITER + PATTERN) | 169 | re_delim_hexblock = re.compile(DELIMITER + PATTERN) |
| 151 | 170 | ||
| 152 | 171 | ||
| @@ -206,10 +225,18 @@ def rtf_iter_objects (data, min_size=32): | @@ -206,10 +225,18 @@ def rtf_iter_objects (data, min_size=32): | ||
| 206 | match = re_hexblock.search(data, pos=current) | 225 | match = re_hexblock.search(data, pos=current) |
| 207 | continue | 226 | continue |
| 208 | log.debug('Found hex block starting at %08X, end %08X' % (start, current)) | 227 | log.debug('Found hex block starting at %08X, end %08X' % (start, current)) |
| 228 | + log.debug('Match: %s' % found) | ||
| 209 | # remove all whitespace and line feeds: | 229 | # remove all whitespace and line feeds: |
| 210 | #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE | 230 | #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE |
| 211 | found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') | 231 | found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') |
| 232 | + # TODO: make it a function | ||
| 233 | + # Also remove embedded RTF tags: | ||
| 234 | + found = re_embedded_tags.sub('', found) | ||
| 212 | # object data extracted from the RTF file | 235 | # object data extracted from the RTF file |
| 236 | + # MS Word accepts an extra hex digit, so we need to trim it if present: | ||
| 237 | + if len(found) & 1: | ||
| 238 | + found = found[:-1] | ||
| 239 | + log.debug('Cleaned match: %s' % found) | ||
| 213 | objdata = binascii.unhexlify(found) | 240 | objdata = binascii.unhexlify(found) |
| 214 | # Detect the "\bin" control word, which is sometimes used for obfuscation: | 241 | # Detect the "\bin" control word, which is sometimes used for obfuscation: |
| 215 | bin_match = re_delims_bin_decimal.match(data, pos=current) | 242 | bin_match = re_delims_bin_decimal.match(data, pos=current) |
| @@ -236,9 +263,12 @@ def rtf_iter_objects (data, min_size=32): | @@ -236,9 +263,12 @@ def rtf_iter_objects (data, min_size=32): | ||
| 236 | log.debug('Found next hex block starting at %08X, end %08X' | 263 | log.debug('Found next hex block starting at %08X, end %08X' |
| 237 | % (match.start(), match.end())) | 264 | % (match.start(), match.end())) |
| 238 | found = match.group(0) | 265 | found = match.group(0) |
| 266 | + log.debug('Match: %s' % found) | ||
| 239 | # remove all whitespace and line feeds: | 267 | # remove all whitespace and line feeds: |
| 240 | #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE | 268 | #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE |
| 241 | found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') | 269 | found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') |
| 270 | + # Also remove embedded RTF tags: | ||
| 271 | + found = re_embedded_tags.sub(found, '') | ||
| 242 | objdata += binascii.unhexlify(found) | 272 | objdata += binascii.unhexlify(found) |
| 243 | current = match.end() | 273 | current = match.end() |
| 244 | bin_match = re_delims_bin_decimal.match(data, pos=current) | 274 | bin_match = re_delims_bin_decimal.match(data, pos=current) |