Commit 5268bc721ad2ff42905f39d13e4ba4cf0bae414c
1 parent
2363247a
rtfobj: improved parsing to handle some malware tricks
Showing
1 changed file
with
36 additions
and
6 deletions
oletools/rtfobj.py
| ... | ... | @@ -47,8 +47,9 @@ http://www.decalage.info/python/oletools |
| 47 | 47 | # - extract OLE 1.0 objects |
| 48 | 48 | # - extract files from OLE Package objects |
| 49 | 49 | # 2016-04-01 v0.04 PL: - fixed logging output to use stdout instead of stderr |
| 50 | +# 2016-04-07 v0.45 PL: - improved parsing to handle some malware tricks | |
| 50 | 51 | |
| 51 | -__version__ = '0.04' | |
| 52 | +__version__ = '0.45' | |
| 52 | 53 | |
| 53 | 54 | #------------------------------------------------------------------------------ |
| 54 | 55 | # TODO: |
| ... | ... | @@ -110,9 +111,24 @@ log = get_logger('rtfobj') |
| 110 | 111 | #=== CONSTANTS================================================================= |
| 111 | 112 | |
| 112 | 113 | # REGEX pattern to extract embedded OLE objects in hexadecimal format: |
| 114 | + | |
| 113 | 115 | # hex digit: [0-9A-Fa-f] |
| 116 | +HEX_DIGIT = r'[0-9A-Fa-f]' | |
| 117 | + | |
| 114 | 118 | # hex char = two hex digits: [0-9A-Fa-f]{2} |
| 115 | -HEX_CHAR = r'[0-9A-Fa-f]{2}' | |
| 119 | +# HEX_CHAR = r'[0-9A-Fa-f]{2}' | |
| 120 | +# in fact MS Word allows whitespace between the two hex digits! | |
| 121 | +# HEX_CHAR = r'[0-9A-Fa-f]\s*[0-9A-Fa-f]' | |
| 122 | +# Even worse, MS Word also allows ANY RTF-style tag {*} in between!! | |
| 123 | +# AND the tags can be nested... | |
| 124 | +SINGLE_RTF_TAG = r'[{][^{}]*[}]' | |
| 125 | +# Nested tags, two levels (because Python's re does not support nested matching): | |
| 126 | +NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]' | |
| 127 | +# ignored whitespaces and tags within a hex block: | |
| 128 | +IGNORED = r'(?:\s|'+NESTED_RTF_TAG+r')*' | |
| 129 | + | |
| 130 | +HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT | |
| 131 | + | |
| 116 | 132 | # several hex chars, at least 4: (?:[0-9A-Fa-f]{2}){4,} |
| 117 | 133 | # + word boundaries |
| 118 | 134 | HEX_CHARS_4orMORE = r'\b(?:' + HEX_CHAR + r'){4,}\b' |
| ... | ... | @@ -124,7 +140,9 @@ HEX_CHARS_1orMORE_WHITESPACES = r'(?:' + HEX_CHAR + r')+\s+' |
| 124 | 140 | # HEX_CHARS_1orMORE_WHITESPACES = r'\b(?:' + HEX_CHAR + r')+\b\s*' |
| 125 | 141 | # at least one block of hex and whitespace chars, followed by closing curly bracket: |
| 126 | 142 | # HEX_BLOCK_CURLY_BRACKET = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')+\}' |
| 127 | -PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE | |
| 143 | +# PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE | |
| 144 | + | |
| 145 | +PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b' | |
| 128 | 146 | |
| 129 | 147 | # at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s* |
| 130 | 148 | # PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}' |
| ... | ... | @@ -135,18 +153,19 @@ PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE |
| 135 | 153 | TRANSTABLE_NOCHANGE = string.maketrans('', '') |
| 136 | 154 | |
| 137 | 155 | re_hexblock = re.compile(PATTERN) |
| 156 | +re_embedded_tags = re.compile(IGNORED) | |
| 138 | 157 | re_decimal = re.compile(r'\d+') |
| 139 | 158 | |
| 140 | 159 | re_delimiter = re.compile(r'[ \t\r\n\f\v]') |
| 141 | 160 | |
| 142 | 161 | DELIMITER = r'[ \t\r\n\f\v]' |
| 143 | 162 | DELIMITERS_ZeroOrMore = r'[ \t\r\n\f\v]*' |
| 144 | -ANTISLASH_BIN = r'\bin' | |
| 163 | +BACKSLASH_BIN = r'\\bin' | |
| 145 | 164 | # According to my tests, Word accepts up to 250 digits (leading zeroes) |
| 146 | 165 | DECIMAL_GROUP = r'(\d{1,250})' |
| 147 | 166 | |
| 148 | -re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + ANTISLASH_BIN | |
| 149 | - + DECIMAL_GROUP + DELIMITER) | |
| 167 | +re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + BACKSLASH_BIN | |
| 168 | + + DECIMAL_GROUP + DELIMITER) | |
| 150 | 169 | re_delim_hexblock = re.compile(DELIMITER + PATTERN) |
| 151 | 170 | |
| 152 | 171 | |
| ... | ... | @@ -206,10 +225,18 @@ def rtf_iter_objects (data, min_size=32): |
| 206 | 225 | match = re_hexblock.search(data, pos=current) |
| 207 | 226 | continue |
| 208 | 227 | log.debug('Found hex block starting at %08X, end %08X' % (start, current)) |
| 228 | + log.debug('Match: %s' % found) | |
| 209 | 229 | # remove all whitespace and line feeds: |
| 210 | 230 | #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE |
| 211 | 231 | found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') |
| 232 | + # TODO: make it a function | |
| 233 | + # Also remove embedded RTF tags: | |
| 234 | + found = re_embedded_tags.sub('', found) | |
| 212 | 235 | # object data extracted from the RTF file |
| 236 | + # MS Word accepts an extra hex digit, so we need to trim it if present: | |
| 237 | + if len(found) & 1: | |
| 238 | + found = found[:-1] | |
| 239 | + log.debug('Cleaned match: %s' % found) | |
| 213 | 240 | objdata = binascii.unhexlify(found) |
| 214 | 241 | # Detect the "\bin" control word, which is sometimes used for obfuscation: |
| 215 | 242 | bin_match = re_delims_bin_decimal.match(data, pos=current) |
| ... | ... | @@ -236,9 +263,12 @@ def rtf_iter_objects (data, min_size=32): |
| 236 | 263 | log.debug('Found next hex block starting at %08X, end %08X' |
| 237 | 264 | % (match.start(), match.end())) |
| 238 | 265 | found = match.group(0) |
| 266 | + log.debug('Match: %s' % found) | |
| 239 | 267 | # remove all whitespace and line feeds: |
| 240 | 268 | #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE |
| 241 | 269 | found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') |
| 270 | + # Also remove embedded RTF tags: | |
| 271 | + found = re_embedded_tags.sub('', found) | |
| 242 | 272 | objdata += binascii.unhexlify(found) |
| 243 | 273 | current = match.end() |
| 244 | 274 | bin_match = re_delims_bin_decimal.match(data, pos=current) | ... | ... |