Commit 23ffafb33dc68e43b281045300ff219120f82ff9
1 parent
5f8e0b88
rtfobj: improved parsing, fixed issue #42
Showing 1 changed file with 16 additions and 9 deletions
oletools/rtfobj.py
| ... | ... | @@ -51,6 +51,7 @@ http://www.decalage.info/python/oletools |
| 51 | 51 | # 2016-05-06 v0.47 TJ: - added option -d to set the output directory |
| 52 | 52 | # (contribution by Thomas Jarosch) |
| 53 | 53 | # TJ: - sanitize filenames to avoid special characters |
| 54 | +# 2016-05-29 PL: - improved parsing, fixed issue #42 | |
| 54 | 55 | |
| 55 | 56 | __version__ = '0.47' |
| 56 | 57 | |
| ... | ... | @@ -129,23 +130,26 @@ SINGLE_RTF_TAG = r'[{][^{}]*[}]' |
| 129 | 130 | NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]' |
| 130 | 131 | # ignored whitespaces and tags within a hex block: |
| 131 | 132 | IGNORED = r'(?:\s|'+NESTED_RTF_TAG+r')*' |
| 133 | +#IGNORED = r'\s*' | |
| 132 | 134 | |
| 133 | -HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT | |
| 135 | +# HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT | |
| 134 | 136 | |
| 135 | 137 | # several hex chars, at least 4: (?:[0-9A-Fa-f]{2}){4,} |
| 136 | 138 | # + word boundaries |
| 137 | -HEX_CHARS_4orMORE = r'\b(?:' + HEX_CHAR + r'){4,}\b' | |
| 139 | +# HEX_CHARS_4orMORE = r'\b(?:' + HEX_CHAR + r'){4,}\b' | |
| 138 | 140 | # at least 1 hex char: |
| 139 | -HEX_CHARS_1orMORE = r'(?:' + HEX_CHAR + r')+' | |
| 141 | +# HEX_CHARS_1orMORE = r'(?:' + HEX_CHAR + r')+' | |
| 140 | 142 | # at least 1 hex char, followed by whitespace or CR/LF: |
| 141 | -HEX_CHARS_1orMORE_WHITESPACES = r'(?:' + HEX_CHAR + r')+\s+' | |
| 143 | +# HEX_CHARS_1orMORE_WHITESPACES = r'(?:' + HEX_CHAR + r')+\s+' | |
| 142 | 144 | # + word boundaries around hex block |
| 143 | 145 | # HEX_CHARS_1orMORE_WHITESPACES = r'\b(?:' + HEX_CHAR + r')+\b\s*' |
| 144 | 146 | # at least one block of hex and whitespace chars, followed by closing curly bracket: |
| 145 | 147 | # HEX_BLOCK_CURLY_BRACKET = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')+\}' |
| 146 | 148 | # PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE |
| 147 | 149 | |
| 148 | -PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b' | |
| 150 | +#TODO PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b' | |
| 151 | +# PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}' #+ HEX_CHAR + r'\b' | |
| 152 | +PATTERN = r'\b(?:' + HEX_DIGIT + IGNORED + r'){7,}' + HEX_DIGIT + r'\b' | |
| 149 | 153 | |
| 150 | 154 | # at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s* |
| 151 | 155 | # PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}' |
| ... | ... | @@ -216,6 +220,7 @@ def rtf_iter_objects (data, min_size=32): |
| 216 | 220 | # Search 1st occurrence of a hex block: |
| 217 | 221 | match = re_hexblock.search(data) |
| 218 | 222 | if match is None: |
| 223 | + log.debug('No hex block found.') | |
| 219 | 224 | # no hex block found |
| 220 | 225 | return |
| 221 | 226 | while match is not None: |
| ... | ... | @@ -224,11 +229,12 @@ def rtf_iter_objects (data, min_size=32): |
| 224 | 229 | start = match.start() |
| 225 | 230 | # current position |
| 226 | 231 | current = match.end() |
| 232 | + log.debug('Found hex block starting at %08X, end %08X, size=%d' % (start, current, len(found))) | |
| 227 | 233 | if len(found) < min_size: |
| 234 | + log.debug('Too small - size<%d, ignored.' % min_size) | |
| 228 | 235 | match = re_hexblock.search(data, pos=current) |
| 229 | 236 | continue |
| 230 | - log.debug('Found hex block starting at %08X, end %08X' % (start, current)) | |
| 231 | - log.debug('Match: %s' % found) | |
| 237 | + #log.debug('Match: %s' % found) | |
| 232 | 238 | # remove all whitespace and line feeds: |
| 233 | 239 | #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE |
| 234 | 240 | found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') |
| ... | ... | @@ -238,8 +244,9 @@ def rtf_iter_objects (data, min_size=32): |
| 238 | 244 | # object data extracted from the RTF file |
| 239 | 245 | # MS Word accepts an extra hex digit, so we need to trim it if present: |
| 240 | 246 | if len(found) & 1: |
| 247 | + log.debug('Odd length, trimmed last byte.') | |
| 241 | 248 | found = found[:-1] |
| 242 | - log.debug('Cleaned match: %s' % found) | |
| 249 | + #log.debug('Cleaned match: %s' % found) | |
| 243 | 250 | objdata = binascii.unhexlify(found) |
| 244 | 251 | # Detect the "\bin" control word, which is sometimes used for obfuscation: |
| 245 | 252 | bin_match = re_delims_bin_decimal.match(data, pos=current) |
| ... | ... | @@ -365,7 +372,7 @@ def process_file(container, filename, data, output_dir=None): |
| 365 | 372 | open(fname, 'wb').write(opkg.data) |
| 366 | 373 | except: |
| 367 | 374 | pass |
| 368 | - # log.exception('*** Not an OLE 1.0 Object') | |
| 375 | + log.exception('*** Not an OLE 1.0 Object') | |
| 369 | 376 | |
| 370 | 377 | |
| 371 | 378 | ... | ... |