Commit 6416b39aaa18b2efa23671bfdc5483c206ebfbf8
1 parent: 1cf591dd
rtfobj, oleobj: fixed Python 2.6+2.7+3.x support
Showing 2 changed files with 327 additions and 149 deletions
oletools/oleobj.py
| 1 | 1 | #!/usr/bin/env python |
| 2 | +from __future__ import print_function | |
| 2 | 3 | """ |
| 3 | 4 | oleobj.py |
| 4 | 5 | |
| ... | ... | @@ -14,7 +15,7 @@ http://www.decalage.info/python/oletools |
| 14 | 15 | |
| 15 | 16 | # === LICENSE ================================================================== |
| 16 | 17 | |
| 17 | -# oleobj is copyright (c) 2015 Philippe Lagadec (http://www.decalage.info) | |
| 18 | +# oleobj is copyright (c) 2015-2016 Philippe Lagadec (http://www.decalage.info) | |
| 18 | 19 | # All rights reserved. |
| 19 | 20 | # |
| 20 | 21 | # Redistribution and use in source and binary forms, with or without modification, |
| ... | ... | @@ -41,8 +42,11 @@ http://www.decalage.info/python/oletools |
| 41 | 42 | #------------------------------------------------------------------------------ |
| 42 | 43 | # CHANGELOG: |
| 43 | 44 | # 2015-12-05 v0.01 PL: - first version |
| 45 | +# 2016-06 PL: - added main and process_file (not working yet) | |
| 46 | +# 2016-07-18 v0.48 SL: - added Python 3.5 support | |
| 47 | +# 2016-07-19 PL: - fixed Python 2.6-7 support | |
| 44 | 48 | |
| 45 | -__version__ = '0.01' | |
| 49 | +__version__ = '0.48' | |
| 46 | 50 | |
| 47 | 51 | #------------------------------------------------------------------------------ |
| 48 | 52 | # TODO: |
| ... | ... | @@ -62,8 +66,10 @@ __version__ = '0.01' |
| 62 | 66 | |
| 63 | 67 | #--- IMPORTS ------------------------------------------------------------------ |
| 64 | 68 | |
| 65 | -import logging, struct | |
| 69 | +import logging, struct, optparse, os, re, sys | |
| 66 | 70 | |
| 71 | +from thirdparty.olefile import olefile | |
| 72 | +from thirdparty.xglob import xglob | |
| 67 | 73 | |
| 68 | 74 | # === LOGGING ================================================================= |
| 69 | 75 | |
| ... | ... | @@ -107,6 +113,18 @@ def get_logger(name, level=logging.CRITICAL+1): |
| 107 | 113 | log = get_logger('oleobj') |
| 108 | 114 | |
| 109 | 115 | |
| 116 | +# === CONSTANTS ============================================================== | |
| 117 | + | |
| 118 | +# some str methods on Python 2.x return characters, | |
| 119 | +# while the equivalent bytes methods return integers on Python 3.x: | |
| 120 | +if sys.version_info[0] <= 2: | |
| 121 | + # Python 2.x | |
| 122 | + NULL_CHAR = '\x00' | |
| 123 | +else: | |
| 124 | + # Python 3.x | |
| 125 | + NULL_CHAR = 0 | |
| 126 | + | |
| 127 | + | |
| 110 | 128 | # === GLOBAL VARIABLES ======================================================= |
| 111 | 129 | |
| 112 | 130 | # struct to parse an unsigned integer of 32 bits: |
| ... | ... | @@ -162,7 +180,7 @@ def read_LengthPrefixedAnsiString(data): |
| 162 | 180 | ansi_string = data[:length-1] |
| 163 | 181 | # TODO: only in strict mode: |
| 164 | 182 | # check the presence of the null char: |
| 165 | - assert data[length] == 0 | |
| 183 | + assert data[length] == NULL_CHAR | |
| 166 | 184 | new_data = data[length:] |
| 167 | 185 | return (ansi_string, new_data) |
| 168 | 186 | |
| ... | ... | @@ -285,3 +303,149 @@ class OleObject (object): |
| 285 | 303 | self.data = data[:self.data_size] |
| 286 | 304 | assert len(self.data) == self.data_size |
| 287 | 305 | self.extra_data = data[self.data_size:] |
| 306 | + | |
| 307 | + | |
| 308 | + | |
| 309 | +def sanitize_filename(filename, replacement='_', max_length=200): | |
| 310 | + """compute basename of filename. Replaces all non-whitelisted characters. | |
| 311 | + The returned filename is always a basename of the file.""" | |
| 312 | + basepath = os.path.basename(filename).strip() | |
| 313 | + sane_fname = re.sub(r'[^\w\.\- ]', replacement, basepath) | |
| 314 | + | |
| 315 | + while ".." in sane_fname: | |
| 316 | + sane_fname = sane_fname.replace('..', '.') | |
| 317 | + | |
| 318 | + while " " in sane_fname: | |
| 319 | + sane_fname = sane_fname.replace(' ', ' ') | |
| 320 | + | |
| 321 | + if not len(filename): | |
| 322 | + sane_fname = 'NONAME' | |
| 323 | + | |
| 324 | + # limit filename length | |
| 325 | + if max_length: | |
| 326 | + sane_fname = sane_fname[:max_length] | |
| 327 | + | |
| 328 | + return sane_fname | |
| 329 | + | |
| 330 | + | |
| 331 | +def process_file(container, filename, data, output_dir=None): | |
| 332 | + if output_dir: | |
| 333 | + if not os.path.isdir(output_dir): | |
| 334 | + log.info('creating output directory %s' % output_dir) | |
| 335 | + os.mkdir(output_dir) | |
| 336 | + | |
| 337 | + fname_prefix = os.path.join(output_dir, | |
| 338 | + sanitize_filename(filename)) | |
| 339 | + else: | |
| 340 | + base_dir = os.path.dirname(filename) | |
| 341 | + sane_fname = sanitize_filename(filename) | |
| 342 | + fname_prefix = os.path.join(base_dir, sane_fname) | |
| 343 | + | |
| 344 | + # TODO: option to extract objects to files (false by default) | |
| 345 | + if data is None: | |
| 346 | + data = open(filename, 'rb').read() | |
| 347 | + print ('-'*79) | |
| 348 | + print ('File: %r - %d bytes' % (filename, len(data))) | |
| 349 | + ole = olefile.OleFileIO(data) | |
| 350 | + index = 1 | |
| 351 | + for stream in ole.listdir(): | |
| 352 | + objdata = ole.openstream(stream).read() | |
| 353 | + stream_path = '/'.join(stream) | |
| 354 | + log.debug('Checking stream %r' % stream_path) | |
| 355 | + obj = OleObject() | |
| 356 | + try: | |
| 357 | + obj.parse(objdata) | |
| 358 | + print('extract file embedded in OLE object from stream %r:' % stream_path) | |
| 359 | + print('format_id = %d' % obj.format_id) | |
| 360 | + print('class name = %r' % obj.class_name) | |
| 361 | + print('data size = %d' % obj.data_size) | |
| 362 | + # set a file extension according to the class name: | |
| 363 | + class_name = obj.class_name.lower() | |
| 364 | + if class_name.startswith('word'): | |
| 365 | + ext = 'doc' | |
| 366 | + elif class_name.startswith('package'): | |
| 367 | + ext = 'package' | |
| 368 | + else: | |
| 369 | + ext = 'bin' | |
| 370 | + | |
| 371 | + fname = '%s_object_%03d.%s' % (fname_prefix, index, ext) | |
| 372 | + print ('saving to file %s' % fname) | |
| 373 | + open(fname, 'wb').write(obj.data) | |
| 374 | + if obj.class_name.lower() == 'package': | |
| 375 | + print ('Parsing OLE Package') | |
| 376 | + opkg = OleNativeStream(bindata=obj.data) | |
| 377 | + print ('Filename = %r' % opkg.filename) | |
| 378 | + print ('Source path = %r' % opkg.src_path) | |
| 379 | + print ('Temp path = %r' % opkg.temp_path) | |
| 380 | + if opkg.filename: | |
| 381 | + fname = '%s_%s' % (fname_prefix, | |
| 382 | + sanitize_filename(opkg.filename)) | |
| 383 | + else: | |
| 384 | + fname = '%s_object_%03d.noname' % (fname_prefix, index) | |
| 385 | + print ('saving to file %s' % fname) | |
| 386 | + open(fname, 'wb').write(opkg.data) | |
| 387 | + index += 1 | |
| 388 | + except: | |
| 389 | + log.info('*** Not an OLE 1.0 Object') | |
| 390 | + | |
| 391 | + | |
| 392 | + | |
| 393 | +#=== MAIN ================================================================= | |
| 394 | + | |
| 395 | +if __name__ == '__main__': | |
| 396 | + # print banner with version | |
| 397 | + print ('oleobj %s - http://decalage.info/oletools' % __version__) | |
| 398 | + print ('THIS IS WORK IN PROGRESS - Check updates regularly!') | |
| 399 | + print ('Please report any issue at https://github.com/decalage2/oletools/issues') | |
| 400 | + print ('') | |
| 401 | + | |
| 402 | + DEFAULT_LOG_LEVEL = "warning" # Default log level | |
| 403 | + LOG_LEVELS = {'debug': logging.DEBUG, | |
| 404 | + 'info': logging.INFO, | |
| 405 | + 'warning': logging.WARNING, | |
| 406 | + 'error': logging.ERROR, | |
| 407 | + 'critical': logging.CRITICAL | |
| 408 | + } | |
| 409 | + | |
| 410 | + usage = 'usage: %prog [options] <filename> [filename2 ...]' | |
| 411 | + parser = optparse.OptionParser(usage=usage) | |
| 412 | + # parser.add_option('-o', '--outfile', dest='outfile', | |
| 413 | + # help='output file') | |
| 414 | + # parser.add_option('-c', '--csv', dest='csv', | |
| 415 | + # help='export results to a CSV file') | |
| 416 | + parser.add_option("-r", action="store_true", dest="recursive", | |
| 417 | + help='find files recursively in subdirectories.') | |
| 418 | + parser.add_option("-d", type="str", dest="output_dir", | |
| 419 | + help='use specified directory to output files.', default=None) | |
| 420 | + parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None, | |
| 421 | + help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)') | |
| 422 | + parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*', | |
| 423 | + help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)') | |
| 424 | + parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, | |
| 425 | + help="logging level debug/info/warning/error/critical (default=%default)") | |
| 426 | + | |
| 427 | + (options, args) = parser.parse_args() | |
| 428 | + | |
| 429 | + # Print help if no arguments are passed | |
| 430 | + if len(args) == 0: | |
| 431 | + print (__doc__) | |
| 432 | + parser.print_help() | |
| 433 | + sys.exit() | |
| 434 | + | |
| 435 | + # Setup logging to the console: | |
| 436 | + # here we use stdout instead of stderr by default, so that the output | |
| 437 | + # can be redirected properly. | |
| 438 | + logging.basicConfig(level=LOG_LEVELS[options.loglevel], stream=sys.stdout, | |
| 439 | + format='%(levelname)-8s %(message)s') | |
| 440 | + # enable logging in the modules: | |
| 441 | + log.setLevel(logging.NOTSET) | |
| 442 | + | |
| 443 | + | |
| 444 | + for container, filename, data in xglob.iter_files(args, recursive=options.recursive, | |
| 445 | + zip_password=options.zip_password, zip_fname=options.zip_fname): | |
| 446 | + # ignore directory names stored in zip files: | |
| 447 | + if container and filename.endswith('/'): | |
| 448 | + continue | |
| 449 | + process_file(container, filename, data, options.output_dir) | |
| 450 | + | |
| 451 | + | ... | ... |
oletools/rtfobj.py
| ... | ... | @@ -55,18 +55,20 @@ http://www.decalage.info/python/oletools |
| 55 | 55 | # TJ: - sanitize filenames to avoid special characters |
| 56 | 56 | # 2016-05-29 PL: - improved parsing, fixed issue #42 |
| 57 | 57 | # 2016-07-13 v0.48 PL: - new RtfParser and RtfObjParser classes |
| 58 | +# 2016-07-18 SL: - added Python 3.5 support | |
| 59 | +# 2016-07-19 PL: - fixed Python 2.6-2.7 support | |
| 58 | 60 | |
| 59 | 61 | __version__ = '0.48' |
| 60 | 62 | |
| 61 | -#------------------------------------------------------------------------------ | |
| 63 | +# ------------------------------------------------------------------------------ | |
| 62 | 64 | # TODO: |
| 63 | 65 | # - allow semicolon within hex, as found in this sample: |
| 64 | 66 | # http://contagiodump.blogspot.nl/2011/10/sep-28-cve-2010-3333-manuscript-with.html |
| 65 | 67 | |
| 66 | 68 | |
| 67 | -#=== IMPORTS ================================================================= | |
| 69 | +# === IMPORTS ================================================================= | |
| 68 | 70 | |
| 69 | -import re, os, sys, string, binascii, logging, optparse | |
| 71 | +import re, os, sys, binascii, logging, optparse | |
| 70 | 72 | |
| 71 | 73 | from thirdparty.xglob import xglob |
| 72 | 74 | from oleobj import OleObject, OleNativeStream |
| ... | ... | @@ -120,7 +122,7 @@ log = get_logger('rtfobj') |
| 120 | 122 | # REGEX pattern to extract embedded OLE objects in hexadecimal format: |
| 121 | 123 | |
| 122 | 124 | # alphanum digit: [0-9A-Fa-f] |
| 123 | -HEX_DIGIT = rb'[0-9A-Fa-f]' | |
| 125 | +HEX_DIGIT = b'[0-9A-Fa-f]' | |
| 124 | 126 | |
| 125 | 127 | # hex char = two alphanum digits: [0-9A-Fa-f]{2} |
| 126 | 128 | # HEX_CHAR = r'[0-9A-Fa-f]{2}' |
| ... | ... | @@ -130,11 +132,11 @@ HEX_DIGIT = rb'[0-9A-Fa-f]' |
| 130 | 132 | # AND the tags can be nested... |
| 131 | 133 | #SINGLE_RTF_TAG = r'[{][^{}]*[}]' |
| 132 | 134 | # Actually RTF tags may contain braces escaped with backslash (\{ \}): |
| 133 | -SINGLE_RTF_TAG = rb'[{](?:\\.|[^{}\])*[}]' | |
| 135 | +SINGLE_RTF_TAG = b'[{](?:\\\\.|[^{}\\\])*[}]' | |
| 134 | 136 | |
| 135 | 137 | # Nested tags, two levels (because Python's re does not support nested matching): |
| 136 | 138 | # NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]' |
| 137 | -NESTED_RTF_TAG = rb'[{](?:\\.|[^{}\]|'+SINGLE_RTF_TAG+b')*[}]' | |
| 139 | +NESTED_RTF_TAG = b'[{](?:\\\\.|[^{}\\\]|'+SINGLE_RTF_TAG+b')*[}]' | |
| 138 | 140 | |
| 139 | 141 | # AND it is also allowed to insert ANY control word or control symbol (ignored) |
| 140 | 142 | # According to Rich Text Format (RTF) Specification Version 1.9.1, |
| ... | ... | @@ -146,7 +148,7 @@ NESTED_RTF_TAG = rb'[{](?:\\.|[^{}\\]|'+SINGLE_RTF_TAG+b')*[}]' |
| 146 | 148 | # "\AnyThing " "\AnyThing123z" ""\AnyThing-456{" "\AnyThing{" |
| 147 | 149 | # control symbol = \<any char except letter or digit> (followed by anything) |
| 148 | 150 | |
| 149 | -ASCII_NAME = rb'([a-zA-Z]{1,250})' | |
| 151 | +ASCII_NAME = b'([a-zA-Z]{1,250})' | |
| 150 | 152 | |
| 151 | 153 | # using Python's re lookahead assumption: |
| 152 | 154 | # (?=...) Matches if ... matches next, but doesn't consume any of the string. |
| ... | ... | @@ -155,21 +157,21 @@ ASCII_NAME = rb'([a-zA-Z]{1,250})' |
| 155 | 157 | |
| 156 | 158 | # TODO: Find the actual limit on the number of digits for Word |
| 157 | 159 | # SIGNED_INTEGER = r'(-?\d{1,250})' |
| 158 | -SIGNED_INTEGER = rb'(-?\d+)' | |
| 160 | +SIGNED_INTEGER = b'(-?\\d+)' | |
| 159 | 161 | |
| 160 | -CONTROL_WORD = rb'(?:\\' + ASCII_NAME + rb'(?:(?=[^a-zA-Z0-9-])|' + SIGNED_INTEGER + rb'(?=[^0-9])))' | |
| 162 | +CONTROL_WORD = b'(?:\\\\' + ASCII_NAME + b'(?:(?=[^a-zA-Z0-9-])|' + SIGNED_INTEGER + b'(?=[^0-9])))' | |
| 161 | 163 | |
| 162 | 164 | re_control_word = re.compile(CONTROL_WORD) |
| 163 | 165 | |
| 164 | -CONTROL_SYMBOL = rb'(?:\[^a-zA-Z0-9])' | |
| 166 | +CONTROL_SYMBOL = b'(?:\\\[^a-zA-Z0-9])' | |
| 165 | 167 | re_control_symbol = re.compile(CONTROL_SYMBOL) |
| 166 | 168 | |
| 167 | 169 | # Text that is not a control word/symbol or a group: |
| 168 | -TEXT = rb'[^{}\]+' | |
| 170 | +TEXT = b'[^{}\\\]+' | |
| 169 | 171 | re_text = re.compile(TEXT) |
| 170 | 172 | |
| 171 | 173 | # ignored whitespaces and tags within a hex block: |
| 172 | -IGNORED = rb'(?:\s|'+NESTED_RTF_TAG+rb'|'+CONTROL_SYMBOL+rb'|'+CONTROL_WORD+rb')*' | |
| 174 | +IGNORED = b'(?:\\s|'+NESTED_RTF_TAG+b'|'+CONTROL_SYMBOL+b'|'+CONTROL_WORD+b')*' | |
| 173 | 175 | #IGNORED = r'\s*' |
| 174 | 176 | |
| 175 | 177 | # HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT |
| ... | ... | @@ -189,27 +191,24 @@ IGNORED = rb'(?:\s|'+NESTED_RTF_TAG+rb'|'+CONTROL_SYMBOL+rb'|'+CONTROL_WORD+rb') |
| 189 | 191 | |
| 190 | 192 | #TODO PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b' |
| 191 | 193 | # PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}' #+ HEX_CHAR + r'\b' |
| 192 | -PATTERN = rb'\b(?:' + HEX_DIGIT + IGNORED + rb'){7,}' + HEX_DIGIT + rb'\b' | |
| 194 | +PATTERN = b'\\b(?:' + HEX_DIGIT + IGNORED + b'){7,}' + HEX_DIGIT + b'\\b' | |
| 193 | 195 | |
| 194 | 196 | # at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s* |
| 195 | 197 | # PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}' |
| 196 | 198 | # improved pattern, allowing semicolons within hex: |
| 197 | 199 | #PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}' |
| 198 | 200 | |
| 199 | -# a dummy translation table for str.translate, which does not change anythying: | |
| 200 | -TRANSTABLE_NOCHANGE = bytes.maketrans(b'', b'') | |
| 201 | - | |
| 202 | 201 | re_hexblock = re.compile(PATTERN) |
| 203 | 202 | re_embedded_tags = re.compile(IGNORED) |
| 204 | -re_decimal = re.compile(rb'\d+') | |
| 203 | +re_decimal = re.compile(b'\\d+') | |
| 205 | 204 | |
| 206 | -re_delimiter = re.compile(rb'[ \t\r\n\f\v]') | |
| 205 | +re_delimiter = re.compile(b'[ \\t\\r\\n\\f\\v]') | |
| 207 | 206 | |
| 208 | -DELIMITER = rb'[ \t\r\n\f\v]' | |
| 209 | -DELIMITERS_ZeroOrMore = rb'[ \t\r\n\f\v]*' | |
| 210 | -BACKSLASH_BIN = rb'\\bin' | |
| 207 | +DELIMITER = b'[ \\t\\r\\n\\f\\v]' | |
| 208 | +DELIMITERS_ZeroOrMore = b'[ \\t\\r\\n\\f\\v]*' | |
| 209 | +BACKSLASH_BIN = b'\\\\bin' | |
| 211 | 210 | # According to my tests, Word accepts up to 250 digits (leading zeroes) |
| 212 | -DECIMAL_GROUP = rb'(\d{1,250})' | |
| 211 | +DECIMAL_GROUP = b'(\d{1,250})' | |
| 213 | 212 | |
| 214 | 213 | re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + BACKSLASH_BIN |
| 215 | 214 | + DECIMAL_GROUP + DELIMITER) |
| ... | ... | @@ -250,6 +249,19 @@ DESTINATION_CONTROL_WORDS = frozenset(( |
| 250 | 249 | )) |
| 251 | 250 | |
| 252 | 251 | |
| 252 | +# some str methods on Python 2.x return characters, | |
| 253 | +# while the equivalent bytes methods return integers on Python 3.x: | |
| 254 | +if sys.version_info[0] <= 2: | |
| 255 | + # Python 2.x - Characters (str) | |
| 256 | + BACKSLASH = '\\' | |
| 257 | + BRACE_OPEN = '{' | |
| 258 | + BRACE_CLOSE = '}' | |
| 259 | +else: | |
| 260 | + # Python 3.x - Integers | |
| 261 | + BACKSLASH = ord('\\') | |
| 262 | + BRACE_OPEN = ord('{') | |
| 263 | + BRACE_CLOSE = ord('}') | |
| 264 | + | |
| 253 | 265 | |
| 254 | 266 | #=== CLASSES ================================================================= |
| 255 | 267 | |
| ... | ... | @@ -294,15 +306,15 @@ class RtfParser(object): |
| 294 | 306 | def parse(self): |
| 295 | 307 | self.index = 0 |
| 296 | 308 | while self.index < self.size: |
| 297 | - if self.data[self.index] == ord('{'): | |
| 309 | + if self.data[self.index] == BRACE_OPEN: | |
| 298 | 310 | self._open_group() |
| 299 | 311 | self.index += 1 |
| 300 | 312 | continue |
| 301 | - if self.data[self.index] == ord('}'): | |
| 313 | + if self.data[self.index] == BRACE_CLOSE: | |
| 302 | 314 | self._close_group() |
| 303 | 315 | self.index += 1 |
| 304 | 316 | continue |
| 305 | - if self.data[self.index] == ord('\\'): | |
| 317 | + if self.data[self.index] == BACKSLASH: | |
| 306 | 318 | m = re_control_word.match(self.data, self.index) |
| 307 | 319 | if m: |
| 308 | 320 | cword = m.group(1) |
| ... | ... | @@ -332,7 +344,7 @@ class RtfParser(object): |
| 332 | 344 | |
| 333 | 345 | def _open_group(self): |
| 334 | 346 | self.group_level += 1 |
| 335 | - log.debug('{ Open Group at index %Xh - level=%d' % (self.index, self.group_level)) | |
| 347 | + #log.debug('{ Open Group at index %Xh - level=%d' % (self.index, self.group_level)) | |
| 336 | 348 | # call user method AFTER increasing the level: |
| 337 | 349 | self.open_group() |
| 338 | 350 | |
| ... | ... | @@ -341,19 +353,20 @@ class RtfParser(object): |
| 341 | 353 | pass |
| 342 | 354 | |
| 343 | 355 | def _close_group(self): |
| 344 | - log.debug('} Close Group at index %Xh - level=%d' % (self.index, self.group_level)) | |
| 356 | + #log.debug('} Close Group at index %Xh - level=%d' % (self.index, self.group_level)) | |
| 345 | 357 | # call user method BEFORE decreasing the level: |
| 346 | 358 | self.close_group() |
| 347 | 359 | # if the destination level is the same as the group level, close the destination: |
| 348 | 360 | if self.group_level == self.current_destination.group_level: |
| 349 | - log.debug('Current Destination %r level = %d => Close Destination' % ( | |
| 350 | - self.current_destination.cword, self.current_destination.group_level)) | |
| 361 | + # log.debug('Current Destination %r level = %d => Close Destination' % ( | |
| 362 | + # self.current_destination.cword, self.current_destination.group_level)) | |
| 351 | 363 | self._close_destination() |
| 352 | 364 | else: |
| 353 | - log.debug('Current Destination %r level = %d => Continue with same Destination' % ( | |
| 354 | - self.current_destination.cword, self.current_destination.group_level)) | |
| 365 | + # log.debug('Current Destination %r level = %d => Continue with same Destination' % ( | |
| 366 | + # self.current_destination.cword, self.current_destination.group_level)) | |
| 367 | + pass | |
| 355 | 368 | self.group_level -= 1 |
| 356 | - log.debug('Decreased group level to %d' % self.group_level) | |
| 369 | + # log.debug('Decreased group level to %d' % self.group_level) | |
| 357 | 370 | |
| 358 | 371 | def close_group(self): |
| 359 | 372 | #log.debug('close group at index %Xh' % self.index) |
| ... | ... | @@ -369,7 +382,7 @@ class RtfParser(object): |
| 369 | 382 | self.current_destination = new_dest |
| 370 | 383 | # start of the destination is right after the control word: |
| 371 | 384 | new_dest.start = self.index + len(matchobject.group()) |
| 372 | - log.debug("Open Destination %r start=%Xh - level=%d" % (cword, new_dest.start, new_dest.group_level)) | |
| 385 | + # log.debug("Open Destination %r start=%Xh - level=%d" % (cword, new_dest.start, new_dest.group_level)) | |
| 373 | 386 | # call the corresponding user method for additional processing: |
| 374 | 387 | self.open_destination(self.current_destination) |
| 375 | 388 | |
| ... | ... | @@ -377,8 +390,8 @@ class RtfParser(object): |
| 377 | 390 | pass |
| 378 | 391 | |
| 379 | 392 | def _close_destination(self): |
| 380 | - log.debug("Close Destination %r end=%Xh - level=%d" % (self.current_destination.cword, | |
| 381 | - self.index, self.current_destination.group_level)) | |
| 393 | + # log.debug("Close Destination %r end=%Xh - level=%d" % (self.current_destination.cword, | |
| 394 | + # self.index, self.current_destination.group_level)) | |
| 382 | 395 | self.current_destination.end = self.index |
| 383 | 396 | # call the corresponding user method for additional processing: |
| 384 | 397 | self.close_destination(self.current_destination) |
| ... | ... | @@ -388,7 +401,8 @@ class RtfParser(object): |
| 388 | 401 | if len(self.destinations) > 0: |
| 389 | 402 | self.current_destination = self.destinations[-1] |
| 390 | 403 | else: |
| 391 | - log.debug('All destinations are closed, keeping the document destination open') | |
| 404 | + # log.debug('All destinations are closed, keeping the document destination open') | |
| 405 | + pass | |
| 392 | 406 | |
| 393 | 407 | def close_destination(self, destination): |
| 394 | 408 | pass |
| ... | ... | @@ -430,10 +444,10 @@ class RtfParser(object): |
| 430 | 444 | pass |
| 431 | 445 | |
| 432 | 446 | def _end_of_file(self): |
| 433 | - log.debug('%Xh Reached End of File') | |
| 447 | + # log.debug('%Xh Reached End of File') | |
| 434 | 448 | # close any group/destination that is still open: |
| 435 | 449 | while self.group_level > 0: |
| 436 | - log.debug('Group Level = %d, closing group' % self.group_level) | |
| 450 | + # log.debug('Group Level = %d, closing group' % self.group_level) | |
| 437 | 451 | self._close_group() |
| 438 | 452 | self.end_of_file() |
| 439 | 453 | |
| ... | ... | @@ -458,7 +472,7 @@ class RtfObjParser(RtfParser): |
| 458 | 472 | if destination.cword == b'objdata': |
| 459 | 473 | log.debug('*** Close object data at index %Xh' % self.index) |
| 460 | 474 | # Filter out all whitespaces first (just ignored): |
| 461 | - hexdata1 = destination.data.translate(TRANSTABLE_NOCHANGE, b' \t\r\n\f\v') | |
| 475 | + hexdata1 = destination.data.translate(None, b' \t\r\n\f\v') | |
| 462 | 476 | # Then filter out any other non-hex character: |
| 463 | 477 | hexdata = re.sub(b'[^a-hA-H0-9]', b'', hexdata1) |
| 464 | 478 | if len(hexdata) < len(hexdata1): |
| ... | ... | @@ -528,116 +542,116 @@ class RtfObjParser(RtfParser): |
| 528 | 542 | |
| 529 | 543 | #=== FUNCTIONS =============================================================== |
| 530 | 544 | |
| 531 | -def rtf_iter_objects_old (filename, min_size=32): | |
| 532 | - """ | |
| 533 | - Open a RTF file, extract each embedded object encoded in hexadecimal of | |
| 534 | - size > min_size, yield the index of the object in the RTF file and its data | |
| 535 | - in binary format. | |
| 536 | - This is an iterator. | |
| 537 | - """ | |
| 538 | - data = open(filename, 'rb').read() | |
| 539 | - for m in re.finditer(PATTERN, data): | |
| 540 | - found = m.group(0) | |
| 541 | - orig_len = len(found) | |
| 542 | - # remove all whitespace and line feeds: | |
| 543 | - #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE | |
| 544 | - found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v}') | |
| 545 | - found = binascii.unhexlify(found) | |
| 546 | - #print repr(found) | |
| 547 | - if len(found)>min_size: | |
| 548 | - yield m.start(), orig_len, found | |
| 545 | +# def rtf_iter_objects_old (filename, min_size=32): | |
| 546 | +# """ | |
| 547 | +# Open a RTF file, extract each embedded object encoded in hexadecimal of | |
| 548 | +# size > min_size, yield the index of the object in the RTF file and its data | |
| 549 | +# in binary format. | |
| 550 | +# This is an iterator. | |
| 551 | +# """ | |
| 552 | +# data = open(filename, 'rb').read() | |
| 553 | +# for m in re.finditer(PATTERN, data): | |
| 554 | +# found = m.group(0) | |
| 555 | +# orig_len = len(found) | |
| 556 | +# # remove all whitespace and line feeds: | |
| 557 | +# #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE | |
| 558 | +# found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v}') | |
| 559 | +# found = binascii.unhexlify(found) | |
| 560 | +# #print repr(found) | |
| 561 | +# if len(found)>min_size: | |
| 562 | +# yield m.start(), orig_len, found | |
| 549 | 563 | |
| 550 | 564 | # TODO: backward-compatible API? |
| 551 | 565 | |
| 552 | 566 | |
| 553 | -def search_hex_block(data, pos=0, min_size=32, first=True): | |
| 554 | - if first: | |
| 555 | - # Search 1st occurence of a hex block: | |
| 556 | - match = re_hexblock.search(data, pos=pos) | |
| 557 | - else: | |
| 558 | - # Match next occurences of a hex block, from the current position only: | |
| 559 | - match = re_hexblock.match(data, pos=pos) | |
| 560 | - | |
| 561 | - | |
| 562 | - | |
| 563 | -def rtf_iter_objects (data, min_size=32): | |
| 564 | - """ | |
| 565 | - Open a RTF file, extract each embedded object encoded in hexadecimal of | |
| 566 | - size > min_size, yield the index of the object in the RTF file and its data | |
| 567 | - in binary format. | |
| 568 | - This is an iterator. | |
| 569 | - """ | |
| 570 | - # Search 1st occurence of a hex block: | |
| 571 | - match = re_hexblock.search(data) | |
| 572 | - if match is None: | |
| 573 | - log.debug('No hex block found.') | |
| 574 | - # no hex block found | |
| 575 | - return | |
| 576 | - while match is not None: | |
| 577 | - found = match.group(0) | |
| 578 | - # start index | |
| 579 | - start = match.start() | |
| 580 | - # current position | |
| 581 | - current = match.end() | |
| 582 | - log.debug('Found hex block starting at %08X, end %08X, size=%d' % (start, current, len(found))) | |
| 583 | - if len(found) < min_size: | |
| 584 | - log.debug('Too small - size<%d, ignored.' % min_size) | |
| 585 | - match = re_hexblock.search(data, pos=current) | |
| 586 | - continue | |
| 587 | - #log.debug('Match: %s' % found) | |
| 588 | - # remove all whitespace and line feeds: | |
| 589 | - #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE | |
| 590 | - found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') | |
| 591 | - # TODO: make it a function | |
| 592 | - # Also remove embedded RTF tags: | |
| 593 | - found = re_embedded_tags.sub('', found) | |
| 594 | - # object data extracted from the RTF file | |
| 595 | - # MS Word accepts an extra hex digit, so we need to trim it if present: | |
| 596 | - if len(found) & 1: | |
| 597 | - log.debug('Odd length, trimmed last byte.') | |
| 598 | - found = found[:-1] | |
| 599 | - #log.debug('Cleaned match: %s' % found) | |
| 600 | - objdata = binascii.unhexlify(found) | |
| 601 | - # Detect the "\bin" control word, which is sometimes used for obfuscation: | |
| 602 | - bin_match = re_delims_bin_decimal.match(data, pos=current) | |
| 603 | - while bin_match is not None: | |
| 604 | - log.debug('Found \\bin block starting at %08X : %r' | |
| 605 | - % (bin_match.start(), bin_match.group(0))) | |
| 606 | - # extract the decimal integer following '\bin' | |
| 607 | - bin_len = int(bin_match.group(1)) | |
| 608 | - log.debug('\\bin block length = %d' % bin_len) | |
| 609 | - if current+bin_len > len(data): | |
| 610 | - log.error('\\bin block length is larger than the remaining data') | |
| 611 | - # move the current index, ignore the \bin block | |
| 612 | - current += len(bin_match.group(0)) | |
| 613 | - break | |
| 614 | - # read that number of bytes: | |
| 615 | - objdata += data[current:current+bin_len] | |
| 616 | - # TODO: handle exception | |
| 617 | - current += len(bin_match.group(0)) + bin_len | |
| 618 | - # TODO: check if current is out of range | |
| 619 | - # TODO: is Word limiting the \bin length to a number of digits? | |
| 620 | - log.debug('Current position = %08X' % current) | |
| 621 | - match = re_delim_hexblock.match(data, pos=current) | |
| 622 | - if match is not None: | |
| 623 | - log.debug('Found next hex block starting at %08X, end %08X' | |
| 624 | - % (match.start(), match.end())) | |
| 625 | - found = match.group(0) | |
| 626 | - log.debug('Match: %s' % found) | |
| 627 | - # remove all whitespace and line feeds: | |
| 628 | - #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE | |
| 629 | - found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') | |
| 630 | - # Also remove embedded RTF tags: | |
| 631 | - found = re_embedded_tags.sub(found, '') | |
| 632 | - objdata += binascii.unhexlify(found) | |
| 633 | - current = match.end() | |
| 634 | - bin_match = re_delims_bin_decimal.match(data, pos=current) | |
| 635 | - | |
| 636 | - # print repr(found) | |
| 637 | - if len(objdata)>min_size: | |
| 638 | - yield start, current-start, objdata | |
| 639 | - # Search next occurence of a hex block: | |
| 640 | - match = re_hexblock.search(data, pos=current) | |
| 567 | +# def search_hex_block(data, pos=0, min_size=32, first=True): | |
| 568 | +# if first: | |
| 569 | +# # Search 1st occurence of a hex block: | |
| 570 | +# match = re_hexblock.search(data, pos=pos) | |
| 571 | +# else: | |
| 572 | +# # Match next occurences of a hex block, from the current position only: | |
| 573 | +# match = re_hexblock.match(data, pos=pos) | |
| 574 | +# | |
| 575 | +# | |
| 576 | +# | |
| 577 | +# def rtf_iter_objects (data, min_size=32): | |
| 578 | +# """ | |
| 579 | +# Open a RTF file, extract each embedded object encoded in hexadecimal of | |
| 580 | +# size > min_size, yield the index of the object in the RTF file and its data | |
| 581 | +# in binary format. | |
| 582 | +# This is an iterator. | |
| 583 | +# """ | |
| 584 | +# # Search 1st occurence of a hex block: | |
| 585 | +# match = re_hexblock.search(data) | |
| 586 | +# if match is None: | |
| 587 | +# log.debug('No hex block found.') | |
| 588 | +# # no hex block found | |
| 589 | +# return | |
| 590 | +# while match is not None: | |
| 591 | +# found = match.group(0) | |
| 592 | +# # start index | |
| 593 | +# start = match.start() | |
| 594 | +# # current position | |
| 595 | +# current = match.end() | |
| 596 | +# log.debug('Found hex block starting at %08X, end %08X, size=%d' % (start, current, len(found))) | |
| 597 | +# if len(found) < min_size: | |
| 598 | +# log.debug('Too small - size<%d, ignored.' % min_size) | |
| 599 | +# match = re_hexblock.search(data, pos=current) | |
| 600 | +# continue | |
| 601 | +# #log.debug('Match: %s' % found) | |
| 602 | +# # remove all whitespace and line feeds: | |
| 603 | +# #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE | |
| 604 | +# found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') | |
| 605 | +# # TODO: make it a function | |
| 606 | +# # Also remove embedded RTF tags: | |
| 607 | +# found = re_embedded_tags.sub('', found) | |
| 608 | +# # object data extracted from the RTF file | |
| 609 | +# # MS Word accepts an extra hex digit, so we need to trim it if present: | |
| 610 | +# if len(found) & 1: | |
| 611 | +# log.debug('Odd length, trimmed last byte.') | |
| 612 | +# found = found[:-1] | |
| 613 | +# #log.debug('Cleaned match: %s' % found) | |
| 614 | +# objdata = binascii.unhexlify(found) | |
| 615 | +# # Detect the "\bin" control word, which is sometimes used for obfuscation: | |
| 616 | +# bin_match = re_delims_bin_decimal.match(data, pos=current) | |
| 617 | +# while bin_match is not None: | |
| 618 | +# log.debug('Found \\bin block starting at %08X : %r' | |
| 619 | +# % (bin_match.start(), bin_match.group(0))) | |
| 620 | +# # extract the decimal integer following '\bin' | |
| 621 | +# bin_len = int(bin_match.group(1)) | |
| 622 | +# log.debug('\\bin block length = %d' % bin_len) | |
| 623 | +# if current+bin_len > len(data): | |
| 624 | +# log.error('\\bin block length is larger than the remaining data') | |
| 625 | +# # move the current index, ignore the \bin block | |
| 626 | +# current += len(bin_match.group(0)) | |
| 627 | +# break | |
| 628 | +# # read that number of bytes: | |
| 629 | +# objdata += data[current:current+bin_len] | |
| 630 | +# # TODO: handle exception | |
| 631 | +# current += len(bin_match.group(0)) + bin_len | |
| 632 | +# # TODO: check if current is out of range | |
| 633 | +# # TODO: is Word limiting the \bin length to a number of digits? | |
| 634 | +# log.debug('Current position = %08X' % current) | |
| 635 | +# match = re_delim_hexblock.match(data, pos=current) | |
| 636 | +# if match is not None: | |
| 637 | +# log.debug('Found next hex block starting at %08X, end %08X' | |
| 638 | +# % (match.start(), match.end())) | |
| 639 | +# found = match.group(0) | |
| 640 | +# log.debug('Match: %s' % found) | |
| 641 | +# # remove all whitespace and line feeds: | |
| 642 | +# #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE | |
| 643 | +# found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v') | |
| 644 | +# # Also remove embedded RTF tags: | |
| 645 | +# found = re_embedded_tags.sub(found, '') | |
| 646 | +# objdata += binascii.unhexlify(found) | |
| 647 | +# current = match.end() | |
| 648 | +# bin_match = re_delims_bin_decimal.match(data, pos=current) | |
| 649 | +# | |
| 650 | +# # print repr(found) | |
| 651 | +# if len(objdata)>min_size: | |
| 652 | +# yield start, current-start, objdata | |
| 653 | +# # Search next occurence of a hex block: | |
| 654 | +# match = re_hexblock.search(data, pos=current) | |
| 641 | 655 | |
| 642 | 656 | |
| 643 | 657 | ... | ... |