Commit 6416b39aaa18b2efa23671bfdc5483c206ebfbf8

Authored by decalage2
1 parent 1cf591dd

rtfobj, oleobj: fixed Python 2.6+2.7+3.x support

oletools/oleobj.py
1 1 #!/usr/bin/env python
  2 +from __future__ import print_function
2 3 """
3 4 oleobj.py
4 5  
... ... @@ -14,7 +15,7 @@ http://www.decalage.info/python/oletools
14 15  
15 16 # === LICENSE ==================================================================
16 17  
17   -# oleobj is copyright (c) 2015 Philippe Lagadec (http://www.decalage.info)
  18 +# oleobj is copyright (c) 2015-2016 Philippe Lagadec (http://www.decalage.info)
18 19 # All rights reserved.
19 20 #
20 21 # Redistribution and use in source and binary forms, with or without modification,
... ... @@ -41,8 +42,11 @@ http://www.decalage.info/python/oletools
41 42 #------------------------------------------------------------------------------
42 43 # CHANGELOG:
43 44 # 2015-12-05 v0.01 PL: - first version
  45 +# 2016-06 PL: - added main and process_file (not working yet)
  46 +# 2016-07-18 v0.48 SL: - added Python 3.5 support
  47 +# 2016-07-19 PL: - fixed Python 2.6-7 support
44 48  
45   -__version__ = '0.01'
  49 +__version__ = '0.48'
46 50  
47 51 #------------------------------------------------------------------------------
48 52 # TODO:
... ... @@ -62,8 +66,10 @@ __version__ = '0.01'
62 66  
63 67 #--- IMPORTS ------------------------------------------------------------------
64 68  
65   -import logging, struct
  69 +import logging, struct, optparse, os, re, sys
66 70  
  71 +from thirdparty.olefile import olefile
  72 +from thirdparty.xglob import xglob
67 73  
68 74 # === LOGGING =================================================================
69 75  
... ... @@ -107,6 +113,18 @@ def get_logger(name, level=logging.CRITICAL+1):
107 113 log = get_logger('oleobj')
108 114  
109 115  
  116 +# === CONSTANTS ==============================================================
  117 +
# Indexing a byte string gives a 1-character str on Python 2.x but an integer
# on Python 3.x, so the null terminator is spelled differently on each:
NULL_CHAR = '\x00' if sys.version_info[0] <= 2 else 0
  126 +
  127 +
110 128 # === GLOBAL VARIABLES =======================================================
111 129  
112 130 # struct to parse an unsigned integer of 32 bits:
... ... @@ -162,7 +180,7 @@ def read_LengthPrefixedAnsiString(data):
162 180 ansi_string = data[:length-1]
163 181 # TODO: only in strict mode:
164 182 # check the presence of the null char:
165   - assert data[length] == 0
  183 + assert data[length] == NULL_CHAR
166 184 new_data = data[length:]
167 185 return (ansi_string, new_data)
168 186  
... ... @@ -285,3 +303,149 @@ class OleObject (object):
285 303 self.data = data[:self.data_size]
286 304 assert len(self.data) == self.data_size
287 305 self.extra_data = data[self.data_size:]
  306 +
  307 +
  308 +
def sanitize_filename(filename, replacement='_', max_length=200):
    """
    Compute a safe basename from a file name or path.

    The directory part is dropped, every character that is not a word
    character, a dot, a dash or a space is replaced, and runs of dots or of
    spaces are collapsed into a single one. The returned filename is always
    a basename of the file.

    :param filename: str, file name or path to sanitize
    :param replacement: str, substitute for each non-whitelisted character
    :param max_length: int, maximum length of the result; a false value
        (0 or None) disables truncation
    :return: str, the sanitized basename, or 'NONAME' if nothing is left
    """
    basepath = os.path.basename(filename).strip()
    sane_fname = re.sub(r'[^\w\.\- ]', replacement, basepath)

    # collapse runs of dots, so that the result never contains '..':
    while ".." in sane_fname:
        sane_fname = sane_fname.replace('..', '.')

    # collapse runs of spaces into a single space:
    while "  " in sane_fname:
        sane_fname = sane_fname.replace('  ', ' ')

    # Test the sanitized name rather than the input: a path ending with a
    # separator or made only of whitespace is not empty but has no basename.
    if not sane_fname:
        sane_fname = 'NONAME'

    # limit filename length
    if max_length:
        sane_fname = sane_fname[:max_length]

    return sane_fname
  329 +
  330 +
def process_file(container, filename, data, output_dir=None):
    """
    Look for OLE 1.0 objects in the streams of an OLE file and save them.

    Every stream of the OLE file is parsed as an OLE 1.0 object; streams that
    cannot be parsed are skipped. The data of each object found is written to
    '<prefix>_object_<NNN>.<ext>'. For 'Package' objects the embedded file is
    extracted as well and written to '<prefix>_<sanitized file name>'.
    Progress and object details are printed to stdout.

    :param container: container the file comes from, as yielded by
        xglob.iter_files; currently unused
    :param filename: str, path of the file to analyse, also used to build the
        names of the output files
    :param data: content of the file, or None to read it from filename
    :param output_dir: str, directory where extracted files are saved (created
        if missing); by default they are saved next to the input file
    """
    if output_dir:
        if not os.path.isdir(output_dir):
            log.info('creating output directory %s' % output_dir)
            os.makedirs(output_dir)
        base_dir = output_dir
    else:
        base_dir = os.path.dirname(filename)
    fname_prefix = os.path.join(base_dir, sanitize_filename(filename))

    # TODO: option to extract objects to files (false by default)
    if data is None:
        with open(filename, 'rb') as f:
            data = f.read()
    print('-'*79)
    print('File: %r - %d bytes' % (filename, len(data)))
    ole = olefile.OleFileIO(data)
    try:
        index = 1
        for stream in ole.listdir():
            objdata = ole.openstream(stream).read()
            stream_path = '/'.join(stream)
            log.debug('Checking stream %r' % stream_path)
            obj = OleObject()
            # Only a parsing failure means "not an OLE object": keep it apart
            # from extraction errors, which must not be reported as such.
            try:
                obj.parse(objdata)
            except Exception:
                log.info('*** Not an OLE 1.0 Object')
                continue
            try:
                print('extract file embedded in OLE object from stream %r:' % stream_path)
                print('format_id = %d' % obj.format_id)
                print('class name = %r' % obj.class_name)
                print('data size = %d' % obj.data_size)
                # NOTE(review): class_name is presumably a byte string on
                # Python 3 (sliced from the raw data) - decode it so that the
                # comparisons below work on both Python 2 and 3.
                class_name = obj.class_name
                if isinstance(class_name, bytes):
                    class_name = class_name.decode('latin-1')
                class_name = class_name.lower()
                # set a file extension according to the class name:
                if class_name.startswith('word'):
                    ext = 'doc'
                elif class_name.startswith('package'):
                    ext = 'package'
                else:
                    ext = 'bin'

                fname = '%s_object_%03d.%s' % (fname_prefix, index, ext)
                print('saving to file %s' % fname)
                with open(fname, 'wb') as f:
                    f.write(obj.data)
                if class_name == 'package':
                    print('Parsing OLE Package')
                    opkg = OleNativeStream(bindata=obj.data)
                    print('Filename = %r' % opkg.filename)
                    print('Source path = %r' % opkg.src_path)
                    print('Temp path = %r' % opkg.temp_path)
                    if opkg.filename:
                        fname = '%s_%s' % (fname_prefix,
                                           sanitize_filename(opkg.filename))
                    else:
                        fname = '%s_object_%03d.noname' % (fname_prefix, index)
                    print('saving to file %s' % fname)
                    with open(fname, 'wb') as f:
                        f.write(opkg.data)
            except Exception as exc:
                # best effort: report the failure and go on with other streams
                log.error('*** Error while extracting object from stream %r: %s'
                          % (stream_path, exc))
            # one number per parsed object, even if saving it failed, so that
            # output file names never collide:
            index += 1
    finally:
        ole.close()
  390 +
  391 +
  392 +
  393 +#=== MAIN =================================================================
  394 +
if __name__ == '__main__':
    # Command-line entry point: parse the options, set up logging to stdout,
    # then run process_file on every file matching the arguments.

    # print banner with version
    print('oleobj %s - http://decalage.info/oletools' % __version__)
    print('THIS IS WORK IN PROGRESS - Check updates regularly!')
    print('Please report any issue at https://github.com/decalage2/oletools/issues')
    print('')

    DEFAULT_LOG_LEVEL = "warning"  # Default log level
    LOG_LEVELS = {'debug': logging.DEBUG,
                  'info': logging.INFO,
                  'warning': logging.WARNING,
                  'error': logging.ERROR,
                  'critical': logging.CRITICAL
                  }

    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)
    # parser.add_option('-o', '--outfile', dest='outfile',
    #     help='output file')
    # parser.add_option('-c', '--csv', dest='csv',
    #     help='export results to a CSV file')
    parser.add_option("-r", action="store_true", dest="recursive",
                      help='find files recursively in subdirectories.')
    parser.add_option("-d", type="str", dest="output_dir",
                      help='use specified directory to output files.', default=None)
    parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
                      help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)')
    parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
                      help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
    parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
                      help="logging level debug/info/warning/error/critical (default=%default)")

    (options, args) = parser.parse_args()

    # Print help if no arguments are passed
    if not args:
        # __doc__ is None when the module string is not the very first
        # statement of the file: do not print a bare "None" in that case.
        if __doc__:
            print(__doc__)
        parser.print_help()
        sys.exit()

    # Reject unknown log levels with a usage error instead of a KeyError:
    if options.loglevel not in LOG_LEVELS:
        parser.error('invalid log level %r, expected debug/info/warning/error/critical'
                     % options.loglevel)

    # Setup logging to the console:
    # here we use stdout instead of stderr by default, so that the output
    # can be redirected properly.
    logging.basicConfig(level=LOG_LEVELS[options.loglevel], stream=sys.stdout,
                        format='%(levelname)-8s %(message)s')
    # enable logging in the modules:
    log.setLevel(logging.NOTSET)

    for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
            zip_password=options.zip_password, zip_fname=options.zip_fname):
        # ignore directory names stored in zip files:
        if container and filename.endswith('/'):
            continue
        process_file(container, filename, data, options.output_dir)
  450 +
  451 +
... ...
oletools/rtfobj.py
... ... @@ -55,18 +55,20 @@ http://www.decalage.info/python/oletools
55 55 # TJ: - sanitize filenames to avoid special characters
56 56 # 2016-05-29 PL: - improved parsing, fixed issue #42
57 57 # 2016-07-13 v0.48 PL: - new RtfParser and RtfObjParser classes
  58 +# 2016-07-18 SL: - added Python 3.5 support
  59 +# 2016-07-19 PL: - fixed Python 2.6-2.7 support
58 60  
59 61 __version__ = '0.48'
60 62  
61   -#------------------------------------------------------------------------------
  63 +# ------------------------------------------------------------------------------
62 64 # TODO:
63 65 # - allow semicolon within hex, as found in this sample:
64 66 # http://contagiodump.blogspot.nl/2011/10/sep-28-cve-2010-3333-manuscript-with.html
65 67  
66 68  
67   -#=== IMPORTS =================================================================
  69 +# === IMPORTS =================================================================
68 70  
69   -import re, os, sys, string, binascii, logging, optparse
  71 +import re, os, sys, binascii, logging, optparse
70 72  
71 73 from thirdparty.xglob import xglob
72 74 from oleobj import OleObject, OleNativeStream
... ... @@ -120,7 +122,7 @@ log = get_logger(&#39;rtfobj&#39;)
120 122 # REGEX pattern to extract embedded OLE objects in hexadecimal format:
121 123  
122 124 # alphanum digit: [0-9A-Fa-f]
123   -HEX_DIGIT = rb'[0-9A-Fa-f]'
  125 +HEX_DIGIT = b'[0-9A-Fa-f]'
124 126  
125 127 # hex char = two alphanum digits: [0-9A-Fa-f]{2}
126 128 # HEX_CHAR = r'[0-9A-Fa-f]{2}'
... ... @@ -130,11 +132,11 @@ HEX_DIGIT = rb&#39;[0-9A-Fa-f]&#39;
130 132 # AND the tags can be nested...
131 133 #SINGLE_RTF_TAG = r'[{][^{}]*[}]'
132 134 # Actually RTF tags may contain braces escaped with backslash (\{ \}):
133   -SINGLE_RTF_TAG = rb'[{](?:\\.|[^{}\])*[}]'
  135 +SINGLE_RTF_TAG = b'[{](?:\\\\.|[^{}\\\])*[}]'
134 136  
135 137 # Nested tags, two levels (because Python's re does not support nested matching):
136 138 # NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]'
137   -NESTED_RTF_TAG = rb'[{](?:\\.|[^{}\]|'+SINGLE_RTF_TAG+b')*[}]'
  139 +NESTED_RTF_TAG = b'[{](?:\\\\.|[^{}\\\]|'+SINGLE_RTF_TAG+b')*[}]'
138 140  
139 141 # AND it is also allowed to insert ANY control word or control symbol (ignored)
140 142 # According to Rich Text Format (RTF) Specification Version 1.9.1,
... ... @@ -146,7 +148,7 @@ NESTED_RTF_TAG = rb&#39;[{](?:\\.|[^{}\\]|&#39;+SINGLE_RTF_TAG+b&#39;)*[}]&#39;
146 148 # "\AnyThing " "\AnyThing123z" "\AnyThing-456{" "\AnyThing{"
147 149 # control symbol = \<any char except letter or digit> (followed by anything)
148 150  
149   -ASCII_NAME = rb'([a-zA-Z]{1,250})'
  151 +ASCII_NAME = b'([a-zA-Z]{1,250})'
150 152  
151 153 # using Python's re lookahead assumption:
152 154 # (?=...) Matches if ... matches next, but doesn't consume any of the string.
... ... @@ -155,21 +157,21 @@ ASCII_NAME = rb&#39;([a-zA-Z]{1,250})&#39;
155 157  
156 158 # TODO: Find the actual limit on the number of digits for Word
157 159 # SIGNED_INTEGER = r'(-?\d{1,250})'
158   -SIGNED_INTEGER = rb'(-?\d+)'
  160 +SIGNED_INTEGER = b'(-?\\d+)'
159 161  
160   -CONTROL_WORD = rb'(?:\\' + ASCII_NAME + rb'(?:(?=[^a-zA-Z0-9-])|' + SIGNED_INTEGER + rb'(?=[^0-9])))'
  162 +CONTROL_WORD = b'(?:\\\\' + ASCII_NAME + b'(?:(?=[^a-zA-Z0-9-])|' + SIGNED_INTEGER + b'(?=[^0-9])))'
161 163  
162 164 re_control_word = re.compile(CONTROL_WORD)
163 165  
164   -CONTROL_SYMBOL = rb'(?:\[^a-zA-Z0-9])'
  166 +CONTROL_SYMBOL = b'(?:\\\[^a-zA-Z0-9])'
165 167 re_control_symbol = re.compile(CONTROL_SYMBOL)
166 168  
167 169 # Text that is not a control word/symbol or a group:
168   -TEXT = rb'[^{}\]+'
  170 +TEXT = b'[^{}\\\]+'
169 171 re_text = re.compile(TEXT)
170 172  
171 173 # ignored whitespaces and tags within a hex block:
172   -IGNORED = rb'(?:\s|'+NESTED_RTF_TAG+rb'|'+CONTROL_SYMBOL+rb'|'+CONTROL_WORD+rb')*'
  174 +IGNORED = b'(?:\\s|'+NESTED_RTF_TAG+b'|'+CONTROL_SYMBOL+b'|'+CONTROL_WORD+b')*'
173 175 #IGNORED = r'\s*'
174 176  
175 177 # HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT
... ... @@ -189,27 +191,24 @@ IGNORED = rb&#39;(?:\s|&#39;+NESTED_RTF_TAG+rb&#39;|&#39;+CONTROL_SYMBOL+rb&#39;|&#39;+CONTROL_WORD+rb&#39;)
189 191  
190 192 #TODO PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b'
191 193 # PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}' #+ HEX_CHAR + r'\b'
192   -PATTERN = rb'\b(?:' + HEX_DIGIT + IGNORED + rb'){7,}' + HEX_DIGIT + rb'\b'
  194 +PATTERN = b'\\b(?:' + HEX_DIGIT + IGNORED + b'){7,}' + HEX_DIGIT + b'\\b'
193 195  
194 196 # at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s*
195 197 # PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}'
196 198 # improved pattern, allowing semicolons within hex:
197 199 #PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}'
198 200  
199   -# a dummy translation table for str.translate, which does not change anythying:
200   -TRANSTABLE_NOCHANGE = bytes.maketrans(b'', b'')
201   -
202 201 re_hexblock = re.compile(PATTERN)
203 202 re_embedded_tags = re.compile(IGNORED)
204   -re_decimal = re.compile(rb'\d+')
  203 +re_decimal = re.compile(b'\\d+')
205 204  
206   -re_delimiter = re.compile(rb'[ \t\r\n\f\v]')
  205 +re_delimiter = re.compile(b'[ \\t\\r\\n\\f\\v]')
207 206  
208   -DELIMITER = rb'[ \t\r\n\f\v]'
209   -DELIMITERS_ZeroOrMore = rb'[ \t\r\n\f\v]*'
210   -BACKSLASH_BIN = rb'\\bin'
  207 +DELIMITER = b'[ \\t\\r\\n\\f\\v]'
  208 +DELIMITERS_ZeroOrMore = b'[ \\t\\r\\n\\f\\v]*'
  209 +BACKSLASH_BIN = b'\\\\bin'
211 210 # According to my tests, Word accepts up to 250 digits (leading zeroes)
212   -DECIMAL_GROUP = rb'(\d{1,250})'
# backslash doubled: '\d' is an invalid escape sequence in a non-raw literal
DECIMAL_GROUP = b'(\\d{1,250})'
213 212  
214 213 re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + BACKSLASH_BIN
215 214 + DECIMAL_GROUP + DELIMITER)
... ... @@ -250,6 +249,19 @@ DESTINATION_CONTROL_WORDS = frozenset((
250 249 ))
251 250  
252 251  
# Indexing a byte string gives a 1-character str on Python 2.x but an integer
# on Python 3.x: define the special RTF characters in the form matching the
# running interpreter.
if sys.version_info[0] >= 3:
    # Python 3.x - Integers
    BACKSLASH, BRACE_OPEN, BRACE_CLOSE = (ord(char) for char in '\\{}')
else:
    # Python 2.x - Characters (str)
    BACKSLASH, BRACE_OPEN, BRACE_CLOSE = '\\', '{', '}'
  264 +
253 265  
254 266 #=== CLASSES =================================================================
255 267  
... ... @@ -294,15 +306,15 @@ class RtfParser(object):
294 306 def parse(self):
295 307 self.index = 0
296 308 while self.index < self.size:
297   - if self.data[self.index] == ord('{'):
  309 + if self.data[self.index] == BRACE_OPEN:
298 310 self._open_group()
299 311 self.index += 1
300 312 continue
301   - if self.data[self.index] == ord('}'):
  313 + if self.data[self.index] == BRACE_CLOSE:
302 314 self._close_group()
303 315 self.index += 1
304 316 continue
305   - if self.data[self.index] == ord('\\'):
  317 + if self.data[self.index] == BACKSLASH:
306 318 m = re_control_word.match(self.data, self.index)
307 319 if m:
308 320 cword = m.group(1)
... ... @@ -332,7 +344,7 @@ class RtfParser(object):
332 344  
333 345 def _open_group(self):
334 346 self.group_level += 1
335   - log.debug('{ Open Group at index %Xh - level=%d' % (self.index, self.group_level))
  347 + #log.debug('{ Open Group at index %Xh - level=%d' % (self.index, self.group_level))
336 348 # call user method AFTER increasing the level:
337 349 self.open_group()
338 350  
... ... @@ -341,19 +353,20 @@ class RtfParser(object):
341 353 pass
342 354  
343 355 def _close_group(self):
344   - log.debug('} Close Group at index %Xh - level=%d' % (self.index, self.group_level))
  356 + #log.debug('} Close Group at index %Xh - level=%d' % (self.index, self.group_level))
345 357 # call user method BEFORE decreasing the level:
346 358 self.close_group()
347 359 # if the destination level is the same as the group level, close the destination:
348 360 if self.group_level == self.current_destination.group_level:
349   - log.debug('Current Destination %r level = %d => Close Destination' % (
350   - self.current_destination.cword, self.current_destination.group_level))
  361 + # log.debug('Current Destination %r level = %d => Close Destination' % (
  362 + # self.current_destination.cword, self.current_destination.group_level))
351 363 self._close_destination()
352 364 else:
353   - log.debug('Current Destination %r level = %d => Continue with same Destination' % (
354   - self.current_destination.cword, self.current_destination.group_level))
  365 + # log.debug('Current Destination %r level = %d => Continue with same Destination' % (
  366 + # self.current_destination.cword, self.current_destination.group_level))
  367 + pass
355 368 self.group_level -= 1
356   - log.debug('Decreased group level to %d' % self.group_level)
  369 + # log.debug('Decreased group level to %d' % self.group_level)
357 370  
358 371 def close_group(self):
359 372 #log.debug('close group at index %Xh' % self.index)
... ... @@ -369,7 +382,7 @@ class RtfParser(object):
369 382 self.current_destination = new_dest
370 383 # start of the destination is right after the control word:
371 384 new_dest.start = self.index + len(matchobject.group())
372   - log.debug("Open Destination %r start=%Xh - level=%d" % (cword, new_dest.start, new_dest.group_level))
  385 + # log.debug("Open Destination %r start=%Xh - level=%d" % (cword, new_dest.start, new_dest.group_level))
373 386 # call the corresponding user method for additional processing:
374 387 self.open_destination(self.current_destination)
375 388  
... ... @@ -377,8 +390,8 @@ class RtfParser(object):
377 390 pass
378 391  
379 392 def _close_destination(self):
380   - log.debug("Close Destination %r end=%Xh - level=%d" % (self.current_destination.cword,
381   - self.index, self.current_destination.group_level))
  393 + # log.debug("Close Destination %r end=%Xh - level=%d" % (self.current_destination.cword,
  394 + # self.index, self.current_destination.group_level))
382 395 self.current_destination.end = self.index
383 396 # call the corresponding user method for additional processing:
384 397 self.close_destination(self.current_destination)
... ... @@ -388,7 +401,8 @@ class RtfParser(object):
388 401 if len(self.destinations) > 0:
389 402 self.current_destination = self.destinations[-1]
390 403 else:
391   - log.debug('All destinations are closed, keeping the document destination open')
  404 + # log.debug('All destinations are closed, keeping the document destination open')
  405 + pass
392 406  
393 407 def close_destination(self, destination):
394 408 pass
... ... @@ -430,10 +444,10 @@ class RtfParser(object):
430 444 pass
431 445  
432 446 def _end_of_file(self):
433   - log.debug('%Xh Reached End of File')
  447 + # log.debug('%Xh Reached End of File')
434 448 # close any group/destination that is still open:
435 449 while self.group_level > 0:
436   - log.debug('Group Level = %d, closing group' % self.group_level)
  450 + # log.debug('Group Level = %d, closing group' % self.group_level)
437 451 self._close_group()
438 452 self.end_of_file()
439 453  
... ... @@ -458,7 +472,7 @@ class RtfObjParser(RtfParser):
458 472 if destination.cword == b'objdata':
459 473 log.debug('*** Close object data at index %Xh' % self.index)
460 474 # Filter out all whitespaces first (just ignored):
461   - hexdata1 = destination.data.translate(TRANSTABLE_NOCHANGE, b' \t\r\n\f\v')
  475 + hexdata1 = destination.data.translate(None, b' \t\r\n\f\v')
462 476 # Then filter out any other non-hex character:
463 477 hexdata = re.sub(b'[^a-hA-H0-9]', b'', hexdata1)
464 478 if len(hexdata) < len(hexdata1):
... ... @@ -528,116 +542,116 @@ class RtfObjParser(RtfParser):
528 542  
529 543 #=== FUNCTIONS ===============================================================
530 544  
531   -def rtf_iter_objects_old (filename, min_size=32):
532   - """
533   - Open a RTF file, extract each embedded object encoded in hexadecimal of
534   - size > min_size, yield the index of the object in the RTF file and its data
535   - in binary format.
536   - This is an iterator.
537   - """
538   - data = open(filename, 'rb').read()
539   - for m in re.finditer(PATTERN, data):
540   - found = m.group(0)
541   - orig_len = len(found)
542   - # remove all whitespace and line feeds:
543   - #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
544   - found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v}')
545   - found = binascii.unhexlify(found)
546   - #print repr(found)
547   - if len(found)>min_size:
548   - yield m.start(), orig_len, found
  545 +# def rtf_iter_objects_old (filename, min_size=32):
  546 +# """
  547 +# Open a RTF file, extract each embedded object encoded in hexadecimal of
  548 +# size > min_size, yield the index of the object in the RTF file and its data
  549 +# in binary format.
  550 +# This is an iterator.
  551 +# """
  552 +# data = open(filename, 'rb').read()
  553 +# for m in re.finditer(PATTERN, data):
  554 +# found = m.group(0)
  555 +# orig_len = len(found)
  556 +# # remove all whitespace and line feeds:
  557 +# #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
  558 +# found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v}')
  559 +# found = binascii.unhexlify(found)
  560 +# #print repr(found)
  561 +# if len(found)>min_size:
  562 +# yield m.start(), orig_len, found
549 563  
550 564 # TODO: backward-compatible API?
551 565  
552 566  
553   -def search_hex_block(data, pos=0, min_size=32, first=True):
554   - if first:
555   - # Search 1st occurence of a hex block:
556   - match = re_hexblock.search(data, pos=pos)
557   - else:
558   - # Match next occurences of a hex block, from the current position only:
559   - match = re_hexblock.match(data, pos=pos)
560   -
561   -
562   -
563   -def rtf_iter_objects (data, min_size=32):
564   - """
565   - Open a RTF file, extract each embedded object encoded in hexadecimal of
566   - size > min_size, yield the index of the object in the RTF file and its data
567   - in binary format.
568   - This is an iterator.
569   - """
570   - # Search 1st occurence of a hex block:
571   - match = re_hexblock.search(data)
572   - if match is None:
573   - log.debug('No hex block found.')
574   - # no hex block found
575   - return
576   - while match is not None:
577   - found = match.group(0)
578   - # start index
579   - start = match.start()
580   - # current position
581   - current = match.end()
582   - log.debug('Found hex block starting at %08X, end %08X, size=%d' % (start, current, len(found)))
583   - if len(found) < min_size:
584   - log.debug('Too small - size<%d, ignored.' % min_size)
585   - match = re_hexblock.search(data, pos=current)
586   - continue
587   - #log.debug('Match: %s' % found)
588   - # remove all whitespace and line feeds:
589   - #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
590   - found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
591   - # TODO: make it a function
592   - # Also remove embedded RTF tags:
593   - found = re_embedded_tags.sub('', found)
594   - # object data extracted from the RTF file
595   - # MS Word accepts an extra hex digit, so we need to trim it if present:
596   - if len(found) & 1:
597   - log.debug('Odd length, trimmed last byte.')
598   - found = found[:-1]
599   - #log.debug('Cleaned match: %s' % found)
600   - objdata = binascii.unhexlify(found)
601   - # Detect the "\bin" control word, which is sometimes used for obfuscation:
602   - bin_match = re_delims_bin_decimal.match(data, pos=current)
603   - while bin_match is not None:
604   - log.debug('Found \\bin block starting at %08X : %r'
605   - % (bin_match.start(), bin_match.group(0)))
606   - # extract the decimal integer following '\bin'
607   - bin_len = int(bin_match.group(1))
608   - log.debug('\\bin block length = %d' % bin_len)
609   - if current+bin_len > len(data):
610   - log.error('\\bin block length is larger than the remaining data')
611   - # move the current index, ignore the \bin block
612   - current += len(bin_match.group(0))
613   - break
614   - # read that number of bytes:
615   - objdata += data[current:current+bin_len]
616   - # TODO: handle exception
617   - current += len(bin_match.group(0)) + bin_len
618   - # TODO: check if current is out of range
619   - # TODO: is Word limiting the \bin length to a number of digits?
620   - log.debug('Current position = %08X' % current)
621   - match = re_delim_hexblock.match(data, pos=current)
622   - if match is not None:
623   - log.debug('Found next hex block starting at %08X, end %08X'
624   - % (match.start(), match.end()))
625   - found = match.group(0)
626   - log.debug('Match: %s' % found)
627   - # remove all whitespace and line feeds:
628   - #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
629   - found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
630   - # Also remove embedded RTF tags:
631   - found = re_embedded_tags.sub(found, '')
632   - objdata += binascii.unhexlify(found)
633   - current = match.end()
634   - bin_match = re_delims_bin_decimal.match(data, pos=current)
635   -
636   - # print repr(found)
637   - if len(objdata)>min_size:
638   - yield start, current-start, objdata
639   - # Search next occurence of a hex block:
640   - match = re_hexblock.search(data, pos=current)
  567 +# def search_hex_block(data, pos=0, min_size=32, first=True):
  568 +# if first:
  569 +# # Search 1st occurence of a hex block:
  570 +# match = re_hexblock.search(data, pos=pos)
  571 +# else:
  572 +# # Match next occurences of a hex block, from the current position only:
  573 +# match = re_hexblock.match(data, pos=pos)
  574 +#
  575 +#
  576 +#
  577 +# def rtf_iter_objects (data, min_size=32):
  578 +# """
  579 +# Open a RTF file, extract each embedded object encoded in hexadecimal of
  580 +# size > min_size, yield the index of the object in the RTF file and its data
  581 +# in binary format.
  582 +# This is an iterator.
  583 +# """
  584 +# # Search 1st occurence of a hex block:
  585 +# match = re_hexblock.search(data)
  586 +# if match is None:
  587 +# log.debug('No hex block found.')
  588 +# # no hex block found
  589 +# return
  590 +# while match is not None:
  591 +# found = match.group(0)
  592 +# # start index
  593 +# start = match.start()
  594 +# # current position
  595 +# current = match.end()
  596 +# log.debug('Found hex block starting at %08X, end %08X, size=%d' % (start, current, len(found)))
  597 +# if len(found) < min_size:
  598 +# log.debug('Too small - size<%d, ignored.' % min_size)
  599 +# match = re_hexblock.search(data, pos=current)
  600 +# continue
  601 +# #log.debug('Match: %s' % found)
  602 +# # remove all whitespace and line feeds:
  603 +# #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
  604 +# found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
  605 +# # TODO: make it a function
  606 +# # Also remove embedded RTF tags:
  607 +# found = re_embedded_tags.sub('', found)
  608 +# # object data extracted from the RTF file
  609 +# # MS Word accepts an extra hex digit, so we need to trim it if present:
  610 +# if len(found) & 1:
  611 +# log.debug('Odd length, trimmed last byte.')
  612 +# found = found[:-1]
  613 +# #log.debug('Cleaned match: %s' % found)
  614 +# objdata = binascii.unhexlify(found)
  615 +# # Detect the "\bin" control word, which is sometimes used for obfuscation:
  616 +# bin_match = re_delims_bin_decimal.match(data, pos=current)
  617 +# while bin_match is not None:
  618 +# log.debug('Found \\bin block starting at %08X : %r'
  619 +# % (bin_match.start(), bin_match.group(0)))
  620 +# # extract the decimal integer following '\bin'
  621 +# bin_len = int(bin_match.group(1))
  622 +# log.debug('\\bin block length = %d' % bin_len)
  623 +# if current+bin_len > len(data):
  624 +# log.error('\\bin block length is larger than the remaining data')
  625 +# # move the current index, ignore the \bin block
  626 +# current += len(bin_match.group(0))
  627 +# break
  628 +# # read that number of bytes:
  629 +# objdata += data[current:current+bin_len]
  630 +# # TODO: handle exception
  631 +# current += len(bin_match.group(0)) + bin_len
  632 +# # TODO: check if current is out of range
  633 +# # TODO: is Word limiting the \bin length to a number of digits?
  634 +# log.debug('Current position = %08X' % current)
  635 +# match = re_delim_hexblock.match(data, pos=current)
  636 +# if match is not None:
  637 +# log.debug('Found next hex block starting at %08X, end %08X'
  638 +# % (match.start(), match.end()))
  639 +# found = match.group(0)
  640 +# log.debug('Match: %s' % found)
  641 +# # remove all whitespace and line feeds:
  642 +# #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
  643 +# found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
  644 +# # Also remove embedded RTF tags:
  645 +# found = re_embedded_tags.sub(found, '')
  646 +# objdata += binascii.unhexlify(found)
  647 +# current = match.end()
  648 +# bin_match = re_delims_bin_decimal.match(data, pos=current)
  649 +#
  650 +# # print repr(found)
  651 +# if len(objdata)>min_size:
  652 +# yield start, current-start, objdata
  653 +# # Search next occurence of a hex block:
  654 +# match = re_hexblock.search(data, pos=current)
641 655  
642 656  
643 657  
... ...