Commit 6416b39aaa18b2efa23671bfdc5483c206ebfbf8

Authored by decalage2
1 parent 1cf591dd

rtfobj, oleobj: fixed Python 2.6+2.7+3.x support

oletools/oleobj.py
1 #!/usr/bin/env python 1 #!/usr/bin/env python
  2 +from __future__ import print_function
2 """ 3 """
3 oleobj.py 4 oleobj.py
4 5
@@ -14,7 +15,7 @@ http://www.decalage.info/python/oletools @@ -14,7 +15,7 @@ http://www.decalage.info/python/oletools
14 15
15 # === LICENSE ================================================================== 16 # === LICENSE ==================================================================
16 17
17 -# oleobj is copyright (c) 2015 Philippe Lagadec (http://www.decalage.info) 18 +# oleobj is copyright (c) 2015-2016 Philippe Lagadec (http://www.decalage.info)
18 # All rights reserved. 19 # All rights reserved.
19 # 20 #
20 # Redistribution and use in source and binary forms, with or without modification, 21 # Redistribution and use in source and binary forms, with or without modification,
@@ -41,8 +42,11 @@ http://www.decalage.info/python/oletools @@ -41,8 +42,11 @@ http://www.decalage.info/python/oletools
41 #------------------------------------------------------------------------------ 42 #------------------------------------------------------------------------------
42 # CHANGELOG: 43 # CHANGELOG:
43 # 2015-12-05 v0.01 PL: - first version 44 # 2015-12-05 v0.01 PL: - first version
  45 +# 2016-06 PL: - added main and process_file (not working yet)
  46 +# 2016-07-18 v0.48 SL: - added Python 3.5 support
  47 +# 2016-07-19 PL: - fixed Python 2.6-7 support
44 48
45 -__version__ = '0.01' 49 +__version__ = '0.48'
46 50
47 #------------------------------------------------------------------------------ 51 #------------------------------------------------------------------------------
48 # TODO: 52 # TODO:
@@ -62,8 +66,10 @@ __version__ = '0.01' @@ -62,8 +66,10 @@ __version__ = '0.01'
62 66
63 #--- IMPORTS ------------------------------------------------------------------ 67 #--- IMPORTS ------------------------------------------------------------------
64 68
65 -import logging, struct 69 +import logging, struct, optparse, os, re, sys
66 70
  71 +from thirdparty.olefile import olefile
  72 +from thirdparty.xglob import xglob
67 73
68 # === LOGGING ================================================================= 74 # === LOGGING =================================================================
69 75
@@ -107,6 +113,18 @@ def get_logger(name, level=logging.CRITICAL+1): @@ -107,6 +113,18 @@ def get_logger(name, level=logging.CRITICAL+1):
107 log = get_logger('oleobj') 113 log = get_logger('oleobj')
108 114
109 115
  116 +# === CONSTANTS ==============================================================
  117 +
  118 +# some str methods on Python 2.x return characters,
  119 +# while the equivalent bytes methods return integers on Python 3.x:
  120 +if sys.version_info[0] <= 2:
  121 + # Python 2.x
  122 + NULL_CHAR = '\x00'
  123 +else:
  124 + # Python 3.x
  125 + NULL_CHAR = 0
  126 +
  127 +
110 # === GLOBAL VARIABLES ======================================================= 128 # === GLOBAL VARIABLES =======================================================
111 129
112 # struct to parse an unsigned integer of 32 bits: 130 # struct to parse an unsigned integer of 32 bits:
@@ -162,7 +180,7 @@ def read_LengthPrefixedAnsiString(data): @@ -162,7 +180,7 @@ def read_LengthPrefixedAnsiString(data):
162 ansi_string = data[:length-1] 180 ansi_string = data[:length-1]
163 # TODO: only in strict mode: 181 # TODO: only in strict mode:
164 # check the presence of the null char: 182 # check the presence of the null char:
165 - assert data[length] == 0 183 + assert data[length] == NULL_CHAR
166 new_data = data[length:] 184 new_data = data[length:]
167 return (ansi_string, new_data) 185 return (ansi_string, new_data)
168 186
@@ -285,3 +303,149 @@ class OleObject (object): @@ -285,3 +303,149 @@ class OleObject (object):
285 self.data = data[:self.data_size] 303 self.data = data[:self.data_size]
286 assert len(self.data) == self.data_size 304 assert len(self.data) == self.data_size
287 self.extra_data = data[self.data_size:] 305 self.extra_data = data[self.data_size:]
  306 +
  307 +
  308 +
def sanitize_filename(filename, replacement='_', max_length=200):
    """Return a safe basename derived from filename.

    Only the last path component is kept. Every character that is not a
    word character, a dot, a dash or a space is replaced, and runs of dots
    or spaces are collapsed to a single one, so the result can never
    contain '..'.

    :param filename: file name or path to sanitize
    :param replacement: string substituted for each non-whitelisted character
    :param max_length: maximum length of the result; a false value (0 or
        None) disables truncation
    :return: sanitized basename, or 'NONAME' when nothing usable is left
    """
    basepath = os.path.basename(filename).strip()
    sane_fname = re.sub(r'[^\w\.\- ]', replacement, basepath)

    # Collapse runs of dots: same result as repeatedly replacing '..' by '.'
    sane_fname = re.sub(r'\.{2,}', '.', sane_fname)

    # Collapse runs of spaces into a single space
    sane_fname = re.sub(r' {2,}', ' ', sane_fname)

    # Test the sanitized result rather than the raw input: names such as
    # 'dir/' or '   ' are not empty but leave an empty basename.
    if not sane_fname:
        sane_fname = 'NONAME'

    # limit filename length
    if max_length:
        sane_fname = sane_fname[:max_length]

    return sane_fname
  329 +
  330 +
def process_file(container, filename, data, output_dir=None):
    """Extract the OLE 1.0 objects found in the streams of an OLE file.

    Each stream of the OLE container is tentatively parsed as an OLE 1.0
    object. Every stream that parses is saved to disk, and objects whose
    class name is 'package' are additionally parsed as OLE Packages so that
    the embedded file is saved under its sanitized original name.

    :param container: container name as yielded by xglob.iter_files
        (not used by this function)
    :param filename: path of the file to process; also used to build the
        names of the extracted files
    :param data: content of the file, or None to read it from filename
    :param output_dir: directory where extracted files are written (created
        if missing); when not set, files are written next to filename
    """
    if output_dir:
        if not os.path.isdir(output_dir):
            log.info('creating output directory %s' % output_dir)
            # makedirs also creates missing parent directories
            os.makedirs(output_dir)

        fname_prefix = os.path.join(output_dir,
                                    sanitize_filename(filename))
    else:
        base_dir = os.path.dirname(filename)
        sane_fname = sanitize_filename(filename)
        fname_prefix = os.path.join(base_dir, sane_fname)

    # TODO: option to extract objects to files (false by default)
    if data is None:
        with open(filename, 'rb') as f:
            data = f.read()
    print('-'*79)
    print('File: %r - %d bytes' % (filename, len(data)))
    ole = olefile.OleFileIO(data)
    index = 1
    for stream in ole.listdir():
        objdata = ole.openstream(stream).read()
        stream_path = '/'.join(stream)
        log.debug('Checking stream %r' % stream_path)
        obj = OleObject()
        try:
            obj.parse(objdata)
            print('extract file embedded in OLE object from stream %r:' % stream_path)
            print('format_id  = %d' % obj.format_id)
            print('class name = %r' % obj.class_name)
            print('data size  = %d' % obj.data_size)
            # set a file extension according to the class name:
            # NOTE(review): assumes class_name is a text string; confirm on
            # Python 3, where startswith('word') fails on bytes.
            class_name = obj.class_name.lower()
            if class_name.startswith('word'):
                ext = 'doc'
            elif class_name.startswith('package'):
                ext = 'package'
            else:
                ext = 'bin'

            fname = '%s_object_%03d.%s' % (fname_prefix, index, ext)
            print('saving to file %s' % fname)
            with open(fname, 'wb') as f:
                f.write(obj.data)
            if class_name == 'package':
                print('Parsing OLE Package')
                opkg = OleNativeStream(bindata=obj.data)
                print('Filename = %r' % opkg.filename)
                print('Source path = %r' % opkg.src_path)
                print('Temp path = %r' % opkg.temp_path)
                if opkg.filename:
                    fname = '%s_%s' % (fname_prefix,
                                       sanitize_filename(opkg.filename))
                else:
                    fname = '%s_object_%03d.noname' % (fname_prefix, index)
                print('saving to file %s' % fname)
                with open(fname, 'wb') as f:
                    f.write(opkg.data)
            # only count streams that were fully extracted
            index += 1
        except Exception:
            # Best effort: most streams are not OLE 1.0 objects. Catching
            # Exception (not a bare except) lets Ctrl+C and sys.exit through.
            log.info('*** Not an OLE 1.0 Object')
            log.debug('Exception details:', exc_info=True)
  390 +
  391 +
  392 +
  393 +#=== MAIN =================================================================
  394 +
if __name__ == '__main__':
    # Command-line entry point: parse options, configure logging, then run
    # process_file on every file matched by the arguments.

    # print banner with version
    print('oleobj %s - http://decalage.info/oletools' % __version__)
    print('THIS IS WORK IN PROGRESS - Check updates regularly!')
    print('Please report any issue at https://github.com/decalage2/oletools/issues')
    print('')

    DEFAULT_LOG_LEVEL = "warning"  # Default log level
    LOG_LEVELS = {'debug': logging.DEBUG,
                  'info': logging.INFO,
                  'warning': logging.WARNING,
                  'error': logging.ERROR,
                  'critical': logging.CRITICAL
                  }

    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)
    # parser.add_option('-o', '--outfile', dest='outfile',
    #     help='output file')
    # parser.add_option('-c', '--csv', dest='csv',
    #     help='export results to a CSV file')
    parser.add_option("-r", action="store_true", dest="recursive",
                      help='find files recursively in subdirectories.')
    parser.add_option("-d", type="str", dest="output_dir",
                      help='use specified directory to output files.', default=None)
    parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
                      help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)')
    parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
                      help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
    # 'choice' makes optparse reject unknown levels with a usage error
    # instead of a KeyError traceback when LOG_LEVELS is indexed below.
    parser.add_option('-l', '--loglevel', dest="loglevel", action="store",
                      type='choice', choices=sorted(LOG_LEVELS),
                      default=DEFAULT_LOG_LEVEL,
                      help="logging level debug/info/warning/error/critical (default=%default)")

    (options, args) = parser.parse_args()

    # Print help if no arguments are passed
    if len(args) == 0:
        print(__doc__)
        parser.print_help()
        sys.exit()

    # Setup logging to the console:
    # here we use stdout instead of stderr by default, so that the output
    # can be redirected properly.
    logging.basicConfig(level=LOG_LEVELS[options.loglevel], stream=sys.stdout,
                        format='%(levelname)-8s %(message)s')
    # enable logging in the modules:
    log.setLevel(logging.NOTSET)

    for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
            zip_password=options.zip_password, zip_fname=options.zip_fname):
        # ignore directory names stored in zip files:
        if container and filename.endswith('/'):
            continue
        process_file(container, filename, data, options.output_dir)
  450 +
  451 +
oletools/rtfobj.py
@@ -55,18 +55,20 @@ http://www.decalage.info/python/oletools @@ -55,18 +55,20 @@ http://www.decalage.info/python/oletools
55 # TJ: - sanitize filenames to avoid special characters 55 # TJ: - sanitize filenames to avoid special characters
56 # 2016-05-29 PL: - improved parsing, fixed issue #42 56 # 2016-05-29 PL: - improved parsing, fixed issue #42
57 # 2016-07-13 v0.48 PL: - new RtfParser and RtfObjParser classes 57 # 2016-07-13 v0.48 PL: - new RtfParser and RtfObjParser classes
  58 +# 2016-07-18 SL: - added Python 3.5 support
  59 +# 2016-07-19 PL: - fixed Python 2.6-2.7 support
58 60
59 __version__ = '0.48' 61 __version__ = '0.48'
60 62
61 -#------------------------------------------------------------------------------ 63 +# ------------------------------------------------------------------------------
62 # TODO: 64 # TODO:
63 # - allow semicolon within hex, as found in this sample: 65 # - allow semicolon within hex, as found in this sample:
64 # http://contagiodump.blogspot.nl/2011/10/sep-28-cve-2010-3333-manuscript-with.html 66 # http://contagiodump.blogspot.nl/2011/10/sep-28-cve-2010-3333-manuscript-with.html
65 67
66 68
67 -#=== IMPORTS ================================================================= 69 +# === IMPORTS =================================================================
68 70
69 -import re, os, sys, string, binascii, logging, optparse 71 +import re, os, sys, binascii, logging, optparse
70 72
71 from thirdparty.xglob import xglob 73 from thirdparty.xglob import xglob
72 from oleobj import OleObject, OleNativeStream 74 from oleobj import OleObject, OleNativeStream
@@ -120,7 +122,7 @@ log = get_logger('rtfobj') @@ -120,7 +122,7 @@ log = get_logger('rtfobj')
120 # REGEX pattern to extract embedded OLE objects in hexadecimal format: 122 # REGEX pattern to extract embedded OLE objects in hexadecimal format:
121 123
122 # alphanum digit: [0-9A-Fa-f] 124 # alphanum digit: [0-9A-Fa-f]
123 -HEX_DIGIT = rb'[0-9A-Fa-f]' 125 +HEX_DIGIT = b'[0-9A-Fa-f]'
124 126
125 # hex char = two alphanum digits: [0-9A-Fa-f]{2} 127 # hex char = two alphanum digits: [0-9A-Fa-f]{2}
126 # HEX_CHAR = r'[0-9A-Fa-f]{2}' 128 # HEX_CHAR = r'[0-9A-Fa-f]{2}'
@@ -130,11 +132,11 @@ HEX_DIGIT = rb'[0-9A-Fa-f]' @@ -130,11 +132,11 @@ HEX_DIGIT = rb'[0-9A-Fa-f]'
130 # AND the tags can be nested... 132 # AND the tags can be nested...
131 #SINGLE_RTF_TAG = r'[{][^{}]*[}]' 133 #SINGLE_RTF_TAG = r'[{][^{}]*[}]'
132 # Actually RTF tags may contain braces escaped with backslash (\{ \}): 134 # Actually RTF tags may contain braces escaped with backslash (\{ \}):
133 -SINGLE_RTF_TAG = rb'[{](?:\\.|[^{}\\])*[}]' 135 +SINGLE_RTF_TAG = b'[{](?:\\\\.|[^{}\\\\])*[}]'
134 136
135 # Nested tags, two levels (because Python's re does not support nested matching): 137 # Nested tags, two levels (because Python's re does not support nested matching):
136 # NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]' 138 # NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]'
137 -NESTED_RTF_TAG = rb'[{](?:\\.|[^{}\\]|'+SINGLE_RTF_TAG+b')*[}]' 139 +NESTED_RTF_TAG = b'[{](?:\\\\.|[^{}\\\\]|'+SINGLE_RTF_TAG+b')*[}]'
138 140
139 # AND it is also allowed to insert ANY control word or control symbol (ignored) 141 # AND it is also allowed to insert ANY control word or control symbol (ignored)
140 # According to Rich Text Format (RTF) Specification Version 1.9.1, 142 # According to Rich Text Format (RTF) Specification Version 1.9.1,
@@ -146,7 +148,7 @@ NESTED_RTF_TAG = rb'[{](?:\\.|[^{}\\]|'+SINGLE_RTF_TAG+b')*[}]' @@ -146,7 +148,7 @@ NESTED_RTF_TAG = rb'[{](?:\\.|[^{}\\]|'+SINGLE_RTF_TAG+b')*[}]'
146 # "\AnyThing " "\AnyThing123z" ""\AnyThing-456{" "\AnyThing{" 148 # "\AnyThing " "\AnyThing123z" ""\AnyThing-456{" "\AnyThing{"
147 # control symbol = \<any char except letter or digit> (followed by anything) 149 # control symbol = \<any char except letter or digit> (followed by anything)
148 150
149 -ASCII_NAME = rb'([a-zA-Z]{1,250})' 151 +ASCII_NAME = b'([a-zA-Z]{1,250})'
150 152
151 # using Python's re lookahead assumption: 153 # using Python's re lookahead assumption:
152 # (?=...) Matches if ... matches next, but doesn't consume any of the string. 154 # (?=...) Matches if ... matches next, but doesn't consume any of the string.
@@ -155,21 +157,21 @@ ASCII_NAME = rb'([a-zA-Z]{1,250})' @@ -155,21 +157,21 @@ ASCII_NAME = rb'([a-zA-Z]{1,250})'
155 157
156 # TODO: Find the actual limit on the number of digits for Word 158 # TODO: Find the actual limit on the number of digits for Word
157 # SIGNED_INTEGER = r'(-?\d{1,250})' 159 # SIGNED_INTEGER = r'(-?\d{1,250})'
158 -SIGNED_INTEGER = rb'(-?\d+)' 160 +SIGNED_INTEGER = b'(-?\\d+)'
159 161
160 -CONTROL_WORD = rb'(?:\\' + ASCII_NAME + rb'(?:(?=[^a-zA-Z0-9-])|' + SIGNED_INTEGER + rb'(?=[^0-9])))' 162 +CONTROL_WORD = b'(?:\\\\' + ASCII_NAME + b'(?:(?=[^a-zA-Z0-9-])|' + SIGNED_INTEGER + b'(?=[^0-9])))'
161 163
162 re_control_word = re.compile(CONTROL_WORD) 164 re_control_word = re.compile(CONTROL_WORD)
163 165
164 -CONTROL_SYMBOL = rb'(?:\\[^a-zA-Z0-9])' 166 +CONTROL_SYMBOL = b'(?:\\\\[^a-zA-Z0-9])'
165 re_control_symbol = re.compile(CONTROL_SYMBOL) 167 re_control_symbol = re.compile(CONTROL_SYMBOL)
166 168
167 # Text that is not a control word/symbol or a group: 169 # Text that is not a control word/symbol or a group:
168 -TEXT = rb'[^{}\\]+' 170 +TEXT = b'[^{}\\\\]+'
169 re_text = re.compile(TEXT) 171 re_text = re.compile(TEXT)
170 172
171 # ignored whitespaces and tags within a hex block: 173 # ignored whitespaces and tags within a hex block:
172 -IGNORED = rb'(?:\s|'+NESTED_RTF_TAG+rb'|'+CONTROL_SYMBOL+rb'|'+CONTROL_WORD+rb')*' 174 +IGNORED = b'(?:\\s|'+NESTED_RTF_TAG+b'|'+CONTROL_SYMBOL+b'|'+CONTROL_WORD+b')*'
173 #IGNORED = r'\s*' 175 #IGNORED = r'\s*'
174 176
175 # HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT 177 # HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT
@@ -189,27 +191,24 @@ IGNORED = rb'(?:\s|'+NESTED_RTF_TAG+rb'|'+CONTROL_SYMBOL+rb'|'+CONTROL_WORD+rb') @@ -189,27 +191,24 @@ IGNORED = rb'(?:\s|'+NESTED_RTF_TAG+rb'|'+CONTROL_SYMBOL+rb'|'+CONTROL_WORD+rb')
189 191
190 #TODO PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b' 192 #TODO PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b'
191 # PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}' #+ HEX_CHAR + r'\b' 193 # PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}' #+ HEX_CHAR + r'\b'
192 -PATTERN = rb'\b(?:' + HEX_DIGIT + IGNORED + rb'){7,}' + HEX_DIGIT + rb'\b' 194 +PATTERN = b'\\b(?:' + HEX_DIGIT + IGNORED + b'){7,}' + HEX_DIGIT + b'\\b'
193 195
194 # at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s* 196 # at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s*
195 # PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}' 197 # PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}'
196 # improved pattern, allowing semicolons within hex: 198 # improved pattern, allowing semicolons within hex:
197 #PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}' 199 #PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}'
198 200
199 -# a dummy translation table for str.translate, which does not change anythying:  
200 -TRANSTABLE_NOCHANGE = bytes.maketrans(b'', b'')  
201 -  
202 re_hexblock = re.compile(PATTERN) 201 re_hexblock = re.compile(PATTERN)
203 re_embedded_tags = re.compile(IGNORED) 202 re_embedded_tags = re.compile(IGNORED)
204 -re_decimal = re.compile(rb'\d+') 203 +re_decimal = re.compile(b'\\d+')
205 204
206 -re_delimiter = re.compile(rb'[ \t\r\n\f\v]') 205 +re_delimiter = re.compile(b'[ \\t\\r\\n\\f\\v]')
207 206
208 -DELIMITER = rb'[ \t\r\n\f\v]'  
209 -DELIMITERS_ZeroOrMore = rb'[ \t\r\n\f\v]*'  
210 -BACKSLASH_BIN = rb'\\bin' 207 +DELIMITER = b'[ \\t\\r\\n\\f\\v]'
  208 +DELIMITERS_ZeroOrMore = b'[ \\t\\r\\n\\f\\v]*'
  209 +BACKSLASH_BIN = b'\\\\bin'
211 # According to my tests, Word accepts up to 250 digits (leading zeroes) 210 # According to my tests, Word accepts up to 250 digits (leading zeroes)
212 -DECIMAL_GROUP = rb'(\d{1,250})' 211 +DECIMAL_GROUP = b'(\d{1,250})'
213 212
214 re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + BACKSLASH_BIN 213 re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + BACKSLASH_BIN
215 + DECIMAL_GROUP + DELIMITER) 214 + DECIMAL_GROUP + DELIMITER)
@@ -250,6 +249,19 @@ DESTINATION_CONTROL_WORDS = frozenset(( @@ -250,6 +249,19 @@ DESTINATION_CONTROL_WORDS = frozenset((
250 )) 249 ))
251 250
252 251
  252 +# some str methods on Python 2.x return characters,
  253 +# while the equivalent bytes methods return integers on Python 3.x:
  254 +if sys.version_info[0] <= 2:
  255 + # Python 2.x - Characters (str)
  256 + BACKSLASH = '\\'
  257 + BRACE_OPEN = '{'
  258 + BRACE_CLOSE = '}'
  259 +else:
  260 + # Python 3.x - Integers
  261 + BACKSLASH = ord('\\')
  262 + BRACE_OPEN = ord('{')
  263 + BRACE_CLOSE = ord('}')
  264 +
253 265
254 #=== CLASSES ================================================================= 266 #=== CLASSES =================================================================
255 267
@@ -294,15 +306,15 @@ class RtfParser(object): @@ -294,15 +306,15 @@ class RtfParser(object):
294 def parse(self): 306 def parse(self):
295 self.index = 0 307 self.index = 0
296 while self.index < self.size: 308 while self.index < self.size:
297 - if self.data[self.index] == ord('{'): 309 + if self.data[self.index] == BRACE_OPEN:
298 self._open_group() 310 self._open_group()
299 self.index += 1 311 self.index += 1
300 continue 312 continue
301 - if self.data[self.index] == ord('}'): 313 + if self.data[self.index] == BRACE_CLOSE:
302 self._close_group() 314 self._close_group()
303 self.index += 1 315 self.index += 1
304 continue 316 continue
305 - if self.data[self.index] == ord('\\'): 317 + if self.data[self.index] == BACKSLASH:
306 m = re_control_word.match(self.data, self.index) 318 m = re_control_word.match(self.data, self.index)
307 if m: 319 if m:
308 cword = m.group(1) 320 cword = m.group(1)
@@ -332,7 +344,7 @@ class RtfParser(object): @@ -332,7 +344,7 @@ class RtfParser(object):
332 344
333 def _open_group(self): 345 def _open_group(self):
334 self.group_level += 1 346 self.group_level += 1
335 - log.debug('{ Open Group at index %Xh - level=%d' % (self.index, self.group_level)) 347 + #log.debug('{ Open Group at index %Xh - level=%d' % (self.index, self.group_level))
336 # call user method AFTER increasing the level: 348 # call user method AFTER increasing the level:
337 self.open_group() 349 self.open_group()
338 350
@@ -341,19 +353,20 @@ class RtfParser(object): @@ -341,19 +353,20 @@ class RtfParser(object):
341 pass 353 pass
342 354
343 def _close_group(self): 355 def _close_group(self):
344 - log.debug('} Close Group at index %Xh - level=%d' % (self.index, self.group_level)) 356 + #log.debug('} Close Group at index %Xh - level=%d' % (self.index, self.group_level))
345 # call user method BEFORE decreasing the level: 357 # call user method BEFORE decreasing the level:
346 self.close_group() 358 self.close_group()
347 # if the destination level is the same as the group level, close the destination: 359 # if the destination level is the same as the group level, close the destination:
348 if self.group_level == self.current_destination.group_level: 360 if self.group_level == self.current_destination.group_level:
349 - log.debug('Current Destination %r level = %d => Close Destination' % (  
350 - self.current_destination.cword, self.current_destination.group_level)) 361 + # log.debug('Current Destination %r level = %d => Close Destination' % (
  362 + # self.current_destination.cword, self.current_destination.group_level))
351 self._close_destination() 363 self._close_destination()
352 else: 364 else:
353 - log.debug('Current Destination %r level = %d => Continue with same Destination' % (  
354 - self.current_destination.cword, self.current_destination.group_level)) 365 + # log.debug('Current Destination %r level = %d => Continue with same Destination' % (
  366 + # self.current_destination.cword, self.current_destination.group_level))
  367 + pass
355 self.group_level -= 1 368 self.group_level -= 1
356 - log.debug('Decreased group level to %d' % self.group_level) 369 + # log.debug('Decreased group level to %d' % self.group_level)
357 370
358 def close_group(self): 371 def close_group(self):
359 #log.debug('close group at index %Xh' % self.index) 372 #log.debug('close group at index %Xh' % self.index)
@@ -369,7 +382,7 @@ class RtfParser(object): @@ -369,7 +382,7 @@ class RtfParser(object):
369 self.current_destination = new_dest 382 self.current_destination = new_dest
370 # start of the destination is right after the control word: 383 # start of the destination is right after the control word:
371 new_dest.start = self.index + len(matchobject.group()) 384 new_dest.start = self.index + len(matchobject.group())
372 - log.debug("Open Destination %r start=%Xh - level=%d" % (cword, new_dest.start, new_dest.group_level)) 385 + # log.debug("Open Destination %r start=%Xh - level=%d" % (cword, new_dest.start, new_dest.group_level))
373 # call the corresponding user method for additional processing: 386 # call the corresponding user method for additional processing:
374 self.open_destination(self.current_destination) 387 self.open_destination(self.current_destination)
375 388
@@ -377,8 +390,8 @@ class RtfParser(object): @@ -377,8 +390,8 @@ class RtfParser(object):
377 pass 390 pass
378 391
379 def _close_destination(self): 392 def _close_destination(self):
380 - log.debug("Close Destination %r end=%Xh - level=%d" % (self.current_destination.cword,  
381 - self.index, self.current_destination.group_level)) 393 + # log.debug("Close Destination %r end=%Xh - level=%d" % (self.current_destination.cword,
  394 + # self.index, self.current_destination.group_level))
382 self.current_destination.end = self.index 395 self.current_destination.end = self.index
383 # call the corresponding user method for additional processing: 396 # call the corresponding user method for additional processing:
384 self.close_destination(self.current_destination) 397 self.close_destination(self.current_destination)
@@ -388,7 +401,8 @@ class RtfParser(object): @@ -388,7 +401,8 @@ class RtfParser(object):
388 if len(self.destinations) > 0: 401 if len(self.destinations) > 0:
389 self.current_destination = self.destinations[-1] 402 self.current_destination = self.destinations[-1]
390 else: 403 else:
391 - log.debug('All destinations are closed, keeping the document destination open') 404 + # log.debug('All destinations are closed, keeping the document destination open')
  405 + pass
392 406
393 def close_destination(self, destination): 407 def close_destination(self, destination):
394 pass 408 pass
@@ -430,10 +444,10 @@ class RtfParser(object): @@ -430,10 +444,10 @@ class RtfParser(object):
430 pass 444 pass
431 445
432 def _end_of_file(self): 446 def _end_of_file(self):
433 - log.debug('%Xh Reached End of File') 447 + # log.debug('%Xh Reached End of File')
434 # close any group/destination that is still open: 448 # close any group/destination that is still open:
435 while self.group_level > 0: 449 while self.group_level > 0:
436 - log.debug('Group Level = %d, closing group' % self.group_level) 450 + # log.debug('Group Level = %d, closing group' % self.group_level)
437 self._close_group() 451 self._close_group()
438 self.end_of_file() 452 self.end_of_file()
439 453
@@ -458,7 +472,7 @@ class RtfObjParser(RtfParser): @@ -458,7 +472,7 @@ class RtfObjParser(RtfParser):
458 if destination.cword == b'objdata': 472 if destination.cword == b'objdata':
459 log.debug('*** Close object data at index %Xh' % self.index) 473 log.debug('*** Close object data at index %Xh' % self.index)
460 # Filter out all whitespaces first (just ignored): 474 # Filter out all whitespaces first (just ignored):
461 - hexdata1 = destination.data.translate(TRANSTABLE_NOCHANGE, b' \t\r\n\f\v') 475 + hexdata1 = destination.data.translate(None, b' \t\r\n\f\v')
462 # Then filter out any other non-hex character: 476 # Then filter out any other non-hex character:
463 hexdata = re.sub(b'[^a-hA-H0-9]', b'', hexdata1) 477 hexdata = re.sub(b'[^a-hA-H0-9]', b'', hexdata1)
464 if len(hexdata) < len(hexdata1): 478 if len(hexdata) < len(hexdata1):
@@ -528,116 +542,116 @@ class RtfObjParser(RtfParser): @@ -528,116 +542,116 @@ class RtfObjParser(RtfParser):
528 542
529 #=== FUNCTIONS =============================================================== 543 #=== FUNCTIONS ===============================================================
530 544
531 -def rtf_iter_objects_old (filename, min_size=32):  
532 - """  
533 - Open a RTF file, extract each embedded object encoded in hexadecimal of  
534 - size > min_size, yield the index of the object in the RTF file and its data  
535 - in binary format.  
536 - This is an iterator.  
537 - """  
538 - data = open(filename, 'rb').read()  
539 - for m in re.finditer(PATTERN, data):  
540 - found = m.group(0)  
541 - orig_len = len(found)  
542 - # remove all whitespace and line feeds:  
543 - #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE  
544 - found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v}')  
545 - found = binascii.unhexlify(found)  
546 - #print repr(found)  
547 - if len(found)>min_size:  
548 - yield m.start(), orig_len, found 545 +# def rtf_iter_objects_old (filename, min_size=32):
  546 +# """
  547 +# Open a RTF file, extract each embedded object encoded in hexadecimal of
  548 +# size > min_size, yield the index of the object in the RTF file and its data
  549 +# in binary format.
  550 +# This is an iterator.
  551 +# """
  552 +# data = open(filename, 'rb').read()
  553 +# for m in re.finditer(PATTERN, data):
  554 +# found = m.group(0)
  555 +# orig_len = len(found)
  556 +# # remove all whitespace and line feeds:
  557 +# #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
  558 +# found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v}')
  559 +# found = binascii.unhexlify(found)
  560 +# #print repr(found)
  561 +# if len(found)>min_size:
  562 +# yield m.start(), orig_len, found
549 563
550 # TODO: backward-compatible API? 564 # TODO: backward-compatible API?
551 565
552 566
553 -def search_hex_block(data, pos=0, min_size=32, first=True):  
554 - if first:  
555 - # Search 1st occurence of a hex block:  
556 - match = re_hexblock.search(data, pos=pos)  
557 - else:  
558 - # Match next occurences of a hex block, from the current position only:  
559 - match = re_hexblock.match(data, pos=pos)  
560 -  
561 -  
562 -  
563 -def rtf_iter_objects (data, min_size=32):  
564 - """  
565 - Open a RTF file, extract each embedded object encoded in hexadecimal of  
566 - size > min_size, yield the index of the object in the RTF file and its data  
567 - in binary format.  
568 - This is an iterator.  
569 - """  
570 - # Search 1st occurence of a hex block:  
571 - match = re_hexblock.search(data)  
572 - if match is None:  
573 - log.debug('No hex block found.')  
574 - # no hex block found  
575 - return  
576 - while match is not None:  
577 - found = match.group(0)  
578 - # start index  
579 - start = match.start()  
580 - # current position  
581 - current = match.end()  
582 - log.debug('Found hex block starting at %08X, end %08X, size=%d' % (start, current, len(found)))  
583 - if len(found) < min_size:  
584 - log.debug('Too small - size<%d, ignored.' % min_size)  
585 - match = re_hexblock.search(data, pos=current)  
586 - continue  
587 - #log.debug('Match: %s' % found)  
588 - # remove all whitespace and line feeds:  
589 - #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE  
590 - found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')  
591 - # TODO: make it a function  
592 - # Also remove embedded RTF tags:  
593 - found = re_embedded_tags.sub('', found)  
594 - # object data extracted from the RTF file  
595 - # MS Word accepts an extra hex digit, so we need to trim it if present:  
596 - if len(found) & 1:  
597 - log.debug('Odd length, trimmed last byte.')  
598 - found = found[:-1]  
599 - #log.debug('Cleaned match: %s' % found)  
600 - objdata = binascii.unhexlify(found)  
601 - # Detect the "\bin" control word, which is sometimes used for obfuscation:  
602 - bin_match = re_delims_bin_decimal.match(data, pos=current)  
603 - while bin_match is not None:  
604 - log.debug('Found \\bin block starting at %08X : %r'  
605 - % (bin_match.start(), bin_match.group(0)))  
606 - # extract the decimal integer following '\bin'  
607 - bin_len = int(bin_match.group(1))  
608 - log.debug('\\bin block length = %d' % bin_len)  
609 - if current+bin_len > len(data):  
610 - log.error('\\bin block length is larger than the remaining data')  
611 - # move the current index, ignore the \bin block  
612 - current += len(bin_match.group(0))  
613 - break  
614 - # read that number of bytes:  
615 - objdata += data[current:current+bin_len]  
616 - # TODO: handle exception  
617 - current += len(bin_match.group(0)) + bin_len  
618 - # TODO: check if current is out of range  
619 - # TODO: is Word limiting the \bin length to a number of digits?  
620 - log.debug('Current position = %08X' % current)  
621 - match = re_delim_hexblock.match(data, pos=current)  
622 - if match is not None:  
623 - log.debug('Found next hex block starting at %08X, end %08X'  
624 - % (match.start(), match.end()))  
625 - found = match.group(0)  
626 - log.debug('Match: %s' % found)  
627 - # remove all whitespace and line feeds:  
628 - #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE  
629 - found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')  
630 - # Also remove embedded RTF tags:  
631 - found = re_embedded_tags.sub(found, '')  
632 - objdata += binascii.unhexlify(found)  
633 - current = match.end()  
634 - bin_match = re_delims_bin_decimal.match(data, pos=current)  
635 -  
636 - # print repr(found)  
637 - if len(objdata)>min_size:  
638 - yield start, current-start, objdata  
639 - # Search next occurrence of a hex block:  
640 - match = re_hexblock.search(data, pos=current) 567 +# def search_hex_block(data, pos=0, min_size=32, first=True):
  568 +# if first:
  569 +# # Search 1st occurrence of a hex block:
  570 +# match = re_hexblock.search(data, pos=pos)
  571 +# else:
  572 +# # Match next occurrences of a hex block, from the current position only:
  573 +# match = re_hexblock.match(data, pos=pos)
  574 +#
  575 +#
  576 +#
  577 +# def rtf_iter_objects (data, min_size=32):
  578 +# """
  579 +# Open a RTF file, extract each embedded object encoded in hexadecimal of
  580 +# size > min_size, yield the index of the object in the RTF file and its data
  581 +# in binary format.
  582 +# This is an iterator.
  583 +# """
  584 +# # Search 1st occurrence of a hex block:
  585 +# match = re_hexblock.search(data)
  586 +# if match is None:
  587 +# log.debug('No hex block found.')
  588 +# # no hex block found
  589 +# return
  590 +# while match is not None:
  591 +# found = match.group(0)
  592 +# # start index
  593 +# start = match.start()
  594 +# # current position
  595 +# current = match.end()
  596 +# log.debug('Found hex block starting at %08X, end %08X, size=%d' % (start, current, len(found)))
  597 +# if len(found) < min_size:
  598 +# log.debug('Too small - size<%d, ignored.' % min_size)
  599 +# match = re_hexblock.search(data, pos=current)
  600 +# continue
  601 +# #log.debug('Match: %s' % found)
  602 +# # remove all whitespace and line feeds:
  603 +# #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
  604 +# found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
  605 +# # TODO: make it a function
  606 +# # Also remove embedded RTF tags:
  607 +# found = re_embedded_tags.sub('', found)
  608 +# # object data extracted from the RTF file
  609 +# # MS Word accepts an extra hex digit, so we need to trim it if present:
  610 +# if len(found) & 1:
  611 +# log.debug('Odd length, trimmed last byte.')
  612 +# found = found[:-1]
  613 +# #log.debug('Cleaned match: %s' % found)
  614 +# objdata = binascii.unhexlify(found)
  615 +# # Detect the "\bin" control word, which is sometimes used for obfuscation:
  616 +# bin_match = re_delims_bin_decimal.match(data, pos=current)
  617 +# while bin_match is not None:
  618 +# log.debug('Found \\bin block starting at %08X : %r'
  619 +# % (bin_match.start(), bin_match.group(0)))
  620 +# # extract the decimal integer following '\bin'
  621 +# bin_len = int(bin_match.group(1))
  622 +# log.debug('\\bin block length = %d' % bin_len)
  623 +# if current+bin_len > len(data):
  624 +# log.error('\\bin block length is larger than the remaining data')
  625 +# # move the current index, ignore the \bin block
  626 +# current += len(bin_match.group(0))
  627 +# break
  628 +# # read that number of bytes:
  629 +# objdata += data[current:current+bin_len]
  630 +# # TODO: handle exception
  631 +# current += len(bin_match.group(0)) + bin_len
  632 +# # TODO: check if current is out of range
  633 +# # TODO: is Word limiting the \bin length to a number of digits?
  634 +# log.debug('Current position = %08X' % current)
  635 +# match = re_delim_hexblock.match(data, pos=current)
  636 +# if match is not None:
  637 +# log.debug('Found next hex block starting at %08X, end %08X'
  638 +# % (match.start(), match.end()))
  639 +# found = match.group(0)
  640 +# log.debug('Match: %s' % found)
  641 +# # remove all whitespace and line feeds:
  642 +# #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
  643 +# found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
  644 +# # Also remove embedded RTF tags:
  645 +# found = re_embedded_tags.sub('', found)
  646 +# objdata += binascii.unhexlify(found)
  647 +# current = match.end()
  648 +# bin_match = re_delims_bin_decimal.match(data, pos=current)
  649 +#
  650 +# # print repr(found)
  651 +# if len(objdata)>min_size:
  652 +# yield start, current-start, objdata
  653 +# # Search next occurrence of a hex block:
  654 +# match = re_hexblock.search(data, pos=current)
641 655
642 656
643 657