rtfobj, oleobj: fixed Python 2.6+2.7+3.x support

decalage2
1 parent 1cf591dd
Showing 2 changed files with 327 additions and 149 deletions
oletools/oleobj.py
oletools/rtfobj.py
 #!/usr/bin/env python
+from __future__ import print_function
 """
 oleobj.py
@@ -14,7 +15,7 @@ http://www.decalage.info/python/oletools
 # === LICENSE ==================================================================
-# oleobj is copyright (c) 2015 Philippe Lagadec (http://www.decalage.info)
+# oleobj is copyright (c) 2015-2016 Philippe Lagadec (http://www.decalage.info)
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification,
@@ -41,8 +42,11 @@ http://www.decalage.info/python/oletools
 #------------------------------------------------------------------------------
 # CHANGELOG:
 # 2015-12-05 v0.01 PL: - first version
+# 2016-06          PL: - added main and process_file (not working yet)
+# 2016-07-18 v0.48 SL: - added Python 3.5 support
+# 2016-07-19       PL: - fixed Python 2.6-7 support
-__version__ = '0.01'
+__version__ = '0.48'
 #------------------------------------------------------------------------------
 # TODO:
@@ -62,8 +66,10 @@ __version__ = &#39;0.01&#39;
 #--- IMPORTS ------------------------------------------------------------------
-import logging, struct
+import logging, struct, optparse, os, re, sys
+from thirdparty.olefile import olefile
+from thirdparty.xglob import xglob
 # === LOGGING =================================================================
@@ -107,6 +113,18 @@ def get_logger(name, level=logging.CRITICAL+1):
 log = get_logger('oleobj')
+# === CONSTANTS ==============================================================
+
+# some str methods on Python 2.x return characters,
+# while the equivalent bytes methods return integers on Python 3.x:
+if sys.version_info[0] <= 2:
+    # Python 2.x
+    NULL_CHAR = '\x00'
+else:
+    # Python 3.x
+    NULL_CHAR = 0
+
+
 # === GLOBAL VARIABLES =======================================================
 # struct to parse an unsigned integer of 32 bits:
@@ -162,7 +180,7 @@ def read_LengthPrefixedAnsiString(data):
     ansi_string = data[:length-1]
     # TODO: only in strict mode:
     # check the presence of the null char:
-    assert data[length] == 0
+    assert data[length] == NULL_CHAR
     new_data = data[length:]
     return (ansi_string, new_data)
@@ -285,3 +303,149 @@ class OleObject (object):
             self.data = data[:self.data_size]
             assert len(self.data) == self.data_size
             self.extra_data = data[self.data_size:]
+
+
+
+def sanitize_filename(filename, replacement='_', max_length=200):
+    """compute basename of filename. Replaces all non-whitelisted characters.
+       The returned filename is always a basename of the file."""
+    basepath = os.path.basename(filename).strip()
+    sane_fname = re.sub(r'[^\w\.\- ]', replacement, basepath)
+
+    while ".." in sane_fname:
+        sane_fname = sane_fname.replace('..', '.')
+
+    while "  " in sane_fname:
+        sane_fname = sane_fname.replace('  ', ' ')
+
+    if not len(filename):
+        sane_fname = 'NONAME'
+
+    # limit filename length
+    if max_length:
+        sane_fname = sane_fname[:max_length]
+
+    return sane_fname
+
+
+def process_file(container, filename, data, output_dir=None):
+    if output_dir:
+        if not os.path.isdir(output_dir):
+            log.info('creating output directory %s' % output_dir)
+            os.mkdir(output_dir)
+
+        fname_prefix = os.path.join(output_dir,
+                                    sanitize_filename(filename))
+    else:
+        base_dir = os.path.dirname(filename)
+        sane_fname = sanitize_filename(filename)
+        fname_prefix = os.path.join(base_dir, sane_fname)
+
+    # TODO: option to extract objects to files (false by default)
+    if data is None:
+        data = open(filename, 'rb').read()
+    print ('-'*79)
+    print ('File: %r - %d bytes' % (filename, len(data)))
+    ole = olefile.OleFileIO(data)
+    index = 1
+    for stream in ole.listdir():
+        objdata = ole.openstream(stream).read()
+        stream_path = '/'.join(stream)
+        log.debug('Checking stream %r' % stream_path)
+        obj = OleObject()
+        try:
+            obj.parse(objdata)
+            print('extract file embedded in OLE object from stream %r:' % stream_path)
+            print('format_id  = %d' % obj.format_id)
+            print('class name = %r' % obj.class_name)
+            print('data size  = %d' % obj.data_size)
+            # set a file extension according to the class name:
+            class_name = obj.class_name.lower()
+            if class_name.startswith('word'):
+                ext = 'doc'
+            elif class_name.startswith('package'):
+                ext = 'package'
+            else:
+                ext = 'bin'
+
+            fname = '%s_object_%03d.%s' % (fname_prefix, index, ext)
+            print ('saving to file %s' % fname)
+            open(fname, 'wb').write(obj.data)
+            if obj.class_name.lower() == 'package':
+                print ('Parsing OLE Package')
+                opkg = OleNativeStream(bindata=obj.data)
+                print ('Filename = %r' % opkg.filename)
+                print ('Source path = %r' % opkg.src_path)
+                print ('Temp path = %r' % opkg.temp_path)
+                if opkg.filename:
+                    fname = '%s_%s' % (fname_prefix,
+                                       sanitize_filename(opkg.filename))
+                else:
+                    fname = '%s_object_%03d.noname' % (fname_prefix, index)
+                print ('saving to file %s' % fname)
+                open(fname, 'wb').write(opkg.data)
+            index += 1
+        except:
+            log.info('*** Not an OLE 1.0 Object')
+
+
+
+#=== MAIN =================================================================
+
+if __name__ == '__main__':
+    # print banner with version
+    print ('oleobj %s - http://decalage.info/oletools' % __version__)
+    print ('THIS IS WORK IN PROGRESS - Check updates regularly!')
+    print ('Please report any issue at https://github.com/decalage2/oletools/issues')
+    print ('')
+
+    DEFAULT_LOG_LEVEL = "warning" # Default log level
+    LOG_LEVELS = {'debug':    logging.DEBUG,
+              'info':     logging.INFO,
+              'warning':  logging.WARNING,
+              'error':    logging.ERROR,
+              'critical': logging.CRITICAL
+             }
+
+    usage = 'usage: %prog [options] <filename> [filename2 ...]'
+    parser = optparse.OptionParser(usage=usage)
+    # parser.add_option('-o', '--outfile', dest='outfile',
+    #     help='output file')
+    # parser.add_option('-c', '--csv', dest='csv',
+    #     help='export results to a CSV file')
+    parser.add_option("-r", action="store_true", dest="recursive",
+        help='find files recursively in subdirectories.')
+    parser.add_option("-d", type="str", dest="output_dir",
+        help='use specified directory to output files.', default=None)
+    parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
+        help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)')
+    parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
+        help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
+    parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
+                            help="logging level debug/info/warning/error/critical (default=%default)")
+
+    (options, args) = parser.parse_args()
+
+    # Print help if no arguments are passed
+    if len(args) == 0:
+        print (__doc__)
+        parser.print_help()
+        sys.exit()
+
+    # Setup logging to the console:
+    # here we use stdout instead of stderr by default, so that the output
+    # can be redirected properly.
+    logging.basicConfig(level=LOG_LEVELS[options.loglevel], stream=sys.stdout,
+                        format='%(levelname)-8s %(message)s')
+    # enable logging in the modules:
+    log.setLevel(logging.NOTSET)
+
+
+    for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
+        zip_password=options.zip_password, zip_fname=options.zip_fname):
+        # ignore directory names stored in zip files:
+        if container and filename.endswith('/'):
+            continue
+        process_file(container, filename, data, options.output_dir)
+
+
@@ -55,18 +55,20 @@ http://www.decalage.info/python/oletools
 #                  TJ: - sanitize filenames to avoid special characters
 # 2016-05-29       PL: - improved parsing, fixed issue #42
 # 2016-07-13 v0.48 PL: - new RtfParser and RtfObjParser classes
+# 2016-07-18       SL: - added Python 3.5 support
+# 2016-07-19       PL: - fixed Python 2.6-2.7 support
 __version__ = '0.48'
-#------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
 # TODO:
 # - allow semicolon within hex, as found in  this sample:
 #   http://contagiodump.blogspot.nl/2011/10/sep-28-cve-2010-3333-manuscript-with.html
-#=== IMPORTS =================================================================
+# === IMPORTS =================================================================
-import re, os, sys, string, binascii, logging, optparse
+import re, os, sys, binascii, logging, optparse
 from thirdparty.xglob import xglob
 from oleobj import OleObject, OleNativeStream
@@ -120,7 +122,7 @@ log = get_logger(&#39;rtfobj&#39;)
 # REGEX pattern to extract embedded OLE objects in hexadecimal format:
 # alphanum digit: [0-9A-Fa-f]
-HEX_DIGIT = rb'[0-9A-Fa-f]'
+HEX_DIGIT = b'[0-9A-Fa-f]'
 # hex char = two alphanum digits: [0-9A-Fa-f]{2}
 # HEX_CHAR = r'[0-9A-Fa-f]{2}'
@@ -130,11 +132,11 @@ HEX_DIGIT = rb&#39;[0-9A-Fa-f]&#39;
 # AND the tags can be nested...
 #SINGLE_RTF_TAG = r'[{][^{}]*[}]'
 # Actually RTF tags may contain braces escaped with backslash (\{ \}):
-SINGLE_RTF_TAG = rb'[{](?:\\.|[^{}\])*[}]'
+SINGLE_RTF_TAG = b'[{](?:\\\\.|[^{}\\\])*[}]'
 # Nested tags, two levels (because Python's re does not support nested matching):
 # NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]'
-NESTED_RTF_TAG = rb'[{](?:\\.|[^{}\]|'+SINGLE_RTF_TAG+b')*[}]'
+NESTED_RTF_TAG = b'[{](?:\\\\.|[^{}\\\]|'+SINGLE_RTF_TAG+b')*[}]'
 # AND it is also allowed to insert ANY control word or control symbol (ignored)
 # According to Rich Text Format (RTF) Specification Version 1.9.1,
@@ -146,7 +148,7 @@ NESTED_RTF_TAG = rb&#39;[{](?:\\.|[^{}\\]|&#39;+SINGLE_RTF_TAG+b&#39;)*[}]&#39;
 # "\AnyThing " "\AnyThing123z" ""\AnyThing-456{" "\AnyThing{"
 # control symbol = \<any char except letter or digit> (followed by anything)
-ASCII_NAME = rb'([a-zA-Z]{1,250})'
+ASCII_NAME = b'([a-zA-Z]{1,250})'
 # using Python's re lookahead assumption:
 # (?=...) Matches if ... matches next, but doesn't consume any of the string.
@@ -155,21 +157,21 @@ ASCII_NAME = rb&#39;([a-zA-Z]{1,250})&#39;
 # TODO: Find the actual limit on the number of digits for Word
 # SIGNED_INTEGER = r'(-?\d{1,250})'
-SIGNED_INTEGER = rb'(-?\d+)'
+SIGNED_INTEGER = b'(-?\\d+)'
-CONTROL_WORD = rb'(?:\\' + ASCII_NAME + rb'(?:(?=[^a-zA-Z0-9-])|' + SIGNED_INTEGER + rb'(?=[^0-9])))'
+CONTROL_WORD = b'(?:\\\\' + ASCII_NAME + b'(?:(?=[^a-zA-Z0-9-])|' + SIGNED_INTEGER + b'(?=[^0-9])))'
 re_control_word = re.compile(CONTROL_WORD)
-CONTROL_SYMBOL = rb'(?:\[^a-zA-Z0-9])'
+CONTROL_SYMBOL = b'(?:\\\[^a-zA-Z0-9])'
 re_control_symbol = re.compile(CONTROL_SYMBOL)
 # Text that is not a control word/symbol or a group:
-TEXT = rb'[^{}\]+'
+TEXT = b'[^{}\\\]+'
 re_text = re.compile(TEXT)
 # ignored whitespaces and tags within a hex block:
-IGNORED = rb'(?:\s|'+NESTED_RTF_TAG+rb'|'+CONTROL_SYMBOL+rb'|'+CONTROL_WORD+rb')*'
+IGNORED = b'(?:\\s|'+NESTED_RTF_TAG+b'|'+CONTROL_SYMBOL+b'|'+CONTROL_WORD+b')*'
 #IGNORED = r'\s*'
 # HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT
@@ -189,27 +191,24 @@ IGNORED = rb&#39;(?:\s|&#39;+NESTED_RTF_TAG+rb&#39;|&#39;+CONTROL_SYMBOL+rb&#39;|&#39;+CONTROL_WORD+rb&#39;)
 #TODO PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b'
 # PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}' #+ HEX_CHAR + r'\b'
-PATTERN = rb'\b(?:' + HEX_DIGIT + IGNORED + rb'){7,}' + HEX_DIGIT + rb'\b'
+PATTERN = b'\\b(?:' + HEX_DIGIT + IGNORED + b'){7,}' + HEX_DIGIT + b'\\b'
 # at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s*
 # PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}'
 # improved pattern, allowing semicolons within hex:
 #PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}'
-# a dummy translation table for str.translate, which does not change anythying:
-TRANSTABLE_NOCHANGE = bytes.maketrans(b'', b'')
-
 re_hexblock = re.compile(PATTERN)
 re_embedded_tags = re.compile(IGNORED)
-re_decimal = re.compile(rb'\d+')
+re_decimal = re.compile(b'\\d+')
-re_delimiter = re.compile(rb'[ \t\r\n\f\v]')
+re_delimiter = re.compile(b'[ \\t\\r\\n\\f\\v]')
-DELIMITER = rb'[ \t\r\n\f\v]'
-DELIMITERS_ZeroOrMore = rb'[ \t\r\n\f\v]*'
-BACKSLASH_BIN = rb'\\bin'
+DELIMITER = b'[ \\t\\r\\n\\f\\v]'
+DELIMITERS_ZeroOrMore = b'[ \\t\\r\\n\\f\\v]*'
+BACKSLASH_BIN = b'\\\\bin'
 # According to my tests, Word accepts up to 250 digits (leading zeroes)
-DECIMAL_GROUP = rb'(\d{1,250})'
+DECIMAL_GROUP = b'(\d{1,250})'
 re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + BACKSLASH_BIN
                                    + DECIMAL_GROUP + DELIMITER)
@@ -250,6 +249,19 @@ DESTINATION_CONTROL_WORDS = frozenset((
     ))
+# some str methods on Python 2.x return characters,
+# while the equivalent bytes methods return integers on Python 3.x:
+if sys.version_info[0] <= 2:
+    # Python 2.x - Characters (str)
+    BACKSLASH = '\\'
+    BRACE_OPEN = '{'
+    BRACE_CLOSE = '}'
+else:
+    # Python 3.x - Integers
+    BACKSLASH = ord('\\')
+    BRACE_OPEN = ord('{')
+    BRACE_CLOSE = ord('}')
+
 #=== CLASSES =================================================================
@@ -294,15 +306,15 @@ class RtfParser(object):
     def parse(self):
         self.index = 0
         while self.index < self.size:
-            if self.data[self.index] == ord('{'):
+            if self.data[self.index] == BRACE_OPEN:
                 self._open_group()
                 self.index += 1
                 continue
-            if self.data[self.index] == ord('}'):
+            if self.data[self.index] == BRACE_CLOSE:
                 self._close_group()
                 self.index += 1
                 continue
-            if self.data[self.index] == ord('\\'):
+            if self.data[self.index] == BACKSLASH:
                 m = re_control_word.match(self.data, self.index)
                 if m:
                     cword = m.group(1)
@@ -332,7 +344,7 @@ class RtfParser(object):
     def _open_group(self):
         self.group_level += 1
-        log.debug('{ Open Group at index %Xh - level=%d' % (self.index, self.group_level))
+        #log.debug('{ Open Group at index %Xh - level=%d' % (self.index, self.group_level))
         # call user method AFTER increasing the level:
         self.open_group()
@@ -341,19 +353,20 @@ class RtfParser(object):
         pass
     def _close_group(self):
-        log.debug('} Close Group at index %Xh - level=%d' % (self.index, self.group_level))
+        #log.debug('} Close Group at index %Xh - level=%d' % (self.index, self.group_level))
         # call user method BEFORE decreasing the level:
         self.close_group()
         # if the destination level is the same as the group level, close the destination:
         if self.group_level == self.current_destination.group_level:
-            log.debug('Current Destination %r level = %d => Close Destination' % (
-                self.current_destination.cword, self.current_destination.group_level))
+            # log.debug('Current Destination %r level = %d => Close Destination' % (
+            #     self.current_destination.cword, self.current_destination.group_level))
             self._close_destination()
         else:
-            log.debug('Current Destination %r level = %d => Continue with same Destination' % (
-                self.current_destination.cword, self.current_destination.group_level))
+            # log.debug('Current Destination %r level = %d => Continue with same Destination' % (
+            #     self.current_destination.cword, self.current_destination.group_level))
+            pass
         self.group_level -= 1
-        log.debug('Decreased group level to %d' % self.group_level)
+        # log.debug('Decreased group level to %d' % self.group_level)
     def close_group(self):
         #log.debug('close group at index %Xh' % self.index)
@@ -369,7 +382,7 @@ class RtfParser(object):
         self.current_destination = new_dest
         # start of the destination is right after the control word:
         new_dest.start = self.index + len(matchobject.group())
-        log.debug("Open Destination %r start=%Xh - level=%d" % (cword, new_dest.start, new_dest.group_level))
+        # log.debug("Open Destination %r start=%Xh - level=%d" % (cword, new_dest.start, new_dest.group_level))
         # call the corresponding user method for additional processing:
         self.open_destination(self.current_destination)
@@ -377,8 +390,8 @@ class RtfParser(object):
         pass
     def _close_destination(self):
-        log.debug("Close Destination %r end=%Xh - level=%d" % (self.current_destination.cword,
-            self.index, self.current_destination.group_level))
+        # log.debug("Close Destination %r end=%Xh - level=%d" % (self.current_destination.cword,
+        #     self.index, self.current_destination.group_level))
         self.current_destination.end = self.index
         # call the corresponding user method for additional processing:
         self.close_destination(self.current_destination)
@@ -388,7 +401,8 @@ class RtfParser(object):
         if len(self.destinations) > 0:
             self.current_destination = self.destinations[-1]
         else:
-            log.debug('All destinations are closed, keeping the document destination open')
+            # log.debug('All destinations are closed, keeping the document destination open')
+            pass
     def close_destination(self, destination):
         pass
@@ -430,10 +444,10 @@ class RtfParser(object):
         pass
     def _end_of_file(self):
-        log.debug('%Xh Reached End of File')
+        # log.debug('%Xh Reached End of File')
         # close any group/destination that is still open:
         while self.group_level > 0:
-            log.debug('Group Level = %d, closing group' % self.group_level)
+            # log.debug('Group Level = %d, closing group' % self.group_level)
             self._close_group()
         self.end_of_file()
@@ -458,7 +472,7 @@ class RtfObjParser(RtfParser):
         if destination.cword == b'objdata':
             log.debug('*** Close object data at index %Xh' % self.index)
             # Filter out all whitespaces first (just ignored):
-            hexdata1 = destination.data.translate(TRANSTABLE_NOCHANGE, b' \t\r\n\f\v')
+            hexdata1 = destination.data.translate(None, b' \t\r\n\f\v')
             # Then filter out any other non-hex character:
             hexdata = re.sub(b'[^a-hA-H0-9]', b'', hexdata1)
             if len(hexdata) < len(hexdata1):
@@ -528,116 +542,116 @@ class RtfObjParser(RtfParser):
 #=== FUNCTIONS ===============================================================
-def rtf_iter_objects_old (filename, min_size=32):
-    """
-    Open a RTF file, extract each embedded object encoded in hexadecimal of
-    size > min_size, yield the index of the object in the RTF file and its data
-    in binary format.
-    This is an iterator.
-    """
-    data = open(filename, 'rb').read()
-    for m in re.finditer(PATTERN, data):
-        found = m.group(0)
-        orig_len = len(found)
-        # remove all whitespace and line feeds:
-        #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
-        found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v}')
-        found = binascii.unhexlify(found)
-        #print repr(found)
-        if len(found)>min_size:
-            yield m.start(), orig_len, found
+# def rtf_iter_objects_old (filename, min_size=32):
+#     """
+#     Open a RTF file, extract each embedded object encoded in hexadecimal of
+#     size > min_size, yield the index of the object in the RTF file and its data
+#     in binary format.
+#     This is an iterator.
+#     """
+#     data = open(filename, 'rb').read()
+#     for m in re.finditer(PATTERN, data):
+#         found = m.group(0)
+#         orig_len = len(found)
+#         # remove all whitespace and line feeds:
+#         #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
+#         found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v}')
+#         found = binascii.unhexlify(found)
+#         #print repr(found)
+#         if len(found)>min_size:
+#             yield m.start(), orig_len, found
 # TODO: backward-compatible API?
-def search_hex_block(data, pos=0, min_size=32, first=True):
-    if first:
-        # Search 1st occurence of a hex block:
-        match = re_hexblock.search(data, pos=pos)
-    else:
-        # Match next occurences of a hex block, from the current position only:
-        match = re_hexblock.match(data, pos=pos)
-
-
-
-def rtf_iter_objects (data, min_size=32):
-    """
-    Open a RTF file, extract each embedded object encoded in hexadecimal of
-    size > min_size, yield the index of the object in the RTF file and its data
-    in binary format.
-    This is an iterator.
-    """
-    # Search 1st occurence of a hex block:
-    match = re_hexblock.search(data)
-    if match is None:
-        log.debug('No hex block found.')
-        # no hex block found
-        return
-    while match is not None:
-        found = match.group(0)
-        # start index
-        start = match.start()
-        # current position
-        current = match.end()
-        log.debug('Found hex block starting at %08X, end %08X, size=%d' % (start, current, len(found)))
-        if len(found) < min_size:
-            log.debug('Too small - size<%d, ignored.' % min_size)
-            match = re_hexblock.search(data, pos=current)
-            continue
-        #log.debug('Match: %s' % found)
-        # remove all whitespace and line feeds:
-        #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
-        found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
-        # TODO: make it a function
-        # Also remove embedded RTF tags:
-        found = re_embedded_tags.sub('', found)
-        # object data extracted from the RTF file
-        # MS Word accepts an extra hex digit, so we need to trim it if present:
-        if len(found) & 1:
-            log.debug('Odd length, trimmed last byte.')
-            found = found[:-1]
-        #log.debug('Cleaned match: %s' % found)
-        objdata = binascii.unhexlify(found)
-        # Detect the "\bin" control word, which is sometimes used for obfuscation:
-        bin_match = re_delims_bin_decimal.match(data, pos=current)
-        while bin_match is not None:
-            log.debug('Found \\bin block starting at %08X : %r'
-                          % (bin_match.start(), bin_match.group(0)))
-            # extract the decimal integer following '\bin'
-            bin_len = int(bin_match.group(1))
-            log.debug('\\bin block length = %d' % bin_len)
-            if current+bin_len > len(data):
-                log.error('\\bin block length is larger than the remaining data')
-                # move the current index, ignore the \bin block
-                current += len(bin_match.group(0))
-                break
-            # read that number of bytes:
-            objdata += data[current:current+bin_len]
-            # TODO: handle exception
-            current += len(bin_match.group(0)) + bin_len
-            # TODO: check if current is out of range
-            # TODO: is Word limiting the \bin length to a number of digits?
-            log.debug('Current position = %08X' % current)
-            match = re_delim_hexblock.match(data, pos=current)
-            if match is not None:
-                log.debug('Found next hex block starting at %08X, end %08X'
-                    % (match.start(), match.end()))
-                found = match.group(0)
-                log.debug('Match: %s' % found)
-                # remove all whitespace and line feeds:
-                #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
-                found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
-                # Also remove embedded RTF tags:
-                found = re_embedded_tags.sub(found, '')
-                objdata += binascii.unhexlify(found)
-                current = match.end()
-            bin_match = re_delims_bin_decimal.match(data, pos=current)
-
-        # print repr(found)
-        if len(objdata)>min_size:
-            yield start, current-start, objdata
-        # Search next occurence of a hex block:
-        match = re_hexblock.search(data, pos=current)
+# def search_hex_block(data, pos=0, min_size=32, first=True):
+#     if first:
+#         # Search 1st occurence of a hex block:
+#         match = re_hexblock.search(data, pos=pos)
+#     else:
+#         # Match next occurences of a hex block, from the current position only:
+#         match = re_hexblock.match(data, pos=pos)
+#
+#
+#
+# def rtf_iter_objects (data, min_size=32):
+#     """
+#     Open a RTF file, extract each embedded object encoded in hexadecimal of
+#     size > min_size, yield the index of the object in the RTF file and its data
+#     in binary format.
+#     This is an iterator.
+#     """
+#     # Search 1st occurence of a hex block:
+#     match = re_hexblock.search(data)
+#     if match is None:
+#         log.debug('No hex block found.')
+#         # no hex block found
+#         return
+#     while match is not None:
+#         found = match.group(0)
+#         # start index
+#         start = match.start()
+#         # current position
+#         current = match.end()
+#         log.debug('Found hex block starting at %08X, end %08X, size=%d' % (start, current, len(found)))
+#         if len(found) < min_size:
+#             log.debug('Too small - size<%d, ignored.' % min_size)
+#             match = re_hexblock.search(data, pos=current)
+#             continue
+#         #log.debug('Match: %s' % found)
+#         # remove all whitespace and line feeds:
+#         #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
+#         found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
+#         # TODO: make it a function
+#         # Also remove embedded RTF tags:
+#         found = re_embedded_tags.sub('', found)
+#         # object data extracted from the RTF file
+#         # MS Word accepts an extra hex digit, so we need to trim it if present:
+#         if len(found) & 1:
+#             log.debug('Odd length, trimmed last byte.')
+#             found = found[:-1]
+#         #log.debug('Cleaned match: %s' % found)
+#         objdata = binascii.unhexlify(found)
+#         # Detect the "\bin" control word, which is sometimes used for obfuscation:
+#         bin_match = re_delims_bin_decimal.match(data, pos=current)
+#         while bin_match is not None:
+#             log.debug('Found \\bin block starting at %08X : %r'
+#                           % (bin_match.start(), bin_match.group(0)))
+#             # extract the decimal integer following '\bin'
+#             bin_len = int(bin_match.group(1))
+#             log.debug('\\bin block length = %d' % bin_len)
+#             if current+bin_len > len(data):
+#                 log.error('\\bin block length is larger than the remaining data')
+#                 # move the current index, ignore the \bin block
+#                 current += len(bin_match.group(0))
+#                 break
+#             # read that number of bytes:
+#             objdata += data[current:current+bin_len]
+#             # TODO: handle exception
+#             current += len(bin_match.group(0)) + bin_len
+#             # TODO: check if current is out of range
+#             # TODO: is Word limiting the \bin length to a number of digits?
+#             log.debug('Current position = %08X' % current)
+#             match = re_delim_hexblock.match(data, pos=current)
+#             if match is not None:
+#                 log.debug('Found next hex block starting at %08X, end %08X'
+#                     % (match.start(), match.end()))
+#                 found = match.group(0)
+#                 log.debug('Match: %s' % found)
+#                 # remove all whitespace and line feeds:
+#                 #NOTE: with Python 2.6+, we could use None instead of TRANSTABLE_NOCHANGE
+#                 found = found.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
+#                 # Also remove embedded RTF tags:
+#                 found = re_embedded_tags.sub(found, '')
+#                 objdata += binascii.unhexlify(found)
+#                 current = match.end()
+#             bin_match = re_delims_bin_decimal.match(data, pos=current)
+#
+#         # print repr(found)
+#         if len(objdata)>min_size:
+#             yield start, current-start, objdata
+#         # Search next occurence of a hex block:
+#         match = re_hexblock.search(data, pos=current)