Merge pull request #412 from christian-intra2net/ooxml-accept-OpenOffice

Avoid error in ooxml for non-office zip files

Merge pull request #412 from christian-intra2net/ooxml-accept-OpenOffice
Avoid error in ooxml for non-office zip files
Philippe Lagadec · GitHub
2 parents f028496d a55bd780
Showing 7 changed files with 77 additions and 62 deletions
oletools/oleobj.py
oletools/ooxml.py
tests/oleobj/test_basic.py
tests/ooxml/test_basic.py
tests/test-data/oleobj/embedded-simple-2007.odp
tests/test-data/oleobj/embedded-simple-2007.ods
tests/test-data/oleobj/embedded-simple-2007.odt
@@ -750,13 +750,13 @@ def process_file(filename, data, output_dir=None):
  
     xml_parser = None
     if is_zipfile(filename):
-        log.info('file is a OOXML file, looking for relationships with external links')
+        log.info('file could be an OOXML file, looking for relationships with '
+                 'external links')
         xml_parser = XmlParser(filename)
         for relationship, target in find_external_relationships(xml_parser):
             did_dump = True
             print("Found relationship '%s' with external link %s" % (relationship, target))
  
-
     # look for ole files inside file (e.g. unzip docx)
     # have to finish work on every ole stream inside iteration, since handles
     # are closed in find_ole
@@ -765,9 +765,9 @@ def process_file(filename, data, output_dir=None):
             continue
  
         for path_parts in ole.listdir():
+            stream_path = '/'.join(path_parts)
+            log.debug('Checking stream %r', stream_path)
             if path_parts[-1] == '\x01Ole10Native':
-                stream_path = '/'.join(path_parts)
-                log.debug('Checking stream %r', stream_path)
                 stream = None
                 try:
                     stream = ole.openstream(path_parts)
@@ -16,11 +16,11 @@ TODO: &quot;xml2003&quot; == &quot;flatopc&quot;?
 """
  
 import sys
-from oletools.common.log_helper import log_helper
 from zipfile import ZipFile, BadZipfile, is_zipfile
 from os.path import splitext
 import io
 import re
+from oletools.common.log_helper import log_helper
  
 # import lxml or ElementTree for XML parsing:
 try:
@@ -107,16 +107,14 @@ def debug_str(elem):
     text = u', '.join(parts)
     if len(text) > 150:
         return text[:147] + u'...]'
-    else:
-        return text + u']'
+    return text + u']'
  
  
 def isstr(some_var):
     """ version-independent test for isinstance(some_var, (str, unicode)) """
     if sys.version_info.major == 2:
         return isinstance(some_var, basestring)  # true for str and unicode
-    else:
-        return isinstance(some_var, str)         # there is no unicode
+    return isinstance(some_var, str)         # there is no unicode
  
  
 ###############################################################################
@@ -136,23 +134,29 @@ def get_type(filename):
         prog_id = match.groups()[0]
         if prog_id == WORD_XML_PROG_ID:
             return DOCTYPE_WORD_XML
-        elif prog_id == EXCEL_XML_PROG_ID:
+        if prog_id == EXCEL_XML_PROG_ID:
             return DOCTYPE_EXCEL_XML
-        else:
-            return DOCTYPE_NONE
+        return DOCTYPE_NONE
  
     is_doc = False
     is_xls = False
     is_ppt = False
-    for _, elem, _ in parser.iter_xml(FILE_CONTENT_TYPES):
-        logger.debug(u'  ' + debug_str(elem))
-        try:
-            content_type = elem.attrib['ContentType']
-        except KeyError:         # ContentType not an attr
-            continue
-        is_xls |= content_type.startswith(CONTENT_TYPES_EXCEL)
-        is_doc |= content_type.startswith(CONTENT_TYPES_WORD)
-        is_ppt |= content_type.startswith(CONTENT_TYPES_PPT)
+    try:
+        for _, elem, _ in parser.iter_xml(FILE_CONTENT_TYPES):
+            logger.debug(u'  ' + debug_str(elem))
+            try:
+                content_type = elem.attrib['ContentType']
+            except KeyError:         # ContentType not an attr
+                continue
+            is_xls |= content_type.startswith(CONTENT_TYPES_EXCEL)
+            is_doc |= content_type.startswith(CONTENT_TYPES_WORD)
+            is_ppt |= content_type.startswith(CONTENT_TYPES_PPT)
+    except BadOOXML as oo_err:
+        if oo_err.more_info.startswith('invalid subfile') and \
+                FILE_CONTENT_TYPES in oo_err.more_info:
+            # no FILE_CONTENT_TYPES in zip, so probably no ms office xml.
+            return DOCTYPE_NONE
+        raise
  
     if is_doc and not is_xls and not is_ppt:
         return DOCTYPE_WORD
@@ -162,9 +166,8 @@ def get_type(filename):
         return DOCTYPE_POWERPOINT
     if not is_doc and not is_xls and not is_ppt:
         return DOCTYPE_NONE
-    else:
-        logger.warning('Encountered contradictory content types')
-        return DOCTYPE_MIXED
+    logger.warning('Encountered contradictory content types')
+    return DOCTYPE_MIXED
  
  
 def is_ooxml(filename):
@@ -177,6 +180,7 @@ def is_ooxml(filename):
         return False
     if doctype == DOCTYPE_NONE:
         return False
+    return True
  
  
 ###############################################################################
@@ -216,6 +220,7 @@ class ZipSubFile(object):
     See also (and maybe could some day merge with):
     ppt_record_parser.IterStream; also: oleobj.FakeFile
     """
+    CHUNK_SIZE = 4096
  
     def __init__(self, container, filename, mode='r', size=None):
         """ remember all necessary vars but do not open yet """
@@ -253,7 +258,7 @@ class ZipSubFile(object):
         # print('ZipSubFile: opened; size={}'.format(self.size))
         return self
  
-    def write(self, *args, **kwargs):                          # pylint: disable=unused-argument,no-self-use
+    def write(self, *args, **kwargs):
         """ write is not allowed """
         raise IOError('writing not implemented')
  
@@ -311,10 +316,9 @@ class ZipSubFile(object):
         """ helper for seek: skip forward by given amount using read() """
         # print('ZipSubFile: seek by skipping {} bytes starting at {}'
         #       .format(self.pos, to_skip))
-        CHUNK_SIZE = 4096
-        n_chunks, leftover = divmod(to_skip, CHUNK_SIZE)
+        n_chunks, leftover = divmod(to_skip, self.CHUNK_SIZE)
         for _ in range(n_chunks):
-            self.read(CHUNK_SIZE)    # just read and discard
+            self.read(self.CHUNK_SIZE)    # just read and discard
         self.read(leftover)
         # print('ZipSubFile: seek by skipping done, pos now {}'
         #       .format(self.pos))
@@ -417,8 +421,7 @@ class XmlParser(object):
         if match:
             self._is_single_xml = True
             return True
-        if not match:
-            raise BadOOXML(self.filename, 'is no zip and has no prog_id')
+        raise BadOOXML(self.filename, 'is no zip and has no prog_id')
  
     def iter_files(self, args=None):
         """ Find files in zip or just give single xml file """
@@ -433,17 +436,14 @@ class XmlParser(object):
             subfiles = None
             try:
                 zipper = ZipFile(self.filename)
-                try:
-                    _ = zipper.getinfo(FILE_CONTENT_TYPES)
-                except KeyError:
-                    raise BadOOXML(self.filename,
-                                   'No content type information')
                 if not args:
                     subfiles = zipper.namelist()
                 elif isstr(args):
                     subfiles = [args, ]
                 else:
-                    subfiles = tuple(args)   # make a copy in case orig changes
+                    # make a copy in case original args are modified
+                    # Not sure whether this really is needed...
+                    subfiles = tuple(arg for arg in args)
  
                 for subfile in subfiles:
                     with zipper.open(subfile, 'r') as handle:
@@ -451,10 +451,12 @@ class XmlParser(object):
                 if not args:
                     self.did_iter_all = True
             except KeyError as orig_err:
+                # Note: do not change text of this message without adjusting
+                #       conditions in except handlers
                 raise BadOOXML(self.filename,
                                'invalid subfile: ' + str(orig_err))
             except BadZipfile:
-                raise BadOOXML(self.filename, 'neither zip nor xml')
+                raise BadOOXML(self.filename, 'not in zip format')
             finally:
                 if zipper:
                     zipper.close()
@@ -503,7 +505,7 @@ class XmlParser(object):
                     if event == 'start':
                         if elem.tag in want_tags:
                             logger.debug('remember start of tag {0} at {1}'
-                                          .format(elem.tag, depth))
+                                         .format(elem.tag, depth))
                             inside_tags.append((elem.tag, depth))
                         depth += 1
                         continue
@@ -519,18 +521,18 @@ class XmlParser(object):
                                 inside_tags.pop()
                             else:
                                 logger.error('found end for wanted tag {0} '
-                                              'but last start tag {1} does not'
-                                              ' match'.format(curr_tag,
-                                                              inside_tags[-1]))
+                                             'but last start tag {1} does not'
+                                             ' match'.format(curr_tag,
+                                                             inside_tags[-1]))
                                 # try to recover: close all deeper tags
                                 while inside_tags and \
                                         inside_tags[-1][1] >= depth:
                                     logger.debug('recover: pop {0}'
-                                                  .format(inside_tags[-1]))
+                                                 .format(inside_tags[-1]))
                                     inside_tags.pop()
                         except IndexError:    # no inside_tag[-1]
                             logger.error('found end of {0} at depth {1} but '
-                                          'no start event')
+                                         'no start event')
                     # yield element
                     if is_wanted or not want_tags:
                         yield subfile, elem, depth
@@ -544,7 +546,7 @@ class XmlParser(object):
             except ET.ParseError as err:
                 self.subfiles_no_xml.add(subfile)
                 if subfile is None:    # this is no zip subfile but single xml
-                    raise BadOOXML(self.filename, 'is neither zip nor xml')
+                    raise BadOOXML(self.filename, 'content is not valid XML')
                 elif subfile.endswith('.xml'):
                     log = logger.warning
                 else:
@@ -568,21 +570,30 @@ class XmlParser(object):
  
         defaults = []
         files = []
-        for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES):
-            if elem.tag.endswith('Default'):
-                extension = elem.attrib['Extension']
-                if extension.startswith('.'):
-                    extension = extension[1:]
-                defaults.append((extension, elem.attrib['ContentType']))
-                logger.debug('found content type for extension {0[0]}: {0[1]}'
-                              .format(defaults[-1]))
-            elif elem.tag.endswith('Override'):
-                subfile = elem.attrib['PartName']
-                if subfile.startswith('/'):
-                    subfile = subfile[1:]
-                files.append((subfile, elem.attrib['ContentType']))
-                logger.debug('found content type for subfile {0[0]}: {0[1]}'
-                              .format(files[-1]))
+        try:
+            for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES):
+                if elem.tag.endswith('Default'):
+                    extension = elem.attrib['Extension']
+                    if extension.startswith('.'):
+                        extension = extension[1:]
+                    defaults.append((extension, elem.attrib['ContentType']))
+                    logger.debug('found content type for extension {0[0]}: '
+                                 '{0[1]}'.format(defaults[-1]))
+                elif elem.tag.endswith('Override'):
+                    subfile = elem.attrib['PartName']
+                    if subfile.startswith('/'):
+                        subfile = subfile[1:]
+                    files.append((subfile, elem.attrib['ContentType']))
+                    logger.debug('found content type for subfile {0[0]}: '
+                                 '{0[1]}'.format(files[-1]))
+        except BadOOXML as oo_err:
+            if oo_err.more_info.startswith('invalid subfile') and \
+                    FILE_CONTENT_TYPES in oo_err.more_info:
+                # no FILE_CONTENT_TYPES in zip, so probably no ms office xml.
+                # Maybe OpenDocument format? In any case, try to analyze.
+                pass
+            else:
+                raise
         return dict(files), dict(defaults)
  
     def iter_non_xml(self):
@@ -599,7 +610,7 @@ class XmlParser(object):
         """
         if not self.did_iter_all:
             logger.warning('Did not iterate through complete file. '
-                            'Should run iter_xml() without args, first.')
+                           'Should run iter_xml() without args, first.')
         if not self.subfiles_no_xml:
             return
  
@@ -631,7 +642,7 @@ def test():
  
     see module doc for more info
     """
-    log_helper.enable_logging(False, logger.DEBUG)
+    log_helper.enable_logging(False, 'debug')
     if len(sys.argv) != 2:
         print(u'To test this code, give me a single file as arg')
         return 2
@@ -41,8 +41,10 @@ SAMPLES += tuple(
      'ab8c65e4c0fc51739aa66ca5888265b4')
     for extn in ('xls', 'xlsx', 'xlsb', 'xlsm', 'xla', 'xlam', 'xlt', 'xltm',
                  'xltx', 'ppt', 'pptx', 'pptm', 'pps', 'ppsx', 'ppsm', 'pot',
-                 'potx', 'potm')
+                 'potx', 'potm', 'ods', 'odp')
 )
+SAMPLES += (('embedded-simple-2007.odt', 'simple-text-file.txt',
+     'bd5c063a5a43f67b3c50dc7b0f1195af'), )
  
  
 def calc_md5(filename):
@@ -33,6 +33,8 @@ class TestOOXML(unittest.TestCase):
             pptx=ooxml.DOCTYPE_POWERPOINT, pptm=ooxml.DOCTYPE_POWERPOINT,
             ppsx=ooxml.DOCTYPE_POWERPOINT, ppsm=ooxml.DOCTYPE_POWERPOINT,
             potx=ooxml.DOCTYPE_POWERPOINT, potm=ooxml.DOCTYPE_POWERPOINT,
+            ods=ooxml.DOCTYPE_NONE, odt=ooxml.DOCTYPE_NONE,
+            odp=ooxml.DOCTYPE_NONE,
         )
  
         # files that are neither OLE nor xml: