Commit 70c8a2165618b6308b071bc90990621709dabcf9

Authored by Christian Herdtweck
1 parent f028496d

ooxml: do not require [Content_Types].xml in zip files

[Content_Types].xml has to be present in Office OpenXML files, but it is
missing from other zip-based formats, e.g. OpenOffice files (odt, ...).

Such files can still be analyzed without it.
Showing 1 changed file with 42 additions and 29 deletions
oletools/ooxml.py
@@ -144,15 +144,22 @@ def get_type(filename): @@ -144,15 +144,22 @@ def get_type(filename):
144 is_doc = False 144 is_doc = False
145 is_xls = False 145 is_xls = False
146 is_ppt = False 146 is_ppt = False
147 - for _, elem, _ in parser.iter_xml(FILE_CONTENT_TYPES):  
148 - logger.debug(u' ' + debug_str(elem))  
149 - try:  
150 - content_type = elem.attrib['ContentType']  
151 - except KeyError: # ContentType not an attr  
152 - continue  
153 - is_xls |= content_type.startswith(CONTENT_TYPES_EXCEL)  
154 - is_doc |= content_type.startswith(CONTENT_TYPES_WORD)  
155 - is_ppt |= content_type.startswith(CONTENT_TYPES_PPT) 147 + try:
  148 + for _, elem, _ in parser.iter_xml(FILE_CONTENT_TYPES):
  149 + logger.debug(u' ' + debug_str(elem))
  150 + try:
  151 + content_type = elem.attrib['ContentType']
  152 + except KeyError: # ContentType not an attr
  153 + continue
  154 + is_xls |= content_type.startswith(CONTENT_TYPES_EXCEL)
  155 + is_doc |= content_type.startswith(CONTENT_TYPES_WORD)
  156 + is_ppt |= content_type.startswith(CONTENT_TYPES_PPT)
  157 + except BadOOXML as oo_err:
  158 + if oo_err.more_info.startswith('invalid subfile') and \
  159 + FILE_CONTENT_TYPES in oo_err.more_info:
  160 + # no FILE_CONTENT_TYPES in zip, so probably no ms office xml.
  161 + return DOCTYPE_NONE
  162 + raise
156 163
157 if is_doc and not is_xls and not is_ppt: 164 if is_doc and not is_xls and not is_ppt:
158 return DOCTYPE_WORD 165 return DOCTYPE_WORD
@@ -433,11 +440,6 @@ class XmlParser(object): @@ -433,11 +440,6 @@ class XmlParser(object):
433 subfiles = None 440 subfiles = None
434 try: 441 try:
435 zipper = ZipFile(self.filename) 442 zipper = ZipFile(self.filename)
436 - try:  
437 - _ = zipper.getinfo(FILE_CONTENT_TYPES)  
438 - except KeyError:  
439 - raise BadOOXML(self.filename,  
440 - 'No content type information')  
441 if not args: 443 if not args:
442 subfiles = zipper.namelist() 444 subfiles = zipper.namelist()
443 elif isstr(args): 445 elif isstr(args):
@@ -451,6 +453,8 @@ class XmlParser(object): @@ -451,6 +453,8 @@ class XmlParser(object):
451 if not args: 453 if not args:
452 self.did_iter_all = True 454 self.did_iter_all = True
453 except KeyError as orig_err: 455 except KeyError as orig_err:
  456 + # Note: do not change text of this message without adjusting
  457 + # conditions in except handlers
454 raise BadOOXML(self.filename, 458 raise BadOOXML(self.filename,
455 'invalid subfile: ' + str(orig_err)) 459 'invalid subfile: ' + str(orig_err))
456 except BadZipfile: 460 except BadZipfile:
@@ -568,21 +572,30 @@ class XmlParser(object): @@ -568,21 +572,30 @@ class XmlParser(object):
568 572
569 defaults = [] 573 defaults = []
570 files = [] 574 files = []
571 - for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES):  
572 - if elem.tag.endswith('Default'):  
573 - extension = elem.attrib['Extension']  
574 - if extension.startswith('.'):  
575 - extension = extension[1:]  
576 - defaults.append((extension, elem.attrib['ContentType']))  
577 - logger.debug('found content type for extension {0[0]}: {0[1]}'  
578 - .format(defaults[-1]))  
579 - elif elem.tag.endswith('Override'):  
580 - subfile = elem.attrib['PartName']  
581 - if subfile.startswith('/'):  
582 - subfile = subfile[1:]  
583 - files.append((subfile, elem.attrib['ContentType']))  
584 - logger.debug('found content type for subfile {0[0]}: {0[1]}'  
585 - .format(files[-1])) 575 + try:
  576 + for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES):
  577 + if elem.tag.endswith('Default'):
  578 + extension = elem.attrib['Extension']
  579 + if extension.startswith('.'):
  580 + extension = extension[1:]
  581 + defaults.append((extension, elem.attrib['ContentType']))
  582 + logger.debug('found content type for extension {0[0]}: {0[1]}'
  583 + .format(defaults[-1]))
  584 + elif elem.tag.endswith('Override'):
  585 + subfile = elem.attrib['PartName']
  586 + if subfile.startswith('/'):
  587 + subfile = subfile[1:]
  588 + files.append((subfile, elem.attrib['ContentType']))
  589 + logger.debug('found content type for subfile {0[0]}: {0[1]}'
  590 + .format(files[-1]))
  591 + except BadOOXML as oo_err:
  592 + if oo_err.more_info.startswith('invalid subfile') and \
  593 + FILE_CONTENT_TYPES in oo_err.more_info:
  594 + # no FILE_CONTENT_TYPES in zip, so probably no ms office xml.
  595 + # Maybe OpenDocument format? In any case, try to analyze.
  596 + pass
  597 + else:
  598 + raise
586 return dict(files), dict(defaults) 599 return dict(files), dict(defaults)
587 600
588 def iter_non_xml(self): 601 def iter_non_xml(self):