Commit 70c8a2165618b6308b071bc90990621709dabcf9

Authored by Christian Herdtweck
1 parent f028496d

ooxml: do not require [Content_Types].xml in zip files

[Content_Types].xml must be present in Office Open XML files, but it is
missing from other zip-based formats, e.g. OpenOffice files (odt, ...).

Such files can still be analyzed without it.
Showing 1 changed file with 42 additions and 29 deletions
oletools/ooxml.py
... ... @@ -144,15 +144,22 @@ def get_type(filename):
144 144 is_doc = False
145 145 is_xls = False
146 146 is_ppt = False
147   - for _, elem, _ in parser.iter_xml(FILE_CONTENT_TYPES):
148   - logger.debug(u' ' + debug_str(elem))
149   - try:
150   - content_type = elem.attrib['ContentType']
151   - except KeyError: # ContentType not an attr
152   - continue
153   - is_xls |= content_type.startswith(CONTENT_TYPES_EXCEL)
154   - is_doc |= content_type.startswith(CONTENT_TYPES_WORD)
155   - is_ppt |= content_type.startswith(CONTENT_TYPES_PPT)
  147 + try:
  148 + for _, elem, _ in parser.iter_xml(FILE_CONTENT_TYPES):
  149 + logger.debug(u' ' + debug_str(elem))
  150 + try:
  151 + content_type = elem.attrib['ContentType']
  152 + except KeyError: # ContentType not an attr
  153 + continue
  154 + is_xls |= content_type.startswith(CONTENT_TYPES_EXCEL)
  155 + is_doc |= content_type.startswith(CONTENT_TYPES_WORD)
  156 + is_ppt |= content_type.startswith(CONTENT_TYPES_PPT)
  157 + except BadOOXML as oo_err:
  158 + if oo_err.more_info.startswith('invalid subfile') and \
  159 + FILE_CONTENT_TYPES in oo_err.more_info:
  160 + # no FILE_CONTENT_TYPES in zip, so probably no ms office xml.
  161 + return DOCTYPE_NONE
  162 + raise
156 163  
157 164 if is_doc and not is_xls and not is_ppt:
158 165 return DOCTYPE_WORD
... ... @@ -433,11 +440,6 @@ class XmlParser(object):
433 440 subfiles = None
434 441 try:
435 442 zipper = ZipFile(self.filename)
436   - try:
437   - _ = zipper.getinfo(FILE_CONTENT_TYPES)
438   - except KeyError:
439   - raise BadOOXML(self.filename,
440   - 'No content type information')
441 443 if not args:
442 444 subfiles = zipper.namelist()
443 445 elif isstr(args):
... ... @@ -451,6 +453,8 @@ class XmlParser(object):
451 453 if not args:
452 454 self.did_iter_all = True
453 455 except KeyError as orig_err:
  456 + # Note: do not change text of this message without adjusting
  457 + # conditions in except handlers
454 458 raise BadOOXML(self.filename,
455 459 'invalid subfile: ' + str(orig_err))
456 460 except BadZipfile:
... ... @@ -568,21 +572,30 @@ class XmlParser(object):
568 572  
569 573 defaults = []
570 574 files = []
571   - for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES):
572   - if elem.tag.endswith('Default'):
573   - extension = elem.attrib['Extension']
574   - if extension.startswith('.'):
575   - extension = extension[1:]
576   - defaults.append((extension, elem.attrib['ContentType']))
577   - logger.debug('found content type for extension {0[0]}: {0[1]}'
578   - .format(defaults[-1]))
579   - elif elem.tag.endswith('Override'):
580   - subfile = elem.attrib['PartName']
581   - if subfile.startswith('/'):
582   - subfile = subfile[1:]
583   - files.append((subfile, elem.attrib['ContentType']))
584   - logger.debug('found content type for subfile {0[0]}: {0[1]}'
585   - .format(files[-1]))
  575 + try:
  576 + for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES):
  577 + if elem.tag.endswith('Default'):
  578 + extension = elem.attrib['Extension']
  579 + if extension.startswith('.'):
  580 + extension = extension[1:]
  581 + defaults.append((extension, elem.attrib['ContentType']))
  582 + logger.debug('found content type for extension {0[0]}: {0[1]}'
  583 + .format(defaults[-1]))
  584 + elif elem.tag.endswith('Override'):
  585 + subfile = elem.attrib['PartName']
  586 + if subfile.startswith('/'):
  587 + subfile = subfile[1:]
  588 + files.append((subfile, elem.attrib['ContentType']))
  589 + logger.debug('found content type for subfile {0[0]}: {0[1]}'
  590 + .format(files[-1]))
  591 + except BadOOXML as oo_err:
  592 + if oo_err.more_info.startswith('invalid subfile') and \
  593 + FILE_CONTENT_TYPES in oo_err.more_info:
  594 + # no FILE_CONTENT_TYPES in zip, so probably no ms office xml.
  595 + # Maybe OpenDocument format? In any case, try to analyze.
  596 + pass
  597 + else:
  598 + raise
586 599 return dict(files), dict(defaults)
587 600  
588 601 def iter_non_xml(self):
... ...