Commit 70c8a2165618b6308b071bc90990621709dabcf9
1 parent
f028496d
ooxml: do not require [Content_Types].xml in zip files
This file has to be present in Office OpenXML files but is missing from other zip-based formats, e.g. OpenOffice files (odt, ...). Such files can still be analyzed without it.
Showing
1 changed file
with
42 additions
and
29 deletions
oletools/ooxml.py
| ... | ... | @@ -144,15 +144,22 @@ def get_type(filename): |
| 144 | 144 | is_doc = False |
| 145 | 145 | is_xls = False |
| 146 | 146 | is_ppt = False |
| 147 | - for _, elem, _ in parser.iter_xml(FILE_CONTENT_TYPES): | |
| 148 | - logger.debug(u' ' + debug_str(elem)) | |
| 149 | - try: | |
| 150 | - content_type = elem.attrib['ContentType'] | |
| 151 | - except KeyError: # ContentType not an attr | |
| 152 | - continue | |
| 153 | - is_xls |= content_type.startswith(CONTENT_TYPES_EXCEL) | |
| 154 | - is_doc |= content_type.startswith(CONTENT_TYPES_WORD) | |
| 155 | - is_ppt |= content_type.startswith(CONTENT_TYPES_PPT) | |
| 147 | + try: | |
| 148 | + for _, elem, _ in parser.iter_xml(FILE_CONTENT_TYPES): | |
| 149 | + logger.debug(u' ' + debug_str(elem)) | |
| 150 | + try: | |
| 151 | + content_type = elem.attrib['ContentType'] | |
| 152 | + except KeyError: # ContentType not an attr | |
| 153 | + continue | |
| 154 | + is_xls |= content_type.startswith(CONTENT_TYPES_EXCEL) | |
| 155 | + is_doc |= content_type.startswith(CONTENT_TYPES_WORD) | |
| 156 | + is_ppt |= content_type.startswith(CONTENT_TYPES_PPT) | |
| 157 | + except BadOOXML as oo_err: | |
| 158 | + if oo_err.more_info.startswith('invalid subfile') and \ | |
| 159 | + FILE_CONTENT_TYPES in oo_err.more_info: | |
| 160 | + # no FILE_CONTENT_TYPES in zip, so probably no ms office xml. | |
| 161 | + return DOCTYPE_NONE | |
| 162 | + raise | |
| 156 | 163 | |
| 157 | 164 | if is_doc and not is_xls and not is_ppt: |
| 158 | 165 | return DOCTYPE_WORD |
| ... | ... | @@ -433,11 +440,6 @@ class XmlParser(object): |
| 433 | 440 | subfiles = None |
| 434 | 441 | try: |
| 435 | 442 | zipper = ZipFile(self.filename) |
| 436 | - try: | |
| 437 | - _ = zipper.getinfo(FILE_CONTENT_TYPES) | |
| 438 | - except KeyError: | |
| 439 | - raise BadOOXML(self.filename, | |
| 440 | - 'No content type information') | |
| 441 | 443 | if not args: |
| 442 | 444 | subfiles = zipper.namelist() |
| 443 | 445 | elif isstr(args): |
| ... | ... | @@ -451,6 +453,8 @@ class XmlParser(object): |
| 451 | 453 | if not args: |
| 452 | 454 | self.did_iter_all = True |
| 453 | 455 | except KeyError as orig_err: |
| 456 | + # Note: do not change text of this message without adjusting | |
| 457 | + # conditions in except handlers | |
| 454 | 458 | raise BadOOXML(self.filename, |
| 455 | 459 | 'invalid subfile: ' + str(orig_err)) |
| 456 | 460 | except BadZipfile: |
| ... | ... | @@ -568,21 +572,30 @@ class XmlParser(object): |
| 568 | 572 | |
| 569 | 573 | defaults = [] |
| 570 | 574 | files = [] |
| 571 | - for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES): | |
| 572 | - if elem.tag.endswith('Default'): | |
| 573 | - extension = elem.attrib['Extension'] | |
| 574 | - if extension.startswith('.'): | |
| 575 | - extension = extension[1:] | |
| 576 | - defaults.append((extension, elem.attrib['ContentType'])) | |
| 577 | - logger.debug('found content type for extension {0[0]}: {0[1]}' | |
| 578 | - .format(defaults[-1])) | |
| 579 | - elif elem.tag.endswith('Override'): | |
| 580 | - subfile = elem.attrib['PartName'] | |
| 581 | - if subfile.startswith('/'): | |
| 582 | - subfile = subfile[1:] | |
| 583 | - files.append((subfile, elem.attrib['ContentType'])) | |
| 584 | - logger.debug('found content type for subfile {0[0]}: {0[1]}' | |
| 585 | - .format(files[-1])) | |
| 575 | + try: | |
| 576 | + for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES): | |
| 577 | + if elem.tag.endswith('Default'): | |
| 578 | + extension = elem.attrib['Extension'] | |
| 579 | + if extension.startswith('.'): | |
| 580 | + extension = extension[1:] | |
| 581 | + defaults.append((extension, elem.attrib['ContentType'])) | |
| 582 | + logger.debug('found content type for extension {0[0]}: {0[1]}' | |
| 583 | + .format(defaults[-1])) | |
| 584 | + elif elem.tag.endswith('Override'): | |
| 585 | + subfile = elem.attrib['PartName'] | |
| 586 | + if subfile.startswith('/'): | |
| 587 | + subfile = subfile[1:] | |
| 588 | + files.append((subfile, elem.attrib['ContentType'])) | |
| 589 | + logger.debug('found content type for subfile {0[0]}: {0[1]}' | |
| 590 | + .format(files[-1])) | |
| 591 | + except BadOOXML as oo_err: | |
| 592 | + if oo_err.more_info.startswith('invalid subfile') and \ | |
| 593 | + FILE_CONTENT_TYPES in oo_err.more_info: | |
| 594 | + # no FILE_CONTENT_TYPES in zip, so probably no ms office xml. | |
| 595 | + # Maybe OpenDocument format? In any case, try to analyze. | |
| 596 | + pass | |
| 597 | + else: | |
| 598 | + raise | |
| 586 | 599 | return dict(files), dict(defaults) |
| 587 | 600 | |
| 588 | 601 | def iter_non_xml(self): | ... | ... |