Commit 70c8a2165618b6308b071bc90990621709dabcf9
1 parent
f028496d
ooxml: do not require [Content_Types].xml in zip files
[Content_Types].xml has to be present in Office Open XML files, but it is missing from other zip-based formats, e.g. OpenOffice/OpenDocument files (odt, ...). Such files can still be analyzed without it.
Showing 1 changed file with 42 additions and 29 deletions
oletools/ooxml.py
| @@ -144,15 +144,22 @@ def get_type(filename): | @@ -144,15 +144,22 @@ def get_type(filename): | ||
| 144 | is_doc = False | 144 | is_doc = False |
| 145 | is_xls = False | 145 | is_xls = False |
| 146 | is_ppt = False | 146 | is_ppt = False |
| 147 | - for _, elem, _ in parser.iter_xml(FILE_CONTENT_TYPES): | ||
| 148 | - logger.debug(u' ' + debug_str(elem)) | ||
| 149 | - try: | ||
| 150 | - content_type = elem.attrib['ContentType'] | ||
| 151 | - except KeyError: # ContentType not an attr | ||
| 152 | - continue | ||
| 153 | - is_xls |= content_type.startswith(CONTENT_TYPES_EXCEL) | ||
| 154 | - is_doc |= content_type.startswith(CONTENT_TYPES_WORD) | ||
| 155 | - is_ppt |= content_type.startswith(CONTENT_TYPES_PPT) | 147 | + try: |
| 148 | + for _, elem, _ in parser.iter_xml(FILE_CONTENT_TYPES): | ||
| 149 | + logger.debug(u' ' + debug_str(elem)) | ||
| 150 | + try: | ||
| 151 | + content_type = elem.attrib['ContentType'] | ||
| 152 | + except KeyError: # ContentType not an attr | ||
| 153 | + continue | ||
| 154 | + is_xls |= content_type.startswith(CONTENT_TYPES_EXCEL) | ||
| 155 | + is_doc |= content_type.startswith(CONTENT_TYPES_WORD) | ||
| 156 | + is_ppt |= content_type.startswith(CONTENT_TYPES_PPT) | ||
| 157 | + except BadOOXML as oo_err: | ||
| 158 | + if oo_err.more_info.startswith('invalid subfile') and \ | ||
| 159 | + FILE_CONTENT_TYPES in oo_err.more_info: | ||
| 160 | + # no FILE_CONTENT_TYPES in zip, so probably no ms office xml. | ||
| 161 | + return DOCTYPE_NONE | ||
| 162 | + raise | ||
| 156 | 163 | ||
| 157 | if is_doc and not is_xls and not is_ppt: | 164 | if is_doc and not is_xls and not is_ppt: |
| 158 | return DOCTYPE_WORD | 165 | return DOCTYPE_WORD |
| @@ -433,11 +440,6 @@ class XmlParser(object): | @@ -433,11 +440,6 @@ class XmlParser(object): | ||
| 433 | subfiles = None | 440 | subfiles = None |
| 434 | try: | 441 | try: |
| 435 | zipper = ZipFile(self.filename) | 442 | zipper = ZipFile(self.filename) |
| 436 | - try: | ||
| 437 | - _ = zipper.getinfo(FILE_CONTENT_TYPES) | ||
| 438 | - except KeyError: | ||
| 439 | - raise BadOOXML(self.filename, | ||
| 440 | - 'No content type information') | ||
| 441 | if not args: | 443 | if not args: |
| 442 | subfiles = zipper.namelist() | 444 | subfiles = zipper.namelist() |
| 443 | elif isstr(args): | 445 | elif isstr(args): |
| @@ -451,6 +453,8 @@ class XmlParser(object): | @@ -451,6 +453,8 @@ class XmlParser(object): | ||
| 451 | if not args: | 453 | if not args: |
| 452 | self.did_iter_all = True | 454 | self.did_iter_all = True |
| 453 | except KeyError as orig_err: | 455 | except KeyError as orig_err: |
| 456 | + # Note: do not change text of this message without adjusting | ||
| 457 | + # conditions in except handlers | ||
| 454 | raise BadOOXML(self.filename, | 458 | raise BadOOXML(self.filename, |
| 455 | 'invalid subfile: ' + str(orig_err)) | 459 | 'invalid subfile: ' + str(orig_err)) |
| 456 | except BadZipfile: | 460 | except BadZipfile: |
| @@ -568,21 +572,30 @@ class XmlParser(object): | @@ -568,21 +572,30 @@ class XmlParser(object): | ||
| 568 | 572 | ||
| 569 | defaults = [] | 573 | defaults = [] |
| 570 | files = [] | 574 | files = [] |
| 571 | - for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES): | ||
| 572 | - if elem.tag.endswith('Default'): | ||
| 573 | - extension = elem.attrib['Extension'] | ||
| 574 | - if extension.startswith('.'): | ||
| 575 | - extension = extension[1:] | ||
| 576 | - defaults.append((extension, elem.attrib['ContentType'])) | ||
| 577 | - logger.debug('found content type for extension {0[0]}: {0[1]}' | ||
| 578 | - .format(defaults[-1])) | ||
| 579 | - elif elem.tag.endswith('Override'): | ||
| 580 | - subfile = elem.attrib['PartName'] | ||
| 581 | - if subfile.startswith('/'): | ||
| 582 | - subfile = subfile[1:] | ||
| 583 | - files.append((subfile, elem.attrib['ContentType'])) | ||
| 584 | - logger.debug('found content type for subfile {0[0]}: {0[1]}' | ||
| 585 | - .format(files[-1])) | 575 | + try: |
| 576 | + for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES): | ||
| 577 | + if elem.tag.endswith('Default'): | ||
| 578 | + extension = elem.attrib['Extension'] | ||
| 579 | + if extension.startswith('.'): | ||
| 580 | + extension = extension[1:] | ||
| 581 | + defaults.append((extension, elem.attrib['ContentType'])) | ||
| 582 | + logger.debug('found content type for extension {0[0]}: {0[1]}' | ||
| 583 | + .format(defaults[-1])) | ||
| 584 | + elif elem.tag.endswith('Override'): | ||
| 585 | + subfile = elem.attrib['PartName'] | ||
| 586 | + if subfile.startswith('/'): | ||
| 587 | + subfile = subfile[1:] | ||
| 588 | + files.append((subfile, elem.attrib['ContentType'])) | ||
| 589 | + logger.debug('found content type for subfile {0[0]}: {0[1]}' | ||
| 590 | + .format(files[-1])) | ||
| 591 | + except BadOOXML as oo_err: | ||
| 592 | + if oo_err.more_info.startswith('invalid subfile') and \ | ||
| 593 | + FILE_CONTENT_TYPES in oo_err.more_info: | ||
| 594 | + # no FILE_CONTENT_TYPES in zip, so probably no ms office xml. | ||
| 595 | + # Maybe OpenDocument format? In any case, try to analyze. | ||
| 596 | + pass | ||
| 597 | + else: | ||
| 598 | + raise | ||
| 586 | return dict(files), dict(defaults) | 599 | return dict(files), dict(defaults) |
| 587 | 600 | ||
| 588 | def iter_non_xml(self): | 601 | def iter_non_xml(self): |