Commit 8fac8b0ca497fbbdc5c0422fea4306792a6dba58
Committed by
GitHub
Merge pull request #412 from christian-intra2net/ooxml-accept-OpenOffice
Avoid error in ooxml for non-office zip files
Showing 7 changed files with 77 additions and 62 deletions
oletools/oleobj.py
| ... | ... | @@ -750,13 +750,13 @@ def process_file(filename, data, output_dir=None): |
| 750 | 750 | |
| 751 | 751 | xml_parser = None |
| 752 | 752 | if is_zipfile(filename): |
| 753 | - log.info('file is a OOXML file, looking for relationships with external links') | |
| 753 | + log.info('file could be an OOXML file, looking for relationships with ' | |
| 754 | + 'external links') | |
| 754 | 755 | xml_parser = XmlParser(filename) |
| 755 | 756 | for relationship, target in find_external_relationships(xml_parser): |
| 756 | 757 | did_dump = True |
| 757 | 758 | print("Found relationship '%s' with external link %s" % (relationship, target)) |
| 758 | 759 | |
| 759 | - | |
| 760 | 760 | # look for ole files inside file (e.g. unzip docx) |
| 761 | 761 | # have to finish work on every ole stream inside iteration, since handles |
| 762 | 762 | # are closed in find_ole |
| ... | ... | @@ -765,9 +765,9 @@ def process_file(filename, data, output_dir=None): |
| 765 | 765 | continue |
| 766 | 766 | |
| 767 | 767 | for path_parts in ole.listdir(): |
| 768 | + stream_path = '/'.join(path_parts) | |
| 769 | + log.debug('Checking stream %r', stream_path) | |
| 768 | 770 | if path_parts[-1] == '\x01Ole10Native': |
| 769 | - stream_path = '/'.join(path_parts) | |
| 770 | - log.debug('Checking stream %r', stream_path) | |
| 771 | 771 | stream = None |
| 772 | 772 | try: |
| 773 | 773 | stream = ole.openstream(path_parts) | ... | ... |
oletools/ooxml.py
| ... | ... | @@ -16,11 +16,11 @@ TODO: "xml2003" == "flatopc"? |
| 16 | 16 | """ |
| 17 | 17 | |
| 18 | 18 | import sys |
| 19 | -from oletools.common.log_helper import log_helper | |
| 20 | 19 | from zipfile import ZipFile, BadZipfile, is_zipfile |
| 21 | 20 | from os.path import splitext |
| 22 | 21 | import io |
| 23 | 22 | import re |
| 23 | +from oletools.common.log_helper import log_helper | |
| 24 | 24 | |
| 25 | 25 | # import lxml or ElementTree for XML parsing: |
| 26 | 26 | try: |
| ... | ... | @@ -107,16 +107,14 @@ def debug_str(elem): |
| 107 | 107 | text = u', '.join(parts) |
| 108 | 108 | if len(text) > 150: |
| 109 | 109 | return text[:147] + u'...]' |
| 110 | - else: | |
| 111 | - return text + u']' | |
| 110 | + return text + u']' | |
| 112 | 111 | |
| 113 | 112 | |
| 114 | 113 | def isstr(some_var): |
| 115 | 114 | """ version-independent test for isinstance(some_var, (str, unicode)) """ |
| 116 | 115 | if sys.version_info.major == 2: |
| 117 | 116 | return isinstance(some_var, basestring) # true for str and unicode |
| 118 | - else: | |
| 119 | - return isinstance(some_var, str) # there is no unicode | |
| 117 | + return isinstance(some_var, str) # there is no unicode | |
| 120 | 118 | |
| 121 | 119 | |
| 122 | 120 | ############################################################################### |
| ... | ... | @@ -136,23 +134,29 @@ def get_type(filename): |
| 136 | 134 | prog_id = match.groups()[0] |
| 137 | 135 | if prog_id == WORD_XML_PROG_ID: |
| 138 | 136 | return DOCTYPE_WORD_XML |
| 139 | - elif prog_id == EXCEL_XML_PROG_ID: | |
| 137 | + if prog_id == EXCEL_XML_PROG_ID: | |
| 140 | 138 | return DOCTYPE_EXCEL_XML |
| 141 | - else: | |
| 142 | - return DOCTYPE_NONE | |
| 139 | + return DOCTYPE_NONE | |
| 143 | 140 | |
| 144 | 141 | is_doc = False |
| 145 | 142 | is_xls = False |
| 146 | 143 | is_ppt = False |
| 147 | - for _, elem, _ in parser.iter_xml(FILE_CONTENT_TYPES): | |
| 148 | - logger.debug(u' ' + debug_str(elem)) | |
| 149 | - try: | |
| 150 | - content_type = elem.attrib['ContentType'] | |
| 151 | - except KeyError: # ContentType not an attr | |
| 152 | - continue | |
| 153 | - is_xls |= content_type.startswith(CONTENT_TYPES_EXCEL) | |
| 154 | - is_doc |= content_type.startswith(CONTENT_TYPES_WORD) | |
| 155 | - is_ppt |= content_type.startswith(CONTENT_TYPES_PPT) | |
| 144 | + try: | |
| 145 | + for _, elem, _ in parser.iter_xml(FILE_CONTENT_TYPES): | |
| 146 | + logger.debug(u' ' + debug_str(elem)) | |
| 147 | + try: | |
| 148 | + content_type = elem.attrib['ContentType'] | |
| 149 | + except KeyError: # ContentType not an attr | |
| 150 | + continue | |
| 151 | + is_xls |= content_type.startswith(CONTENT_TYPES_EXCEL) | |
| 152 | + is_doc |= content_type.startswith(CONTENT_TYPES_WORD) | |
| 153 | + is_ppt |= content_type.startswith(CONTENT_TYPES_PPT) | |
| 154 | + except BadOOXML as oo_err: | |
| 155 | + if oo_err.more_info.startswith('invalid subfile') and \ | |
| 156 | + FILE_CONTENT_TYPES in oo_err.more_info: | |
| 157 | + # no FILE_CONTENT_TYPES in zip, so probably no ms office xml. | |
| 158 | + return DOCTYPE_NONE | |
| 159 | + raise | |
| 156 | 160 | |
| 157 | 161 | if is_doc and not is_xls and not is_ppt: |
| 158 | 162 | return DOCTYPE_WORD |
| ... | ... | @@ -162,9 +166,8 @@ def get_type(filename): |
| 162 | 166 | return DOCTYPE_POWERPOINT |
| 163 | 167 | if not is_doc and not is_xls and not is_ppt: |
| 164 | 168 | return DOCTYPE_NONE |
| 165 | - else: | |
| 166 | - logger.warning('Encountered contradictory content types') | |
| 167 | - return DOCTYPE_MIXED | |
| 169 | + logger.warning('Encountered contradictory content types') | |
| 170 | + return DOCTYPE_MIXED | |
| 168 | 171 | |
| 169 | 172 | |
| 170 | 173 | def is_ooxml(filename): |
| ... | ... | @@ -177,6 +180,7 @@ def is_ooxml(filename): |
| 177 | 180 | return False |
| 178 | 181 | if doctype == DOCTYPE_NONE: |
| 179 | 182 | return False |
| 183 | + return True | |
| 180 | 184 | |
| 181 | 185 | |
| 182 | 186 | ############################################################################### |
| ... | ... | @@ -216,6 +220,7 @@ class ZipSubFile(object): |
| 216 | 220 | See also (and maybe could some day merge with): |
| 217 | 221 | ppt_record_parser.IterStream; also: oleobj.FakeFile |
| 218 | 222 | """ |
| 223 | + CHUNK_SIZE = 4096 | |
| 219 | 224 | |
| 220 | 225 | def __init__(self, container, filename, mode='r', size=None): |
| 221 | 226 | """ remember all necessary vars but do not open yet """ |
| ... | ... | @@ -253,7 +258,7 @@ class ZipSubFile(object): |
| 253 | 258 | # print('ZipSubFile: opened; size={}'.format(self.size)) |
| 254 | 259 | return self |
| 255 | 260 | |
| 256 | - def write(self, *args, **kwargs): # pylint: disable=unused-argument,no-self-use | |
| 261 | + def write(self, *args, **kwargs): | |
| 257 | 262 | """ write is not allowed """ |
| 258 | 263 | raise IOError('writing not implemented') |
| 259 | 264 | |
| ... | ... | @@ -311,10 +316,9 @@ class ZipSubFile(object): |
| 311 | 316 | """ helper for seek: skip forward by given amount using read() """ |
| 312 | 317 | # print('ZipSubFile: seek by skipping {} bytes starting at {}' |
| 313 | 318 | # .format(self.pos, to_skip)) |
| 314 | - CHUNK_SIZE = 4096 | |
| 315 | - n_chunks, leftover = divmod(to_skip, CHUNK_SIZE) | |
| 319 | + n_chunks, leftover = divmod(to_skip, self.CHUNK_SIZE) | |
| 316 | 320 | for _ in range(n_chunks): |
| 317 | - self.read(CHUNK_SIZE) # just read and discard | |
| 321 | + self.read(self.CHUNK_SIZE) # just read and discard | |
| 318 | 322 | self.read(leftover) |
| 319 | 323 | # print('ZipSubFile: seek by skipping done, pos now {}' |
| 320 | 324 | # .format(self.pos)) |
| ... | ... | @@ -417,8 +421,7 @@ class XmlParser(object): |
| 417 | 421 | if match: |
| 418 | 422 | self._is_single_xml = True |
| 419 | 423 | return True |
| 420 | - if not match: | |
| 421 | - raise BadOOXML(self.filename, 'is no zip and has no prog_id') | |
| 424 | + raise BadOOXML(self.filename, 'is no zip and has no prog_id') | |
| 422 | 425 | |
| 423 | 426 | def iter_files(self, args=None): |
| 424 | 427 | """ Find files in zip or just give single xml file """ |
| ... | ... | @@ -433,17 +436,14 @@ class XmlParser(object): |
| 433 | 436 | subfiles = None |
| 434 | 437 | try: |
| 435 | 438 | zipper = ZipFile(self.filename) |
| 436 | - try: | |
| 437 | - _ = zipper.getinfo(FILE_CONTENT_TYPES) | |
| 438 | - except KeyError: | |
| 439 | - raise BadOOXML(self.filename, | |
| 440 | - 'No content type information') | |
| 441 | 439 | if not args: |
| 442 | 440 | subfiles = zipper.namelist() |
| 443 | 441 | elif isstr(args): |
| 444 | 442 | subfiles = [args, ] |
| 445 | 443 | else: |
| 446 | - subfiles = tuple(args) # make a copy in case orig changes | |
| 444 | + # make a copy in case original args are modified | |
| 445 | + # Not sure whether this really is needed... | |
| 446 | + subfiles = tuple(arg for arg in args) | |
| 447 | 447 | |
| 448 | 448 | for subfile in subfiles: |
| 449 | 449 | with zipper.open(subfile, 'r') as handle: |
| ... | ... | @@ -451,10 +451,12 @@ class XmlParser(object): |
| 451 | 451 | if not args: |
| 452 | 452 | self.did_iter_all = True |
| 453 | 453 | except KeyError as orig_err: |
| 454 | + # Note: do not change text of this message without adjusting | |
| 455 | + # conditions in except handlers | |
| 454 | 456 | raise BadOOXML(self.filename, |
| 455 | 457 | 'invalid subfile: ' + str(orig_err)) |
| 456 | 458 | except BadZipfile: |
| 457 | - raise BadOOXML(self.filename, 'neither zip nor xml') | |
| 459 | + raise BadOOXML(self.filename, 'not in zip format') | |
| 458 | 460 | finally: |
| 459 | 461 | if zipper: |
| 460 | 462 | zipper.close() |
| ... | ... | @@ -503,7 +505,7 @@ class XmlParser(object): |
| 503 | 505 | if event == 'start': |
| 504 | 506 | if elem.tag in want_tags: |
| 505 | 507 | logger.debug('remember start of tag {0} at {1}' |
| 506 | - .format(elem.tag, depth)) | |
| 508 | + .format(elem.tag, depth)) | |
| 507 | 509 | inside_tags.append((elem.tag, depth)) |
| 508 | 510 | depth += 1 |
| 509 | 511 | continue |
| ... | ... | @@ -519,18 +521,18 @@ class XmlParser(object): |
| 519 | 521 | inside_tags.pop() |
| 520 | 522 | else: |
| 521 | 523 | logger.error('found end for wanted tag {0} ' |
| 522 | - 'but last start tag {1} does not' | |
| 523 | - ' match'.format(curr_tag, | |
| 524 | - inside_tags[-1])) | |
| 524 | + 'but last start tag {1} does not' | |
| 525 | + ' match'.format(curr_tag, | |
| 526 | + inside_tags[-1])) | |
| 525 | 527 | # try to recover: close all deeper tags |
| 526 | 528 | while inside_tags and \ |
| 527 | 529 | inside_tags[-1][1] >= depth: |
| 528 | 530 | logger.debug('recover: pop {0}' |
| 529 | - .format(inside_tags[-1])) | |
| 531 | + .format(inside_tags[-1])) | |
| 530 | 532 | inside_tags.pop() |
| 531 | 533 | except IndexError: # no inside_tag[-1] |
| 532 | 534 | logger.error('found end of {0} at depth {1} but ' |
| 533 | - 'no start event') | |
| 535 | + 'no start event') | |
| 534 | 536 | # yield element |
| 535 | 537 | if is_wanted or not want_tags: |
| 536 | 538 | yield subfile, elem, depth |
| ... | ... | @@ -544,7 +546,7 @@ class XmlParser(object): |
| 544 | 546 | except ET.ParseError as err: |
| 545 | 547 | self.subfiles_no_xml.add(subfile) |
| 546 | 548 | if subfile is None: # this is no zip subfile but single xml |
| 547 | - raise BadOOXML(self.filename, 'is neither zip nor xml') | |
| 549 | + raise BadOOXML(self.filename, 'content is not valid XML') | |
| 548 | 550 | elif subfile.endswith('.xml'): |
| 549 | 551 | log = logger.warning |
| 550 | 552 | else: |
| ... | ... | @@ -568,21 +570,30 @@ class XmlParser(object): |
| 568 | 570 | |
| 569 | 571 | defaults = [] |
| 570 | 572 | files = [] |
| 571 | - for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES): | |
| 572 | - if elem.tag.endswith('Default'): | |
| 573 | - extension = elem.attrib['Extension'] | |
| 574 | - if extension.startswith('.'): | |
| 575 | - extension = extension[1:] | |
| 576 | - defaults.append((extension, elem.attrib['ContentType'])) | |
| 577 | - logger.debug('found content type for extension {0[0]}: {0[1]}' | |
| 578 | - .format(defaults[-1])) | |
| 579 | - elif elem.tag.endswith('Override'): | |
| 580 | - subfile = elem.attrib['PartName'] | |
| 581 | - if subfile.startswith('/'): | |
| 582 | - subfile = subfile[1:] | |
| 583 | - files.append((subfile, elem.attrib['ContentType'])) | |
| 584 | - logger.debug('found content type for subfile {0[0]}: {0[1]}' | |
| 585 | - .format(files[-1])) | |
| 573 | + try: | |
| 574 | + for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES): | |
| 575 | + if elem.tag.endswith('Default'): | |
| 576 | + extension = elem.attrib['Extension'] | |
| 577 | + if extension.startswith('.'): | |
| 578 | + extension = extension[1:] | |
| 579 | + defaults.append((extension, elem.attrib['ContentType'])) | |
| 580 | + logger.debug('found content type for extension {0[0]}: ' | |
| 581 | + '{0[1]}'.format(defaults[-1])) | |
| 582 | + elif elem.tag.endswith('Override'): | |
| 583 | + subfile = elem.attrib['PartName'] | |
| 584 | + if subfile.startswith('/'): | |
| 585 | + subfile = subfile[1:] | |
| 586 | + files.append((subfile, elem.attrib['ContentType'])) | |
| 587 | + logger.debug('found content type for subfile {0[0]}: ' | |
| 588 | + '{0[1]}'.format(files[-1])) | |
| 589 | + except BadOOXML as oo_err: | |
| 590 | + if oo_err.more_info.startswith('invalid subfile') and \ | |
| 591 | + FILE_CONTENT_TYPES in oo_err.more_info: | |
| 592 | + # no FILE_CONTENT_TYPES in zip, so probably no ms office xml. | |
| 593 | + # Maybe OpenDocument format? In any case, try to analyze. | |
| 594 | + pass | |
| 595 | + else: | |
| 596 | + raise | |
| 586 | 597 | return dict(files), dict(defaults) |
| 587 | 598 | |
| 588 | 599 | def iter_non_xml(self): |
| ... | ... | @@ -599,7 +610,7 @@ class XmlParser(object): |
| 599 | 610 | """ |
| 600 | 611 | if not self.did_iter_all: |
| 601 | 612 | logger.warning('Did not iterate through complete file. ' |
| 602 | - 'Should run iter_xml() without args, first.') | |
| 613 | + 'Should run iter_xml() without args, first.') | |
| 603 | 614 | if not self.subfiles_no_xml: |
| 604 | 615 | return |
| 605 | 616 | |
| ... | ... | @@ -631,7 +642,7 @@ def test(): |
| 631 | 642 | |
| 632 | 643 | see module doc for more info |
| 633 | 644 | """ |
| 634 | - log_helper.enable_logging(False, logger.DEBUG) | |
| 645 | + log_helper.enable_logging(False, 'debug') | |
| 635 | 646 | if len(sys.argv) != 2: |
| 636 | 647 | print(u'To test this code, give me a single file as arg') |
| 637 | 648 | return 2 | ... | ... |
tests/oleobj/test_basic.py
| ... | ... | @@ -41,8 +41,10 @@ SAMPLES += tuple( |
| 41 | 41 | 'ab8c65e4c0fc51739aa66ca5888265b4') |
| 42 | 42 | for extn in ('xls', 'xlsx', 'xlsb', 'xlsm', 'xla', 'xlam', 'xlt', 'xltm', |
| 43 | 43 | 'xltx', 'ppt', 'pptx', 'pptm', 'pps', 'ppsx', 'ppsm', 'pot', |
| 44 | - 'potx', 'potm') | |
| 44 | + 'potx', 'potm', 'ods', 'odp') | |
| 45 | 45 | ) |
| 46 | +SAMPLES += (('embedded-simple-2007.odt', 'simple-text-file.txt', | |
| 47 | + 'bd5c063a5a43f67b3c50dc7b0f1195af'), ) | |
| 46 | 48 | |
| 47 | 49 | |
| 48 | 50 | def calc_md5(filename): | ... | ... |
tests/ooxml/test_basic.py
| ... | ... | @@ -33,6 +33,8 @@ class TestOOXML(unittest.TestCase): |
| 33 | 33 | pptx=ooxml.DOCTYPE_POWERPOINT, pptm=ooxml.DOCTYPE_POWERPOINT, |
| 34 | 34 | ppsx=ooxml.DOCTYPE_POWERPOINT, ppsm=ooxml.DOCTYPE_POWERPOINT, |
| 35 | 35 | potx=ooxml.DOCTYPE_POWERPOINT, potm=ooxml.DOCTYPE_POWERPOINT, |
| 36 | + ods=ooxml.DOCTYPE_NONE, odt=ooxml.DOCTYPE_NONE, | |
| 37 | + odp=ooxml.DOCTYPE_NONE, | |
| 36 | 38 | ) |
| 37 | 39 | |
| 38 | 40 | # files that are neither OLE nor xml: | ... | ... |
tests/test-data/oleobj/embedded-simple-2007.odp
0 → 100644
No preview for this file type
tests/test-data/oleobj/embedded-simple-2007.ods
0 → 100644
No preview for this file type
tests/test-data/oleobj/embedded-simple-2007.odt
0 → 100644
No preview for this file type