Commit 8fac8b0ca497fbbdc5c0422fea4306792a6dba58

Authored by Philippe Lagadec
Committed by GitHub
2 parents f028496d a55bd780

Merge pull request #412 from christian-intra2net/ooxml-accept-OpenOffice

Avoid error in ooxml for non-office zip files
oletools/oleobj.py
@@ -750,13 +750,13 @@ def process_file(filename, data, output_dir=None): @@ -750,13 +750,13 @@ def process_file(filename, data, output_dir=None):
750 750
751 xml_parser = None 751 xml_parser = None
752 if is_zipfile(filename): 752 if is_zipfile(filename):
753 - log.info('file is a OOXML file, looking for relationships with external links') 753 + log.info('file could be an OOXML file, looking for relationships with '
  754 + 'external links')
754 xml_parser = XmlParser(filename) 755 xml_parser = XmlParser(filename)
755 for relationship, target in find_external_relationships(xml_parser): 756 for relationship, target in find_external_relationships(xml_parser):
756 did_dump = True 757 did_dump = True
757 print("Found relationship '%s' with external link %s" % (relationship, target)) 758 print("Found relationship '%s' with external link %s" % (relationship, target))
758 759
759 -  
760 # look for ole files inside file (e.g. unzip docx) 760 # look for ole files inside file (e.g. unzip docx)
761 # have to finish work on every ole stream inside iteration, since handles 761 # have to finish work on every ole stream inside iteration, since handles
762 # are closed in find_ole 762 # are closed in find_ole
@@ -765,9 +765,9 @@ def process_file(filename, data, output_dir=None): @@ -765,9 +765,9 @@ def process_file(filename, data, output_dir=None):
765 continue 765 continue
766 766
767 for path_parts in ole.listdir(): 767 for path_parts in ole.listdir():
  768 + stream_path = '/'.join(path_parts)
  769 + log.debug('Checking stream %r', stream_path)
768 if path_parts[-1] == '\x01Ole10Native': 770 if path_parts[-1] == '\x01Ole10Native':
769 - stream_path = '/'.join(path_parts)  
770 - log.debug('Checking stream %r', stream_path)  
771 stream = None 771 stream = None
772 try: 772 try:
773 stream = ole.openstream(path_parts) 773 stream = ole.openstream(path_parts)
oletools/ooxml.py
@@ -16,11 +16,11 @@ TODO: "xml2003" == "flatopc"? @@ -16,11 +16,11 @@ TODO: "xml2003" == "flatopc"?
16 """ 16 """
17 17
18 import sys 18 import sys
19 -from oletools.common.log_helper import log_helper  
20 from zipfile import ZipFile, BadZipfile, is_zipfile 19 from zipfile import ZipFile, BadZipfile, is_zipfile
21 from os.path import splitext 20 from os.path import splitext
22 import io 21 import io
23 import re 22 import re
  23 +from oletools.common.log_helper import log_helper
24 24
25 # import lxml or ElementTree for XML parsing: 25 # import lxml or ElementTree for XML parsing:
26 try: 26 try:
@@ -107,16 +107,14 @@ def debug_str(elem): @@ -107,16 +107,14 @@ def debug_str(elem):
107 text = u', '.join(parts) 107 text = u', '.join(parts)
108 if len(text) > 150: 108 if len(text) > 150:
109 return text[:147] + u'...]' 109 return text[:147] + u'...]'
110 - else:  
111 - return text + u']' 110 + return text + u']'
112 111
113 112
114 def isstr(some_var): 113 def isstr(some_var):
115 """ version-independent test for isinstance(some_var, (str, unicode)) """ 114 """ version-independent test for isinstance(some_var, (str, unicode)) """
116 if sys.version_info.major == 2: 115 if sys.version_info.major == 2:
117 return isinstance(some_var, basestring) # true for str and unicode 116 return isinstance(some_var, basestring) # true for str and unicode
118 - else:  
119 - return isinstance(some_var, str) # there is no unicode 117 + return isinstance(some_var, str) # there is no unicode
120 118
121 119
122 ############################################################################### 120 ###############################################################################
@@ -136,23 +134,29 @@ def get_type(filename): @@ -136,23 +134,29 @@ def get_type(filename):
136 prog_id = match.groups()[0] 134 prog_id = match.groups()[0]
137 if prog_id == WORD_XML_PROG_ID: 135 if prog_id == WORD_XML_PROG_ID:
138 return DOCTYPE_WORD_XML 136 return DOCTYPE_WORD_XML
139 - elif prog_id == EXCEL_XML_PROG_ID: 137 + if prog_id == EXCEL_XML_PROG_ID:
140 return DOCTYPE_EXCEL_XML 138 return DOCTYPE_EXCEL_XML
141 - else:  
142 - return DOCTYPE_NONE 139 + return DOCTYPE_NONE
143 140
144 is_doc = False 141 is_doc = False
145 is_xls = False 142 is_xls = False
146 is_ppt = False 143 is_ppt = False
147 - for _, elem, _ in parser.iter_xml(FILE_CONTENT_TYPES):  
148 - logger.debug(u' ' + debug_str(elem))  
149 - try:  
150 - content_type = elem.attrib['ContentType']  
151 - except KeyError: # ContentType not an attr  
152 - continue  
153 - is_xls |= content_type.startswith(CONTENT_TYPES_EXCEL)  
154 - is_doc |= content_type.startswith(CONTENT_TYPES_WORD)  
155 - is_ppt |= content_type.startswith(CONTENT_TYPES_PPT) 144 + try:
  145 + for _, elem, _ in parser.iter_xml(FILE_CONTENT_TYPES):
  146 + logger.debug(u' ' + debug_str(elem))
  147 + try:
  148 + content_type = elem.attrib['ContentType']
  149 + except KeyError: # ContentType not an attr
  150 + continue
  151 + is_xls |= content_type.startswith(CONTENT_TYPES_EXCEL)
  152 + is_doc |= content_type.startswith(CONTENT_TYPES_WORD)
  153 + is_ppt |= content_type.startswith(CONTENT_TYPES_PPT)
  154 + except BadOOXML as oo_err:
  155 + if oo_err.more_info.startswith('invalid subfile') and \
  156 + FILE_CONTENT_TYPES in oo_err.more_info:
  157 + # no FILE_CONTENT_TYPES in zip, so probably no ms office xml.
  158 + return DOCTYPE_NONE
  159 + raise
156 160
157 if is_doc and not is_xls and not is_ppt: 161 if is_doc and not is_xls and not is_ppt:
158 return DOCTYPE_WORD 162 return DOCTYPE_WORD
@@ -162,9 +166,8 @@ def get_type(filename): @@ -162,9 +166,8 @@ def get_type(filename):
162 return DOCTYPE_POWERPOINT 166 return DOCTYPE_POWERPOINT
163 if not is_doc and not is_xls and not is_ppt: 167 if not is_doc and not is_xls and not is_ppt:
164 return DOCTYPE_NONE 168 return DOCTYPE_NONE
165 - else:  
166 - logger.warning('Encountered contradictory content types')  
167 - return DOCTYPE_MIXED 169 + logger.warning('Encountered contradictory content types')
  170 + return DOCTYPE_MIXED
168 171
169 172
170 def is_ooxml(filename): 173 def is_ooxml(filename):
@@ -177,6 +180,7 @@ def is_ooxml(filename): @@ -177,6 +180,7 @@ def is_ooxml(filename):
177 return False 180 return False
178 if doctype == DOCTYPE_NONE: 181 if doctype == DOCTYPE_NONE:
179 return False 182 return False
  183 + return True
180 184
181 185
182 ############################################################################### 186 ###############################################################################
@@ -216,6 +220,7 @@ class ZipSubFile(object): @@ -216,6 +220,7 @@ class ZipSubFile(object):
216 See also (and maybe could some day merge with): 220 See also (and maybe could some day merge with):
217 ppt_record_parser.IterStream; also: oleobj.FakeFile 221 ppt_record_parser.IterStream; also: oleobj.FakeFile
218 """ 222 """
  223 + CHUNK_SIZE = 4096
219 224
220 def __init__(self, container, filename, mode='r', size=None): 225 def __init__(self, container, filename, mode='r', size=None):
221 """ remember all necessary vars but do not open yet """ 226 """ remember all necessary vars but do not open yet """
@@ -253,7 +258,7 @@ class ZipSubFile(object): @@ -253,7 +258,7 @@ class ZipSubFile(object):
253 # print('ZipSubFile: opened; size={}'.format(self.size)) 258 # print('ZipSubFile: opened; size={}'.format(self.size))
254 return self 259 return self
255 260
256 - def write(self, *args, **kwargs): # pylint: disable=unused-argument,no-self-use 261 + def write(self, *args, **kwargs):
257 """ write is not allowed """ 262 """ write is not allowed """
258 raise IOError('writing not implemented') 263 raise IOError('writing not implemented')
259 264
@@ -311,10 +316,9 @@ class ZipSubFile(object): @@ -311,10 +316,9 @@ class ZipSubFile(object):
311 """ helper for seek: skip forward by given amount using read() """ 316 """ helper for seek: skip forward by given amount using read() """
312 # print('ZipSubFile: seek by skipping {} bytes starting at {}' 317 # print('ZipSubFile: seek by skipping {} bytes starting at {}'
313 # .format(self.pos, to_skip)) 318 # .format(self.pos, to_skip))
314 - CHUNK_SIZE = 4096  
315 - n_chunks, leftover = divmod(to_skip, CHUNK_SIZE) 319 + n_chunks, leftover = divmod(to_skip, self.CHUNK_SIZE)
316 for _ in range(n_chunks): 320 for _ in range(n_chunks):
317 - self.read(CHUNK_SIZE) # just read and discard 321 + self.read(self.CHUNK_SIZE) # just read and discard
318 self.read(leftover) 322 self.read(leftover)
319 # print('ZipSubFile: seek by skipping done, pos now {}' 323 # print('ZipSubFile: seek by skipping done, pos now {}'
320 # .format(self.pos)) 324 # .format(self.pos))
@@ -417,8 +421,7 @@ class XmlParser(object): @@ -417,8 +421,7 @@ class XmlParser(object):
417 if match: 421 if match:
418 self._is_single_xml = True 422 self._is_single_xml = True
419 return True 423 return True
420 - if not match:  
421 - raise BadOOXML(self.filename, 'is no zip and has no prog_id') 424 + raise BadOOXML(self.filename, 'is no zip and has no prog_id')
422 425
423 def iter_files(self, args=None): 426 def iter_files(self, args=None):
424 """ Find files in zip or just give single xml file """ 427 """ Find files in zip or just give single xml file """
@@ -433,17 +436,14 @@ class XmlParser(object): @@ -433,17 +436,14 @@ class XmlParser(object):
433 subfiles = None 436 subfiles = None
434 try: 437 try:
435 zipper = ZipFile(self.filename) 438 zipper = ZipFile(self.filename)
436 - try:  
437 - _ = zipper.getinfo(FILE_CONTENT_TYPES)  
438 - except KeyError:  
439 - raise BadOOXML(self.filename,  
440 - 'No content type information')  
441 if not args: 439 if not args:
442 subfiles = zipper.namelist() 440 subfiles = zipper.namelist()
443 elif isstr(args): 441 elif isstr(args):
444 subfiles = [args, ] 442 subfiles = [args, ]
445 else: 443 else:
446 - subfiles = tuple(args) # make a copy in case orig changes 444 + # make a copy in case original args are modified
  445 + # Not sure whether this really is needed...
  446 + subfiles = tuple(arg for arg in args)
447 447
448 for subfile in subfiles: 448 for subfile in subfiles:
449 with zipper.open(subfile, 'r') as handle: 449 with zipper.open(subfile, 'r') as handle:
@@ -451,10 +451,12 @@ class XmlParser(object): @@ -451,10 +451,12 @@ class XmlParser(object):
451 if not args: 451 if not args:
452 self.did_iter_all = True 452 self.did_iter_all = True
453 except KeyError as orig_err: 453 except KeyError as orig_err:
  454 + # Note: do not change text of this message without adjusting
  455 + # conditions in except handlers
454 raise BadOOXML(self.filename, 456 raise BadOOXML(self.filename,
455 'invalid subfile: ' + str(orig_err)) 457 'invalid subfile: ' + str(orig_err))
456 except BadZipfile: 458 except BadZipfile:
457 - raise BadOOXML(self.filename, 'neither zip nor xml') 459 + raise BadOOXML(self.filename, 'not in zip format')
458 finally: 460 finally:
459 if zipper: 461 if zipper:
460 zipper.close() 462 zipper.close()
@@ -503,7 +505,7 @@ class XmlParser(object): @@ -503,7 +505,7 @@ class XmlParser(object):
503 if event == 'start': 505 if event == 'start':
504 if elem.tag in want_tags: 506 if elem.tag in want_tags:
505 logger.debug('remember start of tag {0} at {1}' 507 logger.debug('remember start of tag {0} at {1}'
506 - .format(elem.tag, depth)) 508 + .format(elem.tag, depth))
507 inside_tags.append((elem.tag, depth)) 509 inside_tags.append((elem.tag, depth))
508 depth += 1 510 depth += 1
509 continue 511 continue
@@ -519,18 +521,18 @@ class XmlParser(object): @@ -519,18 +521,18 @@ class XmlParser(object):
519 inside_tags.pop() 521 inside_tags.pop()
520 else: 522 else:
521 logger.error('found end for wanted tag {0} ' 523 logger.error('found end for wanted tag {0} '
522 - 'but last start tag {1} does not'  
523 - ' match'.format(curr_tag,  
524 - inside_tags[-1])) 524 + 'but last start tag {1} does not'
  525 + ' match'.format(curr_tag,
  526 + inside_tags[-1]))
525 # try to recover: close all deeper tags 527 # try to recover: close all deeper tags
526 while inside_tags and \ 528 while inside_tags and \
527 inside_tags[-1][1] >= depth: 529 inside_tags[-1][1] >= depth:
528 logger.debug('recover: pop {0}' 530 logger.debug('recover: pop {0}'
529 - .format(inside_tags[-1])) 531 + .format(inside_tags[-1]))
530 inside_tags.pop() 532 inside_tags.pop()
531 except IndexError: # no inside_tag[-1] 533 except IndexError: # no inside_tag[-1]
532 logger.error('found end of {0} at depth {1} but ' 534 logger.error('found end of {0} at depth {1} but '
533 - 'no start event') 535 + 'no start event')
534 # yield element 536 # yield element
535 if is_wanted or not want_tags: 537 if is_wanted or not want_tags:
536 yield subfile, elem, depth 538 yield subfile, elem, depth
@@ -544,7 +546,7 @@ class XmlParser(object): @@ -544,7 +546,7 @@ class XmlParser(object):
544 except ET.ParseError as err: 546 except ET.ParseError as err:
545 self.subfiles_no_xml.add(subfile) 547 self.subfiles_no_xml.add(subfile)
546 if subfile is None: # this is no zip subfile but single xml 548 if subfile is None: # this is no zip subfile but single xml
547 - raise BadOOXML(self.filename, 'is neither zip nor xml') 549 + raise BadOOXML(self.filename, 'content is not valid XML')
548 elif subfile.endswith('.xml'): 550 elif subfile.endswith('.xml'):
549 log = logger.warning 551 log = logger.warning
550 else: 552 else:
@@ -568,21 +570,30 @@ class XmlParser(object): @@ -568,21 +570,30 @@ class XmlParser(object):
568 570
569 defaults = [] 571 defaults = []
570 files = [] 572 files = []
571 - for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES):  
572 - if elem.tag.endswith('Default'):  
573 - extension = elem.attrib['Extension']  
574 - if extension.startswith('.'):  
575 - extension = extension[1:]  
576 - defaults.append((extension, elem.attrib['ContentType']))  
577 - logger.debug('found content type for extension {0[0]}: {0[1]}'  
578 - .format(defaults[-1]))  
579 - elif elem.tag.endswith('Override'):  
580 - subfile = elem.attrib['PartName']  
581 - if subfile.startswith('/'):  
582 - subfile = subfile[1:]  
583 - files.append((subfile, elem.attrib['ContentType']))  
584 - logger.debug('found content type for subfile {0[0]}: {0[1]}'  
585 - .format(files[-1])) 573 + try:
  574 + for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES):
  575 + if elem.tag.endswith('Default'):
  576 + extension = elem.attrib['Extension']
  577 + if extension.startswith('.'):
  578 + extension = extension[1:]
  579 + defaults.append((extension, elem.attrib['ContentType']))
  580 + logger.debug('found content type for extension {0[0]}: '
  581 + '{0[1]}'.format(defaults[-1]))
  582 + elif elem.tag.endswith('Override'):
  583 + subfile = elem.attrib['PartName']
  584 + if subfile.startswith('/'):
  585 + subfile = subfile[1:]
  586 + files.append((subfile, elem.attrib['ContentType']))
  587 + logger.debug('found content type for subfile {0[0]}: '
  588 + '{0[1]}'.format(files[-1]))
  589 + except BadOOXML as oo_err:
  590 + if oo_err.more_info.startswith('invalid subfile') and \
  591 + FILE_CONTENT_TYPES in oo_err.more_info:
  592 + # no FILE_CONTENT_TYPES in zip, so probably no ms office xml.
  593 + # Maybe OpenDocument format? In any case, try to analyze.
  594 + pass
  595 + else:
  596 + raise
586 return dict(files), dict(defaults) 597 return dict(files), dict(defaults)
587 598
588 def iter_non_xml(self): 599 def iter_non_xml(self):
@@ -599,7 +610,7 @@ class XmlParser(object): @@ -599,7 +610,7 @@ class XmlParser(object):
599 """ 610 """
600 if not self.did_iter_all: 611 if not self.did_iter_all:
601 logger.warning('Did not iterate through complete file. ' 612 logger.warning('Did not iterate through complete file. '
602 - 'Should run iter_xml() without args, first.') 613 + 'Should run iter_xml() without args, first.')
603 if not self.subfiles_no_xml: 614 if not self.subfiles_no_xml:
604 return 615 return
605 616
@@ -631,7 +642,7 @@ def test(): @@ -631,7 +642,7 @@ def test():
631 642
632 see module doc for more info 643 see module doc for more info
633 """ 644 """
634 - log_helper.enable_logging(False, logger.DEBUG) 645 + log_helper.enable_logging(False, 'debug')
635 if len(sys.argv) != 2: 646 if len(sys.argv) != 2:
636 print(u'To test this code, give me a single file as arg') 647 print(u'To test this code, give me a single file as arg')
637 return 2 648 return 2
tests/oleobj/test_basic.py
@@ -41,8 +41,10 @@ SAMPLES += tuple( @@ -41,8 +41,10 @@ SAMPLES += tuple(
41 'ab8c65e4c0fc51739aa66ca5888265b4') 41 'ab8c65e4c0fc51739aa66ca5888265b4')
42 for extn in ('xls', 'xlsx', 'xlsb', 'xlsm', 'xla', 'xlam', 'xlt', 'xltm', 42 for extn in ('xls', 'xlsx', 'xlsb', 'xlsm', 'xla', 'xlam', 'xlt', 'xltm',
43 'xltx', 'ppt', 'pptx', 'pptm', 'pps', 'ppsx', 'ppsm', 'pot', 43 'xltx', 'ppt', 'pptx', 'pptm', 'pps', 'ppsx', 'ppsm', 'pot',
44 - 'potx', 'potm') 44 + 'potx', 'potm', 'ods', 'odp')
45 ) 45 )
  46 +SAMPLES += (('embedded-simple-2007.odt', 'simple-text-file.txt',
  47 + 'bd5c063a5a43f67b3c50dc7b0f1195af'), )
46 48
47 49
48 def calc_md5(filename): 50 def calc_md5(filename):
tests/ooxml/test_basic.py
@@ -33,6 +33,8 @@ class TestOOXML(unittest.TestCase): @@ -33,6 +33,8 @@ class TestOOXML(unittest.TestCase):
33 pptx=ooxml.DOCTYPE_POWERPOINT, pptm=ooxml.DOCTYPE_POWERPOINT, 33 pptx=ooxml.DOCTYPE_POWERPOINT, pptm=ooxml.DOCTYPE_POWERPOINT,
34 ppsx=ooxml.DOCTYPE_POWERPOINT, ppsm=ooxml.DOCTYPE_POWERPOINT, 34 ppsx=ooxml.DOCTYPE_POWERPOINT, ppsm=ooxml.DOCTYPE_POWERPOINT,
35 potx=ooxml.DOCTYPE_POWERPOINT, potm=ooxml.DOCTYPE_POWERPOINT, 35 potx=ooxml.DOCTYPE_POWERPOINT, potm=ooxml.DOCTYPE_POWERPOINT,
  36 + ods=ooxml.DOCTYPE_NONE, odt=ooxml.DOCTYPE_NONE,
  37 + odp=ooxml.DOCTYPE_NONE,
36 ) 38 )
37 39
38 # files that are neither OLE nor xml: 40 # files that are neither OLE nor xml:
tests/test-data/oleobj/embedded-simple-2007.odp 0 → 100644
No preview for this file type
tests/test-data/oleobj/embedded-simple-2007.ods 0 → 100644
No preview for this file type
tests/test-data/oleobj/embedded-simple-2007.odt 0 → 100644
No preview for this file type