Commit 8fac8b0ca497fbbdc5c0422fea4306792a6dba58

Authored by Philippe Lagadec
Committed by GitHub
2 parents f028496d a55bd780

Merge pull request #412 from christian-intra2net/ooxml-accept-OpenOffice

Avoid error in ooxml for non-office zip files
oletools/oleobj.py
... ... @@ -750,13 +750,13 @@ def process_file(filename, data, output_dir=None):
750 750  
751 751 xml_parser = None
752 752 if is_zipfile(filename):
753   - log.info('file is a OOXML file, looking for relationships with external links')
  753 + log.info('file could be an OOXML file, looking for relationships with '
  754 + 'external links')
754 755 xml_parser = XmlParser(filename)
755 756 for relationship, target in find_external_relationships(xml_parser):
756 757 did_dump = True
757 758 print("Found relationship '%s' with external link %s" % (relationship, target))
758 759  
759   -
760 760 # look for ole files inside file (e.g. unzip docx)
761 761 # have to finish work on every ole stream inside iteration, since handles
762 762 # are closed in find_ole
... ... @@ -765,9 +765,9 @@ def process_file(filename, data, output_dir=None):
765 765 continue
766 766  
767 767 for path_parts in ole.listdir():
  768 + stream_path = '/'.join(path_parts)
  769 + log.debug('Checking stream %r', stream_path)
768 770 if path_parts[-1] == '\x01Ole10Native':
769   - stream_path = '/'.join(path_parts)
770   - log.debug('Checking stream %r', stream_path)
771 771 stream = None
772 772 try:
773 773 stream = ole.openstream(path_parts)
... ...
oletools/ooxml.py
... ... @@ -16,11 +16,11 @@ TODO: "xml2003" == "flatopc"?
16 16 """
17 17  
18 18 import sys
19   -from oletools.common.log_helper import log_helper
20 19 from zipfile import ZipFile, BadZipfile, is_zipfile
21 20 from os.path import splitext
22 21 import io
23 22 import re
  23 +from oletools.common.log_helper import log_helper
24 24  
25 25 # import lxml or ElementTree for XML parsing:
26 26 try:
... ... @@ -107,16 +107,14 @@ def debug_str(elem):
107 107 text = u', '.join(parts)
108 108 if len(text) > 150:
109 109 return text[:147] + u'...]'
110   - else:
111   - return text + u']'
  110 + return text + u']'
112 111  
113 112  
114 113 def isstr(some_var):
115 114 """ version-independent test for isinstance(some_var, (str, unicode)) """
116 115 if sys.version_info.major == 2:
117 116 return isinstance(some_var, basestring) # true for str and unicode
118   - else:
119   - return isinstance(some_var, str) # there is no unicode
  117 + return isinstance(some_var, str) # there is no unicode
120 118  
121 119  
122 120 ###############################################################################
... ... @@ -136,23 +134,29 @@ def get_type(filename):
136 134 prog_id = match.groups()[0]
137 135 if prog_id == WORD_XML_PROG_ID:
138 136 return DOCTYPE_WORD_XML
139   - elif prog_id == EXCEL_XML_PROG_ID:
  137 + if prog_id == EXCEL_XML_PROG_ID:
140 138 return DOCTYPE_EXCEL_XML
141   - else:
142   - return DOCTYPE_NONE
  139 + return DOCTYPE_NONE
143 140  
144 141 is_doc = False
145 142 is_xls = False
146 143 is_ppt = False
147   - for _, elem, _ in parser.iter_xml(FILE_CONTENT_TYPES):
148   - logger.debug(u' ' + debug_str(elem))
149   - try:
150   - content_type = elem.attrib['ContentType']
151   - except KeyError: # ContentType not an attr
152   - continue
153   - is_xls |= content_type.startswith(CONTENT_TYPES_EXCEL)
154   - is_doc |= content_type.startswith(CONTENT_TYPES_WORD)
155   - is_ppt |= content_type.startswith(CONTENT_TYPES_PPT)
  144 + try:
  145 + for _, elem, _ in parser.iter_xml(FILE_CONTENT_TYPES):
  146 + logger.debug(u' ' + debug_str(elem))
  147 + try:
  148 + content_type = elem.attrib['ContentType']
  149 + except KeyError: # ContentType not an attr
  150 + continue
  151 + is_xls |= content_type.startswith(CONTENT_TYPES_EXCEL)
  152 + is_doc |= content_type.startswith(CONTENT_TYPES_WORD)
  153 + is_ppt |= content_type.startswith(CONTENT_TYPES_PPT)
  154 + except BadOOXML as oo_err:
  155 + if oo_err.more_info.startswith('invalid subfile') and \
  156 + FILE_CONTENT_TYPES in oo_err.more_info:
  157 + # no FILE_CONTENT_TYPES in zip, so probably no ms office xml.
  158 + return DOCTYPE_NONE
  159 + raise
156 160  
157 161 if is_doc and not is_xls and not is_ppt:
158 162 return DOCTYPE_WORD
... ... @@ -162,9 +166,8 @@ def get_type(filename):
162 166 return DOCTYPE_POWERPOINT
163 167 if not is_doc and not is_xls and not is_ppt:
164 168 return DOCTYPE_NONE
165   - else:
166   - logger.warning('Encountered contradictory content types')
167   - return DOCTYPE_MIXED
  169 + logger.warning('Encountered contradictory content types')
  170 + return DOCTYPE_MIXED
168 171  
169 172  
170 173 def is_ooxml(filename):
... ... @@ -177,6 +180,7 @@ def is_ooxml(filename):
177 180 return False
178 181 if doctype == DOCTYPE_NONE:
179 182 return False
  183 + return True
180 184  
181 185  
182 186 ###############################################################################
... ... @@ -216,6 +220,7 @@ class ZipSubFile(object):
216 220 See also (and maybe could some day merge with):
217 221 ppt_record_parser.IterStream; also: oleobj.FakeFile
218 222 """
  223 + CHUNK_SIZE = 4096
219 224  
220 225 def __init__(self, container, filename, mode='r', size=None):
221 226 """ remember all necessary vars but do not open yet """
... ... @@ -253,7 +258,7 @@ class ZipSubFile(object):
253 258 # print('ZipSubFile: opened; size={}'.format(self.size))
254 259 return self
255 260  
256   - def write(self, *args, **kwargs): # pylint: disable=unused-argument,no-self-use
  261 + def write(self, *args, **kwargs):
257 262 """ write is not allowed """
258 263 raise IOError('writing not implemented')
259 264  
... ... @@ -311,10 +316,9 @@ class ZipSubFile(object):
311 316 """ helper for seek: skip forward by given amount using read() """
312 317 # print('ZipSubFile: seek by skipping {} bytes starting at {}'
313 318 # .format(self.pos, to_skip))
314   - CHUNK_SIZE = 4096
315   - n_chunks, leftover = divmod(to_skip, CHUNK_SIZE)
  319 + n_chunks, leftover = divmod(to_skip, self.CHUNK_SIZE)
316 320 for _ in range(n_chunks):
317   - self.read(CHUNK_SIZE) # just read and discard
  321 + self.read(self.CHUNK_SIZE) # just read and discard
318 322 self.read(leftover)
319 323 # print('ZipSubFile: seek by skipping done, pos now {}'
320 324 # .format(self.pos))
... ... @@ -417,8 +421,7 @@ class XmlParser(object):
417 421 if match:
418 422 self._is_single_xml = True
419 423 return True
420   - if not match:
421   - raise BadOOXML(self.filename, 'is no zip and has no prog_id')
  424 + raise BadOOXML(self.filename, 'is no zip and has no prog_id')
422 425  
423 426 def iter_files(self, args=None):
424 427 """ Find files in zip or just give single xml file """
... ... @@ -433,17 +436,14 @@ class XmlParser(object):
433 436 subfiles = None
434 437 try:
435 438 zipper = ZipFile(self.filename)
436   - try:
437   - _ = zipper.getinfo(FILE_CONTENT_TYPES)
438   - except KeyError:
439   - raise BadOOXML(self.filename,
440   - 'No content type information')
441 439 if not args:
442 440 subfiles = zipper.namelist()
443 441 elif isstr(args):
444 442 subfiles = [args, ]
445 443 else:
446   - subfiles = tuple(args) # make a copy in case orig changes
  444 + # make a copy in case original args are modified
  445 + # Not sure whether this really is needed...
  446 + subfiles = tuple(arg for arg in args)
447 447  
448 448 for subfile in subfiles:
449 449 with zipper.open(subfile, 'r') as handle:
... ... @@ -451,10 +451,12 @@ class XmlParser(object):
451 451 if not args:
452 452 self.did_iter_all = True
453 453 except KeyError as orig_err:
  454 + # Note: do not change text of this message without adjusting
  455 + # conditions in except handlers
454 456 raise BadOOXML(self.filename,
455 457 'invalid subfile: ' + str(orig_err))
456 458 except BadZipfile:
457   - raise BadOOXML(self.filename, 'neither zip nor xml')
  459 + raise BadOOXML(self.filename, 'not in zip format')
458 460 finally:
459 461 if zipper:
460 462 zipper.close()
... ... @@ -503,7 +505,7 @@ class XmlParser(object):
503 505 if event == 'start':
504 506 if elem.tag in want_tags:
505 507 logger.debug('remember start of tag {0} at {1}'
506   - .format(elem.tag, depth))
  508 + .format(elem.tag, depth))
507 509 inside_tags.append((elem.tag, depth))
508 510 depth += 1
509 511 continue
... ... @@ -519,18 +521,18 @@ class XmlParser(object):
519 521 inside_tags.pop()
520 522 else:
521 523 logger.error('found end for wanted tag {0} '
522   - 'but last start tag {1} does not'
523   - ' match'.format(curr_tag,
524   - inside_tags[-1]))
  524 + 'but last start tag {1} does not'
  525 + ' match'.format(curr_tag,
  526 + inside_tags[-1]))
525 527 # try to recover: close all deeper tags
526 528 while inside_tags and \
527 529 inside_tags[-1][1] >= depth:
528 530 logger.debug('recover: pop {0}'
529   - .format(inside_tags[-1]))
  531 + .format(inside_tags[-1]))
530 532 inside_tags.pop()
531 533 except IndexError: # no inside_tag[-1]
532 534 logger.error('found end of {0} at depth {1} but '
533   - 'no start event')
  535 + 'no start event')
534 536 # yield element
535 537 if is_wanted or not want_tags:
536 538 yield subfile, elem, depth
... ... @@ -544,7 +546,7 @@ class XmlParser(object):
544 546 except ET.ParseError as err:
545 547 self.subfiles_no_xml.add(subfile)
546 548 if subfile is None: # this is no zip subfile but single xml
547   - raise BadOOXML(self.filename, 'is neither zip nor xml')
  549 + raise BadOOXML(self.filename, 'content is not valid XML')
548 550 elif subfile.endswith('.xml'):
549 551 log = logger.warning
550 552 else:
... ... @@ -568,21 +570,30 @@ class XmlParser(object):
568 570  
569 571 defaults = []
570 572 files = []
571   - for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES):
572   - if elem.tag.endswith('Default'):
573   - extension = elem.attrib['Extension']
574   - if extension.startswith('.'):
575   - extension = extension[1:]
576   - defaults.append((extension, elem.attrib['ContentType']))
577   - logger.debug('found content type for extension {0[0]}: {0[1]}'
578   - .format(defaults[-1]))
579   - elif elem.tag.endswith('Override'):
580   - subfile = elem.attrib['PartName']
581   - if subfile.startswith('/'):
582   - subfile = subfile[1:]
583   - files.append((subfile, elem.attrib['ContentType']))
584   - logger.debug('found content type for subfile {0[0]}: {0[1]}'
585   - .format(files[-1]))
  573 + try:
  574 + for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES):
  575 + if elem.tag.endswith('Default'):
  576 + extension = elem.attrib['Extension']
  577 + if extension.startswith('.'):
  578 + extension = extension[1:]
  579 + defaults.append((extension, elem.attrib['ContentType']))
  580 + logger.debug('found content type for extension {0[0]}: '
  581 + '{0[1]}'.format(defaults[-1]))
  582 + elif elem.tag.endswith('Override'):
  583 + subfile = elem.attrib['PartName']
  584 + if subfile.startswith('/'):
  585 + subfile = subfile[1:]
  586 + files.append((subfile, elem.attrib['ContentType']))
  587 + logger.debug('found content type for subfile {0[0]}: '
  588 + '{0[1]}'.format(files[-1]))
  589 + except BadOOXML as oo_err:
  590 + if oo_err.more_info.startswith('invalid subfile') and \
  591 + FILE_CONTENT_TYPES in oo_err.more_info:
  592 + # no FILE_CONTENT_TYPES in zip, so probably no ms office xml.
  593 + # Maybe OpenDocument format? In any case, try to analyze.
  594 + pass
  595 + else:
  596 + raise
586 597 return dict(files), dict(defaults)
587 598  
588 599 def iter_non_xml(self):
... ... @@ -599,7 +610,7 @@ class XmlParser(object):
599 610 """
600 611 if not self.did_iter_all:
601 612 logger.warning('Did not iterate through complete file. '
602   - 'Should run iter_xml() without args, first.')
  613 + 'Should run iter_xml() without args, first.')
603 614 if not self.subfiles_no_xml:
604 615 return
605 616  
... ... @@ -631,7 +642,7 @@ def test():
631 642  
632 643 see module doc for more info
633 644 """
634   - log_helper.enable_logging(False, logger.DEBUG)
  645 + log_helper.enable_logging(False, 'debug')
635 646 if len(sys.argv) != 2:
636 647 print(u'To test this code, give me a single file as arg')
637 648 return 2
... ...
tests/oleobj/test_basic.py
... ... @@ -41,8 +41,10 @@ SAMPLES += tuple(
41 41 'ab8c65e4c0fc51739aa66ca5888265b4')
42 42 for extn in ('xls', 'xlsx', 'xlsb', 'xlsm', 'xla', 'xlam', 'xlt', 'xltm',
43 43 'xltx', 'ppt', 'pptx', 'pptm', 'pps', 'ppsx', 'ppsm', 'pot',
44   - 'potx', 'potm')
  44 + 'potx', 'potm', 'ods', 'odp')
45 45 )
  46 +SAMPLES += (('embedded-simple-2007.odt', 'simple-text-file.txt',
  47 + 'bd5c063a5a43f67b3c50dc7b0f1195af'), )
46 48  
47 49  
48 50 def calc_md5(filename):
... ...
tests/ooxml/test_basic.py
... ... @@ -33,6 +33,8 @@ class TestOOXML(unittest.TestCase):
33 33 pptx=ooxml.DOCTYPE_POWERPOINT, pptm=ooxml.DOCTYPE_POWERPOINT,
34 34 ppsx=ooxml.DOCTYPE_POWERPOINT, ppsm=ooxml.DOCTYPE_POWERPOINT,
35 35 potx=ooxml.DOCTYPE_POWERPOINT, potm=ooxml.DOCTYPE_POWERPOINT,
  36 + ods=ooxml.DOCTYPE_NONE, odt=ooxml.DOCTYPE_NONE,
  37 + odp=ooxml.DOCTYPE_NONE,
36 38 )
37 39  
38 40 # files that are neither OLE nor xml:
... ...
tests/test-data/oleobj/embedded-simple-2007.odp 0 → 100644
No preview for this file type
tests/test-data/oleobj/embedded-simple-2007.ods 0 → 100644
No preview for this file type
tests/test-data/oleobj/embedded-simple-2007.odt 0 → 100644
No preview for this file type