diff --git a/oletools/oleobj.py b/oletools/oleobj.py index a0f5e49..561a2a6 100644 --- a/oletools/oleobj.py +++ b/oletools/oleobj.py @@ -750,13 +750,13 @@ def process_file(filename, data, output_dir=None): xml_parser = None if is_zipfile(filename): - log.info('file is a OOXML file, looking for relationships with external links') + log.info('file could be an OOXML file, looking for relationships with ' + 'external links') xml_parser = XmlParser(filename) for relationship, target in find_external_relationships(xml_parser): did_dump = True print("Found relationship '%s' with external link %s" % (relationship, target)) - # look for ole files inside file (e.g. unzip docx) # have to finish work on every ole stream inside iteration, since handles # are closed in find_ole @@ -765,9 +765,9 @@ def process_file(filename, data, output_dir=None): continue for path_parts in ole.listdir(): + stream_path = '/'.join(path_parts) + log.debug('Checking stream %r', stream_path) if path_parts[-1] == '\x01Ole10Native': - stream_path = '/'.join(path_parts) - log.debug('Checking stream %r', stream_path) stream = None try: stream = ole.openstream(path_parts) diff --git a/oletools/ooxml.py b/oletools/ooxml.py index 174c46d..a36c99d 100644 --- a/oletools/ooxml.py +++ b/oletools/ooxml.py @@ -16,11 +16,11 @@ TODO: "xml2003" == "flatopc"? """ import sys -from oletools.common.log_helper import log_helper from zipfile import ZipFile, BadZipfile, is_zipfile from os.path import splitext import io import re +from oletools.common.log_helper import log_helper # import lxml or ElementTree for XML parsing: try: @@ -107,16 +107,14 @@ def debug_str(elem): text = u', '.join(parts) if len(text) > 150: return text[:147] + u'...]' - else: - return text + u']' + return text + u']' def isstr(some_var): """ version-independent test for isinstance(some_var, (str, unicode)) """ if sys.version_info.major == 2: return isinstance(some_var, basestring) # true for str and unicode - else: - return isinstance(some_var, str) # there is no unicode + return isinstance(some_var, str) # there is no unicode ############################################################################### @@ -136,23 +134,29 @@ def get_type(filename): prog_id = match.groups()[0] if prog_id == WORD_XML_PROG_ID: return DOCTYPE_WORD_XML - elif prog_id == EXCEL_XML_PROG_ID: + if prog_id == EXCEL_XML_PROG_ID: return DOCTYPE_EXCEL_XML - else: - return DOCTYPE_NONE + return DOCTYPE_NONE is_doc = False is_xls = False is_ppt = False - for _, elem, _ in parser.iter_xml(FILE_CONTENT_TYPES): - logger.debug(u' ' + debug_str(elem)) - try: - content_type = elem.attrib['ContentType'] - except KeyError: # ContentType not an attr - continue - is_xls |= content_type.startswith(CONTENT_TYPES_EXCEL) - is_doc |= content_type.startswith(CONTENT_TYPES_WORD) - is_ppt |= content_type.startswith(CONTENT_TYPES_PPT) + try: + for _, elem, _ in parser.iter_xml(FILE_CONTENT_TYPES): + logger.debug(u' ' + debug_str(elem)) + try: + content_type = elem.attrib['ContentType'] + except KeyError: # ContentType not an attr + continue + is_xls |= content_type.startswith(CONTENT_TYPES_EXCEL) + is_doc |= content_type.startswith(CONTENT_TYPES_WORD) + is_ppt |= content_type.startswith(CONTENT_TYPES_PPT) + except BadOOXML as oo_err: + if oo_err.more_info.startswith('invalid subfile') and \ + FILE_CONTENT_TYPES in oo_err.more_info: + # no FILE_CONTENT_TYPES in zip, so probably no ms office xml. + return DOCTYPE_NONE + raise if is_doc and not is_xls and not is_ppt: return DOCTYPE_WORD @@ -162,9 +166,8 @@ def get_type(filename): return DOCTYPE_POWERPOINT if not is_doc and not is_xls and not is_ppt: return DOCTYPE_NONE - else: - logger.warning('Encountered contradictory content types') - return DOCTYPE_MIXED + logger.warning('Encountered contradictory content types') + return DOCTYPE_MIXED def is_ooxml(filename): @@ -177,6 +180,7 @@ def is_ooxml(filename): return False if doctype == DOCTYPE_NONE: return False + return True ############################################################################### @@ -216,6 +220,7 @@ class ZipSubFile(object): See also (and maybe could some day merge with): ppt_record_parser.IterStream; also: oleobj.FakeFile """ + CHUNK_SIZE = 4096 def __init__(self, container, filename, mode='r', size=None): """ remember all necessary vars but do not open yet """ @@ -253,7 +258,7 @@ class ZipSubFile(object): # print('ZipSubFile: opened; size={}'.format(self.size)) return self - def write(self, *args, **kwargs): # pylint: disable=unused-argument,no-self-use + def write(self, *args, **kwargs): """ write is not allowed """ raise IOError('writing not implemented') @@ -311,10 +316,9 @@ class ZipSubFile(object): """ helper for seek: skip forward by given amount using read() """ # print('ZipSubFile: seek by skipping {} bytes starting at {}' # .format(self.pos, to_skip)) - CHUNK_SIZE = 4096 - n_chunks, leftover = divmod(to_skip, CHUNK_SIZE) + n_chunks, leftover = divmod(to_skip, self.CHUNK_SIZE) for _ in range(n_chunks): - self.read(CHUNK_SIZE) # just read and discard + self.read(self.CHUNK_SIZE) # just read and discard self.read(leftover) # print('ZipSubFile: seek by skipping done, pos now {}' # .format(self.pos)) @@ -417,8 +421,7 @@ class XmlParser(object): if match: self._is_single_xml = True return True - if not match: - raise BadOOXML(self.filename, 'is no zip and has no prog_id') + raise BadOOXML(self.filename, 'is no zip and has no prog_id') def iter_files(self, args=None): """ Find files in zip or just give single xml file """ @@ -433,17 +436,14 @@ class XmlParser(object): subfiles = None try: zipper = ZipFile(self.filename) - try: - _ = zipper.getinfo(FILE_CONTENT_TYPES) - except KeyError: - raise BadOOXML(self.filename, - 'No content type information') if not args: subfiles = zipper.namelist() elif isstr(args): subfiles = [args, ] else: - subfiles = tuple(args) # make a copy in case orig changes + # make a copy in case original args are modified + # Not sure whether this really is needed... + subfiles = tuple(arg for arg in args) for subfile in subfiles: with zipper.open(subfile, 'r') as handle: @@ -451,10 +451,12 @@ class XmlParser(object): if not args: self.did_iter_all = True except KeyError as orig_err: + # Note: do not change text of this message without adjusting + # conditions in except handlers raise BadOOXML(self.filename, 'invalid subfile: ' + str(orig_err)) except BadZipfile: - raise BadOOXML(self.filename, 'neither zip nor xml') + raise BadOOXML(self.filename, 'not in zip format') finally: if zipper: zipper.close() @@ -503,7 +505,7 @@ class XmlParser(object): if event == 'start': if elem.tag in want_tags: logger.debug('remember start of tag {0} at {1}' - .format(elem.tag, depth)) + .format(elem.tag, depth)) inside_tags.append((elem.tag, depth)) depth += 1 continue @@ -519,18 +521,18 @@ class XmlParser(object): inside_tags.pop() else: logger.error('found end for wanted tag {0} ' - 'but last start tag {1} does not' - ' match'.format(curr_tag, - inside_tags[-1])) + 'but last start tag {1} does not' + ' match'.format(curr_tag, + inside_tags[-1])) # try to recover: close all deeper tags while inside_tags and \ inside_tags[-1][1] >= depth: logger.debug('recover: pop {0}' - .format(inside_tags[-1])) + .format(inside_tags[-1])) inside_tags.pop() except IndexError: # no inside_tag[-1] logger.error('found end of {0} at depth {1} but ' - 'no start event') + 'no start event') # yield element if is_wanted or not want_tags: yield subfile, elem, depth @@ -544,7 +546,7 @@ class XmlParser(object): except ET.ParseError as err: self.subfiles_no_xml.add(subfile) if subfile is None: # this is no zip subfile but single xml - raise BadOOXML(self.filename, 'is neither zip nor xml') + raise BadOOXML(self.filename, 'content is not valid XML') elif subfile.endswith('.xml'): log = logger.warning else: @@ -568,21 +570,30 @@ class XmlParser(object): defaults = [] files = [] - for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES): - if elem.tag.endswith('Default'): - extension = elem.attrib['Extension'] - if extension.startswith('.'): - extension = extension[1:] - defaults.append((extension, elem.attrib['ContentType'])) - logger.debug('found content type for extension {0[0]}: {0[1]}' - .format(defaults[-1])) - elif elem.tag.endswith('Override'): - subfile = elem.attrib['PartName'] - if subfile.startswith('/'): - subfile = subfile[1:] - files.append((subfile, elem.attrib['ContentType'])) - logger.debug('found content type for subfile {0[0]}: {0[1]}' - .format(files[-1])) + try: + for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES): + if elem.tag.endswith('Default'): + extension = elem.attrib['Extension'] + if extension.startswith('.'): + extension = extension[1:] + defaults.append((extension, elem.attrib['ContentType'])) + logger.debug('found content type for extension {0[0]}: ' + '{0[1]}'.format(defaults[-1])) + elif elem.tag.endswith('Override'): + subfile = elem.attrib['PartName'] + if subfile.startswith('/'): + subfile = subfile[1:] + files.append((subfile, elem.attrib['ContentType'])) + logger.debug('found content type for subfile {0[0]}: ' + '{0[1]}'.format(files[-1])) + except BadOOXML as oo_err: + if oo_err.more_info.startswith('invalid subfile') and \ + FILE_CONTENT_TYPES in oo_err.more_info: + # no FILE_CONTENT_TYPES in zip, so probably no ms office xml. + # Maybe OpenDocument format? In any case, try to analyze. + pass + else: + raise return dict(files), dict(defaults) def iter_non_xml(self): @@ -599,7 +610,7 @@ class XmlParser(object): """ if not self.did_iter_all: logger.warning('Did not iterate through complete file. ' - 'Should run iter_xml() without args, first.') + 'Should run iter_xml() without args, first.') if not self.subfiles_no_xml: return @@ -631,7 +642,7 @@ def test(): see module doc for more info """ - log_helper.enable_logging(False, logger.DEBUG) + log_helper.enable_logging(False, 'debug') if len(sys.argv) != 2: print(u'To test this code, give me a single file as arg') return 2 diff --git a/tests/oleobj/test_basic.py b/tests/oleobj/test_basic.py index 783ae5a..8154e54 100644 --- a/tests/oleobj/test_basic.py +++ b/tests/oleobj/test_basic.py @@ -41,8 +41,10 @@ SAMPLES += tuple( 'ab8c65e4c0fc51739aa66ca5888265b4') for extn in ('xls', 'xlsx', 'xlsb', 'xlsm', 'xla', 'xlam', 'xlt', 'xltm', 'xltx', 'ppt', 'pptx', 'pptm', 'pps', 'ppsx', 'ppsm', 'pot', - 'potx', 'potm') + 'potx', 'potm', 'ods', 'odp') ) +SAMPLES += (('embedded-simple-2007.odt', 'simple-text-file.txt', + 'bd5c063a5a43f67b3c50dc7b0f1195af'), ) def calc_md5(filename): diff --git a/tests/ooxml/test_basic.py b/tests/ooxml/test_basic.py index 440d08d..b97c432 100644 --- a/tests/ooxml/test_basic.py +++ b/tests/ooxml/test_basic.py @@ -33,6 +33,8 @@ class TestOOXML(unittest.TestCase): pptx=ooxml.DOCTYPE_POWERPOINT, pptm=ooxml.DOCTYPE_POWERPOINT, ppsx=ooxml.DOCTYPE_POWERPOINT, ppsm=ooxml.DOCTYPE_POWERPOINT, potx=ooxml.DOCTYPE_POWERPOINT, potm=ooxml.DOCTYPE_POWERPOINT, + ods=ooxml.DOCTYPE_NONE, odt=ooxml.DOCTYPE_NONE, + odp=ooxml.DOCTYPE_NONE, ) # files that are neither OLE nor xml: diff --git a/tests/test-data/oleobj/embedded-simple-2007.odp b/tests/test-data/oleobj/embedded-simple-2007.odp new file mode 100644 index 0000000..eeb85e8 --- /dev/null +++ b/tests/test-data/oleobj/embedded-simple-2007.odp diff --git a/tests/test-data/oleobj/embedded-simple-2007.ods b/tests/test-data/oleobj/embedded-simple-2007.ods new file mode 100644 index 0000000..e465229 --- /dev/null +++ b/tests/test-data/oleobj/embedded-simple-2007.ods diff --git a/tests/test-data/oleobj/embedded-simple-2007.odt b/tests/test-data/oleobj/embedded-simple-2007.odt new file mode 100644 index 0000000..c73fe59 --- /dev/null +++ b/tests/test-data/oleobj/embedded-simple-2007.odt