Commit 4257c83682aaa763578237bd99c47e0326b0be03

Authored by Christian Herdtweck
Committed by Philippe Lagadec
1 parent 3235357b

ooxml: create new helper for xml parsing

Showing 1 changed file with 180 additions and 0 deletions
oletools/ooxml.py 0 → 100644
  1 +#!/usr/bin/env python3
  2 +
  3 +""" Common operations for Office Open XML (docx, xlsx, pptx, ...) files
  4 +
  5 +This is mostly based on ECMA-376 (5th edition, Part 1)
  6 +http://www.ecma-international.org/publications/standards/Ecma-376.htm
  7 +
  8 +See also: Notes on Microsoft's implementation of ECMA-376: [MS-OE376]
  9 +
  10 +.. codeauthor:: Intra2net AG <info@intra2net>
  11 +"""
  12 +
  13 +import sys
  14 +import logging
  15 +from zipfile import ZipFile, BadZipfile
  16 +from traceback import print_exc
  17 +
  18 +# import lxml or ElementTree for XML parsing:
  19 +try:
  20 + # lxml: best performance for XML processing
  21 + import lxml.etree as ET
  22 +except ImportError:
  23 + import xml.etree.cElementTree as ET
  24 +
  25 +
#: archive members that must be present in every ooxml file
FILE_CONTENT_TYPES = '[Content_Types].xml'
FILE_RELATIONSHIPS = '_rels/.rels'

# shared beginnings of the content type strings listed below
_OFFICEDOC = 'application/vnd.openxmlformats-officedocument.'
_PACKAGE = 'application/vnd.openxmlformats-package.'

#: beginnings of ContentType attribute values, one tuple per application;
#: tuples so they can be handed directly to str.startswith
CONTENT_TYPES_EXCEL = (_OFFICEDOC + 'spreadsheetml.',
                       'application/vnd.ms-excel.')
CONTENT_TYPES_WORD = (_OFFICEDOC + 'wordprocessingml.', )
CONTENT_TYPES_PPT = (_OFFICEDOC + 'presentationml.', )

#: complete content types that do not hint at any particular application
#: (currently only used for debug output)
CONTENT_TYPES_NEUTRAL = (
    'application/xml',
    _PACKAGE + 'relationships+xml',
    _PACKAGE + 'core-properties+xml',
    _OFFICEDOC + 'theme+xml',
    _OFFICEDOC + 'extended-properties+xml',
)

#: possible results of document type detection
DOCTYPE_WORD = 'word'
DOCTYPE_EXCEL = 'excel'
DOCTYPE_POWERPOINT = 'powerpoint'
DOCTYPE_NONE = 'none'
DOCTYPE_MIXED = 'mixed'
  57 +
  58 +
def debug_str(elem):
    """ for debugging: create a one-line text representation of an element

    The result looks like ``[tag=..., text="...", tail="...", attr="val"]``
    where text, tail and attributes only show up if they are set. A namespace
    prefix of the form ``{uri}`` in the tag is abbreviated to ``{...}``.
    Values of a `ContentType` attribute get a suffix telling which of the
    CONTENT_TYPES_* groups they matched (``!!!`` if none did).

    :param elem: an xml element (as yielded by :py:func:`iter_xml`) or None
    :returns: text describing the element; `u'None'` if elem is None
    """
    if elem is None:
        return u'None'

    tag = elem.tag
    # slice instead of index so an empty tag does not raise IndexError
    if tag[:1] == '{' and tag.count('}') == 1:
        # namespace uris are long and repetitive, so show local name only
        parts = ['[tag={{...}}{0}'.format(tag[tag.index('}')+1:]), ]
    else:
        parts = ['[tag={0}'.format(tag), ]

    if elem.text:
        parts.append(u'text="{0}"'.format(elem.text))
    if elem.tail:
        parts.append(u'tail="{0}"'.format(elem.tail))

    # items() exists on plain dicts (ElementTree) in python 2 and 3 as well as
    # on lxml attribute objects; iteritems() is missing from python3 dicts
    for key, value in elem.attrib.items():
        part = u'{0}="{1}"'.format(key, value)
        if key == 'ContentType':
            if value.startswith(CONTENT_TYPES_EXCEL):
                part += u'-->xls'
            elif value.startswith(CONTENT_TYPES_WORD):
                part += u'-->doc'
            elif value.startswith(CONTENT_TYPES_PPT):
                part += u'-->ppt'
            elif value in CONTENT_TYPES_NEUTRAL:
                part += u'-->_'
            else:
                part += u'!!!'
        parts.append(part)

    return u', '.join(parts) + u']'
  86 +
  87 +
def get_type(filename):
    """ Determine the kind of office document from its content types

    Goes through all elements of the content types subfile and checks which
    of the application-specific content type prefixes show up there.

    :param filename: the ooxml file to inspect (handed on to iter_xml)
    :returns: one of the DOCTYPE_* constants: DOCTYPE_NONE if no application
              was recognized, DOCTYPE_MIXED if more than one was
    Errors raised by :py:func:`iter_xml` are not handled here.
    """
    found_word = False
    found_excel = False
    found_ppt = False

    for _, elem, _ in iter_xml(filename, FILE_CONTENT_TYPES):
        logging.debug(u' ' + debug_str(elem))
        content_type = elem.attrib.get('ContentType')
        if content_type is None:      # element has no such attribute
            continue
        found_excel = found_excel or content_type.startswith(
            CONTENT_TYPES_EXCEL)
        found_word = found_word or content_type.startswith(
            CONTENT_TYPES_WORD)
        found_ppt = found_ppt or content_type.startswith(
            CONTENT_TYPES_PPT)

    # collect the doc types whose content types were encountered
    candidates = (
        (found_word, DOCTYPE_WORD),
        (found_excel, DOCTYPE_EXCEL),
        (found_ppt, DOCTYPE_POWERPOINT),
    )
    detected = [doctype for found, doctype in candidates if found]

    if not detected:
        return DOCTYPE_NONE
    if len(detected) == 1:
        return detected[0]
    return DOCTYPE_MIXED
  115 +
  116 +
def is_ooxml(filename):
    """ Determine whether given file is an ooxml file; tries get_type

    :param filename: file to check (handed on to get_type)
    :returns: True if get_type could process the file, False if the file is
              not a zip archive, cannot be read or lacks the content types
              subfile
    """
    try:
        get_type(filename)
    except BadZipfile:
        return False
    except (IOError, KeyError):
        # IOError: file itself could not be read;
        # KeyError: ZipFile.open raises this if a required subfile is missing
        return False
    # without this the function returned None (i.e. false) for every file
    return True
  125 +
  126 +
def iter_xml(filename, *args):
    """ Iterate xml contents of document

    If given subfile name[s] as optional arg[s], will only parse that subfile[s]

    yields 3-tuples (subfilename, element, depth) where depth indicates how deep
    in the hierarchy the element is located. Containers of element will come
    *after* the elements they contain (since they are only finished then).

    Will silently ignore errors in xml-parsing of a file (only a warning is
    logged), since subfiles can be OLE or embedded image files.

    :param filename: name of (or file-like object for) the zip container
    :param args: names of subfiles to parse; default: all subfiles in archive
    :raises BadZipfile: if filename is not a zip archive
    :raises KeyError: if a requested subfile is not part of the archive
    """
    with ZipFile(filename) as archive:
        subfiles = args if args else archive.namelist()
        for subfile in subfiles:
            logging.debug(u'subfile {0}'.format(subfile))
            depth = 0
            try:
                # mode must be plain 'r': ZipFile.open rejects 'rU' with a
                # ValueError since python 3.6
                with archive.open(subfile, 'r') as handle:
                    for event, elem in ET.iterparse(handle,
                                                    events=('start', 'end')):
                        if elem is None:
                            continue
                        if event == 'start':
                            depth += 1
                            continue
                        assert event == 'end'
                        depth -= 1
                        assert depth >= 0
                        yield subfile, elem, depth
            except ET.ParseError:
                # not (valid) xml, e.g. binary or truncated data: skip rest
                logging.warning(' xml-parsing for {0} failed'.format(subfile))
            else:
                # only a completely parsed document must be balanced; after a
                # parse error elements may legitimately still be open
                assert depth == 0
  162 +
  163 +
def test():
    """ Main function, called when running file as script

    Expects exactly one command line argument, the name of the file to
    inspect. Prints all xml elements found in that file, indented by their
    depth in the xml hierarchy.

    :returns: exit code for the script: 0 on success, 2 on wrong usage
    """
    arguments = sys.argv[1:]
    if len(arguments) != 1:
        print(u'To test this code, give me a single file as arg')
        return 2

    for _, elem, depth in iter_xml(arguments[0]):
        indent = ' ' * depth
        print(u'{0}{1}'.format(indent, debug_str(elem)))
    return 0


if __name__ == '__main__':
    sys.exit(test())
... ...