Peter M. Groen / oletools

Browse Code »

Commit 4257c83682aaa763578237bd99c47e0326b0be03

Authored by Christian Herdtweck 2017-11-22 17:18:00 +0100

Committed by Philippe Lagadec 2017-11-23 20:46:38 +0100

1 parent 3235357b

ooxml: create new helper for xml parsing

Inline Side-by-side

Showing 1 changed file with 180 additions and 0 deletions

oletools/ooxml.py 0 → 100644

View file @4257c83

	1	+#!/usr/bin/env python3
	2	+
	3	+""" Common operations for Office Open XML (docx, xlsx, pptx, ...) files
	4	+
	5	+This is mostly based on ECMA-376 (5th edition, Part 1)
	6	+http://www.ecma-international.org/publications/standards/Ecma-376.htm
	7	+
	8	+See also: Notes on Microsoft's implementation of ECMA-376: [MS-OE376]
	9	+
	10	+.. codeauthor:: Intra2net AG <info@intra2net>
	11	+"""
	12	+
	13	+import sys
	14	+import logging
	15	+from zipfile import ZipFile, BadZipfile
	16	+from traceback import print_exc
	17	+
	18	+# import lxml or ElementTree for XML parsing:
	19	+try:
	20	+ # lxml: best performance for XML processing
	21	+ import lxml.etree as ET
	22	+except ImportError:
	23	+ import xml.etree.cElementTree as ET
	24	+
	25	+
	26	+#: subfiles that have to be part of every ooxml file
	27	+FILE_CONTENT_TYPES = '[Content_Types].xml'
	28	+FILE_RELATIONSHIPS = '_rels/.rels'
	29	+
	30	+#: start of content type attributes
	31	+CONTENT_TYPES_EXCEL = (
	32	+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.',
	33	+ 'application/vnd.ms-excel.',
	34	+)
	35	+CONTENT_TYPES_WORD = (
	36	+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.',
	37	+)
	38	+CONTENT_TYPES_PPT = (
	39	+ 'application/vnd.openxmlformats-officedocument.presentationml.',
	40	+)
	41	+
	42	+#: other content types (currently unused)
	43	+CONTENT_TYPES_NEUTRAL = (
	44	+ 'application/xml',
	45	+ 'application/vnd.openxmlformats-package.relationships+xml',
	46	+ 'application/vnd.openxmlformats-package.core-properties+xml',
	47	+ 'application/vnd.openxmlformats-officedocument.theme+xml',
	48	+ 'application/vnd.openxmlformats-officedocument.extended-properties+xml'
	49	+)
	50	+
	51	+#: constants for document type
	52	+DOCTYPE_WORD = 'word'
	53	+DOCTYPE_EXCEL = 'excel'
	54	+DOCTYPE_POWERPOINT = 'powerpoint'
	55	+DOCTYPE_NONE = 'none'
	56	+DOCTYPE_MIXED = 'mixed'
	57	+
	58	+
	59	+def debug_str(elem):
	60	+ """ for debugging: print an element """
	61	+ if elem is None:
	62	+ return u'None'
	63	+ if elem.tag[0] == '{' and elem.tag.count('}') == 1:
	64	+ parts = ['[tag={{...}}{0}'.format(elem.tag[elem.tag.index('}')+1:]), ]
	65	+ else:
	66	+ parts = ['[tag={0}'.format(elem.tag), ]
	67	+ if elem.text:
	68	+ parts.append(u'text="{0}"'.format(elem.text))
	69	+ if elem.tail:
	70	+ parts.append(u'tail="{0}"'.format(elem.tail))
	71	+ for key, value in elem.attrib.iteritems():
	72	+ parts.append(u'{0}="{1}"'.format(key, value))
	73	+ if key == 'ContentType':
	74	+ if value.startswith(CONTENT_TYPES_EXCEL):
	75	+ parts[-1] += u'-->xls'
	76	+ elif value.startswith(CONTENT_TYPES_WORD):
	77	+ parts[-1] += u'-->doc'
	78	+ elif value.startswith(CONTENT_TYPES_PPT):
	79	+ parts[-1] += u'-->ppt'
	80	+ elif value in CONTENT_TYPES_NEUTRAL:
	81	+ parts[-1] += u'-->_'
	82	+ else:
	83	+ parts[-1] += u'!!!'
	84	+
	85	+ return u', '.join(parts) + u']'
	86	+
	87	+
	88	+def get_type(filename):
	89	+ """ return one of the DOCTYPE_* constants or raise error """
	90	+ is_doc = False
	91	+ is_xls = False
	92	+ is_ppt = False
	93	+ for _, elem, _ in iter_xml(filename, FILE_CONTENT_TYPES):
	94	+ logging.debug(u' ' + debug_str(elem))
	95	+ try:
	96	+ is_xls \|= elem.attrib['ContentType'].startswith(
	97	+ CONTENT_TYPES_EXCEL)
	98	+ is_doc \|= elem.attrib['ContentType'].startswith(
	99	+ CONTENT_TYPES_WORD)
	100	+ is_ppt \|= elem.attrib['ContentType'].startswith(
	101	+ CONTENT_TYPES_PPT)
	102	+ except KeyError: # ContentType not an attr
	103	+ pass
	104	+
	105	+ if is_doc and not is_xls and not is_ppt:
	106	+ return DOCTYPE_WORD
	107	+ if not is_doc and is_xls and not is_ppt:
	108	+ return DOCTYPE_EXCEL
	109	+ if not is_doc and not is_xls and is_ppt:
	110	+ return DOCTYPE_POWERPOINT
	111	+ if not is_doc and not is_xls and not is_ppt:
	112	+ return DOCTYPE_NONE
	113	+ else:
	114	+ return DOCTYPE_MIXED
	115	+
	116	+
	117	+def is_ooxml(filename):
	118	+ """ Determine whether given file is an ooxml file; tries get_type """
	119	+ try:
	120	+ get_type(filename)
	121	+ except BadZipfile:
	122	+ return False
	123	+ except IOError: # one of the required files is not present
	124	+ return False
	125	+
	126	+
	127	+def iter_xml(filename, *args):
	128	+ """ Iterate xml contents of document
	129	+
	130	+ If given subfile name[s] as optional arg[s], will only parse that subfile[s]
	131	+
	132	+ yields 3-tuples (subfilename, element, depth) where depth indicates how deep
	133	+ in the hierarchy the element is located. Containers of element will come
	134	+ after the elements they contain (since they are only finished then).
	135	+
	136	+ Will silently ignore errors in xml-parsing of a file, since subfiles can be
	137	+ OLE or embedded image files.
	138	+ """
	139	+ with ZipFile(filename) as zip:
	140	+ if args:
	141	+ subfiles = args
	142	+ else:
	143	+ subfiles = zip.namelist()
	144	+ for subfile in subfiles:
	145	+ logging.debug(u'subfile {0}'.format(subfile))
	146	+ depth = 0
	147	+ try:
	148	+ with zip.open(subfile, 'rU') as handle:
	149	+ for event, elem in ET.iterparse(handle, ('start', 'end')):
	150	+ if elem is None:
	151	+ continue
	152	+ if event == 'start':
	153	+ depth += 1
	154	+ continue
	155	+ assert(event == 'end')
	156	+ depth -= 1
	157	+ assert(depth >= 0)
	158	+ yield subfile, elem, depth
	159	+ except ET.ParseError as err:
	160	+ logging.warning(' xml-parsing for {0} failed'.format(subfile))
	161	+ assert(depth == 0)
	162	+
	163	+
	164	+def test():
	165	+ """ Main function, called when running file as script
	166	+
	167	+ see module doc for more info
	168	+ """
	169	+ if len(sys.argv) != 2:
	170	+ print(u'To test this code, give me a single file as arg')
	171	+ return 2
	172	+ #type = get_type(sys.argv[1])
	173	+ #print('content type is {0}'.format(type))
	174	+ for _, elem, depth in iter_xml(sys.argv[1]):
	175	+ print(u'{0}{1}'.format(' ' * depth, debug_str(elem)))
	176	+ return 0
	177	+
	178	+
	179	+if __name__ == '__main__':
	180	+ sys.exit(test())
...	...