Commit f7b5b5f1912223c2a9a8f7d68042b42879207e41

Authored by Christian Herdtweck
Committed by Philippe Lagadec
1 parent b9b9af8c

ooxml: create a class to remember failed subfiles for further checking

Showing 1 changed file with 119 additions and 35 deletions
oletools/ooxml.py
... ... @@ -13,7 +13,7 @@ See also: Notes on Microsoft's implementation of ECMA-376: [MS-OE376]
13 13 import sys
14 14 import logging
15 15 from zipfile import ZipFile, BadZipfile
16   -from traceback import print_exc
  16 +from os.path import splitext
17 17  
18 18 # import lxml or ElementTree for XML parsing:
19 19 try:
... ... @@ -90,7 +90,7 @@ def get_type(filename):
90 90 is_doc = False
91 91 is_xls = False
92 92 is_ppt = False
93   - for _, elem, _ in iter_xml(filename, FILE_CONTENT_TYPES):
  93 + for _, elem, _ in XmlParser(filename).iter_xml(FILE_CONTENT_TYPES):
94 94 logging.debug(u' ' + debug_str(elem))
95 95 try:
96 96 is_xls |= elem.attrib['ContentType'].startswith(
... ... @@ -124,41 +124,120 @@ def is_ooxml(filename):
124 124 return False
125 125  
126 126  
class XmlParser(object):
    """ parser for OOXML files

    Wraps one OOXML (zip) file given by name. Iterating over its xml contents
    with iter_xml() records every subfile that could not be parsed as xml in
    the set `subfiles_no_xml`; those can be inspected afterwards through
    iter_non_xml().
    """

    def __init__(self, filename):
        """ remember the file name; the file is only opened when iterating """
        self.filename = filename
        # set to True once iter_xml() ran to completion without subfile args
        self.did_iter_all = False
        # names of subfiles for which xml parsing raised a ParseError
        self.subfiles_no_xml = set()

    def iter_xml(self, *args):
        """ Iterate xml contents of document

        If given subfile name[s] as optional arg[s], will only parse that
        subfile[s]

        yields 3-tuples (subfilename, element, depth) where depth indicates how
        deep in the hierarchy the element is located. Containers of element
        will come *after* the elements they contain (since they are only
        finished then).

        Subfiles that are not xml (e.g. OLE or image files) are remembered
        internally and can be retrieved using iter_non_xml().
        """
        events = ('start', 'end')
        with ZipFile(self.filename) as archive:
            subfiles = args if args else archive.namelist()

            for subfile in subfiles:
                logging.debug(u'subfile {0}'.format(subfile))
                depth = 0
                try:
                    with archive.open(subfile, 'r') as handle:
                        for event, elem in ET.iterparse(handle, events):
                            if elem is None:
                                continue
                            if event == 'start':
                                depth += 1
                                continue
                            assert event == 'end'
                            depth -= 1
                            assert depth >= 0
                            yield subfile, elem, depth
                except ET.ParseError:
                    logging.warning(' xml-parsing for {0} failed. '
                                    .format(subfile) +
                                    'Run iter_non_xml to investigate.')
                    self.subfiles_no_xml.add(subfile)
                else:
                    # Only a completely parsed document is guaranteed to have
                    # balanced start/end events; a subfile that breaks in the
                    # middle may legitimately leave depth > 0.
                    assert depth == 0

        # only reached when the caller exhausted the generator
        if not args:
            self.did_iter_all = True

    def get_content_types(self):
        """ retrieve subfile infos from [Content_Types].xml subfile

        returns (files, defaults) where
        - files is a dict that maps file-name --> content-type
        - defaults is a dict that maps extension --> content-type

        Leading '/' of file names and leading '.' of extensions are removed.

        No guarantees on accuracy of these content types!
        """
        defaults = {}
        files = {}
        for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES):
            if elem.tag.endswith('Default'):
                extension = elem.attrib['Extension']
                if extension.startswith('.'):
                    extension = extension[1:]
                content_type = elem.attrib['ContentType']
                defaults[extension] = content_type
                logging.debug('found content type for extension {0}: {1}'
                              .format(extension, content_type))
            elif elem.tag.endswith('Override'):
                subfile = elem.attrib['PartName']
                if subfile.startswith('/'):
                    subfile = subfile[1:]
                content_type = elem.attrib['ContentType']
                files[subfile] = content_type
                logging.debug('found content type for subfile {0}: {1}'
                              .format(subfile, content_type))
        return files, defaults

    def iter_non_xml(self):
        """ retrieve subfiles that were found by iter_xml to be non-xml

        also looks for content type info in the [Content_Types].xml subfile.

        yields 3-tuples (filename, content_type, file_handle) where
        content_type is based on filename or default for extension or is None,
        and file_handle is an open read-only handle for the file. The handle
        is closed as soon as the next item is requested.

        Subfiles are yielded in sorted order of their names.
        """
        if not self.did_iter_all:
            logging.warning('Did not iterate through complete file. Should run '
                            'iter_xml() without args, first.')
        if not self.subfiles_no_xml:
            # plain return: raising StopIteration inside a generator is turned
            # into a RuntimeError by newer python versions (PEP 479)
            return

        content_types, content_defaults = self.get_content_types()

        with ZipFile(self.filename) as archive:
            # iterate over a sorted copy: gives a reproducible order and is
            # safe if the set changes while this generator is suspended
            for member in sorted(self.subfiles_no_xml):
                # content types are keyed by names without leading '/'
                subfile = member[1:] if member.startswith('/') else member
                content_type = content_types.get(subfile)
                if content_type is None:
                    extension = splitext(subfile)[1]
                    if extension.startswith('.'):
                        extension = extension[1:]   # remove the '.'
                    content_type = content_defaults.get(extension)
                # open under the name stored in the archive, not the
                # normalized one
                with archive.open(member, 'r') as handle:
                    yield subfile, content_type, handle
162 241  
163 242  
def test():
    """ Test this module: dump xml elements and non-xml subfiles of a file

    Expects exactly one command line argument, the name of the file to parse.
    Prints every xml element (prefixed by its subfile name and indented by its
    depth), then one line per subfile that could not be parsed as xml.

    returns 0 on success and 2 if the command line arguments are wrong

    see module doc for more info
    """
    logging.basicConfig(level=logging.DEBUG)
    if len(sys.argv) != 2:
        print(u'To test this code, give me a single file as arg')
        return 2
    #type = get_type(sys.argv[1])
    #print('content type is {0}'.format(type))
    parser = XmlParser(sys.argv[1])
    for subfile, elem, depth in parser.iter_xml():
        print(u'{0}{1}{2}'.format(subfile, ' ' * depth, debug_str(elem)))
    # iter_non_xml yields 3-tuples; the open file handle is not needed here
    for subfile, content_type, _ in parser.iter_non_xml():
        print(u'Non-XML subfile: {0} of type {1}'
              .format(subfile, content_type or u'unknown'))
    return 0
177 261  
178 262  
... ...