Commit f7b5b5f1912223c2a9a8f7d68042b42879207e41
Committed by Philippe Lagadec
1 parent: b9b9af8c
ooxml: create a class to remember failed subfiles for further checking
Showing 1 changed file with 119 additions and 35 deletions
oletools/ooxml.py
| ... | ... | @@ -13,7 +13,7 @@ See also: Notes on Microsoft's implementation of ECMA-376: [MS-OE376] |
| 13 | 13 | import sys |
| 14 | 14 | import logging |
| 15 | 15 | from zipfile import ZipFile, BadZipfile |
| 16 | -from traceback import print_exc | |
| 16 | +from os.path import splitext | |
| 17 | 17 | |
| 18 | 18 | # import lxml or ElementTree for XML parsing: |
| 19 | 19 | try: |
| ... | ... | @@ -90,7 +90,7 @@ def get_type(filename): |
| 90 | 90 | is_doc = False |
| 91 | 91 | is_xls = False |
| 92 | 92 | is_ppt = False |
| 93 | - for _, elem, _ in iter_xml(filename, FILE_CONTENT_TYPES): | |
| 93 | + for _, elem, _ in XmlParser(filename).iter_xml(FILE_CONTENT_TYPES): | |
| 94 | 94 | logging.debug(u' ' + debug_str(elem)) |
| 95 | 95 | try: |
| 96 | 96 | is_xls |= elem.attrib['ContentType'].startswith( |
| ... | ... | @@ -124,41 +124,120 @@ def is_ooxml(filename): |
| 124 | 124 | return False |
| 125 | 125 | |
| 126 | 126 | |
| 127 | -def iter_xml(filename, *args): | |
| 128 | - """ Iterate xml contents of document | |
| 127 | +class XmlParser(object): | |
| 128 | + """ parser for OOXML files """ | |
| 129 | 129 | |
| 130 | - If given subfile name[s] as optional arg[s], will only parse that subfile[s] | |
| 130 | + def __init__(self, filename): | |
| 131 | + self.filename = filename | |
| 132 | + self.did_iter_all = False | |
| 133 | + self.subfiles_no_xml = set() | |
| 131 | 134 | |
| 132 | - yields 3-tuples (subfilename, element, depth) where depth indicates how deep | |
| 133 | - in the hierarchy the element is located. Containers of element will come | |
| 134 | - *after* the elements they contain (since they are only finished then). | |
| 135 | + def iter_xml(self, *args): | |
| 136 | + """ Iterate xml contents of document | |
| 135 | 137 | |
| 136 | - Will silently ignore errors in xml-parsing of a file, since subfiles can be | |
| 137 | - OLE or embedded image files. | |
| 138 | - """ | |
| 139 | - with ZipFile(filename) as zip: | |
| 140 | - if args: | |
| 141 | - subfiles = args | |
| 142 | - else: | |
| 143 | - subfiles = zip.namelist() | |
| 144 | - for subfile in subfiles: | |
| 145 | - logging.debug(u'subfile {0}'.format(subfile)) | |
| 146 | - depth = 0 | |
| 147 | - try: | |
| 138 | + If given subfile name[s] as optional arg[s], will only parse that | |
| 139 | + subfile[s] | |
| 140 | + | |
| 141 | + yields 3-tuples (subfilename, element, depth) where depth indicates how | |
| 142 | + deep in the hierarchy the element is located. Containers of element | |
| 143 | + will come *after* the elements they contain (since they are only | |
| 144 | + finished then). | |
| 145 | + | |
| 146 | + Subfiles that are not xml (e.g. OLE or image files) are remembered | |
| 147 | + internally and can be retrieved using iter_non_xml(). | |
| 148 | + """ | |
| 149 | + with ZipFile(self.filename) as zip: | |
| 150 | + if args: | |
| 151 | + subfiles = args | |
| 152 | + else: | |
| 153 | + subfiles = zip.namelist() | |
| 154 | + | |
| 155 | + failed = [] | |
| 156 | + events = ('start', 'end') | |
| 157 | + for subfile in subfiles: | |
| 158 | + logging.debug(u'subfile {0}'.format(subfile)) | |
| 159 | + depth = 0 | |
| 160 | + try: | |
| 161 | + with zip.open(subfile, 'r') as handle: | |
| 162 | + for event, elem in ET.iterparse(handle, events): | |
| 163 | + if elem is None: | |
| 164 | + continue | |
| 165 | + if event == 'start': | |
| 166 | + depth += 1 | |
| 167 | + continue | |
| 168 | + assert(event == 'end') | |
| 169 | + depth -= 1 | |
| 170 | + assert(depth >= 0) | |
| 171 | + yield subfile, elem, depth | |
| 172 | + except ET.ParseError as err: | |
| 173 | + logging.warning(' xml-parsing for {0} failed. ' | |
| 174 | + .format(subfile) + | |
| 175 | + 'Run iter_non_xml to investigate.') | |
| 176 | + self.subfiles_no_xml.add(subfile) | |
| 177 | + assert(depth == 0) | |
| 178 | + if not args: | |
| 179 | + self.did_iter_all = True | |
| 180 | + | |
| 181 | + def get_content_types(self): | |
| 182 | + """ retrieve subfile infos from [Content_Types].xml subfile | |
| 183 | + | |
| 184 | + returns (files, defaults) where | |
| 185 | + - files is a dict that maps file-name --> content-type | |
| 186 | + - defaults is a dict that maps extension --> content-type | |
| 187 | + | |
| 188 | + No guarantees on accuracy of these content types! | |
| 189 | + """ | |
| 190 | + defaults = [] | |
| 191 | + files = [] | |
| 192 | + for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES): | |
| 193 | + if elem.tag.endswith('Default'): | |
| 194 | + extension = elem.attrib['Extension'] | |
| 195 | + if extension.startswith('.'): | |
| 196 | + extension = extension[1:] | |
| 197 | + defaults.append((extension, elem.attrib['ContentType'])) | |
| 198 | + logging.debug('found content type for extension {0[0]}: {0[1]}' | |
| 199 | + .format(defaults[-1])) | |
| 200 | + elif elem.tag.endswith('Override'): | |
| 201 | + subfile = elem.attrib['PartName'] | |
| 202 | + if subfile.startswith('/'): | |
| 203 | + subfile = subfile[1:] | |
| 204 | + files.append((subfile, elem.attrib['ContentType'])) | |
| 205 | + logging.debug('found content type for subfile {0[0]}: {0[1]}' | |
| 206 | + .format(files[-1])) | |
| 207 | + return dict(files), dict(defaults) | |
| 208 | + | |
| 209 | + def iter_non_xml(self): | |
| 210 | + """ retrieve subfiles that were found by iter_xml to be non-xml | |
| 211 | + | |
| 212 | + also looks for content type info in the [Content_Types].xml subfile. | |
| 213 | + | |
| 214 | + yields 3-tuples (filename, content_type, file_handle) where | |
| 215 | + content_type is based on filename or default for extension or is None, | |
| 216 | + and file_handle is an open read-only handle for the file | |
| 217 | + """ | |
| 218 | + if not self.did_iter_all: | |
| 219 | + logging.warning('Did not iterate through complete file. Should run ' | |
| 220 | + 'iter_xml() without args, first.') | |
| 221 | + if not self.subfiles_no_xml: | |
| 222 | + raise StopIteration() | |
| 223 | + | |
| 224 | + content_types, content_defaults = self.get_content_types() | |
| 225 | + | |
| 226 | + with ZipFile(self.filename) as zip: | |
| 227 | + for subfile in self.subfiles_no_xml: | |
| 228 | + if subfile.startswith('/'): | |
| 229 | + subfile = subfile[1:] | |
| 230 | + content_type = None | |
| 231 | + if subfile in content_types: | |
| 232 | + content_type = content_types[subfile] | |
| 233 | + else: | |
| 234 | + extension = splitext(subfile)[1] | |
| 235 | + if extension.startswith('.'): | |
| 236 | + extension = extension[1:] # remove the '.' | |
| 237 | + if extension in content_defaults: | |
| 238 | + content_type = content_defaults[extension] | |
| 148 | 239 | with zip.open(subfile, 'r') as handle: |
| 149 | - for event, elem in ET.iterparse(handle, ('start', 'end')): | |
| 150 | - if elem is None: | |
| 151 | - continue | |
| 152 | - if event == 'start': | |
| 153 | - depth += 1 | |
| 154 | - continue | |
| 155 | - assert(event == 'end') | |
| 156 | - depth -= 1 | |
| 157 | - assert(depth >= 0) | |
| 158 | - yield subfile, elem, depth | |
| 159 | - except ET.ParseError as err: | |
| 160 | - logging.warning(' xml-parsing for {0} failed'.format(subfile)) | |
| 161 | - assert(depth == 0) | |
| 240 | + yield subfile, content_type, handle | |
| 162 | 241 | |
| 163 | 242 | |
| 164 | 243 | def test(): |
| ... | ... | @@ -166,13 +245,18 @@ def test(): |
| 166 | 245 | |
| 167 | 246 | see module doc for more info |
| 168 | 247 | """ |
| 248 | + logging.basicConfig(level=logging.DEBUG) | |
| 169 | 249 | if len(sys.argv) != 2: |
| 170 | 250 | print(u'To test this code, give me a single file as arg') |
| 171 | 251 | return 2 |
| 172 | 252 | #type = get_type(sys.argv[1]) |
| 173 | 253 | #print('content type is {0}'.format(type)) |
| 174 | - for _, elem, depth in iter_xml(sys.argv[1]): | |
| 175 | - print(u'{0}{1}'.format(' ' * depth, debug_str(elem))) | |
| 254 | + parser = XmlParser(sys.argv[1]) | |
| 255 | + for subfile, elem, depth in parser.iter_xml(): | |
| 256 | + print(u'{0}{1}{2}'.format(subfile, ' ' * depth, debug_str(elem))) | |
| 257 | + for subfile, content_type in parser.iter_non_xml(): | |
| 258 | + print(u'Non-XML subfile: {0} of type {1}' | |
| 259 | + .format(subfile, content_type or u'unknown')) | |
| 176 | 260 | return 0 |
| 177 | 261 | |
| 178 | 262 | ... | ... |