Commit f7b5b5f1912223c2a9a8f7d68042b42879207e41
Committed by Philippe Lagadec
1 parent b9b9af8c
ooxml: create a class to remember failed subfiles for further checking
Showing 1 changed file with 119 additions and 35 deletions
oletools/ooxml.py
| @@ -13,7 +13,7 @@ See also: Notes on Microsoft's implementation of ECMA-376: [MS-OE376] | @@ -13,7 +13,7 @@ See also: Notes on Microsoft's implementation of ECMA-376: [MS-OE376] | ||
| 13 | import sys | 13 | import sys |
| 14 | import logging | 14 | import logging |
| 15 | from zipfile import ZipFile, BadZipfile | 15 | from zipfile import ZipFile, BadZipfile |
| 16 | -from traceback import print_exc | 16 | +from os.path import splitext |
| 17 | 17 | ||
| 18 | # import lxml or ElementTree for XML parsing: | 18 | # import lxml or ElementTree for XML parsing: |
| 19 | try: | 19 | try: |
| @@ -90,7 +90,7 @@ def get_type(filename): | @@ -90,7 +90,7 @@ def get_type(filename): | ||
| 90 | is_doc = False | 90 | is_doc = False |
| 91 | is_xls = False | 91 | is_xls = False |
| 92 | is_ppt = False | 92 | is_ppt = False |
| 93 | - for _, elem, _ in iter_xml(filename, FILE_CONTENT_TYPES): | 93 | + for _, elem, _ in XmlParser(filename).iter_xml(FILE_CONTENT_TYPES): |
| 94 | logging.debug(u' ' + debug_str(elem)) | 94 | logging.debug(u' ' + debug_str(elem)) |
| 95 | try: | 95 | try: |
| 96 | is_xls |= elem.attrib['ContentType'].startswith( | 96 | is_xls |= elem.attrib['ContentType'].startswith( |
| @@ -124,41 +124,120 @@ def is_ooxml(filename): | @@ -124,41 +124,120 @@ def is_ooxml(filename): | ||
| 124 | return False | 124 | return False |
| 125 | 125 | ||
| 126 | 126 | ||
| 127 | -def iter_xml(filename, *args): | ||
| 128 | - """ Iterate xml contents of document | 127 | +class XmlParser(object): |
| 128 | + """ parser for OOXML files """ | ||
| 129 | 129 | ||
| 130 | - If given subfile name[s] as optional arg[s], will only parse that subfile[s] | 130 | + def __init__(self, filename): |
| 131 | + self.filename = filename | ||
| 132 | + self.did_iter_all = False | ||
| 133 | + self.subfiles_no_xml = set() | ||
| 131 | 134 | ||
| 132 | - yields 3-tuples (subfilename, element, depth) where depth indicates how deep | ||
| 133 | - in the hierarchy the element is located. Containers of element will come | ||
| 134 | - *after* the elements they contain (since they are only finished then). | 135 | + def iter_xml(self, *args): |
| 136 | + """ Iterate xml contents of document | ||
| 135 | 137 | ||
| 136 | - Will silently ignore errors in xml-parsing of a file, since subfiles can be | ||
| 137 | - OLE or embedded image files. | ||
| 138 | - """ | ||
| 139 | - with ZipFile(filename) as zip: | ||
| 140 | - if args: | ||
| 141 | - subfiles = args | ||
| 142 | - else: | ||
| 143 | - subfiles = zip.namelist() | ||
| 144 | - for subfile in subfiles: | ||
| 145 | - logging.debug(u'subfile {0}'.format(subfile)) | ||
| 146 | - depth = 0 | ||
| 147 | - try: | 138 | + If given subfile name[s] as optional arg[s], will only parse that |
| 139 | + subfile[s] | ||
| 140 | + | ||
| 141 | + yields 3-tuples (subfilename, element, depth) where depth indicates how | ||
| 142 | + deep in the hierarchy the element is located. Containers of element | ||
| 143 | + will come *after* the elements they contain (since they are only | ||
| 144 | + finished then). | ||
| 145 | + | ||
| 146 | + Subfiles that are not xml (e.g. OLE or image files) are remembered | ||
| 147 | + internally and can be retrieved using iter_non_xml(). | ||
| 148 | + """ | ||
| 149 | + with ZipFile(self.filename) as zip: | ||
| 150 | + if args: | ||
| 151 | + subfiles = args | ||
| 152 | + else: | ||
| 153 | + subfiles = zip.namelist() | ||
| 154 | + | ||
| 155 | + failed = [] | ||
| 156 | + events = ('start', 'end') | ||
| 157 | + for subfile in subfiles: | ||
| 158 | + logging.debug(u'subfile {0}'.format(subfile)) | ||
| 159 | + depth = 0 | ||
| 160 | + try: | ||
| 161 | + with zip.open(subfile, 'r') as handle: | ||
| 162 | + for event, elem in ET.iterparse(handle, events): | ||
| 163 | + if elem is None: | ||
| 164 | + continue | ||
| 165 | + if event == 'start': | ||
| 166 | + depth += 1 | ||
| 167 | + continue | ||
| 168 | + assert(event == 'end') | ||
| 169 | + depth -= 1 | ||
| 170 | + assert(depth >= 0) | ||
| 171 | + yield subfile, elem, depth | ||
| 172 | + except ET.ParseError as err: | ||
| 173 | + logging.warning(' xml-parsing for {0} failed. ' | ||
| 174 | + .format(subfile) + | ||
| 175 | + 'Run iter_non_xml to investigate.') | ||
| 176 | + self.subfiles_no_xml.add(subfile) | ||
| 177 | + assert(depth == 0) | ||
| 178 | + if not args: | ||
| 179 | + self.did_iter_all = True | ||
| 180 | + | ||
| 181 | + def get_content_types(self): | ||
| 182 | + """ retrieve subfile infos from [Content_Types].xml subfile | ||
| 183 | + | ||
| 184 | + returns (files, defaults) where | ||
| 185 | + - files is a dict that maps file-name --> content-type | ||
| 186 | + - defaults is a dict that maps extension --> content-type | ||
| 187 | + | ||
| 188 | + No guarantees on accuracy of these content types! | ||
| 189 | + """ | ||
| 190 | + defaults = [] | ||
| 191 | + files = [] | ||
| 192 | + for _, elem, _ in self.iter_xml(FILE_CONTENT_TYPES): | ||
| 193 | + if elem.tag.endswith('Default'): | ||
| 194 | + extension = elem.attrib['Extension'] | ||
| 195 | + if extension.startswith('.'): | ||
| 196 | + extension = extension[1:] | ||
| 197 | + defaults.append((extension, elem.attrib['ContentType'])) | ||
| 198 | + logging.debug('found content type for extension {0[0]}: {0[1]}' | ||
| 199 | + .format(defaults[-1])) | ||
| 200 | + elif elem.tag.endswith('Override'): | ||
| 201 | + subfile = elem.attrib['PartName'] | ||
| 202 | + if subfile.startswith('/'): | ||
| 203 | + subfile = subfile[1:] | ||
| 204 | + files.append((subfile, elem.attrib['ContentType'])) | ||
| 205 | + logging.debug('found content type for subfile {0[0]}: {0[1]}' | ||
| 206 | + .format(files[-1])) | ||
| 207 | + return dict(files), dict(defaults) | ||
| 208 | + | ||
| 209 | + def iter_non_xml(self): | ||
| 210 | + """ retrieve subfiles that were found by iter_xml to be non-xml | ||
| 211 | + | ||
| 212 | + also looks for content type info in the [Content_Types].xml subfile. | ||
| 213 | + | ||
| 214 | + yields 3-tuples (filename, content_type, file_handle) where | ||
| 215 | + content_type is based on filename or default for extension or is None, | ||
| 216 | + and file_handle is an open read-only handle for the file | ||
| 217 | + """ | ||
| 218 | + if not self.did_iter_all: | ||
| 219 | + logging.warning('Did not iterate through complete file. Should run ' | ||
| 220 | + 'iter_xml() without args, first.') | ||
| 221 | + if not self.subfiles_no_xml: | ||
| 222 | + raise StopIteration() | ||
| 223 | + | ||
| 224 | + content_types, content_defaults = self.get_content_types() | ||
| 225 | + | ||
| 226 | + with ZipFile(self.filename) as zip: | ||
| 227 | + for subfile in self.subfiles_no_xml: | ||
| 228 | + if subfile.startswith('/'): | ||
| 229 | + subfile = subfile[1:] | ||
| 230 | + content_type = None | ||
| 231 | + if subfile in content_types: | ||
| 232 | + content_type = content_types[subfile] | ||
| 233 | + else: | ||
| 234 | + extension = splitext(subfile)[1] | ||
| 235 | + if extension.startswith('.'): | ||
| 236 | + extension = extension[1:] # remove the '.' | ||
| 237 | + if extension in content_defaults: | ||
| 238 | + content_type = content_defaults[extension] | ||
| 148 | with zip.open(subfile, 'r') as handle: | 239 | with zip.open(subfile, 'r') as handle: |
| 149 | - for event, elem in ET.iterparse(handle, ('start', 'end')): | ||
| 150 | - if elem is None: | ||
| 151 | - continue | ||
| 152 | - if event == 'start': | ||
| 153 | - depth += 1 | ||
| 154 | - continue | ||
| 155 | - assert(event == 'end') | ||
| 156 | - depth -= 1 | ||
| 157 | - assert(depth >= 0) | ||
| 158 | - yield subfile, elem, depth | ||
| 159 | - except ET.ParseError as err: | ||
| 160 | - logging.warning(' xml-parsing for {0} failed'.format(subfile)) | ||
| 161 | - assert(depth == 0) | 240 | + yield subfile, content_type, handle |
| 162 | 241 | ||
| 163 | 242 | ||
| 164 | def test(): | 243 | def test(): |
| @@ -166,13 +245,18 @@ def test(): | @@ -166,13 +245,18 @@ def test(): | ||
| 166 | 245 | ||
| 167 | see module doc for more info | 246 | see module doc for more info |
| 168 | """ | 247 | """ |
| 248 | + logging.basicConfig(level=logging.DEBUG) | ||
| 169 | if len(sys.argv) != 2: | 249 | if len(sys.argv) != 2: |
| 170 | print(u'To test this code, give me a single file as arg') | 250 | print(u'To test this code, give me a single file as arg') |
| 171 | return 2 | 251 | return 2 |
| 172 | #type = get_type(sys.argv[1]) | 252 | #type = get_type(sys.argv[1]) |
| 173 | #print('content type is {0}'.format(type)) | 253 | #print('content type is {0}'.format(type)) |
| 174 | - for _, elem, depth in iter_xml(sys.argv[1]): | ||
| 175 | - print(u'{0}{1}'.format(' ' * depth, debug_str(elem))) | 254 | + parser = XmlParser(sys.argv[1]) |
| 255 | + for subfile, elem, depth in parser.iter_xml(): | ||
| 256 | + print(u'{0}{1}{2}'.format(subfile, ' ' * depth, debug_str(elem))) | ||
| 257 | + for subfile, content_type in parser.iter_non_xml(): | ||
| 258 | + print(u'Non-XML subfile: {0} of type {1}' | ||
| 259 | + .format(subfile, content_type or u'unknown')) | ||
| 176 | return 0 | 260 | return 0 |
| 177 | 261 | ||
| 178 | 262 |