Commit 138753531bec374a90a84c1514a3ad53b96c8f31

Authored by Christian Herdtweck
1 parent 5862969f

ooxml: limit returns to set of tags; more memory-efficient

Showing 1 changed file with 102 additions and 13 deletions
oletools/ooxml.py
@@ -70,6 +70,11 @@ DOCTYPE_WORD_XML2003 = 'word-xml2003' # not yet used @@ -70,6 +70,11 @@ DOCTYPE_WORD_XML2003 = 'word-xml2003' # not yet used
70 DOCTYPE_EXCEL_XML2003 = 'excel-xml2003' # not yet used 70 DOCTYPE_EXCEL_XML2003 = 'excel-xml2003' # not yet used
71 71
72 72
  73 +###############################################################################
  74 +# HELPERS
  75 +###############################################################################
  76 +
  77 +
73 def debug_str(elem): 78 def debug_str(elem):
74 """ for debugging: print an element """ 79 """ for debugging: print an element """
75 if elem is None: 80 if elem is None:
@@ -103,6 +108,19 @@ def debug_str(elem): @@ -103,6 +108,19 @@ def debug_str(elem):
103 return text + u']' 108 return text + u']'
104 109
105 110
  111 +def isstr(some_var):
  112 + """ version-independent test for isinstance(some_var, (str, unicode)) """
  113 + if sys.version_info.major == 2:
  114 + return isinstance(some_var, basestring) # true for str and unicode
  115 + else:
  116 + return isinstance(some_var, str) # there is no unicode
  117 +
  118 +
  119 +###############################################################################
  120 +# INFO ON FILES
  121 +###############################################################################
  122 +
  123 +
106 def get_type(filename): 124 def get_type(filename):
107 """ return one of the DOCTYPE_* constants or raise error """ 125 """ return one of the DOCTYPE_* constants or raise error """
108 parser = XmlParser(filename) 126 parser = XmlParser(filename)
@@ -158,6 +176,11 @@ def is_ooxml(filename): @@ -158,6 +176,11 @@ def is_ooxml(filename):
158 return False 176 return False
159 177
160 178
  179 +###############################################################################
  180 +# HELPER CLASSES
  181 +###############################################################################
  182 +
  183 +
161 class ZipSubFile(object): 184 class ZipSubFile(object):
162 """ A file-like object like ZipFile.open returns them, with size and seek() 185 """ A file-like object like ZipFile.open returns them, with size and seek()
163 186
@@ -351,6 +374,11 @@ class BadOOXML(ValueError): @@ -351,6 +374,11 @@ class BadOOXML(ValueError):
351 self.more_info = more_info 374 self.more_info = more_info
352 375
353 376
  377 +###############################################################################
  378 +# PARSING
  379 +###############################################################################
  380 +
  381 +
354 class XmlParser(object): 382 class XmlParser(object):
355 """ parser for OOXML files 383 """ parser for OOXML files
356 384
@@ -389,7 +417,7 @@ class XmlParser(object): @@ -389,7 +417,7 @@ class XmlParser(object):
389 if not match: 417 if not match:
390 raise BadOOXML(self.filename, 'is no zip and has no prog_id') 418 raise BadOOXML(self.filename, 'is no zip and has no prog_id')
391 419
392 - def iter_files(self, *args): 420 + def iter_files(self, args=None):
393 """ Find files in zip or just give single xml file """ 421 """ Find files in zip or just give single xml file """
394 if self.is_single_xml(): 422 if self.is_single_xml():
395 if args: 423 if args:
@@ -399,29 +427,36 @@ class XmlParser(object): @@ -399,29 +427,36 @@ class XmlParser(object):
399 self.did_iter_all = True 427 self.did_iter_all = True
400 else: 428 else:
401 zipper = None 429 zipper = None
  430 + subfiles = None
402 try: 431 try:
403 zipper = ZipFile(self.filename) 432 zipper = ZipFile(self.filename)
404 - cont_file = zipper.getinfo(FILE_CONTENT_TYPES) # --> KeyError  
405 - if args:  
406 - subfiles = args  
407 - else: 433 + try:
  434 + cont_file = zipper.getinfo(FILE_CONTENT_TYPES)
  435 + except KeyError:
  436 + raise BadOOXML(self.filename,
  437 + 'No content type information')
  438 + if not args:
408 subfiles = zipper.namelist() 439 subfiles = zipper.namelist()
  440 + elif isstr(args):
  441 + subfiles = [args, ]
  442 + else:
  443 + subfiles = tuple(args) # make a copy in case orig changes
409 444
410 for subfile in subfiles: 445 for subfile in subfiles:
411 - logging.debug(u'subfile {0}'.format(subfile))  
412 with zipper.open(subfile, 'r') as handle: 446 with zipper.open(subfile, 'r') as handle:
413 yield subfile, handle 447 yield subfile, handle
414 if not args: 448 if not args:
415 self.did_iter_all = True 449 self.did_iter_all = True
416 - except KeyError: # zipper.getinfo failed, no content type file  
417 - raise BadOOXML(self.filename, 'No content type information') 450 + except KeyError as orig_err:
  451 + raise BadOOXML(self.filename, 'invalid subfile: ' +
  452 + str(orig_err))
418 except BadZipfile: 453 except BadZipfile:
419 raise BadOOXML(self.filename, 'neither zip nor xml') 454 raise BadOOXML(self.filename, 'neither zip nor xml')
420 finally: 455 finally:
421 if zipper: 456 if zipper:
422 zipper.close() 457 zipper.close()
423 458
424 - def iter_xml(self, *subfiles): 459 + def iter_xml(self, subfiles=None, need_children=False, tags=None):
425 """ Iterate xml contents of document 460 """ Iterate xml contents of document
426 461
427 If given subfile name[s] as optional arg[s], will only parse that 462 If given subfile name[s] as optional arg[s], will only parse that
@@ -434,21 +469,75 @@ class XmlParser(object): @@ -434,21 +469,75 @@ class XmlParser(object):
434 469
435 Subfiles that are not xml (e.g. OLE or image files) are remembered 470 Subfiles that are not xml (e.g. OLE or image files) are remembered
436 internally and can be retrieved using iter_non_xml(). 471 internally and can be retrieved using iter_non_xml().
  472 +
  473 + The argument need_children is set to False by default. If you need to
  474 + access an element's children, set it to True. Note, however, that
  475 + leaving it at False should save a lot of memory. Otherwise, the parser
  476 + has to keep every single element in memory since the last element
  477 + returned is the root which has the rest of the document as children.
  478 + cf. http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
  479 +
  480 + Argument tags restricts output to elements whose tag name is in that
  481 + list (or equals that string). Children are preserved for these.
437 """ 482 """
438 - for subfile, handle in self.iter_files(*subfiles): 483 + if tags is None:
  484 + want_tags = []
  485 + elif isinstance(tags, (str, unicode)):
  486 + want_tags = [tags, ]
  487 + logging.debug('looking for tags: {0}'.format(tags))
  488 + else:
  489 + want_tags = tags
  490 + logging.debug('looking for tags: {0}'.format(tags))
  491 +
  492 + for subfile, handle in self.iter_files(subfiles):
439 events = ('start', 'end') 493 events = ('start', 'end')
440 depth = 0 494 depth = 0
  495 + inside_tags = []
441 try: 496 try:
442 for event, elem in ET.iterparse(handle, events): 497 for event, elem in ET.iterparse(handle, events):
443 if elem is None: 498 if elem is None:
444 continue 499 continue
445 if event == 'start': 500 if event == 'start':
  501 + if elem.tag in want_tags:
  502 + logging.debug('remember start of tag {0} at {1}'
  503 + .format(elem.tag, depth))
  504 + inside_tags.append((elem.tag, depth))
446 depth += 1 505 depth += 1
447 continue 506 continue
448 assert(event == 'end') 507 assert(event == 'end')
449 depth -= 1 508 depth -= 1
450 assert(depth >= 0) 509 assert(depth >= 0)
451 - yield subfile, elem, depth 510 +
  511 + is_wanted = elem.tag in want_tags
  512 + if is_wanted:
  513 + curr_tag = (elem.tag, depth)
  514 + try:
  515 + if inside_tags[-1] == curr_tag:
  516 + inside_tags.pop()
  517 + else:
  518 + logging.error('found end for wanted tag {0} '
  519 + 'but last start tag {1} does not '
  520 + 'match'.format(curr_tag,
  521 + inside_tags[-1]))
  522 + # try to recover: close all deeper tags
  523 + while inside_tags and \
  524 + inside_tags[-1][1] >= depth:
  525 + logging.debug('recover: pop {0}'
  526 + .format(inside_tags[-1]))
  527 + inside_tags.pop()
  528 + except IndexError: # inside_tags is empty: no inside_tags[-1]
  529 + logging.error('found end of {0} at depth {1} but '
  530 + 'no start event')
  531 + # yield element
  532 + if is_wanted or not want_tags:
  533 + yield subfile, elem, depth
  534 +
  535 + # save memory: clear elem so parser memorizes less
  536 + if not need_children and not inside_tags:
  537 + elem.clear()
  538 + # cannot do this since we might be using py-builtin xml
  539 + # while elem.getprevious() is not None:
  540 + # del elem.getparent()[0]
452 except ET.ParseError as err: 541 except ET.ParseError as err:
453 self.subfiles_no_xml.add(subfile) 542 self.subfiles_no_xml.add(subfile)
454 if subfile is None: # this is no zip subfile but single xml 543 if subfile is None: # this is no zip subfile but single xml
@@ -550,8 +639,8 @@ def test(): @@ -550,8 +639,8 @@ def test():
550 # test complete parsing 639 # test complete parsing
551 parser = XmlParser(sys.argv[1]) 640 parser = XmlParser(sys.argv[1])
552 for subfile, elem, depth in parser.iter_xml(): 641 for subfile, elem, depth in parser.iter_xml():
553 - if depth < 3:  
554 - print(u'{0}{1}{2}'.format(subfile, ' ' * depth, debug_str(elem))) 642 + if depth < 4:
  643 + print(u'{0} {1}{2}'.format(subfile, ' ' * depth, debug_str(elem)))
555 for index, (subfile, content_type) in enumerate(parser.iter_non_xml()): 644 for index, (subfile, content_type) in enumerate(parser.iter_non_xml()):
556 print(u'Non-XML subfile: {0} of type {1}' 645 print(u'Non-XML subfile: {0} of type {1}'
557 .format(subfile, content_type or u'unknown')) 646 .format(subfile, content_type or u'unknown'))