Commit 138753531bec374a90a84c1514a3ad53b96c8f31

Authored by Christian Herdtweck
1 parent 5862969f

ooxml: limit returns to set of tags; more memory-efficient

Showing 1 changed file with 102 additions and 13 deletions
oletools/ooxml.py
... ... @@ -70,6 +70,11 @@ DOCTYPE_WORD_XML2003 = 'word-xml2003' # not yet used
70 70 DOCTYPE_EXCEL_XML2003 = 'excel-xml2003' # not yet used
71 71  
72 72  
  73 +###############################################################################
  74 +# HELPERS
  75 +###############################################################################
  76 +
  77 +
73 78 def debug_str(elem):
74 79 """ for debugging: print an element """
75 80 if elem is None:
... ... @@ -103,6 +108,19 @@ def debug_str(elem):
103 108 return text + u']'
104 109  
105 110  
  111 +def isstr(some_var):
  112 + """ version-independent test for isinstance(some_var, (str, unicode)) """
  113 + if sys.version_info.major == 2:
  114 + return isinstance(some_var, basestring) # true for str and unicode
  115 + else:
  116 + return isinstance(some_var, str) # there is no unicode
  117 +
  118 +
  119 +###############################################################################
  120 +# INFO ON FILES
  121 +###############################################################################
  122 +
  123 +
106 124 def get_type(filename):
107 125 """ return one of the DOCTYPE_* constants or raise error """
108 126 parser = XmlParser(filename)
... ... @@ -158,6 +176,11 @@ def is_ooxml(filename):
158 176 return False
159 177  
160 178  
  179 +###############################################################################
  180 +# HELPER CLASSES
  181 +###############################################################################
  182 +
  183 +
161 184 class ZipSubFile(object):
162 185 """ A file-like object like ZipFile.open returns them, with size and seek()
163 186  
... ... @@ -351,6 +374,11 @@ class BadOOXML(ValueError):
351 374 self.more_info = more_info
352 375  
353 376  
  377 +###############################################################################
  378 +# PARSING
  379 +###############################################################################
  380 +
  381 +
354 382 class XmlParser(object):
355 383 """ parser for OOXML files
356 384  
... ... @@ -389,7 +417,7 @@ class XmlParser(object):
389 417 if not match:
390 418 raise BadOOXML(self.filename, 'is no zip and has no prog_id')
391 419  
392   - def iter_files(self, *args):
  420 + def iter_files(self, args=None):
393 421 """ Find files in zip or just give single xml file """
394 422 if self.is_single_xml():
395 423 if args:
... ... @@ -399,29 +427,36 @@ class XmlParser(object):
399 427 self.did_iter_all = True
400 428 else:
401 429 zipper = None
  430 + subfiles = None
402 431 try:
403 432 zipper = ZipFile(self.filename)
404   - cont_file = zipper.getinfo(FILE_CONTENT_TYPES) # --> KeyError
405   - if args:
406   - subfiles = args
407   - else:
  433 + try:
  434 + cont_file = zipper.getinfo(FILE_CONTENT_TYPES)
  435 + except KeyError:
  436 + raise BadOOXML(self.filename,
  437 + 'No content type information')
  438 + if not args:
408 439 subfiles = zipper.namelist()
  440 + elif isstr(args):
  441 + subfiles = [args, ]
  442 + else:
  443 + subfiles = tuple(args) # make a copy in case orig changes
409 444  
410 445 for subfile in subfiles:
411   - logging.debug(u'subfile {0}'.format(subfile))
412 446 with zipper.open(subfile, 'r') as handle:
413 447 yield subfile, handle
414 448 if not args:
415 449 self.did_iter_all = True
416   - except KeyError: # zipper.getinfo failed, no content type file
417   - raise BadOOXML(self.filename, 'No content type information')
  450 + except KeyError as orig_err:
  451 + raise BadOOXML(self.filename, 'invalid subfile: ' +
  452 + str(orig_err))
418 453 except BadZipfile:
419 454 raise BadOOXML(self.filename, 'neither zip nor xml')
420 455 finally:
421 456 if zipper:
422 457 zipper.close()
423 458  
424   - def iter_xml(self, *subfiles):
  459 + def iter_xml(self, subfiles=None, need_children=False, tags=None):
425 460 """ Iterate xml contents of document
426 461  
427 462 If given subfile name[s] as optional arg[s], will only parse that
... ... @@ -434,21 +469,75 @@ class XmlParser(object):
434 469  
435 470 Subfiles that are not xml (e.g. OLE or image files) are remembered
436 471 internally and can be retrieved using iter_non_xml().
  472 +
  473 + The argument need_children is set to False by default. If you need to
  474 + access an element's children, set it to True. Note, however, that
  475 + leaving it at False should save a lot of memory. Otherwise, the parser
  476 + has to keep every single element in memory since the last element
  477 + returned is the root which has the rest of the document as children.
  478 + cf. http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
  479 +
  480 + Argument tags restricts output to tags with names from that list (or
  481 + equal to that string). Children are preserved for these.
437 482 """
438   - for subfile, handle in self.iter_files(*subfiles):
  483 + if tags is None:
  484 + want_tags = []
  485 + elif isstr(tags):
  486 + want_tags = [tags, ]
  487 + logging.debug('looking for tags: {0}'.format(tags))
  488 + else:
  489 + want_tags = tags
  490 + logging.debug('looking for tags: {0}'.format(tags))
  491 +
  492 + for subfile, handle in self.iter_files(subfiles):
439 493 events = ('start', 'end')
440 494 depth = 0
  495 + inside_tags = []
441 496 try:
442 497 for event, elem in ET.iterparse(handle, events):
443 498 if elem is None:
444 499 continue
445 500 if event == 'start':
  501 + if elem.tag in want_tags:
  502 + logging.debug('remember start of tag {0} at {1}'
  503 + .format(elem.tag, depth))
  504 + inside_tags.append((elem.tag, depth))
446 505 depth += 1
447 506 continue
448 507 assert(event == 'end')
449 508 depth -= 1
450 509 assert(depth >= 0)
451   - yield subfile, elem, depth
  510 +
  511 + is_wanted = elem.tag in want_tags
  512 + if is_wanted:
  513 + curr_tag = (elem.tag, depth)
  514 + try:
  515 + if inside_tags[-1] == curr_tag:
  516 + inside_tags.pop()
  517 + else:
  518 + logging.error('found end for wanted tag {0} '
  519 + 'but last start tag {1} does not '
  520 + 'match'.format(curr_tag,
  521 + inside_tags[-1]))
  522 + # try to recover: close all deeper tags
  523 + while inside_tags and \
  524 + inside_tags[-1][1] >= depth:
  525 + logging.debug('recover: pop {0}'
  526 + .format(inside_tags[-1]))
  527 + inside_tags.pop()
  528 + except IndexError: # no inside_tags[-1]
  529 + logging.error('found end of {0} at depth {1} but '
  530 + 'no start event'.format(elem.tag, depth))
  531 + # yield element
  532 + if is_wanted or not want_tags:
  533 + yield subfile, elem, depth
  534 +
  535 + # save memory: clear elem so parser memorizes less
  536 + if not need_children and not inside_tags:
  537 + elem.clear()
  538 + # cannot do this since we might be using py-builtin xml
  539 + # while elem.getprevious() is not None:
  540 + # del elem.getparent()[0]
452 541 except ET.ParseError as err:
453 542 self.subfiles_no_xml.add(subfile)
454 543 if subfile is None: # this is no zip subfile but single xml
... ... @@ -550,8 +639,8 @@ def test():
550 639 # test complete parsing
551 640 parser = XmlParser(sys.argv[1])
552 641 for subfile, elem, depth in parser.iter_xml():
553   - if depth < 3:
554   - print(u'{0}{1}{2}'.format(subfile, ' ' * depth, debug_str(elem)))
  642 + if depth < 4:
  643 + print(u'{0} {1}{2}'.format(subfile, ' ' * depth, debug_str(elem)))
555 644 for index, (subfile, content_type) in enumerate(parser.iter_non_xml()):
556 645 print(u'Non-XML subfile: {0} of type {1}'
557 646 .format(subfile, content_type or u'unknown'))
... ...