Commit 138753531bec374a90a84c1514a3ad53b96c8f31
1 parent
5862969f
ooxml: limit returns to set of tags; more memory-efficient
Showing
1 changed file
with
102 additions
and
13 deletions
oletools/ooxml.py
| ... | ... | @@ -70,6 +70,11 @@ DOCTYPE_WORD_XML2003 = 'word-xml2003' # not yet used |
| 70 | 70 | DOCTYPE_EXCEL_XML2003 = 'excel-xml2003' # not yet used |
| 71 | 71 | |
| 72 | 72 | |
| 73 | +############################################################################### | |
| 74 | +# HELPERS | |
| 75 | +############################################################################### | |
| 76 | + | |
| 77 | + | |
| 73 | 78 | def debug_str(elem): |
| 74 | 79 | """ for debugging: print an element """ |
| 75 | 80 | if elem is None: |
| ... | ... | @@ -103,6 +108,19 @@ def debug_str(elem): |
| 103 | 108 | return text + u']' |
| 104 | 109 | |
| 105 | 110 | |
| 111 | +def isstr(some_var): | |
| 112 | + """ version-independent test for isinstance(some_var, (str, unicode)) """ | |
| 113 | + if sys.version_info.major == 2: | |
| 114 | + return isinstance(some_var, basestring) # true for str and unicode | |
| 115 | + else: | |
| 116 | + return isinstance(some_var, str) # there is no unicode | |
| 117 | + | |
| 118 | + | |
| 119 | +############################################################################### | |
| 120 | +# INFO ON FILES | |
| 121 | +############################################################################### | |
| 122 | + | |
| 123 | + | |
| 106 | 124 | def get_type(filename): |
| 107 | 125 | """ return one of the DOCTYPE_* constants or raise error """ |
| 108 | 126 | parser = XmlParser(filename) |
| ... | ... | @@ -158,6 +176,11 @@ def is_ooxml(filename): |
| 158 | 176 | return False |
| 159 | 177 | |
| 160 | 178 | |
| 179 | +############################################################################### | |
| 180 | +# HELPER CLASSES | |
| 181 | +############################################################################### | |
| 182 | + | |
| 183 | + | |
| 161 | 184 | class ZipSubFile(object): |
| 162 | 185 | """ A file-like object like ZipFile.open returns them, with size and seek() |
| 163 | 186 | |
| ... | ... | @@ -351,6 +374,11 @@ class BadOOXML(ValueError): |
| 351 | 374 | self.more_info = more_info |
| 352 | 375 | |
| 353 | 376 | |
| 377 | +############################################################################### | |
| 378 | +# PARSING | |
| 379 | +############################################################################### | |
| 380 | + | |
| 381 | + | |
| 354 | 382 | class XmlParser(object): |
| 355 | 383 | """ parser for OOXML files |
| 356 | 384 | |
| ... | ... | @@ -389,7 +417,7 @@ class XmlParser(object): |
| 389 | 417 | if not match: |
| 390 | 418 | raise BadOOXML(self.filename, 'is no zip and has no prog_id') |
| 391 | 419 | |
| 392 | - def iter_files(self, *args): | |
| 420 | + def iter_files(self, args=None): | |
| 393 | 421 | """ Find files in zip or just give single xml file """ |
| 394 | 422 | if self.is_single_xml(): |
| 395 | 423 | if args: |
| ... | ... | @@ -399,29 +427,36 @@ class XmlParser(object): |
| 399 | 427 | self.did_iter_all = True |
| 400 | 428 | else: |
| 401 | 429 | zipper = None |
| 430 | + subfiles = None | |
| 402 | 431 | try: |
| 403 | 432 | zipper = ZipFile(self.filename) |
| 404 | - cont_file = zipper.getinfo(FILE_CONTENT_TYPES) # --> KeyError | |
| 405 | - if args: | |
| 406 | - subfiles = args | |
| 407 | - else: | |
| 433 | + try: | |
| 434 | + cont_file = zipper.getinfo(FILE_CONTENT_TYPES) | |
| 435 | + except KeyError: | |
| 436 | + raise BadOOXML(self.filename, | |
| 437 | + 'No content type information') | |
| 438 | + if not args: | |
| 408 | 439 | subfiles = zipper.namelist() |
| 440 | + elif isstr(args): | |
| 441 | + subfiles = [args, ] | |
| 442 | + else: | |
| 443 | + subfiles = tuple(args) # make a copy in case orig changes | |
| 409 | 444 | |
| 410 | 445 | for subfile in subfiles: |
| 411 | - logging.debug(u'subfile {0}'.format(subfile)) | |
| 412 | 446 | with zipper.open(subfile, 'r') as handle: |
| 413 | 447 | yield subfile, handle |
| 414 | 448 | if not args: |
| 415 | 449 | self.did_iter_all = True |
| 416 | - except KeyError: # zipper.getinfo failed, no content type file | |
| 417 | - raise BadOOXML(self.filename, 'No content type information') | |
| 450 | + except KeyError as orig_err: | |
| 451 | + raise BadOOXML(self.filename, 'invalid subfile: ' + | |
| 452 | + str(orig_err)) | |
| 418 | 453 | except BadZipfile: |
| 419 | 454 | raise BadOOXML(self.filename, 'neither zip nor xml') |
| 420 | 455 | finally: |
| 421 | 456 | if zipper: |
| 422 | 457 | zipper.close() |
| 423 | 458 | |
| 424 | - def iter_xml(self, *subfiles): | |
| 459 | + def iter_xml(self, subfiles=None, need_children=False, tags=None): | |
| 425 | 460 | """ Iterate xml contents of document |
| 426 | 461 | |
| 427 | 462 | If given subfile name[s] as optional arg[s], will only parse that |
| ... | ... | @@ -434,21 +469,75 @@ class XmlParser(object): |
| 434 | 469 | |
| 435 | 470 | Subfiles that are not xml (e.g. OLE or image files) are remembered |
| 436 | 471 | internally and can be retrieved using iter_non_xml(). |
| 472 | + | |
| 473 | + The argument need_children is set to False by default. If you need to | |
| 474 | + access an element's children, set it to True. Note, however, that | |
| 475 | + leaving it at False should save a lot of memory. Otherwise, the parser | |
| 476 | + has to keep every single element in memory since the last element | |
| 477 | + returned is the root which has the rest of the document as children. | |
| 478 | + cf. http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ | |
| 479 | + | |
| 480 | + Argument tags restricts output to elements whose tag name is in the given | |
| 481 | + list (or equals the given string). Children are preserved for these. | |
| 437 | 482 | """ |
| 438 | - for subfile, handle in self.iter_files(*subfiles): | |
| 483 | + if tags is None: | |
| 484 | + want_tags = [] | |
| 485 | + elif isinstance(tags, (str, unicode)): | |
| 486 | + want_tags = [tags, ] | |
| 487 | + logging.debug('looking for tags: {0}'.format(tags)) | |
| 488 | + else: | |
| 489 | + want_tags = tags | |
| 490 | + logging.debug('looking for tags: {0}'.format(tags)) | |
| 491 | + | |
| 492 | + for subfile, handle in self.iter_files(subfiles): | |
| 439 | 493 | events = ('start', 'end') |
| 440 | 494 | depth = 0 |
| 495 | + inside_tags = [] | |
| 441 | 496 | try: |
| 442 | 497 | for event, elem in ET.iterparse(handle, events): |
| 443 | 498 | if elem is None: |
| 444 | 499 | continue |
| 445 | 500 | if event == 'start': |
| 501 | + if elem.tag in want_tags: | |
| 502 | + logging.debug('remember start of tag {0} at {1}' | |
| 503 | + .format(elem.tag, depth)) | |
| 504 | + inside_tags.append((elem.tag, depth)) | |
| 446 | 505 | depth += 1 |
| 447 | 506 | continue |
| 448 | 507 | assert(event == 'end') |
| 449 | 508 | depth -= 1 |
| 450 | 509 | assert(depth >= 0) |
| 451 | - yield subfile, elem, depth | |
| 510 | + | |
| 511 | + is_wanted = elem.tag in want_tags | |
| 512 | + if is_wanted: | |
| 513 | + curr_tag = (elem.tag, depth) | |
| 514 | + try: | |
| 515 | + if inside_tags[-1] == curr_tag: | |
| 516 | + inside_tags.pop() | |
| 517 | + else: | |
| 518 | + logging.error('found end for wanted tag {0} ' | |
| 519 | + 'but last start tag {1} does not ' | |
| 520 | + 'match'.format(curr_tag, | |
| 521 | + inside_tags[-1])) | |
| 522 | + # try to recover: close all deeper tags | |
| 523 | + while inside_tags and \ | |
| 524 | + inside_tags[-1][1] >= depth: | |
| 525 | + logging.debug('recover: pop {0}' | |
| 526 | + .format(inside_tags[-1])) | |
| 527 | + inside_tags.pop() | |
| 528 | + except IndexError: # no inside_tag[-1] | |
| 529 | + logging.error('found end of {0} at depth {1} but ' | |
| 530 | + 'no start event') | |
| 531 | + # yield element | |
| 532 | + if is_wanted or not want_tags: | |
| 533 | + yield subfile, elem, depth | |
| 534 | + | |
| 535 | + # save memory: clear elem so parser memorizes less | |
| 536 | + if not need_children and not inside_tags: | |
| 537 | + elem.clear() | |
| 538 | + # cannot do this since we might be using py-builtin xml | |
| 539 | + # while elem.getprevious() is not None: | |
| 540 | + # del elem.getparent()[0] | |
| 452 | 541 | except ET.ParseError as err: |
| 453 | 542 | self.subfiles_no_xml.add(subfile) |
| 454 | 543 | if subfile is None: # this is no zip subfile but single xml |
| ... | ... | @@ -550,8 +639,8 @@ def test(): |
| 550 | 639 | # test complete parsing |
| 551 | 640 | parser = XmlParser(sys.argv[1]) |
| 552 | 641 | for subfile, elem, depth in parser.iter_xml(): |
| 553 | - if depth < 3: | |
| 554 | - print(u'{0}{1}{2}'.format(subfile, ' ' * depth, debug_str(elem))) | |
| 642 | + if depth < 4: | |
| 643 | + print(u'{0} {1}{2}'.format(subfile, ' ' * depth, debug_str(elem))) | |
| 555 | 644 | for index, (subfile, content_type) in enumerate(parser.iter_non_xml()): |
| 556 | 645 | print(u'Non-XML subfile: {0} of type {1}' |
| 557 | 646 | .format(subfile, content_type or u'unknown')) | ... | ... |