Commit 138753531bec374a90a84c1514a3ad53b96c8f31
1 parent: 5862969f
ooxml: limit returns to set of tags; more memory-efficient
Showing 1 changed file with 102 additions and 13 deletions
oletools/ooxml.py
| @@ -70,6 +70,11 @@ DOCTYPE_WORD_XML2003 = 'word-xml2003' # not yet used | @@ -70,6 +70,11 @@ DOCTYPE_WORD_XML2003 = 'word-xml2003' # not yet used | ||
| 70 | DOCTYPE_EXCEL_XML2003 = 'excel-xml2003' # not yet used | 70 | DOCTYPE_EXCEL_XML2003 = 'excel-xml2003' # not yet used |
| 71 | 71 | ||
| 72 | 72 | ||
| 73 | +############################################################################### | ||
| 74 | +# HELPERS | ||
| 75 | +############################################################################### | ||
| 76 | + | ||
| 77 | + | ||
| 73 | def debug_str(elem): | 78 | def debug_str(elem): |
| 74 | """ for debugging: print an element """ | 79 | """ for debugging: print an element """ |
| 75 | if elem is None: | 80 | if elem is None: |
| @@ -103,6 +108,19 @@ def debug_str(elem): | @@ -103,6 +108,19 @@ def debug_str(elem): | ||
| 103 | return text + u']' | 108 | return text + u']' |
| 104 | 109 | ||
| 105 | 110 | ||
| 111 | +def isstr(some_var): | ||
| 112 | + """ version-independent test for isinstance(some_var, (str, unicode)) """ | ||
| 113 | + if sys.version_info.major == 2: | ||
| 114 | + return isinstance(some_var, basestring) # true for str and unicode | ||
| 115 | + else: | ||
| 116 | + return isinstance(some_var, str) # there is no unicode | ||
| 117 | + | ||
| 118 | + | ||
| 119 | +############################################################################### | ||
| 120 | +# INFO ON FILES | ||
| 121 | +############################################################################### | ||
| 122 | + | ||
| 123 | + | ||
| 106 | def get_type(filename): | 124 | def get_type(filename): |
| 107 | """ return one of the DOCTYPE_* constants or raise error """ | 125 | """ return one of the DOCTYPE_* constants or raise error """ |
| 108 | parser = XmlParser(filename) | 126 | parser = XmlParser(filename) |
| @@ -158,6 +176,11 @@ def is_ooxml(filename): | @@ -158,6 +176,11 @@ def is_ooxml(filename): | ||
| 158 | return False | 176 | return False |
| 159 | 177 | ||
| 160 | 178 | ||
| 179 | +############################################################################### | ||
| 180 | +# HELPER CLASSES | ||
| 181 | +############################################################################### | ||
| 182 | + | ||
| 183 | + | ||
| 161 | class ZipSubFile(object): | 184 | class ZipSubFile(object): |
| 162 | """ A file-like object like ZipFile.open returns them, with size and seek() | 185 | """ A file-like object like ZipFile.open returns them, with size and seek() |
| 163 | 186 | ||
| @@ -351,6 +374,11 @@ class BadOOXML(ValueError): | @@ -351,6 +374,11 @@ class BadOOXML(ValueError): | ||
| 351 | self.more_info = more_info | 374 | self.more_info = more_info |
| 352 | 375 | ||
| 353 | 376 | ||
| 377 | +############################################################################### | ||
| 378 | +# PARSING | ||
| 379 | +############################################################################### | ||
| 380 | + | ||
| 381 | + | ||
| 354 | class XmlParser(object): | 382 | class XmlParser(object): |
| 355 | """ parser for OOXML files | 383 | """ parser for OOXML files |
| 356 | 384 | ||
| @@ -389,7 +417,7 @@ class XmlParser(object): | @@ -389,7 +417,7 @@ class XmlParser(object): | ||
| 389 | if not match: | 417 | if not match: |
| 390 | raise BadOOXML(self.filename, 'is no zip and has no prog_id') | 418 | raise BadOOXML(self.filename, 'is no zip and has no prog_id') |
| 391 | 419 | ||
| 392 | - def iter_files(self, *args): | 420 | + def iter_files(self, args=None): |
| 393 | """ Find files in zip or just give single xml file """ | 421 | """ Find files in zip or just give single xml file """ |
| 394 | if self.is_single_xml(): | 422 | if self.is_single_xml(): |
| 395 | if args: | 423 | if args: |
| @@ -399,29 +427,36 @@ class XmlParser(object): | @@ -399,29 +427,36 @@ class XmlParser(object): | ||
| 399 | self.did_iter_all = True | 427 | self.did_iter_all = True |
| 400 | else: | 428 | else: |
| 401 | zipper = None | 429 | zipper = None |
| 430 | + subfiles = None | ||
| 402 | try: | 431 | try: |
| 403 | zipper = ZipFile(self.filename) | 432 | zipper = ZipFile(self.filename) |
| 404 | - cont_file = zipper.getinfo(FILE_CONTENT_TYPES) # --> KeyError | ||
| 405 | - if args: | ||
| 406 | - subfiles = args | ||
| 407 | - else: | 433 | + try: |
| 434 | + cont_file = zipper.getinfo(FILE_CONTENT_TYPES) | ||
| 435 | + except KeyError: | ||
| 436 | + raise BadOOXML(self.filename, | ||
| 437 | + 'No content type information') | ||
| 438 | + if not args: | ||
| 408 | subfiles = zipper.namelist() | 439 | subfiles = zipper.namelist() |
| 440 | + elif isstr(args): | ||
| 441 | + subfiles = [args, ] | ||
| 442 | + else: | ||
| 443 | + subfiles = tuple(args) # make a copy in case orig changes | ||
| 409 | 444 | ||
| 410 | for subfile in subfiles: | 445 | for subfile in subfiles: |
| 411 | - logging.debug(u'subfile {0}'.format(subfile)) | ||
| 412 | with zipper.open(subfile, 'r') as handle: | 446 | with zipper.open(subfile, 'r') as handle: |
| 413 | yield subfile, handle | 447 | yield subfile, handle |
| 414 | if not args: | 448 | if not args: |
| 415 | self.did_iter_all = True | 449 | self.did_iter_all = True |
| 416 | - except KeyError: # zipper.getinfo failed, no content type file | ||
| 417 | - raise BadOOXML(self.filename, 'No content type information') | 450 | + except KeyError as orig_err: |
| 451 | + raise BadOOXML(self.filename, 'invalid subfile: ' + | ||
| 452 | + str(orig_err)) | ||
| 418 | except BadZipfile: | 453 | except BadZipfile: |
| 419 | raise BadOOXML(self.filename, 'neither zip nor xml') | 454 | raise BadOOXML(self.filename, 'neither zip nor xml') |
| 420 | finally: | 455 | finally: |
| 421 | if zipper: | 456 | if zipper: |
| 422 | zipper.close() | 457 | zipper.close() |
| 423 | 458 | ||
| 424 | - def iter_xml(self, *subfiles): | 459 | + def iter_xml(self, subfiles=None, need_children=False, tags=None): |
| 425 | """ Iterate xml contents of document | 460 | """ Iterate xml contents of document |
| 426 | 461 | ||
| 427 | If given subfile name[s] as optional arg[s], will only parse that | 462 | If given subfile name[s] as optional arg[s], will only parse that |
| @@ -434,21 +469,75 @@ class XmlParser(object): | @@ -434,21 +469,75 @@ class XmlParser(object): | ||
| 434 | 469 | ||
| 435 | Subfiles that are not xml (e.g. OLE or image files) are remembered | 470 | Subfiles that are not xml (e.g. OLE or image files) are remembered |
| 436 | internally and can be retrieved using iter_non_xml(). | 471 | internally and can be retrieved using iter_non_xml(). |
| 472 | + | ||
| 473 | + The argument need_children is set to False per default. If you need to | ||
| 474 | + access an element's children, set it to True. Note, however, that | ||
| 475 | + leaving it at False should save a lot of memory. Otherwise, the parser | ||
| 476 | + has to keep every single element in memory since the last element | ||
| 477 | + returned is the root which has the rest of the document as children. | ||
| 478 | + c.f. http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ | ||
| 479 | + | ||
| 480 | + Argument tags restricts output to tags with names from that list (or | ||
| 481 | + equal to that string). Children are preserved for these. | ||
| 437 | """ | 482 | """ |
| 438 | - for subfile, handle in self.iter_files(*subfiles): | 483 | + if tags is None: |
| 484 | + want_tags = [] | ||
| 485 | + elif isinstance(tags, (str, unicode)): | ||
| 486 | + want_tags = [tags, ] | ||
| 487 | + logging.debug('looking for tags: {0}'.format(tags)) | ||
| 488 | + else: | ||
| 489 | + want_tags = tags | ||
| 490 | + logging.debug('looking for tags: {0}'.format(tags)) | ||
| 491 | + | ||
| 492 | + for subfile, handle in self.iter_files(subfiles): | ||
| 439 | events = ('start', 'end') | 493 | events = ('start', 'end') |
| 440 | depth = 0 | 494 | depth = 0 |
| 495 | + inside_tags = [] | ||
| 441 | try: | 496 | try: |
| 442 | for event, elem in ET.iterparse(handle, events): | 497 | for event, elem in ET.iterparse(handle, events): |
| 443 | if elem is None: | 498 | if elem is None: |
| 444 | continue | 499 | continue |
| 445 | if event == 'start': | 500 | if event == 'start': |
| 501 | + if elem.tag in want_tags: | ||
| 502 | + logging.debug('remember start of tag {0} at {1}' | ||
| 503 | + .format(elem.tag, depth)) | ||
| 504 | + inside_tags.append((elem.tag, depth)) | ||
| 446 | depth += 1 | 505 | depth += 1 |
| 447 | continue | 506 | continue |
| 448 | assert(event == 'end') | 507 | assert(event == 'end') |
| 449 | depth -= 1 | 508 | depth -= 1 |
| 450 | assert(depth >= 0) | 509 | assert(depth >= 0) |
| 451 | - yield subfile, elem, depth | 510 | + |
| 511 | + is_wanted = elem.tag in want_tags | ||
| 512 | + if is_wanted: | ||
| 513 | + curr_tag = (elem.tag, depth) | ||
| 514 | + try: | ||
| 515 | + if inside_tags[-1] == curr_tag: | ||
| 516 | + inside_tags.pop() | ||
| 517 | + else: | ||
| 518 | + logging.error('found end for wanted tag {0} ' | ||
| 519 | + 'but last start tag {1} does not ' | ||
| 520 | + 'match'.format(curr_tag, | ||
| 521 | + inside_tags[-1])) | ||
| 522 | + # try to recover: close all deeper tags | ||
| 523 | + while inside_tags and \ | ||
| 524 | + inside_tags[-1][1] >= depth: | ||
| 525 | + logging.debug('recover: pop {0}' | ||
| 526 | + .format(inside_tags[-1])) | ||
| 527 | + inside_tags.pop() | ||
| 528 | + except IndexError: # no inside_tag[-1] | ||
| 529 | + logging.error('found end of {0} at depth {1} but ' | ||
| 530 | + 'no start event') | ||
| 531 | + # yield element | ||
| 532 | + if is_wanted or not want_tags: | ||
| 533 | + yield subfile, elem, depth | ||
| 534 | + | ||
| 535 | + # save memory: clear elem so parser memorizes less | ||
| 536 | + if not need_children and not inside_tags: | ||
| 537 | + elem.clear() | ||
| 538 | + # cannot do this since we might be using py-builtin xml | ||
| 539 | + # while elem.getprevious() is not None: | ||
| 540 | + # del elem.getparent()[0] | ||
| 452 | except ET.ParseError as err: | 541 | except ET.ParseError as err: |
| 453 | self.subfiles_no_xml.add(subfile) | 542 | self.subfiles_no_xml.add(subfile) |
| 454 | if subfile is None: # this is no zip subfile but single xml | 543 | if subfile is None: # this is no zip subfile but single xml |
| @@ -550,8 +639,8 @@ def test(): | @@ -550,8 +639,8 @@ def test(): | ||
| 550 | # test complete parsing | 639 | # test complete parsing |
| 551 | parser = XmlParser(sys.argv[1]) | 640 | parser = XmlParser(sys.argv[1]) |
| 552 | for subfile, elem, depth in parser.iter_xml(): | 641 | for subfile, elem, depth in parser.iter_xml(): |
| 553 | - if depth < 3: | ||
| 554 | - print(u'{0}{1}{2}'.format(subfile, ' ' * depth, debug_str(elem))) | 642 | + if depth < 4: |
| 643 | + print(u'{0} {1}{2}'.format(subfile, ' ' * depth, debug_str(elem))) | ||
| 555 | for index, (subfile, content_type) in enumerate(parser.iter_non_xml()): | 644 | for index, (subfile, content_type) in enumerate(parser.iter_non_xml()): |
| 556 | print(u'Non-XML subfile: {0} of type {1}' | 645 | print(u'Non-XML subfile: {0} of type {1}' |
| 557 | .format(subfile, content_type or u'unknown')) | 646 | .format(subfile, content_type or u'unknown')) |