Commit 00a2df089d26a2688c14df92780b49efde452ae9
1 parent: b7926113
msodde: parse single-xml files from Word 2003
Showing 1 changed file with 23 additions and 18 deletions
oletools/msodde.py
| ... | ... | @@ -123,15 +123,16 @@ if sys.version_info[0] >= 3: |
| 123 | 123 | |
| 124 | 124 | |
| 125 | 125 | NS_WORD = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main' |
| 126 | +NS_WORD_2003 = 'http://schemas.microsoft.com/office/word/2003/wordml' | |
| 126 | 127 | NO_QUOTES = False |
| 127 | 128 | # XML tag for 'w:instrText' |
| 128 | -TAG_W_INSTRTEXT = '{%s}instrText' % NS_WORD | |
| 129 | -TAG_W_FLDSIMPLE = '{%s}fldSimple' % NS_WORD | |
| 130 | -TAG_W_FLDCHAR = '{%s}fldChar' % NS_WORD | |
| 131 | -TAG_W_P = "{%s}p" % NS_WORD | |
| 132 | -TAG_W_R = "{%s}r" % NS_WORD | |
| 133 | -ATTR_W_INSTR = '{%s}instr' % NS_WORD | |
| 134 | -ATTR_W_FLDCHARTYPE = '{%s}fldCharType' % NS_WORD | |
| 129 | +TAG_W_INSTRTEXT = ['{%s}instrText' % ns for ns in NS_WORD, NS_WORD_2003] | |
| 130 | +TAG_W_FLDSIMPLE = ['{%s}fldSimple' % ns for ns in NS_WORD, NS_WORD_2003] | |
| 131 | +TAG_W_FLDCHAR = ['{%s}fldChar' % ns for ns in NS_WORD, NS_WORD_2003] | |
| 132 | +TAG_W_P = ["{%s}p" % ns for ns in NS_WORD, NS_WORD_2003] | |
| 133 | +TAG_W_R = ["{%s}r" % ns for ns in NS_WORD, NS_WORD_2003] | |
| 134 | +ATTR_W_INSTR = ['{%s}instr' % ns for ns in NS_WORD, NS_WORD_2003] | |
| 135 | +ATTR_W_FLDCHARTYPE = ['{%s}fldCharType' % ns for ns in NS_WORD, NS_WORD_2003] | |
| 135 | 136 | LOCATIONS = ('word/document.xml', 'word/endnotes.xml', 'word/footnotes.xml', |
| 136 | 137 | 'word/header1.xml', 'word/footer1.xml', 'word/header2.xml', |
| 137 | 138 | 'word/footer2.xml', 'word/comments.xml') |
| ... | ... | @@ -558,23 +559,25 @@ def process_docx(filepath, field_filter_mode=None): |
| 558 | 559 | all_fields = [] |
| 559 | 560 | level = 0 |
| 560 | 561 | ddetext = u'' |
| 561 | - for _, subs, depth in parser.iter_xml(tags=[TAG_W_P, TAG_W_FLDSIMPLE]): | |
| 562 | + for _, subs, depth in parser.iter_xml(tags=TAG_W_P + TAG_W_FLDSIMPLE): | |
| 562 | 563 | if depth == 0: # at end of subfile: |
| 563 | 564 | level = 0 # reset |
| 564 | - if subs.tag == TAG_W_FLDSIMPLE: | |
| 565 | + if subs.tag in TAG_W_FLDSIMPLE: | |
| 565 | 566 | # concatenate the attribute of the field, if present: |
| 566 | - if subs.attrib is not None: | |
| 567 | - all_fields.append(unquote(subs.attrib[ATTR_W_INSTR])) | |
| 567 | + attrib_instr = subs.attrib.get(ATTR_W_INSTR[0]) or \ | |
| 568 | + subs.attrib.get(ATTR_W_INSTR[1]) | |
| 569 | + if attrib_instr is not None: | |
| 570 | + all_fields.append(unquote(attrib_instr)) | |
| 568 | 571 | continue |
| 569 | 572 | |
| 570 | 573 | # have a TAG_W_P |
| 571 | 574 | elem = None |
| 572 | 575 | for curr_elem in subs: |
| 573 | 576 | # check if w:r; parse children to pull out first FLDCHAR/INSTRTEXT |
| 574 | - if curr_elem.tag == TAG_W_R: | |
| 577 | + if curr_elem.tag in TAG_W_R: | |
| 575 | 578 | for child in curr_elem: |
| 576 | - if child.tag == TAG_W_FLDCHAR or \ | |
| 577 | - child.tag == TAG_W_INSTRTEXT: | |
| 579 | + if child.tag in TAG_W_FLDCHAR or \ | |
| 580 | + child.tag in TAG_W_INSTRTEXT: | |
| 578 | 581 | elem = child |
| 579 | 582 | break |
| 580 | 583 | else: |
| ... | ... | @@ -584,10 +587,12 @@ def process_docx(filepath, field_filter_mode=None): |
| 584 | 587 | continue |
| 585 | 588 | |
| 586 | 589 | # check if FLDCHARTYPE and whether "begin" or "end" tag |
| 587 | - if elem.attrib.get(ATTR_W_FLDCHARTYPE) is not None: | |
| 588 | - if elem.attrib[ATTR_W_FLDCHARTYPE] == "begin": | |
| 590 | + attrib_type = elem.attrib.get(ATTR_W_FLDCHARTYPE[0]) or \ | |
| 591 | + elem.attrib.get(ATTR_W_FLDCHARTYPE[1]) | |
| 592 | + if attrib_type is not None: | |
| 593 | + if attrib_type == "begin": | |
| 589 | 594 | level += 1 |
| 590 | - if elem.attrib[ATTR_W_FLDCHARTYPE] == "end": | |
| 595 | + if attrib_type == "end": | |
| 591 | 596 | level -= 1 |
| 592 | 597 | if level == 0 or level == -1: # edge-case; level gets -1 |
| 593 | 598 | all_fields.append(ddetext) |
| ... | ... | @@ -595,7 +600,7 @@ def process_docx(filepath, field_filter_mode=None): |
| 595 | 600 | level = 0 # reset edge-case |
| 596 | 601 | |
| 597 | 602 | # concatenate the text of the field, if present: |
| 598 | - if elem.tag == TAG_W_INSTRTEXT and elem.text is not None: | |
| 603 | + if elem.tag in TAG_W_INSTRTEXT and elem.text is not None: | |
| 599 | 604 | # expand field code if QUOTED |
| 600 | 605 | ddetext += unquote(elem.text) |
| 601 | 606 | ... | ... |