Commit 00a2df089d26a2688c14df92780b49efde452ae9
1 parent: b7926113
msodde: parse single-xml files from Word 2003
Showing 1 changed file with 23 additions and 18 deletions
oletools/msodde.py
| @@ -123,15 +123,16 @@ if sys.version_info[0] >= 3: | @@ -123,15 +123,16 @@ if sys.version_info[0] >= 3: | ||
| 123 | 123 | ||
| 124 | 124 | ||
| 125 | NS_WORD = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main' | 125 | NS_WORD = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main' |
| 126 | +NS_WORD_2003 = 'http://schemas.microsoft.com/office/word/2003/wordml' | ||
| 126 | NO_QUOTES = False | 127 | NO_QUOTES = False |
| 127 | # XML tag for 'w:instrText' | 128 | # XML tag for 'w:instrText' |
| 128 | -TAG_W_INSTRTEXT = '{%s}instrText' % NS_WORD | ||
| 129 | -TAG_W_FLDSIMPLE = '{%s}fldSimple' % NS_WORD | ||
| 130 | -TAG_W_FLDCHAR = '{%s}fldChar' % NS_WORD | ||
| 131 | -TAG_W_P = "{%s}p" % NS_WORD | ||
| 132 | -TAG_W_R = "{%s}r" % NS_WORD | ||
| 133 | -ATTR_W_INSTR = '{%s}instr' % NS_WORD | ||
| 134 | -ATTR_W_FLDCHARTYPE = '{%s}fldCharType' % NS_WORD | 129 | +TAG_W_INSTRTEXT = ['{%s}instrText' % ns for ns in NS_WORD, NS_WORD_2003] |
| 130 | +TAG_W_FLDSIMPLE = ['{%s}fldSimple' % ns for ns in NS_WORD, NS_WORD_2003] | ||
| 131 | +TAG_W_FLDCHAR = ['{%s}fldChar' % ns for ns in NS_WORD, NS_WORD_2003] | ||
| 132 | +TAG_W_P = ["{%s}p" % ns for ns in NS_WORD, NS_WORD_2003] | ||
| 133 | +TAG_W_R = ["{%s}r" % ns for ns in NS_WORD, NS_WORD_2003] | ||
| 134 | +ATTR_W_INSTR = ['{%s}instr' % ns for ns in NS_WORD, NS_WORD_2003] | ||
| 135 | +ATTR_W_FLDCHARTYPE = ['{%s}fldCharType' % ns for ns in NS_WORD, NS_WORD_2003] | ||
| 135 | LOCATIONS = ('word/document.xml', 'word/endnotes.xml', 'word/footnotes.xml', | 136 | LOCATIONS = ('word/document.xml', 'word/endnotes.xml', 'word/footnotes.xml', |
| 136 | 'word/header1.xml', 'word/footer1.xml', 'word/header2.xml', | 137 | 'word/header1.xml', 'word/footer1.xml', 'word/header2.xml', |
| 137 | 'word/footer2.xml', 'word/comments.xml') | 138 | 'word/footer2.xml', 'word/comments.xml') |
| @@ -558,23 +559,25 @@ def process_docx(filepath, field_filter_mode=None): | @@ -558,23 +559,25 @@ def process_docx(filepath, field_filter_mode=None): | ||
| 558 | all_fields = [] | 559 | all_fields = [] |
| 559 | level = 0 | 560 | level = 0 |
| 560 | ddetext = u'' | 561 | ddetext = u'' |
| 561 | - for _, subs, depth in parser.iter_xml(tags=[TAG_W_P, TAG_W_FLDSIMPLE]): | 562 | + for _, subs, depth in parser.iter_xml(tags=TAG_W_P + TAG_W_FLDSIMPLE): |
| 562 | if depth == 0: # at end of subfile: | 563 | if depth == 0: # at end of subfile: |
| 563 | level = 0 # reset | 564 | level = 0 # reset |
| 564 | - if subs.tag == TAG_W_FLDSIMPLE: | 565 | + if subs.tag in TAG_W_FLDSIMPLE: |
| 565 | # concatenate the attribute of the field, if present: | 566 | # concatenate the attribute of the field, if present: |
| 566 | - if subs.attrib is not None: | ||
| 567 | - all_fields.append(unquote(subs.attrib[ATTR_W_INSTR])) | 567 | + attrib_instr = subs.attrib.get(ATTR_W_INSTR[0]) or \ |
| 568 | + subs.attrib.get(ATTR_W_INSTR[1]) | ||
| 569 | + if attrib_instr is not None: | ||
| 570 | + all_fields.append(unquote(attrib_instr)) | ||
| 568 | continue | 571 | continue |
| 569 | 572 | ||
| 570 | # have a TAG_W_P | 573 | # have a TAG_W_P |
| 571 | elem = None | 574 | elem = None |
| 572 | for curr_elem in subs: | 575 | for curr_elem in subs: |
| 573 | # check if w:r; parse children to pull out first FLDCHAR/INSTRTEXT | 576 | # check if w:r; parse children to pull out first FLDCHAR/INSTRTEXT |
| 574 | - if curr_elem.tag == TAG_W_R: | 577 | + if curr_elem.tag in TAG_W_R: |
| 575 | for child in curr_elem: | 578 | for child in curr_elem: |
| 576 | - if child.tag == TAG_W_FLDCHAR or \ | ||
| 577 | - child.tag == TAG_W_INSTRTEXT: | 579 | + if child.tag in TAG_W_FLDCHAR or \ |
| 580 | + child.tag in TAG_W_INSTRTEXT: | ||
| 578 | elem = child | 581 | elem = child |
| 579 | break | 582 | break |
| 580 | else: | 583 | else: |
| @@ -584,10 +587,12 @@ def process_docx(filepath, field_filter_mode=None): | @@ -584,10 +587,12 @@ def process_docx(filepath, field_filter_mode=None): | ||
| 584 | continue | 587 | continue |
| 585 | 588 | ||
| 586 | # check if FLDCHARTYPE and whether "begin" or "end" tag | 589 | # check if FLDCHARTYPE and whether "begin" or "end" tag |
| 587 | - if elem.attrib.get(ATTR_W_FLDCHARTYPE) is not None: | ||
| 588 | - if elem.attrib[ATTR_W_FLDCHARTYPE] == "begin": | 590 | + attrib_type = elem.attrib.get(ATTR_W_FLDCHARTYPE[0]) or \ |
| 591 | + elem.attrib.get(ATTR_W_FLDCHARTYPE[1]) | ||
| 592 | + if attrib_type is not None: | ||
| 593 | + if attrib_type == "begin": | ||
| 589 | level += 1 | 594 | level += 1 |
| 590 | - if elem.attrib[ATTR_W_FLDCHARTYPE] == "end": | 595 | + if attrib_type == "end": |
| 591 | level -= 1 | 596 | level -= 1 |
| 592 | if level == 0 or level == -1: # edge-case; level gets -1 | 597 | if level == 0 or level == -1: # edge-case; level gets -1 |
| 593 | all_fields.append(ddetext) | 598 | all_fields.append(ddetext) |
| @@ -595,7 +600,7 @@ def process_docx(filepath, field_filter_mode=None): | @@ -595,7 +600,7 @@ def process_docx(filepath, field_filter_mode=None): | ||
| 595 | level = 0 # reset edge-case | 600 | level = 0 # reset edge-case |
| 596 | 601 | ||
| 597 | # concatenate the text of the field, if present: | 602 | # concatenate the text of the field, if present: |
| 598 | - if elem.tag == TAG_W_INSTRTEXT and elem.text is not None: | 603 | + if elem.tag in TAG_W_INSTRTEXT and elem.text is not None: |
| 599 | # expand field code if QUOTED | 604 | # expand field code if QUOTED |
| 600 | ddetext += unquote(elem.text) | 605 | ddetext += unquote(elem.text) |
| 601 | 606 |