Commit 00a2df089d26a2688c14df92780b49efde452ae9

Authored by Christian Herdtweck
1 parent b7926113

msodde: parse single-xml files from Word 2003

Showing 1 changed file with 23 additions and 18 deletions
oletools/msodde.py
@@ -123,15 +123,16 @@ if sys.version_info[0] >= 3: @@ -123,15 +123,16 @@ if sys.version_info[0] >= 3:
123 123
124 124
125 NS_WORD = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main' 125 NS_WORD = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
  126 +NS_WORD_2003 = 'http://schemas.microsoft.com/office/word/2003/wordml'
126 NO_QUOTES = False 127 NO_QUOTES = False
127 # XML tag for 'w:instrText' 128 # XML tag for 'w:instrText'
128 -TAG_W_INSTRTEXT = '{%s}instrText' % NS_WORD  
129 -TAG_W_FLDSIMPLE = '{%s}fldSimple' % NS_WORD  
130 -TAG_W_FLDCHAR = '{%s}fldChar' % NS_WORD  
131 -TAG_W_P = "{%s}p" % NS_WORD  
132 -TAG_W_R = "{%s}r" % NS_WORD  
133 -ATTR_W_INSTR = '{%s}instr' % NS_WORD  
134 -ATTR_W_FLDCHARTYPE = '{%s}fldCharType' % NS_WORD 129 +TAG_W_INSTRTEXT = ['{%s}instrText' % ns for ns in NS_WORD, NS_WORD_2003]
  130 +TAG_W_FLDSIMPLE = ['{%s}fldSimple' % ns for ns in NS_WORD, NS_WORD_2003]
  131 +TAG_W_FLDCHAR = ['{%s}fldChar' % ns for ns in NS_WORD, NS_WORD_2003]
  132 +TAG_W_P = ["{%s}p" % ns for ns in NS_WORD, NS_WORD_2003]
  133 +TAG_W_R = ["{%s}r" % ns for ns in NS_WORD, NS_WORD_2003]
  134 +ATTR_W_INSTR = ['{%s}instr' % ns for ns in NS_WORD, NS_WORD_2003]
  135 +ATTR_W_FLDCHARTYPE = ['{%s}fldCharType' % ns for ns in NS_WORD, NS_WORD_2003]
135 LOCATIONS = ('word/document.xml', 'word/endnotes.xml', 'word/footnotes.xml', 136 LOCATIONS = ('word/document.xml', 'word/endnotes.xml', 'word/footnotes.xml',
136 'word/header1.xml', 'word/footer1.xml', 'word/header2.xml', 137 'word/header1.xml', 'word/footer1.xml', 'word/header2.xml',
137 'word/footer2.xml', 'word/comments.xml') 138 'word/footer2.xml', 'word/comments.xml')
@@ -558,23 +559,25 @@ def process_docx(filepath, field_filter_mode=None): @@ -558,23 +559,25 @@ def process_docx(filepath, field_filter_mode=None):
558 all_fields = [] 559 all_fields = []
559 level = 0 560 level = 0
560 ddetext = u'' 561 ddetext = u''
561 - for _, subs, depth in parser.iter_xml(tags=[TAG_W_P, TAG_W_FLDSIMPLE]): 562 + for _, subs, depth in parser.iter_xml(tags=TAG_W_P + TAG_W_FLDSIMPLE):
562 if depth == 0: # at end of subfile: 563 if depth == 0: # at end of subfile:
563 level = 0 # reset 564 level = 0 # reset
564 - if subs.tag == TAG_W_FLDSIMPLE: 565 + if subs.tag in TAG_W_FLDSIMPLE:
565 # concatenate the attribute of the field, if present: 566 # concatenate the attribute of the field, if present:
566 - if subs.attrib is not None:  
567 - all_fields.append(unquote(subs.attrib[ATTR_W_INSTR])) 567 + attrib_instr = subs.attrib.get(ATTR_W_INSTR[0]) or \
  568 + subs.attrib.get(ATTR_W_INSTR[1])
  569 + if attrib_instr is not None:
  570 + all_fields.append(unquote(attrib_instr))
568 continue 571 continue
569 572
570 # have a TAG_W_P 573 # have a TAG_W_P
571 elem = None 574 elem = None
572 for curr_elem in subs: 575 for curr_elem in subs:
573 # check if w:r; parse children to pull out first FLDCHAR/INSTRTEXT 576 # check if w:r; parse children to pull out first FLDCHAR/INSTRTEXT
574 - if curr_elem.tag == TAG_W_R: 577 + if curr_elem.tag in TAG_W_R:
575 for child in curr_elem: 578 for child in curr_elem:
576 - if child.tag == TAG_W_FLDCHAR or \  
577 - child.tag == TAG_W_INSTRTEXT: 579 + if child.tag in TAG_W_FLDCHAR or \
  580 + child.tag in TAG_W_INSTRTEXT:
578 elem = child 581 elem = child
579 break 582 break
580 else: 583 else:
@@ -584,10 +587,12 @@ def process_docx(filepath, field_filter_mode=None): @@ -584,10 +587,12 @@ def process_docx(filepath, field_filter_mode=None):
584 continue 587 continue
585 588
586 # check if FLDCHARTYPE and whether "begin" or "end" tag 589 # check if FLDCHARTYPE and whether "begin" or "end" tag
587 - if elem.attrib.get(ATTR_W_FLDCHARTYPE) is not None:  
588 - if elem.attrib[ATTR_W_FLDCHARTYPE] == "begin": 590 + attrib_type = elem.attrib.get(ATTR_W_FLDCHARTYPE[0]) or \
  591 + elem.attrib.get(ATTR_W_FLDCHARTYPE[1])
  592 + if attrib_type is not None:
  593 + if attrib_type == "begin":
589 level += 1 594 level += 1
590 - if elem.attrib[ATTR_W_FLDCHARTYPE] == "end": 595 + if attrib_type == "end":
591 level -= 1 596 level -= 1
592 if level == 0 or level == -1: # edge-case; level gets -1 597 if level == 0 or level == -1: # edge-case; level gets -1
593 all_fields.append(ddetext) 598 all_fields.append(ddetext)
@@ -595,7 +600,7 @@ def process_docx(filepath, field_filter_mode=None): @@ -595,7 +600,7 @@ def process_docx(filepath, field_filter_mode=None):
595 level = 0 # reset edge-case 600 level = 0 # reset edge-case
596 601
597 # concatenate the text of the field, if present: 602 # concatenate the text of the field, if present:
598 - if elem.tag == TAG_W_INSTRTEXT and elem.text is not None: 603 + if elem.tag in TAG_W_INSTRTEXT and elem.text is not None:
599 # expand field code if QUOTED 604 # expand field code if QUOTED
600 ddetext += unquote(elem.text) 605 ddetext += unquote(elem.text)
601 606