Commit 00a2df089d26a2688c14df92780b49efde452ae9

Authored by Christian Herdtweck
1 parent b7926113

msodde: parse single-xml files from Word 2003

Showing 1 changed file with 23 additions and 18 deletions
oletools/msodde.py
... ... @@ -123,15 +123,16 @@ if sys.version_info[0] >= 3:
123 123  
124 124  
125 125 NS_WORD = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
  126 +NS_WORD_2003 = 'http://schemas.microsoft.com/office/word/2003/wordml'
126 127 NO_QUOTES = False
127 128 # XML tag for 'w:instrText'
128   -TAG_W_INSTRTEXT = '{%s}instrText' % NS_WORD
129   -TAG_W_FLDSIMPLE = '{%s}fldSimple' % NS_WORD
130   -TAG_W_FLDCHAR = '{%s}fldChar' % NS_WORD
131   -TAG_W_P = "{%s}p" % NS_WORD
132   -TAG_W_R = "{%s}r" % NS_WORD
133   -ATTR_W_INSTR = '{%s}instr' % NS_WORD
134   -ATTR_W_FLDCHARTYPE = '{%s}fldCharType' % NS_WORD
  129 +TAG_W_INSTRTEXT = ['{%s}instrText' % ns for ns in NS_WORD, NS_WORD_2003]
  130 +TAG_W_FLDSIMPLE = ['{%s}fldSimple' % ns for ns in NS_WORD, NS_WORD_2003]
  131 +TAG_W_FLDCHAR = ['{%s}fldChar' % ns for ns in NS_WORD, NS_WORD_2003]
  132 +TAG_W_P = ["{%s}p" % ns for ns in NS_WORD, NS_WORD_2003]
  133 +TAG_W_R = ["{%s}r" % ns for ns in NS_WORD, NS_WORD_2003]
  134 +ATTR_W_INSTR = ['{%s}instr' % ns for ns in NS_WORD, NS_WORD_2003]
  135 +ATTR_W_FLDCHARTYPE = ['{%s}fldCharType' % ns for ns in NS_WORD, NS_WORD_2003]
135 136 LOCATIONS = ('word/document.xml', 'word/endnotes.xml', 'word/footnotes.xml',
136 137 'word/header1.xml', 'word/footer1.xml', 'word/header2.xml',
137 138 'word/footer2.xml', 'word/comments.xml')
... ... @@ -558,23 +559,25 @@ def process_docx(filepath, field_filter_mode=None):
558 559 all_fields = []
559 560 level = 0
560 561 ddetext = u''
561   - for _, subs, depth in parser.iter_xml(tags=[TAG_W_P, TAG_W_FLDSIMPLE]):
  562 + for _, subs, depth in parser.iter_xml(tags=TAG_W_P + TAG_W_FLDSIMPLE):
562 563 if depth == 0: # at end of subfile:
563 564 level = 0 # reset
564   - if subs.tag == TAG_W_FLDSIMPLE:
  565 + if subs.tag in TAG_W_FLDSIMPLE:
565 566 # concatenate the attribute of the field, if present:
566   - if subs.attrib is not None:
567   - all_fields.append(unquote(subs.attrib[ATTR_W_INSTR]))
  567 + attrib_instr = subs.attrib.get(ATTR_W_INSTR[0]) or \
  568 + subs.attrib.get(ATTR_W_INSTR[1])
  569 + if attrib_instr is not None:
  570 + all_fields.append(unquote(attrib_instr))
568 571 continue
569 572  
570 573 # have a TAG_W_P
571 574 elem = None
572 575 for curr_elem in subs:
573 576 # check if w:r; parse children to pull out first FLDCHAR/INSTRTEXT
574   - if curr_elem.tag == TAG_W_R:
  577 + if curr_elem.tag in TAG_W_R:
575 578 for child in curr_elem:
576   - if child.tag == TAG_W_FLDCHAR or \
577   - child.tag == TAG_W_INSTRTEXT:
  579 + if child.tag in TAG_W_FLDCHAR or \
  580 + child.tag in TAG_W_INSTRTEXT:
578 581 elem = child
579 582 break
580 583 else:
... ... @@ -584,10 +587,12 @@ def process_docx(filepath, field_filter_mode=None):
584 587 continue
585 588  
586 589 # check if FLDCHARTYPE and whether "begin" or "end" tag
587   - if elem.attrib.get(ATTR_W_FLDCHARTYPE) is not None:
588   - if elem.attrib[ATTR_W_FLDCHARTYPE] == "begin":
  590 + attrib_type = elem.attrib.get(ATTR_W_FLDCHARTYPE[0]) or \
  591 + elem.attrib.get(ATTR_W_FLDCHARTYPE[1])
  592 + if attrib_type is not None:
  593 + if attrib_type == "begin":
589 594 level += 1
590   - if elem.attrib[ATTR_W_FLDCHARTYPE] == "end":
  595 + if attrib_type == "end":
591 596 level -= 1
592 597 if level == 0 or level == -1: # edge-case; level gets -1
593 598 all_fields.append(ddetext)
... ... @@ -595,7 +600,7 @@ def process_docx(filepath, field_filter_mode=None):
595 600 level = 0 # reset edge-case
596 601  
597 602 # concatenate the text of the field, if present:
598   - if elem.tag == TAG_W_INSTRTEXT and elem.text is not None:
  603 + if elem.tag in TAG_W_INSTRTEXT and elem.text is not None:
599 604 # expand field code if QUOTED
600 605 ddetext += unquote(elem.text)
601 606  
... ...