Commit ba871b6b02b00b94bc92412a629c39a6b999dbb9

Authored by Etienne Stalmans
1 parent dae3f0f3

This tries to group all field codes together. It parses the document body and finds the "begin" and "end" tags. All field codes
within these tags will be grouped together.

Should account for nested field codes as well. Values like { SET a { QUOTE 65 65 65 65 } } should be parsed out correctly.
Showing 1 changed file with 61 additions and 13 deletions
oletools/msodde.py
... ... @@ -46,12 +46,15 @@ from __future__ import print_function
46 46 # CHANGELOG:
47 47 # 2017-10-18 v0.52 PL: - first version
48 48 # 2017-10-20 PL: - fixed issue #202 (handling empty xml tags)
  49 +# 2017-10-23 PL: - add check for fldSimple codes
  50 +# 2017-10-24 PL: - group tags and track begin/end tags to keep DDE strings together
49 51  
50 52 __version__ = '0.52dev2'
51 53  
52 54 #------------------------------------------------------------------------------
53 55 # TODO: detect beginning/end of fields, to separate each field
54 56 # TODO: test if DDE links can also appear in headers, footers and other places
  57 +# TODO: field codes can be in headers/footers/comments - parse these
55 58 # TODO: add xlsx support
56 59  
57 60 #------------------------------------------------------------------------------
... ... @@ -77,18 +80,20 @@ import sys
77 80  
78 81  
79 82 NS_WORD = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
80   -
  83 +NO_QUOTES = False
81 84 # XML tag for 'w:instrText'
82 85 TAG_W_INSTRTEXT = '{%s}instrText' % NS_WORD
83 86 TAG_W_FLDSIMPLE = '{%s}fldSimple' % NS_WORD
84   -TAG_W_INSTRATTR= '{%s}instr' % NS_WORD
  87 +TAG_W_FLDCHAR = '{%s}fldChar' % NS_WORD
  88 +ATTR_W_INSTR = '{%s}instr' % NS_WORD
  89 +ATTR_W_FLDCHARTYPE = '{%s}fldCharType' % NS_WORD
85 90  
86 91 # === FUNCTIONS ==============================================================
87 92  
88 93 def process_args():
89 94 parser = argparse.ArgumentParser(description='A python tool to detect and extract DDE links in MS Office files')
90 95 parser.add_argument("filepath", help="path of the file to be analyzed")
91   -
  96 + parser.add_argument("--nounquote", help="don't unquote values",action='store_true')
92 97 args = parser.parse_args()
93 98  
94 99 if not os.path.exists(args.filepath):
... ... @@ -105,22 +110,61 @@ def process_file(filepath):
105 110 z.close()
106 111 # parse the XML data:
107 112 root = ET.fromstring(data)
108   - text = u''
109   - # find all the tags 'w:instrText':
  113 + fields = []
  114 + ddetext = u''
  115 + level = 0
  116 + # find all the tags 'w:p':
  117 + # parse each for begin and end tags, to group DDE strings
  118 + # fldChar can be in either a w:r element or floating alone in the w:p
  119 + # escape DDE if quoted etc
110 120 # (each is a chunk of a DDE link)
111   - for elem in root.iter(TAG_W_INSTRTEXT):
112   - # concatenate the text of the field, if present:
113   - if elem.text is not None:
114   - text += elem.text
  121 + for subs in root.iter("{%s}p"%NS_WORD):
  122 + for e in subs:
  123 + #check if w:r and if it is parse children elements to pull out the first FLDCHAR or INSTRTEXT
  124 + if e.tag == "{%s}r"%NS_WORD:
  125 + for child in e:
  126 + if child.tag == TAG_W_FLDCHAR or child.tag == TAG_W_INSTRTEXT:
  127 + elem = child
  128 + break
  129 + else:
  130 + elem = e
  131 + #this should be an error condition
  132 + if elem is None:
  133 + continue
  134 +
  135 + #check if FLDCHARTYPE and whether "begin" or "end" tag
  136 + if elem.attrib.get(ATTR_W_FLDCHARTYPE) is not None:
  137 + if elem.attrib[ATTR_W_FLDCHARTYPE] == "begin":
  138 + level += 1
  139 + if elem.attrib[ATTR_W_FLDCHARTYPE] == "end":
  140 + level -= 1
  141 + if level == 0:
  142 + fields.append(ddetext)
  143 + ddetext = u''
  144 +
  145 + # concatenate the text of the field, if present:
  146 + if elem.tag == TAG_W_INSTRTEXT and elem.text is not None:
  147 + #expand field code if QUOTED
  148 + ddetext += unquote(elem.text)
115 149  
116 150 for elem in root.iter(TAG_W_FLDSIMPLE):
117 151 # concatenate the attribute of the field, if present:
118 152 if elem.attrib is not None:
119   - text += elem.attrib[TAG_W_INSTRATTR]
  153 + fields.append(elem.attrib[ATTR_W_INSTR])
120 154  
121 155  
122   - return text
  156 + return fields
123 157  
def unquote(field):
    """Expand a ``QUOTE`` field code made of decimal character codes.

    A field such as ``QUOTE 65 66 67`` is turned into ``"ABC"``: the first
    whitespace-separated token (the keyword) is dropped and each remaining
    token is converted with ``chr(int(token))``.

    :param field: text of a field code (or of one chunk of it)
    :return: the decoded string, or ``field`` unchanged when it does not
        contain ``QUOTE``, when unquoting is disabled through the
        module-level ``NO_QUOTES`` flag, or when the tokens after the
        keyword are not all valid decimal character codes
    """
    # Nothing to expand, or expansion disabled by the user.
    if "QUOTE" not in field or NO_QUOTES:
        return field
    # split() with no argument strips the ends and collapses runs of
    # whitespace, so repeated spaces do not yield empty tokens.
    parts = field.split()
    try:
        return "".join(chr(int(p)) for p in parts[1:])
    except (ValueError, OverflowError):
        # Non-numeric token or code point out of range: this is not a plain
        # numeric QUOTE field (e.g. a partial or nested field code). Keep
        # the raw text instead of aborting the whole document scan.
        return field
124 168  
125 169 #=== MAIN =================================================================
126 170  
... ... @@ -133,9 +177,13 @@ def main():
133 177  
134 178 args = process_args()
135 179 print('Opening file: %s' % args.filepath)
136   - text = process_file(args.filepath)
  180 + if args.nounquote :
  181 + global NO_QUOTES
  182 + NO_QUOTES = True
  183 + fields = process_file(args.filepath)
137 184 print ('DDE Links:')
138   - print(text)
  185 + for f in fields:
  186 + print(f)
139 187  
140 188  
141 189 if __name__ == '__main__':
... ...