Commit ba871b6b02b00b94bc92412a629c39a6b999dbb9
1 parent
dae3f0f3
This tries to group all field codes together. It will parse the document body and find the "begin" and "end" tags.
All field codes within these tags will be grouped together.
Should account for nested field codes as well. Values like { SET a { QUOTE 65 65 65 65 } } should be parsed out correctly.
Showing
1 changed file
with
61 additions
and
13 deletions
oletools/msodde.py
| ... | ... | @@ -46,12 +46,15 @@ from __future__ import print_function |
| 46 | 46 | # CHANGELOG: |
| 47 | 47 | # 2017-10-18 v0.52 PL: - first version |
| 48 | 48 | # 2017-10-20 PL: - fixed issue #202 (handling empty xml tags) |
| 49 | +# 2017-10-23 PL: - add check for fldSimple codes | |
| 50 | +# 2017-10-24 PL: - group tags and track begin/end tags to keep DDE strings together | |
| 49 | 51 | |
| 50 | 52 | __version__ = '0.52dev2' |
| 51 | 53 | |
| 52 | 54 | #------------------------------------------------------------------------------ |
| 53 | 55 | # TODO: detect beginning/end of fields, to separate each field |
| 54 | 56 | # TODO: test if DDE links can also appear in headers, footers and other places |
| 57 | +# TODO: field codes can be in headers/footers/comments - parse these | |
| 55 | 58 | # TODO: add xlsx support |
| 56 | 59 | |
| 57 | 60 | #------------------------------------------------------------------------------ |
| ... | ... | @@ -77,18 +80,20 @@ import sys |
| 77 | 80 | |
| 78 | 81 | |
| 79 | 82 | NS_WORD = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main' |
| 80 | - | |
| 83 | +NO_QUOTES = False | |
| 81 | 84 | # XML tag for 'w:instrText' |
| 82 | 85 | TAG_W_INSTRTEXT = '{%s}instrText' % NS_WORD |
| 83 | 86 | TAG_W_FLDSIMPLE = '{%s}fldSimple' % NS_WORD |
| 84 | -TAG_W_INSTRATTR= '{%s}instr' % NS_WORD | |
| 87 | +TAG_W_FLDCHAR = '{%s}fldChar' % NS_WORD | |
| 88 | +ATTR_W_INSTR = '{%s}instr' % NS_WORD | |
| 89 | +ATTR_W_FLDCHARTYPE = '{%s}fldCharType' % NS_WORD | |
| 85 | 90 | |
| 86 | 91 | # === FUNCTIONS ============================================================== |
| 87 | 92 | |
| 88 | 93 | def process_args(): |
| 89 | 94 | parser = argparse.ArgumentParser(description='A python tool to detect and extract DDE links in MS Office files') |
| 90 | 95 | parser.add_argument("filepath", help="path of the file to be analyzed") |
| 91 | - | |
| 96 | + parser.add_argument("--nounquote", help="don't unquote values",action='store_true') | |
| 92 | 97 | args = parser.parse_args() |
| 93 | 98 | |
| 94 | 99 | if not os.path.exists(args.filepath): |
| ... | ... | @@ -105,22 +110,61 @@ def process_file(filepath): |
| 105 | 110 | z.close() |
| 106 | 111 | # parse the XML data: |
| 107 | 112 | root = ET.fromstring(data) |
| 108 | - text = u'' | |
| 109 | - # find all the tags 'w:instrText': | |
| 113 | + fields = [] | |
| 114 | + ddetext = u'' | |
| 115 | + level = 0 | |
| 116 | + # find all the tags 'w:p': | |
| 117 | + # parse each for begin and end tags, to group DDE strings | |
| 118 | + # fldChar can be in either a w:r element or floating alone in the w:p | |
| 119 | + # escape DDE if quoted etc | |
| 110 | 120 | # (each is a chunk of a DDE link) |
| 111 | - for elem in root.iter(TAG_W_INSTRTEXT): | |
| 112 | - # concatenate the text of the field, if present: | |
| 113 | - if elem.text is not None: | |
| 114 | - text += elem.text | |
| 121 | + for subs in root.iter("{%s}p"%NS_WORD): | |
| 122 | + for e in subs: | |
| 123 | + #check if w:r and if it is parse children elements to pull out the first FLDCHAR or INSTRTEXT | |
| 124 | + if e.tag == "{%s}r"%NS_WORD: | |
| 125 | + for child in e: | |
| 126 | + if child.tag == TAG_W_FLDCHAR or child.tag == TAG_W_INSTRTEXT: | |
| 127 | + elem = child | |
| 128 | + break | |
| 129 | + else: | |
| 130 | + elem = e | |
| 131 | + #this should be an error condition | |
| 132 | + if elem is None: | |
| 133 | + continue | |
| 134 | + | |
| 135 | + #check if FLDCHARTYPE and whether "begin" or "end" tag | |
| 136 | + if elem.attrib.get(ATTR_W_FLDCHARTYPE) is not None: | |
| 137 | + if elem.attrib[ATTR_W_FLDCHARTYPE] == "begin": | |
| 138 | + level += 1 | |
| 139 | + if elem.attrib[ATTR_W_FLDCHARTYPE] == "end": | |
| 140 | + level -= 1 | |
| 141 | + if level == 0: | |
| 142 | + fields.append(ddetext) | |
| 143 | + ddetext = u'' | |
| 144 | + | |
| 145 | + # concatenate the text of the field, if present: | |
| 146 | + if elem.tag == TAG_W_INSTRTEXT and elem.text is not None: | |
| 147 | + #expand field code if QUOTED | |
| 148 | + ddetext += unquote(elem.text) | |
| 115 | 149 | |
| 116 | 150 | for elem in root.iter(TAG_W_FLDSIMPLE): |
| 117 | 151 | # concatenate the attribute of the field, if present: |
| 118 | 152 | if elem.attrib is not None: |
| 119 | - text += elem.attrib[TAG_W_INSTRATTR] | |
| 153 | + fields.append(elem.attrib[ATTR_W_INSTR]) | |
| 120 | 154 | |
| 121 | 155 | |
| 122 | - return text | |
| 156 | + return fields | |
| 123 | 157 | |
| 158 | +def unquote(field): | |
| 159 | + | |
| 160 | + if "QUOTE" not in field or NO_QUOTES: | |
| 161 | + return field | |
| 162 | + #split into components | |
| 163 | + parts = field.strip().split(" ") | |
| 164 | + ddestr = "" | |
| 165 | + for p in parts[1:]: | |
| 166 | + ddestr += chr(int(p)) | |
| 167 | + return ddestr | |
| 124 | 168 | |
| 125 | 169 | #=== MAIN ================================================================= |
| 126 | 170 | |
| ... | ... | @@ -133,9 +177,13 @@ def main(): |
| 133 | 177 | |
| 134 | 178 | args = process_args() |
| 135 | 179 | print('Opening file: %s' % args.filepath) |
| 136 | - text = process_file(args.filepath) | |
| 180 | + if args.nounquote : | |
| 181 | + global NO_QUOTES | |
| 182 | + NO_QUOTES = True | |
| 183 | + fields = process_file(args.filepath) | |
| 137 | 184 | print ('DDE Links:') |
| 138 | - print(text) | |
| 185 | + for f in fields: | |
| 186 | + print(f) | |
| 139 | 187 | |
| 140 | 188 | |
| 141 | 189 | if __name__ == '__main__': | ... | ... |