Commit 1cf591ddf5ec2da343d6ddc3f73383e8f1bdcdc8

Authored by Philippe Lagadec
Committed by GitHub
2 parents 2685c6f5 8adff0e0

Merge pull request #63 from sebdraven/master

python3.5 compliant only
oletools/oleobj.py
... ... @@ -162,7 +162,7 @@ def read_LengthPrefixedAnsiString(data):
162 162 ansi_string = data[:length-1]
163 163 # TODO: only in strict mode:
164 164 # check the presence of the null char:
165   - assert data[length] == '\x00'
  165 + assert data[length] == 0
166 166 new_data = data[length:]
167 167 return (ansi_string, new_data)
168 168  
... ... @@ -214,14 +214,14 @@ class OleNativeStream (object):
214 214 # log.debug('OLE native data size = {0:08X} ({0} bytes)'.format(self.native_data_size))
215 215 # I thought this might be an OLE type specifier ???
216 216 self.unknown_short, data = read_uint16(data)
217   - self.filename, data = data.split('\x00', 1)
  217 + self.filename, data = data.split(b'\x00', 1)
218 218 # source path
219   - self.src_path, data = data.split('\x00', 1)
  219 + self.src_path, data = data.split(b'\x00', 1)
220 220 # TODO I bet these next 8 bytes are a timestamp => FILETIME from olefile
221 221 self.unknown_long_1, data = read_uint32(data)
222 222 self.unknown_long_2, data = read_uint32(data)
223 223 # temp path?
224   - self.temp_path, data = data.split('\x00', 1)
  224 + self.temp_path, data = data.split(b'\x00', 1)
225 225 # size of the rest of the data
226 226 self.actual_size, data = read_uint32(data)
227 227 self.data = data[0:self.actual_size]
... ...
oletools/rtfobj.py
... ... @@ -120,7 +120,7 @@ log = get_logger('rtfobj')
120 120 # REGEX pattern to extract embedded OLE objects in hexadecimal format:
121 121  
122 122 # alphanum digit: [0-9A-Fa-f]
123   -HEX_DIGIT = r'[0-9A-Fa-f]'
  123 +HEX_DIGIT = rb'[0-9A-Fa-f]'
124 124  
125 125 # hex char = two alphanum digits: [0-9A-Fa-f]{2}
126 126 # HEX_CHAR = r'[0-9A-Fa-f]{2}'
... ... @@ -130,11 +130,11 @@ HEX_DIGIT = r'[0-9A-Fa-f]'
130 130 # AND the tags can be nested...
131 131 #SINGLE_RTF_TAG = r'[{][^{}]*[}]'
132 132 # Actually RTF tags may contain braces escaped with backslash (\{ \}):
133   -SINGLE_RTF_TAG = r'[{](?:\\.|[^{}\\])*[}]'
  133 +SINGLE_RTF_TAG = rb'[{](?:\\.|[^{}\\])*[}]'
134 134  
135 135 # Nested tags, two levels (because Python's re does not support nested matching):
136 136 # NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]'
137   -NESTED_RTF_TAG = r'[{](?:\\.|[^{}\\]|'+SINGLE_RTF_TAG+r')*[}]'
  137 +NESTED_RTF_TAG = rb'[{](?:\\.|[^{}\\]|'+SINGLE_RTF_TAG+b')*[}]'
138 138  
139 139 # AND it is also allowed to insert ANY control word or control symbol (ignored)
140 140 # According to Rich Text Format (RTF) Specification Version 1.9.1,
... ... @@ -146,7 +146,7 @@ NESTED_RTF_TAG = r'[{](?:\\.|[^{}\\]|'+SINGLE_RTF_TAG+r')*[}]'
146 146 # "\AnyThing " "\AnyThing123z" ""\AnyThing-456{" "\AnyThing{"
147 147 # control symbol = \<any char except letter or digit> (followed by anything)
148 148  
149   -ASCII_NAME = r'([a-zA-Z]{1,250})'
  149 +ASCII_NAME = rb'([a-zA-Z]{1,250})'
150 150  
151 151 # using Python's re lookahead assumption:
152 152 # (?=...) Matches if ... matches next, but doesn't consume any of the string.
... ... @@ -155,20 +155,21 @@ ASCII_NAME = r'([a-zA-Z]{1,250})'
155 155  
156 156 # TODO: Find the actual limit on the number of digits for Word
157 157 # SIGNED_INTEGER = r'(-?\d{1,250})'
158   -SIGNED_INTEGER = r'(-?\d+)'
  158 +SIGNED_INTEGER = rb'(-?\d+)'
  159 +
  160 +CONTROL_WORD = rb'(?:\\' + ASCII_NAME + rb'(?:(?=[^a-zA-Z0-9-])|' + SIGNED_INTEGER + rb'(?=[^0-9])))'
159 161  
160   -CONTROL_WORD = r'(?:\\' + ASCII_NAME + r'(?:(?=[^a-zA-Z0-9-])|' + SIGNED_INTEGER + r'(?=[^0-9])))'
161 162 re_control_word = re.compile(CONTROL_WORD)
162 163  
163   -CONTROL_SYMBOL = r'(?:\\[^a-zA-Z0-9])'
  164 +CONTROL_SYMBOL = rb'(?:\\[^a-zA-Z0-9])'
164 165 re_control_symbol = re.compile(CONTROL_SYMBOL)
165 166  
166 167 # Text that is not a control word/symbol or a group:
167   -TEXT = r'[^{}\\]+'
  168 +TEXT = rb'[^{}\\]+'
168 169 re_text = re.compile(TEXT)
169 170  
170 171 # ignored whitespaces and tags within a hex block:
171   -IGNORED = r'(?:\s|'+NESTED_RTF_TAG+'|'+CONTROL_SYMBOL+'|'+CONTROL_WORD+r')*'
  172 +IGNORED = rb'(?:\s|'+NESTED_RTF_TAG+rb'|'+CONTROL_SYMBOL+rb'|'+CONTROL_WORD+rb')*'
172 173 #IGNORED = r'\s*'
173 174  
174 175 # HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT
... ... @@ -188,7 +189,7 @@ IGNORED = r'(?:\s|'+NESTED_RTF_TAG+'|'+CONTROL_SYMBOL+'|'+CONTROL_WORD+r')*'
188 189  
189 190 #TODO PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b'
190 191 # PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}' #+ HEX_CHAR + r'\b'
191   -PATTERN = r'\b(?:' + HEX_DIGIT + IGNORED + r'){7,}' + HEX_DIGIT + r'\b'
  192 +PATTERN = rb'\b(?:' + HEX_DIGIT + IGNORED + rb'){7,}' + HEX_DIGIT + rb'\b'
192 193  
193 194 # at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s*
194 195 # PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}'
... ... @@ -196,19 +197,19 @@ PATTERN = r'\b(?:' + HEX_DIGIT + IGNORED + r'){7,}' + HEX_DIGIT + r'\b'
196 197 #PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}'
197 198  
198 199 # a dummy translation table for str.translate, which does not change anything:
199   -TRANSTABLE_NOCHANGE = string.maketrans('', '')
  200 +TRANSTABLE_NOCHANGE = bytes.maketrans(b'', b'')
200 201  
201 202 re_hexblock = re.compile(PATTERN)
202 203 re_embedded_tags = re.compile(IGNORED)
203   -re_decimal = re.compile(r'\d+')
  204 +re_decimal = re.compile(rb'\d+')
204 205  
205   -re_delimiter = re.compile(r'[ \t\r\n\f\v]')
  206 +re_delimiter = re.compile(rb'[ \t\r\n\f\v]')
206 207  
207   -DELIMITER = r'[ \t\r\n\f\v]'
208   -DELIMITERS_ZeroOrMore = r'[ \t\r\n\f\v]*'
209   -BACKSLASH_BIN = r'\\bin'
  208 +DELIMITER = rb'[ \t\r\n\f\v]'
  209 +DELIMITERS_ZeroOrMore = rb'[ \t\r\n\f\v]*'
  210 +BACKSLASH_BIN = rb'\\bin'
210 211 # According to my tests, Word accepts up to 250 digits (leading zeroes)
211   -DECIMAL_GROUP = r'(\d{1,250})'
  212 +DECIMAL_GROUP = rb'(\d{1,250})'
212 213  
213 214 re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + BACKSLASH_BIN
214 215 + DECIMAL_GROUP + DELIMITER)
... ... @@ -216,36 +217,36 @@ re_delim_hexblock = re.compile(DELIMITER + PATTERN)
216 217  
217 218 # Destination Control Words, according to MS RTF Specifications v1.9.1:
218 219 DESTINATION_CONTROL_WORDS = frozenset((
219   - "aftncn", "aftnsep", "aftnsepc", "annotation", "atnauthor", "atndate", "atnicn", "atnid", "atnparent", "atnref",
220   - "atntime", "atrfend", "atrfstart", "author", "background", "bkmkend", "bkmkstart", "blipuid", "buptim", "category",
221   - "colorschememapping", "colortbl", "comment", "company", "creatim", "datafield", "datastore", "defchp", "defpap",
222   - "do", "doccomm", "docvar", "dptxbxtext", "ebcend", "ebcstart", "factoidname", "falt", "fchars", "ffdeftext",
223   - "ffentrymcr", "ffexitmcr", "ffformat", "ffhelptext", "ffl", "ffname", "ffstattext", "field", "file", "filetbl",
224   - "fldinst", "fldrslt", "fldtype", "fname", "fontemb", "fontfile", "fonttbl", "footer", "footerf", "footerl",
225   - "footerr", "footnote", "formfield", "ftncn", "ftnsep", "ftnsepc", "g", "generator", "gridtbl", "header", "headerf",
226   - "headerl", "headerr", "hl", "hlfr", "hlinkbase", "hlloc", "hlsrc", "hsv", "htmltag", "info", "keycode", "keywords",
227   - "latentstyles", "lchars", "levelnumbers", "leveltext", "lfolevel", "linkval", "list", "listlevel", "listname",
228   - "listoverride", "listoverridetable", "listpicture", "liststylename", "listtable", "listtext", "lsdlockedexcept",
229   - "macc", "maccPr", "mailmerge", "maln", "malnScr", "manager", "margPr", "mbar", "mbarPr", "mbaseJc", "mbegChr",
230   - "mborderBox", "mborderBoxPr", "mbox", "mboxPr", "mchr", "mcount", "mctrlPr", "md", "mdeg", "mdegHide", "mden",
231   - "mdiff", "mdPr", "me", "mendChr", "meqArr", "meqArrPr", "mf", "mfName", "mfPr", "mfunc", "mfuncPr", "mgroupChr",
232   - "mgroupChrPr", "mgrow", "mhideBot", "mhideLeft", "mhideRight", "mhideTop", "mhtmltag", "mlim", "mlimloc", "mlimlow",
233   - "mlimlowPr", "mlimupp", "mlimuppPr", "mm", "mmaddfieldname", "mmath", "mmathPict", "mmathPr", "mmaxdist", "mmc",
234   - "mmcJc", "mmconnectstr", "mmconnectstrdata", "mmcPr", "mmcs", "mmdatasource", "mmheadersource", "mmmailsubject",
235   - "mmodso", "mmodsofilter", "mmodsofldmpdata", "mmodsomappedname", "mmodsoname", "mmodsorecipdata", "mmodsosort",
236   - "mmodsosrc", "mmodsotable", "mmodsoudl", "mmodsoudldata", "mmodsouniquetag", "mmPr", "mmquery", "mmr", "mnary",
237   - "mnaryPr", "mnoBreak", "mnum", "mobjDist", "moMath", "moMathPara", "moMathParaPr", "mopEmu", "mphant", "mphantPr",
238   - "mplcHide", "mpos", "mr", "mrad", "mradPr", "mrPr", "msepChr", "mshow", "mshp", "msPre", "msPrePr", "msSub",
239   - "msSubPr", "msSubSup", "msSubSupPr", "msSup", "msSupPr", "mstrikeBLTR", "mstrikeH", "mstrikeTLBR", "mstrikeV",
240   - "msub", "msubHide", "msup", "msupHide", "mtransp", "mtype", "mvertJc", "mvfmf", "mvfml", "mvtof", "mvtol",
241   - "mzeroAsc", "mzeroDesc", "mzeroWid", "nesttableprops", "nextfile", "nonesttables", "objalias", "objclass",
242   - "objdata", "object", "objname", "objsect", "objtime", "oldcprops", "oldpprops", "oldsprops", "oldtprops",
243   - "oleclsid", "operator", "panose", "password", "passwordhash", "pgp", "pgptbl", "picprop", "pict", "pn", "pnseclvl",
244   - "pntext", "pntxta", "pntxtb", "printim", "private", "propname", "protend", "protstart", "protusertbl", "pxe",
245   - "result", "revtbl", "revtim", "rsidtbl", "rtf", "rxe", "shp", "shpgrp", "shpinst", "shppict", "shprslt", "shptxt",
246   - "sn", "sp", "staticval", "stylesheet", "subject", "sv", "svb", "tc", "template", "themedata", "title", "txe", "ud",
247   - "upr", "userprops", "wgrffmtfilter", "windowcaption", "writereservation", "writereservhash", "xe", "xform",
248   - "xmlattrname", "xmlattrvalue", "xmlclose", "xmlname", "xmlnstbl", "xmlopen"
  220 + b"aftncn", b"aftnsep", b"aftnsepc", b"annotation", b"atnauthor", b"atndate", b"atnicn", b"atnid", b"atnparent", b"atnref",
  221 + b"atntime", b"atrfend", b"atrfstart", b"author", b"background", b"bkmkend", b"bkmkstart", b"blipuid", b"buptim", b"category",
  222 + b"colorschememapping", b"colortbl", b"comment", b"company", b"creatim", b"datafield", b"datastore", b"defchp", b"defpap",
  223 + b"do", b"doccomm", b"docvar", b"dptxbxtext", b"ebcend", b"ebcstart", b"factoidname", b"falt", b"fchars", b"ffdeftext",
  224 + b"ffentrymcr", b"ffexitmcr", b"ffformat", b"ffhelptext", b"ffl", b"ffname",b"ffstattext", b"field", b"file", b"filetbl",
  225 + b"fldinst", b"fldrslt", b"fldtype", b"fname", b"fontemb", b"fontfile", b"fonttbl", b"footer", b"footerf", b"footerl",
  226 + b"footerr", b"footnote", b"formfield", b"ftncn", b"ftnsep", b"ftnsepc", b"g", b"generator", b"gridtbl", b"header", b"headerf",
  227 + b"headerl", b"headerr", b"hl", b"hlfr", b"hlinkbase", b"hlloc", b"hlsrc", b"hsv", b"htmltag", b"info", b"keycode", b"keywords",
  228 + b"latentstyles", b"lchars", b"levelnumbers", b"leveltext", b"lfolevel", b"linkval", b"list", b"listlevel", b"listname",
  229 + b"listoverride", b"listoverridetable", b"listpicture", b"liststylename", b"listtable", b"listtext", b"lsdlockedexcept",
  230 + b"macc", b"maccPr", b"mailmerge", b"maln",b"malnScr", b"manager", b"margPr", b"mbar", b"mbarPr", b"mbaseJc", b"mbegChr",
  231 + b"mborderBox", b"mborderBoxPr", b"mbox", b"mboxPr", b"mchr", b"mcount", b"mctrlPr", b"md", b"mdeg", b"mdegHide", b"mden",
  232 + b"mdiff", b"mdPr", b"me", b"mendChr", b"meqArr", b"meqArrPr", b"mf", b"mfName", b"mfPr", b"mfunc", b"mfuncPr",b"mgroupChr",
  233 + b"mgroupChrPr",b"mgrow", b"mhideBot", b"mhideLeft", b"mhideRight", b"mhideTop", b"mhtmltag", b"mlim", b"mlimloc", b"mlimlow",
  234 + b"mlimlowPr", b"mlimupp", b"mlimuppPr", b"mm", b"mmaddfieldname", b"mmath", b"mmathPict", b"mmathPr",b"mmaxdist", b"mmc",
  235 + b"mmcJc", b"mmconnectstr", b"mmconnectstrdata", b"mmcPr", b"mmcs", b"mmdatasource", b"mmheadersource", b"mmmailsubject",
  236 + b"mmodso", b"mmodsofilter", b"mmodsofldmpdata", b"mmodsomappedname", b"mmodsoname", b"mmodsorecipdata", b"mmodsosort",
  237 + b"mmodsosrc", b"mmodsotable", b"mmodsoudl", b"mmodsoudldata", b"mmodsouniquetag", b"mmPr", b"mmquery", b"mmr", b"mnary",
  238 + b"mnaryPr", b"mnoBreak", b"mnum", b"mobjDist", b"moMath", b"moMathPara", b"moMathParaPr", b"mopEmu", b"mphant", b"mphantPr",
  239 + b"mplcHide", b"mpos", b"mr", b"mrad", b"mradPr", b"mrPr", b"msepChr", b"mshow", b"mshp", b"msPre", b"msPrePr", b"msSub",
  240 + b"msSubPr", b"msSubSup", b"msSubSupPr", b"msSup", b"msSupPr", b"mstrikeBLTR", b"mstrikeH", b"mstrikeTLBR", b"mstrikeV",
  241 + b"msub", b"msubHide", b"msup", b"msupHide", b"mtransp", b"mtype", b"mvertJc", b"mvfmf", b"mvfml", b"mvtof", b"mvtol",
  242 + b"mzeroAsc", b"mzeroDesc", b"mzeroWid", b"nesttableprops", b"nextfile", b"nonesttables", b"objalias", b"objclass",
  243 + b"objdata", b"object", b"objname", b"objsect", b"objtime", b"oldcprops", b"oldpprops", b"oldsprops", b"oldtprops",
  244 + b"oleclsid", b"operator", b"panose", b"password", b"passwordhash", b"pgp", b"pgptbl", b"picprop", b"pict", b"pn", b"pnseclvl",
  245 + b"pntext", b"pntxta", b"pntxtb", b"printim", b"private", b"propname", b"protend", b"protstart", b"protusertbl", b"pxe",
  246 + b"result", b"revtbl", b"revtim", b"rsidtbl", b"rtf", b"rxe", b"shp", b"shpgrp", b"shpinst", b"shppict", b"shprslt", b"shptxt",
  247 + b"sn", b"sp", b"staticval", b"stylesheet", b"subject", b"sv", b"svb", b"tc", b"template", b"themedata", b"title", b"txe", b"ud",
  248 + b"upr", b"userprops", b"wgrffmtfilter", b"windowcaption", b"writereservation", b"writereservhash", b"xe", b"xform",
  249 + b"xmlattrname", b"xmlattrvalue", b"xmlclose", b"xmlname", b"xmlnstbl", b"xmlopen"
249 250 ))
250 251  
251 252  
... ... @@ -258,7 +259,7 @@ class Destination(object):
258 259 """
259 260 def __init__(self, cword=None):
260 261 self.cword = cword
261   - self.data = ''
  262 + self.data = b''
262 263 self.start = None
263 264 self.end = None
264 265 self.group_level = 0
... ... @@ -293,15 +294,15 @@ class RtfParser(object):
293 294 def parse(self):
294 295 self.index = 0
295 296 while self.index < self.size:
296   - if self.data[self.index] == '{':
  297 + if self.data[self.index] == ord('{'):
297 298 self._open_group()
298 299 self.index += 1
299 300 continue
300   - if self.data[self.index] == '}':
  301 + if self.data[self.index] == ord('}'):
301 302 self._close_group()
302 303 self.index += 1
303 304 continue
304   - if self.data[self.index] == '\\':
  305 + if self.data[self.index] == ord('\\'):
305 306 m = re_control_word.match(self.data, self.index)
306 307 if m:
307 308 cword = m.group(1)
... ... @@ -312,7 +313,7 @@ class RtfParser(object):
312 313 self._control_word(m, cword, param)
313 314 self.index += len(m.group())
314 315 # if it's \bin, call _bin after updating index
315   - if cword == 'bin':
  316 + if cword == b'bin':
316 317 self._bin(m, param)
317 318 continue
318 319 m = re_control_symbol.match(self.data, self.index)
... ... @@ -450,19 +451,19 @@ class RtfObjParser(RtfParser):
450 451 self.fname_prefix = fname_prefix
451 452  
452 453 def open_destination(self, destination):
453   - if destination.cword == 'objdata':
  454 + if destination.cword == b'objdata':
454 455 log.debug('*** Start object data at index %Xh' % destination.start)
455 456  
456 457 def close_destination(self, destination):
457   - if destination.cword == 'objdata':
  458 + if destination.cword == b'objdata':
458 459 log.debug('*** Close object data at index %Xh' % self.index)
459 460 # Filter out all whitespaces first (just ignored):
460   - hexdata1 = destination.data.translate(TRANSTABLE_NOCHANGE, ' \t\r\n\f\v')
  461 + hexdata1 = destination.data.translate(TRANSTABLE_NOCHANGE, b' \t\r\n\f\v')
461 462 # Then filter out any other non-hex character:
462   - hexdata = re.sub(r'[^a-hA-H0-9]', '', hexdata1)
  463 + hexdata = re.sub(b'[^a-hA-H0-9]', b'', hexdata1)
463 464 if len(hexdata) < len(hexdata1):
464 465 # this is only for debugging:
465   - nonhex = re.sub(r'[a-hA-H0-9]', '', hexdata1)
  466 + nonhex = re.sub(b'[a-hA-H0-9]', b'', hexdata1)
466 467 log.debug('Found non-hex chars in hexdata: %r' % nonhex)
467 468 # MS Word accepts an extra hex digit, so we need to trim it if present:
468 469 if len(hexdata) & 1:
... ... @@ -485,9 +486,9 @@ class RtfObjParser(RtfParser):
485 486 print('data size = %d' % obj.data_size)
486 487 # set a file extension according to the class name:
487 488 class_name = obj.class_name.lower()
488   - if class_name.startswith('word'):
  489 + if class_name.startswith(b'word'):
489 490 ext = 'doc'
490   - elif class_name.startswith('package'):
  491 + elif class_name.startswith(b'package'):
491 492 ext = 'package'
492 493 else:
493 494 ext = 'bin'
... ...