Commit 544ecdf7acae43db7a044aba0685b4aadfbfa24e

Authored by Philippe Lagadec
Committed by GitHub
2 parents df80ea76 5cee3675

Merge pull request #244 from christian-intra2net/speed-up-rtfobj

Speed up rtfobj
Showing 1 changed file with 37 additions and 3 deletions
oletools/rtfobj.py
@@ -95,6 +95,7 @@ __version__ = '0.52dev12' @@ -95,6 +95,7 @@ __version__ = '0.52dev12'
95 95
96 import re, os, sys, binascii, logging, optparse 96 import re, os, sys, binascii, logging, optparse
97 import os.path 97 import os.path
  98 +from time import time
98 99
99 # IMPORTANT: it should be possible to run oletools directly as scripts 100 # IMPORTANT: it should be possible to run oletools directly as scripts
100 # in any directory without installing them with pip or setup.py. 101 # in any directory without installing them with pip or setup.py.
@@ -314,6 +315,22 @@ else: @@ -314,6 +315,22 @@ else:
314 RTF_MAGIC = b'\x7b\\rt' # \x7b == b'{' but does not mess up auto-indent 315 RTF_MAGIC = b'\x7b\\rt' # \x7b == b'{' but does not mess up auto-indent
315 316
316 317
def duration_str(duration):
    """
    Create a human-readable string representation of a duration.

    The value is promoted to the next larger unit only while it exceeds the
    threshold of the current one: seconds become minutes above 90 s, minutes
    become hours above 90 min, and hours become days above 72 h. A promotion
    is only considered if the previous one took place, so e.g. 80 s stays
    '80.0s' and is never compared against the hour/day thresholds.

    :param duration: duration in seconds (int or float)
    :returns: the value with one decimal place directly followed by its unit,
              e.g. '80.0s', '1.7min', '1.7h' or '3.5days'
    """
    # (threshold in the current unit, divisor to reach the next unit, its name)
    steps = (
        (90, 60., 'min'),
        (90, 60., 'h'),
        (72, 24., 'days'),
    )
    value = duration
    unit = 's'
    for threshold, divisor, next_unit in steps:
        if not value > threshold:
            break   # current unit fits; larger units must not be tried
        value /= divisor
        unit = next_unit
    return '{0:.1f}{1}'.format(value, unit)
  332 +
  333 +
317 #=== CLASSES ================================================================= 334 #=== CLASSES =================================================================
318 335
319 class Destination(object): 336 class Destination(object):
@@ -360,6 +377,20 @@ class RtfParser(object): @@ -360,6 +377,20 @@ class RtfParser(object):
360 self.destinations = [document_destination] 377 self.destinations = [document_destination]
361 self.current_destination = document_destination 378 self.current_destination = document_destination
362 379
  380 + def _report_progress(self, start_time):
  381 + """ report progress on parsing at regular intervals """
  382 + now = float(time())
  383 + if now == start_time or self.size == 0:
  384 + return # avoid zero-division
  385 + percent_done = 100. * self.index / self.size
  386 + time_per_index = (now - start_time) / float(self.index)
  387 + finish_estim = float(self.size - self.index) * time_per_index
  388 +
  389 + log.debug('After {0} finished {1:4.1f}% of current file ({2} bytes); '
  390 + 'will finish in approx {3}'
  391 + .format(duration_str(now-start_time), percent_done,
  392 + self.size, duration_str(finish_estim)))
  393 +
363 def parse(self): 394 def parse(self):
364 """ 395 """
365 Parse the RTF data 396 Parse the RTF data
@@ -368,8 +399,13 @@ class RtfParser(object): @@ -368,8 +399,13 @@ class RtfParser(object):
368 """ 399 """
369 # Start at beginning of data 400 # Start at beginning of data
370 self.index = 0 401 self.index = 0
  402 + start_time = time()
  403 + last_report = start_time
371 # Loop until the end 404 # Loop until the end
372 while self.index < self.size: 405 while self.index < self.size:
  406 + if time() - last_report > 15: # report every 15s
  407 + self._report_progress(start_time)
  408 + last_report = time()
373 if self.data[self.index] == BRACE_OPEN: 409 if self.data[self.index] == BRACE_OPEN:
374 # Found an opening brace "{": Start of a group 410 # Found an opening brace "{": Start of a group
375 self._open_group() 411 self._open_group()
@@ -386,9 +422,7 @@ class RtfParser(object): @@ -386,9 +422,7 @@ class RtfParser(object):
386 # NOTE: the full length of the control word + its optional integer parameter 422 # NOTE: the full length of the control word + its optional integer parameter
387 # is limited by MS Word at 253 characters, so we have to run the regex 423 # is limited by MS Word at 253 characters, so we have to run the regex
388 # on a cropped string: 424 # on a cropped string:
389 - data_cropped = self.data[self.index:]  
390 - if len(data_cropped)>253:  
391 - data_cropped = data_cropped[:254] 425 + data_cropped = self.data[self.index:self.index+254]
392 # append a space so that the regex can check the following character: 426 # append a space so that the regex can check the following character:
393 data_cropped += b' ' 427 data_cropped += b' '
394 # m = re_control_word.match(self.data, self.index, self.index+253) 428 # m = re_control_word.match(self.data, self.index, self.index+253)