Commit 544ecdf7acae43db7a044aba0685b4aadfbfa24e

Authored by Philippe Lagadec
Committed by GitHub
2 parents df80ea76 5cee3675

Merge pull request #244 from christian-intra2net/speed-up-rtfobj

Speed up rtfobj
Showing 1 changed file with 37 additions and 3 deletions
oletools/rtfobj.py
... ... @@ -95,6 +95,7 @@ __version__ = '0.52dev12'
95 95  
96 96 import re, os, sys, binascii, logging, optparse
97 97 import os.path
  98 +from time import time
98 99  
99 100 # IMPORTANT: it should be possible to run oletools directly as scripts
100 101 # in any directory without installing them with pip or setup.py.
... ... @@ -314,6 +315,22 @@ else:
314 315 RTF_MAGIC = b'\x7b\\rt' # \x7b == b'{' but does not mess up auto-indent
315 316  
316 317  
def duration_str(duration):
    """
    Create a human-readable string representation of a duration.

    The value is scaled step by step: more than 90 seconds is shown in
    minutes, more than 90 minutes in hours and more than 72 hours in days.
    Negative durations (which can happen when the difference of two time()
    values is passed in and the system clock was set back in between) are
    scaled by their magnitude and keep their sign.

    :param duration: duration in seconds (int or float)
    :returns: str, the scaled value with one decimal directly followed by
              its unit ('s', 'min', 'h' or 'days'), e.g. '2.0min'
    """
    sign = -1. if duration < 0 else 1.
    value = abs(duration)
    unit = 's'
    # (threshold in current unit, divisor to next unit, name of next unit);
    # a step is only tried if all previous steps applied
    steps = ((90, 60., 'min'), (90, 60., 'h'), (72, 24., 'days'))
    for threshold, divisor, next_unit in steps:
        # written as "not >" so that NaN stays in seconds
        if not value > threshold:
            break
        value /= divisor
        unit = next_unit
    return '{0:.1f}{1}'.format(sign * value, unit)
  332 +
  333 +
317 334 #=== CLASSES =================================================================
318 335  
319 336 class Destination(object):
... ... @@ -360,6 +377,20 @@ class RtfParser(object):
360 377 self.destinations = [document_destination]
361 378 self.current_destination = document_destination
362 379  
  380 + def _report_progress(self, start_time):
  381 + """ report progress on parsing at regular intervals """
  382 + now = float(time())
  383 + if now == start_time or self.size == 0:
  384 + return # avoid zero-division
  385 + percent_done = 100. * self.index / self.size
  386 + time_per_index = (now - start_time) / float(self.index)
  387 + finish_estim = float(self.size - self.index) * time_per_index
  388 +
  389 + log.debug('After {0} finished {1:4.1f}% of current file ({2} bytes); '
  390 + 'will finish in approx {3}'
  391 + .format(duration_str(now-start_time), percent_done,
  392 + self.size, duration_str(finish_estim)))
  393 +
363 394 def parse(self):
364 395 """
365 396 Parse the RTF data
... ... @@ -368,8 +399,13 @@ class RtfParser(object):
368 399 """
369 400 # Start at beginning of data
370 401 self.index = 0
  402 + start_time = time()
  403 + last_report = start_time
371 404 # Loop until the end
372 405 while self.index < self.size:
  406 + if time() - last_report > 15: # report every 15s
  407 + self._report_progress(start_time)
  408 + last_report = time()
373 409 if self.data[self.index] == BRACE_OPEN:
374 410 # Found an opening brace "{": Start of a group
375 411 self._open_group()
... ... @@ -386,9 +422,7 @@ class RtfParser(object):
386 422 # NOTE: the full length of the control word + its optional integer parameter
387 423 # is limited by MS Word at 253 characters, so we have to run the regex
388 424 # on a cropped string:
389   - data_cropped = self.data[self.index:]
390   - if len(data_cropped)>253:
391   - data_cropped = data_cropped[:254]
  425 + data_cropped = self.data[self.index:self.index+254]
392 426 # append a space so that the regex can check the following character:
393 427 data_cropped += b' '
394 428 # m = re_control_word.match(self.data, self.index, self.index+253)
... ...