def duration_str(duration):
    """ render a duration given in seconds as a short human-readable string

    The value is promoted to the next larger unit each time it exceeds a
    threshold: more than 90 s is shown in minutes, more than 90 min in hours
    and more than 72 h in days. The result is the number with one decimal
    place, immediately followed by the unit (e.g. '1.5min').
    """
    # each step: (limit in the current unit, divisor to next unit, next unit)
    promotions = (
        (90, 60., 'min'),
        (90, 60., 'h'),
        (72, 24., 'days'),
    )
    amount = duration
    label = 's'
    for limit, divisor, bigger_unit in promotions:
        if not amount > limit:
            break    # small enough to be shown in the current unit
        amount /= divisor
        label = bigger_unit
    return '{0:.1f}{1}'.format(amount, label)
* self.index / self.size + time_per_index = (now - start_time) / float(self.index) + finish_estim = float(self.size - self.index) * time_per_index + + log.debug('After {0} finished {1:4.1f}% of current file ({2} bytes); ' + 'will finish in approx {3}' + .format(duration_str(now-start_time), percent_done, + self.size, duration_str(finish_estim))) + def parse(self): """ Parse the RTF data @@ -368,8 +399,13 @@ class RtfParser(object): """ # Start at beginning of data self.index = 0 + start_time = time() + last_report = start_time # Loop until the end while self.index < self.size: + if time() - last_report > 15: # report every 15s + self._report_progress(start_time) + last_report = time() if self.data[self.index] == BRACE_OPEN: # Found an opening brace "{": Start of a group self._open_group() @@ -386,9 +422,7 @@ class RtfParser(object): # NOTE: the full length of the control word + its optional integer parameter # is limited by MS Word at 253 characters, so we have to run the regex # on a cropped string: - data_cropped = self.data[self.index:] - if len(data_cropped)>253: - data_cropped = data_cropped[:254] + data_cropped = self.data[self.index:self.index+254] # append a space so that the regex can check the following character: data_cropped += b' ' # m = re_control_word.match(self.data, self.index, self.index+253)