Commit 544ecdf7acae43db7a044aba0685b4aadfbfa24e
Committed by
GitHub
Merge pull request #244 from christian-intra2net/speed-up-rtfobj
Speed up rtfobj
Showing
1 changed file
with
37 additions
and
3 deletions
oletools/rtfobj.py
| @@ -95,6 +95,7 @@ __version__ = '0.52dev12' | @@ -95,6 +95,7 @@ __version__ = '0.52dev12' | ||
| 95 | 95 | ||
| 96 | import re, os, sys, binascii, logging, optparse | 96 | import re, os, sys, binascii, logging, optparse |
| 97 | import os.path | 97 | import os.path |
| 98 | +from time import time | ||
| 98 | 99 | ||
| 99 | # IMPORTANT: it should be possible to run oletools directly as scripts | 100 | # IMPORTANT: it should be possible to run oletools directly as scripts |
| 100 | # in any directory without installing them with pip or setup.py. | 101 | # in any directory without installing them with pip or setup.py. |
| @@ -314,6 +315,22 @@ else: | @@ -314,6 +315,22 @@ else: | ||
| 314 | RTF_MAGIC = b'\x7b\\rt' # \x7b == b'{' but does not mess up auto-indent | 315 | RTF_MAGIC = b'\x7b\\rt' # \x7b == b'{' but does not mess up auto-indent |
| 315 | 316 | ||
| 316 | 317 | ||
def duration_str(duration):
    """
    Create a human-readable string representation of a duration.

    The value is scaled one step at a time: seconds become minutes above
    90 s, minutes become hours above 90 min and hours become days above
    72 h. A step is only considered if the previous one was taken, so a
    value that stays in seconds is never compared against the limits that
    are meant for minutes or hours.

    :param duration: duration in seconds
    :returns: the scaled value with one decimal place, directly followed by
              its unit ('s', 'min', 'h' or 'days'), e.g. '2.0min'
    """
    value = duration
    unit = 's'
    # each entry: (limit in the current unit, divisor to reach the next
    # unit, name of the next unit)
    for limit, divisor, next_unit in ((90, 60., 'min'),
                                      (90, 60., 'h'),
                                      (72, 24., 'days')):
        if not value > limit:
            break   # small enough for the current unit: stop scaling
        value /= divisor
        unit = next_unit
    return '{0:.1f}{1}'.format(value, unit)
| 332 | + | ||
| 333 | + | ||
| 317 | #=== CLASSES ================================================================= | 334 | #=== CLASSES ================================================================= |
| 318 | 335 | ||
| 319 | class Destination(object): | 336 | class Destination(object): |
| @@ -360,6 +377,20 @@ class RtfParser(object): | @@ -360,6 +377,20 @@ class RtfParser(object): | ||
| 360 | self.destinations = [document_destination] | 377 | self.destinations = [document_destination] |
| 361 | self.current_destination = document_destination | 378 | self.current_destination = document_destination |
| 362 | 379 | ||
def _report_progress(self, start_time):
    """
    Report progress on parsing at regular intervals.

    Logs, at debug level, the time elapsed since start_time, the percentage
    of the data processed so far (self.index relative to self.size) and a
    linear estimate of the time still needed for the rest.

    :param start_time: value of time() taken when parsing started
    :returns: None; nothing is logged when no estimate can be computed
    """
    now = float(time())
    # self.size and self.index are both used as divisors below, so return
    # early when either is zero; also skip when no time has elapsed yet
    if now == start_time or self.size == 0 or self.index == 0:
        return
    elapsed = now - start_time
    percent_done = 100. * self.index / self.size
    time_per_index = elapsed / float(self.index)
    # extrapolate the average time per index over the remaining indices
    finish_estim = float(self.size - self.index) * time_per_index

    log.debug('After {0} finished {1:4.1f}% of current file ({2} bytes); '
              'will finish in approx {3}'
              .format(duration_str(elapsed), percent_done,
                      self.size, duration_str(finish_estim)))
| 393 | + | ||
| 363 | def parse(self): | 394 | def parse(self): |
| 364 | """ | 395 | """ |
| 365 | Parse the RTF data | 396 | Parse the RTF data |
| @@ -368,8 +399,13 @@ class RtfParser(object): | @@ -368,8 +399,13 @@ class RtfParser(object): | ||
| 368 | """ | 399 | """ |
| 369 | # Start at beginning of data | 400 | # Start at beginning of data |
| 370 | self.index = 0 | 401 | self.index = 0 |
| 402 | + start_time = time() | ||
| 403 | + last_report = start_time | ||
| 371 | # Loop until the end | 404 | # Loop until the end |
| 372 | while self.index < self.size: | 405 | while self.index < self.size: |
| 406 | + if time() - last_report > 15: # report every 15s | ||
| 407 | + self._report_progress(start_time) | ||
| 408 | + last_report = time() | ||
| 373 | if self.data[self.index] == BRACE_OPEN: | 409 | if self.data[self.index] == BRACE_OPEN: |
| 374 | # Found an opening brace "{": Start of a group | 410 | # Found an opening brace "{": Start of a group |
| 375 | self._open_group() | 411 | self._open_group() |
| @@ -386,9 +422,7 @@ class RtfParser(object): | @@ -386,9 +422,7 @@ class RtfParser(object): | ||
| 386 | # NOTE: the full length of the control word + its optional integer parameter | 422 | # NOTE: the full length of the control word + its optional integer parameter |
| 387 | # is limited by MS Word at 253 characters, so we have to run the regex | 423 | # is limited by MS Word at 253 characters, so we have to run the regex |
| 388 | # on a cropped string: | 424 | # on a cropped string: |
| 389 | - data_cropped = self.data[self.index:] | ||
| 390 | - if len(data_cropped)>253: | ||
| 391 | - data_cropped = data_cropped[:254] | 425 | + data_cropped = self.data[self.index:self.index+254] |
| 392 | # append a space so that the regex can check the following character: | 426 | # append a space so that the regex can check the following character: |
| 393 | data_cropped += b' ' | 427 | data_cropped += b' ' |
| 394 | # m = re_control_word.match(self.data, self.index, self.index+253) | 428 | # m = re_control_word.match(self.data, self.index, self.index+253) |