Commit 544ecdf7acae43db7a044aba0685b4aadfbfa24e
Committed by GitHub
Merge pull request #244 from christian-intra2net/speed-up-rtfobj
Speed up rtfobj
Showing 1 changed file with 37 additions and 3 deletions
oletools/rtfobj.py
| ... | ... | @@ -95,6 +95,7 @@ __version__ = '0.52dev12' |
| 95 | 95 | |
| 96 | 96 | import re, os, sys, binascii, logging, optparse |
| 97 | 97 | import os.path |
| 98 | +from time import time | |
| 98 | 99 | |
| 99 | 100 | # IMPORTANT: it should be possible to run oletools directly as scripts |
| 100 | 101 | # in any directory without installing them with pip or setup.py. |
| ... | ... | @@ -314,6 +315,22 @@ else: |
| 314 | 315 | RTF_MAGIC = b'\x7b\\rt' # \x7b == b'{' but does not mess up auto-indent |
| 315 | 316 | |
| 316 | 317 | |
| 318 | +def duration_str(duration): | |
| 319 | + """ create a human-readable string representation of duration [s] """ | |
| 320 | + value = duration | |
| 321 | + unit = 's' | |
| 322 | + if value > 90: | |
| 323 | + value /= 60. | |
| 324 | + unit = 'min' | |
| 325 | + if value > 90: | |
| 326 | + value /= 60. | |
| 327 | + unit = 'h' | |
| 328 | + if value > 72: | |
| 329 | + value /= 24. | |
| 330 | + unit = 'days' | |
| 331 | + return '{0:.1f}{1}'.format(value, unit) | |
| 332 | + | |
| 333 | + | |
| 317 | 334 | #=== CLASSES ================================================================= |
| 318 | 335 | |
| 319 | 336 | class Destination(object): |
| ... | ... | @@ -360,6 +377,20 @@ class RtfParser(object): |
| 360 | 377 | self.destinations = [document_destination] |
| 361 | 378 | self.current_destination = document_destination |
| 362 | 379 | |
| 380 | + def _report_progress(self, start_time): | |
| 381 | + """ report progress on parsing at regular intervals """ | |
| 382 | + now = float(time()) | |
| 383 | + if now == start_time or self.size == 0: | |
| 384 | + return # avoid zero-division | |
| 385 | + percent_done = 100. * self.index / self.size | |
| 386 | + time_per_index = (now - start_time) / float(self.index) | |
| 387 | + finish_estim = float(self.size - self.index) * time_per_index | |
| 388 | + | |
| 389 | + log.debug('After {0} finished {1:4.1f}% of current file ({2} bytes); ' | |
| 390 | + 'will finish in approx {3}' | |
| 391 | + .format(duration_str(now-start_time), percent_done, | |
| 392 | + self.size, duration_str(finish_estim))) | |
| 393 | + | |
| 363 | 394 | def parse(self): |
| 364 | 395 | """ |
| 365 | 396 | Parse the RTF data |
| ... | ... | @@ -368,8 +399,13 @@ class RtfParser(object): |
| 368 | 399 | """ |
| 369 | 400 | # Start at beginning of data |
| 370 | 401 | self.index = 0 |
| 402 | + start_time = time() | |
| 403 | + last_report = start_time | |
| 371 | 404 | # Loop until the end |
| 372 | 405 | while self.index < self.size: |
| 406 | + if time() - last_report > 15: # report every 15s | |
| 407 | + self._report_progress(start_time) | |
| 408 | + last_report = time() | |
| 373 | 409 | if self.data[self.index] == BRACE_OPEN: |
| 374 | 410 | # Found an opening brace "{": Start of a group |
| 375 | 411 | self._open_group() |
| ... | ... | @@ -386,9 +422,7 @@ class RtfParser(object): |
| 386 | 422 | # NOTE: the full length of the control word + its optional integer parameter |
| 387 | 423 | # is limited by MS Word at 253 characters, so we have to run the regex |
| 388 | 424 | # on a cropped string: |
| 389 | - data_cropped = self.data[self.index:] | |
| 390 | - if len(data_cropped)>253: | |
| 391 | - data_cropped = data_cropped[:254] | |
| 425 | + data_cropped = self.data[self.index:self.index+254] | |
| 392 | 426 | # append a space so that the regex can check the following character: |
| 393 | 427 | data_cropped += b' ' |
| 394 | 428 | # m = re_control_word.match(self.data, self.index, self.index+253) | ... | ... |