Commit e2a52831884deb9f060e98026fd7e1a8696d3e30
1 parent a410b68b
olevba: added name_str and others to VBA_Module for issue #383, a few PEP8 fixes
Showing 2 changed files with 43 additions and 34 deletions
oletools/olevba.py
| ... | ... | @@ -239,14 +239,14 @@ __version__ = '0.54dev7' |
| 239 | 239 | # - extract_macros: use combined struct.unpack instead of many calls |
| 240 | 240 | # - all except clauses should target specific exceptions |
| 241 | 241 | |
| 242 | -#------------------------------------------------------------------------------ | |
| 242 | +# ------------------------------------------------------------------------------ | |
| 243 | 243 | # REFERENCES: |
| 244 | 244 | # - [MS-OVBA]: Microsoft Office VBA File Format Structure |
| 245 | 245 | # http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx |
| 246 | 246 | # - officeparser: https://github.com/unixfreak0037/officeparser |
| 247 | 247 | |
| 248 | 248 | |
| 249 | -#--- IMPORTS ------------------------------------------------------------------ | |
| 249 | +# --- IMPORTS ------------------------------------------------------------------ | |
| 250 | 250 | |
| 251 | 251 | import sys |
| 252 | 252 | import os |
| ... | ... | @@ -263,7 +263,6 @@ import zlib |
| 263 | 263 | import email # for MHTML parsing |
| 264 | 264 | import string # for printable |
| 265 | 265 | import json # for json output mode (argument --json) |
| 266 | -import codecs | |
| 267 | 266 | |
| 268 | 267 | # import lxml or ElementTree for XML parsing: |
| 269 | 268 | try: |
| ... | ... | @@ -298,7 +297,7 @@ _thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) |
| 298 | 297 | # print('_thismodule_dir = %r' % _thismodule_dir) |
| 299 | 298 | _parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) |
| 300 | 299 | # print('_parent_dir = %r' % _thirdparty_dir) |
| 301 | -if not _parent_dir in sys.path: | |
| 300 | +if _parent_dir not in sys.path: | |
| 302 | 301 | sys.path.insert(0, _parent_dir) |
| 303 | 302 | |
| 304 | 303 | import olefile |
| ... | ... | @@ -330,38 +329,35 @@ if sys.version_info[0] <= 2: |
| 330 | 329 | # on Python 2, just use the normal ord() because items are bytes |
| 331 | 330 | byte_ord = ord |
| 332 | 331 | #: Default string encoding for the olevba API |
| 333 | - DEFAULT_API_ENCODING = 'utf8' # on Python 2: UTF-8 (bytes) | |
| 332 | + DEFAULT_API_ENCODING = 'utf8' # on Python 2: UTF-8 (bytes) | |
| 334 | 333 | else: |
| 335 | 334 | # Python 3.x+ |
| 336 | 335 | PYTHON2 = False |
| 336 | + | |
| 337 | 337 | # to use ord on bytes/bytearray items the same way in Python 2+3 |
| 338 | 338 | # on Python 3, items are int, so just return the item |
| 339 | - byte_ord = lambda x: x | |
| 339 | + def byte_ord(x): | |
| 340 | + return x | |
| 340 | 341 | # xrange is now called range: |
| 341 | 342 | xrange = range |
| 342 | 343 | # unichr does not exist anymore, only chr: |
| 343 | 344 | unichr = chr |
| 344 | 345 | from functools import reduce |
| 345 | 346 | #: Default string encoding for the olevba API |
| 346 | - DEFAULT_API_ENCODING = None # on Python 3: None (unicode) | |
| 347 | - | |
| 348 | - | |
| 349 | -# === PYTHON 3.0 - 3.4 SUPPORT ====================================================== | |
| 350 | - | |
| 351 | -# From https://gist.github.com/ynkdir/867347/c5e188a4886bc2dd71876c7e069a7b00b6c16c61 | |
| 347 | + DEFAULT_API_ENCODING = None # on Python 3: None (unicode) | |
| 348 | + # Python 3.0 - 3.4 support: | |
| 349 | + # From https://gist.github.com/ynkdir/867347/c5e188a4886bc2dd71876c7e069a7b00b6c16c61 | |
| 350 | + if sys.version_info < (3, 5): | |
| 351 | + import codecs | |
| 352 | + _backslashreplace_errors = codecs.lookup_error("backslashreplace") | |
| 352 | 353 | |
| 353 | -if sys.version_info >= (3, 0) and sys.version_info < (3, 5): | |
| 354 | - import codecs | |
| 354 | + def backslashreplace_errors(exc): | |
| 355 | + if isinstance(exc, UnicodeDecodeError): | |
| 356 | + u = "".join("\\x{0:02x}".format(c) for c in exc.object[exc.start:exc.end]) | |
| 357 | + return u, exc.end | |
| 358 | + return _backslashreplace_errors(exc) | |
| 355 | 359 | |
| 356 | - _backslashreplace_errors = codecs.lookup_error("backslashreplace") | |
| 357 | - | |
| 358 | - def backslashreplace_errors(exc): | |
| 359 | - if isinstance(exc, UnicodeDecodeError): | |
| 360 | - u = "".join("\\x{0:02x}".format(c) for c in exc.object[exc.start:exc.end]) | |
| 361 | - return (u, exc.end) | |
| 362 | - return _backslashreplace_errors(exc) | |
| 363 | - | |
| 364 | - codecs.register_error("backslashreplace", backslashreplace_errors) | |
| 360 | + codecs.register_error("backslashreplace", backslashreplace_errors) | |
| 365 | 361 | |
| 366 | 362 | |
| 367 | 363 | # === LOGGING ================================================================= |
| ... | ... | @@ -378,7 +374,7 @@ def get_logger(name, level=logging.CRITICAL+1): |
| 378 | 374 | # First, test if there is already a logger with the same name, else it |
| 379 | 375 | # will generate duplicate messages (due to duplicate handlers): |
| 380 | 376 | if name in logging.Logger.manager.loggerDict: |
| 381 | - #NOTE: another less intrusive but more "hackish" solution would be to | |
| 377 | + # NOTE: another less intrusive but more "hackish" solution would be to | |
| 382 | 378 | # use getLogger then test if its effective level is not default. |
| 383 | 379 | logger = logging.getLogger(name) |
| 384 | 380 | # make sure level is OK: |
| ... | ... | @@ -1346,12 +1342,16 @@ class VBA_Module(object): |
| 1346 | 1342 | """ |
| 1347 | 1343 | #: reference to the VBA project for later use (VBA_Project) |
| 1348 | 1344 | self.project = project |
| 1349 | - #: VBA project name (unicode str) | |
| 1345 | + #: VBA module name (unicode str) | |
| 1350 | 1346 | self.name = None |
| 1351 | - #: VBA project name, unicode copy (unicode str) | |
| 1347 | + #: VBA module name as a native str (utf8 bytes on py2, str on py3) | |
| 1348 | + self.name_str = None | |
| 1349 | + #: VBA module name, unicode copy (unicode str) | |
| 1352 | 1350 | self._name_unicode = None |
| 1353 | - #: Stream name containing the VBA project (unicode str) | |
| 1351 | + #: Stream name containing the VBA module (unicode str) | |
| 1354 | 1352 | self.streamname = None |
| 1353 | + #: Stream name containing the VBA module as a native str (utf8 bytes on py2, str on py3) | |
| 1354 | + self.streamname_str = None | |
| 1355 | 1355 | self._streamname_unicode = None |
| 1356 | 1356 | self.docstring = None |
| 1357 | 1357 | self._docstring_unicode = None |
| ... | ... | @@ -1376,6 +1376,10 @@ class VBA_Module(object): |
| 1376 | 1376 | modulename_bytes = dir_stream.read(size) |
| 1377 | 1377 | # Module name always stored as Unicode: |
| 1378 | 1378 | self.name = project.decode_bytes(modulename_bytes) |
| 1379 | + if PYTHON2: | |
| 1380 | + self.name_str = self.name.encode('utf8', errors='replace') | |
| 1381 | + else: | |
| 1382 | + self.name_str = self.name | |
| 1379 | 1383 | # account for optional sections |
| 1380 | 1384 | # TODO: shouldn't this be a loop? (check MS-OVBA) |
| 1381 | 1385 | section_id = struct.unpack("<H", dir_stream.read(2))[0] |
| ... | ... | @@ -1394,6 +1398,10 @@ class VBA_Module(object): |
| 1394 | 1398 | streamname_bytes = dir_stream.read(size) |
| 1395 | 1399 | # Store it as Unicode: |
| 1396 | 1400 | self.streamname = project.decode_bytes(streamname_bytes) |
| 1401 | + if PYTHON2: | |
| 1402 | + self.streamname_str = self.streamname.encode('utf8', errors='replace') | |
| 1403 | + else: | |
| 1404 | + self.streamname_str = self.streamname | |
| 1397 | 1405 | reserved = struct.unpack("<H", dir_stream.read(2))[0] |
| 1398 | 1406 | project.check_value('MODULESTREAMNAME_Reserved', 0x0032, reserved) |
| 1399 | 1407 | size = struct.unpack("<L", dir_stream.read(4))[0] |
| ... | ... | @@ -1469,10 +1477,10 @@ class VBA_Module(object): |
| 1469 | 1477 | if section_id != None: |
| 1470 | 1478 | log.warning('unknown or invalid module section id {0:04X}'.format(section_id)) |
| 1471 | 1479 | |
| 1472 | - log.debug("Module Name = {0}".format(self.name)) | |
| 1473 | - log.debug("Module Name Unicode = {0}".format(self._name_unicode)) | |
| 1474 | - log.debug("Stream Name = {0}".format(self.streamname)) | |
| 1475 | - log.debug("Stream Name Unicode = {0}".format(self._streamname_unicode)) | |
| 1480 | + log.debug("Module Name = {0}".format(self.name_str)) | |
| 1481 | + # log.debug("Module Name Unicode = {0}".format(self._name_unicode)) | |
| 1482 | + log.debug("Stream Name = {0}".format(self.streamname_str)) | |
| 1483 | + # log.debug("Stream Name Unicode = {0}".format(self._streamname_unicode)) | |
| 1476 | 1484 | log.debug("TextOffset = {0}".format(self.textoffset)) |
| 1477 | 1485 | |
| 1478 | 1486 | code_data = None |
| ... | ... | @@ -1519,10 +1527,10 @@ class VBA_Module(object): |
| 1519 | 1527 | self.code_str = self.code |
| 1520 | 1528 | # case-insensitive search in the code_modules dict to find the file extension: |
| 1521 | 1529 | filext = self.project.module_ext.get(self.name.lower(), 'vba') |
| 1522 | - self.filename = '{0}.{1}'.format(self.name, filext) | |
| 1523 | - log.debug('extracted file {0}'.format(self.filename)) | |
| 1530 | + self.filename = u'{0}.{1}'.format(self.name, filext) | |
| 1531 | + log.debug('extracted file {0}'.format(repr(self.filename))) | |
| 1524 | 1532 | else: |
| 1525 | - log.warning("module stream {0} has code data length 0".format(self.streamname)) | |
| 1533 | + log.warning("module stream {0} has code data length 0".format(self.streamname_str)) | |
| 1526 | 1534 | except (UnexpectedDataError, SubstreamOpenError): |
| 1527 | 1535 | raise |
| 1528 | 1536 | except Exception as exc: | ... | ... |