Commit e2a52831884deb9f060e98026fd7e1a8696d3e30

Authored by decalage2
1 parent a410b68b

olevba: added name_str, streamname_str and others to VBA_Module for issue #383, plus a few PEP8 fixes

oletools/olevba.py
@@ -239,14 +239,14 @@ __version__ = '0.54dev7' @@ -239,14 +239,14 @@ __version__ = '0.54dev7'
239 # - extract_macros: use combined struct.unpack instead of many calls 239 # - extract_macros: use combined struct.unpack instead of many calls
240 # - all except clauses should target specific exceptions 240 # - all except clauses should target specific exceptions
241 241
242 -#------------------------------------------------------------------------------ 242 +# ------------------------------------------------------------------------------
243 # REFERENCES: 243 # REFERENCES:
244 # - [MS-OVBA]: Microsoft Office VBA File Format Structure 244 # - [MS-OVBA]: Microsoft Office VBA File Format Structure
245 # http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx 245 # http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx
246 # - officeparser: https://github.com/unixfreak0037/officeparser 246 # - officeparser: https://github.com/unixfreak0037/officeparser
247 247
248 248
249 -#--- IMPORTS ------------------------------------------------------------------ 249 +# --- IMPORTS ------------------------------------------------------------------
250 250
251 import sys 251 import sys
252 import os 252 import os
@@ -263,7 +263,6 @@ import zlib @@ -263,7 +263,6 @@ import zlib
263 import email # for MHTML parsing 263 import email # for MHTML parsing
264 import string # for printable 264 import string # for printable
265 import json # for json output mode (argument --json) 265 import json # for json output mode (argument --json)
266 -import codecs  
267 266
268 # import lxml or ElementTree for XML parsing: 267 # import lxml or ElementTree for XML parsing:
269 try: 268 try:
@@ -298,7 +297,7 @@ _thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) @@ -298,7 +297,7 @@ _thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__)))
298 # print('_thismodule_dir = %r' % _thismodule_dir) 297 # print('_thismodule_dir = %r' % _thismodule_dir)
299 _parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) 298 _parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..'))
300 # print('_parent_dir = %r' % _thirdparty_dir) 299 # print('_parent_dir = %r' % _thirdparty_dir)
301 -if not _parent_dir in sys.path: 300 +if _parent_dir not in sys.path:
302 sys.path.insert(0, _parent_dir) 301 sys.path.insert(0, _parent_dir)
303 302
304 import olefile 303 import olefile
@@ -330,38 +329,35 @@ if sys.version_info[0] <= 2: @@ -330,38 +329,35 @@ if sys.version_info[0] <= 2:
330 # on Python 2, just use the normal ord() because items are bytes 329 # on Python 2, just use the normal ord() because items are bytes
331 byte_ord = ord 330 byte_ord = ord
332 #: Default string encoding for the olevba API 331 #: Default string encoding for the olevba API
333 - DEFAULT_API_ENCODING = 'utf8' # on Python 2: UTF-8 (bytes) 332 + DEFAULT_API_ENCODING = 'utf8' # on Python 2: UTF-8 (bytes)
334 else: 333 else:
335 # Python 3.x+ 334 # Python 3.x+
336 PYTHON2 = False 335 PYTHON2 = False
  336 +
337 # to use ord on bytes/bytearray items the same way in Python 2+3 337 # to use ord on bytes/bytearray items the same way in Python 2+3
338 # on Python 3, items are int, so just return the item 338 # on Python 3, items are int, so just return the item
339 - byte_ord = lambda x: x 339 + def byte_ord(x):
  340 + return x
340 # xrange is now called range: 341 # xrange is now called range:
341 xrange = range 342 xrange = range
342 # unichr does not exist anymore, only chr: 343 # unichr does not exist anymore, only chr:
343 unichr = chr 344 unichr = chr
344 from functools import reduce 345 from functools import reduce
345 #: Default string encoding for the olevba API 346 #: Default string encoding for the olevba API
346 - DEFAULT_API_ENCODING = None # on Python 3: None (unicode)  
347 -  
348 -  
349 -# === PYTHON 3.0 - 3.4 SUPPORT ======================================================  
350 -  
351 -# From https://gist.github.com/ynkdir/867347/c5e188a4886bc2dd71876c7e069a7b00b6c16c61 347 + DEFAULT_API_ENCODING = None # on Python 3: None (unicode)
  348 + # Python 3.0 - 3.4 support:
  349 + # From https://gist.github.com/ynkdir/867347/c5e188a4886bc2dd71876c7e069a7b00b6c16c61
  350 + if sys.version_info < (3, 5):
  351 + import codecs
  352 + _backslashreplace_errors = codecs.lookup_error("backslashreplace")
352 353
353 -if sys.version_info >= (3, 0) and sys.version_info < (3, 5):  
354 - import codecs 354 + def backslashreplace_errors(exc):
  355 + if isinstance(exc, UnicodeDecodeError):
  356 + u = "".join("\\x{0:02x}".format(c) for c in exc.object[exc.start:exc.end])
  357 + return u, exc.end
  358 + return _backslashreplace_errors(exc)
355 359
356 - _backslashreplace_errors = codecs.lookup_error("backslashreplace")  
357 -  
358 - def backslashreplace_errors(exc):  
359 - if isinstance(exc, UnicodeDecodeError):  
360 - u = "".join("\\x{0:02x}".format(c) for c in exc.object[exc.start:exc.end])  
361 - return (u, exc.end)  
362 - return _backslashreplace_errors(exc)  
363 -  
364 - codecs.register_error("backslashreplace", backslashreplace_errors) 360 + codecs.register_error("backslashreplace", backslashreplace_errors)
365 361
366 362
367 # === LOGGING ================================================================= 363 # === LOGGING =================================================================
@@ -378,7 +374,7 @@ def get_logger(name, level=logging.CRITICAL+1): @@ -378,7 +374,7 @@ def get_logger(name, level=logging.CRITICAL+1):
378 # First, test if there is already a logger with the same name, else it 374 # First, test if there is already a logger with the same name, else it
379 # will generate duplicate messages (due to duplicate handlers): 375 # will generate duplicate messages (due to duplicate handlers):
380 if name in logging.Logger.manager.loggerDict: 376 if name in logging.Logger.manager.loggerDict:
381 - #NOTE: another less intrusive but more "hackish" solution would be to 377 + # NOTE: another less intrusive but more "hackish" solution would be to
382 # use getLogger then test if its effective level is not default. 378 # use getLogger then test if its effective level is not default.
383 logger = logging.getLogger(name) 379 logger = logging.getLogger(name)
384 # make sure level is OK: 380 # make sure level is OK:
@@ -1346,12 +1342,16 @@ class VBA_Module(object): @@ -1346,12 +1342,16 @@ class VBA_Module(object):
1346 """ 1342 """
1347 #: reference to the VBA project for later use (VBA_Project) 1343 #: reference to the VBA project for later use (VBA_Project)
1348 self.project = project 1344 self.project = project
1349 - #: VBA project name (unicode str) 1345 + #: VBA module name (unicode str)
1350 self.name = None 1346 self.name = None
1351 - #: VBA project name, unicode copy (unicode str) 1347 + #: VBA module name as a native str (utf8 bytes on py2, str on py3)
  1348 + self.name_str = None
  1349 + #: VBA module name, unicode copy (unicode str)
1352 self._name_unicode = None 1350 self._name_unicode = None
1353 - #: Stream name containing the VBA project (unicode str) 1351 + #: Stream name containing the VBA module (unicode str)
1354 self.streamname = None 1352 self.streamname = None
  1353 + #: Stream name containing the VBA module as a native str (utf8 bytes on py2, str on py3)
  1354 + self.streamname_str = None
1355 self._streamname_unicode = None 1355 self._streamname_unicode = None
1356 self.docstring = None 1356 self.docstring = None
1357 self._docstring_unicode = None 1357 self._docstring_unicode = None
@@ -1376,6 +1376,10 @@ class VBA_Module(object): @@ -1376,6 +1376,10 @@ class VBA_Module(object):
1376 modulename_bytes = dir_stream.read(size) 1376 modulename_bytes = dir_stream.read(size)
1377 # Module name always stored as Unicode: 1377 # Module name always stored as Unicode:
1378 self.name = project.decode_bytes(modulename_bytes) 1378 self.name = project.decode_bytes(modulename_bytes)
  1379 + if PYTHON2:
  1380 + self.name_str = self.name.encode('utf8', errors='replace')
  1381 + else:
  1382 + self.name_str = self.name
1379 # account for optional sections 1383 # account for optional sections
1380 # TODO: shouldn't this be a loop? (check MS-OVBA) 1384 # TODO: shouldn't this be a loop? (check MS-OVBA)
1381 section_id = struct.unpack("<H", dir_stream.read(2))[0] 1385 section_id = struct.unpack("<H", dir_stream.read(2))[0]
@@ -1394,6 +1398,10 @@ class VBA_Module(object): @@ -1394,6 +1398,10 @@ class VBA_Module(object):
1394 streamname_bytes = dir_stream.read(size) 1398 streamname_bytes = dir_stream.read(size)
1395 # Store it as Unicode: 1399 # Store it as Unicode:
1396 self.streamname = project.decode_bytes(streamname_bytes) 1400 self.streamname = project.decode_bytes(streamname_bytes)
  1401 + if PYTHON2:
  1402 + self.streamname_str = self.streamname.encode('utf8', errors='replace')
  1403 + else:
  1404 + self.streamname_str = self.streamname
1397 reserved = struct.unpack("<H", dir_stream.read(2))[0] 1405 reserved = struct.unpack("<H", dir_stream.read(2))[0]
1398 project.check_value('MODULESTREAMNAME_Reserved', 0x0032, reserved) 1406 project.check_value('MODULESTREAMNAME_Reserved', 0x0032, reserved)
1399 size = struct.unpack("<L", dir_stream.read(4))[0] 1407 size = struct.unpack("<L", dir_stream.read(4))[0]
@@ -1469,10 +1477,10 @@ class VBA_Module(object): @@ -1469,10 +1477,10 @@ class VBA_Module(object):
1469 if section_id != None: 1477 if section_id != None:
1470 log.warning('unknown or invalid module section id {0:04X}'.format(section_id)) 1478 log.warning('unknown or invalid module section id {0:04X}'.format(section_id))
1471 1479
1472 - log.debug("Module Name = {0}".format(self.name))  
1473 - log.debug("Module Name Unicode = {0}".format(self._name_unicode))  
1474 - log.debug("Stream Name = {0}".format(self.streamname))  
1475 - log.debug("Stream Name Unicode = {0}".format(self._streamname_unicode)) 1480 + log.debug("Module Name = {0}".format(self.name_str))
  1481 + # log.debug("Module Name Unicode = {0}".format(self._name_unicode))
  1482 + log.debug("Stream Name = {0}".format(self.streamname_str))
  1483 + # log.debug("Stream Name Unicode = {0}".format(self._streamname_unicode))
1476 log.debug("TextOffset = {0}".format(self.textoffset)) 1484 log.debug("TextOffset = {0}".format(self.textoffset))
1477 1485
1478 code_data = None 1486 code_data = None
@@ -1519,10 +1527,10 @@ class VBA_Module(object): @@ -1519,10 +1527,10 @@ class VBA_Module(object):
1519 self.code_str = self.code 1527 self.code_str = self.code
1520 # case-insensitive search in the code_modules dict to find the file extension: 1528 # case-insensitive search in the code_modules dict to find the file extension:
1521 filext = self.project.module_ext.get(self.name.lower(), 'vba') 1529 filext = self.project.module_ext.get(self.name.lower(), 'vba')
1522 - self.filename = '{0}.{1}'.format(self.name, filext)  
1523 - log.debug('extracted file {0}'.format(self.filename)) 1530 + self.filename = u'{0}.{1}'.format(self.name, filext)
  1531 + log.debug('extracted file {0}'.format(repr(self.filename)))
1524 else: 1532 else:
1525 - log.warning("module stream {0} has code data length 0".format(self.streamname)) 1533 + log.warning("module stream {0} has code data length 0".format(self.streamname_str))
1526 except (UnexpectedDataError, SubstreamOpenError): 1534 except (UnexpectedDataError, SubstreamOpenError):
1527 raise 1535 raise
1528 except Exception as exc: 1536 except Exception as exc:
requirements.txt
1 pyparsing>=2.2.0 1 pyparsing>=2.2.0
2 olefile>=0.45 2 olefile>=0.45
  3 +colorclass