Commit da9749ed11a34a0f8a5008a32e9a3132ddac7693

Authored by Philippe Lagadec
2 parents d353c6d9 f144d2ce

Merge pull request #46 from christian-intra2net/robustify-extact-vba

Robustify extract vba
Showing 1 changed file with 123 additions and 118 deletions
oletools/olevba.py
... ... @@ -1417,122 +1417,127 @@ def _extract_vba(ole, vba_root, project_path, dir_path):
1417 1417 unused = projectmodules_projectcookierecord_cookie
1418 1418  
1419 1419 log.debug("parsing {0} modules".format(projectmodules_count))
1420   - for _ in xrange(0, projectmodules_count):
1421   - modulename_id = struct.unpack("<H", dir_stream.read(2))[0]
1422   - check_value('MODULENAME_Id', 0x0019, modulename_id)
1423   - modulename_sizeof_modulename = struct.unpack("<L", dir_stream.read(4))[0]
1424   - modulename_modulename = dir_stream.read(modulename_sizeof_modulename)
1425   - # account for optional sections
1426   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
1427   - if section_id == 0x0047:
1428   - modulename_unicode_id = section_id
1429   - modulename_unicode_sizeof_modulename_unicode = struct.unpack("<L", dir_stream.read(4))[0]
1430   - modulename_unicode_modulename_unicode = dir_stream.read(modulename_unicode_sizeof_modulename_unicode)
1431   - unused = modulename_unicode_id
1432   - unused = modulename_unicode_modulename_unicode
1433   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
1434   - if section_id == 0x001A:
1435   - modulestreamname_id = section_id
1436   - modulestreamname_sizeof_streamname = struct.unpack("<L", dir_stream.read(4))[0]
1437   - modulestreamname_streamname = dir_stream.read(modulestreamname_sizeof_streamname)
1438   - modulestreamname_reserved = struct.unpack("<H", dir_stream.read(2))[0]
1439   - check_value('MODULESTREAMNAME_Reserved', 0x0032, modulestreamname_reserved)
1440   - modulestreamname_sizeof_streamname_unicode = struct.unpack("<L", dir_stream.read(4))[0]
1441   - modulestreamname_streamname_unicode = dir_stream.read(modulestreamname_sizeof_streamname_unicode)
1442   - unused = modulestreamname_id
1443   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
1444   - if section_id == 0x001C:
1445   - moduledocstring_id = section_id
1446   - check_value('MODULEDOCSTRING_Id', 0x001C, moduledocstring_id)
1447   - moduledocstring_sizeof_docstring = struct.unpack("<L", dir_stream.read(4))[0]
1448   - moduledocstring_docstring = dir_stream.read(moduledocstring_sizeof_docstring)
1449   - moduledocstring_reserved = struct.unpack("<H", dir_stream.read(2))[0]
1450   - check_value('MODULEDOCSTRING_Reserved', 0x0048, moduledocstring_reserved)
1451   - moduledocstring_sizeof_docstring_unicode = struct.unpack("<L", dir_stream.read(4))[0]
1452   - moduledocstring_docstring_unicode = dir_stream.read(moduledocstring_sizeof_docstring_unicode)
1453   - unused = moduledocstring_docstring
1454   - unused = moduledocstring_docstring_unicode
1455   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
1456   - if section_id == 0x0031:
1457   - moduleoffset_id = section_id
1458   - check_value('MODULEOFFSET_Id', 0x0031, moduleoffset_id)
1459   - moduleoffset_size = struct.unpack("<L", dir_stream.read(4))[0]
1460   - check_value('MODULEOFFSET_Size', 0x0004, moduleoffset_size)
1461   - moduleoffset_textoffset = struct.unpack("<L", dir_stream.read(4))[0]
1462   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
1463   - if section_id == 0x001E:
1464   - modulehelpcontext_id = section_id
1465   - check_value('MODULEHELPCONTEXT_Id', 0x001E, modulehelpcontext_id)
1466   - modulehelpcontext_size = struct.unpack("<L", dir_stream.read(4))[0]
1467   - check_value('MODULEHELPCONTEXT_Size', 0x0004, modulehelpcontext_size)
1468   - modulehelpcontext_helpcontext = struct.unpack("<L", dir_stream.read(4))[0]
1469   - unused = modulehelpcontext_helpcontext
1470   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
1471   - if section_id == 0x002C:
1472   - modulecookie_id = section_id
1473   - check_value('MODULECOOKIE_Id', 0x002C, modulecookie_id)
1474   - modulecookie_size = struct.unpack("<L", dir_stream.read(4))[0]
1475   - check_value('MODULECOOKIE_Size', 0x0002, modulecookie_size)
1476   - modulecookie_cookie = struct.unpack("<H", dir_stream.read(2))[0]
1477   - unused = modulecookie_cookie
1478   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
1479   - if section_id == 0x0021 or section_id == 0x0022:
1480   - moduletype_id = section_id
1481   - moduletype_reserved = struct.unpack("<L", dir_stream.read(4))[0]
1482   - unused = moduletype_id
1483   - unused = moduletype_reserved
1484   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
1485   - if section_id == 0x0025:
1486   - modulereadonly_id = section_id
1487   - check_value('MODULEREADONLY_Id', 0x0025, modulereadonly_id)
1488   - modulereadonly_reserved = struct.unpack("<L", dir_stream.read(4))[0]
1489   - check_value('MODULEREADONLY_Reserved', 0x0000, modulereadonly_reserved)
1490   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
1491   - if section_id == 0x0028:
1492   - moduleprivate_id = section_id
1493   - check_value('MODULEPRIVATE_Id', 0x0028, moduleprivate_id)
1494   - moduleprivate_reserved = struct.unpack("<L", dir_stream.read(4))[0]
1495   - check_value('MODULEPRIVATE_Reserved', 0x0000, moduleprivate_reserved)
  1420 + for projectmodule_index in xrange(0, projectmodules_count):
  1421 + try:
  1422 + modulename_id = struct.unpack("<H", dir_stream.read(2))[0]
  1423 + check_value('MODULENAME_Id', 0x0019, modulename_id)
  1424 + modulename_sizeof_modulename = struct.unpack("<L", dir_stream.read(4))[0]
  1425 + modulename_modulename = dir_stream.read(modulename_sizeof_modulename)
  1426 + # account for optional sections
1496 1427 section_id = struct.unpack("<H", dir_stream.read(2))[0]
1497   - if section_id == 0x002B: # TERMINATOR
1498   - module_reserved = struct.unpack("<L", dir_stream.read(4))[0]
1499   - check_value('MODULE_Reserved', 0x0000, module_reserved)
1500   - section_id = None
1501   - if section_id != None:
1502   - log.warning('unknown or invalid module section id {0:04X}'.format(section_id))
1503   -
1504   - log.debug('Project CodePage = %d' % projectcodepage_codepage)
1505   - vba_codec = 'cp%d' % projectcodepage_codepage
1506   - log.debug("ModuleName = {0}".format(modulename_modulename))
1507   - log.debug("StreamName = {0}".format(repr(modulestreamname_streamname)))
1508   - streamname_unicode = modulestreamname_streamname.decode(vba_codec)
1509   - log.debug("StreamName.decode('%s') = %s" % (vba_codec, repr(streamname_unicode)))
1510   - log.debug("StreamNameUnicode = {0}".format(repr(modulestreamname_streamname_unicode)))
1511   - log.debug("TextOffset = {0}".format(moduleoffset_textoffset))
1512   -
1513   - code_path = vba_root + u'VBA/' + streamname_unicode
1514   - #TODO: test if stream exists
1515   - log.debug('opening VBA code stream %s' % repr(code_path))
1516   - code_data = ole.openstream(code_path).read()
1517   - log.debug("length of code_data = {0}".format(len(code_data)))
1518   - log.debug("offset of code_data = {0}".format(moduleoffset_textoffset))
1519   - code_data = code_data[moduleoffset_textoffset:]
1520   - if len(code_data) > 0:
1521   - code_data = decompress_stream(code_data)
1522   - # case-insensitive search in the code_modules dict to find the file extension:
1523   - filext = code_modules.get(modulename_modulename.lower(), 'bin')
1524   - filename = '{0}.{1}'.format(modulename_modulename, filext)
1525   - #TODO: also yield the codepage so that callers can decode it properly
1526   - yield (code_path, filename, code_data)
1527   - # print '-'*79
1528   - # print filename
1529   - # print ''
1530   - # print code_data
1531   - # print ''
1532   - log.debug('extracted file {0}'.format(filename))
1533   - else:
1534   - log.warning("module stream {0} has code data length 0".format(modulestreamname_streamname))
1535   - _ = unused
  1428 + if section_id == 0x0047:
  1429 + modulename_unicode_id = section_id
  1430 + modulename_unicode_sizeof_modulename_unicode = struct.unpack("<L", dir_stream.read(4))[0]
  1431 + modulename_unicode_modulename_unicode = dir_stream.read(modulename_unicode_sizeof_modulename_unicode)
  1432 + unused = modulename_unicode_id
  1433 + unused = modulename_unicode_modulename_unicode
  1434 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  1435 + if section_id == 0x001A:
  1436 + modulestreamname_id = section_id
  1437 + modulestreamname_sizeof_streamname = struct.unpack("<L", dir_stream.read(4))[0]
  1438 + modulestreamname_streamname = dir_stream.read(modulestreamname_sizeof_streamname)
  1439 + modulestreamname_reserved = struct.unpack("<H", dir_stream.read(2))[0]
  1440 + check_value('MODULESTREAMNAME_Reserved', 0x0032, modulestreamname_reserved)
  1441 + modulestreamname_sizeof_streamname_unicode = struct.unpack("<L", dir_stream.read(4))[0]
  1442 + modulestreamname_streamname_unicode = dir_stream.read(modulestreamname_sizeof_streamname_unicode)
  1443 + unused = modulestreamname_id
  1444 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  1445 + if section_id == 0x001C:
  1446 + moduledocstring_id = section_id
  1447 + check_value('MODULEDOCSTRING_Id', 0x001C, moduledocstring_id)
  1448 + moduledocstring_sizeof_docstring = struct.unpack("<L", dir_stream.read(4))[0]
  1449 + moduledocstring_docstring = dir_stream.read(moduledocstring_sizeof_docstring)
  1450 + moduledocstring_reserved = struct.unpack("<H", dir_stream.read(2))[0]
  1451 + check_value('MODULEDOCSTRING_Reserved', 0x0048, moduledocstring_reserved)
  1452 + moduledocstring_sizeof_docstring_unicode = struct.unpack("<L", dir_stream.read(4))[0]
  1453 + moduledocstring_docstring_unicode = dir_stream.read(moduledocstring_sizeof_docstring_unicode)
  1454 + unused = moduledocstring_docstring
  1455 + unused = moduledocstring_docstring_unicode
  1456 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  1457 + if section_id == 0x0031:
  1458 + moduleoffset_id = section_id
  1459 + check_value('MODULEOFFSET_Id', 0x0031, moduleoffset_id)
  1460 + moduleoffset_size = struct.unpack("<L", dir_stream.read(4))[0]
  1461 + check_value('MODULEOFFSET_Size', 0x0004, moduleoffset_size)
  1462 + moduleoffset_textoffset = struct.unpack("<L", dir_stream.read(4))[0]
  1463 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  1464 + if section_id == 0x001E:
  1465 + modulehelpcontext_id = section_id
  1466 + check_value('MODULEHELPCONTEXT_Id', 0x001E, modulehelpcontext_id)
  1467 + modulehelpcontext_size = struct.unpack("<L", dir_stream.read(4))[0]
  1468 + check_value('MODULEHELPCONTEXT_Size', 0x0004, modulehelpcontext_size)
  1469 + modulehelpcontext_helpcontext = struct.unpack("<L", dir_stream.read(4))[0]
  1470 + unused = modulehelpcontext_helpcontext
  1471 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  1472 + if section_id == 0x002C:
  1473 + modulecookie_id = section_id
  1474 + check_value('MODULECOOKIE_Id', 0x002C, modulecookie_id)
  1475 + modulecookie_size = struct.unpack("<L", dir_stream.read(4))[0]
  1476 + check_value('MODULECOOKIE_Size', 0x0002, modulecookie_size)
  1477 + modulecookie_cookie = struct.unpack("<H", dir_stream.read(2))[0]
  1478 + unused = modulecookie_cookie
  1479 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  1480 + if section_id == 0x0021 or section_id == 0x0022:
  1481 + moduletype_id = section_id
  1482 + moduletype_reserved = struct.unpack("<L", dir_stream.read(4))[0]
  1483 + unused = moduletype_id
  1484 + unused = moduletype_reserved
  1485 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  1486 + if section_id == 0x0025:
  1487 + modulereadonly_id = section_id
  1488 + check_value('MODULEREADONLY_Id', 0x0025, modulereadonly_id)
  1489 + modulereadonly_reserved = struct.unpack("<L", dir_stream.read(4))[0]
  1490 + check_value('MODULEREADONLY_Reserved', 0x0000, modulereadonly_reserved)
  1491 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  1492 + if section_id == 0x0028:
  1493 + moduleprivate_id = section_id
  1494 + check_value('MODULEPRIVATE_Id', 0x0028, moduleprivate_id)
  1495 + moduleprivate_reserved = struct.unpack("<L", dir_stream.read(4))[0]
  1496 + check_value('MODULEPRIVATE_Reserved', 0x0000, moduleprivate_reserved)
  1497 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  1498 + if section_id == 0x002B: # TERMINATOR
  1499 + module_reserved = struct.unpack("<L", dir_stream.read(4))[0]
  1500 + check_value('MODULE_Reserved', 0x0000, module_reserved)
  1501 + section_id = None
  1502 + if section_id != None:
  1503 + log.warning('unknown or invalid module section id {0:04X}'.format(section_id))
  1504 +
  1505 + log.debug('Project CodePage = %d' % projectcodepage_codepage)
  1506 + vba_codec = 'cp%d' % projectcodepage_codepage
  1507 + log.debug("ModuleName = {0}".format(modulename_modulename))
  1508 + log.debug("StreamName = {0}".format(repr(modulestreamname_streamname)))
  1509 + streamname_unicode = modulestreamname_streamname.decode(vba_codec)
  1510 + log.debug("StreamName.decode('%s') = %s" % (vba_codec, repr(streamname_unicode)))
  1511 + log.debug("StreamNameUnicode = {0}".format(repr(modulestreamname_streamname_unicode)))
  1512 + log.debug("TextOffset = {0}".format(moduleoffset_textoffset))
  1513 +
  1514 + code_path = vba_root + u'VBA/' + streamname_unicode
  1515 + #TODO: test if stream exists
  1516 + log.debug('opening VBA code stream %s' % repr(code_path))
  1517 + code_data = ole.openstream(code_path).read()
  1518 + log.debug("length of code_data = {0}".format(len(code_data)))
  1519 + log.debug("offset of code_data = {0}".format(moduleoffset_textoffset))
  1520 + code_data = code_data[moduleoffset_textoffset:]
  1521 + if len(code_data) > 0:
  1522 + code_data = decompress_stream(code_data)
  1523 + # case-insensitive search in the code_modules dict to find the file extension:
  1524 + filext = code_modules.get(modulename_modulename.lower(), 'bin')
  1525 + filename = '{0}.{1}'.format(modulename_modulename, filext)
  1526 + #TODO: also yield the codepage so that callers can decode it properly
  1527 + yield (code_path, filename, code_data)
  1528 + # print '-'*79
  1529 + # print filename
  1530 + # print ''
  1531 + # print code_data
  1532 + # print ''
  1533 + log.debug('extracted file {0}'.format(filename))
  1534 + else:
  1535 + log.warning("module stream {0} has code data length 0".format(modulestreamname_streamname))
  1536 + except Exception as exc:
  1537 + log.info('Error parsing module {} of {} in _extract_vba:'
  1538 + .format(projectmodule_index, projectmodules_count),
  1539 + exc_info=True)
  1540 + _ = unused # make pylint happy: now variable "unused" is being used ;-)
1536 1541 return
1537 1542  
1538 1543  
... ... @@ -1770,13 +1775,13 @@ def json2ascii(json_obj, encoding='utf8', errors='replace'):
1770 1775 # de-code and re-encode
1771 1776 dencoded = json_obj.decode(encoding, errors).encode(encoding, errors)
1772 1777 if dencoded != json_obj:
1773   - log.info('json2ascii: replaced: {0} (len {1})'
  1778 + log.debug('json2ascii: replaced: {0} (len {1})'
1774 1779 .format(json_obj, len(json_obj)))
1775   - log.info('json2ascii: with: {0} (len {1})'
  1780 + log.debug('json2ascii: with: {0} (len {1})'
1776 1781 .format(dencoded, len(dencoded)))
1777 1782 return dencoded
1778 1783 elif isinstance(json_obj, unicode):
1779   - log.info('json2ascii: replaced: {0}'
  1784 + log.debug('json2ascii: encode unicode: {0}'
1780 1785 .format(json_obj.encode(encoding, errors)))
1781 1786 # cannot put original into logger
1782 1787 # print 'original: ' json_obj
... ...