Commit e16a29f9e9558be46c94bc8783bf919d21a4564b

Authored by Christian Herdtweck
1 parent d353c6d9

Wrap each loop iteration in _extract_vba in try-except so that a failing module entry does not abort extraction of the remaining modules

We have one example file where openstream() fails for 3 out of 10 modules,
but of course MS Office (and even LibreOffice) ignores this.
Showing 1 changed file with 120 additions and 115 deletions
oletools/olevba.py
@@ -1417,122 +1417,127 @@ def _extract_vba(ole, vba_root, project_path, dir_path): @@ -1417,122 +1417,127 @@ def _extract_vba(ole, vba_root, project_path, dir_path):
1417 unused = projectmodules_projectcookierecord_cookie 1417 unused = projectmodules_projectcookierecord_cookie
1418 1418
1419 log.debug("parsing {0} modules".format(projectmodules_count)) 1419 log.debug("parsing {0} modules".format(projectmodules_count))
1420 - for _ in xrange(0, projectmodules_count):  
1421 - modulename_id = struct.unpack("<H", dir_stream.read(2))[0]  
1422 - check_value('MODULENAME_Id', 0x0019, modulename_id)  
1423 - modulename_sizeof_modulename = struct.unpack("<L", dir_stream.read(4))[0]  
1424 - modulename_modulename = dir_stream.read(modulename_sizeof_modulename)  
1425 - # account for optional sections  
1426 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
1427 - if section_id == 0x0047:  
1428 - modulename_unicode_id = section_id  
1429 - modulename_unicode_sizeof_modulename_unicode = struct.unpack("<L", dir_stream.read(4))[0]  
1430 - modulename_unicode_modulename_unicode = dir_stream.read(modulename_unicode_sizeof_modulename_unicode)  
1431 - unused = modulename_unicode_id  
1432 - unused = modulename_unicode_modulename_unicode  
1433 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
1434 - if section_id == 0x001A:  
1435 - modulestreamname_id = section_id  
1436 - modulestreamname_sizeof_streamname = struct.unpack("<L", dir_stream.read(4))[0]  
1437 - modulestreamname_streamname = dir_stream.read(modulestreamname_sizeof_streamname)  
1438 - modulestreamname_reserved = struct.unpack("<H", dir_stream.read(2))[0]  
1439 - check_value('MODULESTREAMNAME_Reserved', 0x0032, modulestreamname_reserved)  
1440 - modulestreamname_sizeof_streamname_unicode = struct.unpack("<L", dir_stream.read(4))[0]  
1441 - modulestreamname_streamname_unicode = dir_stream.read(modulestreamname_sizeof_streamname_unicode)  
1442 - unused = modulestreamname_id  
1443 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
1444 - if section_id == 0x001C:  
1445 - moduledocstring_id = section_id  
1446 - check_value('MODULEDOCSTRING_Id', 0x001C, moduledocstring_id)  
1447 - moduledocstring_sizeof_docstring = struct.unpack("<L", dir_stream.read(4))[0]  
1448 - moduledocstring_docstring = dir_stream.read(moduledocstring_sizeof_docstring)  
1449 - moduledocstring_reserved = struct.unpack("<H", dir_stream.read(2))[0]  
1450 - check_value('MODULEDOCSTRING_Reserved', 0x0048, moduledocstring_reserved)  
1451 - moduledocstring_sizeof_docstring_unicode = struct.unpack("<L", dir_stream.read(4))[0]  
1452 - moduledocstring_docstring_unicode = dir_stream.read(moduledocstring_sizeof_docstring_unicode)  
1453 - unused = moduledocstring_docstring  
1454 - unused = moduledocstring_docstring_unicode  
1455 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
1456 - if section_id == 0x0031:  
1457 - moduleoffset_id = section_id  
1458 - check_value('MODULEOFFSET_Id', 0x0031, moduleoffset_id)  
1459 - moduleoffset_size = struct.unpack("<L", dir_stream.read(4))[0]  
1460 - check_value('MODULEOFFSET_Size', 0x0004, moduleoffset_size)  
1461 - moduleoffset_textoffset = struct.unpack("<L", dir_stream.read(4))[0]  
1462 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
1463 - if section_id == 0x001E:  
1464 - modulehelpcontext_id = section_id  
1465 - check_value('MODULEHELPCONTEXT_Id', 0x001E, modulehelpcontext_id)  
1466 - modulehelpcontext_size = struct.unpack("<L", dir_stream.read(4))[0]  
1467 - check_value('MODULEHELPCONTEXT_Size', 0x0004, modulehelpcontext_size)  
1468 - modulehelpcontext_helpcontext = struct.unpack("<L", dir_stream.read(4))[0]  
1469 - unused = modulehelpcontext_helpcontext  
1470 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
1471 - if section_id == 0x002C:  
1472 - modulecookie_id = section_id  
1473 - check_value('MODULECOOKIE_Id', 0x002C, modulecookie_id)  
1474 - modulecookie_size = struct.unpack("<L", dir_stream.read(4))[0]  
1475 - check_value('MODULECOOKIE_Size', 0x0002, modulecookie_size)  
1476 - modulecookie_cookie = struct.unpack("<H", dir_stream.read(2))[0]  
1477 - unused = modulecookie_cookie  
1478 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
1479 - if section_id == 0x0021 or section_id == 0x0022:  
1480 - moduletype_id = section_id  
1481 - moduletype_reserved = struct.unpack("<L", dir_stream.read(4))[0]  
1482 - unused = moduletype_id  
1483 - unused = moduletype_reserved  
1484 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
1485 - if section_id == 0x0025:  
1486 - modulereadonly_id = section_id  
1487 - check_value('MODULEREADONLY_Id', 0x0025, modulereadonly_id)  
1488 - modulereadonly_reserved = struct.unpack("<L", dir_stream.read(4))[0]  
1489 - check_value('MODULEREADONLY_Reserved', 0x0000, modulereadonly_reserved)  
1490 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
1491 - if section_id == 0x0028:  
1492 - moduleprivate_id = section_id  
1493 - check_value('MODULEPRIVATE_Id', 0x0028, moduleprivate_id)  
1494 - moduleprivate_reserved = struct.unpack("<L", dir_stream.read(4))[0]  
1495 - check_value('MODULEPRIVATE_Reserved', 0x0000, moduleprivate_reserved) 1420 + for projectmodule_index in xrange(0, projectmodules_count):
  1421 + try:
  1422 + modulename_id = struct.unpack("<H", dir_stream.read(2))[0]
  1423 + check_value('MODULENAME_Id', 0x0019, modulename_id)
  1424 + modulename_sizeof_modulename = struct.unpack("<L", dir_stream.read(4))[0]
  1425 + modulename_modulename = dir_stream.read(modulename_sizeof_modulename)
  1426 + # account for optional sections
1496 section_id = struct.unpack("<H", dir_stream.read(2))[0] 1427 section_id = struct.unpack("<H", dir_stream.read(2))[0]
1497 - if section_id == 0x002B: # TERMINATOR  
1498 - module_reserved = struct.unpack("<L", dir_stream.read(4))[0]  
1499 - check_value('MODULE_Reserved', 0x0000, module_reserved)  
1500 - section_id = None  
1501 - if section_id != None:  
1502 - log.warning('unknown or invalid module section id {0:04X}'.format(section_id))  
1503 -  
1504 - log.debug('Project CodePage = %d' % projectcodepage_codepage)  
1505 - vba_codec = 'cp%d' % projectcodepage_codepage  
1506 - log.debug("ModuleName = {0}".format(modulename_modulename))  
1507 - log.debug("StreamName = {0}".format(repr(modulestreamname_streamname)))  
1508 - streamname_unicode = modulestreamname_streamname.decode(vba_codec)  
1509 - log.debug("StreamName.decode('%s') = %s" % (vba_codec, repr(streamname_unicode)))  
1510 - log.debug("StreamNameUnicode = {0}".format(repr(modulestreamname_streamname_unicode)))  
1511 - log.debug("TextOffset = {0}".format(moduleoffset_textoffset))  
1512 -  
1513 - code_path = vba_root + u'VBA/' + streamname_unicode  
1514 - #TODO: test if stream exists  
1515 - log.debug('opening VBA code stream %s' % repr(code_path))  
1516 - code_data = ole.openstream(code_path).read()  
1517 - log.debug("length of code_data = {0}".format(len(code_data)))  
1518 - log.debug("offset of code_data = {0}".format(moduleoffset_textoffset))  
1519 - code_data = code_data[moduleoffset_textoffset:]  
1520 - if len(code_data) > 0:  
1521 - code_data = decompress_stream(code_data)  
1522 - # case-insensitive search in the code_modules dict to find the file extension:  
1523 - filext = code_modules.get(modulename_modulename.lower(), 'bin')  
1524 - filename = '{0}.{1}'.format(modulename_modulename, filext)  
1525 - #TODO: also yield the codepage so that callers can decode it properly  
1526 - yield (code_path, filename, code_data)  
1527 - # print '-'*79  
1528 - # print filename  
1529 - # print ''  
1530 - # print code_data  
1531 - # print ''  
1532 - log.debug('extracted file {0}'.format(filename))  
1533 - else:  
1534 - log.warning("module stream {0} has code data length 0".format(modulestreamname_streamname))  
1535 - _ = unused 1428 + if section_id == 0x0047:
  1429 + modulename_unicode_id = section_id
  1430 + modulename_unicode_sizeof_modulename_unicode = struct.unpack("<L", dir_stream.read(4))[0]
  1431 + modulename_unicode_modulename_unicode = dir_stream.read(modulename_unicode_sizeof_modulename_unicode)
  1432 + unused = modulename_unicode_id
  1433 + unused = modulename_unicode_modulename_unicode
  1434 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  1435 + if section_id == 0x001A:
  1436 + modulestreamname_id = section_id
  1437 + modulestreamname_sizeof_streamname = struct.unpack("<L", dir_stream.read(4))[0]
  1438 + modulestreamname_streamname = dir_stream.read(modulestreamname_sizeof_streamname)
  1439 + modulestreamname_reserved = struct.unpack("<H", dir_stream.read(2))[0]
  1440 + check_value('MODULESTREAMNAME_Reserved', 0x0032, modulestreamname_reserved)
  1441 + modulestreamname_sizeof_streamname_unicode = struct.unpack("<L", dir_stream.read(4))[0]
  1442 + modulestreamname_streamname_unicode = dir_stream.read(modulestreamname_sizeof_streamname_unicode)
  1443 + unused = modulestreamname_id
  1444 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  1445 + if section_id == 0x001C:
  1446 + moduledocstring_id = section_id
  1447 + check_value('MODULEDOCSTRING_Id', 0x001C, moduledocstring_id)
  1448 + moduledocstring_sizeof_docstring = struct.unpack("<L", dir_stream.read(4))[0]
  1449 + moduledocstring_docstring = dir_stream.read(moduledocstring_sizeof_docstring)
  1450 + moduledocstring_reserved = struct.unpack("<H", dir_stream.read(2))[0]
  1451 + check_value('MODULEDOCSTRING_Reserved', 0x0048, moduledocstring_reserved)
  1452 + moduledocstring_sizeof_docstring_unicode = struct.unpack("<L", dir_stream.read(4))[0]
  1453 + moduledocstring_docstring_unicode = dir_stream.read(moduledocstring_sizeof_docstring_unicode)
  1454 + unused = moduledocstring_docstring
  1455 + unused = moduledocstring_docstring_unicode
  1456 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  1457 + if section_id == 0x0031:
  1458 + moduleoffset_id = section_id
  1459 + check_value('MODULEOFFSET_Id', 0x0031, moduleoffset_id)
  1460 + moduleoffset_size = struct.unpack("<L", dir_stream.read(4))[0]
  1461 + check_value('MODULEOFFSET_Size', 0x0004, moduleoffset_size)
  1462 + moduleoffset_textoffset = struct.unpack("<L", dir_stream.read(4))[0]
  1463 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  1464 + if section_id == 0x001E:
  1465 + modulehelpcontext_id = section_id
  1466 + check_value('MODULEHELPCONTEXT_Id', 0x001E, modulehelpcontext_id)
  1467 + modulehelpcontext_size = struct.unpack("<L", dir_stream.read(4))[0]
  1468 + check_value('MODULEHELPCONTEXT_Size', 0x0004, modulehelpcontext_size)
  1469 + modulehelpcontext_helpcontext = struct.unpack("<L", dir_stream.read(4))[0]
  1470 + unused = modulehelpcontext_helpcontext
  1471 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  1472 + if section_id == 0x002C:
  1473 + modulecookie_id = section_id
  1474 + check_value('MODULECOOKIE_Id', 0x002C, modulecookie_id)
  1475 + modulecookie_size = struct.unpack("<L", dir_stream.read(4))[0]
  1476 + check_value('MODULECOOKIE_Size', 0x0002, modulecookie_size)
  1477 + modulecookie_cookie = struct.unpack("<H", dir_stream.read(2))[0]
  1478 + unused = modulecookie_cookie
  1479 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  1480 + if section_id == 0x0021 or section_id == 0x0022:
  1481 + moduletype_id = section_id
  1482 + moduletype_reserved = struct.unpack("<L", dir_stream.read(4))[0]
  1483 + unused = moduletype_id
  1484 + unused = moduletype_reserved
  1485 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  1486 + if section_id == 0x0025:
  1487 + modulereadonly_id = section_id
  1488 + check_value('MODULEREADONLY_Id', 0x0025, modulereadonly_id)
  1489 + modulereadonly_reserved = struct.unpack("<L", dir_stream.read(4))[0]
  1490 + check_value('MODULEREADONLY_Reserved', 0x0000, modulereadonly_reserved)
  1491 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  1492 + if section_id == 0x0028:
  1493 + moduleprivate_id = section_id
  1494 + check_value('MODULEPRIVATE_Id', 0x0028, moduleprivate_id)
  1495 + moduleprivate_reserved = struct.unpack("<L", dir_stream.read(4))[0]
  1496 + check_value('MODULEPRIVATE_Reserved', 0x0000, moduleprivate_reserved)
  1497 + section_id = struct.unpack("<H", dir_stream.read(2))[0]
  1498 + if section_id == 0x002B: # TERMINATOR
  1499 + module_reserved = struct.unpack("<L", dir_stream.read(4))[0]
  1500 + check_value('MODULE_Reserved', 0x0000, module_reserved)
  1501 + section_id = None
  1502 + if section_id != None:
  1503 + log.warning('unknown or invalid module section id {0:04X}'.format(section_id))
  1504 +
  1505 + log.debug('Project CodePage = %d' % projectcodepage_codepage)
  1506 + vba_codec = 'cp%d' % projectcodepage_codepage
  1507 + log.debug("ModuleName = {0}".format(modulename_modulename))
  1508 + log.debug("StreamName = {0}".format(repr(modulestreamname_streamname)))
  1509 + streamname_unicode = modulestreamname_streamname.decode(vba_codec)
  1510 + log.debug("StreamName.decode('%s') = %s" % (vba_codec, repr(streamname_unicode)))
  1511 + log.debug("StreamNameUnicode = {0}".format(repr(modulestreamname_streamname_unicode)))
  1512 + log.debug("TextOffset = {0}".format(moduleoffset_textoffset))
  1513 +
  1514 + code_path = vba_root + u'VBA/' + streamname_unicode
  1515 + #TODO: test if stream exists
  1516 + log.debug('opening VBA code stream %s' % repr(code_path))
  1517 + code_data = ole.openstream(code_path).read()
  1518 + log.debug("length of code_data = {0}".format(len(code_data)))
  1519 + log.debug("offset of code_data = {0}".format(moduleoffset_textoffset))
  1520 + code_data = code_data[moduleoffset_textoffset:]
  1521 + if len(code_data) > 0:
  1522 + code_data = decompress_stream(code_data)
  1523 + # case-insensitive search in the code_modules dict to find the file extension:
  1524 + filext = code_modules.get(modulename_modulename.lower(), 'bin')
  1525 + filename = '{0}.{1}'.format(modulename_modulename, filext)
  1526 + #TODO: also yield the codepage so that callers can decode it properly
  1527 + yield (code_path, filename, code_data)
  1528 + # print '-'*79
  1529 + # print filename
  1530 + # print ''
  1531 + # print code_data
  1532 + # print ''
  1533 + log.debug('extracted file {0}'.format(filename))
  1534 + else:
  1535 + log.warning("module stream {0} has code data length 0".format(modulestreamname_streamname))
  1536 + except Exception as exc:
  1537 + log.info('Error parsing module {} of {} in _extract_vba:'
  1538 + .format(projectmodule_index, projectmodules_count),
  1539 + exc_info=True)
  1540 + _ = unused # make pylint happy: now variable "unused" is being used ;-)
1536 return 1541 return
1537 1542
1538 1543