Commit 7374be1e6f51334f235d58c21e072069750c054b

Authored by decalage2
1 parent 55483626

olevba: added class VBA_Project

Showing 1 changed file with 531 additions and 454 deletions
oletools/olevba.py
@@ -262,6 +262,7 @@ import zlib @@ -262,6 +262,7 @@ import zlib
262 import email # for MHTML parsing 262 import email # for MHTML parsing
263 import string # for printable 263 import string # for printable
264 import json # for json output mode (argument --json) 264 import json # for json output mode (argument --json)
  265 +import codecs
265 266
266 # import lxml or ElementTree for XML parsing: 267 # import lxml or ElementTree for XML parsing:
267 try: 268 try:
@@ -1337,6 +1338,525 @@ def decompress_stream(compressed_container): @@ -1337,6 +1338,525 @@ def decompress_stream(compressed_container):
1337 return bytes(decompressed_container) 1338 return bytes(decompressed_container)
1338 1339
1339 1340
class VBA_Project(object):
    """
    Class to parse a VBA project from an OLE file, and to store all the corresponding
    metadata and VBA modules.

    The constructor parses the beginning of the (decompressed) "dir" stream:
    the project information records and the array of REFERENCE records, up to
    and including the Id of the PROJECTMODULES record (0x000F).
    parse_modules() then continues reading the same stream from that position,
    so it must only be called once per instance, after construction.

    Attributes set by the constructor:
    - ole, vba_root, project_path, dir_path, relaxed: constructor arguments
    - dir_stream: BytesIO object containing the decompressed dir stream
    - syskind, syskind_name: PROJECTSYSKIND value and its readable name
    - lcid, lcidinvoke: PROJECTLCID and PROJECTLCIDINVOKE values
    - codepage, codec: PROJECTCODEPAGE value and the Python codec used for it
    - projectname, docstring, docstring_unicode: decoded project strings
    """

    def __init__(self, ole, vba_root, project_path, dir_path, relaxed=False):
        """
        Parse the project information and references from the dir stream.

        :param ole: OleFileIO object containing the VBA project (only its
            openstream method is used by this class)
        :param vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream
        :param project_path: path to the PROJECT stream (stored, not read here)
        :param dir_path: path to the dir stream
        :param relaxed: If True, only create info/debug log entry if data is not as expected
                        (e.g. opening substream fails); if False, raise an error in this case
        :raises UnexpectedDataError: if a record does not have the expected
            value and relaxed is False, or if an unknown reference record Id
            is found (whatever the value of relaxed)
        """
        self.ole = ole
        self.vba_root = vba_root
        self.project_path = project_path
        self.dir_path = dir_path
        self.relaxed = relaxed
        log.debug('Parsing the dir stream from %r' % dir_path)
        # read data from dir stream (compressed)
        dir_compressed = ole.openstream(dir_path).read()
        # decompress it:
        dir_stream = BytesIO(decompress_stream(bytearray(dir_compressed)))
        # store reference for later use (also used by the _read_* helpers):
        self.dir_stream = dir_stream

        # reference: MS-OVBA 2.3.4.2 dir Stream: Version Independent Project Information

        # PROJECTSYSKIND Record
        # Specifies the platform for which the VBA project is created.
        self.check_value('PROJECTSYSKIND_Id', 0x0001, self._read_u16())
        self.check_value('PROJECTSYSKIND_Size', 0x0004, self._read_u32())
        self.syskind = self._read_u32()
        SYSKIND_NAME = {
            0x00: "16-bit Windows",
            0x01: "32-bit Windows",
            0x02: "Macintosh",
            0x03: "64-bit Windows"
        }
        self.syskind_name = SYSKIND_NAME.get(self.syskind, 'Unknown')
        log.debug("PROJECTSYSKIND_SysKind: %d - %s" % (self.syskind, self.syskind_name))
        if self.syskind not in SYSKIND_NAME:
            log.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(self.syskind))

        # PROJECTLCID Record
        # Specifies the VBA project's LCID.
        self.check_value('PROJECTLCID_Id', 0x0002, self._read_u16())
        self.check_value('PROJECTLCID_Size', 0x0004, self._read_u32())
        # Lcid (4 bytes): An unsigned integer that specifies the LCID value for the VBA project. MUST be 0x00000409.
        self.lcid = self._read_u32()
        self.check_value('PROJECTLCID_Lcid', 0x409, self.lcid)

        # PROJECTLCIDINVOKE Record
        # Specifies an LCID value used for Invoke calls on an Automation server as specified in [MS-OAUT] section 3.1.4.4.
        self.check_value('PROJECTLCIDINVOKE_Id', 0x0014, self._read_u16())
        self.check_value('PROJECTLCIDINVOKE_Size', 0x0004, self._read_u32())
        # LcidInvoke (4 bytes): An unsigned integer that specifies the LCID value used for Invoke calls. MUST be 0x00000409.
        self.lcidinvoke = self._read_u32()
        self.check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, self.lcidinvoke)

        # PROJECTCODEPAGE Record
        # Specifies the VBA project's code page.
        self.check_value('PROJECTCODEPAGE_Id', 0x0003, self._read_u16())
        self.check_value('PROJECTCODEPAGE_Size', 0x0002, self._read_u32())
        self.codepage = self._read_u16()
        log.debug('Project Code Page: %r' % self.codepage)
        if self.codepage in MAC_CODEPAGES:
            self.codec = MAC_CODEPAGES[self.codepage]
        else:
            self.codec = 'cp%d' % self.codepage
        # Make sure Python knows this codec, otherwise every later call to
        # decode_bytes would fail: fall back to UTF-8 in that case.
        try:
            codecs.lookup(self.codec)
        except LookupError:
            log.error('Codec not found for code page %d, using UTF-8 as fallback.' % self.codepage)
            self.codec = 'utf8'
        log.debug('Python codec corresponding to code page %d: %s' % (self.codepage, self.codec))

        # PROJECTNAME Record
        # Specifies a unique VBA identifier as the name of the VBA project.
        self.check_value('PROJECTNAME_Id', 0x0004, self._read_u16())
        sizeof_projectname = self._read_u32()
        log.debug('Project name size: %d bytes' % sizeof_projectname)
        if sizeof_projectname < 1 or sizeof_projectname > 128:
            # TODO: raise an actual error? What is MS Office's behaviour?
            log.error("PROJECTNAME_SizeOfProjectName value not in range [1-128]: {0}".format(sizeof_projectname))
        self.projectname = self.decode_bytes(dir_stream.read(sizeof_projectname))

        # PROJECTDOCSTRING Record
        # Specifies the description for the VBA project.
        self.check_value('PROJECTDOCSTRING_Id', 0x0005, self._read_u16())
        sizeof_docstring = self._read_u32()
        if sizeof_docstring > 2000:
            log.error(
                "PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(sizeof_docstring))
        # DocString (variable): An array of SizeOfDocString bytes that specifies the description for the VBA project.
        # MUST contain MBCS characters encoded using the code page specified in PROJECTCODEPAGE (section 2.3.4.2.1.4).
        # MUST NOT contain null characters.
        self.docstring = self.decode_bytes(dir_stream.read(sizeof_docstring))
        self.check_value('PROJECTDOCSTRING_Reserved', 0x0040, self._read_u16())
        sizeof_docstring_unicode = self._read_u32()
        if sizeof_docstring_unicode % 2 != 0:
            log.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even")
        # DocStringUnicode (variable): An array of SizeOfDocStringUnicode bytes that specifies the description for the
        # VBA project. MUST contain UTF-16 characters. MUST NOT contain null characters.
        # MUST contain the UTF-16 encoding of DocString.
        # Decoded explicitly as little-endian, like the module names in
        # parse_modules: plain 'utf16' depends on the host byte order and
        # on the presence of a BOM.
        docstring_unicode_bytes = dir_stream.read(sizeof_docstring_unicode)
        self.docstring_unicode = docstring_unicode_bytes.decode('utf-16-le', errors='replace')

        # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7
        self.check_value('PROJECTHELPFILEPATH_Id', 0x0006, self._read_u16())
        sizeof_helpfile1 = self._read_u32()
        if sizeof_helpfile1 > 260:
            log.error(
                "PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(sizeof_helpfile1))
        helpfile1 = dir_stream.read(sizeof_helpfile1)
        self.check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, self._read_u16())
        sizeof_helpfile2 = self._read_u32()
        if sizeof_helpfile2 != sizeof_helpfile1:
            log.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2")
        helpfile2 = dir_stream.read(sizeof_helpfile2)
        if helpfile2 != helpfile1:
            log.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2")

        # PROJECTHELPCONTEXT Record
        self.check_value('PROJECTHELPCONTEXT_Id', 0x0007, self._read_u16())
        self.check_value('PROJECTHELPCONTEXT_Size', 0x0004, self._read_u32())
        self._read_u32()  # HelpContext: read to advance in the stream, not used

        # PROJECTLIBFLAGS Record
        self.check_value('PROJECTLIBFLAGS_Id', 0x0008, self._read_u16())
        self.check_value('PROJECTLIBFLAGS_Size', 0x0004, self._read_u32())
        self.check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, self._read_u32())

        # PROJECTVERSION Record
        self.check_value('PROJECTVERSION_Id', 0x0009, self._read_u16())
        self.check_value('PROJECTVERSION_Reserved', 0x0004, self._read_u32())
        self._read_u32()  # VersionMajor, not used
        self._read_u16()  # VersionMinor, not used

        # PROJECTCONSTANTS Record
        self.check_value('PROJECTCONSTANTS_Id', 0x000C, self._read_u16())
        sizeof_constants = self._read_u32()
        if sizeof_constants > 1015:
            log.error(
                "PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(sizeof_constants))
        dir_stream.read(sizeof_constants)  # Constants, not used
        self.check_value('PROJECTCONSTANTS_Reserved', 0x003C, self._read_u16())
        sizeof_constants_unicode = self._read_u32()
        if sizeof_constants_unicode % 2 != 0:
            log.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even")
        dir_stream.read(sizeof_constants_unicode)  # ConstantsUnicode, not used

        # array of REFERENCE records, ended by the PROJECTMODULES record Id
        self._parse_references()

    def _read_u16(self):
        """Read a little-endian unsigned 16-bit integer from the dir stream."""
        return struct.unpack("<H", self.dir_stream.read(2))[0]

    def _read_u32(self):
        """Read a little-endian unsigned 32-bit integer from the dir stream."""
        return struct.unpack("<L", self.dir_stream.read(4))[0]

    def _read_sized_bytes(self):
        """Read a 32-bit size followed by that many bytes, and return the bytes."""
        size = self._read_u32()
        return self.dir_stream.read(size)

    def _parse_references(self):
        """
        Skip over the array of REFERENCE records of the dir stream.

        The content of the references is read only to advance in the stream,
        it is not stored. The method returns once the Id of the
        PROJECTMODULES record (0x000F) has been read.

        :raises UnexpectedDataError: if an unknown record Id is found
            (whatever the value of relaxed), or if a reserved field does not
            have the expected value and relaxed is False
        """
        while True:
            check = self._read_u16()
            log.debug("reference type = {0:04X}".format(check))
            if check == 0x000F:
                # PROJECTMODULES Id: end of the references array
                break

            if check == 0x0016:
                # REFERENCENAME
                self._read_sized_bytes()  # Name, not used
                reference_reserved = self._read_u16()
                # According to [MS-OVBA] 2.3.4.2.2.2 REFERENCENAME Record:
                # "Reserved (2 bytes): MUST be 0x003E. MUST be ignored."
                # So let's ignore it, otherwise it crashes on some files (issue #132)
                # PR #135 by @c1fe:
                # contrary to the specification I think that the unicode name
                # is optional. if reference_reserved is not 0x003E I think it
                # is actually the start of another REFERENCE record
                # at least when projectsyskind_syskind == 0x02 (Macintosh)
                if reference_reserved == 0x003E:
                    self._read_sized_bytes()  # NameUnicode, not used
                    continue
                # no unicode name: the value just read is the Id of the next
                # record, which is handled by the tests below
                check = reference_reserved
                log.debug("reference type = {0:04X}".format(check))

            if check == 0x0033:
                # REFERENCEORIGINAL (followed by REFERENCECONTROL)
                self._read_sized_bytes()  # LibidOriginal, not used

            elif check == 0x002F:
                # REFERENCECONTROL
                self._read_u32()  # SizeTwiddled, ignored
                self._read_sized_bytes()  # LibidTwiddled, not used
                self.check_value('REFERENCECONTROL_Reserved1', 0x0000, self._read_u32())
                self.check_value('REFERENCECONTROL_Reserved2', 0x0000, self._read_u16())
                # optional field: NameRecordExtended (same layout as REFERENCENAME)
                check2 = self._read_u16()
                if check2 == 0x0016:
                    self._read_sized_bytes()  # Name, not used
                    namerecordextended_reserved = self._read_u16()
                    if namerecordextended_reserved == 0x003E:
                        self._read_sized_bytes()  # NameUnicode, not used
                        referencecontrol_reserved3 = self._read_u16()
                    else:
                        # no unicode name: the value read is already Reserved3
                        referencecontrol_reserved3 = namerecordextended_reserved
                else:
                    referencecontrol_reserved3 = check2

                self.check_value('REFERENCECONTROL_Reserved3', 0x0030, referencecontrol_reserved3)
                self._read_u32()  # SizeExtended, not used
                self._read_sized_bytes()  # LibidExtended, not used
                self._read_u32()  # Reserved4, not used
                self._read_u16()  # Reserved5, not used
                self.dir_stream.read(16)  # OriginalTypeLib, not used
                self._read_u32()  # Cookie, not used

            elif check == 0x000D:
                # REFERENCEREGISTERED
                self._read_u32()  # Size, not used
                self._read_sized_bytes()  # Libid, not used
                self.check_value('REFERENCEREGISTERED_Reserved1', 0x0000, self._read_u32())
                self.check_value('REFERENCEREGISTERED_Reserved2', 0x0000, self._read_u16())

            elif check == 0x000E:
                # REFERENCEPROJECT
                self._read_u32()  # Size, not used
                self._read_sized_bytes()  # LibidAbsolute, not used
                self._read_sized_bytes()  # LibidRelative, not used
                self._read_u32()  # MajorVersion, not used
                self._read_u16()  # MinorVersion, not used

            else:
                log.error('invalid or unknown check Id {0:04X}'.format(check))
                # raise an exception instead of stopping abruptly (issue #180)
                raise UnexpectedDataError(self.dir_path, 'reference type',
                                          (0x0F, 0x16, 0x33, 0x2F, 0x0D, 0x0E), check)

    def check_value(self, name, expected, value):
        """
        Check that a value read from the dir stream is the expected one.

        :param name: str, name of the field, used in the log/error message
        :param expected: int, expected value
        :param value: int, value actually read
        :raises UnexpectedDataError: if the values differ and relaxed is False
            (if relaxed is True the mismatch is only logged as an error)
        """
        if expected != value:
            if self.relaxed:
                log.error("invalid value for {0} expected {1:04X} got {2:04X}"
                          .format(name, expected, value))
            else:
                raise UnexpectedDataError(self.dir_path, name, expected, value)

    def parse_modules(self):
        """
        Parse the PROJECTMODULES record of the dir stream and extract the
        source code of each VBA module.

        This is a generator, yielding (stream path, VBA filename, VBA source code)
        for each VBA code stream. It reads the dir stream from the position
        where the constructor stopped, so it can only be iterated once.

        :raises UnexpectedDataError: if a record does not have the expected
            value and relaxed is False
        :raises SubstreamOpenError: if the code stream of a module cannot be
            opened and relaxed is False
        Any other exception raised while parsing a module is logged, then
        re-raised only if relaxed is False.
        """
        dir_stream = self.dir_stream
        # The PROJECTMODULES Id (0x000F) has already been read by the
        # references loop of the constructor.
        self.check_value('PROJECTMODULES_Size', 0x0002, self._read_u32())
        projectmodules_count = self._read_u16()
        self.check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, self._read_u16())
        self.check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, self._read_u32())
        self._read_u16()  # Cookie, not used

        def uni_out(unicode_text):
            """Short function to simplify unicode text output."""
            return unicode_text.encode('utf-8', 'replace')

        log.debug("parsing {0} modules".format(projectmodules_count))
        for projectmodule_index in range(projectmodules_count):
            try:
                self.check_value('MODULENAME_Id', 0x0019, self._read_u16())
                modulename_modulename = self._read_sized_bytes()
                # Reset the values of the optional sections for each module,
                # so that a missing record is detected instead of silently
                # reusing the values of the previous module.
                modulename_unicode_modulename_unicode = ''
                modulestreamname_streamname = None
                modulestreamname_streamname_unicode = None
                moduleoffset_textoffset = None
                # account for optional sections
                section_id = self._read_u16()
                if section_id == 0x0047:
                    # MODULENAMEUNICODE
                    # just guessing that this is the same encoding as used in OleFileIO
                    modulename_unicode_modulename_unicode = self._read_sized_bytes().decode(
                        'UTF-16LE', 'replace')
                    section_id = self._read_u16()
                if section_id == 0x001A:
                    # MODULESTREAMNAME
                    modulestreamname_streamname = self._read_sized_bytes()
                    self.check_value('MODULESTREAMNAME_Reserved', 0x0032, self._read_u16())
                    # just guessing that this is the same encoding as used in OleFileIO
                    modulestreamname_streamname_unicode = self._read_sized_bytes().decode(
                        'UTF-16LE', 'replace')
                    section_id = self._read_u16()
                if section_id == 0x001C:
                    # MODULEDOCSTRING
                    self._read_sized_bytes()  # DocString, not used
                    self.check_value('MODULEDOCSTRING_Reserved', 0x0048, self._read_u16())
                    self._read_sized_bytes()  # DocStringUnicode, not used
                    section_id = self._read_u16()
                if section_id == 0x0031:
                    # MODULEOFFSET
                    self.check_value('MODULEOFFSET_Size', 0x0004, self._read_u32())
                    moduleoffset_textoffset = self._read_u32()
                    section_id = self._read_u16()
                if section_id == 0x001E:
                    # MODULEHELPCONTEXT
                    self.check_value('MODULEHELPCONTEXT_Size', 0x0004, self._read_u32())
                    self._read_u32()  # HelpContext, not used
                    section_id = self._read_u16()
                if section_id == 0x002C:
                    # MODULECOOKIE
                    self.check_value('MODULECOOKIE_Size', 0x0002, self._read_u32())
                    self._read_u16()  # Cookie, not used
                    section_id = self._read_u16()
                if section_id == 0x0021 or section_id == 0x0022:
                    # MODULETYPE
                    self._read_u32()  # Reserved, not used
                    section_id = self._read_u16()
                if section_id == 0x0025:
                    # MODULEREADONLY
                    self.check_value('MODULEREADONLY_Reserved', 0x0000, self._read_u32())
                    section_id = self._read_u16()
                if section_id == 0x0028:
                    # MODULEPRIVATE
                    self.check_value('MODULEPRIVATE_Reserved', 0x0000, self._read_u32())
                    section_id = self._read_u16()
                if section_id == 0x002B:  # TERMINATOR
                    self.check_value('MODULE_Reserved', 0x0000, self._read_u32())
                    section_id = None
                if section_id is not None:
                    log.warning('unknown or invalid module section id {0:04X}'.format(section_id))

                # The stream name and the text offset are both required to
                # locate the code. This is a generic exception on purpose:
                # it is handled below like any other parsing error (logged,
                # and ignored in relaxed mode).
                if modulestreamname_streamname is None or moduleoffset_textoffset is None:
                    raise ValueError(
                        'missing MODULESTREAMNAME or MODULEOFFSET record for module {0!r}'
                        .format(modulename_modulename))

                log.debug("ModuleName = {0}".format(modulename_modulename))
                log.debug("ModuleNameUnicode = {0}".format(uni_out(modulename_unicode_modulename_unicode)))
                log.debug("StreamName = {0}".format(modulestreamname_streamname))
                try:
                    streamname_unicode = self.decode_bytes(modulestreamname_streamname)
                except UnicodeError:
                    # log the raw bytes: there is no decoded name at this point
                    log.debug('failed to decode stream name {0!r} with codec {1}'
                              .format(modulestreamname_streamname, self.codec))
                    streamname_unicode = modulestreamname_streamname.decode(self.codec, errors='replace')
                log.debug("StreamName.decode('%s') = %s" % (self.codec, uni_out(streamname_unicode)))
                log.debug("StreamNameUnicode = {0}".format(uni_out(modulestreamname_streamname_unicode)))
                log.debug("TextOffset = {0}".format(moduleoffset_textoffset))

                # The code stream may be stored under any of these names:
                # try them in turn until one can be opened.
                code_data = None
                try_names = (streamname_unicode,
                             modulename_unicode_modulename_unicode,
                             modulestreamname_streamname_unicode)
                for stream_name in try_names:
                    # TODO: if olefile._find were less private, could replace this
                    #        try-except with calls to it
                    try:
                        code_path = self.vba_root + u'VBA/' + stream_name
                        log.debug('opening VBA code stream %s' % uni_out(code_path))
                        code_data = self.ole.openstream(code_path).read()
                        break
                    except IOError as ioe:
                        log.debug('failed to open stream VBA/%r (%r), try other name'
                                  % (uni_out(stream_name), ioe))

                if code_data is None:
                    log.info("Could not open stream %d of %d ('VBA/' + one of %r)!"
                             % (projectmodule_index, projectmodules_count,
                                '/'.join("'" + uni_out(stream_name) + "'"
                                         for stream_name in try_names)))
                    if self.relaxed:
                        continue  # ... with next submodule
                    else:
                        raise SubstreamOpenError('[BASE]', 'VBA/' +
                                                 uni_out(modulename_unicode_modulename_unicode))

                log.debug("length of code_data = {0}".format(len(code_data)))
                log.debug("offset of code_data = {0}".format(moduleoffset_textoffset))
                # the compressed source code starts at TextOffset in the stream
                code_data = code_data[moduleoffset_textoffset:]
                if len(code_data) > 0:
                    code_data = decompress_stream(bytearray(code_data))
                    filext = 'vba'
                    filename = '{0}.{1}'.format(modulename_modulename, filext)
                    # TODO: also yield the codepage so that callers can decode it properly
                    yield (code_path, filename, code_data)
                    log.debug('extracted file {0}'.format(filename))
                else:
                    log.warning("module stream {0} has code data length 0".format(modulestreamname_streamname))
            except (UnexpectedDataError, SubstreamOpenError):
                # these errors have already been filtered according to relaxed
                raise
            except Exception:
                log.info('Error parsing module {0} of {1} in VBA_Project.parse_modules:'
                         .format(projectmodule_index, projectmodules_count),
                         exc_info=True)
                if not self.relaxed:
                    raise

    def decode_bytes(self, bytes_string, errors='replace'):
        """
        Decode a bytes string to a unicode string, using the project code page
        :param bytes_string: bytes, bytes string to be decoded
        :param errors: str, mode to handle unicode conversion errors
        :return: str/unicode, decoded string
        """
        return bytes_string.decode(self.codec, errors=errors)
  1857 +
  1858 +
  1859 +
1340 def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): 1860 def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1341 """ 1861 """
1342 Extract VBA macros from an OleFileIO object. 1862 Extract VBA macros from an OleFileIO object.
@@ -1348,10 +1868,15 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): @@ -1348,10 +1868,15 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1348 (e.g. opening substream fails); if False, raise an error in this case 1868 (e.g. opening substream fails); if False, raise an error in this case
1349 This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream 1869 This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream
1350 """ 1870 """
1351 - # Open the PROJECT stream:  
1352 - project = ole.openstream(project_path)  
1353 log.debug('relaxed is %s' % relaxed) 1871 log.debug('relaxed is %s' % relaxed)
1354 1872
  1873 + project = VBA_Project(ole, vba_root, project_path, dir_path, relaxed=False)
  1874 +
  1875 + # Open the PROJECT stream:
  1876 + # reference: [MS-OVBA] 2.3.1 PROJECT Stream
  1877 + # TODO: in fact the PROJECT stream is encoded using the code page specified in the dir stream, should be read afterwards
  1878 + project_stream = ole.openstream(project_path)
  1879 +
1355 # sample content of the PROJECT stream: 1880 # sample content of the PROJECT stream:
1356 1881
1357 ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}" 1882 ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}"
@@ -1374,7 +1899,8 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): @@ -1374,7 +1899,8 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1374 1899
1375 code_modules = {} 1900 code_modules = {}
1376 1901
1377 - for line in project: 1902 + for line in project_stream:
  1903 + line = project.decode_bytes(line)
1378 log.debug('PROJECT: %r' % line) 1904 log.debug('PROJECT: %r' % line)
1379 line = line.strip() 1905 line = line.strip()
1380 if '=' in line: 1906 if '=' in line:
@@ -1396,457 +1922,8 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): @@ -1396,457 +1922,8 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1396 elif name == 'BaseClass': 1922 elif name == 'BaseClass':
1397 code_modules[value] = FORM_EXTENSION 1923 code_modules[value] = FORM_EXTENSION
1398 1924
1399 - # read data from dir stream (compressed)  
1400 - dir_compressed = ole.openstream(dir_path).read()  
1401 -  
1402 - def check_value(name, expected, value):  
1403 - if expected != value:  
1404 - if relaxed:  
1405 - log.error("invalid value for {0} expected {1:04X} got {2:04X}"  
1406 - .format(name, expected, value))  
1407 - else:  
1408 - raise UnexpectedDataError(dir_path, name, expected, value)  
1409 -  
1410 - dir_stream = BytesIO(decompress_stream(bytearray(dir_compressed)))  
1411 -  
1412 - # PROJECTSYSKIND Record  
1413 - projectsyskind_id = struct.unpack("<H", dir_stream.read(2))[0]  
1414 - check_value('PROJECTSYSKIND_Id', 0x0001, projectsyskind_id)  
1415 - projectsyskind_size = struct.unpack("<L", dir_stream.read(4))[0]  
1416 - check_value('PROJECTSYSKIND_Size', 0x0004, projectsyskind_size)  
1417 - projectsyskind_syskind = struct.unpack("<L", dir_stream.read(4))[0]  
1418 - if projectsyskind_syskind == 0x00:  
1419 - log.debug("16-bit Windows")  
1420 - elif projectsyskind_syskind == 0x01:  
1421 - log.debug("32-bit Windows")  
1422 - elif projectsyskind_syskind == 0x02:  
1423 - log.debug("Macintosh")  
1424 - elif projectsyskind_syskind == 0x03:  
1425 - log.debug("64-bit Windows")  
1426 - else:  
1427 - log.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(projectsyskind_syskind))  
1428 -  
1429 - # PROJECTLCID Record  
1430 - projectlcid_id = struct.unpack("<H", dir_stream.read(2))[0]  
1431 - check_value('PROJECTLCID_Id', 0x0002, projectlcid_id)  
1432 - projectlcid_size = struct.unpack("<L", dir_stream.read(4))[0]  
1433 - check_value('PROJECTLCID_Size', 0x0004, projectlcid_size)  
1434 - projectlcid_lcid = struct.unpack("<L", dir_stream.read(4))[0]  
1435 - check_value('PROJECTLCID_Lcid', 0x409, projectlcid_lcid)  
1436 -  
1437 - # PROJECTLCIDINVOKE Record  
1438 - projectlcidinvoke_id = struct.unpack("<H", dir_stream.read(2))[0]  
1439 - check_value('PROJECTLCIDINVOKE_Id', 0x0014, projectlcidinvoke_id)  
1440 - projectlcidinvoke_size = struct.unpack("<L", dir_stream.read(4))[0]  
1441 - check_value('PROJECTLCIDINVOKE_Size', 0x0004, projectlcidinvoke_size)  
1442 - projectlcidinvoke_lcidinvoke = struct.unpack("<L", dir_stream.read(4))[0]  
1443 - check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, projectlcidinvoke_lcidinvoke)  
1444 -  
1445 - # PROJECTCODEPAGE Record  
1446 - projectcodepage_id = struct.unpack("<H", dir_stream.read(2))[0]  
1447 - check_value('PROJECTCODEPAGE_Id', 0x0003, projectcodepage_id)  
1448 - projectcodepage_size = struct.unpack("<L", dir_stream.read(4))[0]  
1449 - check_value('PROJECTCODEPAGE_Size', 0x0002, projectcodepage_size)  
1450 - projectcodepage_codepage = struct.unpack("<H", dir_stream.read(2))[0]  
1451 -  
1452 - # PROJECTNAME Record  
1453 - projectname_id = struct.unpack("<H", dir_stream.read(2))[0]  
1454 - check_value('PROJECTNAME_Id', 0x0004, projectname_id)  
1455 - projectname_sizeof_projectname = struct.unpack("<L", dir_stream.read(4))[0]  
1456 - if projectname_sizeof_projectname < 1 or projectname_sizeof_projectname > 128:  
1457 - log.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(projectname_sizeof_projectname))  
1458 - projectname_projectname = dir_stream.read(projectname_sizeof_projectname)  
1459 - unused = projectname_projectname  
1460 -  
1461 - # PROJECTDOCSTRING Record  
1462 - projectdocstring_id = struct.unpack("<H", dir_stream.read(2))[0]  
1463 - check_value('PROJECTDOCSTRING_Id', 0x0005, projectdocstring_id)  
1464 - projectdocstring_sizeof_docstring = struct.unpack("<L", dir_stream.read(4))[0]  
1465 - if projectdocstring_sizeof_docstring > 2000:  
1466 - log.error(  
1467 - "PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(projectdocstring_sizeof_docstring))  
1468 - projectdocstring_docstring = dir_stream.read(projectdocstring_sizeof_docstring)  
1469 - projectdocstring_reserved = struct.unpack("<H", dir_stream.read(2))[0]  
1470 - check_value('PROJECTDOCSTRING_Reserved', 0x0040, projectdocstring_reserved)  
1471 - projectdocstring_sizeof_docstring_unicode = struct.unpack("<L", dir_stream.read(4))[0]  
1472 - if projectdocstring_sizeof_docstring_unicode % 2 != 0:  
1473 - log.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even")  
1474 - projectdocstring_docstring_unicode = dir_stream.read(projectdocstring_sizeof_docstring_unicode)  
1475 - unused = projectdocstring_docstring  
1476 - unused = projectdocstring_docstring_unicode  
1477 -  
1478 - # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7  
1479 - projecthelpfilepath_id = struct.unpack("<H", dir_stream.read(2))[0]  
1480 - check_value('PROJECTHELPFILEPATH_Id', 0x0006, projecthelpfilepath_id)  
1481 - projecthelpfilepath_sizeof_helpfile1 = struct.unpack("<L", dir_stream.read(4))[0]  
1482 - if projecthelpfilepath_sizeof_helpfile1 > 260:  
1483 - log.error(  
1484 - "PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(projecthelpfilepath_sizeof_helpfile1))  
1485 - projecthelpfilepath_helpfile1 = dir_stream.read(projecthelpfilepath_sizeof_helpfile1)  
1486 - projecthelpfilepath_reserved = struct.unpack("<H", dir_stream.read(2))[0]  
1487 - check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, projecthelpfilepath_reserved)  
1488 - projecthelpfilepath_sizeof_helpfile2 = struct.unpack("<L", dir_stream.read(4))[0]  
1489 - if projecthelpfilepath_sizeof_helpfile2 != projecthelpfilepath_sizeof_helpfile1:  
1490 - log.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2")  
1491 - projecthelpfilepath_helpfile2 = dir_stream.read(projecthelpfilepath_sizeof_helpfile2)  
1492 - if projecthelpfilepath_helpfile2 != projecthelpfilepath_helpfile1:  
1493 - log.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2")  
1494 -  
1495 - # PROJECTHELPCONTEXT Record  
1496 - projecthelpcontext_id = struct.unpack("<H", dir_stream.read(2))[0]  
1497 - check_value('PROJECTHELPCONTEXT_Id', 0x0007, projecthelpcontext_id)  
1498 - projecthelpcontext_size = struct.unpack("<L", dir_stream.read(4))[0]  
1499 - check_value('PROJECTHELPCONTEXT_Size', 0x0004, projecthelpcontext_size)  
1500 - projecthelpcontext_helpcontext = struct.unpack("<L", dir_stream.read(4))[0]  
1501 - unused = projecthelpcontext_helpcontext  
1502 -  
1503 - # PROJECTLIBFLAGS Record  
1504 - projectlibflags_id = struct.unpack("<H", dir_stream.read(2))[0]  
1505 - check_value('PROJECTLIBFLAGS_Id', 0x0008, projectlibflags_id)  
1506 - projectlibflags_size = struct.unpack("<L", dir_stream.read(4))[0]  
1507 - check_value('PROJECTLIBFLAGS_Size', 0x0004, projectlibflags_size)  
1508 - projectlibflags_projectlibflags = struct.unpack("<L", dir_stream.read(4))[0]  
1509 - check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, projectlibflags_projectlibflags)  
1510 -  
1511 - # PROJECTVERSION Record  
1512 - projectversion_id = struct.unpack("<H", dir_stream.read(2))[0]  
1513 - check_value('PROJECTVERSION_Id', 0x0009, projectversion_id)  
1514 - projectversion_reserved = struct.unpack("<L", dir_stream.read(4))[0]  
1515 - check_value('PROJECTVERSION_Reserved', 0x0004, projectversion_reserved)  
1516 - projectversion_versionmajor = struct.unpack("<L", dir_stream.read(4))[0]  
1517 - projectversion_versionminor = struct.unpack("<H", dir_stream.read(2))[0]  
1518 - unused = projectversion_versionmajor  
1519 - unused = projectversion_versionminor  
1520 -  
1521 - # PROJECTCONSTANTS Record  
1522 - projectconstants_id = struct.unpack("<H", dir_stream.read(2))[0]  
1523 - check_value('PROJECTCONSTANTS_Id', 0x000C, projectconstants_id)  
1524 - projectconstants_sizeof_constants = struct.unpack("<L", dir_stream.read(4))[0]  
1525 - if projectconstants_sizeof_constants > 1015:  
1526 - log.error(  
1527 - "PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(projectconstants_sizeof_constants))  
1528 - projectconstants_constants = dir_stream.read(projectconstants_sizeof_constants)  
1529 - projectconstants_reserved = struct.unpack("<H", dir_stream.read(2))[0]  
1530 - check_value('PROJECTCONSTANTS_Reserved', 0x003C, projectconstants_reserved)  
1531 - projectconstants_sizeof_constants_unicode = struct.unpack("<L", dir_stream.read(4))[0]  
1532 - if projectconstants_sizeof_constants_unicode % 2 != 0:  
1533 - log.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even")  
1534 - projectconstants_constants_unicode = dir_stream.read(projectconstants_sizeof_constants_unicode)  
1535 - unused = projectconstants_constants  
1536 - unused = projectconstants_constants_unicode  
1537 -  
1538 - # array of REFERENCE records  
1539 - check = None  
1540 - while True:  
1541 - check = struct.unpack("<H", dir_stream.read(2))[0]  
1542 - log.debug("reference type = {0:04X}".format(check))  
1543 - if check == 0x000F:  
1544 - break  
1545 -  
1546 - if check == 0x0016:  
1547 - # REFERENCENAME  
1548 - reference_id = check  
1549 - reference_sizeof_name = struct.unpack("<L", dir_stream.read(4))[0]  
1550 - reference_name = dir_stream.read(reference_sizeof_name)  
1551 - reference_reserved = struct.unpack("<H", dir_stream.read(2))[0]  
1552 - # According to [MS-OVBA] 2.3.4.2.2.2 REFERENCENAME Record:  
1553 - # "Reserved (2 bytes): MUST be 0x003E. MUST be ignored."  
1554 - # So let's ignore it, otherwise it crashes on some files (issue #132)  
1555 - # PR #135 by @c1fe:  
1556 - # contrary to the specification I think that the unicode name  
1557 - # is optional. if reference_reserved is not 0x003E I think it  
1558 - # is actually the start of another REFERENCE record  
1559 - # at least when projectsyskind_syskind == 0x02 (Macintosh)  
1560 - if reference_reserved == 0x003E:  
1561 - #if reference_reserved not in (0x003E, 0x000D):  
1562 - # raise UnexpectedDataError(dir_path, 'REFERENCE_Reserved',  
1563 - # 0x0003E, reference_reserved)  
1564 - reference_sizeof_name_unicode = struct.unpack("<L", dir_stream.read(4))[0]  
1565 - reference_name_unicode = dir_stream.read(reference_sizeof_name_unicode)  
1566 - unused = reference_id  
1567 - unused = reference_name  
1568 - unused = reference_name_unicode  
1569 - continue  
1570 - else:  
1571 - check = reference_reserved  
1572 - log.debug("reference type = {0:04X}".format(check))  
1573 -  
1574 - if check == 0x0033:  
1575 - # REFERENCEORIGINAL (followed by REFERENCECONTROL)  
1576 - referenceoriginal_id = check  
1577 - referenceoriginal_sizeof_libidoriginal = struct.unpack("<L", dir_stream.read(4))[0]  
1578 - referenceoriginal_libidoriginal = dir_stream.read(referenceoriginal_sizeof_libidoriginal)  
1579 - unused = referenceoriginal_id  
1580 - unused = referenceoriginal_libidoriginal  
1581 - continue  
1582 -  
1583 - if check == 0x002F:  
1584 - # REFERENCECONTROL  
1585 - referencecontrol_id = check  
1586 - referencecontrol_sizetwiddled = struct.unpack("<L", dir_stream.read(4))[0] # ignore  
1587 - referencecontrol_sizeof_libidtwiddled = struct.unpack("<L", dir_stream.read(4))[0]  
1588 - referencecontrol_libidtwiddled = dir_stream.read(referencecontrol_sizeof_libidtwiddled)  
1589 - referencecontrol_reserved1 = struct.unpack("<L", dir_stream.read(4))[0] # ignore  
1590 - check_value('REFERENCECONTROL_Reserved1', 0x0000, referencecontrol_reserved1)  
1591 - referencecontrol_reserved2 = struct.unpack("<H", dir_stream.read(2))[0] # ignore  
1592 - check_value('REFERENCECONTROL_Reserved2', 0x0000, referencecontrol_reserved2)  
1593 - unused = referencecontrol_id  
1594 - unused = referencecontrol_sizetwiddled  
1595 - unused = referencecontrol_libidtwiddled  
1596 - # optional field  
1597 - check2 = struct.unpack("<H", dir_stream.read(2))[0]  
1598 - if check2 == 0x0016:  
1599 - referencecontrol_namerecordextended_id = check  
1600 - referencecontrol_namerecordextended_sizeof_name = struct.unpack("<L", dir_stream.read(4))[0]  
1601 - referencecontrol_namerecordextended_name = dir_stream.read(  
1602 - referencecontrol_namerecordextended_sizeof_name)  
1603 - referencecontrol_namerecordextended_reserved = struct.unpack("<H", dir_stream.read(2))[0]  
1604 - if referencecontrol_namerecordextended_reserved == 0x003E:  
1605 - referencecontrol_namerecordextended_sizeof_name_unicode = struct.unpack("<L", dir_stream.read(4))[0]  
1606 - referencecontrol_namerecordextended_name_unicode = dir_stream.read(  
1607 - referencecontrol_namerecordextended_sizeof_name_unicode)  
1608 - referencecontrol_reserved3 = struct.unpack("<H", dir_stream.read(2))[0]  
1609 - unused = referencecontrol_namerecordextended_id  
1610 - unused = referencecontrol_namerecordextended_name  
1611 - unused = referencecontrol_namerecordextended_name_unicode  
1612 - else:  
1613 - referencecontrol_reserved3 = referencecontrol_namerecordextended_reserved  
1614 - else:  
1615 - referencecontrol_reserved3 = check2  
1616 -  
1617 - check_value('REFERENCECONTROL_Reserved3', 0x0030, referencecontrol_reserved3)  
1618 - referencecontrol_sizeextended = struct.unpack("<L", dir_stream.read(4))[0]  
1619 - referencecontrol_sizeof_libidextended = struct.unpack("<L", dir_stream.read(4))[0]  
1620 - referencecontrol_libidextended = dir_stream.read(referencecontrol_sizeof_libidextended)  
1621 - referencecontrol_reserved4 = struct.unpack("<L", dir_stream.read(4))[0]  
1622 - referencecontrol_reserved5 = struct.unpack("<H", dir_stream.read(2))[0]  
1623 - referencecontrol_originaltypelib = dir_stream.read(16)  
1624 - referencecontrol_cookie = struct.unpack("<L", dir_stream.read(4))[0]  
1625 - unused = referencecontrol_sizeextended  
1626 - unused = referencecontrol_libidextended  
1627 - unused = referencecontrol_reserved4  
1628 - unused = referencecontrol_reserved5  
1629 - unused = referencecontrol_originaltypelib  
1630 - unused = referencecontrol_cookie  
1631 - continue  
1632 -  
1633 - if check == 0x000D:  
1634 - # REFERENCEREGISTERED  
1635 - referenceregistered_id = check  
1636 - referenceregistered_size = struct.unpack("<L", dir_stream.read(4))[0]  
1637 - referenceregistered_sizeof_libid = struct.unpack("<L", dir_stream.read(4))[0]  
1638 - referenceregistered_libid = dir_stream.read(referenceregistered_sizeof_libid)  
1639 - referenceregistered_reserved1 = struct.unpack("<L", dir_stream.read(4))[0]  
1640 - check_value('REFERENCEREGISTERED_Reserved1', 0x0000, referenceregistered_reserved1)  
1641 - referenceregistered_reserved2 = struct.unpack("<H", dir_stream.read(2))[0]  
1642 - check_value('REFERENCEREGISTERED_Reserved2', 0x0000, referenceregistered_reserved2)  
1643 - unused = referenceregistered_id  
1644 - unused = referenceregistered_size  
1645 - unused = referenceregistered_libid  
1646 - continue  
1647 -  
1648 - if check == 0x000E:  
1649 - # REFERENCEPROJECT  
1650 - referenceproject_id = check  
1651 - referenceproject_size = struct.unpack("<L", dir_stream.read(4))[0]  
1652 - referenceproject_sizeof_libidabsolute = struct.unpack("<L", dir_stream.read(4))[0]  
1653 - referenceproject_libidabsolute = dir_stream.read(referenceproject_sizeof_libidabsolute)  
1654 - referenceproject_sizeof_libidrelative = struct.unpack("<L", dir_stream.read(4))[0]  
1655 - referenceproject_libidrelative = dir_stream.read(referenceproject_sizeof_libidrelative)  
1656 - referenceproject_majorversion = struct.unpack("<L", dir_stream.read(4))[0]  
1657 - referenceproject_minorversion = struct.unpack("<H", dir_stream.read(2))[0]  
1658 - unused = referenceproject_id  
1659 - unused = referenceproject_size  
1660 - unused = referenceproject_libidabsolute  
1661 - unused = referenceproject_libidrelative  
1662 - unused = referenceproject_majorversion  
1663 - unused = referenceproject_minorversion  
1664 - continue  
1665 -  
1666 - log.error('invalid or unknown check Id {0:04X}'.format(check))  
1667 - # raise an exception instead of stopping abruptly (issue #180)  
1668 - raise UnexpectedDataError(dir_path, 'reference type', (0x0F, 0x16, 0x33, 0x2F, 0x0D, 0x0E), check)  
1669 - #sys.exit(0)  
1670 -  
1671 - projectmodules_id = check #struct.unpack("<H", dir_stream.read(2))[0]  
1672 - check_value('PROJECTMODULES_Id', 0x000F, projectmodules_id)  
1673 - projectmodules_size = struct.unpack("<L", dir_stream.read(4))[0]  
1674 - check_value('PROJECTMODULES_Size', 0x0002, projectmodules_size)  
1675 - projectmodules_count = struct.unpack("<H", dir_stream.read(2))[0]  
1676 - projectmodules_projectcookierecord_id = struct.unpack("<H", dir_stream.read(2))[0]  
1677 - check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, projectmodules_projectcookierecord_id)  
1678 - projectmodules_projectcookierecord_size = struct.unpack("<L", dir_stream.read(4))[0]  
1679 - check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, projectmodules_projectcookierecord_size)  
1680 - projectmodules_projectcookierecord_cookie = struct.unpack("<H", dir_stream.read(2))[0]  
1681 - unused = projectmodules_projectcookierecord_cookie  
1682 -  
1683 - # short function to simplify unicode text output  
1684 - uni_out = lambda unicode_text: unicode_text.encode('utf-8', 'replace')  
1685 -  
1686 - log.debug("parsing {0} modules".format(projectmodules_count))  
1687 - for projectmodule_index in xrange(0, projectmodules_count):  
1688 - try:  
1689 - modulename_id = struct.unpack("<H", dir_stream.read(2))[0]  
1690 - check_value('MODULENAME_Id', 0x0019, modulename_id)  
1691 - modulename_sizeof_modulename = struct.unpack("<L", dir_stream.read(4))[0]  
1692 - modulename_modulename = dir_stream.read(modulename_sizeof_modulename)  
1693 - # TODO: preset variables to avoid "referenced before assignment" errors  
1694 - modulename_unicode_modulename_unicode = ''  
1695 - # account for optional sections  
1696 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
1697 - if section_id == 0x0047:  
1698 - modulename_unicode_id = section_id  
1699 - modulename_unicode_sizeof_modulename_unicode = struct.unpack("<L", dir_stream.read(4))[0]  
1700 - modulename_unicode_modulename_unicode = dir_stream.read(  
1701 - modulename_unicode_sizeof_modulename_unicode).decode('UTF-16LE', 'replace')  
1702 - # just guessing that this is the same encoding as used in OleFileIO  
1703 - unused = modulename_unicode_id  
1704 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
1705 - if section_id == 0x001A:  
1706 - modulestreamname_id = section_id  
1707 - modulestreamname_sizeof_streamname = struct.unpack("<L", dir_stream.read(4))[0]  
1708 - modulestreamname_streamname = dir_stream.read(modulestreamname_sizeof_streamname)  
1709 - modulestreamname_reserved = struct.unpack("<H", dir_stream.read(2))[0]  
1710 - check_value('MODULESTREAMNAME_Reserved', 0x0032, modulestreamname_reserved)  
1711 - modulestreamname_sizeof_streamname_unicode = struct.unpack("<L", dir_stream.read(4))[0]  
1712 - modulestreamname_streamname_unicode = dir_stream.read(  
1713 - modulestreamname_sizeof_streamname_unicode).decode('UTF-16LE', 'replace')  
1714 - # just guessing that this is the same encoding as used in OleFileIO  
1715 - unused = modulestreamname_id  
1716 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
1717 - if section_id == 0x001C:  
1718 - moduledocstring_id = section_id  
1719 - check_value('MODULEDOCSTRING_Id', 0x001C, moduledocstring_id)  
1720 - moduledocstring_sizeof_docstring = struct.unpack("<L", dir_stream.read(4))[0]  
1721 - moduledocstring_docstring = dir_stream.read(moduledocstring_sizeof_docstring)  
1722 - moduledocstring_reserved = struct.unpack("<H", dir_stream.read(2))[0]  
1723 - check_value('MODULEDOCSTRING_Reserved', 0x0048, moduledocstring_reserved)  
1724 - moduledocstring_sizeof_docstring_unicode = struct.unpack("<L", dir_stream.read(4))[0]  
1725 - moduledocstring_docstring_unicode = dir_stream.read(moduledocstring_sizeof_docstring_unicode)  
1726 - unused = moduledocstring_docstring  
1727 - unused = moduledocstring_docstring_unicode  
1728 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
1729 - if section_id == 0x0031:  
1730 - moduleoffset_id = section_id  
1731 - check_value('MODULEOFFSET_Id', 0x0031, moduleoffset_id)  
1732 - moduleoffset_size = struct.unpack("<L", dir_stream.read(4))[0]  
1733 - check_value('MODULEOFFSET_Size', 0x0004, moduleoffset_size)  
1734 - moduleoffset_textoffset = struct.unpack("<L", dir_stream.read(4))[0]  
1735 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
1736 - if section_id == 0x001E:  
1737 - modulehelpcontext_id = section_id  
1738 - check_value('MODULEHELPCONTEXT_Id', 0x001E, modulehelpcontext_id)  
1739 - modulehelpcontext_size = struct.unpack("<L", dir_stream.read(4))[0]  
1740 - check_value('MODULEHELPCONTEXT_Size', 0x0004, modulehelpcontext_size)  
1741 - modulehelpcontext_helpcontext = struct.unpack("<L", dir_stream.read(4))[0]  
1742 - unused = modulehelpcontext_helpcontext  
1743 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
1744 - if section_id == 0x002C:  
1745 - modulecookie_id = section_id  
1746 - check_value('MODULECOOKIE_Id', 0x002C, modulecookie_id)  
1747 - modulecookie_size = struct.unpack("<L", dir_stream.read(4))[0]  
1748 - check_value('MODULECOOKIE_Size', 0x0002, modulecookie_size)  
1749 - modulecookie_cookie = struct.unpack("<H", dir_stream.read(2))[0]  
1750 - unused = modulecookie_cookie  
1751 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
1752 - if section_id == 0x0021 or section_id == 0x0022:  
1753 - moduletype_id = section_id  
1754 - moduletype_reserved = struct.unpack("<L", dir_stream.read(4))[0]  
1755 - unused = moduletype_id  
1756 - unused = moduletype_reserved  
1757 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
1758 - if section_id == 0x0025:  
1759 - modulereadonly_id = section_id  
1760 - check_value('MODULEREADONLY_Id', 0x0025, modulereadonly_id)  
1761 - modulereadonly_reserved = struct.unpack("<L", dir_stream.read(4))[0]  
1762 - check_value('MODULEREADONLY_Reserved', 0x0000, modulereadonly_reserved)  
1763 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
1764 - if section_id == 0x0028:  
1765 - moduleprivate_id = section_id  
1766 - check_value('MODULEPRIVATE_Id', 0x0028, moduleprivate_id)  
1767 - moduleprivate_reserved = struct.unpack("<L", dir_stream.read(4))[0]  
1768 - check_value('MODULEPRIVATE_Reserved', 0x0000, moduleprivate_reserved)  
1769 - section_id = struct.unpack("<H", dir_stream.read(2))[0]  
1770 - if section_id == 0x002B: # TERMINATOR  
1771 - module_reserved = struct.unpack("<L", dir_stream.read(4))[0]  
1772 - check_value('MODULE_Reserved', 0x0000, module_reserved)  
1773 - section_id = None  
1774 - if section_id != None:  
1775 - log.warning('unknown or invalid module section id {0:04X}'.format(section_id))  
1776 -  
1777 - log.debug('Project CodePage = %d' % projectcodepage_codepage)  
1778 - if projectcodepage_codepage in MAC_CODEPAGES:  
1779 - vba_codec = MAC_CODEPAGES[projectcodepage_codepage]  
1780 - else:  
1781 - vba_codec = 'cp%d' % projectcodepage_codepage  
1782 - log.debug("ModuleName = {0}".format(modulename_modulename))  
1783 - log.debug("ModuleNameUnicode = {0}".format(uni_out(modulename_unicode_modulename_unicode)))  
1784 - log.debug("StreamName = {0}".format(modulestreamname_streamname))  
1785 - try:  
1786 - streamname_unicode = modulestreamname_streamname.decode(vba_codec)  
1787 - except UnicodeError as ue:  
1788 - log.debug('failed to decode stream name {0!r} with codec {1}'  
1789 - .format(uni_out(streamname_unicode), vba_codec))  
1790 - streamname_unicode = modulestreamname_streamname.decode(vba_codec, errors='replace')  
1791 - log.debug("StreamName.decode('%s') = %s" % (vba_codec, uni_out(streamname_unicode)))  
1792 - log.debug("StreamNameUnicode = {0}".format(uni_out(modulestreamname_streamname_unicode)))  
1793 - log.debug("TextOffset = {0}".format(moduleoffset_textoffset))  
1794 -  
1795 - code_data = None  
1796 - try_names = streamname_unicode, \  
1797 - modulename_unicode_modulename_unicode, \  
1798 - modulestreamname_streamname_unicode  
1799 - for stream_name in try_names:  
1800 - # TODO: if olefile._find were less private, could replace this  
1801 - # try-except with calls to it  
1802 - try:  
1803 - code_path = vba_root + u'VBA/' + stream_name  
1804 - log.debug('opening VBA code stream %s' % uni_out(code_path))  
1805 - code_data = ole.openstream(code_path).read()  
1806 - break  
1807 - except IOError as ioe:  
1808 - log.debug('failed to open stream VBA/%r (%r), try other name'  
1809 - % (uni_out(stream_name), ioe))  
1810 -  
1811 - if code_data is None:  
1812 - log.info("Could not open stream %d of %d ('VBA/' + one of %r)!"  
1813 - % (projectmodule_index, projectmodules_count,  
1814 - '/'.join("'" + uni_out(stream_name) + "'"  
1815 - for stream_name in try_names)))  
1816 - if relaxed:  
1817 - continue # ... with next submodule  
1818 - else:  
1819 - raise SubstreamOpenError('[BASE]', 'VBA/' +  
1820 - uni_out(modulename_unicode_modulename_unicode))  
1821 -  
1822 - log.debug("length of code_data = {0}".format(len(code_data)))  
1823 - log.debug("offset of code_data = {0}".format(moduleoffset_textoffset))  
1824 - code_data = code_data[moduleoffset_textoffset:]  
1825 - if len(code_data) > 0:  
1826 - code_data = decompress_stream(bytearray(code_data))  
1827 - # case-insensitive search in the code_modules dict to find the file extension:  
1828 - filext = code_modules.get(modulename_modulename.lower(), 'bin')  
1829 - filename = '{0}.{1}'.format(modulename_modulename, filext)  
1830 - #TODO: also yield the codepage so that callers can decode it properly  
1831 - yield (code_path, filename, code_data)  
1832 - # print '-'*79  
1833 - # print filename  
1834 - # print ''  
1835 - # print code_data  
1836 - # print ''  
1837 - log.debug('extracted file {0}'.format(filename))  
1838 - else:  
1839 - log.warning("module stream {0} has code data length 0".format(modulestreamname_streamname))  
1840 - except (UnexpectedDataError, SubstreamOpenError):  
1841 - raise  
1842 - except Exception as exc:  
1843 - log.info('Error parsing module {0} of {1} in _extract_vba:'  
1844 - .format(projectmodule_index, projectmodules_count),  
1845 - exc_info=True)  
1846 - if not relaxed:  
1847 - raise  
1848 - _ = unused # make pylint happy: now variable "unused" is being used ;-)  
1849 - return 1925 + for code_path, filename, code_data in project.parse_modules():
  1926 + yield (code_path, filename, code_data)
1850 1927
1851 1928
1852 def vba_collapse_long_lines(vba_code): 1929 def vba_collapse_long_lines(vba_code):