Commit 7374be1e6f51334f235d58c21e072069750c054b

Authored by decalage2
1 parent 55483626

olevba: added class VBA_Project

Showing 1 changed file with 531 additions and 454 deletions
oletools/olevba.py
... ... @@ -262,6 +262,7 @@ import zlib
262 262 import email # for MHTML parsing
263 263 import string # for printable
264 264 import json # for json output mode (argument --json)
  265 +import codecs
265 266  
266 267 # import lxml or ElementTree for XML parsing:
267 268 try:
... ... @@ -1337,6 +1338,525 @@ def decompress_stream(compressed_container):
1337 1338 return bytes(decompressed_container)
1338 1339  
1339 1340  
class VBA_Project(object):
    """
    Class to parse a VBA project from an OLE file, and to store all the corresponding
    metadata and VBA modules.

    The constructor parses the beginning of the "dir" stream (project information
    and references records), up to and including the Id of the PROJECTMODULES
    record. parse_modules() then continues reading the same stream from that
    position, so it is meant to be called once, after the constructor.
    """

    def __init__(self, ole, vba_root, project_path, dir_path, relaxed=False):
        """
        Extract VBA macros from an OleFileIO object.

        :param ole: OleFileIO object (or any object providing openstream(path))
        :param vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream
        :param project_path: path to the PROJECT stream
        :param dir_path: path to the dir stream
        :param relaxed: If True, only create info/debug log entry if data is not as expected
                        (e.g. opening substream fails); if False, raise an error in this case
        :raises UnexpectedDataError: if a record does not have the expected value and relaxed
                        is False, or if an unknown reference record type is found
        """
        self.ole = ole
        self.vba_root = vba_root
        self.project_path = project_path
        self.dir_path = dir_path
        self.relaxed = relaxed
        log.debug('Parsing the dir stream from %r' % dir_path)
        # read data from dir stream (compressed)
        dir_compressed = ole.openstream(dir_path).read()
        # decompress it:
        dir_stream = BytesIO(decompress_stream(bytearray(dir_compressed)))
        # store reference for later use (parse_modules continues reading it):
        self.dir_stream = dir_stream

        # reference: [MS-OVBA] 2.3.4.2 dir Stream: Version Independent Project Information

        # PROJECTSYSKIND Record
        # Specifies the platform for which the VBA project is created.
        projectsyskind_id = struct.unpack("<H", dir_stream.read(2))[0]
        self.check_value('PROJECTSYSKIND_Id', 0x0001, projectsyskind_id)
        projectsyskind_size = struct.unpack("<L", dir_stream.read(4))[0]
        self.check_value('PROJECTSYSKIND_Size', 0x0004, projectsyskind_size)
        self.syskind = struct.unpack("<L", dir_stream.read(4))[0]
        SYSKIND_NAME = {
            0x00: "16-bit Windows",
            0x01: "32-bit Windows",
            0x02: "Macintosh",
            0x03: "64-bit Windows"
        }
        self.syskind_name = SYSKIND_NAME.get(self.syskind, 'Unknown')
        log.debug("PROJECTSYSKIND_SysKind: %d - %s" % (self.syskind, self.syskind_name))
        if self.syskind not in SYSKIND_NAME:
            log.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(self.syskind))

        # PROJECTLCID Record
        # Specifies the VBA project's LCID.
        projectlcid_id = struct.unpack("<H", dir_stream.read(2))[0]
        self.check_value('PROJECTLCID_Id', 0x0002, projectlcid_id)
        projectlcid_size = struct.unpack("<L", dir_stream.read(4))[0]
        self.check_value('PROJECTLCID_Size', 0x0004, projectlcid_size)
        # Lcid (4 bytes): An unsigned integer that specifies the LCID value for the VBA project. MUST be 0x00000409.
        self.lcid = struct.unpack("<L", dir_stream.read(4))[0]
        self.check_value('PROJECTLCID_Lcid', 0x409, self.lcid)

        # PROJECTLCIDINVOKE Record
        # Specifies an LCID value used for Invoke calls on an Automation server as specified in [MS-OAUT] section 3.1.4.4.
        projectlcidinvoke_id = struct.unpack("<H", dir_stream.read(2))[0]
        self.check_value('PROJECTLCIDINVOKE_Id', 0x0014, projectlcidinvoke_id)
        projectlcidinvoke_size = struct.unpack("<L", dir_stream.read(4))[0]
        self.check_value('PROJECTLCIDINVOKE_Size', 0x0004, projectlcidinvoke_size)
        # LcidInvoke (4 bytes): An unsigned integer that specifies the LCID value used for Invoke calls. MUST be 0x00000409.
        self.lcidinvoke = struct.unpack("<L", dir_stream.read(4))[0]
        self.check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, self.lcidinvoke)

        # PROJECTCODEPAGE Record
        # Specifies the VBA project's code page.
        projectcodepage_id = struct.unpack("<H", dir_stream.read(2))[0]
        self.check_value('PROJECTCODEPAGE_Id', 0x0003, projectcodepage_id)
        projectcodepage_size = struct.unpack("<L", dir_stream.read(4))[0]
        self.check_value('PROJECTCODEPAGE_Size', 0x0002, projectcodepage_size)
        self.codepage = struct.unpack("<H", dir_stream.read(2))[0]
        log.debug('Project Code Page: %r' % self.codepage)
        if self.codepage in MAC_CODEPAGES:
            self.codec = MAC_CODEPAGES[self.codepage]
        else:
            self.codec = 'cp%d' % self.codepage
        # make sure Python knows this codec, otherwise fall back to UTF-8 so
        # that decode_bytes() never fails with a LookupError later on:
        try:
            codecs.lookup(self.codec)
        except LookupError:
            log.error('Codec not found for code page %d, using UTF-8 as fallback.' % self.codepage)
            self.codec = 'utf8'
        log.debug('Python codec corresponding to code page %d: %s' % (self.codepage, self.codec))

        # PROJECTNAME Record
        # Specifies a unique VBA identifier as the name of the VBA project.
        projectname_id = struct.unpack("<H", dir_stream.read(2))[0]
        self.check_value('PROJECTNAME_Id', 0x0004, projectname_id)
        sizeof_projectname = struct.unpack("<L", dir_stream.read(4))[0]
        log.debug('Project name size: %d bytes' % sizeof_projectname)
        if sizeof_projectname < 1 or sizeof_projectname > 128:
            # TODO: raise an actual error? What is MS Office's behaviour?
            log.error("PROJECTNAME_SizeOfProjectName value not in range [1-128]: {0}".format(sizeof_projectname))
        projectname_bytes = dir_stream.read(sizeof_projectname)
        self.projectname = self.decode_bytes(projectname_bytes)

        # PROJECTDOCSTRING Record
        # Specifies the description for the VBA project.
        projectdocstring_id = struct.unpack("<H", dir_stream.read(2))[0]
        self.check_value('PROJECTDOCSTRING_Id', 0x0005, projectdocstring_id)
        projectdocstring_sizeof_docstring = struct.unpack("<L", dir_stream.read(4))[0]
        if projectdocstring_sizeof_docstring > 2000:
            log.error(
                "PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(projectdocstring_sizeof_docstring))
        # DocString (variable): An array of SizeOfDocString bytes that specifies the description for the VBA project.
        # MUST contain MBCS characters encoded using the code page specified in PROJECTCODEPAGE (section 2.3.4.2.1.4).
        # MUST NOT contain null characters.
        docstring_bytes = dir_stream.read(projectdocstring_sizeof_docstring)
        self.docstring = self.decode_bytes(docstring_bytes)
        projectdocstring_reserved = struct.unpack("<H", dir_stream.read(2))[0]
        self.check_value('PROJECTDOCSTRING_Reserved', 0x0040, projectdocstring_reserved)
        projectdocstring_sizeof_docstring_unicode = struct.unpack("<L", dir_stream.read(4))[0]
        if projectdocstring_sizeof_docstring_unicode % 2 != 0:
            log.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even")
        # DocStringUnicode (variable): An array of SizeOfDocStringUnicode bytes that specifies the description for the
        # VBA project. MUST contain UTF-16 characters. MUST NOT contain null characters.
        # MUST contain the UTF-16 encoding of DocString.
        docstring_unicode_bytes = dir_stream.read(projectdocstring_sizeof_docstring_unicode)
        self.docstring_unicode = docstring_unicode_bytes.decode('utf16', errors='replace')

        # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7
        projecthelpfilepath_id = struct.unpack("<H", dir_stream.read(2))[0]
        self.check_value('PROJECTHELPFILEPATH_Id', 0x0006, projecthelpfilepath_id)
        projecthelpfilepath_sizeof_helpfile1 = struct.unpack("<L", dir_stream.read(4))[0]
        if projecthelpfilepath_sizeof_helpfile1 > 260:
            log.error(
                "PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(projecthelpfilepath_sizeof_helpfile1))
        projecthelpfilepath_helpfile1 = dir_stream.read(projecthelpfilepath_sizeof_helpfile1)
        projecthelpfilepath_reserved = struct.unpack("<H", dir_stream.read(2))[0]
        self.check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, projecthelpfilepath_reserved)
        projecthelpfilepath_sizeof_helpfile2 = struct.unpack("<L", dir_stream.read(4))[0]
        if projecthelpfilepath_sizeof_helpfile2 != projecthelpfilepath_sizeof_helpfile1:
            log.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2")
        projecthelpfilepath_helpfile2 = dir_stream.read(projecthelpfilepath_sizeof_helpfile2)
        if projecthelpfilepath_helpfile2 != projecthelpfilepath_helpfile1:
            log.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2")

        # PROJECTHELPCONTEXT Record
        projecthelpcontext_id = struct.unpack("<H", dir_stream.read(2))[0]
        self.check_value('PROJECTHELPCONTEXT_Id', 0x0007, projecthelpcontext_id)
        projecthelpcontext_size = struct.unpack("<L", dir_stream.read(4))[0]
        self.check_value('PROJECTHELPCONTEXT_Size', 0x0004, projecthelpcontext_size)
        projecthelpcontext_helpcontext = struct.unpack("<L", dir_stream.read(4))[0]
        unused = projecthelpcontext_helpcontext

        # PROJECTLIBFLAGS Record
        projectlibflags_id = struct.unpack("<H", dir_stream.read(2))[0]
        self.check_value('PROJECTLIBFLAGS_Id', 0x0008, projectlibflags_id)
        projectlibflags_size = struct.unpack("<L", dir_stream.read(4))[0]
        self.check_value('PROJECTLIBFLAGS_Size', 0x0004, projectlibflags_size)
        projectlibflags_projectlibflags = struct.unpack("<L", dir_stream.read(4))[0]
        self.check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, projectlibflags_projectlibflags)

        # PROJECTVERSION Record
        projectversion_id = struct.unpack("<H", dir_stream.read(2))[0]
        self.check_value('PROJECTVERSION_Id', 0x0009, projectversion_id)
        projectversion_reserved = struct.unpack("<L", dir_stream.read(4))[0]
        self.check_value('PROJECTVERSION_Reserved', 0x0004, projectversion_reserved)
        projectversion_versionmajor = struct.unpack("<L", dir_stream.read(4))[0]
        projectversion_versionminor = struct.unpack("<H", dir_stream.read(2))[0]
        unused = projectversion_versionmajor
        unused = projectversion_versionminor

        # PROJECTCONSTANTS Record
        projectconstants_id = struct.unpack("<H", dir_stream.read(2))[0]
        self.check_value('PROJECTCONSTANTS_Id', 0x000C, projectconstants_id)
        projectconstants_sizeof_constants = struct.unpack("<L", dir_stream.read(4))[0]
        if projectconstants_sizeof_constants > 1015:
            log.error(
                "PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(projectconstants_sizeof_constants))
        projectconstants_constants = dir_stream.read(projectconstants_sizeof_constants)
        projectconstants_reserved = struct.unpack("<H", dir_stream.read(2))[0]
        self.check_value('PROJECTCONSTANTS_Reserved', 0x003C, projectconstants_reserved)
        projectconstants_sizeof_constants_unicode = struct.unpack("<L", dir_stream.read(4))[0]
        if projectconstants_sizeof_constants_unicode % 2 != 0:
            log.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even")
        projectconstants_constants_unicode = dir_stream.read(projectconstants_sizeof_constants_unicode)
        unused = projectconstants_constants
        unused = projectconstants_constants_unicode

        # array of REFERENCE records
        # The loop stops when the Id of the PROJECTMODULES record (0x000F) is
        # read: that Id is consumed here, and parse_modules() relies on it.
        check = None
        while True:
            check = struct.unpack("<H", dir_stream.read(2))[0]
            log.debug("reference type = {0:04X}".format(check))
            if check == 0x000F:
                break

            if check == 0x0016:
                # REFERENCENAME
                reference_id = check
                reference_sizeof_name = struct.unpack("<L", dir_stream.read(4))[0]
                reference_name = dir_stream.read(reference_sizeof_name)
                reference_reserved = struct.unpack("<H", dir_stream.read(2))[0]
                # According to [MS-OVBA] 2.3.4.2.2.2 REFERENCENAME Record:
                # "Reserved (2 bytes): MUST be 0x003E. MUST be ignored."
                # So let's ignore it, otherwise it crashes on some files (issue #132)
                # PR #135 by @c1fe:
                # contrary to the specification I think that the unicode name
                # is optional. if reference_reserved is not 0x003E I think it
                # is actually the start of another REFERENCE record
                # at least when projectsyskind_syskind == 0x02 (Macintosh)
                if reference_reserved == 0x003E:
                    reference_sizeof_name_unicode = struct.unpack("<L", dir_stream.read(4))[0]
                    reference_name_unicode = dir_stream.read(reference_sizeof_name_unicode)
                    unused = reference_id
                    unused = reference_name
                    unused = reference_name_unicode
                    continue
                else:
                    # the 2 bytes just read are the Id of the next record:
                    # fall through to the checks below with that Id
                    check = reference_reserved
                    log.debug("reference type = {0:04X}".format(check))

            if check == 0x0033:
                # REFERENCEORIGINAL (followed by REFERENCECONTROL)
                referenceoriginal_id = check
                referenceoriginal_sizeof_libidoriginal = struct.unpack("<L", dir_stream.read(4))[0]
                referenceoriginal_libidoriginal = dir_stream.read(referenceoriginal_sizeof_libidoriginal)
                unused = referenceoriginal_id
                unused = referenceoriginal_libidoriginal
                continue

            if check == 0x002F:
                # REFERENCECONTROL
                referencecontrol_id = check
                referencecontrol_sizetwiddled = struct.unpack("<L", dir_stream.read(4))[0]  # ignore
                referencecontrol_sizeof_libidtwiddled = struct.unpack("<L", dir_stream.read(4))[0]
                referencecontrol_libidtwiddled = dir_stream.read(referencecontrol_sizeof_libidtwiddled)
                referencecontrol_reserved1 = struct.unpack("<L", dir_stream.read(4))[0]  # ignore
                self.check_value('REFERENCECONTROL_Reserved1', 0x0000, referencecontrol_reserved1)
                referencecontrol_reserved2 = struct.unpack("<H", dir_stream.read(2))[0]  # ignore
                self.check_value('REFERENCECONTROL_Reserved2', 0x0000, referencecontrol_reserved2)
                unused = referencecontrol_id
                unused = referencecontrol_sizetwiddled
                unused = referencecontrol_libidtwiddled
                # optional field
                check2 = struct.unpack("<H", dir_stream.read(2))[0]
                if check2 == 0x0016:
                    # the Id of the extended name record is the value just
                    # read (check2), not the Id of the enclosing record
                    referencecontrol_namerecordextended_id = check2
                    referencecontrol_namerecordextended_sizeof_name = struct.unpack("<L", dir_stream.read(4))[0]
                    referencecontrol_namerecordextended_name = dir_stream.read(
                        referencecontrol_namerecordextended_sizeof_name)
                    referencecontrol_namerecordextended_reserved = struct.unpack("<H", dir_stream.read(2))[0]
                    if referencecontrol_namerecordextended_reserved == 0x003E:
                        referencecontrol_namerecordextended_sizeof_name_unicode = struct.unpack("<L", dir_stream.read(4))[0]
                        referencecontrol_namerecordextended_name_unicode = dir_stream.read(
                            referencecontrol_namerecordextended_sizeof_name_unicode)
                        referencecontrol_reserved3 = struct.unpack("<H", dir_stream.read(2))[0]
                        unused = referencecontrol_namerecordextended_id
                        unused = referencecontrol_namerecordextended_name
                        unused = referencecontrol_namerecordextended_name_unicode
                    else:
                        referencecontrol_reserved3 = referencecontrol_namerecordextended_reserved
                else:
                    referencecontrol_reserved3 = check2

                self.check_value('REFERENCECONTROL_Reserved3', 0x0030, referencecontrol_reserved3)
                referencecontrol_sizeextended = struct.unpack("<L", dir_stream.read(4))[0]
                referencecontrol_sizeof_libidextended = struct.unpack("<L", dir_stream.read(4))[0]
                referencecontrol_libidextended = dir_stream.read(referencecontrol_sizeof_libidextended)
                referencecontrol_reserved4 = struct.unpack("<L", dir_stream.read(4))[0]
                referencecontrol_reserved5 = struct.unpack("<H", dir_stream.read(2))[0]
                referencecontrol_originaltypelib = dir_stream.read(16)
                referencecontrol_cookie = struct.unpack("<L", dir_stream.read(4))[0]
                unused = referencecontrol_sizeextended
                unused = referencecontrol_libidextended
                unused = referencecontrol_reserved4
                unused = referencecontrol_reserved5
                unused = referencecontrol_originaltypelib
                unused = referencecontrol_cookie
                continue

            if check == 0x000D:
                # REFERENCEREGISTERED
                referenceregistered_id = check
                referenceregistered_size = struct.unpack("<L", dir_stream.read(4))[0]
                referenceregistered_sizeof_libid = struct.unpack("<L", dir_stream.read(4))[0]
                referenceregistered_libid = dir_stream.read(referenceregistered_sizeof_libid)
                referenceregistered_reserved1 = struct.unpack("<L", dir_stream.read(4))[0]
                self.check_value('REFERENCEREGISTERED_Reserved1', 0x0000, referenceregistered_reserved1)
                referenceregistered_reserved2 = struct.unpack("<H", dir_stream.read(2))[0]
                self.check_value('REFERENCEREGISTERED_Reserved2', 0x0000, referenceregistered_reserved2)
                unused = referenceregistered_id
                unused = referenceregistered_size
                unused = referenceregistered_libid
                continue

            if check == 0x000E:
                # REFERENCEPROJECT
                referenceproject_id = check
                referenceproject_size = struct.unpack("<L", dir_stream.read(4))[0]
                referenceproject_sizeof_libidabsolute = struct.unpack("<L", dir_stream.read(4))[0]
                referenceproject_libidabsolute = dir_stream.read(referenceproject_sizeof_libidabsolute)
                referenceproject_sizeof_libidrelative = struct.unpack("<L", dir_stream.read(4))[0]
                referenceproject_libidrelative = dir_stream.read(referenceproject_sizeof_libidrelative)
                referenceproject_majorversion = struct.unpack("<L", dir_stream.read(4))[0]
                referenceproject_minorversion = struct.unpack("<H", dir_stream.read(2))[0]
                unused = referenceproject_id
                unused = referenceproject_size
                unused = referenceproject_libidabsolute
                unused = referenceproject_libidrelative
                unused = referenceproject_majorversion
                unused = referenceproject_minorversion
                continue

            log.error('invalid or unknown check Id {0:04X}'.format(check))
            # raise an exception instead of stopping abruptly (issue #180)
            raise UnexpectedDataError(dir_path, 'reference type', (0x0F, 0x16, 0x33, 0x2F, 0x0D, 0x0E), check)

    def check_value(self, name, expected, value):
        """
        Check that a value read from the dir stream is the expected one.

        :param name: str, name of the field being checked (used in messages)
        :param expected: int, value required for that field
        :param value: int, value actually read from the stream
        :raises UnexpectedDataError: if the values differ and self.relaxed is
            False. If self.relaxed is True, an error is only logged.
        """
        if expected != value:
            if self.relaxed:
                log.error("invalid value for {0} expected {1:04X} got {2:04X}"
                          .format(name, expected, value))
            else:
                raise UnexpectedDataError(self.dir_path, name, expected, value)

    def parse_modules(self):
        """
        Parse the PROJECTMODULES record of the dir stream and extract the code
        of each VBA module.

        This is a generator, yielding a tuple (stream path, VBA filename,
        decompressed VBA source code) for each module whose code stream could
        be opened and is not empty. It reads self.dir_stream from the position
        where the constructor stopped (just after the PROJECTMODULES Id).

        :raises UnexpectedDataError: if a record does not have the expected
            value and self.relaxed is False
        :raises SubstreamOpenError: if the code stream of a module cannot be
            opened and self.relaxed is False
        """
        dir_stream = self.dir_stream
        # projectmodules_id has already been read by the reference loop of
        # the constructor = 0x000F
        projectmodules_size = struct.unpack("<L", dir_stream.read(4))[0]
        self.check_value('PROJECTMODULES_Size', 0x0002, projectmodules_size)
        projectmodules_count = struct.unpack("<H", dir_stream.read(2))[0]
        projectmodules_projectcookierecord_id = struct.unpack("<H", dir_stream.read(2))[0]
        self.check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, projectmodules_projectcookierecord_id)
        projectmodules_projectcookierecord_size = struct.unpack("<L", dir_stream.read(4))[0]
        self.check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, projectmodules_projectcookierecord_size)
        projectmodules_projectcookierecord_cookie = struct.unpack("<H", dir_stream.read(2))[0]
        unused = projectmodules_projectcookierecord_cookie

        def uni_out(unicode_text):
            """Short function to simplify unicode text output (UTF-8 encoding)"""
            return unicode_text.encode('utf-8', 'replace')

        log.debug("parsing {0} modules".format(projectmodules_count))
        for projectmodule_index in xrange(0, projectmodules_count):
            try:
                modulename_id = struct.unpack("<H", dir_stream.read(2))[0]
                self.check_value('MODULENAME_Id', 0x0019, modulename_id)
                modulename_sizeof_modulename = struct.unpack("<L", dir_stream.read(4))[0]
                modulename_modulename = dir_stream.read(modulename_sizeof_modulename)
                # Reset the values coming from optional sections for each
                # module: otherwise a module without one of these records
                # would silently reuse the value of the previous module (or
                # trigger a "referenced before assignment" error).
                modulename_unicode_modulename_unicode = ''
                modulestreamname_streamname = None
                modulestreamname_streamname_unicode = ''
                moduleoffset_textoffset = None
                # account for optional sections
                section_id = struct.unpack("<H", dir_stream.read(2))[0]
                if section_id == 0x0047:
                    modulename_unicode_id = section_id
                    modulename_unicode_sizeof_modulename_unicode = struct.unpack("<L", dir_stream.read(4))[0]
                    modulename_unicode_modulename_unicode = dir_stream.read(
                        modulename_unicode_sizeof_modulename_unicode).decode('UTF-16LE', 'replace')
                    # just guessing that this is the same encoding as used in OleFileIO
                    unused = modulename_unicode_id
                    section_id = struct.unpack("<H", dir_stream.read(2))[0]
                if section_id == 0x001A:
                    modulestreamname_id = section_id
                    modulestreamname_sizeof_streamname = struct.unpack("<L", dir_stream.read(4))[0]
                    modulestreamname_streamname = dir_stream.read(modulestreamname_sizeof_streamname)
                    modulestreamname_reserved = struct.unpack("<H", dir_stream.read(2))[0]
                    self.check_value('MODULESTREAMNAME_Reserved', 0x0032, modulestreamname_reserved)
                    modulestreamname_sizeof_streamname_unicode = struct.unpack("<L", dir_stream.read(4))[0]
                    modulestreamname_streamname_unicode = dir_stream.read(
                        modulestreamname_sizeof_streamname_unicode).decode('UTF-16LE', 'replace')
                    # just guessing that this is the same encoding as used in OleFileIO
                    unused = modulestreamname_id
                    section_id = struct.unpack("<H", dir_stream.read(2))[0]
                if section_id == 0x001C:
                    moduledocstring_id = section_id
                    self.check_value('MODULEDOCSTRING_Id', 0x001C, moduledocstring_id)
                    moduledocstring_sizeof_docstring = struct.unpack("<L", dir_stream.read(4))[0]
                    moduledocstring_docstring = dir_stream.read(moduledocstring_sizeof_docstring)
                    moduledocstring_reserved = struct.unpack("<H", dir_stream.read(2))[0]
                    self.check_value('MODULEDOCSTRING_Reserved', 0x0048, moduledocstring_reserved)
                    moduledocstring_sizeof_docstring_unicode = struct.unpack("<L", dir_stream.read(4))[0]
                    moduledocstring_docstring_unicode = dir_stream.read(moduledocstring_sizeof_docstring_unicode)
                    unused = moduledocstring_docstring
                    unused = moduledocstring_docstring_unicode
                    section_id = struct.unpack("<H", dir_stream.read(2))[0]
                if section_id == 0x0031:
                    moduleoffset_id = section_id
                    self.check_value('MODULEOFFSET_Id', 0x0031, moduleoffset_id)
                    moduleoffset_size = struct.unpack("<L", dir_stream.read(4))[0]
                    self.check_value('MODULEOFFSET_Size', 0x0004, moduleoffset_size)
                    moduleoffset_textoffset = struct.unpack("<L", dir_stream.read(4))[0]
                    section_id = struct.unpack("<H", dir_stream.read(2))[0]
                if section_id == 0x001E:
                    modulehelpcontext_id = section_id
                    self.check_value('MODULEHELPCONTEXT_Id', 0x001E, modulehelpcontext_id)
                    modulehelpcontext_size = struct.unpack("<L", dir_stream.read(4))[0]
                    self.check_value('MODULEHELPCONTEXT_Size', 0x0004, modulehelpcontext_size)
                    modulehelpcontext_helpcontext = struct.unpack("<L", dir_stream.read(4))[0]
                    unused = modulehelpcontext_helpcontext
                    section_id = struct.unpack("<H", dir_stream.read(2))[0]
                if section_id == 0x002C:
                    modulecookie_id = section_id
                    self.check_value('MODULECOOKIE_Id', 0x002C, modulecookie_id)
                    modulecookie_size = struct.unpack("<L", dir_stream.read(4))[0]
                    self.check_value('MODULECOOKIE_Size', 0x0002, modulecookie_size)
                    modulecookie_cookie = struct.unpack("<H", dir_stream.read(2))[0]
                    unused = modulecookie_cookie
                    section_id = struct.unpack("<H", dir_stream.read(2))[0]
                if section_id == 0x0021 or section_id == 0x0022:
                    moduletype_id = section_id
                    moduletype_reserved = struct.unpack("<L", dir_stream.read(4))[0]
                    unused = moduletype_id
                    unused = moduletype_reserved
                    section_id = struct.unpack("<H", dir_stream.read(2))[0]
                if section_id == 0x0025:
                    modulereadonly_id = section_id
                    self.check_value('MODULEREADONLY_Id', 0x0025, modulereadonly_id)
                    modulereadonly_reserved = struct.unpack("<L", dir_stream.read(4))[0]
                    self.check_value('MODULEREADONLY_Reserved', 0x0000, modulereadonly_reserved)
                    section_id = struct.unpack("<H", dir_stream.read(2))[0]
                if section_id == 0x0028:
                    moduleprivate_id = section_id
                    self.check_value('MODULEPRIVATE_Id', 0x0028, moduleprivate_id)
                    moduleprivate_reserved = struct.unpack("<L", dir_stream.read(4))[0]
                    self.check_value('MODULEPRIVATE_Reserved', 0x0000, moduleprivate_reserved)
                    section_id = struct.unpack("<H", dir_stream.read(2))[0]
                if section_id == 0x002B:  # TERMINATOR
                    module_reserved = struct.unpack("<L", dir_stream.read(4))[0]
                    self.check_value('MODULE_Reserved', 0x0000, module_reserved)
                    section_id = None
                if section_id is not None:
                    log.warning('unknown or invalid module section id {0:04X}'.format(section_id))

                log.debug("ModuleName = {0}".format(modulename_modulename))
                log.debug("ModuleNameUnicode = {0}".format(uni_out(modulename_unicode_modulename_unicode)))
                log.debug("StreamName = {0}".format(modulestreamname_streamname))
                if modulestreamname_streamname is None:
                    # no MODULESTREAMNAME record: only the other names can be tried
                    log.warning('no MODULESTREAMNAME record for module {0} of {1}'
                                .format(projectmodule_index, projectmodules_count))
                    streamname_unicode = ''
                else:
                    # decode_bytes replaces invalid bytes, so it cannot raise
                    # a UnicodeError here
                    streamname_unicode = self.decode_bytes(modulestreamname_streamname)
                log.debug("StreamName.decode('%s') = %s" % (self.codec, uni_out(streamname_unicode)))
                log.debug("StreamNameUnicode = {0}".format(uni_out(modulestreamname_streamname_unicode)))
                log.debug("TextOffset = {0}".format(moduleoffset_textoffset))
                if moduleoffset_textoffset is None:
                    # without the offset, the compressed source code cannot be
                    # located within the module stream
                    raise ValueError('no MODULEOFFSET record for module {0} of {1}'
                                     .format(projectmodule_index, projectmodules_count))

                code_data = None
                # candidate stream names, skipping those that were not provided:
                try_names = tuple(name for name in (streamname_unicode,
                                                    modulename_unicode_modulename_unicode,
                                                    modulestreamname_streamname_unicode)
                                  if name)
                for stream_name in try_names:
                    # TODO: if olefile._find were less private, could replace this
                    #        try-except with calls to it
                    try:
                        code_path = self.vba_root + u'VBA/' + stream_name
                        log.debug('opening VBA code stream %s' % uni_out(code_path))
                        code_data = self.ole.openstream(code_path).read()
                        break
                    except IOError as ioe:
                        log.debug('failed to open stream VBA/%r (%r), try other name'
                                  % (uni_out(stream_name), ioe))

                if code_data is None:
                    log.info("Could not open stream %d of %d ('VBA/' + one of %r)!"
                             % (projectmodule_index, projectmodules_count,
                                '/'.join("'" + uni_out(stream_name) + "'"
                                         for stream_name in try_names)))
                    if self.relaxed:
                        continue  # ... with next submodule
                    else:
                        raise SubstreamOpenError('[BASE]', 'VBA/' +
                                                 uni_out(modulename_unicode_modulename_unicode))

                log.debug("length of code_data = {0}".format(len(code_data)))
                log.debug("offset of code_data = {0}".format(moduleoffset_textoffset))
                # the compressed source code starts at TextOffset in the stream:
                code_data = code_data[moduleoffset_textoffset:]
                if len(code_data) > 0:
                    code_data = decompress_stream(bytearray(code_data))
                    # case-insensitive search in the code_modules dict to find the file extension:
                    # filext = code_modules.get(modulename_modulename.lower(), 'bin')
                    filext = 'vba'
                    filename = '{0}.{1}'.format(modulename_modulename, filext)
                    # TODO: also yield the codepage so that callers can decode it properly
                    yield (code_path, filename, code_data)
                    log.debug('extracted file {0}'.format(filename))
                else:
                    log.warning("module stream {0} has code data length 0".format(modulestreamname_streamname))
            except (UnexpectedDataError, SubstreamOpenError):
                raise
            except Exception:
                # any other error: skip this module in relaxed mode, else re-raise
                log.info('Error parsing module {0} of {1} in _extract_vba:'
                         .format(projectmodule_index, projectmodules_count),
                         exc_info=True)
                if not self.relaxed:
                    raise
        _ = unused  # make pylint happy: now variable "unused" is being used ;-)
        return

    def decode_bytes(self, bytes_string, errors='replace'):
        """
        Decode a bytes string to a unicode string, using the project code page
        :param bytes_string: bytes, bytes string to be decoded
        :param errors: str, mode to handle unicode conversion errors
        :return: str/unicode, decoded string
        """
        return bytes_string.decode(self.codec, errors=errors)
  1857 +
  1858 +
  1859 +
1340 1860 def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1341 1861 """
1342 1862 Extract VBA macros from an OleFileIO object.
... ... @@ -1348,10 +1868,15 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1348 1868 (e.g. opening substream fails); if False, raise an error in this case
1349 1869 This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream
1350 1870 """
1351   - # Open the PROJECT stream:
1352   - project = ole.openstream(project_path)
1353 1871 log.debug('relaxed is %s' % relaxed)
1354 1872  
  1873 + project = VBA_Project(ole, vba_root, project_path, dir_path, relaxed=False)
  1874 +
  1875 + # Open the PROJECT stream:
  1876 + # reference: [MS-OVBA] 2.3.1 PROJECT Stream
  1877 + # TODO: in fact the PROJECT stream is encoded using the code page specified in the dir stream, should be read afterwards
  1878 + project_stream = ole.openstream(project_path)
  1879 +
1355 1880 # sample content of the PROJECT stream:
1356 1881  
1357 1882 ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}"
... ... @@ -1374,7 +1899,8 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1374 1899  
1375 1900 code_modules = {}
1376 1901  
1377   - for line in project:
  1902 + for line in project_stream:
  1903 + line = project.decode_bytes(line)
1378 1904 log.debug('PROJECT: %r' % line)
1379 1905 line = line.strip()
1380 1906 if '=' in line:
... ... @@ -1396,457 +1922,8 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1396 1922 elif name == 'BaseClass':
1397 1923 code_modules[value] = FORM_EXTENSION
1398 1924  
1399   - # read data from dir stream (compressed)
1400   - dir_compressed = ole.openstream(dir_path).read()
1401   -
1402   - def check_value(name, expected, value):
1403   - if expected != value:
1404   - if relaxed:
1405   - log.error("invalid value for {0} expected {1:04X} got {2:04X}"
1406   - .format(name, expected, value))
1407   - else:
1408   - raise UnexpectedDataError(dir_path, name, expected, value)
1409   -
1410   - dir_stream = BytesIO(decompress_stream(bytearray(dir_compressed)))
1411   -
1412   - # PROJECTSYSKIND Record
1413   - projectsyskind_id = struct.unpack("<H", dir_stream.read(2))[0]
1414   - check_value('PROJECTSYSKIND_Id', 0x0001, projectsyskind_id)
1415   - projectsyskind_size = struct.unpack("<L", dir_stream.read(4))[0]
1416   - check_value('PROJECTSYSKIND_Size', 0x0004, projectsyskind_size)
1417   - projectsyskind_syskind = struct.unpack("<L", dir_stream.read(4))[0]
1418   - if projectsyskind_syskind == 0x00:
1419   - log.debug("16-bit Windows")
1420   - elif projectsyskind_syskind == 0x01:
1421   - log.debug("32-bit Windows")
1422   - elif projectsyskind_syskind == 0x02:
1423   - log.debug("Macintosh")
1424   - elif projectsyskind_syskind == 0x03:
1425   - log.debug("64-bit Windows")
1426   - else:
1427   - log.error("invalid PROJECTSYSKIND_SysKind {0:04X}".format(projectsyskind_syskind))
1428   -
1429   - # PROJECTLCID Record
1430   - projectlcid_id = struct.unpack("<H", dir_stream.read(2))[0]
1431   - check_value('PROJECTLCID_Id', 0x0002, projectlcid_id)
1432   - projectlcid_size = struct.unpack("<L", dir_stream.read(4))[0]
1433   - check_value('PROJECTLCID_Size', 0x0004, projectlcid_size)
1434   - projectlcid_lcid = struct.unpack("<L", dir_stream.read(4))[0]
1435   - check_value('PROJECTLCID_Lcid', 0x409, projectlcid_lcid)
1436   -
1437   - # PROJECTLCIDINVOKE Record
1438   - projectlcidinvoke_id = struct.unpack("<H", dir_stream.read(2))[0]
1439   - check_value('PROJECTLCIDINVOKE_Id', 0x0014, projectlcidinvoke_id)
1440   - projectlcidinvoke_size = struct.unpack("<L", dir_stream.read(4))[0]
1441   - check_value('PROJECTLCIDINVOKE_Size', 0x0004, projectlcidinvoke_size)
1442   - projectlcidinvoke_lcidinvoke = struct.unpack("<L", dir_stream.read(4))[0]
1443   - check_value('PROJECTLCIDINVOKE_LcidInvoke', 0x409, projectlcidinvoke_lcidinvoke)
1444   -
1445   - # PROJECTCODEPAGE Record
1446   - projectcodepage_id = struct.unpack("<H", dir_stream.read(2))[0]
1447   - check_value('PROJECTCODEPAGE_Id', 0x0003, projectcodepage_id)
1448   - projectcodepage_size = struct.unpack("<L", dir_stream.read(4))[0]
1449   - check_value('PROJECTCODEPAGE_Size', 0x0002, projectcodepage_size)
1450   - projectcodepage_codepage = struct.unpack("<H", dir_stream.read(2))[0]
1451   -
1452   - # PROJECTNAME Record
1453   - projectname_id = struct.unpack("<H", dir_stream.read(2))[0]
1454   - check_value('PROJECTNAME_Id', 0x0004, projectname_id)
1455   - projectname_sizeof_projectname = struct.unpack("<L", dir_stream.read(4))[0]
1456   - if projectname_sizeof_projectname < 1 or projectname_sizeof_projectname > 128:
1457   - log.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(projectname_sizeof_projectname))
1458   - projectname_projectname = dir_stream.read(projectname_sizeof_projectname)
1459   - unused = projectname_projectname
1460   -
1461   - # PROJECTDOCSTRING Record
1462   - projectdocstring_id = struct.unpack("<H", dir_stream.read(2))[0]
1463   - check_value('PROJECTDOCSTRING_Id', 0x0005, projectdocstring_id)
1464   - projectdocstring_sizeof_docstring = struct.unpack("<L", dir_stream.read(4))[0]
1465   - if projectdocstring_sizeof_docstring > 2000:
1466   - log.error(
1467   - "PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(projectdocstring_sizeof_docstring))
1468   - projectdocstring_docstring = dir_stream.read(projectdocstring_sizeof_docstring)
1469   - projectdocstring_reserved = struct.unpack("<H", dir_stream.read(2))[0]
1470   - check_value('PROJECTDOCSTRING_Reserved', 0x0040, projectdocstring_reserved)
1471   - projectdocstring_sizeof_docstring_unicode = struct.unpack("<L", dir_stream.read(4))[0]
1472   - if projectdocstring_sizeof_docstring_unicode % 2 != 0:
1473   - log.error("PROJECTDOCSTRING_SizeOfDocStringUnicode is not even")
1474   - projectdocstring_docstring_unicode = dir_stream.read(projectdocstring_sizeof_docstring_unicode)
1475   - unused = projectdocstring_docstring
1476   - unused = projectdocstring_docstring_unicode
1477   -
1478   - # PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7
1479   - projecthelpfilepath_id = struct.unpack("<H", dir_stream.read(2))[0]
1480   - check_value('PROJECTHELPFILEPATH_Id', 0x0006, projecthelpfilepath_id)
1481   - projecthelpfilepath_sizeof_helpfile1 = struct.unpack("<L", dir_stream.read(4))[0]
1482   - if projecthelpfilepath_sizeof_helpfile1 > 260:
1483   - log.error(
1484   - "PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(projecthelpfilepath_sizeof_helpfile1))
1485   - projecthelpfilepath_helpfile1 = dir_stream.read(projecthelpfilepath_sizeof_helpfile1)
1486   - projecthelpfilepath_reserved = struct.unpack("<H", dir_stream.read(2))[0]
1487   - check_value('PROJECTHELPFILEPATH_Reserved', 0x003D, projecthelpfilepath_reserved)
1488   - projecthelpfilepath_sizeof_helpfile2 = struct.unpack("<L", dir_stream.read(4))[0]
1489   - if projecthelpfilepath_sizeof_helpfile2 != projecthelpfilepath_sizeof_helpfile1:
1490   - log.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 does not equal PROJECTHELPFILEPATH_SizeOfHelpFile2")
1491   - projecthelpfilepath_helpfile2 = dir_stream.read(projecthelpfilepath_sizeof_helpfile2)
1492   - if projecthelpfilepath_helpfile2 != projecthelpfilepath_helpfile1:
1493   - log.error("PROJECTHELPFILEPATH_HelpFile1 does not equal PROJECTHELPFILEPATH_HelpFile2")
1494   -
1495   - # PROJECTHELPCONTEXT Record
1496   - projecthelpcontext_id = struct.unpack("<H", dir_stream.read(2))[0]
1497   - check_value('PROJECTHELPCONTEXT_Id', 0x0007, projecthelpcontext_id)
1498   - projecthelpcontext_size = struct.unpack("<L", dir_stream.read(4))[0]
1499   - check_value('PROJECTHELPCONTEXT_Size', 0x0004, projecthelpcontext_size)
1500   - projecthelpcontext_helpcontext = struct.unpack("<L", dir_stream.read(4))[0]
1501   - unused = projecthelpcontext_helpcontext
1502   -
1503   - # PROJECTLIBFLAGS Record
1504   - projectlibflags_id = struct.unpack("<H", dir_stream.read(2))[0]
1505   - check_value('PROJECTLIBFLAGS_Id', 0x0008, projectlibflags_id)
1506   - projectlibflags_size = struct.unpack("<L", dir_stream.read(4))[0]
1507   - check_value('PROJECTLIBFLAGS_Size', 0x0004, projectlibflags_size)
1508   - projectlibflags_projectlibflags = struct.unpack("<L", dir_stream.read(4))[0]
1509   - check_value('PROJECTLIBFLAGS_ProjectLibFlags', 0x0000, projectlibflags_projectlibflags)
1510   -
1511   - # PROJECTVERSION Record
1512   - projectversion_id = struct.unpack("<H", dir_stream.read(2))[0]
1513   - check_value('PROJECTVERSION_Id', 0x0009, projectversion_id)
1514   - projectversion_reserved = struct.unpack("<L", dir_stream.read(4))[0]
1515   - check_value('PROJECTVERSION_Reserved', 0x0004, projectversion_reserved)
1516   - projectversion_versionmajor = struct.unpack("<L", dir_stream.read(4))[0]
1517   - projectversion_versionminor = struct.unpack("<H", dir_stream.read(2))[0]
1518   - unused = projectversion_versionmajor
1519   - unused = projectversion_versionminor
1520   -
1521   - # PROJECTCONSTANTS Record
1522   - projectconstants_id = struct.unpack("<H", dir_stream.read(2))[0]
1523   - check_value('PROJECTCONSTANTS_Id', 0x000C, projectconstants_id)
1524   - projectconstants_sizeof_constants = struct.unpack("<L", dir_stream.read(4))[0]
1525   - if projectconstants_sizeof_constants > 1015:
1526   - log.error(
1527   - "PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(projectconstants_sizeof_constants))
1528   - projectconstants_constants = dir_stream.read(projectconstants_sizeof_constants)
1529   - projectconstants_reserved = struct.unpack("<H", dir_stream.read(2))[0]
1530   - check_value('PROJECTCONSTANTS_Reserved', 0x003C, projectconstants_reserved)
1531   - projectconstants_sizeof_constants_unicode = struct.unpack("<L", dir_stream.read(4))[0]
1532   - if projectconstants_sizeof_constants_unicode % 2 != 0:
1533   - log.error("PROJECTCONSTANTS_SizeOfConstantsUnicode is not even")
1534   - projectconstants_constants_unicode = dir_stream.read(projectconstants_sizeof_constants_unicode)
1535   - unused = projectconstants_constants
1536   - unused = projectconstants_constants_unicode
1537   -
1538   - # array of REFERENCE records
1539   - check = None
1540   - while True:
1541   - check = struct.unpack("<H", dir_stream.read(2))[0]
1542   - log.debug("reference type = {0:04X}".format(check))
1543   - if check == 0x000F:
1544   - break
1545   -
1546   - if check == 0x0016:
1547   - # REFERENCENAME
1548   - reference_id = check
1549   - reference_sizeof_name = struct.unpack("<L", dir_stream.read(4))[0]
1550   - reference_name = dir_stream.read(reference_sizeof_name)
1551   - reference_reserved = struct.unpack("<H", dir_stream.read(2))[0]
1552   - # According to [MS-OVBA] 2.3.4.2.2.2 REFERENCENAME Record:
1553   - # "Reserved (2 bytes): MUST be 0x003E. MUST be ignored."
1554   - # So let's ignore it, otherwise it crashes on some files (issue #132)
1555   - # PR #135 by @c1fe:
1556   - # contrary to the specification I think that the unicode name
1557   - # is optional. if reference_reserved is not 0x003E I think it
1558   - # is actually the start of another REFERENCE record
1559   - # at least when projectsyskind_syskind == 0x02 (Macintosh)
1560   - if reference_reserved == 0x003E:
1561   - #if reference_reserved not in (0x003E, 0x000D):
1562   - # raise UnexpectedDataError(dir_path, 'REFERENCE_Reserved',
1563   - # 0x0003E, reference_reserved)
1564   - reference_sizeof_name_unicode = struct.unpack("<L", dir_stream.read(4))[0]
1565   - reference_name_unicode = dir_stream.read(reference_sizeof_name_unicode)
1566   - unused = reference_id
1567   - unused = reference_name
1568   - unused = reference_name_unicode
1569   - continue
1570   - else:
1571   - check = reference_reserved
1572   - log.debug("reference type = {0:04X}".format(check))
1573   -
1574   - if check == 0x0033:
1575   - # REFERENCEORIGINAL (followed by REFERENCECONTROL)
1576   - referenceoriginal_id = check
1577   - referenceoriginal_sizeof_libidoriginal = struct.unpack("<L", dir_stream.read(4))[0]
1578   - referenceoriginal_libidoriginal = dir_stream.read(referenceoriginal_sizeof_libidoriginal)
1579   - unused = referenceoriginal_id
1580   - unused = referenceoriginal_libidoriginal
1581   - continue
1582   -
1583   - if check == 0x002F:
1584   - # REFERENCECONTROL
1585   - referencecontrol_id = check
1586   - referencecontrol_sizetwiddled = struct.unpack("<L", dir_stream.read(4))[0] # ignore
1587   - referencecontrol_sizeof_libidtwiddled = struct.unpack("<L", dir_stream.read(4))[0]
1588   - referencecontrol_libidtwiddled = dir_stream.read(referencecontrol_sizeof_libidtwiddled)
1589   - referencecontrol_reserved1 = struct.unpack("<L", dir_stream.read(4))[0] # ignore
1590   - check_value('REFERENCECONTROL_Reserved1', 0x0000, referencecontrol_reserved1)
1591   - referencecontrol_reserved2 = struct.unpack("<H", dir_stream.read(2))[0] # ignore
1592   - check_value('REFERENCECONTROL_Reserved2', 0x0000, referencecontrol_reserved2)
1593   - unused = referencecontrol_id
1594   - unused = referencecontrol_sizetwiddled
1595   - unused = referencecontrol_libidtwiddled
1596   - # optional field
1597   - check2 = struct.unpack("<H", dir_stream.read(2))[0]
1598   - if check2 == 0x0016:
1599   - referencecontrol_namerecordextended_id = check
1600   - referencecontrol_namerecordextended_sizeof_name = struct.unpack("<L", dir_stream.read(4))[0]
1601   - referencecontrol_namerecordextended_name = dir_stream.read(
1602   - referencecontrol_namerecordextended_sizeof_name)
1603   - referencecontrol_namerecordextended_reserved = struct.unpack("<H", dir_stream.read(2))[0]
1604   - if referencecontrol_namerecordextended_reserved == 0x003E:
1605   - referencecontrol_namerecordextended_sizeof_name_unicode = struct.unpack("<L", dir_stream.read(4))[0]
1606   - referencecontrol_namerecordextended_name_unicode = dir_stream.read(
1607   - referencecontrol_namerecordextended_sizeof_name_unicode)
1608   - referencecontrol_reserved3 = struct.unpack("<H", dir_stream.read(2))[0]
1609   - unused = referencecontrol_namerecordextended_id
1610   - unused = referencecontrol_namerecordextended_name
1611   - unused = referencecontrol_namerecordextended_name_unicode
1612   - else:
1613   - referencecontrol_reserved3 = referencecontrol_namerecordextended_reserved
1614   - else:
1615   - referencecontrol_reserved3 = check2
1616   -
1617   - check_value('REFERENCECONTROL_Reserved3', 0x0030, referencecontrol_reserved3)
1618   - referencecontrol_sizeextended = struct.unpack("<L", dir_stream.read(4))[0]
1619   - referencecontrol_sizeof_libidextended = struct.unpack("<L", dir_stream.read(4))[0]
1620   - referencecontrol_libidextended = dir_stream.read(referencecontrol_sizeof_libidextended)
1621   - referencecontrol_reserved4 = struct.unpack("<L", dir_stream.read(4))[0]
1622   - referencecontrol_reserved5 = struct.unpack("<H", dir_stream.read(2))[0]
1623   - referencecontrol_originaltypelib = dir_stream.read(16)
1624   - referencecontrol_cookie = struct.unpack("<L", dir_stream.read(4))[0]
1625   - unused = referencecontrol_sizeextended
1626   - unused = referencecontrol_libidextended
1627   - unused = referencecontrol_reserved4
1628   - unused = referencecontrol_reserved5
1629   - unused = referencecontrol_originaltypelib
1630   - unused = referencecontrol_cookie
1631   - continue
1632   -
1633   - if check == 0x000D:
1634   - # REFERENCEREGISTERED
1635   - referenceregistered_id = check
1636   - referenceregistered_size = struct.unpack("<L", dir_stream.read(4))[0]
1637   - referenceregistered_sizeof_libid = struct.unpack("<L", dir_stream.read(4))[0]
1638   - referenceregistered_libid = dir_stream.read(referenceregistered_sizeof_libid)
1639   - referenceregistered_reserved1 = struct.unpack("<L", dir_stream.read(4))[0]
1640   - check_value('REFERENCEREGISTERED_Reserved1', 0x0000, referenceregistered_reserved1)
1641   - referenceregistered_reserved2 = struct.unpack("<H", dir_stream.read(2))[0]
1642   - check_value('REFERENCEREGISTERED_Reserved2', 0x0000, referenceregistered_reserved2)
1643   - unused = referenceregistered_id
1644   - unused = referenceregistered_size
1645   - unused = referenceregistered_libid
1646   - continue
1647   -
1648   - if check == 0x000E:
1649   - # REFERENCEPROJECT
1650   - referenceproject_id = check
1651   - referenceproject_size = struct.unpack("<L", dir_stream.read(4))[0]
1652   - referenceproject_sizeof_libidabsolute = struct.unpack("<L", dir_stream.read(4))[0]
1653   - referenceproject_libidabsolute = dir_stream.read(referenceproject_sizeof_libidabsolute)
1654   - referenceproject_sizeof_libidrelative = struct.unpack("<L", dir_stream.read(4))[0]
1655   - referenceproject_libidrelative = dir_stream.read(referenceproject_sizeof_libidrelative)
1656   - referenceproject_majorversion = struct.unpack("<L", dir_stream.read(4))[0]
1657   - referenceproject_minorversion = struct.unpack("<H", dir_stream.read(2))[0]
1658   - unused = referenceproject_id
1659   - unused = referenceproject_size
1660   - unused = referenceproject_libidabsolute
1661   - unused = referenceproject_libidrelative
1662   - unused = referenceproject_majorversion
1663   - unused = referenceproject_minorversion
1664   - continue
1665   -
1666   - log.error('invalid or unknown check Id {0:04X}'.format(check))
1667   - # raise an exception instead of stopping abruptly (issue #180)
1668   - raise UnexpectedDataError(dir_path, 'reference type', (0x0F, 0x16, 0x33, 0x2F, 0x0D, 0x0E), check)
1669   - #sys.exit(0)
1670   -
1671   - projectmodules_id = check #struct.unpack("<H", dir_stream.read(2))[0]
1672   - check_value('PROJECTMODULES_Id', 0x000F, projectmodules_id)
1673   - projectmodules_size = struct.unpack("<L", dir_stream.read(4))[0]
1674   - check_value('PROJECTMODULES_Size', 0x0002, projectmodules_size)
1675   - projectmodules_count = struct.unpack("<H", dir_stream.read(2))[0]
1676   - projectmodules_projectcookierecord_id = struct.unpack("<H", dir_stream.read(2))[0]
1677   - check_value('PROJECTMODULES_ProjectCookieRecord_Id', 0x0013, projectmodules_projectcookierecord_id)
1678   - projectmodules_projectcookierecord_size = struct.unpack("<L", dir_stream.read(4))[0]
1679   - check_value('PROJECTMODULES_ProjectCookieRecord_Size', 0x0002, projectmodules_projectcookierecord_size)
1680   - projectmodules_projectcookierecord_cookie = struct.unpack("<H", dir_stream.read(2))[0]
1681   - unused = projectmodules_projectcookierecord_cookie
1682   -
1683   - # short function to simplify unicode text output
1684   - uni_out = lambda unicode_text: unicode_text.encode('utf-8', 'replace')
1685   -
1686   - log.debug("parsing {0} modules".format(projectmodules_count))
1687   - for projectmodule_index in xrange(0, projectmodules_count):
1688   - try:
1689   - modulename_id = struct.unpack("<H", dir_stream.read(2))[0]
1690   - check_value('MODULENAME_Id', 0x0019, modulename_id)
1691   - modulename_sizeof_modulename = struct.unpack("<L", dir_stream.read(4))[0]
1692   - modulename_modulename = dir_stream.read(modulename_sizeof_modulename)
1693   - # TODO: preset variables to avoid "referenced before assignment" errors
1694   - modulename_unicode_modulename_unicode = ''
1695   - # account for optional sections
1696   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
1697   - if section_id == 0x0047:
1698   - modulename_unicode_id = section_id
1699   - modulename_unicode_sizeof_modulename_unicode = struct.unpack("<L", dir_stream.read(4))[0]
1700   - modulename_unicode_modulename_unicode = dir_stream.read(
1701   - modulename_unicode_sizeof_modulename_unicode).decode('UTF-16LE', 'replace')
1702   - # just guessing that this is the same encoding as used in OleFileIO
1703   - unused = modulename_unicode_id
1704   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
1705   - if section_id == 0x001A:
1706   - modulestreamname_id = section_id
1707   - modulestreamname_sizeof_streamname = struct.unpack("<L", dir_stream.read(4))[0]
1708   - modulestreamname_streamname = dir_stream.read(modulestreamname_sizeof_streamname)
1709   - modulestreamname_reserved = struct.unpack("<H", dir_stream.read(2))[0]
1710   - check_value('MODULESTREAMNAME_Reserved', 0x0032, modulestreamname_reserved)
1711   - modulestreamname_sizeof_streamname_unicode = struct.unpack("<L", dir_stream.read(4))[0]
1712   - modulestreamname_streamname_unicode = dir_stream.read(
1713   - modulestreamname_sizeof_streamname_unicode).decode('UTF-16LE', 'replace')
1714   - # just guessing that this is the same encoding as used in OleFileIO
1715   - unused = modulestreamname_id
1716   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
1717   - if section_id == 0x001C:
1718   - moduledocstring_id = section_id
1719   - check_value('MODULEDOCSTRING_Id', 0x001C, moduledocstring_id)
1720   - moduledocstring_sizeof_docstring = struct.unpack("<L", dir_stream.read(4))[0]
1721   - moduledocstring_docstring = dir_stream.read(moduledocstring_sizeof_docstring)
1722   - moduledocstring_reserved = struct.unpack("<H", dir_stream.read(2))[0]
1723   - check_value('MODULEDOCSTRING_Reserved', 0x0048, moduledocstring_reserved)
1724   - moduledocstring_sizeof_docstring_unicode = struct.unpack("<L", dir_stream.read(4))[0]
1725   - moduledocstring_docstring_unicode = dir_stream.read(moduledocstring_sizeof_docstring_unicode)
1726   - unused = moduledocstring_docstring
1727   - unused = moduledocstring_docstring_unicode
1728   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
1729   - if section_id == 0x0031:
1730   - moduleoffset_id = section_id
1731   - check_value('MODULEOFFSET_Id', 0x0031, moduleoffset_id)
1732   - moduleoffset_size = struct.unpack("<L", dir_stream.read(4))[0]
1733   - check_value('MODULEOFFSET_Size', 0x0004, moduleoffset_size)
1734   - moduleoffset_textoffset = struct.unpack("<L", dir_stream.read(4))[0]
1735   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
1736   - if section_id == 0x001E:
1737   - modulehelpcontext_id = section_id
1738   - check_value('MODULEHELPCONTEXT_Id', 0x001E, modulehelpcontext_id)
1739   - modulehelpcontext_size = struct.unpack("<L", dir_stream.read(4))[0]
1740   - check_value('MODULEHELPCONTEXT_Size', 0x0004, modulehelpcontext_size)
1741   - modulehelpcontext_helpcontext = struct.unpack("<L", dir_stream.read(4))[0]
1742   - unused = modulehelpcontext_helpcontext
1743   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
1744   - if section_id == 0x002C:
1745   - modulecookie_id = section_id
1746   - check_value('MODULECOOKIE_Id', 0x002C, modulecookie_id)
1747   - modulecookie_size = struct.unpack("<L", dir_stream.read(4))[0]
1748   - check_value('MODULECOOKIE_Size', 0x0002, modulecookie_size)
1749   - modulecookie_cookie = struct.unpack("<H", dir_stream.read(2))[0]
1750   - unused = modulecookie_cookie
1751   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
1752   - if section_id == 0x0021 or section_id == 0x0022:
1753   - moduletype_id = section_id
1754   - moduletype_reserved = struct.unpack("<L", dir_stream.read(4))[0]
1755   - unused = moduletype_id
1756   - unused = moduletype_reserved
1757   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
1758   - if section_id == 0x0025:
1759   - modulereadonly_id = section_id
1760   - check_value('MODULEREADONLY_Id', 0x0025, modulereadonly_id)
1761   - modulereadonly_reserved = struct.unpack("<L", dir_stream.read(4))[0]
1762   - check_value('MODULEREADONLY_Reserved', 0x0000, modulereadonly_reserved)
1763   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
1764   - if section_id == 0x0028:
1765   - moduleprivate_id = section_id
1766   - check_value('MODULEPRIVATE_Id', 0x0028, moduleprivate_id)
1767   - moduleprivate_reserved = struct.unpack("<L", dir_stream.read(4))[0]
1768   - check_value('MODULEPRIVATE_Reserved', 0x0000, moduleprivate_reserved)
1769   - section_id = struct.unpack("<H", dir_stream.read(2))[0]
1770   - if section_id == 0x002B: # TERMINATOR
1771   - module_reserved = struct.unpack("<L", dir_stream.read(4))[0]
1772   - check_value('MODULE_Reserved', 0x0000, module_reserved)
1773   - section_id = None
1774   - if section_id != None:
1775   - log.warning('unknown or invalid module section id {0:04X}'.format(section_id))
1776   -
1777   - log.debug('Project CodePage = %d' % projectcodepage_codepage)
1778   - if projectcodepage_codepage in MAC_CODEPAGES:
1779   - vba_codec = MAC_CODEPAGES[projectcodepage_codepage]
1780   - else:
1781   - vba_codec = 'cp%d' % projectcodepage_codepage
1782   - log.debug("ModuleName = {0}".format(modulename_modulename))
1783   - log.debug("ModuleNameUnicode = {0}".format(uni_out(modulename_unicode_modulename_unicode)))
1784   - log.debug("StreamName = {0}".format(modulestreamname_streamname))
1785   - try:
1786   - streamname_unicode = modulestreamname_streamname.decode(vba_codec)
1787   - except UnicodeError as ue:
1788   - log.debug('failed to decode stream name {0!r} with codec {1}'
1789   - .format(uni_out(streamname_unicode), vba_codec))
1790   - streamname_unicode = modulestreamname_streamname.decode(vba_codec, errors='replace')
1791   - log.debug("StreamName.decode('%s') = %s" % (vba_codec, uni_out(streamname_unicode)))
1792   - log.debug("StreamNameUnicode = {0}".format(uni_out(modulestreamname_streamname_unicode)))
1793   - log.debug("TextOffset = {0}".format(moduleoffset_textoffset))
1794   -
1795   - code_data = None
1796   - try_names = streamname_unicode, \
1797   - modulename_unicode_modulename_unicode, \
1798   - modulestreamname_streamname_unicode
1799   - for stream_name in try_names:
1800   - # TODO: if olefile._find were less private, could replace this
1801   - # try-except with calls to it
1802   - try:
1803   - code_path = vba_root + u'VBA/' + stream_name
1804   - log.debug('opening VBA code stream %s' % uni_out(code_path))
1805   - code_data = ole.openstream(code_path).read()
1806   - break
1807   - except IOError as ioe:
1808   - log.debug('failed to open stream VBA/%r (%r), try other name'
1809   - % (uni_out(stream_name), ioe))
1810   -
1811   - if code_data is None:
1812   - log.info("Could not open stream %d of %d ('VBA/' + one of %r)!"
1813   - % (projectmodule_index, projectmodules_count,
1814   - '/'.join("'" + uni_out(stream_name) + "'"
1815   - for stream_name in try_names)))
1816   - if relaxed:
1817   - continue # ... with next submodule
1818   - else:
1819   - raise SubstreamOpenError('[BASE]', 'VBA/' +
1820   - uni_out(modulename_unicode_modulename_unicode))
1821   -
1822   - log.debug("length of code_data = {0}".format(len(code_data)))
1823   - log.debug("offset of code_data = {0}".format(moduleoffset_textoffset))
1824   - code_data = code_data[moduleoffset_textoffset:]
1825   - if len(code_data) > 0:
1826   - code_data = decompress_stream(bytearray(code_data))
1827   - # case-insensitive search in the code_modules dict to find the file extension:
1828   - filext = code_modules.get(modulename_modulename.lower(), 'bin')
1829   - filename = '{0}.{1}'.format(modulename_modulename, filext)
1830   - #TODO: also yield the codepage so that callers can decode it properly
1831   - yield (code_path, filename, code_data)
1832   - # print '-'*79
1833   - # print filename
1834   - # print ''
1835   - # print code_data
1836   - # print ''
1837   - log.debug('extracted file {0}'.format(filename))
1838   - else:
1839   - log.warning("module stream {0} has code data length 0".format(modulestreamname_streamname))
1840   - except (UnexpectedDataError, SubstreamOpenError):
1841   - raise
1842   - except Exception as exc:
1843   - log.info('Error parsing module {0} of {1} in _extract_vba:'
1844   - .format(projectmodule_index, projectmodules_count),
1845   - exc_info=True)
1846   - if not relaxed:
1847   - raise
1848   - _ = unused # make pylint happy: now variable "unused" is being used ;-)
1849   - return
  1925 + for code_path, filename, code_data in project.parse_modules():
  1926 + yield (code_path, filename, code_data)
1850 1927  
1851 1928  
1852 1929 def vba_collapse_long_lines(vba_code):
... ...