Commit 2a69ce37647c0f57a23f39cecf549f68bb02af85

Authored by Christian Herdtweck
1 parent fbfd20e7

check for mac codepages when decoding project codepage

based on suggestion in issue #42 in bitbucket, list of codepages on
stackoverflow (see comment in code) and contents of
/usr/lib/python2.7/encodings
Showing 1 changed file with 29 additions and 3 deletions
oletools/olevba.py
... ... @@ -380,6 +380,22 @@ RETURN_PARSE_ERROR = 6
380 380 RETURN_SEVERAL_ERRS = 7
381 381 RETURN_UNEXPECTED = 8
382 382  
# Mac code pages mapped to Python codec names. These identifiers cannot be
# turned into a codec name with the generic 'cp%d' pattern, so they are looked
# up here instead.
# List of code pages taken from
# http://stackoverflow.com/questions/1592925/decoding-mac-os-text-in-python
# Where Python ships no Mac-specific codec, the closest available codec is used
# (the missing Mac-specific name is given in the trailing comment).
MAC_CODEPAGES = {
    10000: 'mac-roman',
    10001: 'shiftjis',          # not found: 'mac-shift-jis'
    10002: 'big5',              # not found: 'mac-big5'
    # NOTE(review): no 'mac-hangul' codec exists; 'euc_kr' might be a closer
    # approximation than plain ASCII -- confirm before changing.
    10003: 'ascii',             # nothing appropriate found: 'mac-hangul'
    10004: 'mac-arabic',
    10005: 'hebrew',            # not found: 'mac-hebrew'
    10006: 'mac-greek',
    10007: 'mac-cyrillic',      # code page 10007 is Mac Cyrillic ("mac-russian")
    10008: 'gb2312',            # not found: 'mac-gb2312'
    10021: 'thai',              # not found: 'mac-thai'
    10029: 'maccentraleurope',  # not found: 'mac-east europe'
    10081: 'mac-turkish',
}
  398 +
383 399 # URL and message to report issues:
384 400 URL_OLEVBA_ISSUES = 'https://github.com/decalage2/oletools/issues'
385 401 MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES
... ... @@ -1347,7 +1363,9 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
# Read the length-prefixed reference name, followed by a 2-byte reserved field.
reference_sizeof_name = struct.unpack("<L", dir_stream.read(4))[0]
reference_name = dir_stream.read(reference_sizeof_name)
reference_reserved = struct.unpack("<H", dir_stream.read(2))[0]
# Both 0x003E and 0x000D are accepted for the reserved field; any other
# value is reported as unexpected data, passing the value actually read.
if reference_reserved not in (0x003E, 0x000D):
    raise UnexpectedDataError(dir_path, 'REFERENCE_Reserved',
                              (0x003E, 0x000D), reference_reserved)
# Read the length-prefixed unicode variant of the reference name.
reference_sizeof_name_unicode = struct.unpack("<L", dir_stream.read(4))[0]
reference_name_unicode = dir_stream.read(reference_sizeof_name_unicode)
unused = reference_id
... ... @@ -1556,11 +1574,19 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1556 1574 log.warning('unknown or invalid module section id {0:04X}'.format(section_id))
1557 1575  
log.debug('Project CodePage = %d' % projectcodepage_codepage)
# Mac code pages need an explicit codec name; every other code page is
# addressed through Python's generic 'cpNNNN' codec naming.
if projectcodepage_codepage in MAC_CODEPAGES:
    vba_codec = MAC_CODEPAGES[projectcodepage_codepage]
else:
    vba_codec = 'cp%d' % projectcodepage_codepage
log.debug("ModuleName = {0}".format(modulename_modulename))
log.debug("ModuleNameUnicode = {0}".format(uni_out(modulename_unicode_modulename_unicode)))
log.debug("StreamName = {0}".format(modulestreamname_streamname))
# Decode strictly first; if the bytes are invalid for the codec, fall back to
# a lossy decode so that processing can continue.
# NOTE(review): a code page unknown to Python makes decode() raise
# LookupError, which is not a UnicodeError and is not handled here.
try:
    streamname_unicode = modulestreamname_streamname.decode(vba_codec)
except UnicodeError:
    # Log the raw stream name: the decoded variable was never assigned
    # because the decode above is what failed.
    log.debug('failed to decode stream name {0!r} with codec {1}'
              .format(modulestreamname_streamname, vba_codec))
    streamname_unicode = modulestreamname_streamname.decode(vba_codec, errors='replace')
log.debug("StreamName.decode('%s') = %s" % (vba_codec, uni_out(streamname_unicode)))
log.debug("StreamNameUnicode = {0}".format(uni_out(modulestreamname_streamname_unicode)))
log.debug("TextOffset = {0}".format(moduleoffset_textoffset))
... ...