Commit 2a69ce37647c0f57a23f39cecf549f68bb02af85

Authored by Christian Herdtweck
1 parent fbfd20e7

check for mac codepages when decoding project codepage

based on the suggestion in issue #42 on Bitbucket, the list of codepages on
Stack Overflow (see comment in code) and the contents of
/usr/lib/python2.7/encodings
Showing 1 changed file with 29 additions and 3 deletions
oletools/olevba.py
@@ -380,6 +380,22 @@ RETURN_PARSE_ERROR = 6 @@ -380,6 +380,22 @@ RETURN_PARSE_ERROR = 6
380 RETURN_SEVERAL_ERRS = 7 380 RETURN_SEVERAL_ERRS = 7
381 RETURN_UNEXPECTED = 8 381 RETURN_UNEXPECTED = 8
382 382
  383 +# MAC codepages (from http://stackoverflow.com/questions/1592925/decoding-mac-os-text-in-python)
  384 +MAC_CODEPAGES = {
  385 + 10000: 'mac-roman',
  386 + 10001: 'shiftjis', # not found: 'mac-shift-jis',
  387 + 10003: 'ascii', # nothing appropriate found: 'mac-hangul',
  388 + 10008: 'gb2321', # not found: 'mac-gb2312',
  389 + 10002: 'big5', # not found: 'mac-big5',
  390 + 10005: 'hebrew', # not found: 'mac-hebrew',
  391 + 10004: 'mac-arabic',
  392 + 10006: 'mac-greek',
  393 + 10081: 'mac-turkish',
  394 + 10021: 'thai', # not found: mac-thai',
  395 + 10029: 'maccentraleurope', # not found: 'mac-east europe',
  396 + 10007: 'ascii', # nothing appropriate found: 'mac-russian',
  397 +}
  398 +
383 # URL and message to report issues: 399 # URL and message to report issues:
384 URL_OLEVBA_ISSUES = 'https://github.com/decalage2/oletools/issues' 400 URL_OLEVBA_ISSUES = 'https://github.com/decalage2/oletools/issues'
385 MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES 401 MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES
@@ -1347,7 +1363,9 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): @@ -1347,7 +1363,9 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1347 reference_sizeof_name = struct.unpack("<L", dir_stream.read(4))[0] 1363 reference_sizeof_name = struct.unpack("<L", dir_stream.read(4))[0]
1348 reference_name = dir_stream.read(reference_sizeof_name) 1364 reference_name = dir_stream.read(reference_sizeof_name)
1349 reference_reserved = struct.unpack("<H", dir_stream.read(2))[0] 1365 reference_reserved = struct.unpack("<H", dir_stream.read(2))[0]
1350 - check_value('REFERENCE_Reserved', 0x003E, reference_reserved) 1366 + if reference_reserved not in (0x003E, 0x000D):
  1367 + raise UnexpectedDataError(dir_path, 'REFERENCE_Reserved',
  1368 + (0x003E, 0x000D), value)
1351 reference_sizeof_name_unicode = struct.unpack("<L", dir_stream.read(4))[0] 1369 reference_sizeof_name_unicode = struct.unpack("<L", dir_stream.read(4))[0]
1352 reference_name_unicode = dir_stream.read(reference_sizeof_name_unicode) 1370 reference_name_unicode = dir_stream.read(reference_sizeof_name_unicode)
1353 unused = reference_id 1371 unused = reference_id
@@ -1556,11 +1574,19 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): @@ -1556,11 +1574,19 @@ def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False):
1556 log.warning('unknown or invalid module section id {0:04X}'.format(section_id)) 1574 log.warning('unknown or invalid module section id {0:04X}'.format(section_id))
1557 1575
1558 log.debug('Project CodePage = %d' % projectcodepage_codepage) 1576 log.debug('Project CodePage = %d' % projectcodepage_codepage)
1559 - vba_codec = 'cp%d' % projectcodepage_codepage 1577 + if projectcodepage_codepage in MAC_CODEPAGES:
  1578 + vba_codec = MAC_CODEPAGES[projectcodepage_codepage]
  1579 + else:
  1580 + vba_codec = 'cp%d' % projectcodepage_codepage
1560 log.debug("ModuleName = {0}".format(modulename_modulename)) 1581 log.debug("ModuleName = {0}".format(modulename_modulename))
1561 log.debug("ModuleNameUnicode = {0}".format(uni_out(modulename_unicode_modulename_unicode))) 1582 log.debug("ModuleNameUnicode = {0}".format(uni_out(modulename_unicode_modulename_unicode)))
1562 log.debug("StreamName = {0}".format(modulestreamname_streamname)) 1583 log.debug("StreamName = {0}".format(modulestreamname_streamname))
1563 - streamname_unicode = modulestreamname_streamname.decode(vba_codec) 1584 + try:
  1585 + streamname_unicode = modulestreamname_streamname.decode(vba_codec)
  1586 + except UnicodeError as ue:
  1587 + log.debug('failed to decode stream name {0!r} with codec {1}'
  1588 + .format(uni_out(streamname_unicode), vba_codec))
  1589 + streamname_unicode = modulestreamname_streamname.decode(vba_codec, errors='replace')
1564 log.debug("StreamName.decode('%s') = %s" % (vba_codec, uni_out(streamname_unicode))) 1590 log.debug("StreamName.decode('%s') = %s" % (vba_codec, uni_out(streamname_unicode)))
1565 log.debug("StreamNameUnicode = {0}".format(uni_out(modulestreamname_streamname_unicode))) 1591 log.debug("StreamNameUnicode = {0}".format(uni_out(modulestreamname_streamname_unicode)))
1566 log.debug("TextOffset = {0}".format(moduleoffset_textoffset)) 1592 log.debug("TextOffset = {0}".format(moduleoffset_textoffset))