# Provenance: oletools/common/codepages.py as added by commit
# fdc77bfa57cd0105ba5f1952a31cd25fa73511fe ("common: added new module codepages",
# decalage2, Fri, 14 Dec 2018 13:59:06 +0100).
"""
codepages.py

codepages is a python module to map code pages (numbers) to Python codecs,
in order to decode bytes to unicode.

Author: Philippe Lagadec - http://www.decalage.info
License: BSD, see source code or documentation

codepages is part of the python-oletools package:
http://www.decalage.info/python/oletools
"""

# === LICENSE ==================================================================

# codepages is copyright (c) 2018 Philippe Lagadec (http://www.decalage.info)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
#  * Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


# -----------------------------------------------------------------------------
# CHANGELOG:
# 2018-12-13 v0.54 PL: - first version

__version__ = '0.54dev6'

# -----------------------------------------------------------------------------
# TODO:

# -----------------------------------------------------------------------------
# REFERENCES:
# - https://docs.microsoft.com/en-gb/windows/desktop/Intl/code-page-identifiers


# --- IMPORTS -----------------------------------------------------------------

import codecs

# === CONSTANTS ===============================================================

# Code page names from https://docs.microsoft.com/en-gb/windows/desktop/Intl/code-page-identifiers
# Retrieved on the 2018-12-13
# How it was converted to Python:
# 1) copy the table data (3 columns) from browser into Excel
# 2) use the following formula to concatenate 1st and 3rd columns: =A1 & ": " & "'" & C1 & "',"
# 3) copy from Excel into Python

CODEPAGE_NAME = {
    37: 'IBM EBCDIC US-Canada',
    437: 'OEM United States',
    500: 'IBM EBCDIC International',
    708: 'Arabic (ASMO 708)',
    709: 'Arabic (ASMO-449+, BCON V4)',
    710: 'Arabic - Transparent Arabic',
    720: 'Arabic (Transparent ASMO); Arabic (DOS)',
    737: 'OEM Greek (formerly 437G); Greek (DOS)',
    775: 'OEM Baltic; Baltic (DOS)',
    850: 'OEM Multilingual Latin 1; Western European (DOS)',
    852: 'OEM Latin 2; Central European (DOS)',
    855: 'OEM Cyrillic (primarily Russian)',
    857: 'OEM Turkish; Turkish (DOS)',
    858: 'OEM Multilingual Latin 1 + Euro symbol',
    860: 'OEM Portuguese; Portuguese (DOS)',
    861: 'OEM Icelandic; Icelandic (DOS)',
    862: 'OEM Hebrew; Hebrew (DOS)',
    863: 'OEM French Canadian; French Canadian (DOS)',
    864: 'OEM Arabic; Arabic (864)',
    865: 'OEM Nordic; Nordic (DOS)',
    866: 'OEM Russian; Cyrillic (DOS)',
    869: 'OEM Modern Greek; Greek, Modern (DOS)',
    870: 'IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2',
    874: 'ANSI/OEM Thai (ISO 8859-11); Thai (Windows)',
    875: 'IBM EBCDIC Greek Modern',
    932: 'ANSI/OEM Japanese; Japanese (Shift-JIS)',
    936: 'ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)',
    949: 'ANSI/OEM Korean (Unified Hangul Code)',
    950: 'ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)',
    1026: 'IBM EBCDIC Turkish (Latin 5)',
    1047: 'IBM EBCDIC Latin 1/Open System',
    1140: 'IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)',
    1141: 'IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)',
    1142: 'IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)',
    1143: 'IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)',
    1144: 'IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)',
    1145: 'IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)',
    1146: 'IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)',
    1147: 'IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)',
    1148: 'IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)',
    1149: 'IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)',
    1200: 'Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications',
    1201: 'Unicode UTF-16, big endian byte order; available only to managed applications',
    1250: 'ANSI Central European; Central European (Windows)',
    1251: 'ANSI Cyrillic; Cyrillic (Windows)',
    1252: 'ANSI Latin 1; Western European (Windows)',
    1253: 'ANSI Greek; Greek (Windows)',
    1254: 'ANSI Turkish; Turkish (Windows)',
    1255: 'ANSI Hebrew; Hebrew (Windows)',
    1256: 'ANSI Arabic; Arabic (Windows)',
    1257: 'ANSI Baltic; Baltic (Windows)',
    1258: 'ANSI/OEM Vietnamese; Vietnamese (Windows)',
    1361: 'Korean (Johab)',
    10000: 'MAC Roman; Western European (Mac)',
    10001: 'Japanese (Mac)',
    10002: 'MAC Traditional Chinese (Big5); Chinese Traditional (Mac)',
    10003: 'Korean (Mac)',
    10004: 'Arabic (Mac)',
    10005: 'Hebrew (Mac)',
    10006: 'Greek (Mac)',
    10007: 'Cyrillic (Mac)',
    10008: 'MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)',
    10010: 'Romanian (Mac)',
    10017: 'Ukrainian (Mac)',
    10021: 'Thai (Mac)',
    10029: 'MAC Latin 2; Central European (Mac)',
    10079: 'Icelandic (Mac)',
    10081: 'Turkish (Mac)',
    10082: 'Croatian (Mac)',
    12000: 'Unicode UTF-32, little endian byte order; available only to managed applications',
    12001: 'Unicode UTF-32, big endian byte order; available only to managed applications',
    20000: 'CNS Taiwan; Chinese Traditional (CNS)',
    20001: 'TCA Taiwan',
    20002: 'Eten Taiwan; Chinese Traditional (Eten)',
    20003: 'IBM5550 Taiwan',
    20004: 'TeleText Taiwan',
    20005: 'Wang Taiwan',
    20105: 'IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)',
    20106: 'IA5 German (7-bit)',
    20107: 'IA5 Swedish (7-bit)',
    20108: 'IA5 Norwegian (7-bit)',
    20127: 'US-ASCII (7-bit)',
    20261: 'T.61',
    20269: 'ISO 6937 Non-Spacing Accent',
    20273: 'IBM EBCDIC Germany',
    20277: 'IBM EBCDIC Denmark-Norway',
    20278: 'IBM EBCDIC Finland-Sweden',
    20280: 'IBM EBCDIC Italy',
    20284: 'IBM EBCDIC Latin America-Spain',
    20285: 'IBM EBCDIC United Kingdom',
    20290: 'IBM EBCDIC Japanese Katakana Extended',
    20297: 'IBM EBCDIC France',
    20420: 'IBM EBCDIC Arabic',
    20423: 'IBM EBCDIC Greek',
    20424: 'IBM EBCDIC Hebrew',
    20833: 'IBM EBCDIC Korean Extended',
    20838: 'IBM EBCDIC Thai',
    20866: 'Russian (KOI8-R); Cyrillic (KOI8-R)',
    20871: 'IBM EBCDIC Icelandic',
    20880: 'IBM EBCDIC Cyrillic Russian',
    20905: 'IBM EBCDIC Turkish',
    20924: 'IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)',
    20932: 'Japanese (JIS 0208-1990 and 0212-1990)',
    20936: 'Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)',
    20949: 'Korean Wansung',
    21025: 'IBM EBCDIC Cyrillic Serbian-Bulgarian',
    21027: '(deprecated)',
    21866: 'Ukrainian (KOI8-U); Cyrillic (KOI8-U)',
    28591: 'ISO 8859-1 Latin 1; Western European (ISO)',
    28592: 'ISO 8859-2 Central European; Central European (ISO)',
    28593: 'ISO 8859-3 Latin 3',
    28594: 'ISO 8859-4 Baltic',
    28595: 'ISO 8859-5 Cyrillic',
    28596: 'ISO 8859-6 Arabic',
    28597: 'ISO 8859-7 Greek',
    28598: 'ISO 8859-8 Hebrew; Hebrew (ISO-Visual)',
    28599: 'ISO 8859-9 Turkish',
    28603: 'ISO 8859-13 Estonian',
    28605: 'ISO 8859-15 Latin 9',
    29001: 'Europa 3',
    38598: 'ISO 8859-8 Hebrew; Hebrew (ISO-Logical)',
    50220: 'ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)',
    50221: 'ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)',
    50222: 'ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)',
    50225: 'ISO 2022 Korean',
    50227: 'ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)',
    50229: 'ISO 2022 Traditional Chinese',
    50930: 'EBCDIC Japanese (Katakana) Extended',
    50931: 'EBCDIC US-Canada and Japanese',
    50933: 'EBCDIC Korean Extended and Korean',
    50935: 'EBCDIC Simplified Chinese Extended and Simplified Chinese',
    50936: 'EBCDIC Simplified Chinese',
    50937: 'EBCDIC US-Canada and Traditional Chinese',
    50939: 'EBCDIC Japanese (Latin) Extended and Japanese',
    51932: 'EUC Japanese',
    51936: 'EUC Simplified Chinese; Chinese Simplified (EUC)',
    51949: 'EUC Korean',
    51950: 'EUC Traditional Chinese',
    52936: 'HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)',
    54936: 'Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)',
    57002: 'ISCII Devanagari',
    57003: 'ISCII Bangla',
    57004: 'ISCII Tamil',
    57005: 'ISCII Telugu',
    57006: 'ISCII Assamese',
    57007: 'ISCII Odia',
    57008: 'ISCII Kannada',
    57009: 'ISCII Malayalam',
    57010: 'ISCII Gujarati',
    57011: 'ISCII Punjabi',
    65000: 'Unicode (UTF-7)',
    65001: 'Unicode (UTF-8)',
}


# Mapping from codepages to Python codecs, when 'cpXXX' does not work
# (inspired from http://stackoverflow.com/questions/1592925/decoding-mac-os-text-in-python)
#
# Every value is still validated with codecs.lookup() by codepage2codec(), so
# an entry naming a codec that the running interpreter lacks degrades to the
# 'utf8' fallback instead of raising.
#
# NOTE(review): 870 and 1047 are EBCDIC code pages (see CODEPAGE_NAME), while
# 'latin2' / 'latin1' are ASCII-based ISO 8859 codecs, so these two entries are
# placeholders rather than real equivalents - confirm whether an EBCDIC codec
# such as cp500 or cp037 would be a closer approximation.
CODEPAGE_TO_CODEC = {
    37: 'cp037',
    708: 'arabic',  # not found: Arabic (ASMO 708) => arabic = iso-8859-6
    709: 'arabic',  # not found: Arabic (ASMO-449+, BCON V4) => arabic = iso-8859-6
    710: 'arabic',  # not found: Arabic - Transparent Arabic => arabic = iso-8859-6
    870: 'latin2',  # IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
    1047: 'latin1',  # IBM EBCDIC Latin 1/Open System
    1141: 'cp273',  # IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
    1200: 'utf_16_le',  # Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications
    1201: 'utf_16_be',  # Unicode UTF-16, big endian byte order; available only to managed applications

    10000: 'mac-roman',
    10001: 'shiftjis',  # not found: 'mac-shift-jis',
    10002: 'big5',  # not found: 'mac-big5',
    10003: 'ascii',  # nothing appropriate found: 'mac-hangul',
    10004: 'mac-arabic',
    10005: 'hebrew',  # not found: 'mac-hebrew',
    10006: 'mac-greek',
    10007: 'mac_cyrillic',  # Cyrillic (Mac)
    10008: 'gb2312',  # not found: 'mac-gb2312',
    10010: 'mac_romanian',  # Romanian (Mac)
    10021: 'thai',  # not found: mac-thai',
    10029: 'maccentraleurope',  # not found: 'mac-east europe',
    10079: 'mac_iceland',  # Icelandic (Mac)
    10081: 'mac-turkish',
    10082: 'mac_croatian',  # Croatian (Mac)

    12000: 'utf_32_le',  # Unicode UTF-32, little endian byte order
    12001: 'utf_32_be',  # Unicode UTF-32, big endian byte order

    20127: 'ascii',
    20866: 'koi8_r',  # Russian (KOI8-R); Cyrillic (KOI8-R)
    21866: 'koi8_u',  # Ukrainian (KOI8-U); Cyrillic (KOI8-U)

    28591: 'latin1',
    28592: 'iso8859_2',
    28593: 'iso8859_3',
    28594: 'iso8859_4',
    28595: 'iso8859_5',
    28596: 'iso8859_6',
    28597: 'iso8859_7',
    28598: 'iso8859_8',
    28599: 'iso8859_9',
    28603: 'iso8859_13',
    28605: 'iso8859_15',
    38598: 'iso8859_8',

    50220: 'iso2022_jp',  # ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
    50225: 'iso2022_kr',  # ISO 2022 Korean
    51932: 'euc_jp',  # EUC Japanese
    51936: 'gb2312',  # EUC Simplified Chinese (EUC-CN is Python's gb2312 codec)
    51949: 'euc_kr',  # EUC Korean
    52936: 'hz',  # HZ-GB2312 Simplified Chinese
    54936: 'gb18030',  # GB18030 Simplified Chinese (4 byte)

    65000: 'utf7',
    65001: 'utf8',
}


# === FUNCTIONS ==============================================================

def codepage2codec(codepage):
    """
    convert a codepage number to a Python codec.
    If the corresponding codec cannot be found, returns "utf8" by default.

    The explicit CODEPAGE_TO_CODEC table is consulted first; otherwise the
    conventional 'cpNNN' name is tried. In both cases the candidate is checked
    with codecs.lookup(), so the returned name can always be passed to
    bytes.decode() / codecs.decode().

    :param codepage: int, code page number. Any value that cannot be used as a
        code page number (None, a string, an unhashable object...) yields the
        "utf8" fallback instead of raising.
    :return: str, Python codec name
    """
    try:
        # Single lookup; raises TypeError for unhashable values.
        codec = CODEPAGE_TO_CODEC.get(codepage)
        if codec is None:
            # '%d' raises TypeError when codepage is not a number.
            codec = 'cp%d' % codepage
        codecs.lookup(codec)
    except (LookupError, TypeError):
        # No logger is set up in this module, so the fallback is silent.
        codec = 'utf8'
    return codec


# === MAIN: TESTS ============================================================

if __name__ == '__main__':
    # Print the codec selected for every known code page.
    for cp in sorted(CODEPAGE_NAME):
        print('Code Page: %d => codec: %s - %s' % (cp, codepage2codec(cp), CODEPAGE_NAME[cp]))