Commit fdc77bfa57cd0105ba5f1952a31cd25fa73511fe

Authored by decalage2
1 parent 7374be1e

common: added new module codepages

Showing 1 changed file with 297 additions and 0 deletions
oletools/common/codepages.py 0 → 100644
  1 +"""
  2 +codepages.py
  3 +
  4 +codepages is a python module to map code pages (numbers) to Python codecs,
  5 +in order to decode bytes to unicode.
  6 +
  7 +Author: Philippe Lagadec - http://www.decalage.info
  8 +License: BSD, see source code or documentation
  9 +
  10 +codepages is part of the python-oletools package:
  11 +http://www.decalage.info/python/oletools
  12 +"""
  13 +
  14 +# === LICENSE ==================================================================
  15 +
  16 +# codepages is copyright (c) 2018 Philippe Lagadec (http://www.decalage.info)
  17 +# All rights reserved.
  18 +#
  19 +# Redistribution and use in source and binary forms, with or without modification,
  20 +# are permitted provided that the following conditions are met:
  21 +#
  22 +# * Redistributions of source code must retain the above copyright notice, this
  23 +# list of conditions and the following disclaimer.
  24 +# * Redistributions in binary form must reproduce the above copyright notice,
  25 +# this list of conditions and the following disclaimer in the documentation
  26 +# and/or other materials provided with the distribution.
  27 +#
  28 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  29 +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  30 +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  31 +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  32 +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  33 +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  34 +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  36 +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  37 +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  38 +
  39 +
  40 +# -----------------------------------------------------------------------------
  41 +# CHANGELOG:
  42 +# 2018-12-13 v0.54 PL: - first version
  43 +
  44 +__version__ = '0.54dev6'
  45 +
  46 +# -----------------------------------------------------------------------------
  47 +# TODO:
  48 +
  49 +# -----------------------------------------------------------------------------
  50 +# REFERENCES:
  51 +# - https://docs.microsoft.com/en-gb/windows/desktop/Intl/code-page-identifiers
  52 +
  53 +
  54 +# --- IMPORTS -----------------------------------------------------------------
  55 +
  56 +import codecs
  57 +
  58 +# === CONSTANTS ===============================================================
  59 +
  60 +# Code page names from https://docs.microsoft.com/en-gb/windows/desktop/Intl/code-page-identifiers
  61 +# Retrieved on the 2018-12-13
  62 +# How it was converted to Python:
  63 +# 1) copy the table data (3 columns) from browser into Excel
  64 +# 2) use the following formula to concatenate 1st and 3rd columns: =A1 & ": " & "'" & C1 & "',"
  65 +# 3) copy from Excel into Python
  66 +
# Human-readable description of each Windows code page identifier, keyed by
# the code page number. Entries are kept in ascending numeric order; the
# section comments below are only a reading aid.
CODEPAGE_NAME = {
    # --- IBM EBCDIC, OEM (DOS) and ANSI/OEM code pages ---
    37: 'IBM EBCDIC US-Canada',
    437: 'OEM United States',
    500: 'IBM EBCDIC International',
    708: 'Arabic (ASMO 708)',
    709: 'Arabic (ASMO-449+, BCON V4)',
    710: 'Arabic - Transparent Arabic',
    720: 'Arabic (Transparent ASMO); Arabic (DOS)',
    737: 'OEM Greek (formerly 437G); Greek (DOS)',
    775: 'OEM Baltic; Baltic (DOS)',
    850: 'OEM Multilingual Latin 1; Western European (DOS)',
    852: 'OEM Latin 2; Central European (DOS)',
    855: 'OEM Cyrillic (primarily Russian)',
    857: 'OEM Turkish; Turkish (DOS)',
    858: 'OEM Multilingual Latin 1 + Euro symbol',
    860: 'OEM Portuguese; Portuguese (DOS)',
    861: 'OEM Icelandic; Icelandic (DOS)',
    862: 'OEM Hebrew; Hebrew (DOS)',
    863: 'OEM French Canadian; French Canadian (DOS)',
    864: 'OEM Arabic; Arabic (864)',
    865: 'OEM Nordic; Nordic (DOS)',
    866: 'OEM Russian; Cyrillic (DOS)',
    869: 'OEM Modern Greek; Greek, Modern (DOS)',
    870: 'IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2',
    874: 'ANSI/OEM Thai (ISO 8859-11); Thai (Windows)',
    875: 'IBM EBCDIC Greek Modern',
    932: 'ANSI/OEM Japanese; Japanese (Shift-JIS)',
    936: 'ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)',
    949: 'ANSI/OEM Korean (Unified Hangul Code)',
    950: 'ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)',
    1026: 'IBM EBCDIC Turkish (Latin 5)',
    1047: 'IBM EBCDIC Latin 1/Open System',
    1140: 'IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)',
    1141: 'IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)',
    1142: 'IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)',
    1143: 'IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)',
    1144: 'IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)',
    1145: 'IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)',
    1146: 'IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)',
    1147: 'IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)',
    1148: 'IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)',
    1149: 'IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)',

    # --- Unicode UTF-16 ---
    1200: 'Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications',
    1201: 'Unicode UTF-16, big endian byte order; available only to managed applications',

    # --- Windows ANSI code pages, Korean Johab ---
    1250: 'ANSI Central European; Central European (Windows)',
    1251: 'ANSI Cyrillic; Cyrillic (Windows)',
    1252: 'ANSI Latin 1; Western European (Windows)',
    1253: 'ANSI Greek; Greek (Windows)',
    1254: 'ANSI Turkish; Turkish (Windows)',
    1255: 'ANSI Hebrew; Hebrew (Windows)',
    1256: 'ANSI Arabic; Arabic (Windows)',
    1257: 'ANSI Baltic; Baltic (Windows)',
    1258: 'ANSI/OEM Vietnamese; Vietnamese (Windows)',
    1361: 'Korean (Johab)',

    # --- Macintosh code pages ---
    10000: 'MAC Roman; Western European (Mac)',
    10001: 'Japanese (Mac)',
    10002: 'MAC Traditional Chinese (Big5); Chinese Traditional (Mac)',
    10003: 'Korean (Mac)',
    10004: 'Arabic (Mac)',
    10005: 'Hebrew (Mac)',
    10006: 'Greek (Mac)',
    10007: 'Cyrillic (Mac)',
    10008: 'MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)',
    10010: 'Romanian (Mac)',
    10017: 'Ukrainian (Mac)',
    10021: 'Thai (Mac)',
    10029: 'MAC Latin 2; Central European (Mac)',
    10079: 'Icelandic (Mac)',
    10081: 'Turkish (Mac)',
    10082: 'Croatian (Mac)',

    # --- Unicode UTF-32 ---
    12000: 'Unicode UTF-32, little endian byte order; available only to managed applications',
    12001: 'Unicode UTF-32, big endian byte order; available only to managed applications',

    # --- Taiwan ---
    20000: 'CNS Taiwan; Chinese Traditional (CNS)',
    20001: 'TCA Taiwan',
    20002: 'Eten Taiwan; Chinese Traditional (Eten)',
    20003: 'IBM5550 Taiwan',
    20004: 'TeleText Taiwan',
    20005: 'Wang Taiwan',

    # --- 7-bit and other legacy character sets ---
    20105: 'IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)',
    20106: 'IA5 German (7-bit)',
    20107: 'IA5 Swedish (7-bit)',
    20108: 'IA5 Norwegian (7-bit)',
    20127: 'US-ASCII (7-bit)',
    20261: 'T.61',
    20269: 'ISO 6937 Non-Spacing Accent',

    # --- IBM EBCDIC national variants, KOI8 and other 20xxx/21xxx pages ---
    20273: 'IBM EBCDIC Germany',
    20277: 'IBM EBCDIC Denmark-Norway',
    20278: 'IBM EBCDIC Finland-Sweden',
    20280: 'IBM EBCDIC Italy',
    20284: 'IBM EBCDIC Latin America-Spain',
    20285: 'IBM EBCDIC United Kingdom',
    20290: 'IBM EBCDIC Japanese Katakana Extended',
    20297: 'IBM EBCDIC France',
    20420: 'IBM EBCDIC Arabic',
    20423: 'IBM EBCDIC Greek',
    20424: 'IBM EBCDIC Hebrew',
    20833: 'IBM EBCDIC Korean Extended',
    20838: 'IBM EBCDIC Thai',
    20866: 'Russian (KOI8-R); Cyrillic (KOI8-R)',
    20871: 'IBM EBCDIC Icelandic',
    20880: 'IBM EBCDIC Cyrillic Russian',
    20905: 'IBM EBCDIC Turkish',
    20924: 'IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)',
    20932: 'Japanese (JIS 0208-1990 and 0212-1990)',
    20936: 'Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)',
    20949: 'Korean Wansung',
    21025: 'IBM EBCDIC Cyrillic Serbian-Bulgarian',
    21027: '(deprecated)',
    21866: 'Ukrainian (KOI8-U); Cyrillic (KOI8-U)',

    # --- ISO 8859 family (and Europa 3) ---
    28591: 'ISO 8859-1 Latin 1; Western European (ISO)',
    28592: 'ISO 8859-2 Central European; Central European (ISO)',
    28593: 'ISO 8859-3 Latin 3',
    28594: 'ISO 8859-4 Baltic',
    28595: 'ISO 8859-5 Cyrillic',
    28596: 'ISO 8859-6 Arabic',
    28597: 'ISO 8859-7 Greek',
    28598: 'ISO 8859-8 Hebrew; Hebrew (ISO-Visual)',
    28599: 'ISO 8859-9 Turkish',
    28603: 'ISO 8859-13 Estonian',
    28605: 'ISO 8859-15 Latin 9',
    29001: 'Europa 3',
    38598: 'ISO 8859-8 Hebrew; Hebrew (ISO-Logical)',

    # --- ISO 2022 ---
    50220: 'ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)',
    50221: 'ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)',
    50222: 'ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)',
    50225: 'ISO 2022 Korean',
    50227: 'ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)',
    50229: 'ISO 2022 Traditional Chinese',

    # --- EBCDIC mixed single/double-byte ---
    50930: 'EBCDIC Japanese (Katakana) Extended',
    50931: 'EBCDIC US-Canada and Japanese',
    50933: 'EBCDIC Korean Extended and Korean',
    50935: 'EBCDIC Simplified Chinese Extended and Simplified Chinese',
    50936: 'EBCDIC Simplified Chinese',
    50937: 'EBCDIC US-Canada and Traditional Chinese',
    50939: 'EBCDIC Japanese (Latin) Extended and Japanese',

    # --- EUC, HZ and GB18030 ---
    51932: 'EUC Japanese',
    51936: 'EUC Simplified Chinese; Chinese Simplified (EUC)',
    51949: 'EUC Korean',
    51950: 'EUC Traditional Chinese',
    52936: 'HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)',
    54936: 'Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)',

    # --- ISCII (Indic scripts) ---
    57002: 'ISCII Devanagari',
    57003: 'ISCII Bangla',
    57004: 'ISCII Tamil',
    57005: 'ISCII Telugu',
    57006: 'ISCII Assamese',
    57007: 'ISCII Odia',
    57008: 'ISCII Kannada',
    57009: 'ISCII Malayalam',
    57010: 'ISCII Gujarati',
    57011: 'ISCII Punjabi',

    # --- Unicode UTF-7 / UTF-8 ---
    65000: 'Unicode (UTF-7)',
    65001: 'Unicode (UTF-8)',
}
  221 +
  222 +
  223 +# Mapping from codepages to Python codecs, when 'cpXXX' does not work
  224 +# (inspired from http://stackoverflow.com/questions/1592925/decoding-mac-os-text-in-python)
# Explicit code page -> Python codec overrides. Only code pages whose codec
# name is NOT simply 'cp<number>' need an entry here; every value must be a
# name accepted by codecs.lookup().
CODEPAGE_TO_CODEC = {
    37: 'cp037',  # codec name is zero-padded
    708: 'arabic',  # not found: Arabic (ASMO 708) => arabic = iso-8859-6
    709: 'arabic',  # not found: Arabic (ASMO-449+, BCON V4) => arabic = iso-8859-6
    710: 'arabic',  # not found: Arabic - Transparent Arabic => arabic = iso-8859-6
    # NOTE(review): 870 and 1047 are EBCDIC code pages but are mapped to
    # ASCII-based Latin codecs, so decoded text is unlikely to be correct.
    # Kept as-is for backward compatibility; confirm whether a closer codec
    # should be used.
    870: 'latin2',  # IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
    1047: 'latin1',  # IBM EBCDIC Latin 1/Open System
    1141: 'cp273',  # IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
    1200: 'utf_16_le',  # Unicode UTF-16, little endian byte order (BMP of ISO 10646)
    1201: 'utf_16_be',  # Unicode UTF-16, big endian byte order

    # Macintosh code pages
    10000: 'mac-roman',
    10001: 'shiftjis',  # not found: 'mac-shift-jis',
    10002: 'big5',  # not found: 'mac-big5',
    10003: 'ascii',  # nothing appropriate found: 'mac-hangul',
    10004: 'mac-arabic',
    10005: 'hebrew',  # not found: 'mac-hebrew',
    10006: 'mac-greek',
    10007: 'mac_cyrillic',  # Cyrillic (Mac): closest stdlib codec (was 'ascii')
    10008: 'gb2312',  # not found: 'mac-gb2312',
    10010: 'mac_romanian',  # Romanian (Mac)
    10017: 'mac_cyrillic',  # Ukrainian (Mac): closest stdlib codec
    10021: 'thai',  # not found: mac-thai',
    10029: 'maccentraleurope',  # not found: 'mac-east europe',
    10079: 'mac_iceland',  # Icelandic (Mac)
    10081: 'mac-turkish',
    10082: 'mac_croatian',  # Croatian (Mac)

    12000: 'utf_32_le',  # Unicode UTF-32, little endian byte order
    12001: 'utf_32_be',  # Unicode UTF-32, big endian byte order

    20127: 'ascii',
    20866: 'koi8_r',  # Russian (KOI8-R); Cyrillic (KOI8-R)
    20936: 'gb2312',  # Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
    21866: 'koi8_u',  # Ukrainian (KOI8-U); Cyrillic (KOI8-U)

    # ISO 8859 family
    28591: 'latin1',
    28592: 'iso8859_2',
    28593: 'iso8859_3',
    28594: 'iso8859_4',
    28595: 'iso8859_5',
    28596: 'iso8859_6',
    28597: 'iso8859_7',
    28598: 'iso8859_8',
    28599: 'iso8859_9',
    28603: 'iso8859_13',
    28605: 'iso8859_15',
    38598: 'iso8859_8',

    # ISO 2022, EUC, HZ and GB18030: without these entries the 'cp<number>'
    # guess finds no codec and the caller silently gets the UTF-8 fallback.
    50220: 'iso2022_jp',  # ISO 2022 Japanese with no halfwidth Katakana
    50225: 'iso2022_kr',  # ISO 2022 Korean
    51932: 'euc_jp',  # EUC Japanese
    51936: 'gb2312',  # EUC Simplified Chinese
    51949: 'euc_kr',  # EUC Korean
    52936: 'hz',  # HZ-GB2312 Simplified Chinese
    54936: 'gb18030',  # GB18030 Simplified Chinese

    65000: 'utf7',
    65001: 'utf8',
}
  270 +
  271 +
  272 +# === FUNCTIONS ==============================================================
  273 +
def codepage2codec(codepage):
    """
    Convert a code page number to the name of a Python codec.

    The explicit overrides in CODEPAGE_TO_CODEC are tried first; otherwise
    the conventional name 'cp<number>' is used. The candidate name is then
    checked with codecs.lookup(). If no usable codec is found, or if
    codepage is not a usable integer (e.g. None or a string), "utf8" is
    returned by default instead of raising.

    :param codepage: int, code page number
    :return: str, Python codec name
    """
    fallback = 'utf8'
    try:
        # TypeError here means codepage is unhashable (e.g. a list)
        codec = CODEPAGE_TO_CODEC.get(codepage)
    except TypeError:
        return fallback
    if codec is None:
        try:
            codec = 'cp%d' % codepage
        except (TypeError, ValueError, OverflowError):
            # codepage is not a number that '%d' can format (None, str, NaN, ...)
            return fallback
    try:
        # only used to check that the codec exists in this Python build
        codecs.lookup(codec)
    except LookupError:
        #log.error('Codec not found for code page %d, using UTF-8 as fallback.' % codepage)
        codec = fallback
    return codec
  292 +
  293 +# === MAIN: TESTS ============================================================
  294 +
if __name__ == '__main__':
    # Self-test: print the codec selected for every known code page, in
    # ascending code page order, next to its description.
    for number, description in sorted(CODEPAGE_NAME.items()):
        selected = codepage2codec(number)
        print('Code Page: %d => codec: %s - %s' % (number, selected, description))
0 298 \ No newline at end of file
... ...