Commit fdc77bfa57cd0105ba5f1952a31cd25fa73511fe
1 parent
7374be1e
common: added new module codepages
Showing
1 changed file
with
297 additions
and
0 deletions
oletools/common/codepages.py
0 → 100644
| 1 | +""" | ||
| 2 | +codepages.py | ||
| 3 | + | ||
| 4 | +codepages is a python module to map code pages (numbers) to Python codecs, | ||
| 5 | +in order to decode bytes to unicode. | ||
| 6 | + | ||
| 7 | +Author: Philippe Lagadec - http://www.decalage.info | ||
| 8 | +License: BSD, see source code or documentation | ||
| 9 | + | ||
| 10 | +codepages is part of the python-oletools package: | ||
| 11 | +http://www.decalage.info/python/oletools | ||
| 12 | +""" | ||
| 13 | + | ||
| 14 | +# === LICENSE ================================================================== | ||
| 15 | + | ||
| 16 | +# codepages is copyright (c) 2018 Philippe Lagadec (http://www.decalage.info) | ||
| 17 | +# All rights reserved. | ||
| 18 | +# | ||
| 19 | +# Redistribution and use in source and binary forms, with or without modification, | ||
| 20 | +# are permitted provided that the following conditions are met: | ||
| 21 | +# | ||
| 22 | +# * Redistributions of source code must retain the above copyright notice, this | ||
| 23 | +# list of conditions and the following disclaimer. | ||
| 24 | +# * Redistributions in binary form must reproduce the above copyright notice, | ||
| 25 | +# this list of conditions and the following disclaimer in the documentation | ||
| 26 | +# and/or other materials provided with the distribution. | ||
| 27 | +# | ||
| 28 | +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | ||
| 29 | +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||
| 30 | +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||
| 31 | +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | ||
| 32 | +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| 33 | +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||
| 34 | +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | ||
| 35 | +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||
| 36 | +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
| 37 | +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
| 38 | + | ||
| 39 | + | ||
| 40 | +# ----------------------------------------------------------------------------- | ||
| 41 | +# CHANGELOG: | ||
| 42 | +# 2018-12-13 v0.54 PL: - first version | ||
| 43 | + | ||
| 44 | +__version__ = '0.54dev6' | ||
| 45 | + | ||
| 46 | +# ----------------------------------------------------------------------------- | ||
| 47 | +# TODO: | ||
| 48 | + | ||
| 49 | +# ----------------------------------------------------------------------------- | ||
| 50 | +# REFERENCES: | ||
| 51 | +# - https://docs.microsoft.com/en-gb/windows/desktop/Intl/code-page-identifiers | ||
| 52 | + | ||
| 53 | + | ||
| 54 | +# --- IMPORTS ----------------------------------------------------------------- | ||
| 55 | + | ||
| 56 | +import codecs | ||
| 57 | + | ||
| 58 | +# === CONSTANTS =============================================================== | ||
| 59 | + | ||
| 60 | +# Code page names from https://docs.microsoft.com/en-gb/windows/desktop/Intl/code-page-identifiers | ||
| 61 | +# Retrieved on the 2018-12-13 | ||
| 62 | +# How it was converted to Python: | ||
| 63 | +# 1) copy the table data (3 columns) from browser into Excel | ||
| 64 | +# 2) use the following formula to concatenate 1st and 3rd columns: =A1 & ": " & "'" & C1 & "'," | ||
| 65 | +# 3) copy from Excel into Python | ||
| 66 | + | ||
# Human-readable description of each Windows code page identifier,
# keyed by the code page number (int).
CODEPAGE_NAME = {
    37: 'IBM EBCDIC US-Canada',
    437: 'OEM United States',
    500: 'IBM EBCDIC International',
    708: 'Arabic (ASMO 708)',
    709: 'Arabic (ASMO-449+, BCON V4)',
    710: 'Arabic - Transparent Arabic',
    720: 'Arabic (Transparent ASMO); Arabic (DOS)',
    737: 'OEM Greek (formerly 437G); Greek (DOS)',
    775: 'OEM Baltic; Baltic (DOS)',
    850: 'OEM Multilingual Latin 1; Western European (DOS)',
    852: 'OEM Latin 2; Central European (DOS)',
    855: 'OEM Cyrillic (primarily Russian)',
    857: 'OEM Turkish; Turkish (DOS)',
    858: 'OEM Multilingual Latin 1 + Euro symbol',
    860: 'OEM Portuguese; Portuguese (DOS)',
    861: 'OEM Icelandic; Icelandic (DOS)',
    862: 'OEM Hebrew; Hebrew (DOS)',
    863: 'OEM French Canadian; French Canadian (DOS)',
    864: 'OEM Arabic; Arabic (864)',
    865: 'OEM Nordic; Nordic (DOS)',
    866: 'OEM Russian; Cyrillic (DOS)',
    869: 'OEM Modern Greek; Greek, Modern (DOS)',
    870: 'IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2',
    874: 'ANSI/OEM Thai (ISO 8859-11); Thai (Windows)',
    875: 'IBM EBCDIC Greek Modern',
    932: 'ANSI/OEM Japanese; Japanese (Shift-JIS)',
    936: 'ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)',
    949: 'ANSI/OEM Korean (Unified Hangul Code)',
    950: 'ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)',
    1026: 'IBM EBCDIC Turkish (Latin 5)',
    1047: 'IBM EBCDIC Latin 1/Open System',
    1140: 'IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)',
    1141: 'IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)',
    1142: 'IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)',
    1143: 'IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)',
    1144: 'IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)',
    1145: 'IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)',
    1146: 'IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)',
    1147: 'IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)',
    1148: 'IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)',
    1149: 'IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)',
    1200: 'Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications',
    1201: 'Unicode UTF-16, big endian byte order; available only to managed applications',
    1250: 'ANSI Central European; Central European (Windows)',
    1251: 'ANSI Cyrillic; Cyrillic (Windows)',
    1252: 'ANSI Latin 1; Western European (Windows)',
    1253: 'ANSI Greek; Greek (Windows)',
    1254: 'ANSI Turkish; Turkish (Windows)',
    1255: 'ANSI Hebrew; Hebrew (Windows)',
    1256: 'ANSI Arabic; Arabic (Windows)',
    1257: 'ANSI Baltic; Baltic (Windows)',
    1258: 'ANSI/OEM Vietnamese; Vietnamese (Windows)',
    1361: 'Korean (Johab)',
    10000: 'MAC Roman; Western European (Mac)',
    10001: 'Japanese (Mac)',
    10002: 'MAC Traditional Chinese (Big5); Chinese Traditional (Mac)',
    10003: 'Korean (Mac)',
    10004: 'Arabic (Mac)',
    10005: 'Hebrew (Mac)',
    10006: 'Greek (Mac)',
    10007: 'Cyrillic (Mac)',
    10008: 'MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)',
    10010: 'Romanian (Mac)',
    10017: 'Ukrainian (Mac)',
    10021: 'Thai (Mac)',
    10029: 'MAC Latin 2; Central European (Mac)',
    10079: 'Icelandic (Mac)',
    10081: 'Turkish (Mac)',
    10082: 'Croatian (Mac)',
    12000: 'Unicode UTF-32, little endian byte order; available only to managed applications',
    12001: 'Unicode UTF-32, big endian byte order; available only to managed applications',
    20000: 'CNS Taiwan; Chinese Traditional (CNS)',
    20001: 'TCA Taiwan',
    20002: 'Eten Taiwan; Chinese Traditional (Eten)',
    20003: 'IBM5550 Taiwan',
    20004: 'TeleText Taiwan',
    20005: 'Wang Taiwan',
    20105: 'IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)',
    20106: 'IA5 German (7-bit)',
    20107: 'IA5 Swedish (7-bit)',
    20108: 'IA5 Norwegian (7-bit)',
    20127: 'US-ASCII (7-bit)',
    20261: 'T.61',
    20269: 'ISO 6937 Non-Spacing Accent',
    20273: 'IBM EBCDIC Germany',
    20277: 'IBM EBCDIC Denmark-Norway',
    20278: 'IBM EBCDIC Finland-Sweden',
    20280: 'IBM EBCDIC Italy',
    20284: 'IBM EBCDIC Latin America-Spain',
    20285: 'IBM EBCDIC United Kingdom',
    20290: 'IBM EBCDIC Japanese Katakana Extended',
    20297: 'IBM EBCDIC France',
    20420: 'IBM EBCDIC Arabic',
    20423: 'IBM EBCDIC Greek',
    20424: 'IBM EBCDIC Hebrew',
    20833: 'IBM EBCDIC Korean Extended',
    20838: 'IBM EBCDIC Thai',
    20866: 'Russian (KOI8-R); Cyrillic (KOI8-R)',
    20871: 'IBM EBCDIC Icelandic',
    20880: 'IBM EBCDIC Cyrillic Russian',
    20905: 'IBM EBCDIC Turkish',
    20924: 'IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)',
    20932: 'Japanese (JIS 0208-1990 and 0212-1990)',
    20936: 'Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)',
    20949: 'Korean Wansung',
    21025: 'IBM EBCDIC Cyrillic Serbian-Bulgarian',
    21027: '(deprecated)',
    21866: 'Ukrainian (KOI8-U); Cyrillic (KOI8-U)',
    28591: 'ISO 8859-1 Latin 1; Western European (ISO)',
    28592: 'ISO 8859-2 Central European; Central European (ISO)',
    28593: 'ISO 8859-3 Latin 3',
    28594: 'ISO 8859-4 Baltic',
    28595: 'ISO 8859-5 Cyrillic',
    28596: 'ISO 8859-6 Arabic',
    28597: 'ISO 8859-7 Greek',
    28598: 'ISO 8859-8 Hebrew; Hebrew (ISO-Visual)',
    28599: 'ISO 8859-9 Turkish',
    28603: 'ISO 8859-13 Estonian',
    28605: 'ISO 8859-15 Latin 9',
    29001: 'Europa 3',
    38598: 'ISO 8859-8 Hebrew; Hebrew (ISO-Logical)',
    50220: 'ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)',
    50221: 'ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)',
    50222: 'ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)',
    50225: 'ISO 2022 Korean',
    50227: 'ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)',
    50229: 'ISO 2022 Traditional Chinese',
    50930: 'EBCDIC Japanese (Katakana) Extended',
    50931: 'EBCDIC US-Canada and Japanese',
    50933: 'EBCDIC Korean Extended and Korean',
    50935: 'EBCDIC Simplified Chinese Extended and Simplified Chinese',
    50936: 'EBCDIC Simplified Chinese',
    50937: 'EBCDIC US-Canada and Traditional Chinese',
    50939: 'EBCDIC Japanese (Latin) Extended and Japanese',
    51932: 'EUC Japanese',
    51936: 'EUC Simplified Chinese; Chinese Simplified (EUC)',
    51949: 'EUC Korean',
    51950: 'EUC Traditional Chinese',
    52936: 'HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)',
    54936: 'Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)',
    57002: 'ISCII Devanagari',
    57003: 'ISCII Bangla',
    57004: 'ISCII Tamil',
    57005: 'ISCII Telugu',
    57006: 'ISCII Assamese',
    57007: 'ISCII Odia',
    57008: 'ISCII Kannada',
    57009: 'ISCII Malayalam',
    57010: 'ISCII Gujarati',
    57011: 'ISCII Punjabi',
    65000: 'Unicode (UTF-7)',
    65001: 'Unicode (UTF-8)',
}
| 221 | + | ||
| 222 | + | ||
# Mapping from code pages to Python codecs, for the cases where the generic name 'cpXXX' does not work
# (inspired by http://stackoverflow.com/questions/1592925/decoding-mac-os-text-in-python)
# Explicit code page -> Python codec name overrides, used for code pages whose
# generic 'cp<number>' name is not a Python codec. Entries commented as
# "closest" are approximations: Python has no exact codec for that code page,
# so a few characters may decode differently.
CODEPAGE_TO_CODEC = {
    37: 'cp037',
    708: 'arabic',  # closest: Arabic (ASMO 708) => arabic = iso-8859-6
    709: 'arabic',  # closest: Arabic (ASMO-449+, BCON V4) => arabic = iso-8859-6
    710: 'arabic',  # closest: Arabic - Transparent Arabic => arabic = iso-8859-6
    # EBCDIC code pages must be decoded with an EBCDIC codec: an ASCII-based
    # codec such as latin1/latin2 would turn every letter into garbage.
    870: 'cp500',   # closest: IBM EBCDIC Multilingual Latin 2 (basic letters/digits match cp500)
    1047: 'cp500',  # closest: IBM EBCDIC Latin 1/Open System (differs from cp500 in a few symbols)
    1141: 'cp273',  # closest: IBM EBCDIC Germany (20273 + Euro symbol)
    1200: 'utf_16_le',  # Unicode UTF-16, little endian byte order
    1201: 'utf_16_be',  # Unicode UTF-16, big endian byte order

    10000: 'mac-roman',
    10001: 'shiftjis',  # closest: no 'mac-shift-jis' codec
    10002: 'big5',      # closest: no 'mac-big5' codec
    10003: 'ascii',     # nothing appropriate found for 'mac-hangul'
    10004: 'mac-arabic',
    10005: 'hebrew',    # closest: no 'mac-hebrew' codec
    10006: 'mac-greek',
    10007: 'mac_cyrillic',  # Cyrillic (Mac)
    10008: 'gb2312',    # closest: no 'mac-gb2312' codec
    10010: 'mac_romanian',
    10017: 'mac_cyrillic',  # closest: Ukrainian (Mac) -- NOTE(review): may differ in a few code points
    10021: 'thai',      # closest: no 'mac-thai' codec
    10029: 'maccentraleurope',  # MAC Latin 2; Central European (Mac)
    10079: 'mac_iceland',
    10081: 'mac-turkish',
    10082: 'mac_croatian',

    12000: 'utf_32_le',  # Unicode UTF-32, little endian byte order
    12001: 'utf_32_be',  # Unicode UTF-32, big endian byte order

    20127: 'ascii',
    20273: 'cp273',     # IBM EBCDIC Germany
    20424: 'cp424',     # IBM EBCDIC Hebrew
    20866: 'koi8_r',    # Russian (KOI8-R)
    20936: 'gb2312',    # Simplified Chinese (GB2312-80)
    21866: 'koi8_u',    # Ukrainian (KOI8-U)

    28591: 'latin1',
    28592: 'iso8859_2',
    28593: 'iso8859_3',
    28594: 'iso8859_4',
    28595: 'iso8859_5',
    28596: 'iso8859_6',
    28597: 'iso8859_7',
    28598: 'iso8859_8',
    28599: 'iso8859_9',
    28603: 'iso8859_13',
    28605: 'iso8859_15',
    38598: 'iso8859_8',

    50220: 'iso2022_jp',  # ISO 2022 Japanese with no halfwidth Katakana
    50225: 'iso2022_kr',  # ISO 2022 Korean
    51932: 'euc_jp',      # EUC Japanese
    51936: 'gb2312',      # EUC Simplified Chinese
    51949: 'euc_kr',      # EUC Korean
    52936: 'hz',          # HZ-GB2312 Simplified Chinese
    54936: 'gb18030',     # GB18030 Simplified Chinese

    65000: 'utf7',
    65001: 'utf8',
}
| 270 | + | ||
| 271 | + | ||
| 272 | +# === FUNCTIONS ============================================================== | ||
| 273 | + | ||
def codepage2codec(codepage):
    """
    Convert a code page number to the name of a Python codec.

    The explicit CODEPAGE_TO_CODEC table is consulted first; when the code
    page is not listed there, the generic name 'cp<number>' is tried.
    In both cases the candidate name is checked with codecs.lookup(), and
    "utf8" is returned by default if Python does not provide that codec.

    :param codepage: int, code page number
    :return: str, Python codec name
    """
    codec = CODEPAGE_TO_CODEC.get(codepage)
    if codec is None:
        # no explicit mapping: most Windows code pages are available as 'cpNNN'
        codec = 'cp%d' % codepage
    try:
        # validate explicit mappings too: some codecs listed in the table only
        # exist in recent Python versions
        codecs.lookup(codec)
    except LookupError:
        # codec not found for this code page: silently use UTF-8 as fallback
        codec = 'utf8'
    return codec
| 292 | + | ||
| 293 | +# === MAIN: TESTS ============================================================ | ||
| 294 | + | ||
if __name__ == '__main__':
    # Self-test when run as a script: list every known code page, in numeric
    # order, with the codec selected for it and its description.
    for cp in sorted(CODEPAGE_NAME):
        print('Code Page: %d => codec: %s - %s' % (cp, codepage2codec(cp), CODEPAGE_NAME[cp]))
| 0 | \ No newline at end of file | 298 | \ No newline at end of file |