Commit fdc77bfa57cd0105ba5f1952a31cd25fa73511fe
1 parent
7374be1e
common: added new module codepages
Showing
1 changed file
with
297 additions
and
0 deletions
oletools/common/codepages.py
0 → 100644
| 1 | +""" | |
| 2 | +codepages.py | |
| 3 | + | |
| 4 | +codepages is a python module to map code pages (numbers) to Python codecs, | |
| 5 | +in order to decode bytes to unicode. | |
| 6 | + | |
| 7 | +Author: Philippe Lagadec - http://www.decalage.info | |
| 8 | +License: BSD, see source code or documentation | |
| 9 | + | |
| 10 | +codepages is part of the python-oletools package: | |
| 11 | +http://www.decalage.info/python/oletools | |
| 12 | +""" | |
| 13 | + | |
| 14 | +# === LICENSE ================================================================== | |
| 15 | + | |
| 16 | +# codepages is copyright (c) 2018 Philippe Lagadec (http://www.decalage.info) | |
| 17 | +# All rights reserved. | |
| 18 | +# | |
| 19 | +# Redistribution and use in source and binary forms, with or without modification, | |
| 20 | +# are permitted provided that the following conditions are met: | |
| 21 | +# | |
| 22 | +# * Redistributions of source code must retain the above copyright notice, this | |
| 23 | +# list of conditions and the following disclaimer. | |
| 24 | +# * Redistributions in binary form must reproduce the above copyright notice, | |
| 25 | +# this list of conditions and the following disclaimer in the documentation | |
| 26 | +# and/or other materials provided with the distribution. | |
| 27 | +# | |
| 28 | +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
| 29 | +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| 30 | +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| 31 | +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | |
| 32 | +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 33 | +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
| 34 | +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
| 35 | +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
| 36 | +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| 37 | +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 38 | + | |
| 39 | + | |
| 40 | +# ----------------------------------------------------------------------------- | |
| 41 | +# CHANGELOG: | |
| 42 | +# 2018-12-13 v0.54 PL: - first version | |
| 43 | + | |
| 44 | +__version__ = '0.54dev6' | |
| 45 | + | |
| 46 | +# ----------------------------------------------------------------------------- | |
| 47 | +# TODO: | |
| 48 | + | |
| 49 | +# ----------------------------------------------------------------------------- | |
| 50 | +# REFERENCES: | |
| 51 | +# - https://docs.microsoft.com/en-gb/windows/desktop/Intl/code-page-identifiers | |
| 52 | + | |
| 53 | + | |
| 54 | +# --- IMPORTS ----------------------------------------------------------------- | |
| 55 | + | |
| 56 | +import codecs | |
| 57 | + | |
| 58 | +# === CONSTANTS =============================================================== | |
| 59 | + | |
# Human-readable description of each Windows code page identifier.
# Source: https://docs.microsoft.com/en-gb/windows/desktop/Intl/code-page-identifiers
# (retrieved on 2018-12-13).
# The table was converted to Python as follows:
# 1) copy the table data (3 columns) from the browser into Excel
# 2) concatenate the 1st and 3rd columns with: =A1 & ": " & "'" & C1 & "',"
# 3) copy the result from Excel into Python

CODEPAGE_NAME = {
    # --- IBM EBCDIC / OEM (DOS) code pages ---
    37: 'IBM EBCDIC US-Canada',
    437: 'OEM United States',
    500: 'IBM EBCDIC International',
    708: 'Arabic (ASMO 708)',
    709: 'Arabic (ASMO-449+, BCON V4)',
    710: 'Arabic - Transparent Arabic',
    720: 'Arabic (Transparent ASMO); Arabic (DOS)',
    737: 'OEM Greek (formerly 437G); Greek (DOS)',
    775: 'OEM Baltic; Baltic (DOS)',
    850: 'OEM Multilingual Latin 1; Western European (DOS)',
    852: 'OEM Latin 2; Central European (DOS)',
    855: 'OEM Cyrillic (primarily Russian)',
    857: 'OEM Turkish; Turkish (DOS)',
    858: 'OEM Multilingual Latin 1 + Euro symbol',
    860: 'OEM Portuguese; Portuguese (DOS)',
    861: 'OEM Icelandic; Icelandic (DOS)',
    862: 'OEM Hebrew; Hebrew (DOS)',
    863: 'OEM French Canadian; French Canadian (DOS)',
    864: 'OEM Arabic; Arabic (864)',
    865: 'OEM Nordic; Nordic (DOS)',
    866: 'OEM Russian; Cyrillic (DOS)',
    869: 'OEM Modern Greek; Greek, Modern (DOS)',
    870: 'IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2',
    874: 'ANSI/OEM Thai (ISO 8859-11); Thai (Windows)',
    875: 'IBM EBCDIC Greek Modern',

    # --- ANSI/OEM East Asian code pages ---
    932: 'ANSI/OEM Japanese; Japanese (Shift-JIS)',
    936: 'ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)',
    949: 'ANSI/OEM Korean (Unified Hangul Code)',
    950: 'ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)',

    # --- IBM EBCDIC (including the "+ Euro symbol" variants) ---
    1026: 'IBM EBCDIC Turkish (Latin 5)',
    1047: 'IBM EBCDIC Latin 1/Open System',
    1140: 'IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)',
    1141: 'IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)',
    1142: 'IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)',
    1143: 'IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)',
    1144: 'IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)',
    1145: 'IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)',
    1146: 'IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)',
    1147: 'IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)',
    1148: 'IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)',
    1149: 'IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)',

    # --- UTF-16 ---
    1200: 'Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications',
    1201: 'Unicode UTF-16, big endian byte order; available only to managed applications',

    # --- Windows ANSI code pages ---
    1250: 'ANSI Central European; Central European (Windows)',
    1251: 'ANSI Cyrillic; Cyrillic (Windows)',
    1252: 'ANSI Latin 1; Western European (Windows)',
    1253: 'ANSI Greek; Greek (Windows)',
    1254: 'ANSI Turkish; Turkish (Windows)',
    1255: 'ANSI Hebrew; Hebrew (Windows)',
    1256: 'ANSI Arabic; Arabic (Windows)',
    1257: 'ANSI Baltic; Baltic (Windows)',
    1258: 'ANSI/OEM Vietnamese; Vietnamese (Windows)',
    1361: 'Korean (Johab)',

    # --- Macintosh code pages ---
    10000: 'MAC Roman; Western European (Mac)',
    10001: 'Japanese (Mac)',
    10002: 'MAC Traditional Chinese (Big5); Chinese Traditional (Mac)',
    10003: 'Korean (Mac)',
    10004: 'Arabic (Mac)',
    10005: 'Hebrew (Mac)',
    10006: 'Greek (Mac)',
    10007: 'Cyrillic (Mac)',
    10008: 'MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)',
    10010: 'Romanian (Mac)',
    10017: 'Ukrainian (Mac)',
    10021: 'Thai (Mac)',
    10029: 'MAC Latin 2; Central European (Mac)',
    10079: 'Icelandic (Mac)',
    10081: 'Turkish (Mac)',
    10082: 'Croatian (Mac)',

    # --- UTF-32 ---
    12000: 'Unicode UTF-32, little endian byte order; available only to managed applications',
    12001: 'Unicode UTF-32, big endian byte order; available only to managed applications',

    # --- Taiwanese vendor code pages ---
    20000: 'CNS Taiwan; Chinese Traditional (CNS)',
    20001: 'TCA Taiwan',
    20002: 'Eten Taiwan; Chinese Traditional (Eten)',
    20003: 'IBM5550 Taiwan',
    20004: 'TeleText Taiwan',
    20005: 'Wang Taiwan',

    # --- 7-bit and miscellaneous code pages ---
    20105: 'IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)',
    20106: 'IA5 German (7-bit)',
    20107: 'IA5 Swedish (7-bit)',
    20108: 'IA5 Norwegian (7-bit)',
    20127: 'US-ASCII (7-bit)',
    20261: 'T.61',
    20269: 'ISO 6937 Non-Spacing Accent',

    # --- IBM EBCDIC national variants and other 20xxx/21xxx code pages ---
    20273: 'IBM EBCDIC Germany',
    20277: 'IBM EBCDIC Denmark-Norway',
    20278: 'IBM EBCDIC Finland-Sweden',
    20280: 'IBM EBCDIC Italy',
    20284: 'IBM EBCDIC Latin America-Spain',
    20285: 'IBM EBCDIC United Kingdom',
    20290: 'IBM EBCDIC Japanese Katakana Extended',
    20297: 'IBM EBCDIC France',
    20420: 'IBM EBCDIC Arabic',
    20423: 'IBM EBCDIC Greek',
    20424: 'IBM EBCDIC Hebrew',
    20833: 'IBM EBCDIC Korean Extended',
    20838: 'IBM EBCDIC Thai',
    20866: 'Russian (KOI8-R); Cyrillic (KOI8-R)',
    20871: 'IBM EBCDIC Icelandic',
    20880: 'IBM EBCDIC Cyrillic Russian',
    20905: 'IBM EBCDIC Turkish',
    20924: 'IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)',
    20932: 'Japanese (JIS 0208-1990 and 0212-1990)',
    20936: 'Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)',
    20949: 'Korean Wansung',
    21025: 'IBM EBCDIC Cyrillic Serbian-Bulgarian',
    21027: '(deprecated)',
    21866: 'Ukrainian (KOI8-U); Cyrillic (KOI8-U)',

    # --- ISO 8859 family ---
    28591: 'ISO 8859-1 Latin 1; Western European (ISO)',
    28592: 'ISO 8859-2 Central European; Central European (ISO)',
    28593: 'ISO 8859-3 Latin 3',
    28594: 'ISO 8859-4 Baltic',
    28595: 'ISO 8859-5 Cyrillic',
    28596: 'ISO 8859-6 Arabic',
    28597: 'ISO 8859-7 Greek',
    28598: 'ISO 8859-8 Hebrew; Hebrew (ISO-Visual)',
    28599: 'ISO 8859-9 Turkish',
    28603: 'ISO 8859-13 Estonian',
    28605: 'ISO 8859-15 Latin 9',
    29001: 'Europa 3',
    38598: 'ISO 8859-8 Hebrew; Hebrew (ISO-Logical)',

    # --- ISO 2022 family ---
    50220: 'ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)',
    50221: 'ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)',
    50222: 'ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)',
    50225: 'ISO 2022 Korean',
    50227: 'ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)',
    50229: 'ISO 2022 Traditional Chinese',

    # --- EBCDIC East Asian code pages ---
    50930: 'EBCDIC Japanese (Katakana) Extended',
    50931: 'EBCDIC US-Canada and Japanese',
    50933: 'EBCDIC Korean Extended and Korean',
    50935: 'EBCDIC Simplified Chinese Extended and Simplified Chinese',
    50936: 'EBCDIC Simplified Chinese',
    50937: 'EBCDIC US-Canada and Traditional Chinese',
    50939: 'EBCDIC Japanese (Latin) Extended and Japanese',

    # --- EUC, HZ and GB18030 ---
    51932: 'EUC Japanese',
    51936: 'EUC Simplified Chinese; Chinese Simplified (EUC)',
    51949: 'EUC Korean',
    51950: 'EUC Traditional Chinese',
    52936: 'HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)',
    54936: 'Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)',

    # --- ISCII (Indic scripts) ---
    57002: 'ISCII Devanagari',
    57003: 'ISCII Bangla',
    57004: 'ISCII Tamil',
    57005: 'ISCII Telugu',
    57006: 'ISCII Assamese',
    57007: 'ISCII Odia',
    57008: 'ISCII Kannada',
    57009: 'ISCII Malayalam',
    57010: 'ISCII Gujarati',
    57011: 'ISCII Punjabi',

    # --- UTF-7 / UTF-8 ---
    65000: 'Unicode (UTF-7)',
    65001: 'Unicode (UTF-8)',
}
| 221 | + | |
| 222 | + | |
# Mapping from code pages to Python codecs, for the code pages where the
# default name 'cp<number>' is not a codec known to Python.
# (inspired from http://stackoverflow.com/questions/1592925/decoding-mac-os-text-in-python)
#
# Entries marked "approximation" have no exact codec in the standard library;
# the closest available codec of the same family is used instead, so some
# characters may decode incorrectly.
CODEPAGE_TO_CODEC = {
    # --- IBM EBCDIC ---
    37: 'cp037',
    # Approximations: an EBCDIC code page must map to an EBCDIC codec. An
    # ASCII-based codec such as latin1/latin2 would decode every byte wrongly,
    # whereas another Latin EBCDIC codec at least decodes unaccented letters
    # and digits correctly.
    870: 'cp500',  # approximation: IBM EBCDIC Multilingual Latin 2 (no cp870 codec)
    1047: 'cp037',  # approximation: IBM EBCDIC Latin 1/Open System (no cp1047 codec)
    1141: 'cp273',  # approximation: IBM EBCDIC Germany (20273 + Euro symbol)
    20273: 'cp273',  # IBM EBCDIC Germany (codec only present in recent Python versions)

    # --- Arabic ---
    708: 'arabic',  # not found: Arabic (ASMO 708) => arabic = iso-8859-6
    709: 'arabic',  # not found: Arabic (ASMO-449+, BCON V4) => arabic = iso-8859-6
    710: 'arabic',  # not found: Arabic - Transparent Arabic => arabic = iso-8859-6

    # --- UTF-16 ---
    1200: 'utf_16_le',  # Unicode UTF-16, little endian byte order
    1201: 'utf_16_be',  # Unicode UTF-16, big endian byte order

    # --- Korean ---
    1361: 'johab',  # Korean (Johab)

    # --- Macintosh ---
    10000: 'mac-roman',
    10001: 'shiftjis',  # not found: 'mac-shift-jis',
    10002: 'big5',  # not found: 'mac-big5',
    10003: 'ascii',  # nothing appropriate found: 'mac-hangul',
    10004: 'mac-arabic',
    10005: 'hebrew',  # not found: 'mac-hebrew',
    10006: 'mac-greek',
    10007: 'mac_cyrillic',  # Cyrillic (Mac)
    10008: 'gb2312',  # not found: 'mac-gb2312',
    10021: 'thai',  # not found: mac-thai',
    10029: 'maccentraleurope',  # not found: 'mac-east europe',
    10079: 'mac_iceland',  # Icelandic (Mac)
    10081: 'mac-turkish',

    # --- UTF-32 ---
    12000: 'utf_32_le',  # Unicode UTF-32, little endian byte order
    12001: 'utf_32_be',  # Unicode UTF-32, big endian byte order

    # --- 7-bit ASCII ---
    20127: 'ascii',

    # --- KOI8 ---
    20866: 'koi8_r',  # Russian (KOI8-R)
    21866: 'koi8_u',  # Ukrainian (KOI8-U)

    # --- ISO 8859 family ---
    28591: 'latin1',
    28592: 'iso8859_2',
    28593: 'iso8859_3',
    28594: 'iso8859_4',
    28595: 'iso8859_5',
    28596: 'iso8859_6',
    28597: 'iso8859_7',
    28598: 'iso8859_8',
    28599: 'iso8859_9',
    28603: 'iso8859_13',
    28605: 'iso8859_15',
    38598: 'iso8859_8',

    # --- ISO 2022, EUC, HZ, GB18030 ---
    50220: 'iso2022_jp',  # ISO 2022 Japanese with no halfwidth Katakana
    50225: 'iso2022_kr',  # ISO 2022 Korean
    51932: 'euc_jp',  # EUC Japanese
    51949: 'euc_kr',  # EUC Korean
    52936: 'hz',  # HZ-GB2312 Simplified Chinese
    54936: 'gb18030',  # GB18030 Simplified Chinese

    # --- UTF-7 / UTF-8 ---
    65000: 'utf7',
    65001: 'utf8',
}
| 270 | + | |
| 271 | + | |
| 272 | +# === FUNCTIONS ============================================================== | |
| 273 | + | |
def codepage2codec(codepage):
    """
    Return the name of the Python codec matching a code page number.

    The CODEPAGE_TO_CODEC table is consulted first; a code page absent from
    that table is assumed to have a codec named 'cp<number>'. The resulting
    name is checked with codecs.lookup(), and "utf8" is returned instead when
    Python does not know that codec.

    :param codepage: int, code page number
    :return: str, Python codec name
    """
    candidate = CODEPAGE_TO_CODEC.get(codepage)
    if candidate is None:
        # No explicit mapping: try the conventional 'cpXXX' codec name.
        candidate = 'cp%d' % codepage
    try:
        codecs.lookup(candidate)
    except LookupError:
        # Unknown codec: fall back to UTF-8 silently (no message is logged).
        return 'utf8'
    return candidate
| 292 | + | |
| 293 | +# === MAIN: TESTS ============================================================ | |
| 294 | + | |
if __name__ == '__main__':
    # Self-test: print every known code page, in ascending numeric order,
    # together with the codec selected for it and its description.
    for cp, description in sorted(CODEPAGE_NAME.items()):
        print('Code Page: %d => codec: %s - %s' % (cp, codepage2codec(cp), description))
| 0 | 298 | \ No newline at end of file | ... | ... |