Peter M. Groen / oletools

Browse Code »

Commit fdc77bfa57cd0105ba5f1952a31cd25fa73511fe

Authored by decalage2 2018-12-14 13:59:06 +0100

1 parent 7374be1e

common: added new module codepages

Inline Side-by-side

Showing 1 changed file with 297 additions and 0 deletions

oletools/common/codepages.py 0 → 100644

View file @fdc77bf

	1	+"""
	2	+codepages.py
	3	+
	4	+codepages is a python module to map code pages (numbers) to Python codecs,
	5	+in order to decode bytes to unicode.
	6	+
	7	+Author: Philippe Lagadec - http://www.decalage.info
	8	+License: BSD, see source code or documentation
	9	+
	10	+codepages is part of the python-oletools package:
	11	+http://www.decalage.info/python/oletools
	12	+"""
	13	+
	14	+# === LICENSE ==================================================================
	15	+
	16	+# codepages is copyright (c) 2018 Philippe Lagadec (http://www.decalage.info)
	17	+# All rights reserved.
	18	+#
	19	+# Redistribution and use in source and binary forms, with or without modification,
	20	+# are permitted provided that the following conditions are met:
	21	+#
	22	+# * Redistributions of source code must retain the above copyright notice, this
	23	+# list of conditions and the following disclaimer.
	24	+# * Redistributions in binary form must reproduce the above copyright notice,
	25	+# this list of conditions and the following disclaimer in the documentation
	26	+# and/or other materials provided with the distribution.
	27	+#
	28	+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	29	+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	30	+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	31	+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
	32	+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	33	+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
	34	+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	35	+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	36	+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	37	+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	38	+
	39	+
	40	+# -----------------------------------------------------------------------------
	41	+# CHANGELOG:
	42	+# 2018-12-13 v0.54 PL: - first version
	43	+
	44	+__version__ = '0.54dev6'
	45	+
	46	+# -----------------------------------------------------------------------------
	47	+# TODO:
	48	+
	49	+# -----------------------------------------------------------------------------
	50	+# REFERENCES:
	51	+# - https://docs.microsoft.com/en-gb/windows/desktop/Intl/code-page-identifiers
	52	+
	53	+
	54	+# --- IMPORTS -----------------------------------------------------------------
	55	+
	56	+import codecs
	57	+
	58	+# === CONSTANTS ===============================================================
	59	+
	60	+# Code page names from https://docs.microsoft.com/en-gb/windows/desktop/Intl/code-page-identifiers
	61	+# Retrieved on 2018-12-13
	62	+# How it was converted to Python:
	63	+# 1) copy the table data (3 columns) from browser into Excel
	64	+# 2) use the following formula to concatenate 1st and 3rd columns: =A1 & ": " & "'" & C1 & "',"
	65	+# 3) copy from Excel into Python
	66	+
	67	+CODEPAGE_NAME = {
	68	+ 37: 'IBM EBCDIC US-Canada',
	69	+ 437: 'OEM United States',
	70	+ 500: 'IBM EBCDIC International',
	71	+ 708: 'Arabic (ASMO 708)',
	72	+ 709: 'Arabic (ASMO-449+, BCON V4)',
	73	+ 710: 'Arabic - Transparent Arabic',
	74	+ 720: 'Arabic (Transparent ASMO); Arabic (DOS)',
	75	+ 737: 'OEM Greek (formerly 437G); Greek (DOS)',
	76	+ 775: 'OEM Baltic; Baltic (DOS)',
	77	+ 850: 'OEM Multilingual Latin 1; Western European (DOS)',
	78	+ 852: 'OEM Latin 2; Central European (DOS)',
	79	+ 855: 'OEM Cyrillic (primarily Russian)',
	80	+ 857: 'OEM Turkish; Turkish (DOS)',
	81	+ 858: 'OEM Multilingual Latin 1 + Euro symbol',
	82	+ 860: 'OEM Portuguese; Portuguese (DOS)',
	83	+ 861: 'OEM Icelandic; Icelandic (DOS)',
	84	+ 862: 'OEM Hebrew; Hebrew (DOS)',
	85	+ 863: 'OEM French Canadian; French Canadian (DOS)',
	86	+ 864: 'OEM Arabic; Arabic (864)',
	87	+ 865: 'OEM Nordic; Nordic (DOS)',
	88	+ 866: 'OEM Russian; Cyrillic (DOS)',
	89	+ 869: 'OEM Modern Greek; Greek, Modern (DOS)',
	90	+ 870: 'IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2',
	91	+ 874: 'ANSI/OEM Thai (ISO 8859-11); Thai (Windows)',
	92	+ 875: 'IBM EBCDIC Greek Modern',
	93	+ 932: 'ANSI/OEM Japanese; Japanese (Shift-JIS)',
	94	+ 936: 'ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)',
	95	+ 949: 'ANSI/OEM Korean (Unified Hangul Code)',
	96	+ 950: 'ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)',
	97	+ 1026: 'IBM EBCDIC Turkish (Latin 5)',
	98	+ 1047: 'IBM EBCDIC Latin 1/Open System',
	99	+ 1140: 'IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)',
	100	+ 1141: 'IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)',
	101	+ 1142: 'IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)',
	102	+ 1143: 'IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)',
	103	+ 1144: 'IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)',
	104	+ 1145: 'IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)',
	105	+ 1146: 'IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)',
	106	+ 1147: 'IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)',
	107	+ 1148: 'IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)',
	108	+ 1149: 'IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)',
	109	+ 1200: 'Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications',
	110	+ 1201: 'Unicode UTF-16, big endian byte order; available only to managed applications',
	111	+ 1250: 'ANSI Central European; Central European (Windows)',
	112	+ 1251: 'ANSI Cyrillic; Cyrillic (Windows)',
	113	+ 1252: 'ANSI Latin 1; Western European (Windows)',
	114	+ 1253: 'ANSI Greek; Greek (Windows)',
	115	+ 1254: 'ANSI Turkish; Turkish (Windows)',
	116	+ 1255: 'ANSI Hebrew; Hebrew (Windows)',
	117	+ 1256: 'ANSI Arabic; Arabic (Windows)',
	118	+ 1257: 'ANSI Baltic; Baltic (Windows)',
	119	+ 1258: 'ANSI/OEM Vietnamese; Vietnamese (Windows)',
	120	+ 1361: 'Korean (Johab)',
	121	+ 10000: 'MAC Roman; Western European (Mac)',
	122	+ 10001: 'Japanese (Mac)',
	123	+ 10002: 'MAC Traditional Chinese (Big5); Chinese Traditional (Mac)',
	124	+ 10003: 'Korean (Mac)',
	125	+ 10004: 'Arabic (Mac)',
	126	+ 10005: 'Hebrew (Mac)',
	127	+ 10006: 'Greek (Mac)',
	128	+ 10007: 'Cyrillic (Mac)',
	129	+ 10008: 'MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)',
	130	+ 10010: 'Romanian (Mac)',
	131	+ 10017: 'Ukrainian (Mac)',
	132	+ 10021: 'Thai (Mac)',
	133	+ 10029: 'MAC Latin 2; Central European (Mac)',
	134	+ 10079: 'Icelandic (Mac)',
	135	+ 10081: 'Turkish (Mac)',
	136	+ 10082: 'Croatian (Mac)',
	137	+ 12000: 'Unicode UTF-32, little endian byte order; available only to managed applications',
	138	+ 12001: 'Unicode UTF-32, big endian byte order; available only to managed applications',
	139	+ 20000: 'CNS Taiwan; Chinese Traditional (CNS)',
	140	+ 20001: 'TCA Taiwan',
	141	+ 20002: 'Eten Taiwan; Chinese Traditional (Eten)',
	142	+ 20003: 'IBM5550 Taiwan',
	143	+ 20004: 'TeleText Taiwan',
	144	+ 20005: 'Wang Taiwan',
	145	+ 20105: 'IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)',
	146	+ 20106: 'IA5 German (7-bit)',
	147	+ 20107: 'IA5 Swedish (7-bit)',
	148	+ 20108: 'IA5 Norwegian (7-bit)',
	149	+ 20127: 'US-ASCII (7-bit)',
	150	+ 20261: 'T.61',
	151	+ 20269: 'ISO 6937 Non-Spacing Accent',
	152	+ 20273: 'IBM EBCDIC Germany',
	153	+ 20277: 'IBM EBCDIC Denmark-Norway',
	154	+ 20278: 'IBM EBCDIC Finland-Sweden',
	155	+ 20280: 'IBM EBCDIC Italy',
	156	+ 20284: 'IBM EBCDIC Latin America-Spain',
	157	+ 20285: 'IBM EBCDIC United Kingdom',
	158	+ 20290: 'IBM EBCDIC Japanese Katakana Extended',
	159	+ 20297: 'IBM EBCDIC France',
	160	+ 20420: 'IBM EBCDIC Arabic',
	161	+ 20423: 'IBM EBCDIC Greek',
	162	+ 20424: 'IBM EBCDIC Hebrew',
	163	+ 20833: 'IBM EBCDIC Korean Extended',
	164	+ 20838: 'IBM EBCDIC Thai',
	165	+ 20866: 'Russian (KOI8-R); Cyrillic (KOI8-R)',
	166	+ 20871: 'IBM EBCDIC Icelandic',
	167	+ 20880: 'IBM EBCDIC Cyrillic Russian',
	168	+ 20905: 'IBM EBCDIC Turkish',
	169	+ 20924: 'IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)',
	170	+ 20932: 'Japanese (JIS 0208-1990 and 0212-1990)',
	171	+ 20936: 'Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)',
	172	+ 20949: 'Korean Wansung',
	173	+ 21025: 'IBM EBCDIC Cyrillic Serbian-Bulgarian',
	174	+ 21027: '(deprecated)',
	175	+ 21866: 'Ukrainian (KOI8-U); Cyrillic (KOI8-U)',
	176	+ 28591: 'ISO 8859-1 Latin 1; Western European (ISO)',
	177	+ 28592: 'ISO 8859-2 Central European; Central European (ISO)',
	178	+ 28593: 'ISO 8859-3 Latin 3',
	179	+ 28594: 'ISO 8859-4 Baltic',
	180	+ 28595: 'ISO 8859-5 Cyrillic',
	181	+ 28596: 'ISO 8859-6 Arabic',
	182	+ 28597: 'ISO 8859-7 Greek',
	183	+ 28598: 'ISO 8859-8 Hebrew; Hebrew (ISO-Visual)',
	184	+ 28599: 'ISO 8859-9 Turkish',
	185	+ 28603: 'ISO 8859-13 Estonian',
	186	+ 28605: 'ISO 8859-15 Latin 9',
	187	+ 29001: 'Europa 3',
	188	+ 38598: 'ISO 8859-8 Hebrew; Hebrew (ISO-Logical)',
	189	+ 50220: 'ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)',
	190	+ 50221: 'ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)',
	191	+ 50222: 'ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)',
	192	+ 50225: 'ISO 2022 Korean',
	193	+ 50227: 'ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)',
	194	+ 50229: 'ISO 2022 Traditional Chinese',
	195	+ 50930: 'EBCDIC Japanese (Katakana) Extended',
	196	+ 50931: 'EBCDIC US-Canada and Japanese',
	197	+ 50933: 'EBCDIC Korean Extended and Korean',
	198	+ 50935: 'EBCDIC Simplified Chinese Extended and Simplified Chinese',
	199	+ 50936: 'EBCDIC Simplified Chinese',
	200	+ 50937: 'EBCDIC US-Canada and Traditional Chinese',
	201	+ 50939: 'EBCDIC Japanese (Latin) Extended and Japanese',
	202	+ 51932: 'EUC Japanese',
	203	+ 51936: 'EUC Simplified Chinese; Chinese Simplified (EUC)',
	204	+ 51949: 'EUC Korean',
	205	+ 51950: 'EUC Traditional Chinese',
	206	+ 52936: 'HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)',
	207	+ 54936: 'Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)',
	208	+ 57002: 'ISCII Devanagari',
	209	+ 57003: 'ISCII Bangla',
	210	+ 57004: 'ISCII Tamil',
	211	+ 57005: 'ISCII Telugu',
	212	+ 57006: 'ISCII Assamese',
	213	+ 57007: 'ISCII Odia',
	214	+ 57008: 'ISCII Kannada',
	215	+ 57009: 'ISCII Malayalam',
	216	+ 57010: 'ISCII Gujarati',
	217	+ 57011: 'ISCII Punjabi',
	218	+ 65000: 'Unicode (UTF-7)',
	219	+ 65001: 'Unicode (UTF-8)',
	220	+}
	221	+
	222	+
	223	+# Mapping from code pages to Python codecs, for when 'cpXXX' does not work
	224	+# (inspired by http://stackoverflow.com/questions/1592925/decoding-mac-os-text-in-python)
	225	+CODEPAGE_TO_CODEC = {
	226	+ 37: 'cp037',
	227	+ 708: 'arabic', # not found: Arabic (ASMO 708) => arabic = iso-8859-6
	228	+ 709: 'arabic', # not found: Arabic (ASMO-449+, BCON V4) => arabic = iso-8859-6
	229	+ 710: 'arabic', # not found: Arabic - Transparent Arabic => arabic = iso-8859-6
	230	+ 870: 'latin2', # IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
	231	+ 1047: 'latin1', # IBM EBCDIC Latin 1/Open System
	232	+ 1141: 'cp273', # IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
	233	+ 1200: 'utf_16_le', # Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications
	234	+ 1201: 'utf_16_be', # Unicode UTF-16, big endian byte order; available only to managed applications
	235	+
	236	+ 10000: 'mac-roman',
	237	+ 10001: 'shiftjis', # not found: 'mac-shift-jis',
	238	+ 10002: 'big5', # not found: 'mac-big5',
	239	+ 10003: 'ascii', # nothing appropriate found: 'mac-hangul',
	240	+ 10004: 'mac-arabic',
	241	+ 10005: 'hebrew', # not found: 'mac-hebrew',
	242	+ 10006: 'mac-greek',
	243	+ 10007: 'ascii', # nothing appropriate found: 'mac-russian',
	244	+ 10008: 'gb2312', # not found: 'mac-gb2312',
	245	+ 10021: 'thai', # not found: mac-thai',
	246	+ 10029: 'maccentraleurope', # not found: 'mac-east europe',
	247	+ 10081: 'mac-turkish',
	248	+
	249	+ 12000: 'utf_32_le', # Unicode UTF-32, little endian byte order
	250	+ 12001: 'utf_32_be', # Unicode UTF-32, big endian byte order
	251	+
	252	+ 20127: 'ascii',
	253	+
	254	+ 28591: 'latin1',
	255	+ 28592: 'iso8859_2',
	256	+ 28593: 'iso8859_3',
	257	+ 28594: 'iso8859_4',
	258	+ 28595: 'iso8859_5',
	259	+ 28596: 'iso8859_6',
	260	+ 28597: 'iso8859_7',
	261	+ 28598: 'iso8859_8',
	262	+ 28599: 'iso8859_9',
	263	+ 28603: 'iso8859_13',
	264	+ 28605: 'iso8859_15',
	265	+ 38598: 'iso8859_8',
	266	+
	267	+ 65000: 'utf7',
	268	+ 65001: 'utf8',
	269	+}
	270	+
	271	+
	272	+# === FUNCTIONS ==============================================================
	273	+
	274	+def codepage2codec(codepage):
	275	+ """
	276	+ convert a codepage number to a Python codec.
	277	+ If the corresponding codec cannot be found, returns "utf8" by default.
	278	+
	279	+ :param codepage: int, code page number
	280	+ :return: str, Python codec name
	281	+ """
	282	+ if codepage in CODEPAGE_TO_CODEC:
	283	+ codec = CODEPAGE_TO_CODEC[codepage]
	284	+ else:
	285	+ codec = 'cp%d' % codepage
	286	+ try:
	287	+ codecs.lookup(codec)
	288	+ except LookupError:
	289	+ #log.error('Codec not found for code page %d, using UTF-8 as fallback.' % codepage)
	290	+ codec = 'utf8'
	291	+ return codec
	292	+
	293	+# === MAIN: TESTS ============================================================
	294	+
	295	+if __name__ == '__main__':
	296	+ for cp in sorted(CODEPAGE_NAME.keys()):
	297	+ print('Code Page: %d => codec: %s - %s' % (cp, codepage2codec(cp), CODEPAGE_NAME[cp]))
0	298	\ No newline at end of file
...	...