From fdc77bfa57cd0105ba5f1952a31cd25fa73511fe Mon Sep 17 00:00:00 2001
From: decalage2 <decalage@laposte.net>
Date: Fri, 14 Dec 2018 13:59:06 +0100
Subject: [PATCH] common: added new module codepages

---
 oletools/common/codepages.py | 297 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 297 insertions(+), 0 deletions(-)
 create mode 100644 oletools/common/codepages.py

diff --git a/oletools/common/codepages.py b/oletools/common/codepages.py
new file mode 100644
index 0000000..8c1e100
--- /dev/null
+++ b/oletools/common/codepages.py
@@ -0,0 +1,297 @@
+"""
+codepages.py
+
+codepages is a python module to map code pages (numbers) to Python codecs,
+in order to decode bytes to unicode.
+
+Author: Philippe Lagadec - http://www.decalage.info
+License: BSD, see source code or documentation
+
+codepages is part of the python-oletools package:
+http://www.decalage.info/python/oletools
+"""
+
+# === LICENSE ==================================================================
+
+# codepages is copyright (c) 2018 Philippe Lagadec (http://www.decalage.info)
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+# -----------------------------------------------------------------------------
+# CHANGELOG:
+# 2018-12-13 v0.54 PL: - first version
+
+__version__ = '0.54dev6'
+
+# -----------------------------------------------------------------------------
+# TODO:
+
+# -----------------------------------------------------------------------------
+# REFERENCES:
+# - https://docs.microsoft.com/en-gb/windows/desktop/Intl/code-page-identifiers
+
+
+# --- IMPORTS -----------------------------------------------------------------
+
+import codecs
+
+# === CONSTANTS ===============================================================
+
+# Code page names from https://docs.microsoft.com/en-gb/windows/desktop/Intl/code-page-identifiers
+# Retrieved on 2018-12-13
+# How it was converted to Python:
+# 1) copy the table data (3 columns) from browser into Excel
+# 2) use the following formula to concatenate 1st and 3rd columns: =A1 & ": " & "'" & C1 & "',"
+# 3) copy from Excel into Python
+
+CODEPAGE_NAME = {  # code page number (int) -> description text (str) from the Microsoft table
+    37: 'IBM EBCDIC US-Canada',
+    437: 'OEM United States',
+    500: 'IBM EBCDIC International',
+    708: 'Arabic (ASMO 708)',
+    709: 'Arabic (ASMO-449+, BCON V4)',
+    710: 'Arabic - Transparent Arabic',
+    720: 'Arabic (Transparent ASMO); Arabic (DOS)',
+    737: 'OEM Greek (formerly 437G); Greek (DOS)',
+    775: 'OEM Baltic; Baltic (DOS)',
+    850: 'OEM Multilingual Latin 1; Western European (DOS)',
+    852: 'OEM Latin 2; Central European (DOS)',
+    855: 'OEM Cyrillic (primarily Russian)',
+    857: 'OEM Turkish; Turkish (DOS)',
+    858: 'OEM Multilingual Latin 1 + Euro symbol',
+    860: 'OEM Portuguese; Portuguese (DOS)',
+    861: 'OEM Icelandic; Icelandic (DOS)',
+    862: 'OEM Hebrew; Hebrew (DOS)',
+    863: 'OEM French Canadian; French Canadian (DOS)',
+    864: 'OEM Arabic; Arabic (864)',
+    865: 'OEM Nordic; Nordic (DOS)',
+    866: 'OEM Russian; Cyrillic (DOS)',
+    869: 'OEM Modern Greek; Greek, Modern (DOS)',
+    870: 'IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2',
+    874: 'ANSI/OEM Thai (ISO 8859-11); Thai (Windows)',
+    875: 'IBM EBCDIC Greek Modern',
+    932: 'ANSI/OEM Japanese; Japanese (Shift-JIS)',
+    936: 'ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)',
+    949: 'ANSI/OEM Korean (Unified Hangul Code)',
+    950: 'ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)',
+    1026: 'IBM EBCDIC Turkish (Latin 5)',
+    1047: 'IBM EBCDIC Latin 1/Open System',
+    1140: 'IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)',
+    1141: 'IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)',
+    1142: 'IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)',
+    1143: 'IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)',
+    1144: 'IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)',
+    1145: 'IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)',
+    1146: 'IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)',
+    1147: 'IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)',
+    1148: 'IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)',
+    1149: 'IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)',
+    1200: 'Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications',
+    1201: 'Unicode UTF-16, big endian byte order; available only to managed applications',
+    1250: 'ANSI Central European; Central European (Windows)',
+    1251: 'ANSI Cyrillic; Cyrillic (Windows)',
+    1252: 'ANSI Latin 1; Western European (Windows)',
+    1253: 'ANSI Greek; Greek (Windows)',
+    1254: 'ANSI Turkish; Turkish (Windows)',
+    1255: 'ANSI Hebrew; Hebrew (Windows)',
+    1256: 'ANSI Arabic; Arabic (Windows)',
+    1257: 'ANSI Baltic; Baltic (Windows)',
+    1258: 'ANSI/OEM Vietnamese; Vietnamese (Windows)',
+    1361: 'Korean (Johab)',
+    10000: 'MAC Roman; Western European (Mac)',
+    10001: 'Japanese (Mac)',
+    10002: 'MAC Traditional Chinese (Big5); Chinese Traditional (Mac)',
+    10003: 'Korean (Mac)',
+    10004: 'Arabic (Mac)',
+    10005: 'Hebrew (Mac)',
+    10006: 'Greek (Mac)',
+    10007: 'Cyrillic (Mac)',
+    10008: 'MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)',
+    10010: 'Romanian (Mac)',
+    10017: 'Ukrainian (Mac)',
+    10021: 'Thai (Mac)',
+    10029: 'MAC Latin 2; Central European (Mac)',
+    10079: 'Icelandic (Mac)',
+    10081: 'Turkish (Mac)',
+    10082: 'Croatian (Mac)',
+    12000: 'Unicode UTF-32, little endian byte order; available only to managed applications',
+    12001: 'Unicode UTF-32, big endian byte order; available only to managed applications',
+    20000: 'CNS Taiwan; Chinese Traditional (CNS)',
+    20001: 'TCA Taiwan',
+    20002: 'Eten Taiwan; Chinese Traditional (Eten)',
+    20003: 'IBM5550 Taiwan',
+    20004: 'TeleText Taiwan',
+    20005: 'Wang Taiwan',
+    20105: 'IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)',
+    20106: 'IA5 German (7-bit)',
+    20107: 'IA5 Swedish (7-bit)',
+    20108: 'IA5 Norwegian (7-bit)',
+    20127: 'US-ASCII (7-bit)',
+    20261: 'T.61',
+    20269: 'ISO 6937 Non-Spacing Accent',
+    20273: 'IBM EBCDIC Germany',
+    20277: 'IBM EBCDIC Denmark-Norway',
+    20278: 'IBM EBCDIC Finland-Sweden',
+    20280: 'IBM EBCDIC Italy',
+    20284: 'IBM EBCDIC Latin America-Spain',
+    20285: 'IBM EBCDIC United Kingdom',
+    20290: 'IBM EBCDIC Japanese Katakana Extended',
+    20297: 'IBM EBCDIC France',
+    20420: 'IBM EBCDIC Arabic',
+    20423: 'IBM EBCDIC Greek',
+    20424: 'IBM EBCDIC Hebrew',
+    20833: 'IBM EBCDIC Korean Extended',
+    20838: 'IBM EBCDIC Thai',
+    20866: 'Russian (KOI8-R); Cyrillic (KOI8-R)',
+    20871: 'IBM EBCDIC Icelandic',
+    20880: 'IBM EBCDIC Cyrillic Russian',
+    20905: 'IBM EBCDIC Turkish',
+    20924: 'IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)',
+    20932: 'Japanese (JIS 0208-1990 and 0212-1990)',
+    20936: 'Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)',
+    20949: 'Korean Wansung',
+    21025: 'IBM EBCDIC Cyrillic Serbian-Bulgarian',
+    21027: '(deprecated)',
+    21866: 'Ukrainian (KOI8-U); Cyrillic (KOI8-U)',
+    28591: 'ISO 8859-1 Latin 1; Western European (ISO)',
+    28592: 'ISO 8859-2 Central European; Central European (ISO)',
+    28593: 'ISO 8859-3 Latin 3',
+    28594: 'ISO 8859-4 Baltic',
+    28595: 'ISO 8859-5 Cyrillic',
+    28596: 'ISO 8859-6 Arabic',
+    28597: 'ISO 8859-7 Greek',
+    28598: 'ISO 8859-8 Hebrew; Hebrew (ISO-Visual)',
+    28599: 'ISO 8859-9 Turkish',
+    28603: 'ISO 8859-13 Estonian',
+    28605: 'ISO 8859-15 Latin 9',
+    29001: 'Europa 3',
+    38598: 'ISO 8859-8 Hebrew; Hebrew (ISO-Logical)',
+    50220: 'ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)',
+    50221: 'ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)',
+    50222: 'ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)',
+    50225: 'ISO 2022 Korean',
+    50227: 'ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)',
+    50229: 'ISO 2022 Traditional Chinese',
+    50930: 'EBCDIC Japanese (Katakana) Extended',
+    50931: 'EBCDIC US-Canada and Japanese',
+    50933: 'EBCDIC Korean Extended and Korean',
+    50935: 'EBCDIC Simplified Chinese Extended and Simplified Chinese',
+    50936: 'EBCDIC Simplified Chinese',
+    50937: 'EBCDIC US-Canada and Traditional Chinese',
+    50939: 'EBCDIC Japanese (Latin) Extended and Japanese',
+    51932: 'EUC Japanese',
+    51936: 'EUC Simplified Chinese; Chinese Simplified (EUC)',
+    51949: 'EUC Korean',
+    51950: 'EUC Traditional Chinese',
+    52936: 'HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)',
+    54936: 'Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)',
+    57002: 'ISCII Devanagari',
+    57003: 'ISCII Bangla',
+    57004: 'ISCII Tamil',
+    57005: 'ISCII Telugu',
+    57006: 'ISCII Assamese',
+    57007: 'ISCII Odia',
+    57008: 'ISCII Kannada',
+    57009: 'ISCII Malayalam',
+    57010: 'ISCII Gujarati',
+    57011: 'ISCII Punjabi',
+    65000: 'Unicode (UTF-7)',
+    65001: 'Unicode (UTF-8)',
+}
+
+
+# Mapping from codepages to Python codecs, when 'cpXXX' does not work
+# (inspired by http://stackoverflow.com/questions/1592925/decoding-mac-os-text-in-python)
+CODEPAGE_TO_CODEC = {  # code page number (int) -> Python codec name (str)
+    37: 'cp037',
+    708: 'arabic', # not found: Arabic (ASMO 708) => arabic = iso-8859-6
+    709: 'arabic', # not found: Arabic (ASMO-449+, BCON V4) => arabic = iso-8859-6
+    710: 'arabic', # not found: Arabic - Transparent Arabic => arabic = iso-8859-6
+    870: 'cp500', # not found: IBM EBCDIC Multilingual Latin 2 => approximated with an EBCDIC codec (an ISO latin codec garbles all letters)
+    1047: 'cp037', # not found: IBM EBCDIC Latin 1/Open System => approximated with an EBCDIC codec (an ISO latin codec garbles all letters)
+    1141: 'cp273', # IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
+    1200: 'utf_16_le', # Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications
+    1201: 'utf_16_be', # Unicode UTF-16, big endian byte order; available only to managed applications
+
+    10000: 'mac-roman',
+    10001: 'shiftjis',  # not found: 'mac-shift-jis',
+    10002: 'big5',      # not found: 'mac-big5',
+    10003: 'euc_kr',    # not found: 'mac-hangul' => approximated with the base Korean encoding, like 10001/10002/10008
+    10004: 'mac-arabic',
+    10005: 'hebrew',    # not found: 'mac-hebrew',
+    10006: 'mac-greek',
+    10007: 'mac_cyrillic',  # Cyrillic (Mac)
+    10008: 'gb2312',    # not found: 'mac-gb2312',
+    10021: 'thai',      # not found: mac-thai',
+    10029: 'maccentraleurope',  # not found: 'mac-east europe',
+    10081: 'mac-turkish',
+
+    12000: 'utf_32_le', # Unicode UTF-32, little endian byte order
+    12001: 'utf_32_be', # Unicode UTF-32, big endian byte order
+
+    20127: 'ascii',
+
+    28591: 'latin1',
+    28592: 'iso8859_2',
+    28593: 'iso8859_3',
+    28594: 'iso8859_4',
+    28595: 'iso8859_5',
+    28596: 'iso8859_6',
+    28597: 'iso8859_7',
+    28598: 'iso8859_8',
+    28599: 'iso8859_9',
+    28603: 'iso8859_13',
+    28605: 'iso8859_15',
+    38598: 'iso8859_8',
+
+    65000: 'utf7',
+    65001: 'utf8',
+}
+
+
+# === FUNCTIONS ==============================================================
+
+def codepage2codec(codepage):
+    """
+    Return the name of the Python codec matching a code page number.
+    An entry in CODEPAGE_TO_CODEC takes precedence over the generic 'cpXXX'
+    name; "utf8" is returned when Python does not know the resulting codec.
+
+    :param codepage: int, code page number
+    :return: str, Python codec name
+    """
+    # explicit mapping first, then the generic 'cpXXX' naming scheme:
+    name = CODEPAGE_TO_CODEC.get(codepage)
+    if name is None:
+        name = 'cp%d' % codepage
+    try:
+        codecs.lookup(name)
+    except LookupError:
+        return 'utf8'   # codec unknown to this Python: fall back to UTF-8
+    return name
+
+# === MAIN: TESTS ============================================================
+
+if __name__ == '__main__':  # self-test: list every known code page with the codec selected for it
+    for number, description in sorted(CODEPAGE_NAME.items()):
+        print('Code Page: %d => codec: %s - %s' % (number, codepage2codec(number), description))
\ No newline at end of file
--
libgit2 0.21.4