Commit b341d742dbb69ef3cc827f06e4aaddfe67b18d0f
1 parent
3ef1b773
Add WinAnsi and MacRoman encoding
Showing
5 changed files
with
500 additions
and
4 deletions
ChangeLog
include/qpdf/QUtil.hh
| ... | ... | @@ -163,6 +163,12 @@ namespace QUtil |
| 163 | 163 | QPDF_DLL |
| 164 | 164 | std::string utf8_to_ascii( |
| 165 | 165 | std::string const& utf8, char unknown_char = '?'); |
| 166 | + QPDF_DLL | |
| 167 | + std::string utf8_to_win_ansi( | |
| 168 | + std::string const& utf8, char unknown_char = '?'); | |
| 169 | + QPDF_DLL | |
| 170 | + std::string utf8_to_mac_roman( | |
| 171 | + std::string const& utf8, char unknown_char = '?'); | |
| 166 | 172 | |
| 167 | 173 | // If secure random number generation is supported on your |
| 168 | 174 | // platform and qpdf was not compiled with insecure random number | ... | ... |
libqpdf/QUtil.cc
| ... | ... | @@ -893,10 +893,454 @@ QUtil::parse_numrange(char const* range, int max) |
| 893 | 893 | return result; |
| 894 | 894 | } |
| 895 | 895 | |
| 896 | -enum encoding_e { e_utf16, e_ascii }; | |
// Target encodings understood by transcode_utf8.
enum encoding_e
{
    e_utf16,    // UTF-16 output
    e_ascii,    // 7-bit ASCII output
    e_winansi,  // WinAnsi single-byte output
    e_macroman  // MacRoman single-byte output
};
| 897 | 897 | |
| 898 | -static | |
| 899 | -std::string | |
// Map a Unicode code point to its WinAnsi byte for the characters
// that live in the 0x80-0x9f range of that encoding.
//
// Returns the encoded byte, or '\0' when the code point has no entry
// here; callers treat '\0' as "not representable". Code points
// 160-255 are not listed because the caller copies them through
// unchanged before consulting this function.
//
// A switch statement is used rather than a function-local static
// lookup table, following the original author's note that such a
// static is not thread-safe.
static unsigned char
encode_winansi(unsigned long codepoint)
{
    switch (codepoint)
    {
      case 0x20ac: return 0x80; // Euro sign
      case 0x201a: return 0x82; // single low-9 quotation mark
      case 0x192:  return 0x83; // f with hook (florin)
      case 0x201e: return 0x84; // double low-9 quotation mark
      case 0x2026: return 0x85; // horizontal ellipsis
      case 0x2020: return 0x86; // dagger
      case 0x2021: return 0x87; // double dagger
      case 0x2c6:  return 0x88; // modifier circumflex
      case 0x2030: return 0x89; // per mille
      case 0x160:  return 0x8a; // S caron
      case 0x2039: return 0x8b; // single left angle quotation mark
      case 0x152:  return 0x8c; // OE ligature
      case 0x17d:  return 0x8e; // Z caron
      case 0x2018: return 0x91; // left single quotation mark
      case 0x2019: return 0x92; // right single quotation mark
      case 0x201c: return 0x93; // left double quotation mark
      case 0x201d: return 0x94; // right double quotation mark
      case 0x2022: return 0x95; // bullet
      case 0x2013: return 0x96; // en dash
      case 0x2014: return 0x97; // em dash

      // 0x98 is the spacing "tilde" glyph, i.e. U+02DC SMALL TILDE.
      // U+0303 (combining tilde) is kept as an alias so input that
      // was accepted before continues to encode the same way.
      case 0x2dc:
      case 0x303:
        return 0x98;

      case 0x2122: return 0x99; // trade mark sign
      case 0x161:  return 0x9a; // s caron
      case 0x203a: return 0x9b; // single right angle quotation mark
      case 0x153:  return 0x9c; // oe ligature
      case 0x17e:  return 0x9e; // z caron
      case 0x178:  return 0x9f; // Y dieresis

      default:
        break;
    }
    // No WinAnsi mapping in this table.
    return '\0';
}
| 992 | + | |
// Map a Unicode code point to its MacRoman byte for characters in the
// upper half (0x80-0xff) of that encoding.
//
// Returns the encoded byte, or '\0' when the code point has no entry
// here; callers treat '\0' as "not representable".
//
// Entries are listed in order of the resulting byte. Where two code
// points share a byte, the first is the spacing character that the
// encoding actually defines and the second is the combining mark,
// modifier letter, or look-alike that earlier versions of this table
// accepted; both are kept so previously accepted input still encodes
// the same way.
//
// A switch statement is used rather than a function-local static
// lookup table, following the original author's note that such a
// static is not thread-safe.
static unsigned char
encode_macroman(unsigned long codepoint)
{
    switch (codepoint)
    {
      // 0x80-0x9f: accented letters
      case 0xc4: return 0x80; // A dieresis
      case 0xc5: return 0x81; // A ring
      case 0xc7: return 0x82; // C cedilla
      case 0xc9: return 0x83; // E acute
      case 0xd1: return 0x84; // N tilde
      case 0xd6: return 0x85; // O dieresis
      case 0xdc: return 0x86; // U dieresis
      case 0xe1: return 0x87; // a acute
      case 0xe0: return 0x88; // a grave
      case 0xe2: return 0x89; // a circumflex
      case 0xe4: return 0x8a; // a dieresis
      case 0xe3: return 0x8b; // a tilde
      case 0xe5: return 0x8c; // a ring
      case 0xe7: return 0x8d; // c cedilla
      case 0xe9: return 0x8e; // e acute
      case 0xe8: return 0x8f; // e grave
      case 0xea: return 0x90; // e circumflex
      case 0xeb: return 0x91; // e dieresis
      case 0xed: return 0x92; // i acute
      case 0xec: return 0x93; // i grave
      case 0xee: return 0x94; // i circumflex
      case 0xef: return 0x95; // i dieresis
      case 0xf1: return 0x96; // n tilde
      case 0xf3: return 0x97; // o acute
      case 0xf2: return 0x98; // o grave
      case 0xf4: return 0x99; // o circumflex
      case 0xf6: return 0x9a; // o dieresis
      case 0xf5: return 0x9b; // o tilde
      case 0xfa: return 0x9c; // u acute
      case 0xf9: return 0x9d; // u grave
      case 0xfb: return 0x9e; // u circumflex
      case 0xfc: return 0x9f; // u dieresis

      // 0xa0-0xbf: symbols
      case 0x2020: return 0xa0; // dagger
      case 0xb0:   return 0xa1; // degree
      case 0xa2:   return 0xa2; // cent
      case 0xa3:   return 0xa3; // sterling
      case 0xa7:   return 0xa4; // section
      case 0x2022: return 0xa5; // bullet
      case 0xb6:   return 0xa6; // paragraph
      case 0xdf:   return 0xa7; // germandbls
      case 0xae:   return 0xa8; // registered
      case 0xa9:   return 0xa9; // copyright
      case 0x2122: return 0xaa; // trademark
      case 0xb4:                // acute accent
      case 0x301:               // (combining acute)
        return 0xab;
      case 0xa8:                // dieresis
      case 0x308:               // (combining dieresis)
        return 0xac;
      case 0xc6:   return 0xae; // AE
      case 0xd8:   return 0xaf; // O slash
      case 0xb1:   return 0xb1; // plus-minus
      case 0xa5:   return 0xb4; // yen
      case 0xb5:                // micro sign
      case 0x3bc:               // (Greek small mu)
        return 0xb5;
      case 0xaa:                // feminine ordinal
      case 0x1d43:              // (modifier small a)
        return 0xbb;
      case 0xba:                // masculine ordinal
      case 0x1d52:              // (modifier small o)
        return 0xbc;
      case 0xe6:   return 0xbe; // ae
      case 0xf8:   return 0xbf; // o slash

      // 0xc0-0xdf: punctuation and ligatures
      case 0xbf:   return 0xc0; // inverted question mark
      case 0xa1:   return 0xc1; // inverted exclamation mark
      case 0xac:   return 0xc2; // logical not
      case 0x192:  return 0xc4; // florin
      case 0xab:   return 0xc7; // left guillemet
      case 0xbb:   return 0xc8; // right guillemet
      case 0x2026: return 0xc9; // ellipsis
      case 0xc0:   return 0xcb; // A grave
      case 0xc3:   return 0xcc; // A tilde
      case 0xd5:   return 0xcd; // O tilde
      case 0x152:  return 0xce; // OE
      case 0x153:  return 0xcf; // oe
      case 0x2013: return 0xd0; // en dash
      case 0x2014: return 0xd1; // em dash
      case 0x201c: return 0xd2; // left double quote
      case 0x201d: return 0xd3; // right double quote
      case 0x2018: return 0xd4; // left single quote
      case 0x2019: return 0xd5; // right single quote
      case 0xf7:   return 0xd6; // divide
      case 0xff:   return 0xd8; // y dieresis
      case 0x178:  return 0xd9; // Y dieresis
      case 0x2044: return 0xda; // fraction slash
      case 0xa4:   return 0xdb; // currency
      case 0x2039: return 0xdc; // single left angle quote
      case 0x203a: return 0xdd; // single right angle quote
      case 0xfb01: return 0xde; // fi ligature
      case 0xfb02: return 0xdf; // fl ligature

      // 0xe0-0xff: more punctuation, capitals, and spacing accents
      case 0x2021: return 0xe0; // double dagger
      case 0xb7:   return 0xe1; // middle dot
      case 0x201a: return 0xe2; // single low-9 quote
      case 0x201e: return 0xe3; // double low-9 quote
      case 0x2030: return 0xe4; // per mille
      case 0xc2:   return 0xe5; // A circumflex
      case 0xca:   return 0xe6; // E circumflex
      case 0xc1:   return 0xe7; // A acute
      case 0xcb:   return 0xe8; // E dieresis
      case 0xc8:   return 0xe9; // E grave
      case 0xcd:   return 0xea; // I acute
      case 0xce:   return 0xeb; // I circumflex
      case 0xcf:   return 0xec; // I dieresis
      case 0xcc:   return 0xed; // I grave
      case 0xd3:   return 0xee; // O acute
      case 0xd4:   return 0xef; // O circumflex
      case 0xd2:   return 0xf1; // O grave
      case 0xda:   return 0xf2; // U acute
      case 0xdb:   return 0xf3; // U circumflex
      case 0xd9:   return 0xf4; // U grave
      case 0x131:  return 0xf5; // dotless i
      case 0x2c6:  return 0xf6; // modifier circumflex
      case 0x2dc:               // small tilde
      case 0x303:               // (combining tilde)
        return 0xf7;
      case 0xaf:                // macron
      case 0x304:               // (combining macron)
        return 0xf8;
      case 0x2d8:               // breve
      case 0x306:               // (combining breve)
        return 0xf9;
      case 0x2d9:               // dot above
      case 0x307:               // (combining dot above)
        return 0xfa;
      case 0x2da:               // ring above
      case 0x30a:               // (combining ring above)
        return 0xfb;
      case 0xb8:                // cedilla
      case 0x327:               // (combining cedilla)
        return 0xfc;
      case 0x2dd:               // double acute
      case 0x30b:               // (combining double acute)
        return 0xfd;
      case 0x2db:               // ogonek
      case 0x328:               // (combining ogonek)
        return 0xfe;
      case 0x2c7:  return 0xff; // caron

      default:
        break;
    }
    // No MacRoman mapping in this table.
    return '\0';
}
| 1342 | + | |
| 1343 | +static std::string | |
| 900 | 1344 | transcode_utf8(std::string const& utf8_val, encoding_e encoding, |
| 901 | 1345 | char unknown) |
| 902 | 1346 | { |
| ... | ... | @@ -966,7 +1410,27 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding, |
| 966 | 1410 | } |
| 967 | 1411 | else |
| 968 | 1412 | { |
| 969 | - result.append(1, unknown); | |
| 1413 | + ch = '\0'; | |
| 1414 | + if (encoding == e_winansi) | |
| 1415 | + { | |
| 1416 | + if ((codepoint >= 160) && (codepoint < 256)) | |
| 1417 | + { | |
| 1418 | + ch = static_cast<unsigned char>(codepoint & 0xff); | |
| 1419 | + } | |
| 1420 | + else | |
| 1421 | + { | |
| 1422 | + ch = encode_winansi(codepoint); | |
| 1423 | + } | |
| 1424 | + } | |
| 1425 | + else if (encoding == e_macroman) | |
| 1426 | + { | |
| 1427 | + ch = encode_macroman(codepoint); | |
| 1428 | + } | |
| 1429 | + if (ch == '\0') | |
| 1430 | + { | |
| 1431 | + ch = static_cast<unsigned char>(unknown); | |
| 1432 | + } | |
| 1433 | + result.append(1, ch); | |
| 970 | 1434 | } |
| 971 | 1435 | } |
| 972 | 1436 | } |
| ... | ... | @@ -985,3 +1449,15 @@ QUtil::utf8_to_ascii(std::string const& utf8, char unknown_char) |
| 985 | 1449 | { |
| 986 | 1450 | return transcode_utf8(utf8, e_ascii, unknown_char); |
| 987 | 1451 | } |
| 1452 | + | |
| 1453 | +std::string | |
| 1454 | +QUtil::utf8_to_win_ansi(std::string const& utf8, char unknown_char) | |
| 1455 | +{ | |
| 1456 | + return transcode_utf8(utf8, e_winansi, unknown_char); | |
| 1457 | +} | |
| 1458 | + | |
| 1459 | +std::string | |
| 1460 | +QUtil::utf8_to_mac_roman(std::string const& utf8, char unknown_char) | |
| 1461 | +{ | |
| 1462 | + return transcode_utf8(utf8, e_macroman, unknown_char); | |
| 1463 | +} | ... | ... |
libtests/qtest/qutil/qutil.out
libtests/qutil.cc
| ... | ... | @@ -229,6 +229,15 @@ void utf8_to_ascii_test() |
| 229 | 229 | << std::endl |
| 230 | 230 | << QUtil::utf8_to_ascii(input, '*') |
| 231 | 231 | << std::endl; |
| 232 | + std::string a = QUtil::utf8_to_win_ansi(input, '*'); | |
| 233 | + std::string b = QUtil::utf8_to_mac_roman(input, '*'); | |
| 234 | + std::cout | |
| 235 | + << "<" << QUtil::int_to_string_base( | |
| 236 | + static_cast<unsigned char>(a.at(0)), 16, 2) | |
| 237 | + << ">" << a.substr(1) << std::endl | |
| 238 | + << "<" << QUtil::int_to_string_base( | |
| 239 | + static_cast<unsigned char>(b.at(0)), 16, 2) | |
| 240 | + << ">" << b.substr(1) << std::endl; | |
| 232 | 241 | } |
| 233 | 242 | |
| 234 | 243 | void print_whoami(char const* str) | ... | ... |