Commit b341d742dbb69ef3cc827f06e4aaddfe67b18d0f

Authored by Jay Berkenbilt
1 parent 3ef1b773

Add WinAnsi and MacRoman encoding

ChangeLog
1 2019-01-05 Jay Berkenbilt <ejb@ql.org> 1 2019-01-05 Jay Berkenbilt <ejb@ql.org>
2 2
  3 + * Add methods QUtil::utf8_to_win_ansi and
  4 + QUtil::utf8_to_mac_roman.
  5 +
3 * Add method QUtil::utf8_to_utf16. 6 * Add method QUtil::utf8_to_utf16.
4 7
5 2019-01-04 Jay Berkenbilt <ejb@ql.org> 8 2019-01-04 Jay Berkenbilt <ejb@ql.org>
include/qpdf/QUtil.hh
@@ -163,6 +163,12 @@ namespace QUtil @@ -163,6 +163,12 @@ namespace QUtil
163 QPDF_DLL 163 QPDF_DLL
164 std::string utf8_to_ascii( 164 std::string utf8_to_ascii(
165 std::string const& utf8, char unknown_char = '?'); 165 std::string const& utf8, char unknown_char = '?');
  166 + QPDF_DLL
  167 + std::string utf8_to_win_ansi(
  168 + std::string const& utf8, char unknown_char = '?');
  169 + QPDF_DLL
  170 + std::string utf8_to_mac_roman(
  171 + std::string const& utf8, char unknown_char = '?');
166 172
167 // If secure random number generation is supported on your 173 // If secure random number generation is supported on your
168 // platform and qpdf was not compiled with insecure random number 174 // platform and qpdf was not compiled with insecure random number
libqpdf/QUtil.cc
@@ -893,10 +893,454 @@ QUtil::parse_numrange(char const* range, int max) @@ -893,10 +893,454 @@ QUtil::parse_numrange(char const* range, int max)
893 return result; 893 return result;
894 } 894 }
895 895
// Target encodings understood by transcode_utf8().
enum encoding_e
{
    e_utf16,    // presumably selected by QUtil::utf8_to_utf16 -- confirm
    e_ascii,    // selected by QUtil::utf8_to_ascii
    e_winansi,  // selected by QUtil::utf8_to_win_ansi
    e_macroman  // selected by QUtil::utf8_to_mac_roman
};
897 897
898 -static  
// Map a Unicode code point to its single-byte WinAnsi value for the
// characters that live in the 0x80-0x9f range of that encoding.
//
// Returns the encoded byte, or '\0' if the code point has no entry
// here. Code points 160-255 are not handled by this function; the
// caller (transcode_utf8) passes those through unchanged before
// falling back to this table.
//
// A switch statement is used instead of a static lookup table so
// that no static, which is not thread-safe, is required.
static unsigned char
encode_winansi(unsigned long codepoint)
{
    switch (codepoint)
    {
      case 0x20ac: return 0x80; // Euro
      case 0x201a: return 0x82; // quotesinglbase
      case 0x192:  return 0x83; // florin
      case 0x201e: return 0x84; // quotedblbase
      case 0x2026: return 0x85; // ellipsis
      case 0x2020: return 0x86; // dagger
      case 0x2021: return 0x87; // daggerdbl
      case 0x2c6:  return 0x88; // circumflex
      case 0x2030: return 0x89; // perthousand
      case 0x160:  return 0x8a; // Scaron
      case 0x2039: return 0x8b; // guilsinglleft
      case 0x152:  return 0x8c; // OE
      case 0x17d:  return 0x8e; // Zcaron
      case 0x2018: return 0x91; // quoteleft
      case 0x2019: return 0x92; // quoteright
      case 0x201c: return 0x93; // quotedblleft
      case 0x201d: return 0x94; // quotedblright
      case 0x2022: return 0x95; // bullet
      case 0x2013: return 0x96; // endash
      case 0x2014: return 0x97; // emdash
      // 0x98 is the spacing tilde, U+02DC. The combining tilde
      // U+0303 was the only value accepted previously and is kept so
      // existing callers see no change.
      case 0x2dc:
      case 0x303:  return 0x98; // tilde
      case 0x2122: return 0x99; // trademark
      case 0x161:  return 0x9a; // scaron
      case 0x203a: return 0x9b; // guilsinglright
      case 0x153:  return 0x9c; // oe
      case 0x17e:  return 0x9e; // zcaron
      case 0x178:  return 0x9f; // Ydieresis
      default:
        break;
    }
    // No WinAnsi representation known for this code point.
    return '\0';
}
  992 +
// Map a Unicode code point to its single-byte MacRoman value.
//
// Returns the encoded byte (always >= 0x80), or '\0' if the code
// point has no entry in this table.
//
// Several accent glyphs were originally keyed only on combining
// marks (U+03xx) and the ordinal indicators on U+1D43/U+1D52. The
// corresponding spacing characters from the Adobe Glyph List
// (U+00B4, U+00A8, U+02DC, ...) are accepted as well; the original
// code points are retained so existing behavior is unchanged.
//
// A switch statement is used instead of a static lookup table so
// that no static, which is not thread-safe, is required.
static unsigned char
encode_macroman(unsigned long codepoint)
{
    switch (codepoint)
    {
      case 0xc4:   return 0x80; // Adieresis
      case 0xc5:   return 0x81; // Aring
      case 0xc7:   return 0x82; // Ccedilla
      case 0xc9:   return 0x83; // Eacute
      case 0xd1:   return 0x84; // Ntilde
      case 0xd6:   return 0x85; // Odieresis
      case 0xdc:   return 0x86; // Udieresis
      case 0xe1:   return 0x87; // aacute
      case 0xe0:   return 0x88; // agrave
      case 0xe2:   return 0x89; // acircumflex
      case 0xe4:   return 0x8a; // adieresis
      case 0xe3:   return 0x8b; // atilde
      case 0xe5:   return 0x8c; // aring
      case 0xe7:   return 0x8d; // ccedilla
      case 0xe9:   return 0x8e; // eacute
      case 0xe8:   return 0x8f; // egrave
      case 0xea:   return 0x90; // ecircumflex
      case 0xeb:   return 0x91; // edieresis
      case 0xed:   return 0x92; // iacute
      case 0xec:   return 0x93; // igrave
      case 0xee:   return 0x94; // icircumflex
      case 0xef:   return 0x95; // idieresis
      case 0xf1:   return 0x96; // ntilde
      case 0xf3:   return 0x97; // oacute
      case 0xf2:   return 0x98; // ograve
      case 0xf4:   return 0x99; // ocircumflex
      case 0xf6:   return 0x9a; // odieresis
      case 0xf5:   return 0x9b; // otilde
      case 0xfa:   return 0x9c; // uacute
      case 0xf9:   return 0x9d; // ugrave
      case 0xfb:   return 0x9e; // ucircumflex
      case 0xfc:   return 0x9f; // udieresis
      case 0x2020: return 0xa0; // dagger
      case 0xb0:   return 0xa1; // degree
      case 0xa2:   return 0xa2; // cent
      case 0xa3:   return 0xa3; // sterling
      case 0xa7:   return 0xa4; // section
      case 0x2022: return 0xa5; // bullet
      case 0xb6:   return 0xa6; // paragraph
      case 0xdf:   return 0xa7; // germandbls
      case 0xae:   return 0xa8; // registered
      case 0xa9:   return 0xa9; // copyright
      case 0x2122: return 0xaa; // trademark
      case 0xb4:
      case 0x301:  return 0xab; // acute
      case 0xa8:
      case 0x308:  return 0xac; // dieresis
      case 0xc6:   return 0xae; // AE
      case 0xd8:   return 0xaf; // Oslash
      case 0xb1:   return 0xb1; // plusminus
      case 0xa5:   return 0xb4; // yen
      case 0xb5:
      case 0x3bc:  return 0xb5; // mu
      case 0xaa:
      case 0x1d43: return 0xbb; // ordfeminine
      case 0xba:
      case 0x1d52: return 0xbc; // ordmasculine
      case 0xe6:   return 0xbe; // ae
      case 0xf8:   return 0xbf; // oslash
      case 0xbf:   return 0xc0; // questiondown
      case 0xa1:   return 0xc1; // exclamdown
      case 0xac:   return 0xc2; // logicalnot
      case 0x192:  return 0xc4; // florin
      case 0xab:   return 0xc7; // guillemotleft
      case 0xbb:   return 0xc8; // guillemotright
      case 0x2026: return 0xc9; // ellipsis
      case 0xc0:   return 0xcb; // Agrave
      case 0xc3:   return 0xcc; // Atilde
      case 0xd5:   return 0xcd; // Otilde
      case 0x152:  return 0xce; // OE
      case 0x153:  return 0xcf; // oe
      case 0x2013: return 0xd0; // endash
      case 0x2014: return 0xd1; // emdash
      case 0x201c: return 0xd2; // quotedblleft
      case 0x201d: return 0xd3; // quotedblright
      case 0x2018: return 0xd4; // quoteleft
      case 0x2019: return 0xd5; // quoteright
      case 0xf7:   return 0xd6; // divide
      case 0xff:   return 0xd8; // ydieresis
      case 0x178:  return 0xd9; // Ydieresis
      case 0x2044: return 0xda; // fraction
      case 0xa4:   return 0xdb; // currency
      case 0x2039: return 0xdc; // guilsinglleft
      case 0x203a: return 0xdd; // guilsinglright
      case 0xfb01: return 0xde; // fi
      case 0xfb02: return 0xdf; // fl
      case 0x2021: return 0xe0; // daggerdbl
      case 0xb7:   return 0xe1; // periodcentered
      case 0x201a: return 0xe2; // quotesinglbase
      case 0x201e: return 0xe3; // quotedblbase
      case 0x2030: return 0xe4; // perthousand
      case 0xc2:   return 0xe5; // Acircumflex
      case 0xca:   return 0xe6; // Ecircumflex
      case 0xc1:   return 0xe7; // Aacute
      case 0xcb:   return 0xe8; // Edieresis
      case 0xc8:   return 0xe9; // Egrave
      case 0xcd:   return 0xea; // Iacute
      case 0xce:   return 0xeb; // Icircumflex
      case 0xcf:   return 0xec; // Idieresis
      case 0xcc:   return 0xed; // Igrave
      case 0xd3:   return 0xee; // Oacute
      case 0xd4:   return 0xef; // Ocircumflex
      case 0xd2:   return 0xf1; // Ograve
      case 0xda:   return 0xf2; // Uacute
      case 0xdb:   return 0xf3; // Ucircumflex
      case 0xd9:   return 0xf4; // Ugrave
      case 0x131:  return 0xf5; // dotlessi
      case 0x2c6:  return 0xf6; // circumflex
      case 0x2dc:
      case 0x303:  return 0xf7; // tilde
      case 0xaf:
      case 0x304:  return 0xf8; // macron
      case 0x2d8:
      case 0x306:  return 0xf9; // breve
      case 0x2d9:
      case 0x307:  return 0xfa; // dotaccent
      case 0x2da:
      case 0x30a:  return 0xfb; // ring
      case 0xb8:
      case 0x327:  return 0xfc; // cedilla
      case 0x2dd:
      case 0x30b:  return 0xfd; // hungarumlaut
      case 0x2db:
      case 0x328:  return 0xfe; // ogonek
      case 0x2c7:  return 0xff; // caron
      default:
        break;
    }
    // No MacRoman representation known for this code point.
    return '\0';
}
  1342 +
  1343 +static std::string
900 transcode_utf8(std::string const& utf8_val, encoding_e encoding, 1344 transcode_utf8(std::string const& utf8_val, encoding_e encoding,
901 char unknown) 1345 char unknown)
902 { 1346 {
@@ -966,7 +1410,27 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding, @@ -966,7 +1410,27 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding,
966 } 1410 }
967 else 1411 else
968 { 1412 {
969 - result.append(1, unknown); 1413 + ch = '\0';
  1414 + if (encoding == e_winansi)
  1415 + {
  1416 + if ((codepoint >= 160) && (codepoint < 256))
  1417 + {
  1418 + ch = static_cast<unsigned char>(codepoint & 0xff);
  1419 + }
  1420 + else
  1421 + {
  1422 + ch = encode_winansi(codepoint);
  1423 + }
  1424 + }
  1425 + else if (encoding == e_macroman)
  1426 + {
  1427 + ch = encode_macroman(codepoint);
  1428 + }
  1429 + if (ch == '\0')
  1430 + {
  1431 + ch = static_cast<unsigned char>(unknown);
  1432 + }
  1433 + result.append(1, ch);
970 } 1434 }
971 } 1435 }
972 } 1436 }
@@ -985,3 +1449,15 @@ QUtil::utf8_to_ascii(std::string const& utf8, char unknown_char) @@ -985,3 +1449,15 @@ QUtil::utf8_to_ascii(std::string const& utf8, char unknown_char)
985 { 1449 {
986 return transcode_utf8(utf8, e_ascii, unknown_char); 1450 return transcode_utf8(utf8, e_ascii, unknown_char);
987 } 1451 }
  1452 +
  1453 +std::string
  1454 +QUtil::utf8_to_win_ansi(std::string const& utf8, char unknown_char)
  1455 +{
  1456 + return transcode_utf8(utf8, e_winansi, unknown_char);
  1457 +}
  1458 +
  1459 +std::string
  1460 +QUtil::utf8_to_mac_roman(std::string const& utf8, char unknown_char)
  1461 +{
  1462 + return transcode_utf8(utf8, e_macroman, unknown_char);
  1463 +}
libtests/qtest/qutil/qutil.out
@@ -51,6 +51,8 @@ HAGOOGAMAGOOGLE: 0 @@ -51,6 +51,8 @@ HAGOOGAMAGOOGLE: 0
51 ¿Does π have fingers? 51 ¿Does π have fingers?
52 ?Does ? have fingers? 52 ?Does ? have fingers?
53 *Does * have fingers? 53 *Does * have fingers?
  54 +<bf>Does * have fingers?
  55 +<c0>Does * have fingers?
54 ---- whoami 56 ---- whoami
55 quack1 57 quack1
56 quack2 58 quack2
libtests/qutil.cc
@@ -229,6 +229,15 @@ void utf8_to_ascii_test() @@ -229,6 +229,15 @@ void utf8_to_ascii_test()
229 << std::endl 229 << std::endl
230 << QUtil::utf8_to_ascii(input, '*') 230 << QUtil::utf8_to_ascii(input, '*')
231 << std::endl; 231 << std::endl;
  232 + std::string a = QUtil::utf8_to_win_ansi(input, '*');
  233 + std::string b = QUtil::utf8_to_mac_roman(input, '*');
  234 + std::cout
  235 + << "<" << QUtil::int_to_string_base(
  236 + static_cast<unsigned char>(a.at(0)), 16, 2)
  237 + << ">" << a.substr(1) << std::endl
  238 + << "<" << QUtil::int_to_string_base(
  239 + static_cast<unsigned char>(b.at(0)), 16, 2)
  240 + << ">" << b.substr(1) << std::endl;
232 } 241 }
233 242
234 void print_whoami(char const* str) 243 void print_whoami(char const* str)