Commit 3ef1b77304ec49ec2527d8cc3e17e1d0dd220720
1 parent
089ce590
Refactor QUtil::utf8_to_ascii
Showing 3 changed files with 38 additions and 29 deletions
libqpdf/QUtil.cc
| @@ -893,20 +893,32 @@ QUtil::parse_numrange(char const* range, int max) | @@ -893,20 +893,32 @@ QUtil::parse_numrange(char const* range, int max) | ||
| 893 | return result; | 893 | return result; |
| 894 | } | 894 | } |
| 895 | 895 | ||
| 896 | -enum encoding_e { e_utf16 }; | 896 | +enum encoding_e { e_utf16, e_ascii }; |
| 897 | 897 | ||
| 898 | static | 898 | static |
| 899 | std::string | 899 | std::string |
| 900 | -transcode_utf8(std::string const& utf8_val, encoding_e encoding) | 900 | +transcode_utf8(std::string const& utf8_val, encoding_e encoding, |
| 901 | + char unknown) | ||
| 901 | { | 902 | { |
| 902 | - std::string result = "\xfe\xff"; | 903 | + std::string result; |
| 904 | + if (encoding == e_utf16) | ||
| 905 | + { | ||
| 906 | + result += "\xfe\xff"; | ||
| 907 | + } | ||
| 903 | size_t len = utf8_val.length(); | 908 | size_t len = utf8_val.length(); |
| 904 | for (size_t i = 0; i < len; ++i) | 909 | for (size_t i = 0; i < len; ++i) |
| 905 | { | 910 | { |
| 906 | unsigned char ch = static_cast<unsigned char>(utf8_val.at(i)); | 911 | unsigned char ch = static_cast<unsigned char>(utf8_val.at(i)); |
| 907 | if (ch < 128) | 912 | if (ch < 128) |
| 908 | { | 913 | { |
| 909 | - result += QUtil::toUTF16(ch); | 914 | + if (encoding == e_utf16) |
| 915 | + { | ||
| 916 | + result += QUtil::toUTF16(ch); | ||
| 917 | + } | ||
| 918 | + else | ||
| 919 | + { | ||
| 920 | + result.append(1, ch); | ||
| 921 | + } | ||
| 910 | } | 922 | } |
| 911 | else | 923 | else |
| 912 | { | 924 | { |
| @@ -923,7 +935,14 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding) | @@ -923,7 +935,14 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding) | ||
| 923 | if (((bytes_needed > 5) || (bytes_needed < 1)) || | 935 | if (((bytes_needed > 5) || (bytes_needed < 1)) || |
| 924 | ((i + bytes_needed) >= len)) | 936 | ((i + bytes_needed) >= len)) |
| 925 | { | 937 | { |
| 926 | - result += "\xff\xfd"; | 938 | + if (encoding == e_utf16) |
| 939 | + { | ||
| 940 | + result += "\xff\xfd"; | ||
| 941 | + } | ||
| 942 | + else | ||
| 943 | + { | ||
| 944 | + result.append(1, unknown); | ||
| 945 | + } | ||
| 927 | } | 946 | } |
| 928 | else | 947 | else |
| 929 | { | 948 | { |
| @@ -941,7 +960,14 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding) | @@ -941,7 +960,14 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding) | ||
| 941 | codepoint <<= 6; | 960 | codepoint <<= 6; |
| 942 | codepoint += (ch & 0x3f); | 961 | codepoint += (ch & 0x3f); |
| 943 | } | 962 | } |
| 944 | - result += QUtil::toUTF16(codepoint); | 963 | + if (encoding == e_utf16) |
| 964 | + { | ||
| 965 | + result += QUtil::toUTF16(codepoint); | ||
| 966 | + } | ||
| 967 | + else | ||
| 968 | + { | ||
| 969 | + result.append(1, unknown); | ||
| 970 | + } | ||
| 945 | } | 971 | } |
| 946 | } | 972 | } |
| 947 | } | 973 | } |
| @@ -951,28 +977,11 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding) | @@ -951,28 +977,11 @@ transcode_utf8(std::string const& utf8_val, encoding_e encoding) | ||
| 951 | std::string | 977 | std::string |
| 952 | QUtil::utf8_to_utf16(std::string const& utf8) | 978 | QUtil::utf8_to_utf16(std::string const& utf8) |
| 953 | { | 979 | { |
| 954 | - return transcode_utf8(utf8, e_utf16); | 980 | + return transcode_utf8(utf8, e_utf16, 0); |
| 955 | } | 981 | } |
| 956 | 982 | ||
| 957 | std::string | 983 | std::string |
| 958 | QUtil::utf8_to_ascii(std::string const& utf8, char unknown_char) | 984 | QUtil::utf8_to_ascii(std::string const& utf8, char unknown_char) |
| 959 | { | 985 | { |
| 960 | - std::string ascii_value; | ||
| 961 | - for (size_t i = 0; i < utf8.length(); ++i) | ||
| 962 | - { | ||
| 963 | - unsigned char ch = static_cast<unsigned char>(utf8.at(i)); | ||
| 964 | - if (ch < 128) | ||
| 965 | - { | ||
| 966 | - ascii_value.append(1, ch); | ||
| 967 | - } | ||
| 968 | - else if ((ch & 0xc0) == 0x80) | ||
| 969 | - { | ||
| 970 | - // Ignore subsequent byte of UTF-8 encoded character | ||
| 971 | - } | ||
| 972 | - else | ||
| 973 | - { | ||
| 974 | - ascii_value.append(1, unknown_char); | ||
| 975 | - } | ||
| 976 | - } | ||
| 977 | - return ascii_value; | 986 | + return transcode_utf8(utf8, e_ascii, unknown_char); |
| 978 | } | 987 | } |
libtests/qtest/qutil/qutil.out
| @@ -48,9 +48,9 @@ HAGOOGAMAGOOGLE: 0 | @@ -48,9 +48,9 @@ HAGOOGAMAGOOGLE: 0 | ||
| 48 | 0x7fffffff -> ff fd | 48 | 0x7fffffff -> ff fd |
| 49 | 0x80000000 -> ff fd | 49 | 0x80000000 -> ff fd |
| 50 | ---- utf8_to_ascii | 50 | ---- utf8_to_ascii |
| 51 | -Does π have fingers? | ||
| 52 | -Does ? have fingers? | ||
| 53 | -Does * have fingers? | 51 | +¿Does π have fingers? |
| 52 | +?Does ? have fingers? | ||
| 53 | +*Does * have fingers? | ||
| 54 | ---- whoami | 54 | ---- whoami |
| 55 | quack1 | 55 | quack1 |
| 56 | quack2 | 56 | quack2 |
libtests/qutil.cc
| @@ -222,7 +222,7 @@ void to_utf16_test() | @@ -222,7 +222,7 @@ void to_utf16_test() | ||
| 222 | 222 | ||
| 223 | void utf8_to_ascii_test() | 223 | void utf8_to_ascii_test() |
| 224 | { | 224 | { |
| 225 | - char const* input = "Does \317\200 have fingers?"; | 225 | + char const* input = "\302\277Does \317\200 have fingers?"; |
| 226 | std::cout << input | 226 | std::cout << input |
| 227 | << std::endl | 227 | << std::endl |
| 228 | << QUtil::utf8_to_ascii(input) | 228 | << QUtil::utf8_to_ascii(input) |