Commit 089ce5902ec155ff3dce7bed7c12006a587d3db0
1 parent
ae18bfd1
Move utf8_to_utf16 into QUtil
Showing 4 changed files with 74 additions and 58 deletions
ChangeLog
include/qpdf/QUtil.hh
| ... | ... | @@ -152,8 +152,14 @@ namespace QUtil |
| 152 | 152 | QPDF_DLL |
| 153 | 153 | std::string toUTF16(unsigned long uval); |
| 154 | 154 | |
| 155 | - // Convert a UTF-8 encoded string to ASCII by replacing all | |
| 156 | - // characters outside of ascii with the given unknown_char. | |
| 155 | + // Convert a UTF-8 encoded string to UTF-16. Unrepresentable code | |
| 156 | + // points are converted to U+FFFD. | |
| 157 | + QPDF_DLL | |
| 158 | + std::string utf8_to_utf16(std::string const& utf8); | |
| 159 | + | |
| 160 | + // Convert a UTF-8 encoded string to the specified single-byte | |
| 161 | + // encoding system by replacing all unsupported characters with | |
| 162 | + // the given unknown_char. | |
| 157 | 163 | QPDF_DLL |
| 158 | 164 | std::string utf8_to_ascii( |
| 159 | 165 | std::string const& utf8, char unknown_char = '?'); | ... | ... |
libqpdf/QPDF_String.cc
| ... | ... | @@ -64,65 +64,10 @@ QPDF_String::~QPDF_String() |
| 64 | 64 | { |
| 65 | 65 | } |
| 66 | 66 | |
| 67 | -enum encoding_e { e_utf16 }; | |
| 68 | - | |
| 69 | -static | |
| 70 | -std::string | |
| 71 | -transcode_utf8(std::string const& utf8_val, encoding_e encoding) | |
| 72 | -{ | |
| 73 | - std::string result = "\xfe\xff"; | |
| 74 | - size_t len = utf8_val.length(); | |
| 75 | - for (size_t i = 0; i < len; ++i) | |
| 76 | - { | |
| 77 | - unsigned char ch = static_cast<unsigned char>(utf8_val.at(i)); | |
| 78 | - if (ch < 128) | |
| 79 | - { | |
| 80 | - result += QUtil::toUTF16(ch); | |
| 81 | - } | |
| 82 | - else | |
| 83 | - { | |
| 84 | - size_t bytes_needed = 0; | |
| 85 | - unsigned bit_check = 0x40; | |
| 86 | - unsigned char to_clear = 0x80; | |
| 87 | - while (ch & bit_check) | |
| 88 | - { | |
| 89 | - ++bytes_needed; | |
| 90 | - to_clear |= bit_check; | |
| 91 | - bit_check >>= 1; | |
| 92 | - } | |
| 93 | - | |
| 94 | - if (((bytes_needed > 5) || (bytes_needed < 1)) || | |
| 95 | - ((i + bytes_needed) >= len)) | |
| 96 | - { | |
| 97 | - result += "\xff\xfd"; | |
| 98 | - } | |
| 99 | - else | |
| 100 | - { | |
| 101 | - unsigned long codepoint = (ch & ~to_clear); | |
| 102 | - while (bytes_needed > 0) | |
| 103 | - { | |
| 104 | - --bytes_needed; | |
| 105 | - ch = utf8_val.at(++i); | |
| 106 | - if ((ch & 0xc0) != 0x80) | |
| 107 | - { | |
| 108 | - --i; | |
| 109 | - codepoint = 0xfffd; | |
| 110 | - break; | |
| 111 | - } | |
| 112 | - codepoint <<= 6; | |
| 113 | - codepoint += (ch & 0x3f); | |
| 114 | - } | |
| 115 | - result += QUtil::toUTF16(codepoint); | |
| 116 | - } | |
| 117 | - } | |
| 118 | - } | |
| 119 | - return result; | |
| 120 | -} | |
| 121 | - | |
| 122 | 67 | QPDF_String* |
| 123 | 68 | QPDF_String::new_utf16(std::string const& utf8_val) |
| 124 | 69 | { |
| 125 | - return new QPDF_String(transcode_utf8(utf8_val, e_utf16)); | |
| 70 | + return new QPDF_String(QUtil::utf8_to_utf16(utf8_val)); | |
| 126 | 71 | } |
| 127 | 72 | |
| 128 | 73 | std::string | ... | ... |
libqpdf/QUtil.cc
| ... | ... | @@ -893,6 +893,67 @@ QUtil::parse_numrange(char const* range, int max) |
| 893 | 893 | return result; |
| 894 | 894 | } |
| 895 | 895 | |
| 896 | +enum encoding_e { e_utf16 }; | |
| 897 | + | |
| 898 | +static | |
| 899 | +std::string | |
| 900 | +transcode_utf8(std::string const& utf8_val, encoding_e encoding) | |
| 901 | +{ | |
| 902 | + std::string result = "\xfe\xff"; | |
| 903 | + size_t len = utf8_val.length(); | |
| 904 | + for (size_t i = 0; i < len; ++i) | |
| 905 | + { | |
| 906 | + unsigned char ch = static_cast<unsigned char>(utf8_val.at(i)); | |
| 907 | + if (ch < 128) | |
| 908 | + { | |
| 909 | + result += QUtil::toUTF16(ch); | |
| 910 | + } | |
| 911 | + else | |
| 912 | + { | |
| 913 | + size_t bytes_needed = 0; | |
| 914 | + unsigned bit_check = 0x40; | |
| 915 | + unsigned char to_clear = 0x80; | |
| 916 | + while (ch & bit_check) | |
| 917 | + { | |
| 918 | + ++bytes_needed; | |
| 919 | + to_clear |= bit_check; | |
| 920 | + bit_check >>= 1; | |
| 921 | + } | |
| 922 | + | |
| 923 | + if (((bytes_needed > 5) || (bytes_needed < 1)) || | |
| 924 | + ((i + bytes_needed) >= len)) | |
| 925 | + { | |
| 926 | + result += "\xff\xfd"; | |
| 927 | + } | |
| 928 | + else | |
| 929 | + { | |
| 930 | + unsigned long codepoint = (ch & ~to_clear); | |
| 931 | + while (bytes_needed > 0) | |
| 932 | + { | |
| 933 | + --bytes_needed; | |
| 934 | + ch = utf8_val.at(++i); | |
| 935 | + if ((ch & 0xc0) != 0x80) | |
| 936 | + { | |
| 937 | + --i; | |
| 938 | + codepoint = 0xfffd; | |
| 939 | + break; | |
| 940 | + } | |
| 941 | + codepoint <<= 6; | |
| 942 | + codepoint += (ch & 0x3f); | |
| 943 | + } | |
| 944 | + result += QUtil::toUTF16(codepoint); | |
| 945 | + } | |
| 946 | + } | |
| 947 | + } | |
| 948 | + return result; | |
| 949 | +} | |
| 950 | + | |
| 951 | +std::string | |
| 952 | +QUtil::utf8_to_utf16(std::string const& utf8) | |
| 953 | +{ | |
| 954 | + return transcode_utf8(utf8, e_utf16); | |
| 955 | +} | |
| 956 | + | |
| 896 | 957 | std::string |
| 897 | 958 | QUtil::utf8_to_ascii(std::string const& utf8, char unknown_char) |
| 898 | 959 | { | ... | ... |