Commit 089ce5902ec155ff3dce7bed7c12006a587d3db0
1 parent
ae18bfd1
Move utf8_to_utf16 into QUtil
Showing 4 changed files with 74 additions and 58 deletions
ChangeLog
| 1 | +2019-01-05 Jay Berkenbilt <ejb@ql.org> | ||
| 2 | + | ||
| 3 | + * Add method QUtil::utf8_to_utf16. | ||
| 4 | + | ||
| 1 | 2019-01-04 Jay Berkenbilt <ejb@ql.org> | 5 | 2019-01-04 Jay Berkenbilt <ejb@ql.org> |
| 2 | 6 | ||
| 3 | * Add new option --optimize-images, which recompresses every image | 7 | * Add new option --optimize-images, which recompresses every image |
include/qpdf/QUtil.hh
| @@ -152,8 +152,14 @@ namespace QUtil | @@ -152,8 +152,14 @@ namespace QUtil | ||
| 152 | QPDF_DLL | 152 | QPDF_DLL |
| 153 | std::string toUTF16(unsigned long uval); | 153 | std::string toUTF16(unsigned long uval); |
| 154 | 154 | ||
| 155 | - // Convert a UTF-8 encoded string to ASCII by replacing all | ||
| 156 | - // characters outside of ascii with the given unknown_char. | 155 | + // Convert a UTF-8 encoded string to UTF-16. Unrepresentable code |
| 156 | + // points are converted to U+FFFD. | ||
| 157 | + QPDF_DLL | ||
| 158 | + std::string utf8_to_utf16(std::string const& utf8); | ||
| 159 | + | ||
| 160 | + // Convert a UTF-8 encoded string to the specified single-byte | ||
| 161 | + // encoding system by replacing all unsupported characters with | ||
| 162 | + // the given unknown_char. | ||
| 157 | QPDF_DLL | 163 | QPDF_DLL |
| 158 | std::string utf8_to_ascii( | 164 | std::string utf8_to_ascii( |
| 159 | std::string const& utf8, char unknown_char = '?'); | 165 | std::string const& utf8, char unknown_char = '?'); |
libqpdf/QPDF_String.cc
| @@ -64,65 +64,10 @@ QPDF_String::~QPDF_String() | @@ -64,65 +64,10 @@ QPDF_String::~QPDF_String() | ||
| 64 | { | 64 | { |
| 65 | } | 65 | } |
| 66 | 66 | ||
| 67 | -enum encoding_e { e_utf16 }; | ||
| 68 | - | ||
| 69 | -static | ||
| 70 | -std::string | ||
| 71 | -transcode_utf8(std::string const& utf8_val, encoding_e encoding) | ||
| 72 | -{ | ||
| 73 | - std::string result = "\xfe\xff"; | ||
| 74 | - size_t len = utf8_val.length(); | ||
| 75 | - for (size_t i = 0; i < len; ++i) | ||
| 76 | - { | ||
| 77 | - unsigned char ch = static_cast<unsigned char>(utf8_val.at(i)); | ||
| 78 | - if (ch < 128) | ||
| 79 | - { | ||
| 80 | - result += QUtil::toUTF16(ch); | ||
| 81 | - } | ||
| 82 | - else | ||
| 83 | - { | ||
| 84 | - size_t bytes_needed = 0; | ||
| 85 | - unsigned bit_check = 0x40; | ||
| 86 | - unsigned char to_clear = 0x80; | ||
| 87 | - while (ch & bit_check) | ||
| 88 | - { | ||
| 89 | - ++bytes_needed; | ||
| 90 | - to_clear |= bit_check; | ||
| 91 | - bit_check >>= 1; | ||
| 92 | - } | ||
| 93 | - | ||
| 94 | - if (((bytes_needed > 5) || (bytes_needed < 1)) || | ||
| 95 | - ((i + bytes_needed) >= len)) | ||
| 96 | - { | ||
| 97 | - result += "\xff\xfd"; | ||
| 98 | - } | ||
| 99 | - else | ||
| 100 | - { | ||
| 101 | - unsigned long codepoint = (ch & ~to_clear); | ||
| 102 | - while (bytes_needed > 0) | ||
| 103 | - { | ||
| 104 | - --bytes_needed; | ||
| 105 | - ch = utf8_val.at(++i); | ||
| 106 | - if ((ch & 0xc0) != 0x80) | ||
| 107 | - { | ||
| 108 | - --i; | ||
| 109 | - codepoint = 0xfffd; | ||
| 110 | - break; | ||
| 111 | - } | ||
| 112 | - codepoint <<= 6; | ||
| 113 | - codepoint += (ch & 0x3f); | ||
| 114 | - } | ||
| 115 | - result += QUtil::toUTF16(codepoint); | ||
| 116 | - } | ||
| 117 | - } | ||
| 118 | - } | ||
| 119 | - return result; | ||
| 120 | -} | ||
| 121 | - | ||
| 122 | QPDF_String* | 67 | QPDF_String* |
| 123 | QPDF_String::new_utf16(std::string const& utf8_val) | 68 | QPDF_String::new_utf16(std::string const& utf8_val) |
| 124 | { | 69 | { |
| 125 | - return new QPDF_String(transcode_utf8(utf8_val, e_utf16)); | 70 | + return new QPDF_String(QUtil::utf8_to_utf16(utf8_val)); |
| 126 | } | 71 | } |
| 127 | 72 | ||
| 128 | std::string | 73 | std::string |
libqpdf/QUtil.cc
| @@ -893,6 +893,67 @@ QUtil::parse_numrange(char const* range, int max) | @@ -893,6 +893,67 @@ QUtil::parse_numrange(char const* range, int max) | ||
| 893 | return result; | 893 | return result; |
| 894 | } | 894 | } |
| 895 | 895 | ||
| 896 | +enum encoding_e { e_utf16 }; | ||
| 897 | + | ||
| 898 | +static | ||
| 899 | +std::string | ||
| 900 | +transcode_utf8(std::string const& utf8_val, encoding_e encoding) | ||
| 901 | +{ | ||
| 902 | + std::string result = "\xfe\xff"; | ||
| 903 | + size_t len = utf8_val.length(); | ||
| 904 | + for (size_t i = 0; i < len; ++i) | ||
| 905 | + { | ||
| 906 | + unsigned char ch = static_cast<unsigned char>(utf8_val.at(i)); | ||
| 907 | + if (ch < 128) | ||
| 908 | + { | ||
| 909 | + result += QUtil::toUTF16(ch); | ||
| 910 | + } | ||
| 911 | + else | ||
| 912 | + { | ||
| 913 | + size_t bytes_needed = 0; | ||
| 914 | + unsigned bit_check = 0x40; | ||
| 915 | + unsigned char to_clear = 0x80; | ||
| 916 | + while (ch & bit_check) | ||
| 917 | + { | ||
| 918 | + ++bytes_needed; | ||
| 919 | + to_clear |= bit_check; | ||
| 920 | + bit_check >>= 1; | ||
| 921 | + } | ||
| 922 | + | ||
| 923 | + if (((bytes_needed > 5) || (bytes_needed < 1)) || | ||
| 924 | + ((i + bytes_needed) >= len)) | ||
| 925 | + { | ||
| 926 | + result += "\xff\xfd"; | ||
| 927 | + } | ||
| 928 | + else | ||
| 929 | + { | ||
| 930 | + unsigned long codepoint = (ch & ~to_clear); | ||
| 931 | + while (bytes_needed > 0) | ||
| 932 | + { | ||
| 933 | + --bytes_needed; | ||
| 934 | + ch = utf8_val.at(++i); | ||
| 935 | + if ((ch & 0xc0) != 0x80) | ||
| 936 | + { | ||
| 937 | + --i; | ||
| 938 | + codepoint = 0xfffd; | ||
| 939 | + break; | ||
| 940 | + } | ||
| 941 | + codepoint <<= 6; | ||
| 942 | + codepoint += (ch & 0x3f); | ||
| 943 | + } | ||
| 944 | + result += QUtil::toUTF16(codepoint); | ||
| 945 | + } | ||
| 946 | + } | ||
| 947 | + } | ||
| 948 | + return result; | ||
| 949 | +} | ||
| 950 | + | ||
| 951 | +std::string | ||
| 952 | +QUtil::utf8_to_utf16(std::string const& utf8) | ||
| 953 | +{ | ||
| 954 | + return transcode_utf8(utf8, e_utf16); | ||
| 955 | +} | ||
| 956 | + | ||
| 896 | std::string | 957 | std::string |
| 897 | QUtil::utf8_to_ascii(std::string const& utf8, char unknown_char) | 958 | QUtil::utf8_to_ascii(std::string const& utf8, char unknown_char) |
| 898 | { | 959 | { |