Commit 089ce5902ec155ff3dce7bed7c12006a587d3db0

Authored by Jay Berkenbilt
1 parent ae18bfd1

Move utf8_to_utf16 into QUtil

ChangeLog
  1 +2019-01-05 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Add method QUtil::utf8_to_utf16.
  4 +
1 2019-01-04 Jay Berkenbilt <ejb@ql.org> 5 2019-01-04 Jay Berkenbilt <ejb@ql.org>
2 6
3 * Add new option --optimize-images, which recompresses every image 7 * Add new option --optimize-images, which recompresses every image
include/qpdf/QUtil.hh
@@ -152,8 +152,14 @@ namespace QUtil @@ -152,8 +152,14 @@ namespace QUtil
152 QPDF_DLL 152 QPDF_DLL
153 std::string toUTF16(unsigned long uval); 153 std::string toUTF16(unsigned long uval);
154 154
155 - // Convert a UTF-8 encoded string to ASCII by replacing all  
156 - // characters outside of ascii with the given unknown_char. 155 + // Convert a UTF-8 encoded string to big-endian UTF-16 with a leading
  156 + // BOM (0xFE 0xFF). Unrepresentable code points are converted to U+FFFD.
  157 + QPDF_DLL
  158 + std::string utf8_to_utf16(std::string const& utf8);
  159 +
  160 + // Convert a UTF-8 encoded string to the specified single-byte
  161 + // encoding system by replacing all unsupported characters with
  162 + // the given unknown_char.
157 QPDF_DLL 163 QPDF_DLL
158 std::string utf8_to_ascii( 164 std::string utf8_to_ascii(
159 std::string const& utf8, char unknown_char = '?'); 165 std::string const& utf8, char unknown_char = '?');
libqpdf/QPDF_String.cc
@@ -64,65 +64,10 @@ QPDF_String::~QPDF_String() @@ -64,65 +64,10 @@ QPDF_String::~QPDF_String()
64 { 64 {
65 } 65 }
66 66
67 -enum encoding_e { e_utf16 };  
68 -  
69 -static  
70 -std::string  
71 -transcode_utf8(std::string const& utf8_val, encoding_e encoding)  
72 -{  
73 - std::string result = "\xfe\xff";  
74 - size_t len = utf8_val.length();  
75 - for (size_t i = 0; i < len; ++i)  
76 - {  
77 - unsigned char ch = static_cast<unsigned char>(utf8_val.at(i));  
78 - if (ch < 128)  
79 - {  
80 - result += QUtil::toUTF16(ch);  
81 - }  
82 - else  
83 - {  
84 - size_t bytes_needed = 0;  
85 - unsigned bit_check = 0x40;  
86 - unsigned char to_clear = 0x80;  
87 - while (ch & bit_check)  
88 - {  
89 - ++bytes_needed;  
90 - to_clear |= bit_check;  
91 - bit_check >>= 1;  
92 - }  
93 -  
94 - if (((bytes_needed > 5) || (bytes_needed < 1)) ||  
95 - ((i + bytes_needed) >= len))  
96 - {  
97 - result += "\xff\xfd";  
98 - }  
99 - else  
100 - {  
101 - unsigned long codepoint = (ch & ~to_clear);  
102 - while (bytes_needed > 0)  
103 - {  
104 - --bytes_needed;  
105 - ch = utf8_val.at(++i);  
106 - if ((ch & 0xc0) != 0x80)  
107 - {  
108 - --i;  
109 - codepoint = 0xfffd;  
110 - break;  
111 - }  
112 - codepoint <<= 6;  
113 - codepoint += (ch & 0x3f);  
114 - }  
115 - result += QUtil::toUTF16(codepoint);  
116 - }  
117 - }  
118 - }  
119 - return result;  
120 -}  
121 -  
122 QPDF_String* 67 QPDF_String*
123 QPDF_String::new_utf16(std::string const& utf8_val) 68 QPDF_String::new_utf16(std::string const& utf8_val)
124 { 69 {
125 - return new QPDF_String(transcode_utf8(utf8_val, e_utf16)); 70 + return new QPDF_String(QUtil::utf8_to_utf16(utf8_val));
126 } 71 }
127 72
128 std::string 73 std::string
libqpdf/QUtil.cc
@@ -893,6 +893,67 @@ QUtil::parse_numrange(char const* range, int max) @@ -893,6 +893,67 @@ QUtil::parse_numrange(char const* range, int max)
893 return result; 893 return result;
894 } 894 }
895 895
  896 +enum encoding_e { e_utf16 };
  897 +
  898 +static
  899 +std::string
  900 +transcode_utf8(std::string const& utf8_val, encoding_e encoding)
  901 +{
  902 + std::string result = "\xfe\xff";
  903 + size_t len = utf8_val.length();
  904 + for (size_t i = 0; i < len; ++i)
  905 + {
  906 + unsigned char ch = static_cast<unsigned char>(utf8_val.at(i));
  907 + if (ch < 128)
  908 + {
  909 + result += QUtil::toUTF16(ch);
  910 + }
  911 + else
  912 + {
  913 + size_t bytes_needed = 0;
  914 + unsigned bit_check = 0x40;
  915 + unsigned char to_clear = 0x80;
  916 + while (ch & bit_check)
  917 + {
  918 + ++bytes_needed;
  919 + to_clear |= bit_check;
  920 + bit_check >>= 1;
  921 + }
  922 +
  923 + if (((bytes_needed > 5) || (bytes_needed < 1)) ||
  924 + ((i + bytes_needed) >= len))
  925 + {
  926 + result += "\xff\xfd";
  927 + }
  928 + else
  929 + {
  930 + unsigned long codepoint = (ch & ~to_clear);
  931 + while (bytes_needed > 0)
  932 + {
  933 + --bytes_needed;
  934 + ch = utf8_val.at(++i);
  935 + if ((ch & 0xc0) != 0x80)
  936 + {
  937 + --i;
  938 + codepoint = 0xfffd;
  939 + break;
  940 + }
  941 + codepoint <<= 6;
  942 + codepoint += (ch & 0x3f);
  943 + }
  944 + result += QUtil::toUTF16(codepoint);
  945 + }
  946 + }
  947 + }
  948 + return result;
  949 +}
  950 +
  951 +std::string
  952 +QUtil::utf8_to_utf16(std::string const& utf8)
  953 +{
  954 + return transcode_utf8(utf8, e_utf16);
  955 +}
  956 +
896 std::string 957 std::string
897 QUtil::utf8_to_ascii(std::string const& utf8, char unknown_char) 958 QUtil::utf8_to_ascii(std::string const& utf8, char unknown_char)
898 { 959 {