Commit 089ce5902ec155ff3dce7bed7c12006a587d3db0

Authored by Jay Berkenbilt
1 parent ae18bfd1

Move utf8_to_utf16 into QUtil

ChangeLog
  1 +2019-01-05 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Add method QUtil::utf8_to_utf16.
  4 +
1 5 2019-01-04 Jay Berkenbilt <ejb@ql.org>
2 6  
3 7 * Add new option --optimize-images, which recompresses every image
... ...
include/qpdf/QUtil.hh
... ... @@ -152,8 +152,14 @@ namespace QUtil
152 152 QPDF_DLL
153 153 std::string toUTF16(unsigned long uval);
154 154  
155   - // Convert a UTF-8 encoded string to ASCII by replacing all
156   - // characters outside of ascii with the given unknown_char.
  155 + // Convert a UTF-8 encoded string to big-endian UTF-16 with a leading
  156 + // byte order mark. Malformed or unrepresentable input becomes U+FFFD.
  157 + QPDF_DLL
  158 + std::string utf8_to_utf16(std::string const& utf8);
  159 +
  160 + // Convert a UTF-8 encoded string to the specified single-byte
  161 + // encoding system by replacing all unsupported characters with
  162 + // the given unknown_char.
157 163 QPDF_DLL
158 164 std::string utf8_to_ascii(
159 165 std::string const& utf8, char unknown_char = '?');
... ...
libqpdf/QPDF_String.cc
... ... @@ -64,65 +64,10 @@ QPDF_String::~QPDF_String()
64 64 {
65 65 }
66 66  
67   -enum encoding_e { e_utf16 };
68   -
69   -static
70   -std::string
71   -transcode_utf8(std::string const& utf8_val, encoding_e encoding)
72   -{
73   - std::string result = "\xfe\xff";
74   - size_t len = utf8_val.length();
75   - for (size_t i = 0; i < len; ++i)
76   - {
77   - unsigned char ch = static_cast<unsigned char>(utf8_val.at(i));
78   - if (ch < 128)
79   - {
80   - result += QUtil::toUTF16(ch);
81   - }
82   - else
83   - {
84   - size_t bytes_needed = 0;
85   - unsigned bit_check = 0x40;
86   - unsigned char to_clear = 0x80;
87   - while (ch & bit_check)
88   - {
89   - ++bytes_needed;
90   - to_clear |= bit_check;
91   - bit_check >>= 1;
92   - }
93   -
94   - if (((bytes_needed > 5) || (bytes_needed < 1)) ||
95   - ((i + bytes_needed) >= len))
96   - {
97   - result += "\xff\xfd";
98   - }
99   - else
100   - {
101   - unsigned long codepoint = (ch & ~to_clear);
102   - while (bytes_needed > 0)
103   - {
104   - --bytes_needed;
105   - ch = utf8_val.at(++i);
106   - if ((ch & 0xc0) != 0x80)
107   - {
108   - --i;
109   - codepoint = 0xfffd;
110   - break;
111   - }
112   - codepoint <<= 6;
113   - codepoint += (ch & 0x3f);
114   - }
115   - result += QUtil::toUTF16(codepoint);
116   - }
117   - }
118   - }
119   - return result;
120   -}
121   -
122 67 QPDF_String*
123 68 QPDF_String::new_utf16(std::string const& utf8_val)
124 69 {
125   - return new QPDF_String(transcode_utf8(utf8_val, e_utf16));
  70 + return new QPDF_String(QUtil::utf8_to_utf16(utf8_val));
126 71 }
127 72  
128 73 std::string
... ...
libqpdf/QUtil.cc
... ... @@ -893,6 +893,67 @@ QUtil::parse_numrange(char const* range, int max)
893 893 return result;
894 894 }
895 895  
  896 +enum encoding_e { e_utf16 };
  897 +
  898 +static
  899 +std::string
  900 +transcode_utf8(std::string const& utf8_val, encoding_e encoding)
  901 +{
  902 + std::string result = "\xfe\xff";
  903 + size_t len = utf8_val.length();
  904 + for (size_t i = 0; i < len; ++i)
  905 + {
  906 + unsigned char ch = static_cast<unsigned char>(utf8_val.at(i));
  907 + if (ch < 128)
  908 + {
  909 + result += QUtil::toUTF16(ch);
  910 + }
  911 + else
  912 + {
  913 + size_t bytes_needed = 0;
  914 + unsigned bit_check = 0x40;
  915 + unsigned char to_clear = 0x80;
  916 + while (ch & bit_check)
  917 + {
  918 + ++bytes_needed;
  919 + to_clear |= bit_check;
  920 + bit_check >>= 1;
  921 + }
  922 +
  923 + if (((bytes_needed > 5) || (bytes_needed < 1)) ||
  924 + ((i + bytes_needed) >= len))
  925 + {
  926 + result += "\xff\xfd";
  927 + }
  928 + else
  929 + {
  930 + unsigned long codepoint = (ch & ~to_clear);
  931 + while (bytes_needed > 0)
  932 + {
  933 + --bytes_needed;
  934 + ch = utf8_val.at(++i);
  935 + if ((ch & 0xc0) != 0x80)
  936 + {
  937 + --i;
  938 + codepoint = 0xfffd;
  939 + break;
  940 + }
  941 + codepoint <<= 6;
  942 + codepoint += (ch & 0x3f);
  943 + }
  944 + result += QUtil::toUTF16(codepoint);
  945 + }
  946 + }
  947 + }
  948 + return result;
  949 +}
  950 +
  951 +std::string
  952 +QUtil::utf8_to_utf16(std::string const& utf8)
  953 +{
  954 + return transcode_utf8(utf8, e_utf16);
  955 +}
  956 +
896 957 std::string
897 958 QUtil::utf8_to_ascii(std::string const& utf8, char unknown_char)
898 959 {
... ...