Commit 337b9007088670363ff6444b2bffa7e8aa6498dc
1 parent
6e07eb1a
handle UTF-16BE fully
git-svn-id: svn+q:///qpdf/trunk@639 71b93d88-0707-0410-a8cf-f5a4172ac649
Showing
5 changed files
with
54 additions
and
13 deletions
ChangeLog
libqpdf/QPDF_String.cc
| ... | ... | @@ -2,6 +2,8 @@ |
| 2 | 2 | #include <qpdf/QPDF_String.hh> |
| 3 | 3 | |
| 4 | 4 | #include <qpdf/QUtil.hh> |
| 5 | +#include <qpdf/QTC.hh> | |
| 6 | + | |
| 5 | 7 | // DO NOT USE ctype -- it is locale dependent for some things, and |
| 6 | 8 | // it's not worth the risk of including it in case it may accidentally |
| 7 | 9 | // be used. |
| ... | ... | @@ -159,12 +161,42 @@ QPDF_String::getUTF8Val() const |
| 159 | 161 | (this->val[0] == '\xfe') && (this->val[1] == '\xff')) |
| 160 | 162 | { |
| 161 | 163 | // This is a Unicode string using big-endian UTF-16. This |
| 162 | - // code is not actually correct as it doesn't properly handle | |
| 163 | - // characters past 0xffff. | |
| 164 | + // code uses unsigned long and unsigned short to hold | |
| 165 | + // codepoint values. It requires unsigned long to be at least | |
| 166 | + // 32 bits and unsigned short to be at least 16 bits, but it | |
| 167 | + // will work fine if they are larger. | |
| 168 | + unsigned long codepoint = 0L; | |
| 164 | 169 | for (unsigned int i = 2; i < len; i += 2) |
| 165 | 170 | { |
| 166 | - result += QUtil::toUTF8(((unsigned char) this->val[i] << 8) + | |
| 167 | - ((unsigned char) this->val[i+1])); | |
| 171 | + // Convert from UTF-16BE. If we get a malformed | |
| 172 | + // surrogate pair, this code will generate incorrect output | |
| 173 | + // without giving a warning. Specifically, a high | |
| 174 | + // surrogate not followed by a low surrogate will be | |
| 175 | + // discarded, and a low surrogate not preceded by a high | |
| 176 | + // surrogate will just get its low 10 bits output. | |
| 177 | + unsigned short bits = | |
| 178 | + (((unsigned char) this->val[i]) << 8) + | |
| 179 | + ((unsigned char) this->val[i+1]); | |
| 180 | + if ((bits & 0xFC00) == 0xD800) | |
| 181 | + { | |
| 182 | + codepoint = 0x10000 + ((bits & 0x3FF) << 10); | |
| 183 | + continue; | |
| 184 | + } | |
| 185 | + else if ((bits & 0xFC00) == 0xDC00) | |
| 186 | + { | |
| 187 | + if (codepoint != 0) | |
| 188 | + { | |
| 189 | + QTC::TC("qpdf", "QPDF_String non-trivial UTF-16"); | |
| 190 | + } | |
| 191 | + codepoint += bits & 0x3FF; | |
| 192 | + } | |
| 193 | + else | |
| 194 | + { | |
| 195 | + codepoint = bits; | |
| 196 | + } | |
| 197 | + | |
| 198 | + result += QUtil::toUTF8(codepoint); | |
| 199 | + codepoint = 0; | |
| 168 | 200 | } |
| 169 | 201 | } |
| 170 | 202 | else | ... | ... |
qpdf/qpdf.testcov
qpdf/qtest/qpdf/misc-3.out
qpdf/qtest/qpdf/misc-3.pdf
| ... | ... | @@ -13,6 +13,7 @@ |
| 13 | 13 | (No Special Characters) |
| 14 | 14 | (These: ¿÷¢þ and no more) |
| 15 | 15 | <feff03c003c903c403b103c403c9> |
| 16 | + <feff0074007200650062006c006500200063006c00650066003a0020d834dd20003b0020007300690078007400650065006e007400680020006e006f00740065003a0020d834dd60> | |
| 16 | 17 | ] |
| 17 | 18 | /Type /Catalog |
| 18 | 19 | >> |
| ... | ... | @@ -109,19 +110,19 @@ xref |
| 109 | 110 | 0 10 |
| 110 | 111 | 0000000000 65535 f |
| 111 | 112 | 0000000025 00000 n |
| 112 | -0000000226 00000 n | |
| 113 | -0000000308 00000 n | |
| 114 | -0000000543 00000 n | |
| 115 | -0000000642 00000 n | |
| 116 | -0000000684 00000 n | |
| 117 | -0000000782 00000 n | |
| 118 | -0000000801 00000 n | |
| 119 | -0000000919 00000 n | |
| 113 | +0000000377 00000 n | |
| 114 | +0000000459 00000 n | |
| 115 | +0000000694 00000 n | |
| 116 | +0000000793 00000 n | |
| 117 | +0000000835 00000 n | |
| 118 | +0000000933 00000 n | |
| 119 | +0000000952 00000 n | |
| 120 | +0000001070 00000 n | |
| 120 | 121 | trailer << |
| 121 | 122 | /Root 1 0 R |
| 122 | 123 | /Size 10 |
| 123 | 124 | /ID [<e017d8dc1fe53a81e40aa79bcb43fdec><76269ee0b6579446b731e060af8ef436>] |
| 124 | 125 | >> |
| 125 | 126 | startxref |
| 126 | -954 | |
| 127 | +1105 | |
| 127 | 128 | %%EOF | ... | ... |