Commit 337b9007088670363ff6444b2bffa7e8aa6498dc

Authored by Jay Berkenbilt
1 parent 6e07eb1a

handle UTF-16BE fully

git-svn-id: svn+q:///qpdf/trunk@639 71b93d88-0707-0410-a8cf-f5a4172ac649
ChangeLog
  1 +2008-11-23 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * libqpdf/QPDF_String.cc (QPDF_String::getUTF8Val): handle
  4 + UTF-16BE properly rather than just treating the string as a string
  5 + of 16-bit characters.
  6 +
1 7 2008-06-30 Jay Berkenbilt <ejb@ql.org>
2 8  
3 9 * 2.0.2: release
... ...
libqpdf/QPDF_String.cc
... ... @@ -2,6 +2,8 @@
2 2 #include <qpdf/QPDF_String.hh>
3 3  
4 4 #include <qpdf/QUtil.hh>
  5 +#include <qpdf/QTC.hh>
  6 +
5 7 // DO NOT USE ctype -- it is locale dependent for some things, and
6 8 // it's not worth the risk of including it in case it may accidentally
7 9 // be used.
... ... @@ -159,12 +161,42 @@ QPDF_String::getUTF8Val() const
159 161 (this->val[0] == '\xfe') && (this->val[1] == '\xff'))
160 162 {
161 163 // This is a Unicode string using big-endian UTF-16. This
162   - // code is not actually correct as it doesn't properly handle
163   - // characters past 0xffff.
  164 + // code uses unsigned long and unsigned short to hold
  165 + // codepoint values. It requires unsigned long to be at least
  166 + // 32 bits and unsigned short to be at least 16 bits, but it
  167 + // will work fine if they are larger.
  168 + unsigned long codepoint = 0L;
164 169 for (unsigned int i = 2; i < len; i += 2)
165 170 {
166   - result += QUtil::toUTF8(((unsigned char) this->val[i] << 8) +
167   - ((unsigned char) this->val[i+1]));
  171 + // Convert from UTF-16BE. If we get a malformed
  172 + // surrogate pair, this code will generate incorrect output
  173 + // without giving a warning. Specifically, a high
  174 + // surrogate not followed by a low surrogate will be
  175 + // discarded, and a low surrogate not preceded by a high
  176 + // surrogate will just get its low 10 bits output.
  177 + unsigned short bits =
  178 + (((unsigned char) this->val[i]) << 8) +
  179 + ((unsigned char) this->val[i+1]);
  180 + if ((bits & 0xFC00) == 0xD800)
  181 + {
  182 + codepoint = 0x10000 + ((bits & 0x3FF) << 10);
  183 + continue;
  184 + }
  185 + else if ((bits & 0xFC00) == 0xDC00)
  186 + {
  187 + if (codepoint != 0)
  188 + {
  189 + QTC::TC("qpdf", "QPDF_String non-trivial UTF-16");
  190 + }
  191 + codepoint += bits & 0x3FF;
  192 + }
  193 + else
  194 + {
  195 + codepoint = bits;
  196 + }
  197 +
  198 + result += QUtil::toUTF8(codepoint);
  199 + codepoint = 0;
168 200 }
169 201 }
170 202 else
... ...
qpdf/qpdf.testcov
... ... @@ -115,3 +115,4 @@ QPDF_Stream pipeStreamData with null pipeline 0
115 115 QPDFWriter not recompressing /FlateDecode 0
116 116 QPDF piping xref stream from encrypted file 0
117 117 unable to filter 0
  118 +QPDF_String non-trivial UTF-16 0
... ...
qpdf/qtest/qpdf/misc-3.out
... ... @@ -8,6 +8,7 @@ QStrings:
8 8 No Special Characters
9 9 These: ¿÷¢þ and no more
10 10 πωτατω
  11 +treble clef: 𝄠; sixteenth note: 𝅘𝅥𝅮
11 12 QNumbers:
12 13 1.000
13 14 3.142
... ...
qpdf/qtest/qpdf/misc-3.pdf
... ... @@ -13,6 +13,7 @@
13 13 (No Special Characters)
14 14 (These: ¿÷¢þ and no more)
15 15 <feff03c003c903c403b103c403c9>
  16 + <feff0074007200650062006c006500200063006c00650066003a0020d834dd20003b0020007300690078007400650065006e007400680020006e006f00740065003a0020d834dd60>
16 17 ]
17 18 /Type /Catalog
18 19 >>
... ... @@ -109,19 +110,19 @@ xref
109 110 0 10
110 111 0000000000 65535 f
111 112 0000000025 00000 n
112   -0000000226 00000 n
113   -0000000308 00000 n
114   -0000000543 00000 n
115   -0000000642 00000 n
116   -0000000684 00000 n
117   -0000000782 00000 n
118   -0000000801 00000 n
119   -0000000919 00000 n
  113 +0000000377 00000 n
  114 +0000000459 00000 n
  115 +0000000694 00000 n
  116 +0000000793 00000 n
  117 +0000000835 00000 n
  118 +0000000933 00000 n
  119 +0000000952 00000 n
  120 +0000001070 00000 n
120 121 trailer <<
121 122 /Root 1 0 R
122 123 /Size 10
123 124 /ID [<e017d8dc1fe53a81e40aa79bcb43fdec><76269ee0b6579446b731e060af8ef436>]
124 125 >>
125 126 startxref
126   -954
  127 +1105
127 128 %%EOF
... ...