Commit 1065bbb0165b4608bd715866332751be9213cd51
1 parent
2b8d0f38
Handle odd PDFDoc codepoints in UTF-8 during transcoding (fixes #650)
There are codepoints in PDFDoc that are not valid UTF-8 but map to valid UTF-8. We were handling those correctly with bidirectional mapping. However, if those same codepoints appeared in UTF-8, where they have no meaning, they were left as fixed points when converting to PDFDoc, where they do have meaning. This change recognizes them as errors.
Showing
4 changed files
with
35 additions
and
4 deletions
ChangeLog
| 1 | +2022-02-15 Jay Berkenbilt <ejb@ql.org> | |
| 2 | + | |
| 3 | + * Don't map 0x18 through 0x1f, 0x7f, 0x9f, or 0xad as fixed points | |
| 4 | + when transcoding UTF-8 to PDFDoc. These codepoints have different | |
| 5 | + meanings in those two encoding systems. Fixes #650. | |
| 6 | + | |
| 1 | 7 | 2022-02-11 Jay Berkenbilt <ejb@ql.org> |
| 2 | 8 | |
| 3 | 9 | * 10.6.1: release | ... | ... |
libqpdf/QUtil.cc
| ... | ... | @@ -2272,6 +2272,16 @@ transcode_utf8(std::string const& utf8_val, std::string& result, |
| 2272 | 2272 | { |
| 2273 | 2273 | result += QUtil::toUTF16(QIntC::to_ulong(ch)); |
| 2274 | 2274 | } |
| 2275 | + else if ((encoding == e_pdfdoc) && | |
| 2276 | + (((ch >= 0x18) && (ch <= 0x1f)) || (ch == 127))) | |
| 2277 | + { | |
| 2278 | + // PDFDocEncoding maps some low characters to Unicode, | |
| 2279 | + // so if we encounter those invalid UTF-8 code points, | |
| 2280 | + // map them to unknown so reversing the mapping | |
| 2281 | + // doesn't change them into other characters. | |
| 2282 | + okay = false; | |
| 2283 | + result.append(1, unknown); | |
| 2284 | + } | |
| 2275 | 2285 | else |
| 2276 | 2286 | { |
| 2277 | 2287 | result.append(1, ch); |
| ... | ... | @@ -2281,6 +2291,13 @@ transcode_utf8(std::string const& utf8_val, std::string& result, |
| 2281 | 2291 | { |
| 2282 | 2292 | result += QUtil::toUTF16(codepoint); |
| 2283 | 2293 | } |
| 2294 | + else if ((codepoint == 0xad) && (encoding == e_pdfdoc)) | |
| 2295 | + { | |
| 2296 | + // PDFDocEncoding omits 0x00ad (soft hyphen), but rather | |
| 2297 | + // than treating it as undefined, map it to a regular | |
| 2298 | + // hyphen. | |
| 2299 | + result.append(1, '-'); | |
| 2300 | + } | |
| 2284 | 2301 | else if ((codepoint > 160) && (codepoint < 256) && |
| 2285 | 2302 | ((encoding == e_winansi) || (encoding == e_pdfdoc))) |
| 2286 | 2303 | { | ... | ... |
libtests/qtest/qutil/qutil.out
libtests/qutil.cc
| ... | ... | @@ -418,9 +418,16 @@ void transcoding_test() |
| 418 | 418 | print_alternatives(utf8); |
| 419 | 419 | print_alternatives("quack"); |
| 420 | 420 | std::cout << "done alternatives" << std::endl; |
| 421 | - std::string other = QUtil::pdf_doc_to_utf8( | |
| 422 | - "w\030w\031w\032w\033w\034w\035w\036w\037w\177w\255w"); | |
| 423 | - std::cout << other << std::endl; | |
| 421 | + // These characters are either valid in PDFDoc and invalid in | |
| 422 | + // UTF-8 or the other way around. | |
| 423 | + std::string other("w\x18w\x19w\x1aw\x1bw\x1cw\x1dw\x1ew\x1fw\x7fw"); | |
| 424 | + std::string other_doc = other + "\x9fw\xadw"; | |
| 425 | + std::cout << QUtil::pdf_doc_to_utf8(other_doc) << std::endl; | |
| 426 | + std::string other_utf8 = | |
| 427 | + other + QUtil::toUTF8(0x9f) + "w" + QUtil::toUTF8(0xad) + "w"; | |
| 428 | + std::string other_to_utf8; | |
| 429 | + assert(! QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8)); | |
| 430 | + std::cout << other_to_utf8 << std::endl; | |
| 424 | 431 | std::cout << "done other characters" << std::endl; |
| 425 | 432 | } |
| 426 | 433 | ... | ... |