Commit 1065bbb0165b4608bd715866332751be9213cd51
1 parent
2b8d0f38
Handle odd PDFDoc codepoints in UTF-8 during transcoding (fixes #650)
There are codepoints in PDFDoc that are not valid UTF-8 but map to valid UTF-8. We were handling those correctly with bidirectional mapping. However, if those same code points appeared in UTF-8, where they have no meaning, they were left as fixed points when converting to PDFDoc, where they do have meaning. This change recognizes them as errors.
Showing
4 changed files
with
35 additions
and
4 deletions
ChangeLog
| 1 | +2022-02-15 Jay Berkenbilt <ejb@ql.org> | ||
| 2 | + | ||
| 3 | + * Don't map 0x18 through 0x1f, 0x7f, 0x9f, or 0xad as fixed points | ||
| 4 | + when transcoding UTF-8 to PDFDoc. These codepoints have different | ||
| 5 | + meanings in those two encoding systems. Fixes #650. | ||
| 6 | + | ||
| 1 | 2022-02-11 Jay Berkenbilt <ejb@ql.org> | 7 | 2022-02-11 Jay Berkenbilt <ejb@ql.org> |
| 2 | 8 | ||
| 3 | * 10.6.1: release | 9 | * 10.6.1: release |
libqpdf/QUtil.cc
| @@ -2272,6 +2272,16 @@ transcode_utf8(std::string const& utf8_val, std::string& result, | @@ -2272,6 +2272,16 @@ transcode_utf8(std::string const& utf8_val, std::string& result, | ||
| 2272 | { | 2272 | { |
| 2273 | result += QUtil::toUTF16(QIntC::to_ulong(ch)); | 2273 | result += QUtil::toUTF16(QIntC::to_ulong(ch)); |
| 2274 | } | 2274 | } |
| 2275 | + else if ((encoding == e_pdfdoc) && | ||
| 2276 | + (((ch >= 0x18) && (ch <= 0x1f)) || (ch == 127))) | ||
| 2277 | + { | ||
| 2278 | + // PDFDocEncoding maps some low characters to Unicode, | ||
| 2279 | + // so if we encounter those invalid UTF-8 code points, | ||
| 2280 | + // map them to unknown so reversing the mapping | ||
| 2281 | + // doesn't change them into other characters. | ||
| 2282 | + okay = false; | ||
| 2283 | + result.append(1, unknown); | ||
| 2284 | + } | ||
| 2275 | else | 2285 | else |
| 2276 | { | 2286 | { |
| 2277 | result.append(1, ch); | 2287 | result.append(1, ch); |
| @@ -2281,6 +2291,13 @@ transcode_utf8(std::string const& utf8_val, std::string& result, | @@ -2281,6 +2291,13 @@ transcode_utf8(std::string const& utf8_val, std::string& result, | ||
| 2281 | { | 2291 | { |
| 2282 | result += QUtil::toUTF16(codepoint); | 2292 | result += QUtil::toUTF16(codepoint); |
| 2283 | } | 2293 | } |
| 2294 | + else if ((codepoint == 0xad) && (encoding == e_pdfdoc)) | ||
| 2295 | + { | ||
| 2296 | + // PDFDocEncoding omits 0x00ad (soft hyphen), but rather | ||
| 2297 | + // than treating it as undefined, map it to a regular | ||
| 2298 | + // hyphen. | ||
| 2299 | + result.append(1, '-'); | ||
| 2300 | + } | ||
| 2284 | else if ((codepoint > 160) && (codepoint < 256) && | 2301 | else if ((codepoint > 160) && (codepoint < 256) && |
| 2285 | ((encoding == e_winansi) || (encoding == e_pdfdoc))) | 2302 | ((encoding == e_winansi) || (encoding == e_pdfdoc))) |
| 2286 | { | 2303 | { |
libtests/qtest/qutil/qutil.out
| @@ -88,7 +88,8 @@ alternatives | @@ -88,7 +88,8 @@ alternatives | ||
| 88 | 2: 83a9e99e | 88 | 2: 83a9e99e |
| 89 | 0: 717561636b | 89 | 0: 717561636b |
| 90 | done alternatives | 90 | done alternatives |
| 91 | -w˘wˇwˆw˙w˝w˛w˚w˜w�w�w | 91 | +w˘wˇwˆw˙w˝w˛w˚w˜w�w�w�w |
| 92 | +w?w?w?w?w?w?w?w?w?w?w-w | ||
| 92 | done other characters | 93 | done other characters |
| 93 | ---- whoami | 94 | ---- whoami |
| 94 | quack1 | 95 | quack1 |
libtests/qutil.cc
| @@ -418,9 +418,16 @@ void transcoding_test() | @@ -418,9 +418,16 @@ void transcoding_test() | ||
| 418 | print_alternatives(utf8); | 418 | print_alternatives(utf8); |
| 419 | print_alternatives("quack"); | 419 | print_alternatives("quack"); |
| 420 | std::cout << "done alternatives" << std::endl; | 420 | std::cout << "done alternatives" << std::endl; |
| 421 | - std::string other = QUtil::pdf_doc_to_utf8( | ||
| 422 | - "w\030w\031w\032w\033w\034w\035w\036w\037w\177w\255w"); | ||
| 423 | - std::cout << other << std::endl; | 421 | + // These characters are either valid in PDFDoc and invalid in |
| 422 | + // UTF-8 or the other way around. | ||
| 423 | + std::string other("w\x18w\x19w\x1aw\x1bw\x1cw\x1dw\x1ew\x1fw\x7fw"); | ||
| 424 | + std::string other_doc = other + "\x9fw\xadw"; | ||
| 425 | + std::cout << QUtil::pdf_doc_to_utf8(other_doc) << std::endl; | ||
| 426 | + std::string other_utf8 = | ||
| 427 | + other + QUtil::toUTF8(0x9f) + "w" + QUtil::toUTF8(0xad) + "w"; | ||
| 428 | + std::string other_to_utf8; | ||
| 429 | + assert(! QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8)); | ||
| 430 | + std::cout << other_to_utf8 << std::endl; | ||
| 424 | std::cout << "done other characters" << std::endl; | 431 | std::cout << "done other characters" << std::endl; |
| 425 | } | 432 | } |
| 426 | 433 |