Commit 1065bbb0165b4608bd715866332751be9213cd51

Authored by Jay Berkenbilt
1 parent 2b8d0f38

Handle odd PDFDoc codepoints in UTF-8 during transcoding (fixes #650)

There are codepoints in PDFDoc that are not valid UTF-8 but map to
valid UTF-8. We were handling those correctly with bidirectional
mapping.

However, if those same codepoints appeared in UTF-8, where they have
no meaning, they were left as fixed points when converting to PDFDoc,
where they do have meaning. This change recognizes them as errors.
ChangeLog
  1 +2022-02-15 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Don't map 0x18 through 0x1f, 0x7f, 0x9f, or 0xad as fixed points
  4 + when transcoding UTF-8 to PDFDoc. These codepoints have different
  5 + meanings in those two encoding systems. Fixes #650.
  6 +
1 2022-02-11 Jay Berkenbilt <ejb@ql.org> 7 2022-02-11 Jay Berkenbilt <ejb@ql.org>
2 8
3 * 10.6.1: release 9 * 10.6.1: release
libqpdf/QUtil.cc
@@ -2272,6 +2272,16 @@ transcode_utf8(std::string const& utf8_val, std::string& result, @@ -2272,6 +2272,16 @@ transcode_utf8(std::string const& utf8_val, std::string& result,
2272 { 2272 {
2273 result += QUtil::toUTF16(QIntC::to_ulong(ch)); 2273 result += QUtil::toUTF16(QIntC::to_ulong(ch));
2274 } 2274 }
  2275 + else if ((encoding == e_pdfdoc) &&
  2276 + (((ch >= 0x18) && (ch <= 0x1f)) || (ch == 127)))
  2277 + {
  2278 + // PDFDocEncoding maps some low characters to Unicode,
  2279 + // so if we encounter those invalid UTF-8 code points,
  2280 + // map them to unknown so reversing the mapping
  2281 + // doesn't change them into other characters.
  2282 + okay = false;
  2283 + result.append(1, unknown);
  2284 + }
2275 else 2285 else
2276 { 2286 {
2277 result.append(1, ch); 2287 result.append(1, ch);
@@ -2281,6 +2291,13 @@ transcode_utf8(std::string const& utf8_val, std::string& result, @@ -2281,6 +2291,13 @@ transcode_utf8(std::string const& utf8_val, std::string& result,
2281 { 2291 {
2282 result += QUtil::toUTF16(codepoint); 2292 result += QUtil::toUTF16(codepoint);
2283 } 2293 }
  2294 + else if ((codepoint == 0xad) && (encoding == e_pdfdoc))
  2295 + {
  2296 + // PDFDocEncoding omits 0x00ad (soft hyphen), but rather
  2297 + // than treating it as undefined, map it to a regular
  2298 + // hyphen.
  2299 + result.append(1, '-');
  2300 + }
2284 else if ((codepoint > 160) && (codepoint < 256) && 2301 else if ((codepoint > 160) && (codepoint < 256) &&
2285 ((encoding == e_winansi) || (encoding == e_pdfdoc))) 2302 ((encoding == e_winansi) || (encoding == e_pdfdoc)))
2286 { 2303 {
libtests/qtest/qutil/qutil.out
@@ -88,7 +88,8 @@ alternatives @@ -88,7 +88,8 @@ alternatives
88 2: 83a9e99e 88 2: 83a9e99e
89 0: 717561636b 89 0: 717561636b
90 done alternatives 90 done alternatives
91 -w˘wˇwˆw˙w˝w˛w˚w˜w�w�w 91 +w˘wˇwˆw˙w˝w˛w˚w˜w�w�w�w
  92 +w?w?w?w?w?w?w?w?w?w?w-w
92 done other characters 93 done other characters
93 ---- whoami 94 ---- whoami
94 quack1 95 quack1
libtests/qutil.cc
@@ -418,9 +418,16 @@ void transcoding_test() @@ -418,9 +418,16 @@ void transcoding_test()
418 print_alternatives(utf8); 418 print_alternatives(utf8);
419 print_alternatives("quack"); 419 print_alternatives("quack");
420 std::cout << "done alternatives" << std::endl; 420 std::cout << "done alternatives" << std::endl;
421 - std::string other = QUtil::pdf_doc_to_utf8(  
422 - "w\030w\031w\032w\033w\034w\035w\036w\037w\177w\255w");  
423 - std::cout << other << std::endl; 421 + // These characters are either valid in PDFDoc and invalid in
  422 + // UTF-8 or the other way around.
  423 + std::string other("w\x18w\x19w\x1aw\x1bw\x1cw\x1dw\x1ew\x1fw\x7fw");
  424 + std::string other_doc = other + "\x9fw\xadw";
  425 + std::cout << QUtil::pdf_doc_to_utf8(other_doc) << std::endl;
  426 + std::string other_utf8 =
  427 + other + QUtil::toUTF8(0x9f) + "w" + QUtil::toUTF8(0xad) + "w";
  428 + std::string other_to_utf8;
  429 + assert(! QUtil::utf8_to_pdf_doc(other_utf8, other_to_utf8));
  430 + std::cout << other_to_utf8 << std::endl;
424 std::cout << "done other characters" << std::endl; 431 std::cout << "done other characters" << std::endl;
425 } 432 }
426 433