Commit 6c7326b290462372bb6c23462b2087149cf5fcc6

Authored by Jay Berkenbilt
1 parent 1ec561da

JSON fix: correctly parse UTF-16 surrogate pairs

libqpdf/JSON.cc
... ... @@ -574,7 +574,15 @@ namespace
574 574 private:
575 575 void getToken();
576 576 void handleToken();
577   - static std::string decode_string(std::string const& json);
  577 + static std::string
  578 + decode_string(std::string const& json, size_t offset);
  579 + static void handle_u_code(
  580 + char const* s,
  581 + size_t offset,
  582 + size_t i,
  583 + unsigned long& high_surrogate,
  584 + size_t& high_offset,
  585 + std::string& result);
578 586  
579 587 enum parser_state_e {
580 588 ps_top,
... ... @@ -620,8 +628,54 @@ namespace
620 628 };
621 629 } // namespace
622 630  
  631 +void
  632 +JSONParser::handle_u_code(
  633 + char const* s,
  634 + size_t offset,
  635 + size_t i,
  636 + unsigned long& high_surrogate,
  637 + size_t& high_offset,
  638 + std::string& result)
  639 +{
  640 + std::string hex = QUtil::hex_decode(std::string(s + i + 1, s + i + 5));
  641 + unsigned char high = static_cast<unsigned char>(hex.at(0));
  642 + unsigned char low = static_cast<unsigned char>(hex.at(1));
  643 + unsigned long codepoint = high;
  644 + codepoint <<= 8;
  645 + codepoint += low;
  646 + if ((codepoint & 0xFC00) == 0xD800) {
  647 + // high surrogate
  648 + size_t new_high_offset = offset + i;
  649 + if (high_offset) {
  650 + QTC::TC("libtests", "JSON 16 high high");
  651 + throw std::runtime_error(
  652 + "JSON: offset " + QUtil::uint_to_string(new_high_offset) +
  653 + ": UTF-16 high surrogate found after previous high surrogate"
  654 + " at offset " +
  655 + QUtil::uint_to_string(high_offset));
  656 + }
  657 + high_offset = new_high_offset;
  658 + high_surrogate = codepoint;
  659 + } else if ((codepoint & 0xFC00) == 0xDC00) {
  660 + // low surrogate
  661 + if (offset + i != (high_offset + 6)) {
  662 + QTC::TC("libtests", "JSON 16 low not after high");
  663 + throw std::runtime_error(
  664 + "JSON: offset " + QUtil::uint_to_string(offset + i) +
  665 + ": UTF-16 low surrogate found not immediately after high"
  666 + " surrogate");
  667 + }
  668 + high_offset = 0;
  669 + codepoint =
  670 + 0x10000U + ((high_surrogate & 0x3FFU) << 10U) + (codepoint & 0x3FF);
  671 + result += QUtil::toUTF8(codepoint);
  672 + } else {
  673 + result += QUtil::toUTF8(codepoint);
  674 + }
  675 +}
  676 +
623 677 std::string
624   -JSONParser::decode_string(std::string const& str)
  678 +JSONParser::decode_string(std::string const& str, size_t offset)
625 679 {
626 680 // The string has already been validated when this private method
627 681 // is called, so errors are logic errors instead of runtime
... ... @@ -635,6 +689,9 @@ JSONParser::decode_string(std::string const& str)
635 689 // Move inside the quotation marks
636 690 ++s;
637 691 len -= 2;
  692 + // Keep track of UTF-16 surrogate pairs.
  693 + unsigned long high_surrogate = 0;
  694 + size_t high_offset = 0;
638 695 std::string result;
639 696 for (size_t i = 0; i < len; ++i) {
640 697 if (s[i] == '\\') {
... ... @@ -670,17 +727,9 @@ JSONParser::decode_string(std::string const& str)
670 727 throw std::logic_error(
671 728 "JSON parse: not enough characters after \\u");
672 729 }
673   - {
674   - std::string hex =
675   - QUtil::hex_decode(std::string(s + i + 1, s + i + 5));
676   - i += 4;
677   - unsigned char high = static_cast<unsigned char>(hex.at(0));
678   - unsigned char low = static_cast<unsigned char>(hex.at(1));
679   - unsigned long codepoint = high;
680   - codepoint <<= 8;
681   - codepoint += low;
682   - result += QUtil::toUTF8(codepoint);
683   - }
  730 + handle_u_code(
  731 + s, offset, i, high_surrogate, high_offset, result);
  732 + i += 4;
684 733 break;
685 734 default:
686 735 throw std::logic_error("JSON parse: bad character after \\");
... ... @@ -690,6 +739,12 @@ JSONParser::decode_string(std::string const& str)
690 739 result.append(1, s[i]);
691 740 }
692 741 }
  742 + if (high_offset) {
  743 + QTC::TC("libtests", "JSON 16 dangling high");
  744 + throw std::runtime_error(
  745 + "JSON: offset " + QUtil::uint_to_string(high_offset) +
  746 + ": UTF-16 high surrogate not followed by low surrogate");
  747 + }
693 748 return result;
694 749 }
695 750  
... ... @@ -933,7 +988,7 @@ JSONParser::handleToken()
933 988 if (token.length() < 2) {
934 989 throw std::logic_error("JSON string length < 2");
935 990 }
936   - s_value = decode_string(token);
  991 + s_value = decode_string(token, offset - token.length());
937 992 }
938 993 // Based on the lexical state and value, figure out whether we are
939 994 // looking at an item or a delimiter. It will always be exactly
... ...
libtests/libtests.testcov
... ... @@ -89,3 +89,6 @@ JSONHandler unhandled value 0
89 89 JSONHandler unexpected key 0
90 90 JSON schema other type 0
91 91 JSON optional key 0
  92 +JSON 16 high high 0
  93 +JSON 16 low not after high 0
  94 +JSON 16 dangling high 0
... ...
libtests/qtest/json_parse.test
... ... @@ -32,7 +32,7 @@ if ($^O ne 'msys')
32 32  
33 33 cleanup();
34 34  
35   -my $good = 10;
  35 +my $good = 11;
36 36  
37 37 for (my $i = 1; $i <= $good; ++$i)
38 38 {
... ... @@ -117,6 +117,9 @@ my @bad = (
117 117 "premature end after u", # 34
118 118 "bad hex digit", # 35
119 119 "parser depth exceeded", # 36
  120 + "stray low surrogate", # 37
  121 + "high high surrogate", # 38
  122 + "dangling high surrogate", # 39
120 123 );
121 124  
122 125 my $i = 0;
... ...
libtests/qtest/json_parse/bad-37.json 0 → 100644
  1 +[1, "u:potato: \udd54", 2]
... ...
libtests/qtest/json_parse/bad-37.out 0 → 100644
  1 +exception: bad-37.json: JSON: offset 15: UTF-16 low surrogate found not immediately after high surrogate
... ...
libtests/qtest/json_parse/bad-38.json 0 → 100644
  1 +"u:\ud83ezz\ud83ezz"
... ...
libtests/qtest/json_parse/bad-38.out 0 → 100644
  1 +exception: bad-38.json: JSON: offset 11: UTF-16 high surrogate found after previous high surrogate at offset 3
... ...
libtests/qtest/json_parse/bad-39.json 0 → 100644
  1 +"u:\ud83e all alone"
... ...
libtests/qtest/json_parse/bad-39.out 0 → 100644
  1 +exception: bad-39.json: JSON: offset 3: UTF-16 high surrogate not followed by low surrogate
... ...
libtests/qtest/json_parse/good-11-react.out 0 → 100644
  1 +array start
  2 +array item: [4, 0): []
  3 +array start
  4 +array item: [5, 11): "u:π"
  5 +array item: [13, 23): "u:π"
  6 +array item: [25, 39): "b:EFBBBFCF80"
  7 +array item: [41, 53): "b:feff03c0"
  8 +container end: [4, 54): []
  9 +array item: [58, 0): []
  10 +array start
  11 +array item: [59, 67): "u:🥔"
  12 +array item: [69, 85): "u:🥔"
  13 +array item: [87, 103): "b:feffd83eDD54"
  14 +container end: [58, 104): []
  15 +container end: [0, 106): []
  16 +[]
... ...
libtests/qtest/json_parse/good-11.json 0 → 100644
  1 +[
  2 + ["u:π", "u:\u03c0", "b:EFBBBFCF80", "b:feff03c0"],
  3 + ["u:🥔", "u:\ud83e\udd54", "b:feffd83eDD54"]
  4 +]
... ...
libtests/qtest/json_parse/save-11.json 0 → 100644
  1 +[
  2 + [
  3 + "u:π",
  4 + "u:π",
  5 + "b:EFBBBFCF80",
  6 + "b:feff03c0"
  7 + ],
  8 + [
  9 + "u:🥔",
  10 + "u:🥔",
  11 + "b:feffd83eDD54"
  12 + ]
  13 +]
... ...