Commit 6c7326b290462372bb6c23462b2087149cf5fcc6
1 parent
1ec561da
JSON fix: correctly parse UTF-16 surrogate pairs
Showing 12 changed files with 115 additions and 15 deletions
libqpdf/JSON.cc
| ... | ... | @@ -574,7 +574,15 @@ namespace |
| 574 | 574 | private: |
| 575 | 575 | void getToken(); |
| 576 | 576 | void handleToken(); |
| 577 | - static std::string decode_string(std::string const& json); | |
| 577 | + static std::string | |
| 578 | + decode_string(std::string const& json, size_t offset); | |
| 579 | + static void handle_u_code( | |
| 580 | + char const* s, | |
| 581 | + size_t offset, | |
| 582 | + size_t i, | |
| 583 | + unsigned long& high_surrogate, | |
| 584 | + size_t& high_offset, | |
| 585 | + std::string& result); | |
| 578 | 586 | |
| 579 | 587 | enum parser_state_e { |
| 580 | 588 | ps_top, |
| ... | ... | @@ -620,8 +628,54 @@ namespace |
| 620 | 628 | }; |
| 621 | 629 | } // namespace |
| 622 | 630 | |
| 631 | +void | |
| 632 | +JSONParser::handle_u_code( | |
| 633 | + char const* s, | |
| 634 | + size_t offset, | |
| 635 | + size_t i, | |
| 636 | + unsigned long& high_surrogate, | |
| 637 | + size_t& high_offset, | |
| 638 | + std::string& result) | |
| 639 | +{ | |
| 640 | + std::string hex = QUtil::hex_decode(std::string(s + i + 1, s + i + 5)); | |
| 641 | + unsigned char high = static_cast<unsigned char>(hex.at(0)); | |
| 642 | + unsigned char low = static_cast<unsigned char>(hex.at(1)); | |
| 643 | + unsigned long codepoint = high; | |
| 644 | + codepoint <<= 8; | |
| 645 | + codepoint += low; | |
| 646 | + if ((codepoint & 0xFC00) == 0xD800) { | |
| 647 | + // high surrogate | |
| 648 | + size_t new_high_offset = offset + i; | |
| 649 | + if (high_offset) { | |
| 650 | + QTC::TC("libtests", "JSON 16 high high"); | |
| 651 | + throw std::runtime_error( | |
| 652 | + "JSON: offset " + QUtil::uint_to_string(new_high_offset) + | |
| 653 | + ": UTF-16 high surrogate found after previous high surrogate" | |
| 654 | + " at offset " + | |
| 655 | + QUtil::uint_to_string(high_offset)); | |
| 656 | + } | |
| 657 | + high_offset = new_high_offset; | |
| 658 | + high_surrogate = codepoint; | |
| 659 | + } else if ((codepoint & 0xFC00) == 0xDC00) { | |
| 660 | + // low surrogate | |
| 661 | + if (offset + i != (high_offset + 6)) { | |
| 662 | + QTC::TC("libtests", "JSON 16 low not after high"); | |
| 663 | + throw std::runtime_error( | |
| 664 | + "JSON: offset " + QUtil::uint_to_string(offset + i) + | |
| 665 | + ": UTF-16 low surrogate found not immediately after high" | |
| 666 | + " surrogate"); | |
| 667 | + } | |
| 668 | + high_offset = 0; | |
| 669 | + codepoint = | |
| 670 | + 0x10000U + ((high_surrogate & 0x3FFU) << 10U) + (codepoint & 0x3FF); | |
| 671 | + result += QUtil::toUTF8(codepoint); | |
| 672 | + } else { | |
| 673 | + result += QUtil::toUTF8(codepoint); | |
| 674 | + } | |
| 675 | +} | |
| 676 | + | |
| 623 | 677 | std::string |
| 624 | -JSONParser::decode_string(std::string const& str) | |
| 678 | +JSONParser::decode_string(std::string const& str, size_t offset) | |
| 625 | 679 | { |
| 626 | 680 | // The string has already been validated when this private method |
| 627 | 681 | // is called, so errors are logic errors instead of runtime |
| ... | ... | @@ -635,6 +689,9 @@ JSONParser::decode_string(std::string const& str) |
| 635 | 689 | // Move inside the quotation marks |
| 636 | 690 | ++s; |
| 637 | 691 | len -= 2; |
| 692 | + // Keep track of UTF-16 surrogate pairs. | |
| 693 | + unsigned long high_surrogate = 0; | |
| 694 | + size_t high_offset = 0; | |
| 638 | 695 | std::string result; |
| 639 | 696 | for (size_t i = 0; i < len; ++i) { |
| 640 | 697 | if (s[i] == '\\') { |
| ... | ... | @@ -670,17 +727,9 @@ JSONParser::decode_string(std::string const& str) |
| 670 | 727 | throw std::logic_error( |
| 671 | 728 | "JSON parse: not enough characters after \\u"); |
| 672 | 729 | } |
| 673 | - { | |
| 674 | - std::string hex = | |
| 675 | - QUtil::hex_decode(std::string(s + i + 1, s + i + 5)); | |
| 676 | - i += 4; | |
| 677 | - unsigned char high = static_cast<unsigned char>(hex.at(0)); | |
| 678 | - unsigned char low = static_cast<unsigned char>(hex.at(1)); | |
| 679 | - unsigned long codepoint = high; | |
| 680 | - codepoint <<= 8; | |
| 681 | - codepoint += low; | |
| 682 | - result += QUtil::toUTF8(codepoint); | |
| 683 | - } | |
| 730 | + handle_u_code( | |
| 731 | + s, offset, i, high_surrogate, high_offset, result); | |
| 732 | + i += 4; | |
| 684 | 733 | break; |
| 685 | 734 | default: |
| 686 | 735 | throw std::logic_error("JSON parse: bad character after \\"); |
| ... | ... | @@ -690,6 +739,12 @@ JSONParser::decode_string(std::string const& str) |
| 690 | 739 | result.append(1, s[i]); |
| 691 | 740 | } |
| 692 | 741 | } |
| 742 | + if (high_offset) { | |
| 743 | + QTC::TC("libtests", "JSON 16 dangling high"); | |
| 744 | + throw std::runtime_error( | |
| 745 | + "JSON: offset " + QUtil::uint_to_string(high_offset) + | |
| 746 | + ": UTF-16 high surrogate not followed by low surrogate"); | |
| 747 | + } | |
| 693 | 748 | return result; |
| 694 | 749 | } |
| 695 | 750 | |
| ... | ... | @@ -933,7 +988,7 @@ JSONParser::handleToken() |
| 933 | 988 | if (token.length() < 2) { |
| 934 | 989 | throw std::logic_error("JSON string length < 2"); |
| 935 | 990 | } |
| 936 | - s_value = decode_string(token); | |
| 991 | + s_value = decode_string(token, offset - token.length()); | |
| 937 | 992 | } |
| 938 | 993 | // Based on the lexical state and value, figure out whether we are |
| 939 | 994 | // looking at an item or a delimiter. It will always be exactly | ... | ... |
libtests/libtests.testcov
libtests/qtest/json_parse.test
| ... | ... | @@ -32,7 +32,7 @@ if ($^O ne 'msys') |
| 32 | 32 | |
| 33 | 33 | cleanup(); |
| 34 | 34 | |
| 35 | -my $good = 10; | |
| 35 | +my $good = 11; | |
| 36 | 36 | |
| 37 | 37 | for (my $i = 1; $i <= $good; ++$i) |
| 38 | 38 | { |
| ... | ... | @@ -117,6 +117,9 @@ my @bad = ( |
| 117 | 117 | "premature end after u", # 34 |
| 118 | 118 | "bad hex digit", # 35 |
| 119 | 119 | "parser depth exceeded", # 36 |
| 120 | + "stray low surrogate", # 37 | |
| 121 | + "high high surrogate", # 38 | |
| 122 | + "dangling high surrogate", # 39 | |
| 120 | 123 | ); |
| 121 | 124 | |
| 122 | 125 | my $i = 0; | ... | ... |
libtests/qtest/json_parse/bad-37.json
0 → 100644
| 1 | +[1, "u:potato: \udd54", 2] | ... | ... |
libtests/qtest/json_parse/bad-37.out
0 → 100644
| 1 | +exception: bad-37.json: JSON: offset 15: UTF-16 low surrogate found not immediately after high surrogate | ... | ... |
libtests/qtest/json_parse/bad-38.json
0 → 100644
| 1 | +"u:\ud83ezz\ud83ezz" | ... | ... |
libtests/qtest/json_parse/bad-38.out
0 → 100644
| 1 | +exception: bad-38.json: JSON: offset 11: UTF-16 high surrogate found after previous high surrogate at offset 3 | ... | ... |
libtests/qtest/json_parse/bad-39.json
0 → 100644
| 1 | +"u:\ud83e all alone" | ... | ... |
libtests/qtest/json_parse/bad-39.out
0 → 100644
| 1 | +exception: bad-39.json: JSON: offset 3: UTF-16 high surrogate not followed by low surrogate | ... | ... |
libtests/qtest/json_parse/good-11-react.out
0 → 100644
| 1 | +array start | |
| 2 | +array item: [4, 0): [] | |
| 3 | +array start | |
| 4 | +array item: [5, 11): "u:π" | |
| 5 | +array item: [13, 23): "u:π" | |
| 6 | +array item: [25, 39): "b:EFBBBFCF80" | |
| 7 | +array item: [41, 53): "b:feff03c0" | |
| 8 | +container end: [4, 54): [] | |
| 9 | +array item: [58, 0): [] | |
| 10 | +array start | |
| 11 | +array item: [59, 67): "u:🥔" | |
| 12 | +array item: [69, 85): "u:🥔" | |
| 13 | +array item: [87, 103): "b:feffd83eDD54" | |
| 14 | +container end: [58, 104): [] | |
| 15 | +container end: [0, 106): [] | |
| 16 | +[] | ... | ... |
libtests/qtest/json_parse/good-11.json
0 → 100644